Example #1
 def __init__(self,
              rom_path=_default_rom_path,
              frame_skip=4, history_length=4,
              resize_mode='scale', resized_rows=84, resized_cols=84, crop_offset=8,
              display_screen=False, max_null_op=30,
              replay_memory_size=1000000,
              replay_start_size=100,
              death_end_episode=True):
     super(AtariGame, self).__init__()
     self.rng = get_numpy_rng()
     self.ale = ale_load_from_rom(rom_path=rom_path, display_screen=display_screen)
     self.start_lives = self.ale.lives()
     self.action_set = self.ale.getMinimalActionSet()
     self.resize_mode = resize_mode
     self.resized_rows = resized_rows
     self.resized_cols = resized_cols
     self.crop_offset = crop_offset
     self.frame_skip = frame_skip
     self.history_length = history_length
     self.max_null_op = max_null_op
     self.death_end_episode = death_end_episode
     self.screen_buffer_length = 2
     self.screen_buffer = numpy.empty((self.screen_buffer_length,
                                       self.ale.getScreenDims()[1], self.ale.getScreenDims()[0]),
                                      dtype='uint8')
     self.replay_memory = ReplayMemory(state_dim=(resized_rows, resized_cols),
                                       history_length=history_length,
                                       memory_size=replay_memory_size,
                                       replay_start_size=replay_start_size)
     self.start()
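Example #1 only shows how the AtariGame constructor wires the memory up. For orientation, the circular-buffer idea behind this kind of frame-based replay memory can be sketched as below; the class and its append method are a minimal illustrative sketch, not the API of the ReplayMemory used above.

import numpy as np

class MinimalFrameReplay(object):
    """Minimal sketch of a frame-based circular replay buffer (illustrative only)."""
    def __init__(self, state_dim=(84, 84), memory_size=100000):
        self.frames = np.zeros((memory_size,) + state_dim, dtype='uint8')
        self.actions = np.zeros(memory_size, dtype='uint8')
        self.rewards = np.zeros(memory_size, dtype='float32')
        self.terminals = np.zeros(memory_size, dtype='bool')
        self.memory_size, self.top, self.size = memory_size, 0, 0

    def append(self, frame, action, reward, terminal):
        # Overwrite the oldest slot once the buffer is full.
        self.frames[self.top] = frame
        self.actions[self.top] = action
        self.rewards[self.top] = reward
        self.terminals[self.top] = terminal
        self.top = (self.top + 1) % self.memory_size
        self.size = min(self.size + 1, self.memory_size)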
Example #2
    def __init__(self, args):
        '''Constructor'''
        self.WARM_UP = 0
        self.QUALIFYING = 1
        self.RACE = 2
        self.UNKNOWN = 3
        self.stage = args.stage
        
        self.parser = msgParser.MsgParser()
        self.state = carState.CarState()
        self.control = carControl.CarControl()

        self.steers = [-1.0, -0.8, -0.6, -0.5, -0.4, -0.3, -0.2, -0.15, -0.1, -0.05, 0.0, 0.05, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.6, 0.8, 1.0]
        self.speeds = [-1.0, -0.5, 0.0, 0.5, 1.0]
        self.num_inputs = 19
        self.num_steers = len(self.steers)
        self.num_speeds = len(self.speeds)
        self.num_actions = self.num_steers + self.num_speeds
        
        self.net = DeepQNetwork(self.num_inputs, self.num_steers, self.num_speeds, args)
        self.mem = ReplayMemory(args.replay_size, self.num_inputs, args)
        self.minibatch_size = args.batch_size

        if args.load_replay:
            self.mem.load(args.load_replay)
        if args.load_weights:
            self.net.load_weights(args.load_weights)
        self.save_weights_prefix = args.save_weights_prefix
        self.save_interval = args.save_interval
        self.save_replay = args.save_replay

        self.enable_training = args.enable_training
        self.enable_exploration = args.enable_exploration
        self.save_csv = args.save_csv
        if self.save_csv:
          self.csv_file = open(args.save_csv, "wb")
          self.csv_writer = csv.writer(self.csv_file)
          self.csv_writer.writerow(['episode', 'distFormStart', 'distRaced', 'curLapTime', 'lastLapTime', 'racePos', 'epsilon', 'replay_memory', 'train_steps'])

        self.total_train_steps = 0
        self.exploration_decay_steps = args.exploration_decay_steps
        self.exploration_rate_start = args.exploration_rate_start
        self.exploration_rate_end = args.exploration_rate_end
        self.skip = args.skip

        self.show_sensors = args.show_sensors
        self.show_qvalues = args.show_qvalues

        self.episode = 0
        self.distances = []
        self.onRestart()
        
        if self.show_sensors:
            from sensorstats import Stats
            self.stats = Stats(inevery=8)
        
        if self.show_qvalues:
            from plotq import PlotQ
            self.plotq = PlotQ(self.num_steers, self.num_speeds)
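Examples #2 and #7 discretize the control space into 21 steering bins and 5 speed bins, so the network has num_steers + num_speeds outputs rather than their combinatorial product. A purely illustrative sketch of mapping a pair of chosen bins back to continuous commands (the index names and the print are assumptions; the driver's actual steering logic is not shown here):

steers = [-1.0, -0.8, -0.6, -0.5, -0.4, -0.3, -0.2, -0.15, -0.1, -0.05,
          0.0, 0.05, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.6, 0.8, 1.0]
speeds = [-1.0, -0.5, 0.0, 0.5, 1.0]

def to_command(steer_index, speed_index):
    # Two independent heads: one picks a steering bin, the other an accel/brake bin.
    return steers[steer_index], speeds[speed_index]

print(to_command(10, 4))  # -> (0.0, 1.0): drive straight at full throttle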
Example #3
 def test_get_minibatch(self):                                     
     replay_memory = ReplayMemory(None,
                                      self.use_gpu_replay_mem,
                                      self.max_replay_memory, 
                                      self.train_batch_size,
                                      self.screen_history,
                                      self.screen_width,
                                      self.screen_height,
                                      self.minibatch_random,
                                      self.screen_order)
     
     for i in range(255):
         screen = np.zeros((self.screen_height, self.screen_width))
         screen.fill(i + 1)
         replay_memory.add(i + 1, 10 * (i + 1), screen, False)
     
         if i > self.train_batch_size + self.screen_history:
             prestates, actions, rewards, poststates, terminals = replay_memory.get_minibatch()
             for b in range(self.train_batch_size-1):
                 for h in range(self.screen_history-1):
                     self.assertTrue(prestates[b+1, 0, 0, h] < prestates[b, 0, 0, h])
                     self.assertTrue(prestates[b, 0, 0, h+1] > prestates[b, 0, 0, h])
Example #4
class TestBinaryHeap(unittest.TestCase):
    def setUp(self):
        self.heap = BinaryHeap()
        self.replayMemory = ReplayMemory(10, 32, 4, 84, 84)

    def test_Add(self):
        totalNo = 10
        for i in range(totalNo):
            state = np.zeros((84, 84), dtype=np.int)
            state.fill(i)
            td = i
            
            addedIndex = self.replayMemory.add(0, 0, state, 0)
            self.heap.add(addedIndex, td)
            
        for i in range(totalNo):
            topItem = self.heap.getTop()
            self.assertEqual(totalNo - i - 1, topItem[0])
            self.heap.remove(0)
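The test above pairs a ReplayMemory with a BinaryHeap keyed by TD error, which is the bookkeeping behind prioritized replay: add() returns the slot index, and the heap keeps the highest-error index on top. A self-contained sketch of the same idea using the standard library's heapq (negating priorities to emulate a max-heap; this is illustrative and not the BinaryHeap API above):

import heapq

# heapq is a min-heap, so store negated TD errors to pop the largest error first.
priority_queue = []
for index, td_error in enumerate([0.5, 2.0, 0.1, 1.3]):
    heapq.heappush(priority_queue, (-td_error, index))

neg_td, index = heapq.heappop(priority_queue)
print("replay transition %d with TD error %.1f" % (index, -neg_td))  # transition 1, error 2.0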
Example #5
  def _train_minibatch(self, minibatch_size):
    if self.replay_memory.size() < minibatch_size:
      return

    # Sample a minibatch from replay memory
    non_terminal_minibatch, terminal_minibatch = \
                    self.replay_memory.get_minibatch(minibatch_size)
    non_terminal_minibatch, terminal_minibatch = \
                    list(non_terminal_minibatch), list(terminal_minibatch)

    # Compute max q-values for the non-terminal next states based
    # on the target network
    next_states = list(ReplayMemory.get_next_states(non_terminal_minibatch))
    q_values = self._predict_q_values(next_states, use_target_network=True)
    max_q_values = q_values.max(axis=1)

    # Gradient descent
    feed_dict = self._get_minibatch_feed_dict(
      max_q_values,
      non_terminal_minibatch,
      terminal_minibatch,
    )

    if self._should_log_summary():
      _, summary = self.session.run(
        [self.network.train_op, self.network.summary_op],
        feed_dict=feed_dict,
      )
      self.summary_writer.add_summary(summary, self.training_steps)
    else:
      self.session.run(self.network.train_op, feed_dict=feed_dict)

    self.training_steps += 1

    # Update the target network if needed
    self._update_target_network()
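A sketch of the Bellman targets that a feed dict like the one in Example #5 typically encodes: non-terminal transitions bootstrap from the target network's max Q-value, terminal ones use the reward alone. The helper below is hypothetical; _get_minibatch_feed_dict itself is not shown.

import numpy as np

def build_targets(non_terminal_rewards, max_q_values, terminal_rewards, gamma=0.99):
    # Hypothetical sketch: r + gamma * max_a Q_target(s', a) for non-terminal transitions,
    # plain r for terminal ones (nothing to bootstrap from).
    non_terminal_targets = np.asarray(non_terminal_rewards) + gamma * np.asarray(max_q_values)
    terminal_targets = np.asarray(terminal_rewards, dtype=np.float64)
    return np.concatenate([non_terminal_targets, terminal_targets])

# build_targets([1.0], [2.0], [0.5]) -> array([2.98, 0.5])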
Example #6
if __name__ == "__main__":
    #MODEL = importlib.import_module(FLAGS.model_file) # import network module
    #MODEL_FILE = os.path.join(BASE_DIR, 'models', FLAGS.model_file+'.py')

    ####### log writing
    FLAGS.LOG_DIR = FLAGS.LOG_DIR + '/' + FLAGS.task_name
    #FLAGS.CHECKPOINT_DIR = os.path.join(FLAGS.CHECKPOINT_DIR, FLAGS.task_name)
    #tf_util.mkdir(FLAGS.CHECKPOINT_DIR)

    if not FLAGS.is_training:
        agent = ActiveMVnet(FLAGS)
        senv = ShapeNetEnv(FLAGS)
        if FLAGS.pretrain_restore:
            restore_pretrain(agent)
        else:
            restore_from_iter(agent, FLAGS.test_iter)
        replay_mem = ReplayMemory(FLAGS)
        rollout_obj = Rollout(agent, senv, replay_mem, FLAGS)
        if FLAGS.test_random:
            test_random(agent, FLAGS.test_episode_num, replay_mem,
                        FLAGS.test_iter, rollout_obj)
        elif FLAGS.test_oneway:
            test_oneway(agent, FLAGS.test_episode_num, replay_mem,
                        FLAGS.test_iter, rollout_obj)
        else:
            test_active(agent, FLAGS.test_episode_num, replay_mem,
                        FLAGS.test_iter, rollout_obj)

        sys.exit()
Example #7
    def __init__(self, args):
        '''Constructor'''
        self.WARM_UP = 0
        self.QUALIFYING = 1
        self.RACE = 2
        self.UNKNOWN = 3
        self.stage = args.stage
        
        self.parser = msgParser.MsgParser()
        self.state = carState.CarState()
        self.control = carControl.CarControl()

        self.steers = [-1.0, -0.8, -0.6, -0.5, -0.4, -0.3, -0.2, -0.15, -0.1, -0.05, 0.0, 0.05, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.6, 0.8, 1.0]
        self.speeds = [-1.0, -0.5, 0.0, 0.5, 1.0]
        self.num_inputs = 19
        self.num_steers = len(self.steers)
        self.num_speeds = len(self.speeds)
        self.num_actions = self.num_steers + self.num_speeds
        
        self.net = DeepQNetwork(self.num_inputs, self.num_steers, self.num_speeds, args)
        self.mem = ReplayMemory(args.replay_size, self.num_inputs, args)
        self.minibatch_size = args.batch_size

        if args.load_weights:
            self.net.load_weights(args.load_weights)
        self.save_weights_prefix = args.save_weights_prefix
        self.pretrained_network = args.pretrained_network

        self.steer_lock = 0.785398
        self.max_speed = 100

        self.algorithm = args.algorithm
        self.device = args.device
        self.mode = args.mode
        self.maxwheelsteps = args.maxwheelsteps
        
        self.enable_training = args.enable_training
        self.enable_exploration = args.enable_exploration

        self.total_train_steps = 0
        self.exploration_decay_steps = args.exploration_decay_steps
        self.exploration_rate_start = args.exploration_rate_start
        self.exploration_rate_end = args.exploration_rate_end

        self.show_sensors = args.show_sensors
        self.show_qvalues = args.show_qvalues

        self.episode = 0
        self.onRestart()
        
        if self.show_sensors:
            from sensorstats import Stats
            self.stats = Stats(inevery=8)
        
        if self.show_qvalues:
            from plotq import PlotQ
            self.plotq = PlotQ(self.num_steers, self.num_speeds)

        if self.device == 'wheel':
            from wheel import Wheel
            self.wheel = Wheel(args.joystick_nr, args.autocenter, args.gain, args.min_force, args.max_force)
Example #8
        limit = 4000
    elif args.buffer_type == 'optimal_final':
        limit = 12000
    else:
        limit = np.inf

    # Agent
    agent = SAC(env.observation_space.shape[0], env.action_space, args)

    # TensorboardX
    writer = SummaryWriter(logdir='runs/{}_SAC_{}_{}_{}'.format(
        datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"), args.env_name,
        args.policy, "autotune" if args.automatic_entropy_tuning else ""))

    # Memory
    memory = ReplayMemory(args.replay_size)

    # Training Loop
    total_numsteps = 0
    updates = 0

    for i_episode in itertools.count(1):
        episode_reward = 0
        episode_steps = 0
        done = False
        state = env.reset()

        while not done:
            if args.start_steps > total_numsteps:
                action = env.action_space.sample()  # Sample random action
            else:
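Example #8 (and Example #11 below) uses a transition-tuple style of replay memory: a single capacity argument, push(state, action, reward, next_state, mask), and uniform random minibatch sampling. A minimal self-contained sketch of that pattern, written here as an assumption rather than the project's own class:

import random

class TransitionReplayMemory(object):
    """Minimal sketch of a SAC-style transition buffer (illustrative only)."""
    def __init__(self, capacity):
        self.capacity = capacity
        self.buffer = []
        self.position = 0

    def push(self, state, action, reward, next_state, mask):
        # Grow until full, then overwrite the oldest transition.
        if len(self.buffer) < self.capacity:
            self.buffer.append(None)
        self.buffer[self.position] = (state, action, reward, next_state, mask)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        # Transpose tuples into per-field lists: states, actions, rewards, next_states, masks.
        return [list(column) for column in zip(*batch)]

    def __len__(self):
        return len(self.buffer)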
Example #9
                    default="INFO",
                    help="Log level.")
args = parser.parse_args()

logger = logging.getLogger()
logger.setLevel(args.log_level)

if args.random_seed:
    random.seed(args.random_seed)

# instantiate classes
env = GymEnvironment(args.game, args)
logger.info("Using Gym Environment")
net = DeepQNetwork(env.numActions(), args)
statistics = Statistics(net)
mem = ReplayMemory(args.replay_size, args)
agent = DqnAgent(env, mem, net, args, statistics=statistics)

if args.load_weights:
    logger.info("Loading weights from %s" % args.load_weights)
    net.load_weights(args.load_weights)

if args.play_games:
    logger.info("Playing for {} game(s)".format(args.play_games))
    agent.play(args.play_games)
    sys.exit()

for epoch in xrange(args.start_epoch, args.epochs):
    logger.info("Epoch #{}/{}".format(epoch + 1, args.epochs))

    if args.train_steps:
Example #10
    def _initialize(self, game=None, network_args=None, actions=None, name=None,
                    net_type="dqn",  # TODO change to the actual class name?
                    reshaped_x=None,
                    reshaped_y=None,
                    skiprate=3,
                    history_length=4,
                    batchsize=64,
                    update_pattern=(1, 1),
                    replay_memory_size=10000,
                    backprop_start_step=10000,
                    start_epsilon=1.0,
                    end_epsilon=0.1,
                    epsilon_decay_start_step=50000,
                    epsilon_decay_steps=100000,
                    reward_scale=1.0,  # TODO useless?
                    melt_steps=10000,

                    shaping_on=False,
                    count_time=False,
                    one_hot_time=False,
                    count_time_interval=1,
                    count_time_max=2100,

                    use_game_variables=True,
                    rearrange_misc=False,

                    remember_n_actions=4,
                    one_hot_nactions=False,

                    misc_scale=None,  # TODO seems useless
                    results_file=None,
                    params_file=None,
                    config_file=None,

                    no_timeout_terminal=False  # TODO seems useless
                    ):

        if game is not None:
            self.game = game
            self.config_file = None
        elif config_file is not None:
            self.config_file = config_file
            self.game = initialize_doom(self.config_file)
        else:
            raise Exception("No game, no config file. Dunno how to initialize doom.")

        if network_args is None:
            network_args = dict()

        if count_time:
            self.count_time = bool(count_time)
            if self.count_time:
                self.one_hot_time = one_hot_time
                self.count_time_max = int(count_time_max)
                self.count_time_interval = int(count_time_interval)
                if one_hot_time:
                    self.count_time_len = int(self.count_time_max / self.count_time_interval)
                else:
                    self.count_time_len = 1
        else:
            self.count_time_len = 0
            self.count_time = False

        self.name = name
        if reward_scale is not None:
            self.reward_scale = reward_scale
        else:
            self.reward_scale = 1.0
        self.rearrange_misc = rearrange_misc
        self.batchsize = batchsize
        self.history_length = max(history_length, 1)
        self.update_pattern = update_pattern
        self.epsilon = max(min(start_epsilon, 1.0), 0.0)
        self.end_epsilon = min(max(end_epsilon, 0.0), self.epsilon)
        self.epsilon_decay_steps = epsilon_decay_steps
        self.epsilon_decay_stride = (self.epsilon - end_epsilon) / epsilon_decay_steps
        self.epsilon_decay_start = epsilon_decay_start_step
        self.skiprate = max(skiprate, 0)
        self.shaping_on = shaping_on
        self.steps = 0
        self.melt_steps = melt_steps
        self.backprop_start_step = max(backprop_start_step, batchsize)
        self.one_hot_nactions = one_hot_nactions
        self.no_timeout_terminal = no_timeout_terminal
        if results_file:
            self.results_file = results_file
        else:
            self.results_file = "results/" + name + ".res"
        if params_file:
            self.params_file = params_file
        else:
            self.params_file = "params/" + name

        if self.game.get_available_game_variables_size() > 0 and use_game_variables:
            self.use_game_variables = True
        else:
            self.use_game_variables = False

        self.last_shaping_reward = 0

        self.learning_mode = True

        if actions is None:
            self.actions = generate_default_actions(self.game)
        else:
            self.actions = actions
        self.actions_num = len(self.actions)
        self.actions_stats = np.zeros([self.actions_num], np.int)

        # changes img_shape according to the history size
        self.channels = self.game.get_screen_channels()
        if self.history_length > 1:
            self.channels *= self.history_length

        if reshaped_x is None:
            x = self.game.get_screen_width()
            y = self.game.get_screen_height()
            scale_x = scale_y = 1.0
        else:
            x = reshaped_x
            scale_x = float(x) / self.game.get_screen_width()

            if reshaped_y is None:
                y = int(self.game.get_screen_height() * scale_x)
                scale_y = scale_x
            else:
                y = reshaped_y
                scale_y = float(y) / self.game.get_screen_height()

        img_shape = [self.channels, y, x]

        # TODO check if it is slow (it seems it is not)
        if scale_x == 1 and scale_y == 1:
            def convert(img):
                img = img.astype(np.float32) / 255.0
                return img
        else:
            def convert(img):
                img = img.astype(np.float32) / 255.0
                new_image = np.ndarray([img.shape[0], y, x], dtype=img.dtype)
                for i in xrange(img.shape[0]):
                    # new_image[i] = skimage.transform.resize(img[i], (y,x), preserve_range=True)
                    new_image[i] = cv2.resize(img[i], (x, y), interpolation=cv2.INTER_AREA)
                return new_image
        self.convert_image = convert

        if self.use_game_variables:
            single_state_misc_len = int(self.game.get_available_game_variables_size() + self.count_time_len)
        else:
            single_state_misc_len = int(self.count_time_len)
        self.single_state_misc_len = single_state_misc_len

        self.remember_n_actions = remember_n_actions
        total_misc_len = int(single_state_misc_len * self.history_length)

        if remember_n_actions > 0:
            self.remember_n_actions = remember_n_actions
            if self.one_hot_nactions:
                self.action_len = int(2 ** floor(log(len(self.actions), 2)))
            else:
                self.action_len = len(self.actions[0])
            self.last_action = np.zeros([self.action_len], dtype=np.float32)
            self.last_n_actions = np.zeros([remember_n_actions * self.action_len], dtype=np.float32)
            total_misc_len += len(self.last_n_actions)

        if total_misc_len > 0:
            self.misc_state_included = True
            self.current_misc_state = np.zeros(total_misc_len, dtype=np.float32)
            if single_state_misc_len > 0:
                if misc_scale is not None:
                    self.misc_scale = np.array(misc_scale, dtype=np.float32)
                else:
                    self.misc_scale = None
        else:
            self.misc_state_included = False

        state_format = dict()
        state_format["s_img"] = img_shape
        state_format["s_misc"] = total_misc_len
        self.replay_memory = ReplayMemory(state_format, replay_memory_size, batchsize)

        network_args["state_format"] = state_format
        network_args["actions_number"] = len(self.actions)

        if net_type in ("dqn", None, ""):
            self.approximator = approximators.DQN(**network_args)
        elif net_type in ["duelling", "dueling"]:
            self.approximator = approximators.DuelingDQN(**network_args)
        else:
            if locate('approximators.' + net_type) is not None:
                self.approximator = locate('approximators.' + net_type)(**network_args)
            else:
                raise Exception("Unsupported approximator type.")

        self.current_image_state = np.zeros(img_shape, dtype=np.float32)
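Example #10 only precomputes epsilon_decay_stride from the start/end epsilon and the number of decay steps; a sketch of the linear schedule such a stride usually drives (assumed, since the per-step decay itself is not part of the snippet):

def decayed_epsilon(step, start_eps=1.0, end_eps=0.1,
                    decay_start=50000, decay_steps=100000):
    # Linear decay from start_eps to end_eps over decay_steps, beginning at decay_start.
    stride = (start_eps - end_eps) / decay_steps
    return max(end_eps, start_eps - stride * max(0, step - decay_start))

# decayed_epsilon(0) == 1.0, decayed_epsilon(100000) == 0.55, decayed_epsilon(10 ** 6) == 0.1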
Example #11
    def train(self, num_run=1):
        in_ts = time.time()
        for i_run in range(num_run):
            self.logger.important(f"START TRAINING RUN {i_run}")
            # Make the environment

            # Set Seed for repeatability
            torch.manual_seed(self.seed + i_run)
            np.random.seed(self.seed + i_run)
            self.env.seed(self.seed + i_run)
            self.env.action_space.np_random.seed(self.seed + i_run)

            # Setup TensorboardX
            writer_train = SummaryWriter(log_dir='runs/' + self.folder +
                                         'run_' + str(i_run) + '/train')
            writer_test = SummaryWriter(log_dir='runs/' + self.folder +
                                        'run_' + str(i_run) + '/test')

            # Setup Replay Memory
            memory = ReplayMemory(self.replay_size)

            # TRAINING LOOP
            total_numsteps = updates = running_episode_reward = running_episode_reward_100 = 0
            rewards = []
            i_episode = 0
            last_episode_steps = 0
            while True:
                self.env.stop_all_motors()
                while self.env.is_human_controlled():
                    continue
                if self.env.is_forget_enabled():
                    self.restore_model()
                    memory.forget_last(last_episode_steps)
                    i_episode -= 1
                    self.logger.info("Last Episode Forgotten")
                if self.env.is_test_phase():
                    self.test_phase(i_run, i_episode, writer_test)
                    continue
                if i_episode > self.num_episode:
                    break
                self.backup_model()
                self.logger.important(f"START EPISODE {i_episode}")
                ts = time.time()
                episode_reward = episode_steps = 0
                done = False
                info = {'undo': False}
                state = self.env.reset()
                state_buffer = None
                if self.pics:
                    state_buffer = StateBuffer(self.state_buffer_size, state)
                    state = state_buffer.get_state()

                critic_1_loss_acc = critic_2_loss_acc = policy_loss_acc = ent_loss_acc = alpha_acc = 0

                while not done:
                    if self.pics:
                        writer_train.add_image(
                            'episode_{}'.format(str(i_episode)),
                            state_buffer.get_tensor(), episode_steps)
                    if len(memory) < self.warm_up_steps:
                        action = self.env.action_space.sample()
                    else:
                        action = self.select_action(
                            state)  # Sample action from policy
                        if len(memory) > self.batch_size:
                            # Number of updates per step in environment
                            for i in range(self.updates_per_step):
                                # Update parameters of all the networks
                                critic_1_loss, critic_2_loss, policy_loss, ent_loss, alpha = self.update_parameters(
                                    memory, self.batch_size, updates)

                                critic_1_loss_acc += critic_1_loss
                                critic_2_loss_acc += critic_2_loss
                                policy_loss_acc += policy_loss
                                ent_loss_acc += ent_loss
                                alpha_acc += alpha
                                updates += 1

                    next_state, reward, done, info = self.env.step(
                        action)  # Step
                    if self.pics:
                        state_buffer.push(next_state)
                        next_state = state_buffer.get_state()
                    episode_steps += 1
                    total_numsteps += 1
                    episode_reward += reward
                    mask = 1 if done else float(not done)
                    memory.push(state, action, reward, next_state,
                                mask)  # Append transition to memory

                    state = next_state
                last_episode_steps = episode_steps
                i_episode += 1

                rewards.append(episode_reward)
                running_episode_reward += (episode_reward -
                                           running_episode_reward) / i_episode
                if len(rewards) < 100:
                    running_episode_reward_100 = running_episode_reward
                else:
                    last_100 = rewards[-100:]
                    running_episode_reward_100 = np.array(last_100).mean()
                writer_train.add_scalar('loss/critic_1',
                                        critic_1_loss_acc / episode_steps,
                                        i_episode)
                writer_train.add_scalar('loss/critic_2',
                                        critic_2_loss_acc / episode_steps,
                                        i_episode)
                writer_train.add_scalar('loss/policy',
                                        policy_loss_acc / episode_steps,
                                        i_episode)
                writer_train.add_scalar('loss/entropy_loss',
                                        ent_loss_acc / episode_steps,
                                        i_episode)
                writer_train.add_scalar('entropy_temperature/alpha',
                                        alpha_acc / episode_steps, i_episode)
                writer_train.add_scalar('reward/train', episode_reward,
                                        i_episode)
                writer_train.add_scalar('reward/running_mean',
                                        running_episode_reward, i_episode)
                writer_train.add_scalar('reward/running_mean_last_100',
                                        running_episode_reward_100, i_episode)
                self.logger.info(
                    "Ep. {}/{}, t {}, r_t {}, 100_mean {}, time_spent {}s | {}s "
                    .format(
                        i_episode, self.num_episode, episode_steps,
                        round(episode_reward, 2),
                        round(running_episode_reward_100, 2),
                        round(time.time() - ts, 2),
                        str(datetime.timedelta(seconds=time.time() - in_ts))))
            self.env.close()
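The reward bookkeeping in Example #11 (running_episode_reward += (episode_reward - running_episode_reward) / i_episode) is the standard incremental-mean recurrence; a quick self-contained check of that identity:

rewards = [10.0, 0.0, 5.0, 7.0]
running_mean = 0.0
for i, r in enumerate(rewards, start=1):
    running_mean += (r - running_mean) / i   # same update as in the example
assert abs(running_mean - sum(rewards) / len(rewards)) < 1e-12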
Example #12
class neonDQN(object):
    def __init__(self, input_shape, action_space):
        self._debug = 0
        self.mode = 'train'
        self.input_shape = input_shape
        self.action_space = action_space
        self.prev_action = action_space.sample()
        self.action_space_size = action_space.n
        self.steps = 0
        self.prelearning_steps = 50000  #50000
        self.total_steps = 10000  #1000000
        self.history_length = input_shape[0]
        self.history_step = 0
        self.observation_buffer = np.zeros(input_shape)
        # self.prev_state = np.zeros(input_shape[1:])
        # learning related
        self.learning_rate = 0.00025
        self.rmsprop_gamma2 = 1
        # experience replay related
        self.memoryIdx = 0
        self.memoryFillCount = 0
        self.memoryLimit = 50000  #1000000
        self.sampleSize = 32

        self.states = np.zeros((self.memoryLimit, ) + self.input_shape[1:],
                               dtype='uint8')
        self.actions = np.zeros((self.memoryLimit, ), dtype='uint8')
        self.rewards = np.zeros((self.memoryLimit, ))
        self.nextStates = np.zeros_like(self.states, dtype='uint8')
        self.dones = np.zeros_like(self.actions, dtype='bool')
        # target network update related
        self.targetNetC = 4  #10000
        # Q learning related
        self.gamma = 0.99

        #build Q-learning networks
        print "building network......"
        self.args = self.generate_parameter()
        self.net = self.build_network(self.args)
        self.mem = ReplayMemory(self.memoryLimit, self.args)

        np.set_printoptions(precision=4, suppress=True)

    def act(self, observation):
        observation = self.preprocess_state(observation)
        self.observation_buffer[:-1, ...] = self.observation_buffer[1:, ...]
        self.observation_buffer[-1, ...] = observation

        if self.mode == 'train':
            epsilon = max(
                0.1, 1 - max(self.steps - self.prelearning_steps, 0) /
                float(self.total_steps))  # float() avoids Python 2 integer division
        elif self.mode == 'test':
            epsilon = .05
        else:
            assert False

        action = self.choose_action(self.observation_buffer, epsilon)
        return action

    def observe(self, state, action, reward, nextState, done):
        if self.mode == 'test':
            return

        state = self.preprocess_state(state)
        # self.prev_state = state
        nextState = self.preprocess_state(nextState)
        # self.prev_state = nextState

        self.steps += 1
        # ==========================================================
        # plt.figure(2)
        # plt.subplot(3, 1, 1)
        # plt.imshow(state)
        # plt.title("action: " + str(action) + "reward: " + str(reward)
        #           + "done: " + str(done))
        # plt.colorbar()
        # plt.subplot(3, 1, 2)
        # plt.imshow(nextState)
        # plt.subplot(3, 1, 3)
        # plt.imshow(nextState.astype('int16') - state)
        # plt.colorbar()
        # plt.show()
        # ==========================================================
        self.putInMemory(state, action, reward, nextState, done)
        # ==========================================================
        self.mem.add(action, reward, nextState, done)
        # ==========================================================

        if self.steps - self.prelearning_steps > 0:  # learning starts

            # state, action, reward, nextState, done = self.sampleFromMemory()
            # ==========================================================
            state, action, reward, nextState, done = self.mem.getMinibatch()
            # ==========================================================
            self.train(state, action, reward, nextState, done)

    def preprocess_state(self, state):
        # state_resize = imresize(state, (84, 84, 3))
        # state_resize_gray = np.mean(state_resize, axis=2)
        # max_state = np.maximum(prev_state, state_resize_gray)
        # return max_state.astype('uint8')
        state = cv2.resize(cv2.cvtColor(state, cv2.COLOR_RGB2GRAY),
                           self.input_shape[1:])
        return state

    def putInMemory(self, state, action, reward, nextState, done):
        memoryIdx = self.memoryIdx
        self.states[memoryIdx, ...] = state
        self.actions[memoryIdx, ...] = action
        self.rewards[memoryIdx, ...] = reward
        self.nextStates[memoryIdx, ...] = nextState
        self.dones[memoryIdx, ...] = done

        self.memoryIdx += 1
        self.memoryFillCount = max(self.memoryFillCount, self.memoryIdx)
        assert self.memoryFillCount <= self.memoryLimit
        self.memoryIdx = self.memoryIdx % self.memoryLimit

    def sampleFromMemory(self):
        # sampleIdx = np.random.permutation(self.memoryLimit)
        # sampleIdx = sampleIdx[:self.sampleSize]
        #
        # state = np.zeros((self.sampleSize,) + self.states.shape[1:])
        # action = np.zeros((self.sampleSize,) + self.actions.shape[1:], dtype='int')
        # reward = np.zeros((self.sampleSize,) + self.rewards.shape[1:])
        # nextState = np.zeros((self.sampleSize,) + self.nextStates.shape[1:])
        # done = np.zeros((self.sampleSize,) + self.dones.shape[1:], dtype='int')
        #
        # for i in xrange(self.sampleSize):
        #     state[i] = self.states[sampleIdx[i]]
        #     action[i] = self.actions[sampleIdx[i]]
        #     reward[i] = self.rewards[sampleIdx[i]]
        #     nextState[i] = self.nextStates[sampleIdx[i]]
        #     done[i] = self.dones[sampleIdx[i]]
        #
        # return state, action, reward, nextState, done
        #==================================================================================================
        state = np.zeros(
            (self.sampleSize, self.history_length) + self.states.shape[1:],
            dtype='uint8')
        nextState = np.zeros(
            (self.sampleSize, self.history_length) + self.nextStates.shape[1:],
            dtype='uint8')
        indexes = []
        while len(indexes) < self.sampleSize:
            # find random index
            while True:
                # sample one index (ignore states wrapping over the write pointer or an episode end)
                index = random.randint(self.history_length - 1,
                                       self.memoryFillCount - 1)
                # if wraps over current pointer, then get new one
                if index >= self.memoryIdx and index - (self.history_length -
                                                        1) < self.memoryIdx:
                    continue
                # if wraps over episode end, then get new one
                # NB! poststate (last screen) can be terminal state!
                if self.dones[(index - self.history_length + 1):index].any():
                    continue
                # if (self.rewards[(index - self.history_length + 1):index] != 0).any():
                #     continue
                # otherwise use this index
                break

            # NB! having index first is fastest in C-order matrices
            assert index >= self.history_length - 1
            assert index <= self.memoryLimit - 1
            state[len(indexes),
                  ...] = self.states[(index -
                                      (self.history_length - 1)):(index + 1),
                                     ...]
            nextState[len(indexes), ...] = self.nextStates[(
                index - (self.history_length - 1)):(index + 1), ...]
            indexes.append(index)

        # copy actions, rewards and terminals with direct slicing
        action = self.actions[indexes]
        reward = self.rewards[indexes]
        done = self.dones[indexes]
        return state, action, reward, nextState, done

    def build_network(self, args):
        net = DeepQNetwork(self.action_space_size, args)
        return net

    def choose_action(self, state, epsilon):
        if np.random.rand() < epsilon:
            return self.action_space.sample()
        else:
            return self.greedy(state)

    def greedy(self, state):
        # predict the Q values at current state
        state = state[np.newaxis, :]
        #replicate by batch_size
        state = np.tile(state, (self.sampleSize, 1, 1, 1))

        # ======================================================
        q = self.net.predict(state)
        #======================================================
        # q = self._network_forward(self.network, state)
        # ======================================================

        q = q[0, :]
        # return the index of maximum Q value
        return np.argmax(q)

    def _network_forward(self, net, state):
        assert state.shape[0] == self.sampleSize
        assert state.shape[1] == self.input_shape[0]

        state = state / 255.0
        arg_arrays = net.arg_dict
        train_iter = mx.io.NDArrayIter(data=state, batch_size=state.shape[0])
        data = arg_arrays[train_iter.provide_data[0][0]]

        q = []
        for batch in train_iter:
            # Copy data to executor input. Note the [:].
            data[:] = batch.data[0]

            self.network.forward(is_train=False)

            q = self.network.outputs[0]

        return q.asnumpy()

    def train(self, state, action, reward, nextState, done):
        epoch = 0
        minibatch = state, action, reward, nextState, done
        self.net.train(minibatch, epoch)
        # reward = np.clip(reward, -1, 1)
        #
        #
        # future_Qvalue = self._network_forward(self.targetNetwork, nextState)
        # future_reward = np.max(future_Qvalue, axis=1)
        # future_reward = future_reward[:, np.newaxis]
        #
        # nonzero_reward_list = np.nonzero(reward)
        # # reward += (1-done)*self.gamma*future_reward
        # reward += (1-abs(reward))*self.gamma*future_reward
        #
        # target_reward = self._network_forward(self.network, state)
        # old_target_reward = copy.deepcopy(target_reward)
        # for i in xrange(self.sampleSize):
        #     # target_reward[i][action[i]] = reward[i]
        #     # clip error to [-1, 1], Mnih 2015 Nature
        #     target_reward[i][action[i]] = max(min(reward[i], target_reward[i][action[i]]+1), target_reward[i][action[i]]-1)
        #
        # #=======================================================================
        # if self._debug:
        #     print "reward:", reward.transpose()
        #     print "future_reward:", future_reward.transpose()
        #     print "action:", action.transpose()
        #     print "done: ", done.transpose()
        #     figure_id = 0
        #     for batch_i in nonzero_reward_list[0]:
        #         if 1: #reward[batch_i, ...] != 0:
        #             figure_id += 1
        #             plt.figure(figure_id)
        #             for plot_i in range(0, self.history_length):
        #                 plt.subplot(3, self.history_length, plot_i + 1)
        #                 plt.imshow(state[batch_i, plot_i, ...])
        #                 plt.title("action: " + str(action[batch_i, ...]) + "reward: " + str(reward[batch_i, ...])
        #                           + "done: " + str(done[batch_i, ...]))
        #                 plt.colorbar()
        #
        #                 plt.subplot(3, self.history_length, plot_i + 1 + self.history_length)
        #                 plt.imshow(nextState[batch_i, plot_i, ...])
        #
        #                 plt.subplot(3, self.history_length, plot_i + 1 + self.history_length * 2)
        #                 plt.imshow(nextState[batch_i, plot_i, ...].astype('int16') - state[batch_i, plot_i, ...])
        #                 if plot_i == 0:
        #                     plt.title("reward: " + str(reward[batch_i, ...])
        #                           + " target reward: " + str(target_reward[batch_i, ...])
        #                           + " old reward: " + str(old_target_reward[batch_i, ...]))
        #                 plt.colorbar()
        #
        #     plt.show()
        #     # raw_input()
        # #=======================================================================
        #
        # train_data = state / 255.0
        # train_label = target_reward
        #
        #
        # # First we get handle to input arrays
        # arg_arrays = self.network.arg_dict
        # batch_size = self.sampleSize
        # train_iter = mx.io.NDArrayIter(data=train_data, label=train_label, batch_size=batch_size, shuffle=False)
        # # val_iter = mx.io.NDArrayIter(data=val_data, label=val_label, batch_size=batch_size)
        # data = arg_arrays[train_iter.provide_data[0][0]]
        # label = arg_arrays[train_iter.provide_label[0][0]]
        #
        # # opt = mx.optimizer.RMSProp(
        # #     learning_rate= self.learning_rate,
        # #     gamma2 = self.rmsprop_gamma2)
        #
        # opt = mx.optimizer.Adam(
        #     learning_rate=self.learning_rate)
        #
        # updater = mx.optimizer.get_updater(opt)
        #
        # # Finally we need a metric to print out training progress
        # metric = mx.metric.MSE()
        #
        # # Training loop begines
        # train_iter.reset()
        # metric.reset()
        #
        # for batch in train_iter:
        #     # Copy data to executor input. Note the [:].
        #     data[:] = batch.data[0]
        #     label[:] = batch.label[0]
        #
        #     # Forward
        #     self.network.forward(is_train=True)
        #
        #     # You perform operations on exe.outputs here if you need to.
        #     # For example, you can stack a CRF on top of a neural network.
        #
        #     # Backward
        #     self.network.backward()
        #
        #     # Update
        #     for i, pair in enumerate(zip(self.network.arg_arrays, self.network.grad_arrays)):
        #         weight, grad = pair
        #         updater(i, grad, weight)
        #     metric.update(batch.label, self.network.outputs)
        #
        #     if self.steps % 1000 == 0:
        #         print 'steps:', self.steps, 'metric:', metric.get()
        #         print 'network.outputs:', self.network.outputs[0].asnumpy()
        #         print 'label:', batch.label[0].asnumpy()
        #         # np.set_printoptions(precision=4)
        #         print 'delta: ', (batch.label[0].asnumpy() - self.network.outputs[0].asnumpy())
        # # t = 0
        # # metric.reset()
        # # for batch in val_iter:
        # #     # Copy data to executor input. Note the [:].
        # #     data[:] = batch.data[0]
        # #     label[:] = batch.label[0]
        # #
        # #     # Forward
        # #     self.network.forward(is_train=False)
        # #     metric.update(batch.label, self.network.outputs)
        # #     t += 1
        # #     if t % 50 == 0:
        # #         print 'epoch:', epoch, 'test iter:', t, 'metric:', metric.get()
        #
        # #========================================================================
        # #sync target-network with network as mentioned in Mnih et al. Nature 2015
        if self.steps % self.targetNetC == 0:
            self.net.update_target_network()
        #     self.targetNetwork.copy_params_from(self.network.arg_dict, self.network.aux_dict)

    # Basic Conv + BN + ReLU factory
    def ConvFactory(self,
                    data,
                    num_filter,
                    kernel,
                    stride=(1, 1),
                    pad=(0, 0),
                    act_type="relu"):
        # there is an optional parameter ```workspace``` that may influence convolution performance
        # by default, the workspace is set to 256 (MB)
        # you may set a larger value, but the convolution layer only requires what it needs
        # MXNet will handle reuse of the workspace without parallelism conflict
        conv = mx.symbol.Convolution(data=data,
                                     workspace=256,
                                     num_filter=num_filter,
                                     kernel=kernel,
                                     stride=stride,
                                     pad=pad)
        # bn = mx.symbol.BatchNorm(data=conv)
        act = mx.symbol.Activation(data=conv, act_type=act_type)
        return act

    def generate_parameter(self):
        def str2bool(v):
            return v.lower() in ("yes", "true", "t", "1")

        parser = argparse.ArgumentParser()

        envarg = parser.add_argument_group('Environment')
        envarg.add_argument(
            "--game",
            default="Catcher-v0",
            help=
            "ROM bin file or env id such as Breakout-v0 if training with Open AI Gym."
        )
        envarg.add_argument(
            "--environment",
            choices=["ale", "gym"],
            default="ale",
            help="Whether to train agent using ALE or OpenAI Gym.")
        envarg.add_argument(
            "--display_screen",
            type=str2bool,
            default=False,
            help="Display game screen during training and testing.")
        # envarg.add_argument("--sound", type=str2bool, default=False, help="Play (or record) sound.")
        envarg.add_argument(
            "--frame_skip",
            type=int,
            default=4,
            help="How many times to repeat each chosen action.")
        envarg.add_argument(
            "--repeat_action_probability",
            type=float,
            default=0,
            help=
            "Probability, that chosen action will be repeated. Otherwise random action is chosen during repeating."
        )
        envarg.add_argument("--minimal_action_set",
                            dest="minimal_action_set",
                            type=str2bool,
                            default=True,
                            help="Use minimal action set.")
        envarg.add_argument(
            "--color_averaging",
            type=str2bool,
            default=True,
            help="Perform color averaging with previous frame.")
        envarg.add_argument("--screen_width",
                            type=int,
                            default=64,
                            help="Screen width after resize.")
        envarg.add_argument("--screen_height",
                            type=int,
                            default=64,
                            help="Screen height after resize.")
        envarg.add_argument(
            "--record_screen_path",
            default="./",
            help=
            "Record game screens under this path. Subfolder for each game is created."
        )
        envarg.add_argument("--record_sound_filename",
                            default="./",
                            help="Record game sound in this file.")

        memarg = parser.add_argument_group('Replay memory')
        memarg.add_argument("--replay_size",
                            type=int,
                            default=50000,
                            help="Maximum size of replay memory.")
        memarg.add_argument("--history_length",
                            type=int,
                            default=4,
                            help="How many screen frames form a state.")

        netarg = parser.add_argument_group('Deep Q-learning network')
        netarg.add_argument("--learning_rate",
                            type=float,
                            default=0.00025,
                            help="Learning rate.")
        netarg.add_argument("--discount_rate",
                            type=float,
                            default=0.99,
                            help="Discount rate for future rewards.")
        netarg.add_argument("--batch_size",
                            type=int,
                            default=32,
                            help="Batch size for neural network.")
        netarg.add_argument('--optimizer',
                            choices=['rmsprop', 'adam', 'adadelta'],
                            default='rmsprop',
                            help='Network optimization algorithm.')
        netarg.add_argument(
            "--decay_rate",
            type=float,
            default=0.95,
            help="Decay rate for RMSProp and Adadelta algorithms.")
        netarg.add_argument(
            "--clip_error",
            type=float,
            default=1,
            help=
            "Clip error term in update between this number and its negative.")
        netarg.add_argument("--min_reward",
                            type=float,
                            default=-1,
                            help="Minimum reward.")
        netarg.add_argument("--max_reward",
                            type=float,
                            default=1,
                            help="Maximum reward.")
        netarg.add_argument("--batch_norm",
                            type=str2bool,
                            default=False,
                            help="Use batch normalization in all layers.")

        # netarg.add_argument("--rescale_r", type=str2bool, help="Rescale rewards.")
        # missing: bufferSize=512,valid_size=500,min_reward=-1,max_reward=1

        neonarg = parser.add_argument_group('Neon')
        neonarg.add_argument('--backend',
                             choices=['cpu', 'gpu'],
                             default='gpu',
                             help='backend type')
        neonarg.add_argument('--device_id',
                             type=int,
                             default=0,
                             help='gpu device id (only used with GPU backend)')
        neonarg.add_argument(
            '--datatype',
            choices=['float16', 'float32', 'float64'],
            default='float32',
            help=
            'default floating point precision for backend [f64 for cpu only]')
        neonarg.add_argument(
            '--stochastic_round',
            const=True,
            type=int,
            nargs='?',
            default=False,
            help=
            'use stochastic rounding [will round to BITS number of bits if specified]'
        )

        antarg = parser.add_argument_group('Agent')
        antarg.add_argument("--exploration_rate_start",
                            type=float,
                            default=1,
                            help="Exploration rate at the beginning of decay.")
        antarg.add_argument("--exploration_rate_end",
                            type=float,
                            default=0.1,
                            help="Exploration rate at the end of decay.")
        antarg.add_argument(
            "--exploration_decay_steps",
            type=float,
            default=10000,
            help="How many steps to decay the exploration rate.")
        antarg.add_argument("--exploration_rate_test",
                            type=float,
                            default=0.05,
                            help="Exploration rate used during testing.")
        antarg.add_argument(
            "--train_frequency",
            type=int,
            default=4,
            help="Perform training after this many game steps.")
        antarg.add_argument(
            "--train_repeat",
            type=int,
            default=1,
            help="Number of times to sample minibatch during training.")
        antarg.add_argument(
            "--target_steps",
            type=int,
            default=4,
            help=
            "Copy main network to target network after this many game steps.")
        antarg.add_argument(
            "--random_starts",
            type=int,
            default=30,
            help=
            "Perform max this number of dummy actions after game restart, to produce more random game dynamics."
        )

        nvisarg = parser.add_argument_group('Visualization')
        nvisarg.add_argument(
            "--visualization_filters",
            type=int,
            default=4,
            help="Number of filters to visualize from each convolutional layer."
        )
        nvisarg.add_argument("--visualization_file",
                             default="tmp",
                             help="Write layer visualization to this file.")

        mainarg = parser.add_argument_group('Main loop')
        mainarg.add_argument(
            "--random_steps",
            type=int,
            default=50000,
            help=
            "Populate replay memory with random steps before starting learning."
        )
        mainarg.add_argument("--train_steps",
                             type=int,
                             default=250000,
                             help="How many training steps per epoch.")
        mainarg.add_argument("--test_steps",
                             type=int,
                             default=125000,
                             help="How many testing steps after each epoch.")
        mainarg.add_argument("--epochs",
                             type=int,
                             default=200,
                             help="How many epochs to run.")
        mainarg.add_argument(
            "--start_epoch",
            type=int,
            default=0,
            help=
            "Start from this epoch, affects exploration rate and names of saved snapshots."
        )
        mainarg.add_argument(
            "--play_games",
            type=int,
            default=0,
            help="How many games to play, suppresses training and testing.")
        mainarg.add_argument("--load_weights", help="Load network from file.")
        mainarg.add_argument(
            "--save_weights_prefix",
            help=
            "Save network to given file. Epoch and extension will be appended."
        )
        mainarg.add_argument("--csv_file",
                             help="Write training progress to this file.")

        comarg = parser.add_argument_group('Common')
        comarg.add_argument("--random_seed",
                            type=int,
                            help="Random seed for repeatable experiments.")
        comarg.add_argument(
            "--log_level",
            choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"],
            default="INFO",
            help="Log level.")
        args = parser.parse_args()
        return args
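The act() method in Example #12 keeps a frame-history stack by shifting the observation buffer one slot and writing the newest frame into the last position. A quick self-contained check of that shift (dummy frames only):

import numpy as np

buffer = np.arange(4)[:, None, None] * np.ones((4, 2, 2))  # four dummy frames filled with 0..3
new_frame = np.full((2, 2), 9.0)
buffer[:-1, ...] = buffer[1:, ...]   # drop the oldest frame
buffer[-1, ...] = new_frame          # append the newest frame
assert [int(f[0, 0]) for f in buffer] == [1, 2, 3, 9]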
Example #13
class Agent:
    def __init__(self, dimO, dimA):
        dimA = list(dimA)
        dimO = list(dimO)

        nets = nets_dm

        # init replay memory
        self.rm = ReplayMemory(rm_size, dimO, dimA, dtype=np.__dict__[rm_dtype])

        # own replay memory
        self.replay_memory = deque(maxlen=rm_size)

        # start tf session
        self.sess = tf.Session(config=tf.ConfigProto(
            inter_op_parallelism_threads=threads,
            log_device_placement=False,
            allow_soft_placement=True))

        # create tf computational graph
        #
        self.theta_p = nets.theta_p(dimO, dimA)
        self.theta_q = nets.theta_q(dimO, dimA)
        self.theta_pt, update_pt = exponential_moving_averages(self.theta_p, tau)
        self.theta_qt, update_qt = exponential_moving_averages(self.theta_q, tau)

        obs = tf.placeholder(tf.float32, [None] + dimO, "obs")
        act_test, sum_p = nets.policy(obs, self.theta_p)

        # explore
        noise_init = tf.zeros([1] + dimA)
        noise_var = tf.Variable(noise_init)
        self.ou_reset = noise_var.assign(noise_init)
        noise = noise_var.assign_sub((ou_theta) * noise_var - tf.random_normal(dimA, stddev=ou_sigma))
        act_expl = act_test + noise

        # test
        q, sum_q = nets.qfunction(obs, act_test, self.theta_q, name= 'q_mu_of_s')
        # training
        # policy loss
        meanq = tf.reduce_mean(q, 0)
        wd_p = tf.add_n([pl2 * tf.nn.l2_loss(var) for var in self.theta_p])  # weight decay
        loss_p = -meanq + wd_p
        # policy optimization
        optim_p = tf.train.AdamOptimizer(learning_rate=lrp)
        grads_and_vars_p = optim_p.compute_gradients(loss_p, var_list=self.theta_p)
        optimize_p = optim_p.apply_gradients(grads_and_vars_p)
        with tf.control_dependencies([optimize_p]):
            train_p = tf.group(update_pt)

        # q optimization
        act_train = tf.placeholder(tf.float32, [FLAGS.bsize] + dimA, "act_train")
        rew = tf.placeholder(tf.float32, [FLAGS.bsize], "rew")
        obs2 = tf.placeholder(tf.float32, [FLAGS.bsize] + dimO, "obs2")
        term2 = tf.placeholder(tf.bool, [FLAGS.bsize], "term2")
        # q
        q_train, sum_qq = nets.qfunction(obs, act_train, self.theta_q, name= 'qs_a')
        # q targets
        act2, sum_p2 = nets.policy(obs2, theta=self.theta_pt)
        q2, sum_q2 = nets.qfunction(obs2, act2, theta=self.theta_qt, name='qsprime_aprime')
        q_target = tf.stop_gradient(tf.select(term2, rew, rew + discount * q2))
        # q_target = tf.stop_gradient(rew + discount * q2)
        # q loss
        td_error = q_train - q_target
        ms_td_error = tf.reduce_mean(tf.square(td_error), 0)
        wd_q = tf.add_n([ql2 * tf.nn.l2_loss(var) for var in self.theta_q])  # weight decay
        loss_q = ms_td_error + wd_q
        # q optimization
        optim_q = tf.train.AdamOptimizer(learning_rate=lrq)
        grads_and_vars_q = optim_q.compute_gradients(loss_q, var_list=self.theta_q)
        optimize_q = optim_q.apply_gradients(grads_and_vars_q)
        with tf.control_dependencies([optimize_q]):
            train_q = tf.group(update_qt)

        # logging
        log_obs = [] if dimO[0] > 20 else [tf.histogram_summary("obs/" + str(i), obs[:, i]) for i in range(dimO[0])]
        log_act = [] if dimA[0] > 20 else [tf.histogram_summary("act/inf" + str(i), act_test[:, i]) for i in
                                           range(dimA[0])]
        log_act2 = [] if dimA[0] > 20 else [tf.histogram_summary("act/train" + str(i), act_train[:, i]) for i in
                                            range(dimA[0])]
        log_misc = [sum_p, sum_qq, tf.histogram_summary("td_error", td_error)]
        log_grad = [grad_histograms(grads_and_vars_p), grad_histograms(grads_and_vars_q)]
        log_noise = [tf.histogram_summary('noise', noise_var)]
        log_train = log_obs + log_act + log_act2 + log_misc + log_grad + log_noise

        merged = tf.merge_all_summaries()
        # initialize tf log writer
        self.writer = tf.train.SummaryWriter(FLAGS.outdir + "/tf", self.sess.graph, flush_secs=20)

        # init replay memory for recording episodes
        max_ep_length = 10000
        self.rm_log = ReplayMemory(max_ep_length, dimO, dimA, rm_dtype)

        # tf functions
        with self.sess.as_default():
            self.act_test = Fun(obs, act_test)
            self._act_expl = Fun(obs, act_expl)
            self._reset = Fun([], self.ou_reset)
            self._train_q = Fun([obs, act_train, rew, obs2, term2], [train_q], log_train, self.writer)
            self._train_p = Fun([obs], [train_p], log_obs, self.writer)
            self._train = Fun([obs, act_train, rew, obs2, term2], [train_p, train_q], merged, self.writer)

        # initialize tf variables
        self.saver = tf.train.Saver(max_to_keep=1)
        ckpt = tf.train.latest_checkpoint(FLAGS.outdir + "/tf")
        if ckpt:
            self.saver.restore(self.sess, ckpt)
        else:
            self.sess.run(tf.initialize_all_variables())

        self.sess.graph.finalize()

        self.t = 0  # global training time (number of observations)

    def reset(self, obs):
        self._reset()
        self.observation = obs  # initial observation

    def act(self, test=False):
        obs = np.expand_dims(self.observation, axis=0)
        action = self.act_test(obs) if test else self._act_expl(obs)
        self.action = np.atleast_1d(np.squeeze(action, axis=0))  # TODO: remove this hack
        return self.action

    def observe(self, rew, term, obs2, test=False, perform_trainstep=True):

        obs1 = self.observation
        self.observation = obs2

        # train
        if not test:
            self.t = self.t + 1
            self.rm.enqueue(obs1, term, self.action, rew)
            self.replay_memory.append((obs1, self.action, rew, obs2, term))

            if self.t > FLAGS.warmup:
                # print('warmed up')
                if perform_trainstep: self.train()

            # elif FLAGS.warmq and self.rm.n > 1000:
            #     # Train Q on warmup
            #     obs, act, rew, ob2, term2, info = self.rm.minibatch(size=FLAGS.bsize)
            #     self._train_q(obs, act, rew, ob2, term2, log=(np.random.rand() < FLAGS.log), global_step=self.t)

                # save parameters etc.
                # if (self.t+45000) % 50000 == 0: # TODO: correct
                #   s = self.saver.save(self.sess,FLAGS.outdir+"f/tf/c",self.t)
                #   print("DDPG Checkpoint: " + s)

    def train(self):
        # obs, act, rew, ob2, term2, info = self.rm.minibatch(size=FLAGS.bsize)
        obs, act, rew, ob2, term2 = self.get_train_batch()
        log = (np.random.rand() < FLAGS.log)

        if FLAGS.async:
            self._train(obs, act, rew, ob2, term2, log=log, global_step=self.t)
        else:
            self._train_q(obs, act, rew, ob2, term2, log=log, global_step=self.t)
            self._train_p(obs, log=log, global_step=self.t)

    def write_scalar(self, tag, val):
        s = tf.Summary(value=[tf.Summary.Value(tag=tag, simple_value=val)])
        self.writer.add_summary(s, self.t)

    def __del__(self):
        self.sess.close()

    def get_train_batch(self):

        # select transitions uniformly at random from the replay memory
        indices = np.random.randint(0, len(self.replay_memory), [FLAGS.bsize])
        transition_batch = [self.replay_memory[i] for i in indices]

        states = np.asarray([transition_batch[i][0].squeeze() for i in range(FLAGS.bsize)])
        actions = np.asarray([transition_batch[i][1] for i in range(FLAGS.bsize)])
        rewards = np.asarray([transition_batch[i][2] for i in range(FLAGS.bsize)])
        states_prime = np.asarray([transition_batch[i][3].squeeze() for i in range(FLAGS.bsize)])
        term2 = np.asarray([transition_batch[i][4] for i in range(FLAGS.bsize)])

        return states, actions, rewards, states_prime, term2
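
For reference, a minimal standalone sketch of the uniform-sampling pattern used by `get_train_batch` above, assuming a `deque` buffer of `(obs, action, reward, obs2, terminal)` tuples; the capacity and batch size are illustrative stand-ins (not the project's `FLAGS`), and this is not the project's code:

# Minimal sketch (not the project's code): uniform minibatch sampling from a
# deque-based replay buffer of (obs, action, reward, obs2, terminal) tuples.
from collections import deque

import numpy as np

replay_memory = deque(maxlen=100000)   # illustrative capacity
batch_size = 32                        # illustrative batch size


def store(obs, action, reward, obs2, terminal):
    """Append one transition; the deque drops the oldest entry when full."""
    replay_memory.append((obs, action, reward, obs2, terminal))


def sample_batch():
    """Draw indices uniformly (with replacement) and stack each field."""
    indices = np.random.randint(0, len(replay_memory), size=batch_size)
    batch = [replay_memory[i] for i in indices]
    obs, act, rew, obs2, term = map(np.asarray, zip(*batch))
    return obs, act, rew, obs2, term


if __name__ == '__main__':
    rng = np.random.RandomState(0)
    for _ in range(1000):
        o, o2 = rng.randn(3), rng.randn(3)
        store(o, rng.randn(1), float(rng.randn()), o2, False)
    print([a.shape for a in sample_batch()])
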
Exemplo n.º 15
0
MAX_YAW = 2 * np.pi
MAX_X = 20
MAX_Y = 20
max_lidar_value = 14
THRESHOLD_DISTANCE_2_GOAL = 0.2 / max(MAX_X, MAX_Y)
UPDATE_EVERY = 5
count = 0
total_numsteps = 0
updates = 0
num_goal_reached = 0
done = False
i_episode = 1
episode_reward = 0
max_ep_reward = 0
episode_steps = 0
memory = ReplayMemory(args.replay_size, args.seed)


class DeepracerGym(gym.Env):
    def __init__(self, target_point):
        super(DeepracerGym, self).__init__()

        n_actions = 2  # velocity, steering
        metadata = {'render.modes': ['console']}
        #self.action_space = spaces.Discrete(n_actions)
        self.action_space = spaces.Box(np.array([0., -1.]),
                                       np.array([1., 1.]),
                                       dtype=np.float32)  # speed and steering
        # self.pose_observation_space = spaces.Box(np.array([-1. , -1., -1.]),np.array([1., 1., 1.]),dtype = np.float32)
        # self.lidar_observation_space = spaces.Box(0,1.,shape=(720,),dtype = np.float32)
        # self.observation_space = spaces.Tuple((self.pose_observation_space,self.lidar_observation_space))
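
The snippet above constructs `ReplayMemory(args.replay_size, args.seed)` without showing the class itself; below is a hedged sketch of a typical fixed-capacity, seeded buffer with `push`/`sample` methods. The method names and internal layout are assumptions for illustration, not the project's implementation:

# Minimal sketch (assumed interface): a fixed-capacity, seeded replay buffer
# compatible with a `ReplayMemory(capacity, seed)` constructor as used above.
import random

import numpy as np


class ReplayMemory:
    def __init__(self, capacity, seed):
        random.seed(seed)          # seed the sampler for reproducibility
        self.capacity = capacity
        self.buffer = []
        self.position = 0          # next slot to overwrite once full

    def push(self, state, action, reward, next_state, done):
        """Store one transition, overwriting the oldest once at capacity."""
        if len(self.buffer) < self.capacity:
            self.buffer.append(None)
        self.buffer[self.position] = (state, action, reward, next_state, done)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        """Uniform sample without replacement, stacked field-by-field."""
        batch = random.sample(self.buffer, batch_size)
        state, action, reward, next_state, done = map(np.stack, zip(*batch))
        return state, action, reward, next_state, done

    def __len__(self):
        return len(self.buffer)
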
Exemplo n.º 16
0
class GAN:
    def __init__(self, cfg, restore=False):
        sess_config = tf.ConfigProto(allow_soft_placement=True)
        sess_config.gpu_options.allow_growth = True
        sess_config.gpu_options.per_process_gpu_memory_fraction = 0.4
        self.sess = tf.Session(config=sess_config)
        self.cfg = cfg
        assert cfg.gan == 'ls' or cfg.gan == 'w'
        self.dir = os.path.join('models', cfg.name)
        self.image_dir = os.path.join(self.dir,
                                      'images-' + cfg.name.replace('/', '-'))
        self.dump_dir = os.path.join(self.dir,
                                     'dump-' + cfg.name.replace('/', '-'))
        if not os.path.exists(self.dir):
            os.makedirs(self.dir)
        if not os.path.exists(self.dump_dir):
            os.makedirs(self.dump_dir)
        if not os.path.exists(self.image_dir):
            os.makedirs(self.image_dir)

        if not restore:
            self.backup_scripts()
            self.tee = Tee(os.path.join(self.dir, 'log.txt'))

        self.is_train = tf.placeholder(tf.int32, shape=[], name='is_train')
        self.is_training = tf.equal(self.is_train, 1)
        self.memory = ReplayMemory(cfg, load=not restore)

        self.z = self.memory.z
        self.real_data = self.memory.real_data
        self.real_data_feature = self.memory.real_data_feature
        self.fake_input = self.memory.fake_input
        self.fake_input_feature = self.memory.fake_input_feature
        self.states = self.memory.states
        self.ground_truth = self.memory.ground_truth
        self.progress = self.memory.progress

        self.surrogate_loss_addition = 0
        with tf.variable_scope('generator'):
            fake_output, self.generator_debug_output, self.generator_debugger = cfg.generator(
                [self.fake_input, self.z, self.states],
                is_train=self.is_train,
                progress=self.progress,
                cfg=cfg)
            self.fake_output, self.new_states, self.surrogate_loss_addition, self.penalty = fake_output
            self.fake_output_feature = self.fake_input_feature
            self.memory.fake_output_feature = self.fake_output_feature
            self.memory.fake_output = self.fake_output

        print(cfg.critic)
        self.real_logit, self.real_embeddings, self.test_real_gradients = cfg.critic(
            images=self.real_data, cfg=cfg, is_train=self.is_training)
        self.fake_logit, self.fake_embeddings, self.test_fake_gradients = cfg.critic(
            images=self.fake_output,
            cfg=cfg,
            reuse=True,
            is_train=self.is_training)
        self.fake_input_logit, self.fake_input_embeddings, _ = cfg.critic(
            images=self.fake_input,
            cfg=cfg,
            reuse=True,
            is_train=self.is_training)
        print('real_logit', self.real_logit.shape)

        with tf.variable_scope('rl_value'):
            print('self.states', self.states.shape)
            print('self.new_states', self.new_states.shape)
            self.old_value, _, _ = cfg.value(images=self.fake_input,
                                             states=self.states,
                                             cfg=cfg,
                                             reuse=False,
                                             is_train=self.is_training)
            self.new_value, _, _ = cfg.value(images=self.fake_output,
                                             states=self.new_states,
                                             cfg=cfg,
                                             reuse=True,
                                             is_train=self.is_training)

        stopped = self.new_states[:, STATE_STOPPED_DIM:STATE_STOPPED_DIM + 1]
        clear_final = tf.cast(
            self.new_states[:, STATE_STEP_DIM:STATE_STEP_DIM + 1] >
            self.cfg.maximum_trajectory_length, tf.float32)
        print('clear final', clear_final.shape)
        print('new_value', self.new_value.shape)
        self.new_value = self.new_value * (1.0 - clear_final)
        # Reward: the bigger, the better

        if cfg.supervised:
            self.raw_reward = (cfg.all_reward +
                               (1 - cfg.all_reward) * stopped) * (
                                   -self.fake_logit)
        else:
            if cfg.gan == 'ls':
                self.raw_reward = (cfg.all_reward +
                                   (1 - cfg.all_reward) * stopped) * (
                                       1 - (self.fake_logit - 1)**2)
            else:
                self.raw_reward = (cfg.all_reward +
                                   (1 - cfg.all_reward) * stopped) * (
                                       self.fake_logit -
                                       tf.stop_gradient(self.fake_input_logit)
                                   ) * cfg.critic_logit_multiplier
        self.reward = self.raw_reward
        if cfg.use_penalty:
            self.reward -= self.penalty
        print('new_states_slice', self.new_states)
        print('new_states_slice',
              self.new_states[:, STATE_REWARD_DIM:STATE_REWARD_DIM + 1])
        print('fake_logit', self.fake_logit.shape)

        self.exp_moving_average = tf.train.ExponentialMovingAverage(
            decay=0.99, zero_debias=True)

        # TD learning
        print('reward', self.reward.shape)
        # If it stops, future return should be zero
        self.q_value = self.reward + (
            1.0 - stopped) * cfg.discount_factor * self.new_value
        print('q', self.q_value.shape)
        self.advantage = tf.stop_gradient(self.q_value) - self.old_value
        self.v_loss = tf.reduce_mean(self.advantage**2, axis=(0, 1))

        if cfg.gan == 'ls':
            print('** LSGAN')
            self.c_loss = tf.reduce_mean(self.fake_logit**2) + tf.reduce_mean(
                (self.real_logit - 1)**2)
            if cfg.use_TD:
                routine_loss = -self.q_value * self.cfg.parameter_lr_mul
                advantage = -self.advantage
            else:
                routine_loss = -self.reward
                advantage = -self.reward
            print('routine_loss', routine_loss.shape)
            print('pg_loss', self.surrogate_loss_addition.shape)
            assert len(routine_loss.shape) == len(
                self.surrogate_loss_addition.shape)

            self.g_loss = tf.reduce_mean(routine_loss +
                                         self.surrogate_loss_addition *
                                         tf.stop_gradient(advantage))
            self.emd = self.c_loss
            self.c_average = tf.constant(0, dtype=tf.float32)
        else:
            print('** WGAN')
            self.c_loss = tf.reduce_mean(self.fake_logit - self.real_logit)
            if cfg.use_TD:
                routine_loss = -self.q_value * self.cfg.parameter_lr_mul
                advantage = -self.advantage
            else:
                routine_loss = -self.reward
                advantage = -self.reward
            print('routine_loss', routine_loss.shape)
            print('pg_loss', self.surrogate_loss_addition.shape)
            assert len(routine_loss.shape) == len(
                self.surrogate_loss_addition.shape)

            self.g_loss = tf.reduce_mean(routine_loss +
                                         self.surrogate_loss_addition *
                                         tf.stop_gradient(advantage))
            self.emd = -self.c_loss
            self.c_average = tf.reduce_mean(self.fake_logit +
                                            self.real_logit) * 0.5
        update_average = self.exp_moving_average.apply([self.c_average])
        self.c_average_smoothed = self.exp_moving_average.average(
            self.c_average)
        self.centered_fake_logit = self.fake_logit - self.c_average_smoothed
        self.fake_gradients = tf.gradients(self.fake_logit, [
            self.fake_output,
        ])[0]

        # Critic gradient norm and penalty
        alpha_dist = tf.contrib.distributions.Uniform(low=0., high=1.)
        alpha = alpha_dist.sample((cfg.batch_size, 1, 1, 1))
        interpolated = self.real_data + alpha * (self.fake_output -
                                                 self.real_data)

        inte_logit, inte_embeddings, _ = cfg.critic(images=interpolated,
                                                    cfg=cfg,
                                                    reuse=True,
                                                    is_train=self.is_training)

        gradients = tf.gradients(inte_logit, [
            interpolated,
        ])[0]

        gradient_norm = tf.sqrt(1e-6 +
                                tf.reduce_sum(gradients**2, axis=[1, 2, 3]))
        gradient_penalty = cfg.gradient_penalty_lambda * tf.reduce_mean(
            tf.maximum(gradient_norm - 1.0, 0.0)**2)
        _ = tf.summary.scalar("grad_penalty_loss", gradient_penalty)
        self.critic_gradient_norm = tf.reduce_mean(gradient_norm)
        _ = tf.summary.scalar("grad_norm", self.critic_gradient_norm)
        if cfg.gan == 'w':
            if cfg.gradient_penalty_lambda > 0:
                print('** Using gradient penalty')
                self.c_loss += gradient_penalty
        else:
            gradient_norm = tf.sqrt(
                tf.reduce_sum(self.fake_gradients**2, axis=[1, 2, 3]))
            self.critic_gradient_norm = tf.reduce_mean(gradient_norm)
            print('** NOT using gradient penalty')

        _ = tf.summary.scalar("g_loss", self.g_loss)
        _ = tf.summary.scalar("neg_c_loss", -self.c_loss)
        _ = tf.summary.scalar("EMD", self.emd)

        self.theta_g = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                         scope='generator')
        self.theta_c = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                         scope='critic')
        self.theta_v = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                         scope='rl_value')
        print('# variables')
        print('    generator:', len(self.theta_g))
        print('    value:', len(self.theta_v))
        print('    critic:', len(self.theta_c))

        self.lr_g = tf.placeholder(dtype=tf.float32, shape=[], name='lr_g')
        self.lr_c = tf.placeholder(dtype=tf.float32, shape=[], name='lr_c')

        # Optimizer for Value estimator, use the same lr as g
        self.counter_v = tf.Variable(trainable=False,
                                     initial_value=0,
                                     dtype=tf.int32)
        self.opt_v = ly.optimize_loss(loss=self.v_loss,
                                      learning_rate=self.cfg.value_lr_mul *
                                      self.lr_g,
                                      optimizer=cfg.generator_optimizer,
                                      variables=self.theta_v,
                                      global_step=self.counter_v,
                                      summaries=['gradient_norm'])

        # Optimize for Generator (Actor)
        self.counter_g = tf.Variable(trainable=False,
                                     initial_value=0,
                                     dtype=tf.int32)
        self.opt_g = ly.optimize_loss(loss=self.g_loss,
                                      learning_rate=self.lr_g,
                                      optimizer=cfg.generator_optimizer,
                                      variables=self.theta_g,
                                      global_step=self.counter_g,
                                      summaries=['gradient_norm'])

        # Optimize for Discriminator (critic in WGAN or discriminator in LSGAN)
        self.counter_c = tf.Variable(trainable=False,
                                     initial_value=0,
                                     dtype=tf.int32)
        if not self.cfg.supervised:
            self.opt_c = ly.optimize_loss(loss=self.c_loss,
                                          learning_rate=self.lr_c,
                                          optimizer=cfg.critic_optimizer,
                                          variables=self.theta_c,
                                          global_step=self.counter_c,
                                          summaries=['gradient_norm'])

            if cfg.gan == 'w' and cfg.gradient_penalty_lambda <= 0:
                print(
                    '** make sure your NN input has mean 0, as biases will also be clamped.'
                )
                # Merge the clip operations on critic variables
                # For WGAN
                clipped_var_c = [
                    tf.assign(
                        var,
                        tf.clip_by_value(var, -self.cfg.clamp_critic,
                                         self.cfg.clamp_critic))
                    for var in self.theta_c
                ]
                with tf.control_dependencies([self.opt_c]):
                    self.opt_c = tf.tuple(clipped_var_c)

            with tf.control_dependencies([self.opt_c]):
                self.opt_c = tf.group(update_average)

        self.saver = tf.train.Saver(
            max_to_keep=1)  # keep only the latest checkpoint (set max_to_keep=None to keep all)

        self.sess.run(tf.global_variables_initializer())

        self.merged_all = tf.summary.merge_all()
        self.summary_writer = tf.summary.FileWriter(self.dir, self.sess.graph)

        if not restore:
            self.fixed_feed_dict_random = self.memory.get_feed_dict(
                self.cfg.num_samples)
        self.high_res_nets = {}

    def get_training_feed_dict_and_states(self, iter):
        feed_dict, features = self.memory.get_feed_dict_and_states(
            self.cfg.batch_size)
        feed_dict[self.lr_g] = self.cfg.lr_g(iter)
        feed_dict[self.lr_c] = self.cfg.lr_c(iter)
        feed_dict[self.is_train] = 1
        return feed_dict, features

    def get_replay_feed_dict(self, iter):
        feed_dict = self.memory.get_replay_feed_dict(self.cfg.batch_size)
        feed_dict[self.lr_c] = self.cfg.lr_c(iter)
        feed_dict[self.is_train] = 1
        return feed_dict

    def train(self):
        start_t = time.time()

        g_loss_pool = []
        v_loss_pool = []
        emd_pool = []
        # critic gradient (critic logit w.r.t. critic input image) norm
        cgn = 0

        for iter in range(self.cfg.max_iter_step + 1):
            progress = float(iter) / self.cfg.max_iter_step
            iter_start_time = time.time()
            run_options = tf.RunOptions()
            run_metadata = tf.RunMetadata()
            if self.cfg.gan == 'w' and (iter < self.cfg.critic_initialization
                                        or iter % 500 == 0):
                citers = 100
            else:
                citers = self.cfg.citers

            if iter == 0:
                # Make sure there are terminating states
                giters = 100
            else:
                giters = self.cfg.giters

            # Update generator actor/critic
            for j in range(giters):
                feed_dict, features = self.get_training_feed_dict_and_states(
                    iter)
                if iter == 0:
                    feed_dict[self.lr_g] = 0
                feed_dict[self.progress] = progress
                _, g_loss, v_loss, fake_output, new_states = self.sess.run(
                    [(self.opt_g, self.opt_v), self.g_loss, self.v_loss,
                     self.fake_output, self.new_states],
                    feed_dict=feed_dict,
                    options=run_options,
                    run_metadata=run_metadata)
                if self.cfg.supervised:
                    ground_truth = feed_dict[self.ground_truth]
                else:
                    ground_truth = None
                self.memory.replace_memory(
                    self.memory.images_and_states_to_records(
                        fake_output,
                        new_states,
                        features,
                        ground_truth=ground_truth))
                v_loss_pool.append(v_loss)
                g_loss_pool.append(g_loss)

                if iter % self.cfg.summary_freq == 0 and j == 0:
                    merged = self.sess.run(self.merged_all,
                                           feed_dict=feed_dict,
                                           options=run_options,
                                           run_metadata=run_metadata)
                    self.summary_writer.add_summary(merged, iter)
                    self.summary_writer.add_run_metadata(
                        run_metadata, 'critic_metadata {}'.format(iter), iter)

            merged = []
            # Update GAN discriminator ('critic' for WGAN)
            for j in range(citers):
                feed_dict = self.get_replay_feed_dict(iter)
                if not self.cfg.supervised:
                    # update discriminator only if it is unsupervised
                    _, emd, cgn = self.sess.run(
                        [self.opt_c, self.emd, self.critic_gradient_norm],
                        feed_dict=feed_dict)
                    emd_pool.append(emd)

            if merged:
                self.summary_writer.add_summary(merged, iter)
                self.summary_writer.add_run_metadata(
                    run_metadata, 'generator_metadata {}'.format(iter), iter)

            # Visualizations
            if self.cfg.realtime_vis or iter % self.cfg.write_image_interval == 0:
                self.visualize(iter)

            v_loss_pool = v_loss_pool[-self.cfg.median_filter_size:]
            g_loss_pool = g_loss_pool[-self.cfg.median_filter_size:]
            emd_pool = emd_pool[-self.cfg.median_filter_size:]

            if (iter + 1) % 500 == 0:
                self.saver.save(self.sess,
                                os.path.join(self.dir, "model.ckpt"),
                                global_step=(iter + 1))

            if iter % 100 == 0:
                eta = (time.time() - start_t) / (iter + 1) / 3600 * (
                    self.cfg.max_iter_step - iter)
                tot_time = (time.time() - start_t) / (iter + 1) / 3600 * (
                    self.cfg.max_iter_step)
                if iter < 500:
                    eta = tot_time = 0
                print('#--------------------------------------------')
                print('# Task: %s  ela. %.2f min  ETA: %.1f/%.1f h' %
                      (self.cfg.name,
                       (time.time() - start_t) / 60.0, eta, tot_time))
                self.memory.debug()

            if iter % 10 == 0:
                print(
                    'it%6d,%5.0f ms/it, g_loss=%.2f, v_loss=%.2f, EMD=%.3f, cgn=%.2f'
                    % (iter, 1000 *
                       (time.time() - iter_start_time), np.median(g_loss_pool),
                       np.median(v_loss_pool), np.median(emd_pool), cgn))

    def restore(self, ckpt):
        self.saver.restore(self.sess,
                           os.path.join(self.dir, "model.ckpt-%s" % ckpt))

    def gradient_processor(self, grads):
        if self.cfg.gan == 'ls':
            # We show negative grad. (since we are minimizing)
            real_grads = []
            for g in grads:
                if abs(np.mean(g) - 1) > 0.001:
                    real_grads.append(g)
            return -grads / np.std(real_grads) * 0.2 + 0.5
        else:
            return 10 * grads + 0.5

    def visualize(self, iter):
        progress = float(iter) / self.cfg.max_iter_step
        lower_regions = []
        pool_images, pool_states, pool_features = self.memory.records_to_images_states_features(
            self.memory.image_pool[:self.cfg.num_samples])

        if self.cfg.supervised:
            gt0 = [x[1] for x in pool_images]
            pool_images = [x[0] for x in pool_images]
        else:
            gt0 = None
        lower_regions.append(pool_images)

        # Generated data
        feed_dict = merge_dict(self.fixed_feed_dict_random, {
            self.is_train: self.cfg.test_random_walk,
            self.progress: progress
        })
        eval_images = []
        eval_states = []
        gt1 = self.fixed_feed_dict_random[self.ground_truth]
        for i in range(self.cfg.test_steps):
            output_images, output_states = self.sess.run(
                [self.fake_output, self.new_states], feed_dict=feed_dict)
            feed_dict[self.fake_input] = output_images
            feed_dict[self.states] = output_states

            eval_images.append(output_images)
            eval_states.append(output_states)

        best_outputs = []
        best_indices = []
        for i in range(self.cfg.num_samples):
            best_index = self.cfg.test_steps - 1
            for j in range(self.cfg.test_steps):
                if eval_states[j][i][STATE_REWARD_DIM] > 0:
                    best_index = j
                    break
            best_image = eval_images[best_index][i]
            best_indices.append(best_index + 1)
            best_outputs.append(best_image)

        lower_regions.append(best_outputs)
        # Real data
        lower_regions.append(self.fixed_feed_dict_random[self.real_data])

        if self.cfg.vis_draw_critic_scores:
            lower_regions[0] = self.draw_critic_scores(lower_regions[0],
                                                       ground_truth=gt0)
            lower_regions[1] = self.draw_critic_scores(lower_regions[1],
                                                       ground_truth=gt1)
            if not self.cfg.supervised:
                lower_regions[2] = self.draw_critic_scores(lower_regions[2])

        for img, state in zip(lower_regions[0], pool_states):
            cv2.putText(img, str(state), (4, 33), cv2.FONT_HERSHEY_SIMPLEX,
                        0.25, (1.0, 0.0, 0.0))

        for img, ind in zip(lower_regions[1], best_indices):
            cv2.putText(img, str(ind), (23, 23), cv2.FONT_HERSHEY_SIMPLEX, 0.5,
                        (1.0, 0.0, 0.0))

        lower_regions = list(map(make_image_grid, lower_regions))
        seperator = np.ones(
            (lower_regions[0].shape[0], 16, lower_regions[0].shape[2]),
            dtype=np.float32)
        lower_region = np.hstack([
            lower_regions[0], seperator, lower_regions[1], seperator,
            lower_regions[2]
        ])

        upper_region = np.ones_like(lower_region)

        per_row = lower_region.shape[1] // (self.generator_debugger.width + 4)

        # The upper part
        h, w = self.cfg.source_img_size, self.cfg.source_img_size
        images = []
        debug_plots = []
        gradients = []
        rows = lower_region.shape[0] // (h + 2) // 3
        groups_per_row = per_row // (self.cfg.test_steps + 1)
        per_row = (self.cfg.test_steps + 1) * groups_per_row

        gts = []
        for j in range(min(self.cfg.num_samples, rows * groups_per_row)):
            if self.cfg.supervised:
                img_gt = self.memory.get_next_RAW(
                    1, test=self.cfg.vis_step_test)[0][0]
                img, gt = img_gt[0], img_gt[1]
            else:
                img = self.memory.get_next_RAW(1)[0][0]
                gt = None
            # z is useless at test time...
            images_, debug_plots_, gradients_ = self.draw_steps(
                img,
                ground_truth=gt,
                is_train=self.cfg.test_random_walk,
                progress=progress)
            images += images_
            if self.cfg.supervised:
                gts += [gt] * len(images_)
                gradients_ = [gt] * len(images_)
            debug_plots += debug_plots_
            gradients += gradients_

        if not self.cfg.supervised:
            gradients = self.gradient_processor(np.stack(gradients, axis=0))

        pad = 0
        for i in range(rows):
            for j in range(per_row):
                start_x, start_y = pad + 3 * i * (h + 2), pad + j * (w + 4)
                index = i * per_row + j
                if index < len(images):
                    upper_region[start_x:start_x + h,
                                 start_y:start_y + w] = images[index]
                    upper_region[start_x + h + 1:start_x + h * 2 + 1,
                                 start_y:start_y + w] = gradients[index]
                    upper_region[start_x + 2 * (h + 1):start_x + h * 3 + 2,
                                 start_y:start_y + w] = debug_plots[index]

        seperator = np.ones((16, upper_region.shape[1], upper_region.shape[2]),
                            dtype=np.float32)
        upper_region = np.vstack([seperator, upper_region, seperator])

        img = np.vstack([upper_region, lower_region])
        if self.cfg.realtime_vis:
            cv2.imshow('vis', img[:, :, ::-1])
            cv2.waitKey(20)
        if iter % self.cfg.write_image_interval == 0:
            fn = os.path.join(self.image_dir, '%06d.png' % iter)
            cv2.imwrite(fn, img[:, :, ::-1] * 255.0)

    def draw_value_reward_score(self, img, value, reward, score):
        img = img.copy()
        # Average with 0.5 for semi-transparent background
        img[:14] = img[:14] * 0.5 + 0.25
        img[50:] = img[50:] * 0.5 + 0.25
        if self.cfg.gan == 'ls':
            red = -np.tanh(float(score) / 1) * 0.5 + 0.5
        else:
            red = -np.tanh(float(score) / 10.0) * 0.5 + 0.5
        top = '%+.2f %+.2f' % (value, reward)
        cv2.putText(img, top, (3, 7), cv2.FONT_HERSHEY_SIMPLEX, 0.25,
                    (1.0, 1.0 - red, 1.0 - red))
        score = '%+.3f' % score
        cv2.putText(img, score, (10, 60), cv2.FONT_HERSHEY_SIMPLEX, 0.35,
                    (1.0, 1.0 - red, 1.0 - red))
        return img

    def draw_steps(self, img, progress, ground_truth=None, is_train=0):
        images = []
        debug_plots = []
        gradients = []
        states = self.memory.get_initial_states(self.cfg.batch_size)

        tmp_fake_output = [img] * self.cfg.batch_size
        tmp_fake_output = np.stack(tmp_fake_output, axis=0)
        initial_value, initial_score = self.sess.run(
            [self.new_value[0], self.centered_fake_logit[0]],
            feed_dict={
                self.fake_output: tmp_fake_output,
                self.new_states: states,
                self.progress: progress
            })

        images.append(
            self.draw_value_reward_score(img, initial_value, 0, initial_score))
        debug_plots.append(img * 0 + 1)
        # z is useless at test time...
        gradients.append(img * 0 + 1)
        for k in range(self.cfg.test_steps):
            feed_dict = {
                self.fake_input: [img] * self.cfg.batch_size,
                self.real_data: [img] * self.cfg.batch_size,
                self.z: self.memory.get_noise(self.cfg.batch_size),
                self.is_train: is_train,
                self.states: states,
                self.progress: progress
            }
            if self.cfg.supervised:
                feed_dict[self.ground_truth] = [ground_truth]
                feed_dict[self.progress] = progress
            debug_info, img, grad, new_state, new_value, score, reward = self.sess.run(
                [
                    self.generator_debug_output, self.fake_output[0],
                    self.fake_gradients[0], self.new_states, self.new_value[0],
                    self.centered_fake_logit[0], self.reward[0]
                ],
                feed_dict=feed_dict)
            debug_plot = self.generator_debugger(debug_info)
            images.append(
                self.draw_value_reward_score(img, new_value, reward, score))
            gradients.append(grad)
            debug_plots.append(debug_plot)
            states = new_state

            if states[0, STATE_STOPPED_DIM] > 0:
                break

        for k in range(len(images), self.cfg.test_steps + 1):
            images.append(img * 0 + 1)
            gradients.append(img * 0 + 1)
            debug_plots.append(img * 0 + 1)
        return images, debug_plots, gradients

    def draw_critic_scores(self, images, ground_truth=None):
        # We do not care about states here, so that value drawn may not make sense.
        images = list(images)
        original_len = len(images)
        if len(images) < self.cfg.batch_size:
            images += [images[0]] * (self.cfg.batch_size - len(images))
        states = self.memory.get_initial_states(self.cfg.batch_size)
        # indexs = self.memory.get_random_indexs(self.cfg,batch_size)
        images = np.stack(images, axis=0)
        if self.cfg.supervised:
            # TODO
            feed_dict = {
                self.real_data: images,
                self.fake_input: images,
                self.ground_truth: ground_truth,
                self.new_states: states,
                self.states: states,
                self.is_train: 0
            }
        else:
            feed_dict = {
                self.fake_output: images,
                self.real_data: images,
            }
        if self.cfg.gan == 'ls':
            logit = self.fake_logit
        else:
            logit = self.centered_fake_logit
        scores = self.sess.run(logit, feed_dict=feed_dict)
        if self.cfg.supervised:
            scores = np.sqrt(scores) * 100.0
        ret = []
        for i in range(len(images)):
            img, score = images[i].copy(), scores[i]
            # Average with 0.5 for semi-transparent background
            img[50:] = img[50:] * 0.5 + 0.25
            if self.cfg.gan == 'ls':
                red = -np.tanh(float(score) / 1) * 0.5 + 0.5
            else:
                red = -np.tanh(float(score) / 10.0) * 0.5 + 0.5
            score = '%+.3f' % score
            cv2.putText(img, score, (10, 60), cv2.FONT_HERSHEY_SIMPLEX, 0.35,
                        (1.0, 1.0 - red, 1.0 - red))
            ret.append(img)
        return ret[:original_len]

    def backup_scripts(self):
        script_dir = os.path.join(self.dir, 'scripts')
        try:
            os.mkdir(script_dir)
        except Exception as e:
            pass
        for fn in os.listdir('.'):
            if fn.endswith('.py'):
                shutil.copy(fn, script_dir)
        print('Scripts are backed up. Initializing network...')

    def get_high_resolution_net(self, res):
        if res not in self.high_res_nets:
            print('Creating high_res_network for ', res)
            net = Dict()
            net.high_res_input = tf.placeholder(
                tf.float32,
                shape=(None, res[0], res[1], self.cfg.real_img_channels),
                name='highres_in')
            net.fake_input = self.fake_input
            net.fake_input_feature = self.fake_input_feature
            net.real_data = self.real_data
            net.z = self.z
            net.is_train = self.is_train
            net.states = self.states
            with tf.variable_scope('generator', reuse=True):
                fake_output, net.generator_debug_output, net.generator_debugger = self.cfg.generator(
                    [net.fake_input, net.z, net.states],
                    is_train=net.is_train,
                    cfg=self.cfg,
                    high_res=net.high_res_input,
                    progress=0)
                net.fake_output, net.new_states, net.high_res_output = fake_output

            net.fake_logit, net.fake_embeddings, _ = self.cfg.critic(
                images=net.fake_output,
                cfg=self.cfg,
                reuse=True,
                is_train=False)
            self.high_res_nets[res] = net
        return self.high_res_nets[res]

    def eval(self,
             spec_files=None,
             output_dir='./outputs',
             step_by_step=False,
             show_linear=True,
             show_input=True):
        from util import get_image_center
        if output_dir is not None:
            try:
                os.mkdir(output_dir)
            except:
                pass
        print(spec_files)

        # Use a fixed noise
        batch_size = 1
        for fn in spec_files:
            print('Processing input {}'.format(fn))

            from util import read_tiff16, linearize_ProPhotoRGB
            if fn.endswith('.tif') or fn.endswith('.tiff'):
                image = read_tiff16(fn)
                high_res_image = linearize_ProPhotoRGB(image)
            else:
                # TODO: deal with png and jpeg files better - they are probably not RAW.
                print(
                    'Warning: sRGB color space jpg and png images may not work perfectly. See README for details. (image {})'
                    .format(fn))
                image = cv2.imread(fn)[:, :, ::-1]
                if image.dtype == np.uint8:
                    image = image / 255.0
                elif image.dtype == np.uint16:
                    image = image / 65535.0
                else:
                    print(
                        'image data type {} is not supported. Please email Yuanming Hu.'
                        .format(image.dtype))
                high_res_image = np.power(image, 2.2)  # Linearize sRGB
                high_res_image /= 2 * high_res_image.max()  # Mimic RAW exposure

                # Uncomment to bypass preprocessing
                # high_res_image = image

            noises = [
                self.memory.get_noise(batch_size)
                for _ in range(self.cfg.test_steps)
            ]
            fn = fn.split('/')[-1]

            def get_dir():
                if output_dir is not None:
                    d = output_dir
                else:
                    d = self.dump_dir
                return d

            try:
                os.mkdir(get_dir())
            except:
                pass

            def show_and_save(x, img):
                img = img[:, :, ::-1]
                #cv2.imshow(x, img)
                cv2.imwrite(os.path.join(get_dir(), fn + '.' + x + '.png'),
                            img * 255.0)

            #if os.path.exists(os.path.join(get_dir(), fn + '.retouched.png')):
            #    print('Skipping', fn)
            #    continue

            high_res_input = high_res_image
            low_res_img = cv2.resize(get_image_center(high_res_image),
                                     dsize=(64, 64))
            res = high_res_input.shape[:2]
            net = self.get_high_resolution_net(res)

            low_res_img_trajs = [low_res_img]
            low_res_images = [low_res_img]
            states = self.memory.get_initial_states(batch_size)
            high_res_output = high_res_input
            masks = []
            decisions = []
            operations = []
            debug_info_list = []

            tmp_fake_input = low_res_images * batch_size
            tmp_fake_input = np.array(tmp_fake_input)
            print(tmp_fake_input.shape)

            for i in range(self.cfg.test_steps):
                feed_dict = {
                    net.fake_input: low_res_images * batch_size,
                    net.z: noises[i],
                    net.is_train: 0,
                    net.states: states,
                    net.high_res_input: [high_res_output] * batch_size
                }
                new_low_res_images, new_scores, new_states, new_high_res_output, debug_info = self.sess.run(
                    [
                        net.fake_output[0], net.fake_logit[0],
                        net.new_states[0], net.high_res_output[0],
                        net.generator_debug_output
                    ],
                    feed_dict=feed_dict)
                low_res_img_trajs.append(new_low_res_images)
                low_res_images = [new_low_res_images]
                # print('new_states', new_states.shape)
                states = [new_states] * batch_size
                debug_info_list.append(debug_info)
                debug_plots = self.generator_debugger(debug_info,
                                                      combined=False)
                decisions.append(debug_plots[0])
                operations.append(debug_plots[1])
                masks.append(debug_plots[2])
                high_res_output = new_high_res_output
                if states[0][STATE_STOPPED_DIM] > 0:
                    break
                if step_by_step:
                    show_and_save('intermediate%02d' % i, high_res_output)

            linear_high_res = high_res_input

            # Max to white, and then gamma correction
            high_res_input = (high_res_input / high_res_input.max())**(1 / 2.4)

            # Save linear
            if show_linear:
                show_and_save('linear', linear_high_res)

            # Save corrected
            if show_input:
                show_and_save('input_tone_mapped', high_res_input)

            # Save retouched
            show_and_save('retouched', high_res_output)

            # Steps & debugging information
            with open(os.path.join(get_dir(), fn + '_debug.pkl'), 'wb') as f:
                pickle.dump(debug_info_list, f)

            padding = 4
            patch = 64
            grid = patch + padding
            steps = len(low_res_img_trajs)

            fused = np.ones(shape=(grid * 4, grid * steps, 3),
                            dtype=np.float32)

            for i in range(len(low_res_img_trajs)):
                sx = grid * i
                sy = 0
                fused[sy:sy + patch, sx:sx + patch] = cv2.resize(
                    low_res_img_trajs[i],
                    dsize=(patch, patch),
                    interpolation=cv2.INTER_NEAREST)

            for i in range(len(low_res_img_trajs) - 1):
                sx = grid * i + grid // 2
                sy = grid
                fused[sy:sy + patch, sx:sx + patch] = cv2.resize(
                    decisions[i],
                    dsize=(patch, patch),
                    interpolation=cv2.INTER_NEAREST)
                sy = grid * 2 - padding // 2
                fused[sy:sy + patch, sx:sx + patch] = cv2.resize(
                    operations[i],
                    dsize=(patch, patch),
                    interpolation=cv2.INTER_NEAREST)
                sy = grid * 3 - padding
                fused[sy:sy + patch, sx:sx + patch] = cv2.resize(
                    masks[i],
                    dsize=(patch, patch),
                    interpolation=cv2.INTER_NEAREST)

            # Save steps
            show_and_save('steps', fused)
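
A minimal sketch (not part of the original snippet) of the tone-mapping step used above: the linear image is normalized so its brightest value maps to white and then gamma-corrected with exponent 1/2.4, which roughly follows the sRGB transfer curve. The function name and the random test image below are illustrative only.

import numpy as np

def tone_map(linear_img, gamma=2.4):
    """Normalize a linear-light image to [0, 1] and apply gamma correction."""
    img = linear_img.astype(np.float32)
    img = img / max(float(img.max()), 1e-8)   # brightest pixel maps to white
    return img ** (1.0 / gamma)

# Illustrative usage on a synthetic linear image
tone_mapped = tone_map(np.random.rand(64, 64, 3) * 4.0)
print(tone_mapped.min(), tone_mapped.max())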
Exemplo n.º 17
0
    def __init__(self, env_type, state_dims, num_actions):
        if env_type == EnvTypes.ATARI:
            state_size = [state_dims[0], state_dims[1]*FRAME_STACK, state_dims[2]]
        elif env_type == EnvTypes.STANDARD:
            state_size = state_dims
        self.replay_memory = ReplayMemory(REPLAY_MEMORY_CAPACITY, state_size)
        self.exploration = 1.0
        self.train_iter = 0
        self.env_type = env_type

        if env_type == EnvTypes.ATARI:
            buffer_size = FRAME_STACK*FRAME_SKIP
            self.observation_buffer = [np.zeros((state_dims[0], state_dims[1], state_dims[2]))
                                       for _ in range(buffer_size)]
        else:
            self.observation_buffer = [np.zeros((state_dims[0]))]

        self.config = tf.ConfigProto()
        self.config.gpu_options.per_process_gpu_memory_fraction = GPU_MEMORY_FRACTION
        self.sess = tf.Session(config=self.config)

        # build q network
        self.dqn_vars = dict()
        with tf.variable_scope(DQN_SCOPE):
            if env_type == EnvTypes.ATARI:
                self.x, self.initial_layers = self.add_atari_layers(state_dims, self.dqn_vars)
            elif env_type == EnvTypes.STANDARD:
                self.x, self.initial_layers = self.add_standard_layers(state_dims, self.dqn_vars)

            # add final hidden layers
            self.hid = fc(self.initial_layers, 128, HIDDEN, var_dict=self.dqn_vars)
            self.q = fc(self.hid, num_actions, OUTPUT,
                        var_dict=self.dqn_vars, activation=False)
            
            tf.histogram_summary('q_values', self.q)
                          
        # build target network
        self.target_vars = dict()
        with tf.variable_scope(TARGET_SCOPE):
            if env_type == EnvTypes.ATARI:
                self.t_x, self.t_initial_layers = self.add_atari_layers(state_dims,
                                                                        self.target_vars)
            elif env_type == EnvTypes.STANDARD:
                self.t_x, self.t_initial_layers = self.add_standard_layers(state_dims,
                                                                           self.target_vars)

            self.t_hid = fc(self.t_initial_layers, 128, HIDDEN, var_dict=self.target_vars)
            self.t_q = fc(self.t_hid, num_actions, OUTPUT,
                          var_dict=self.target_vars, activation=False)

            tf.histogram_summary('target_q_values', self.t_q)

        # add weight transfer operations from primary dqn network to target network
        self.assign_ops = []
        with tf.variable_scope(TRANSFER_SCOPE):
            for variable in self.dqn_vars.keys():
                target_variable = TARGET_SCOPE + variable[len(DQN_SCOPE):]
                decay = tf.mul(1 - TAU, self.target_vars[target_variable])
                update = tf.mul(TAU, self.dqn_vars[variable])
                new_target_weight = tf.add(decay, update)
                target_assign = self.target_vars[target_variable].assign(new_target_weight)
                self.assign_ops.append(target_assign)

        # build dqn evaluation
        with tf.variable_scope(EVALUATION_SCOPE):
            # one-hot action selection
            self.action = tf.placeholder(tf.int32, shape=[None])
            self.action_one_hot = tf.one_hot(self.action, num_actions)
            # reward
            self.reward = tf.placeholder(tf.float32, shape=[None, 1])
            # terminal state
            self.nonterminal = tf.placeholder(tf.float32, shape=[None, 1])

            self.target = tf.add(self.reward, tf.mul(GAMMA, tf.mul(self.nonterminal,
                          tf.reduce_max(self.t_q, 1, True))))
            self.predict = tf.reduce_sum(tf.mul(self.action_one_hot, self.q), 1, True)
            self.error = tf.reduce_mean(mse(self.predict, self.target))

            tf.scalar_summary('error', self.error)
        
        val_print = tf.Print(self.error, [self.predict, self.target])
        self.optimize = tf.train.RMSPropOptimizer(ALPHA, decay=RMS_DECAY, momentum=MOMENTUM,
                        epsilon=EPSILON).minimize(self.error, var_list=self.dqn_vars.values())

        # write out the graph and summaries for tensorboard
        self.summaries = tf.merge_all_summaries()
        if os.path.isdir(TENSORBOARD_GRAPH_DIR):
            shutil.rmtree(TENSORBOARD_GRAPH_DIR)
        self.writer = tf.train.SummaryWriter(TENSORBOARD_GRAPH_DIR, self.sess.graph)

        # initialize variables
        self.sess.run(tf.initialize_all_variables())

        # create saver
        self.saver = tf.train.Saver()
Exemplo n.º 18
0
class AtariGame(Game):
    def __init__(self,
                 rom_path=_default_rom_path,
                 frame_skip=4, history_length=4,
                 resize_mode='scale', resized_rows=84, resized_cols=84, crop_offset=8,
                 display_screen=False, max_null_op=30,
                 replay_memory_size=1000000,
                 replay_start_size=100,
                 death_end_episode=True):
        super(AtariGame, self).__init__()
        self.rng = get_numpy_rng()
        self.ale = ale_load_from_rom(rom_path=rom_path, display_screen=display_screen)
        self.start_lives = self.ale.lives()
        self.action_set = self.ale.getMinimalActionSet()
        self.resize_mode = resize_mode
        self.resized_rows = resized_rows
        self.resized_cols = resized_cols
        self.crop_offset = crop_offset
        self.frame_skip = frame_skip
        self.history_length = history_length
        self.max_null_op = max_null_op
        self.death_end_episode = death_end_episode
        self.screen_buffer_length = 2
        self.screen_buffer = numpy.empty((self.screen_buffer_length,
                                          self.ale.getScreenDims()[1], self.ale.getScreenDims()[0]),
                                         dtype='uint8')
        self.replay_memory = ReplayMemory(state_dim=(resized_rows, resized_cols),
                                          history_length=history_length,
                                          memory_size=replay_memory_size,
                                          replay_start_size=replay_start_size)
        self.start()

    def start(self):
        self.ale.reset_game()
        null_op_num = self.rng.randint(self.screen_buffer_length,
                                       max(self.max_null_op + 1, self.screen_buffer_length + 1))
        for i in range(null_op_num):
            self.ale.act(0)
            self.ale.getScreenGrayscale(self.screen_buffer[i % self.screen_buffer_length, :, :])
        self.total_reward = 0
        self.episode_reward = 0
        self.episode_step = 0
        self.max_episode_step = DEFAULT_MAX_EPISODE_STEP
        self.start_lives = self.ale.lives()

    def force_restart(self):
        self.start()
        self.replay_memory.clear()


    def begin_episode(self, max_episode_step=DEFAULT_MAX_EPISODE_STEP):
        """
            Begin an episode of a game instance. We can play the game for at most
            `max_episode_step` steps; after that, we are forced to restart.
        """
        if self.episode_step > self.max_episode_step or self.ale.game_over():
            self.start()
        else:
            for i in range(self.screen_buffer_length):
                self.ale.act(0)
                self.ale.getScreenGrayscale(self.screen_buffer[i % self.screen_buffer_length, :, :])
        self.max_episode_step = max_episode_step
        self.start_lives = self.ale.lives()
        self.episode_reward = 0
        self.episode_step = 0

    @property
    def episode_terminate(self):
        termination_flag = self.ale.game_over() or self.episode_step >= self.max_episode_step
        if self.death_end_episode:
            return (self.ale.lives() < self.start_lives) or termination_flag
        else:
            return termination_flag

    @property
    def state_enabled(self):
        return self.replay_memory.size >= self.replay_memory.history_length

    def get_observation(self):
        image = self.screen_buffer.max(axis=0)
        if 'crop' == self.resize_mode:
            original_rows, original_cols = image.shape
            new_resized_rows = int(round(
                float(original_rows) * self.resized_cols / original_cols))
            resized = cv2.resize(image, (self.resized_cols, new_resized_rows),
                                 interpolation=cv2.INTER_LINEAR)
            crop_y_cutoff = new_resized_rows - self.crop_offset - self.resized_rows
            img = resized[crop_y_cutoff:crop_y_cutoff + self.resized_rows, :]
            return img
        else:
            return cv2.resize(image, (self.resized_cols, self.resized_rows),
                              interpolation=cv2.INTER_LINEAR)

    def play(self, a):
        assert not self.episode_terminate,\
            "Warning, the episode seems to have terminated. " \
            "We need to call either game.begin_episode(max_episode_step) to continue a new " \
            "episode or game.start() to force restart."
        self.episode_step += 1
        reward = 0.0
        action = self.action_set[a]
        for i in range(self.frame_skip):
            reward += self.ale.act(action)
            self.ale.getScreenGrayscale(self.screen_buffer[i % self.screen_buffer_length, :, :])
        self.total_reward += reward
        self.episode_reward += reward
        ob = self.get_observation()
        terminate_flag = self.episode_terminate
        self.replay_memory.append(ob, a, numpy.clip(reward, -1, 1), terminate_flag)
        return reward, terminate_flag
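
A short usage sketch (not from the original source) of the AtariGame wrapper above, playing a couple of episodes with random actions; the ROM path and the episode limits are assumptions.

import numpy

game = AtariGame(rom_path='roms/breakout.bin',   # hypothetical ROM location
                 replay_memory_size=10000,
                 replay_start_size=100)
rng = numpy.random.RandomState(0)

for episode in range(2):
    game.begin_episode(max_episode_step=1000)
    while not game.episode_terminate:
        # pick uniformly from the minimal action set
        a = rng.randint(len(game.action_set))
        reward, terminated = game.play(a)
    print('episode reward: %s' % game.episode_reward)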
Exemplo n.º 19
0
class QEngine:
    def __init__(self, **kwargs):
        self.setup = kwargs
        self._initialize(**kwargs)
        del kwargs["game"]

    def _prepare_for_save(self):
        self.setup["epsilon"] = self._epsilon
        self.setup["steps"] = self._steps
        self.setup["skiprate"] = self._skiprate

    # TODO: why isn't this done in __init__?
    def _initialize(self, game, network_args=None, actions=None,
                    history_length=4,
                    batchsize=64,
                    update_pattern=(1, 1),
                    replay_memory_size=10000,
                    backprop_start_step=10000, start_epsilon=1.0,
                    end_epsilon=0.1,
                    epsilon_decay_start_step=50000,
                    epsilon_decay_steps=100000,
                    reward_scale=1.0,
                    use_game_variables=True,
                    misc_scale=None,
                    reshaped_x=None,
                    reshaped_y=None,
                    skiprate=4,
                    shaping_on=False,
                    count_states=False,
                    name=None,
                    net_type="cnn", melt_steps=10000, remember_n_actions=0):

        if network_args is None:
            network_args = dict()
        if count_states is not None:
            self._count_states = bool(count_states)

        self.name = name
        self._reward_scale = reward_scale
        self._game = game
        self._batchsize = batchsize
        self._history_length = max(history_length, 1)
        self._update_pattern = update_pattern
        self._epsilon = max(min(start_epsilon, 1.0), 0.0)
        self._end_epsilon = min(max(end_epsilon, 0.0), self._epsilon)
        self._epsilon_decay_steps = epsilon_decay_steps
        self._epsilon_decay_stride = (self._epsilon - end_epsilon) / epsilon_decay_steps
        self._epsilon_decay_start = epsilon_decay_start_step
        self._skiprate = max(skiprate, 0)
        self._shaping_on = shaping_on
        self._steps = 0
        self._melt_steps = melt_steps
        self._backprop_start_step = max(backprop_start_step, batchsize)
        self._use_game_variables = use_game_variables
        self._last_action_index = 0

        if self._shaping_on:
            self._last_shaping_reward = 0

        self.learning_mode = True

        if actions is None:
            self._actions = generate_default_actions(game)
        else:
            self._actions = actions

        self._actions_num = len(self._actions)
        self._actions_stats = np.zeros([self._actions_num], np.int)

        # changes img_shape according to the history size
        self._channels = game.get_screen_channels()
        if self._history_length > 1:
            self._channels *= self._history_length

        if reshaped_x is None:
            x = game.get_screen_width()
            y = game.get_screen_height()
            scale_x = scale_y = 1.0
        else:
            x = reshaped_x
            scale_x = float(x) / game.get_screen_width()

            if reshaped_y is None:
                y = int(game.get_screen_height() * scale_x)
                scale_y = scale_x
            else:
                y = reshaped_y
                scale_y = float(y) / game.get_screen_height()

        img_shape = [self._channels, y, x]

        # TODO: check whether this is slow (it does not seem to be)
        if scale_x == 1 and scale_y == 1:
            def convert(img):
                img = img.astype(np.float32) / 255.0
                return img
        else:
            def convert(img):
                img = img.astype(np.float32) / 255.0
                new_image = np.ndarray([img.shape[0], y, x], dtype=img.dtype)
                for i in xrange(img.shape[0]):
                    # new_image[i] = skimage.transform.resize(img[i], (y,x), preserve_range=True)
                    new_image[i] = cv2.resize(img[i], (x, y), interpolation=cv2.INTER_AREA)
                return new_image
        self._convert_image = convert

        if self._use_game_variables:
            single_state_misc_len = game.get_available_game_variables_size() + int(self._count_states)
        else:
            single_state_misc_len = int(self._count_states)
        self._single_state_misc_len = single_state_misc_len

        self._remember_n_actions = remember_n_actions
        if remember_n_actions > 0:
            self._remember_n_actions = remember_n_actions
            self._action_len = len(self._actions[0])
            self._last_n_actions = np.zeros([remember_n_actions * self._action_len], dtype=np.float32)
            self._total_misc_len = single_state_misc_len * self._history_length + len(self._last_n_actions)
            self._last_action_index = 0
        else:
            self._total_misc_len = single_state_misc_len * self._history_length

        if self._total_misc_len > 0:
            self._misc_state_included = True
            self._current_misc_state = np.zeros(self._total_misc_len, dtype=np.float32)
            if single_state_misc_len > 0:
                self._state_misc_buffer = np.zeros(single_state_misc_len, dtype=np.float32)
                if misc_scale is not None:
                    self._misc_scale = np.array(misc_scale, dtype=np.float32)
                else:
                    self._misc_scale = None
        else:
            self._misc_state_included = False

        state_format = dict()
        state_format["s_img"] = img_shape
        state_format["s_misc"] = self._total_misc_len
        self._transitions = ReplayMemory(state_format, replay_memory_size, batchsize)

        network_args["state_format"] = state_format
        network_args["actions_number"] = len(self._actions)

        if net_type in ("dqn", None, ""):
            self._evaluator = DQN(**network_args)
        elif net_type == "duelling":
            self._evaluator = DuellingDQN(**network_args)
        else:
            print "Unsupported evaluator type."
            exit(1)
            # TODO: raise an exception instead?

        self._current_image_state = np.zeros(img_shape, dtype=np.float32)

    def _update_state(self):
        raw_state = self._game.get_state()
        img = self._convert_image(raw_state.image_buffer)
        state_misc = None

        if self._single_state_misc_len > 0:
            state_misc = self._state_misc_buffer

            if self._use_game_variables:
                game_variables = raw_state.game_variables.astype(np.float32)
                state_misc[0:len(game_variables)] = game_variables

            if self._count_states:
                state_misc[-1] = raw_state.number

            if self._misc_scale is not None:
                state_misc = state_misc * self._misc_scale

        if self._history_length > 1:
            pure_channels = self._channels / self._history_length
            self._current_image_state[0:-pure_channels] = self._current_image_state[pure_channels:]
            self._current_image_state[-pure_channels:] = img

            if self._single_state_misc_len > 0:
                misc_len = len(state_misc)
                hist = self._history_length
                self._current_misc_state[0:(hist - 1) * misc_len] = self._current_misc_state[misc_len:hist * misc_len]

                self._current_misc_state[(hist - 1) * misc_len:hist * misc_len] = state_misc

        else:
            self._current_image_state[:] = img
            if self._single_state_misc_len > 0:
                self._current_misc_state[0:len(state_misc)] = state_misc

        if self._remember_n_actions:
            self._last_n_actions[:-self._action_len] = self._last_n_actions[self._action_len:]
            self._last_n_actions[-self._action_len:] = self._actions[self._last_action_index]
            self._current_misc_state[-len(self._last_n_actions):] = self._last_n_actions


    def new_episode(self, update_state=False):
        self._game.new_episode()
        self.reset_state()
        self._last_shaping_reward = 0
        if update_state:
            self._update_state()

    # Return current state including history
    def _current_state(self):
        if self._misc_state_included:
            s = [self._current_image_state, self._current_misc_state]
        else:
            s = [self._current_image_state]
        return s

    # Return current state's COPY including history.
    def _current_state_copy(self):
        if self._misc_state_included:
            s = [self._current_image_state.copy(), self._current_misc_state.copy()]
        else:
            s = [self._current_image_state.copy()]
        return s

    # Sets the whole state to zeros. 
    def reset_state(self):
        self._current_image_state.fill(0.0)
        self._last_action_index = 0
        if self._misc_state_included:
            self._current_misc_state.fill(0.0)
            if self._remember_n_actions > 0:
                self._last_n_actions.fill(0)

    def make_step(self):
        self._update_state()
        # TODO Check if not making the copy still works
        a = self._evaluator.estimate_best_action(self._current_state_copy())
        self._actions_stats[a] += 1
        self._game.make_action(self._actions[a], self._skiprate + 1)
        self._last_action_index = a

    def make_sleep_step(self, sleep_time=1 / 35.0):
        self._update_state()
        a = self._evaluator.estimate_best_action(self._current_state_copy())
        self._actions_stats[a] += 1

        self._game.set_action(self._actions[a])
        self._last_action_index = a
        for i in xrange(self._skiprate):
            self._game.advance_action(1, False, True)
            sleep(sleep_time)
        self._game.advance_action()
        sleep(sleep_time)

    # Performs a learning step according to the epsilon-greedy policy.
    # The step spans self._skiprate + 1 actions.
    def make_learning_step(self):
        self._steps += 1
        # epsilon decay
        if self._steps > self._epsilon_decay_start and self._epsilon > self._end_epsilon:
            self._epsilon = max(self._epsilon - self._epsilon_decay_stride, 0)

        # Copy because state will be changed in a second
        s = self._current_state_copy()

        # With probability epsilon choose a random action:
        if self._epsilon >= random.random():
            a = random.randint(0, len(self._actions) - 1)
        else:
            a = self._evaluator.estimate_best_action(s)
        self._actions_stats[a] += 1

        # make action and get the reward
        self._last_action_index = a
        r = self._game.make_action(self._actions[a], self._skiprate + 1)
        r = np.float32(r)
        if self._shaping_on:
            sr = np.float32(doom_fixed_to_double(self._game.get_game_variable(GameVariable.USER1)))
            r += sr - self._last_shaping_reward
            self._last_shaping_reward = sr

        r *= self._reward_scale

        # update state s2 accordingly
        if self._game.is_episode_finished():
            # terminal state
            s2 = None
            self._transitions.add_transition(s, a, s2, r, terminal=True)
        else:
            self._update_state()
            s2 = self._current_state()
            self._transitions.add_transition(s, a, s2, r, terminal=False)

        # Perform q-learning once for a while
        if self._transitions.size >= self._backprop_start_step and self._steps % self._update_pattern[0] == 0:
            for a in xrange(self._update_pattern[1]):
                self._evaluator.learn(self._transitions.get_sample())

        # Melt the network sometimes
        if self._steps % self._melt_steps == 0:
            self._evaluator.melt()

    # Adds a transition to the bank.
    def add_transition(self, s, a, s2, r, terminal):
        self._transitions.add_transition(s, a, s2, r, terminal)

    # Runs a single episode in the current mode (no learning is performed).
    def run_episode(self, sleep_time=0):
        self.new_episode()
        if sleep_time == 0:
            while not self._game.is_episode_finished():
                self.make_step()
        else:
            while not self._game.is_episode_finished():
                self.make_sleep_step(sleep_time)

        return np.float32(self._game.get_total_reward())

    # Utility stuff
    def get_actions_stats(self, clear=False, norm=True):
        stats = self._actions_stats.copy()
        if norm:
            stats = stats / np.float32(self._actions_stats.sum())
            stats[stats == 0.0] = -1
            stats = np.around(stats, 3)

        if clear:
            self._actions_stats.fill(0)
        return stats

    def get_steps(self):
        return self._steps

    def get_epsilon(self):
        return self._epsilon

    def get_network(self):
        return self._evaluator.network

    def set_epsilon(self, eps):
        self._epsilon = eps

    def set_skiprate(self, skiprate):
        self._skiprate = max(skiprate, 0)

    def get_skiprate(self):
        return self._skiprate

    # Saves network weights to a file
    def save_params(self, filename, quiet=False):
        if not quiet:
            print "Saving network weights to " + filename + "..."
        self._prepare_for_save()
        params = get_all_param_values(self._evaluator.network)
        pickle.dump(params, open(filename, "wb"))
        if not quiet:
            print "Saving finished."

    # Loads network weights from the file
    def load_params(self, filename, quiet=False):
        if not quiet:
            print "Loading network weights from " + filename + "..."
        params = pickle.load(open(filename, "rb"))
        set_all_param_values(self._evaluator.network, params)
        set_all_param_values(self._evaluator.frozen_network, params)

        if not quiet:
            print "Loading finished."

    # Loads the whole engine with params from a file

    @staticmethod
    def load(game, filename, quiet=False):
        if not quiet:
            print "Loading qengine from " + filename + "..."

        params = pickle.load(open(filename, "rb"))

        qengine_args = params[0]
        network_params = params[1]

        steps = qengine_args["steps"]
        epsilon = qengine_args["epsilon"]
        del (qengine_args["epsilon"])
        del (qengine_args["steps"])
        qengine_args["game"] = game

        qengine = QEngine(**qengine_args)
        set_all_param_values(qengine._evaluator.network, network_params)
        set_all_param_values(qengine._evaluator.frozen_network, network_params)

        qengine._steps = steps
        qengine._epsilon = epsilon

        if not quiet:
            print "Loading finished."
        return qengine

    # Saves the whole engine with params to a file
    def save(self, filename, quiet=False):
        if not quiet:
            print "Saving qengine to " + filename + "..."
        self._prepare_for_save()
        network_params = get_all_param_values(self._evaluator.network)
        params = [self.setup, network_params]
        pickle.dump(params, open(filename, "wb"))
        if not quiet:
            print "Saving finished."
Exemplo n.º 20
0
# Environment
env = gym.make(config['env_name'])
torch.manual_seed(config['seed'])
np.random.seed(config['seed'])
random.seed(config['seed'])
env.seed(config['seed'])
env.action_space.np_random.seed(config['seed'])
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Agent
agent = SAC(env.observation_space.shape[0], env.action_space, config)

# Memory
memory = ReplayMemory(config['replay_size'])

# Training Loop
total_numsteps = 0
updates = 0
test_step = 10000

for i_episode in itertools.count(1):
    episode_reward = 0
    episode_steps = 0
    done = False
    state = env.reset()

    acc_log_alpha = 0.
    while not done:
        if config['start_steps'] > total_numsteps:
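
The snippet above is cut off right after the start_steps check. For context only, a common continuation in SAC training loops (an assumption, not the original code; names such as agent.select_action and memory.push follow the usual pytorch-soft-actor-critic layout) looks roughly like this:

        if config['start_steps'] > total_numsteps:
            action = env.action_space.sample()      # uniform random exploration early on
        else:
            action = agent.select_action(state)     # sample an action from the current policy

        next_state, reward, done, _ = env.step(action)
        episode_steps += 1
        total_numsteps += 1
        episode_reward += reward

        # mask is 0 for terminal transitions, 1 otherwise
        memory.push(state, action, reward, next_state, float(not done))
        state = next_state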
Exemplo n.º 21
0
class DQN():

    def __init__(self, env_type, state_dims, num_actions):
        if env_type == EnvTypes.ATARI:
            state_size = [state_dims[0], state_dims[1]*FRAME_STACK, state_dims[2]]
        elif env_type == EnvTypes.STANDARD:
            state_size = state_dims
        self.replay_memory = ReplayMemory(REPLAY_MEMORY_CAPACITY, state_size)
        self.exploration = 1.0
        self.train_iter = 0
        self.env_type = env_type

        if env_type == EnvTypes.ATARI:
            buffer_size = FRAME_STACK*FRAME_SKIP
            self.observation_buffer = [np.zeros((state_dims[0], state_dims[1], state_dims[2]))
                                       for _ in range(buffer_size)]
        else:
            self.observation_buffer = [np.zeros((state_dims[0]))]

        self.config = tf.ConfigProto()
        self.config.gpu_options.per_process_gpu_memory_fraction = GPU_MEMORY_FRACTION
        self.sess = tf.Session(config=self.config)

        # build q network
        self.dqn_vars = dict()
        with tf.variable_scope(DQN_SCOPE):
            if env_type == EnvTypes.ATARI:
                self.x, self.initial_layers = self.add_atari_layers(state_dims, self.dqn_vars)
            elif env_type == EnvTypes.STANDARD:
                self.x, self.initial_layers = self.add_standard_layers(state_dims, self.dqn_vars)

            # add final hidden layers
            self.hid = fc(self.initial_layers, 128, HIDDEN, var_dict=self.dqn_vars)
            self.q = fc(self.hid, num_actions, OUTPUT,
                        var_dict=self.dqn_vars, activation=False)
            
            tf.histogram_summary('q_values', self.q)
                          
        # build target network
        self.target_vars = dict()
        with tf.variable_scope(TARGET_SCOPE):
            if env_type == EnvTypes.ATARI:
                self.t_x, self.t_initial_layers = self.add_atari_layers(state_dims,
                                                                        self.target_vars)
            elif env_type == EnvTypes.STANDARD:
                self.t_x, self.t_initial_layers = self.add_standard_layers(state_dims,
                                                                           self.target_vars)

            self.t_hid = fc(self.t_initial_layers, 128, HIDDEN, var_dict=self.target_vars)
            self.t_q = fc(self.t_hid, num_actions, OUTPUT,
                          var_dict=self.target_vars, activation=False)

            tf.histogram_summary('target_q_values', self.t_q)

        # add weight transfer operations from primary dqn network to target network
        self.assign_ops = []
        with tf.variable_scope(TRANSFER_SCOPE):
            for variable in self.dqn_vars.keys():
                target_variable = TARGET_SCOPE + variable[len(DQN_SCOPE):]
                decay = tf.mul(1 - TAU, self.target_vars[target_variable])
                update = tf.mul(TAU, self.dqn_vars[variable])
                new_target_weight = tf.add(decay, update)
                target_assign = self.target_vars[target_variable].assign(new_target_weight)
                self.assign_ops.append(target_assign)

        # build dqn evaluation
        with tf.variable_scope(EVALUATION_SCOPE):
            # one-hot action selection
            self.action = tf.placeholder(tf.int32, shape=[None])
            self.action_one_hot = tf.one_hot(self.action, num_actions)
            # reward
            self.reward = tf.placeholder(tf.float32, shape=[None, 1])
            # terminal state
            self.nonterminal = tf.placeholder(tf.float32, shape=[None, 1])

            self.target = tf.add(self.reward, tf.mul(GAMMA, tf.mul(self.nonterminal,
                          tf.reduce_max(self.t_q, 1, True))))
            self.predict = tf.reduce_sum(tf.mul(self.action_one_hot, self.q), 1, True)
            self.error = tf.reduce_mean(mse(self.predict, self.target))

            tf.scalar_summary('error', self.error)
        
        val_print = tf.Print(self.error, [self.predict, self.target])
        self.optimize = tf.train.RMSPropOptimizer(ALPHA, decay=RMS_DECAY, momentum=MOMENTUM,
                        epsilon=EPSILON).minimize(self.error, var_list=self.dqn_vars.values())

        # write out the graph and summaries for tensorboard
        self.summaries = tf.merge_all_summaries()
        if os.path.isdir(TENSORBOARD_GRAPH_DIR):
            shutil.rmtree(TENSORBOARD_GRAPH_DIR)
        self.writer = tf.train.SummaryWriter(TENSORBOARD_GRAPH_DIR, self.sess.graph)

        # initialize variables
        self.sess.run(tf.initialize_all_variables())

        # create saver
        self.saver = tf.train.Saver()

    def add_atari_layers(self, dims, var_dict):
        x = tf.placeholder(tf.float32, shape=[None, dims[0], dims[1]*FRAME_STACK, 1])
        conv1 = conv2d(x, 8, 4, 32, CONV1, var_dict=var_dict)
        conv2 = conv2d(conv1, 4, 2, 64, CONV2, var_dict=var_dict)
        conv3 = conv2d(conv2, 3, 1, 64, CONV3, var_dict=var_dict)
        conv_shape = conv3.get_shape().as_list()
        flatten = [-1, conv_shape[1]*conv_shape[2]*conv_shape[3]]
        return x, tf.reshape(conv3, flatten)

    def add_standard_layers(self, dims, var_dict):
        x = tf.placeholder(tf.float32, shape=[None, dims[0]])
        fc1 = fc(x, 256, FC, var_dict=var_dict)
        return x, fc1
        
    def process_observation(self, observation):
        if self.env_type == EnvTypes.ATARI:
            # convert to normalized luminance and downscale
            observation = downscale(rgb_to_luminance(observation), 2)

        # push the new observation onto the buffer
        self.observation_buffer.pop(len(self.observation_buffer)-1)
        self.observation_buffer.insert(0, observation)

    def _get_stacked_state(self):
        stacked_state = self.observation_buffer[0]
        for i in range(1, FRAME_STACK):
            stacked_state = np.hstack((stacked_state, self.observation_buffer[i*FRAME_SKIP]))
        return stacked_state

    def _predict(self):
        if self.env_type == EnvTypes.ATARI:
            state = self._get_stacked_state()
        else:
            state = self.observation_buffer[0]
        state = np.expand_dims(state, axis=0)
        return np.argmax(self.sess.run(self.q, feed_dict={self.x: state}))

    def training_predict(self, env, observation):
        self.process_observation(observation)

        # select action according to epsilon-greedy policy
        if random.random() < self.exploration:
            action = env.action_space.sample()
        else:
            action = self._predict()
        self.exploration = max(self.exploration - EXPLORATION_DECAY, FINAL_EXPLORATION)

        return action

    def testing_predict(self, observation):
        self.process_observation(observation)
        return self._predict()

    def notify_state_transition(self, action, reward, done):
        if self.env_type == EnvTypes.ATARI:
            state = self._get_stacked_state()
        else:
            state = self.observation_buffer[0]
        self.replay_memory.add_state_transition(state, action, reward, done)
        if done:
            # flush the observation buffer
            for i in range(len(self.observation_buffer)):
                self.observation_buffer[i] = np.zeros(self.observation_buffer[i].shape)

    def batch_train(self, save_dir):
        # sample batch from replay memory
        state, action, reward, terminal, newstate = self.replay_memory.sample(BATCH_SIZE)
        reward = np.expand_dims(reward, axis=1)
        terminal = np.expand_dims(terminal, axis=1)
        nonterminal = 1 - terminal

        # update target network weights
        self.sess.run(self.assign_ops)

        # run neural network training step
        if self.train_iter % SUMMARY_PERIOD == 0:
            summary, _ = self.sess.run([self.summaries, self.optimize], feed_dict={self.x:state,
                                       self.t_x:newstate, self.action:action,
                                       self.reward:reward, self.nonterminal:nonterminal})
            self.writer.add_summary(summary, self.train_iter)
        else:
            self.sess.run(self.optimize, feed_dict={self.x:state, self.t_x:newstate,
                          self.action:action, self.reward:reward, self.nonterminal:nonterminal})

        # save the dqn
        if save_dir is not None and self.train_iter % SAVE_CHECKPOINT_PERIOD == 0:
            self.save_algorithm(save_dir)

        self.train_iter += 1

    def save_algorithm(self, save_dir):
        # create directory tree for saving the algorithm
        checkpoint_dir = save_dir + "/save_{}".format(self.train_iter)
        os.mkdir(checkpoint_dir)
        model_file = checkpoint_dir + "/model.ckpt"

        print("Saving algorithm to {}".format(checkpoint_dir))
        t = time.time()
        self.saver.save(self.sess, model_file)
        print("Completed saving in {} seconds".format(time.time() - t))

    def restore_algorithm(self, restore_dir):
        self.train_iter = int(restore_dir[restore_dir.rfind("save_") + len("save_"):])
        self.saver.restore(self.sess, restore_dir + "/model.ckpt")
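
A hedged sketch of driving the DQN class above with a standard gym environment; the environment name and the warm-up heuristic are assumptions, while BATCH_SIZE is the module-level constant already used by batch_train.

import gym

env = gym.make('CartPole-v0')
dqn = DQN(EnvTypes.STANDARD, list(env.observation_space.shape), env.action_space.n)

steps = 0
for episode in range(100):
    observation = env.reset()
    done = False
    while not done:
        action = dqn.training_predict(env, observation)
        observation, reward, done, _ = env.step(action)
        dqn.notify_state_transition(action, reward, done)
        steps += 1
        if steps > BATCH_SIZE:          # wait until enough transitions are stored
            dqn.batch_train(save_dir=None)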
Exemplo n.º 22
0
class DoubleDQNAgent():
    def __init__(self, state_size, action_size):
        # for rendering
        self.render = False

        self.state_size = state_size
        self.action_size = action_size

        # hyperparams for estimator
        self.gamma = 0.95
        self.lr = 0.001
        self.replay_memory_size = 50000
        self.epsilon = 1.0
        self.epsilon_min = 0.000001
        self.explore_steps = 3000
        self.epsilon_decay = (self.epsilon -
                              self.epsilon_min) / self.explore_steps
        self.batch_size = 32
        self.replay_memory_init_size = 1000

        # Estimators
        self.q_estimator = DQNEstimator(state_size, action_size)
        self.target_estimator = DQNEstimator(state_size, action_size)
        self.optimizer = optim.SGD(self.q_estimator.parameters(), lr=self.lr)

        # memory
        self.memory = ReplayMemory(self.replay_memory_size)

    def update_target_estimator(self):
        self.target_estimator.load_state_dict(self.q_estimator.state_dict())

    def get_action(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        else:
            state = Variable(torch.from_numpy(state)).float()
            q_values = self.q_estimator(state)
            _, best_action = torch.max(q_values, dim=1)
            return int(best_action)

    def train(self):
        # epsilon decay
        if self.epsilon > self.epsilon_min:
            self.epsilon -= self.epsilon_decay
        # fetch samples from memory
        batch = self.memory.sample(self.batch_size)
        batch = np.array(batch).transpose()

        # stack all the states
        states = np.vstack(batch[0])
        actions = torch.LongTensor(list(batch[1]))
        rewards = torch.FloatTensor(list(batch[2]))

        # stack all the next states
        next_states = np.vstack(batch[3])
        dones = batch[4]
        dones = dones.astype(int)

        # actions one hot encoding
        actions_one_hot = F.one_hot(actions, num_classes=self.action_size)
        actions_one_hot = torch.FloatTensor(actions_one_hot.float())
        actions_one_hot = Variable(actions_one_hot)

        # Forward prop
        states = torch.FloatTensor(states)
        states = Variable(states)
        preds = self.q_estimator(states)

        # get current action value
        preds = torch.sum(torch.mul(preds, actions_one_hot), dim=1)

        # Double DQN
        next_states = torch.FloatTensor(next_states)
        next_states = Variable(next_states)
        next_action_values = self.q_estimator(next_states)
        best_actions = torch.argmax(next_action_values, dim=1)
        q_values_next_target = self.target_estimator(next_states)

        dones = torch.FloatTensor(dones)
        target = rewards + (1 - dones) * self.gamma * q_values_next_target[
            np.arange(self.batch_size), best_actions]
        target = Variable(target)

        loss = F.mse_loss(preds, target).mean()

        # zero out accumulated grads
        self.optimizer.zero_grad()

        # back prop
        loss.backward()
        self.optimizer.step()
        return loss.item()
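
A usage sketch for the DoubleDQNAgent above; only memory.sample() is visible in the example, so the push() call and len() support on ReplayMemory are assumptions, as is the gym environment.

import gym

env = gym.make('CartPole-v1')
agent = DoubleDQNAgent(env.observation_space.shape[0], env.action_space.n)

for episode in range(200):
    state = env.reset().reshape(1, -1)
    done = False
    while not done:
        action = agent.get_action(state)
        next_state, reward, done, _ = env.step(action)
        next_state = next_state.reshape(1, -1)
        # stored in the (state, action, reward, next_state, done) order that train() unpacks
        agent.memory.push((state, action, reward, next_state, done))
        state = next_state
        if len(agent.memory) > agent.replay_memory_init_size:
            agent.train()
    agent.update_target_estimator()   # hard sync once per episode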
Exemplo n.º 23
0
env = NormalizedActions(gym.make(args.env_name))

writer = SummaryWriter()

env.seed(args.seed)
torch.manual_seed(args.seed)
np.random.seed(args.seed)
if args.algo == "NAF":
    agent = NAF(args.gamma, args.tau, args.hidden_size,
                      env.observation_space.shape[0], env.action_space)
else:
    agent = DDPG(args.gamma, args.tau, args.hidden_size,
                      env.observation_space.shape[0], env.action_space)

memory = ReplayMemory(args.replay_size)

ounoise = OUNoise(env.action_space.shape[0]) if args.ou_noise else None
param_noise = AdaptiveParamNoiseSpec(initial_stddev=0.05, 
    desired_action_stddev=args.noise_scale, adaptation_coefficient=1.05) if args.param_noise else None

rewards = []
total_numsteps = 0
updates = 0

for i_episode in range(args.num_episodes):
    state = torch.Tensor([env.reset()])

    if args.ou_noise: 
        ounoise.scale = (args.noise_scale - args.final_noise_scale) * max(0, args.exploration_end -
                                                                      i_episode) / args.exploration_end + args.final_noise_scale
Exemplo n.º 24
0
class DQNAgent():
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size

        # visualising training
        self.render = False
        self.load_model = False

        # hyperparams for estimator
        self.gamma = 0.95
        self.lr = 0.001
        self.replay_memory_size = 50000
        self.epsilon = 1.0
        self.min_epsilon = 0.000001
        self.explore_step = 3000
        self.epsilon_decay = (self.epsilon -
                              self.min_epsilon) / self.explore_step
        self.batch_size = 32
        self.replay_memory_init_size = 500
        self.update_target_model_every = 1000

        # Replay Memory
        self.memory = ReplayMemory(self.replay_memory_size)

        # create estimator and target estimators
        self.q_estimator = DQNEstimator(state_size, action_size)
        self.target_estimator = DQNEstimator(state_size, action_size)
        self.optimizer = optim.Adam(self.q_estimator.parameters(), lr=self.lr)

        # initialize target estimator
        # TODO: copy q_estimator weights to target model

        if self.load_model:
            # TODO: Load saved Q estimator
            pass

    def update_target_estimator(self):
        self.target_estimator.load_state_dict(self.q_estimator.state_dict())

    def get_action(self, state):
        # random action
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        else:  # greedy action
            state = Variable(torch.from_numpy(state)).float()
            q_values = self.q_estimator(state)
            _, best_action = torch.max(q_values, dim=1)
            return int(best_action)

    def train_network(self):
        # epsilon decay
        if self.epsilon > self.min_epsilon:
            self.epsilon -= self.epsilon_decay

        # fetch samples
        samples = self.memory.sample(self.batch_size)
        samples = np.array(samples).transpose()

        # create batches of states, actions, rewards, next_states, done
        # stack all the states
        states = np.vstack(samples[0])
        actions = torch.LongTensor(list(samples[1]))
        rewards = torch.FloatTensor(list(samples[2]))
        next_states = Variable(torch.FloatTensor(np.vstack(samples[3])))
        is_dones = samples[4]

        is_dones = torch.FloatTensor(is_dones.astype(int))

        # forward propagation Q_network for current states
        states = torch.Tensor(states)
        states = Variable(states).float()
        preds = self.q_estimator(states)

        # onehot encoding actions
        actions_one_hot = F.one_hot(actions, num_classes=self.action_size)
        actions_one_hot = torch.FloatTensor(actions_one_hot.float())
        actions_one_hot = Variable(actions_one_hot)

        # get current actions' action value
        preds = torch.sum(torch.mul(preds, actions_one_hot), dim=1)

        # Q function of next state
        next_state_preds = self.target_estimator(next_states).data

        # calculate Q-Learning target
        target = rewards + (1 - is_dones) * self.gamma * torch.max(
            next_state_preds, dim=1)[0]
        target = Variable(target)

        # calculate mse loss (preds and targets)
        loss = F.mse_loss(preds, target).mean()

        # backward propagation
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return loss.item()
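
The constructor above leaves two TODOs: copying the online weights into the target estimator and optionally loading a saved model. A minimal sketch of the first one, together with the periodic sync implied by update_target_model_every (the surrounding environment loop is assumed):

agent = DQNAgent(state_size=4, action_size=2)
agent.update_target_estimator()              # initialize the target with the online weights

global_step = 0
# ... inside the usual environment/learning loop, after each training step:
global_step += 1
if global_step % agent.update_target_model_every == 0:
    agent.update_target_estimator()          # periodic hard sync of the target network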
Exemplo n.º 25
0
    def setUp(self):
        self.heap = BinaryHeap()
        self.replayMemory = ReplayMemory(10, 32, 4, 84, 84)
Exemplo n.º 26
0
class Driver(object):
    '''
    A driver object for the SCRC
    '''

    def __init__(self, args):
        '''Constructor'''
        self.WARM_UP = 0
        self.QUALIFYING = 1
        self.RACE = 2
        self.UNKNOWN = 3
        self.stage = args.stage
        
        self.parser = msgParser.MsgParser()
        self.state = carState.CarState()
        self.control = carControl.CarControl()

        self.steers = [-1.0, -0.8, -0.6, -0.5, -0.4, -0.3, -0.2, -0.15, -0.1, -0.05, 0.0, 0.05, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.6, 0.8, 1.0]
        self.speeds = [-1.0, -0.5, 0.0, 0.5, 1.0]
        self.num_inputs = 19
        self.num_steers = len(self.steers)
        self.num_speeds = len(self.speeds)
        self.num_actions = self.num_steers + self.num_speeds
        
        self.net = DeepQNetwork(self.num_inputs, self.num_steers, self.num_speeds, args)
        self.mem = ReplayMemory(args.replay_size, self.num_inputs, args)
        self.minibatch_size = args.batch_size

        if args.load_replay:
            self.mem.load(args.load_replay)
        if args.load_weights:
            self.net.load_weights(args.load_weights)
        self.save_weights_prefix = args.save_weights_prefix
        self.save_interval = args.save_interval
        self.save_replay = args.save_replay

        self.enable_training = args.enable_training
        self.enable_exploration = args.enable_exploration
        self.save_csv = args.save_csv
        if self.save_csv:
          self.csv_file = open(args.save_csv, "wb")
          self.csv_writer = csv.writer(self.csv_file)
          self.csv_writer.writerow(['episode', 'distFormStart', 'distRaced', 'curLapTime', 'lastLapTime', 'racePos', 'epsilon', 'replay_memory', 'train_steps'])

        self.total_train_steps = 0
        self.exploration_decay_steps = args.exploration_decay_steps
        self.exploration_rate_start = args.exploration_rate_start
        self.exploration_rate_end = args.exploration_rate_end
        self.skip = args.skip

        self.show_sensors = args.show_sensors
        self.show_qvalues = args.show_qvalues

        self.episode = 0
        self.distances = []
        self.onRestart()
        
        if self.show_sensors:
            from sensorstats import Stats
            self.stats = Stats(inevery=8)
        
        if self.show_qvalues:
            from plotq import PlotQ
            self.plotq = PlotQ(self.num_steers, self.num_speeds)

    def init(self):
        '''Return init string with rangefinder angles'''
        self.angles = [0 for x in range(19)]
        
        for i in range(5):
            self.angles[i] = -90 + i * 15
            self.angles[18 - i] = 90 - i * 15
        
        for i in range(5, 9):
            self.angles[i] = -20 + (i-5) * 5
            self.angles[18 - i] = 20 - (i-5) * 5
        
        return self.parser.stringify({'init': self.angles})

    def getState(self):
        #state = np.array([self.state.getSpeedX() / 200.0, self.state.getAngle(), self.state.getTrackPos()])
        #state = np.array(self.state.getTrack() + [self.state.getSpeedX()]) / 200.0
        state = np.array(self.state.getTrack()) / 200.0
        assert state.shape == (self.num_inputs,)
        return state

    def getReward(self, terminal):
        if terminal:
            reward = -1000
        else:
            dist = self.state.getDistFromStart()
            if self.prev_dist is not None:
                reward = max(0, dist - self.prev_dist) * 10
                assert reward >= 0, "reward: %f" % reward
            else:
                reward = 0
            self.prev_dist = dist
            
            #reward -= self.state.getTrackPos()
            #print "reward:", reward
        
        return reward

    def getTerminal(self):
        return np.all(np.array(self.state.getTrack()) == -1)

    def getEpsilon(self):
        # calculate decaying exploration rate
        if self.total_train_steps < self.exploration_decay_steps:
            return self.exploration_rate_start - self.total_train_steps * (self.exploration_rate_start - self.exploration_rate_end) / self.exploration_decay_steps
        else:
            return self.exploration_rate_end
 
    def drive(self, msg):
        # parse incoming message
        self.state.setFromMsg(msg)
        
        # show sensors
        if self.show_sensors:
            self.stats.update(self.state)

        # training
        if self.enable_training and self.mem.count >= self.minibatch_size:
            minibatch = self.mem.getMinibatch()
            self.net.train(minibatch)
            self.total_train_steps += 1
            #print "total_train_steps:", self.total_train_steps

        # skip frame and use the same action as previously
        if self.skip > 0:
            self.frame = (self.frame + 1) % self.skip
            if self.frame != 0:
                return self.control.toMsg()

        # fetch state, calculate reward and terminal indicator  
        state = self.getState()
        terminal = self.getTerminal()
        reward = self.getReward(terminal)
        #print "reward:", reward

        # store new experience in replay memory
        if self.enable_training and self.prev_state is not None and self.prev_steer is not None and self.prev_speed is not None:
            self.mem.add(self.prev_state, self.prev_steer, self.prev_speed, reward, state, terminal)

        # if terminal state (out of track), then restart game
        if terminal:
            #print "terminal state, restarting"
            self.control.setMeta(1)
            return self.control.toMsg()
        else:
            self.control.setMeta(0)

        # choose actions for wheel and speed
        epsilon = self.getEpsilon()
        if self.enable_exploration and random.random() < epsilon:
            #print "random move"
            steer = random.randrange(self.num_steers)
            #speed = random.randrange(self.num_speeds)
            speed = random.randint(2, self.num_speeds-1)
        else:
            # use broadcasting to efficiently produce minibatch of desired size
            minibatch = state + np.zeros((self.minibatch_size, 1))
            Q = self.net.predict(minibatch)
            assert Q.shape == (self.minibatch_size, self.num_actions), "Q.shape: %s" % str(Q.shape)
            #print "steer Q: ", Q[0,:self.num_steers]
            #print "speed Q:", Q[0,-self.num_speeds:]
            steer = np.argmax(Q[0, :self.num_steers])
            speed = np.argmax(Q[0, -self.num_speeds:])
            if self.show_qvalues:
                self.plotq.update(Q[0])
        #print "steer:", steer, "speed:", speed

        # gears are always automatic
        gear = self.gear()

        # set actions
        self.setSteerAction(steer)
        self.setGearAction(gear)
        self.setSpeedAction(speed)

        # remember state and actions 
        self.prev_state = state
        self.prev_steer = steer
        self.prev_speed = speed

        #print "total_train_steps:", self.total_train_steps, "mem_count:", self.mem.count

        #print "reward:", reward, "epsilon:", epsilon

        return self.control.toMsg()

    def gear(self):
        rpm = self.state.getRpm()
        gear = self.state.getGear()
        
        if self.prev_rpm is None:
            up = True
        else:
            if (self.prev_rpm - rpm) < 0:
                up = True
            else:
                up = False
        
        if up and rpm > 7000 and gear < 6:
            gear += 1
        
        if not up and rpm < 3000 and gear > 0:
            gear -= 1
        
        return gear
        
    def setSteerAction(self, steer):
        assert 0 <= steer < self.num_steers
        self.control.setSteer(self.steers[steer])

    def setGearAction(self, gear):
        assert -1 <= gear <= 6
        self.control.setGear(gear)

    def setSpeedAction(self, speed):
        assert 0 <= speed < self.num_speeds
        accel = self.speeds[speed]
        if accel >= 0:
            #print "accel", accel
            self.control.setAccel(accel)
            self.control.setBrake(0)
        else:
            #print "brake", -accel
            self.control.setAccel(0)
            self.control.setBrake(-accel)
    
    def onShutDown(self):
        if self.save_weights_prefix:
            self.net.save_weights(self.save_weights_prefix + "_" + str(self.episode) + ".pkl")
        
        if self.save_replay:
            self.mem.save(self.save_replay)

        if self.save_csv:
            self.csv_file.close()

    def onRestart(self):
    
        self.prev_rpm = None
        self.prev_dist = None
        self.prev_state = None
        self.prev_steer = None
        self.prev_speed = None
        self.frame = -1

        if self.episode > 0:
            dist = self.state.getDistRaced()
            self.distances.append(dist)
            epsilon = self.getEpsilon()
            print "Episode:", self.episode, "\tDistance:", dist, "\tMax:", max(self.distances), "\tMedian10:", np.median(self.distances[-10:]), \
                "\tEpsilon:", epsilon, "\tReplay memory:", self.mem.count

            if self.save_weights_prefix and self.save_interval > 0 and self.episode % self.save_interval == 0:
                self.net.save_weights(self.save_weights_prefix + "_" + str(self.episode) + ".pkl")
                #self.mem.save(self.save_weights_prefix + "_" + str(self.episode) + "_replay.pkl")

            if self.save_csv:
                self.csv_writer.writerow([
                    self.episode, 
                    self.state.getDistFromStart(), 
                    self.state.getDistRaced(), 
                    self.state.getCurLapTime(), 
                    self.state.getLastLapTime(), 
                    self.state.getRacePos(), 
                    epsilon, 
                    self.mem.count,
                    self.total_train_steps
                ])
                self.csv_file.flush()

        self.episode += 1
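
In drive() above, the network output of length num_actions is treated as two concatenated heads: the first num_steers entries score the steering bins and the last num_speeds entries score the speed bins, each reduced with its own argmax. A tiny NumPy illustration of that split (the Q vector here is random, purely for illustration):

import numpy as np

num_steers, num_speeds = 21, 5
q = np.random.randn(num_steers + num_speeds)   # one row of the network output

steer_idx = int(np.argmax(q[:num_steers]))     # index into self.steers
speed_idx = int(np.argmax(q[-num_speeds:]))    # index into self.speeds
print(steer_idx, speed_idx)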
Exemplo n.º 27
0
                          self.position.unsqueeze(0))
        else:
            reward = 0
            next_state = None
        return cur_state, next_state, self.to_variable(np.array([reward]))

    def to_variable(self, x):
        if torch.cuda.is_available():
            return Variable(torch.from_numpy(x).float()).cuda()
        else:
            return Variable(torch.from_numpy(x).float())


if __name__ == '__main__':
    env = Environment(test=False, init_position=np.array([0, 1]))
    replay_memory = ReplayMemory(100)
    max_val = 0
    min_val = env.wealth
    while True:
        action = np.random.randint(3)
        cur_state, next_state, reward = env.step(action)
        replay_memory.push(cur_state, action, next_state, reward)
        if next_state is None:
            break
        max_val = max(max_val, env.wealth)
        min_val = min(min_val, env.wealth)
    print(max_val, min_val)

    # transitions = replay_memory.sample(1)
    # Transition(*zip(*transitions))
    # batch = Transition(*zip(*transitions))
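
The commented-out lines hint at the namedtuple-based batching pattern popularized by the PyTorch DQN tutorial. A self-contained sketch of that idea (the field names are assumptions, since the Transition type is not shown here):

import random
from collections import namedtuple

Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward'))

def sample_batch(memory, batch_size):
    """Sample transitions and regroup them field-wise into one batch."""
    transitions = random.sample(memory, batch_size)
    # zip(*transitions) turns a list of Transitions into per-field tuples
    return Transition(*zip(*transitions))

# Usage with a plain list standing in for the replay memory:
memory = [Transition(i, i % 3, i + 1, float(i)) for i in range(10)]
batch = sample_batch(memory, 4)
print(batch.state)
print(batch.reward)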
Exemplo n.º 28
0
class Agent:

    def __init__(self, max_memory, batch_size, action_size, atom_size, input_size, kernel_size):
        self.z = np.linspace(V_MIN, V_MAX, ATOM_SIZE)
        self.action_size = action_size
        self.epsilon = EPSILON
        self.batch_size = batch_size
        self.atom_size = atom_size
        self.memory = ReplayMemory(max_memory)
        self.brain = RainbowDQN(action_size=action_size, atom_size=atom_size,
                                input_size=input_size, kernel_size=kernel_size)
        self.target_brain = RainbowDQN(action_size=action_size, atom_size=atom_size,
                                       input_size=input_size, kernel_size=kernel_size)
        self.target_brain.load_state_dict(self.brain.state_dict())
        self.optim = optim.Adam(self.brain.parameters(), lr=0.001)

    def step(self, state_input):
        probs = self.brain(state_input)
        best_action = self.select_best_action(probs)
        return best_action

    def select_best_action(self, probs):
        numpy_probs = self.variable_to_numpy(probs)
        z_probs = np.multiply(numpy_probs, self.z)
        best_action = np.sum(z_probs, axis=1).argmax()
        # best_action = np.argmax(numpy_probs, axis=1)
        return best_action

    def store_states(self, states, best_action, reward, done, next_states):
        td = self.calculate_td(states, best_action, reward, done, next_states)
        self.memory.add_memory(states, best_action, reward, done, next_states, td=td)

    def variable_to_numpy(self, probs):
        # probs is a list of softmax prob
        numpy_probs = probs.data.numpy()
        return numpy_probs

    #TODO find out why td does not get -100 reward
    def calculate_td(self, states, best_action, reward, done, next_states):
        probs = self.brain(states)
        numpy_probs = self.variable_to_numpy(probs)
        # states_prob = np.multiply(numpy_probs, self.z)
        # states_q_value = np.sum(states_prob, axis=1)[best_action]
        states_q_value = numpy_probs[0][best_action]

        next_probs = self.brain(next_states)
        numpy_next_probs = self.variable_to_numpy(next_probs)
        # next_states_prob = np.multiply(numpy_next_probs, self.z)
        # max_next_states_q_value = np.sum(next_states_prob, axis=1).max()
        max_next_states_q_value = np.max(numpy_next_probs, axis=1)[0]

        if done:
            td = reward - states_q_value
        else:
            td = (reward + gamma * max_next_states_q_value) - states_q_value

        return abs(td)

    def learn(self):
        # make sure there are at least batch_size samples in memory before training
        if self.memory.count < self.batch_size:
            return

        tree_indexes, tds, batches = self.memory.get_memory(self.batch_size)
        total_loss = None
        for index, batch in enumerate(batches):

            # fixme fix this None type
            if batch is None:
                continue

            state_input = batch[0]
            best_action = batch[1]
            reward = batch[2]
            done = batch[3]
            next_state_input = batch[4]

            current_q = self.brain(state_input)
            next_best_action = self.step(next_state_input)
            # max_current_q = torch.max(current_q)

            next_z_prob = self.target_brain(next_state_input)
            next_z_prob = self.variable_to_numpy(next_z_prob)

            # target = reward + (1 - done) * gamma * next_z_prob.data[0][next_best_action]
            # target = Variable(torch.FloatTensor([target]))

            #TODO finish single dqn with per

            target_z_prob = np.zeros([self.action_size, ATOM_SIZE], dtype=np.float32)
            if done:
                Tz = min(V_MAX, max(V_MIN, reward))
                b = (Tz - V_MIN) / (self.z[1] - self.z[0])
                m_l = math.floor(b)
                m_u = math.ceil(b)
                target_z_prob[best_action][m_l] += (m_u - b)
                target_z_prob[best_action][m_u] += (b - m_l)
            else:
                for z_index in range(len(next_z_prob)):
                    Tz = min(V_MAX, max(V_MIN, reward + gamma * self.z[z_index]))
                    b = (Tz - V_MIN) / (self.z[1] - self.z[0])
                    m_l = math.floor(b)
                    m_u = math.ceil(b)

                    target_z_prob[best_action][m_l] += next_z_prob[next_best_action][z_index] * (m_u - b)
                    target_z_prob[best_action][m_u] += next_z_prob[next_best_action][z_index] * (b - m_l)
            target_z_prob = Variable(torch.from_numpy(target_z_prob))

            # backward propagate
            output_prob = self.brain(state_input)[0]
            loss = -torch.sum(target_z_prob * torch.log(output_prob + 1e-8))

            # loss = F.mse_loss(max_current_q, target)
            total_loss = loss if total_loss is None else total_loss + loss

            # update td
            td = self.calculate_td(state_input, best_action, reward, done, next_state_input)
            tds[index] = td

        # skip the update entirely if every sampled batch was None (see fixme above)
        if total_loss is None:
            return

        self.optim.zero_grad()
        total_loss.backward()
        self.optim.step()

        # copy the online network weights into the target network
        self.target_brain.load_state_dict(self.brain.state_dict())

        self.memory.update_memory(tree_indexes, tds)
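
The projection loop in learn() above is the categorical (C51-style) Bellman backup:
each target atom Tz spreads its probability mass over the two neighbouring support
points m_l and m_u in proportion to its distance from each. A standalone sketch of
that projection is below; the constants (V_MIN, V_MAX, the number of atoms, GAMMA)
are illustrative placeholders for the module-level values this example assumes.
Note that it also handles the case where b lands exactly on a support point
(m_l == m_u), for which the loop above adds zero mass.

import math
import numpy as np

V_MIN, V_MAX, N_ATOMS, GAMMA = -10.0, 10.0, 51, 0.99
z = np.linspace(V_MIN, V_MAX, N_ATOMS)
delta_z = z[1] - z[0]

def project_distribution(reward, done, next_prob):
    # next_prob: probabilities of the chosen next action over the N_ATOMS support points
    target = np.zeros(N_ATOMS, dtype=np.float32)
    if done:
        atoms = [(reward, 1.0)]                                  # all mass on the reward
    else:
        atoms = [(reward + GAMMA * z[j], next_prob[j]) for j in range(N_ATOMS)]
    for Tz, p in atoms:
        Tz = min(V_MAX, max(V_MIN, Tz))
        b = (Tz - V_MIN) / delta_z
        m_l, m_u = int(math.floor(b)), int(math.ceil(b))
        if m_l == m_u:                                           # Tz hit a support point exactly
            target[m_l] += p
        else:
            target[m_l] += p * (m_u - b)
            target[m_u] += p * (b - m_l)
    return target

# e.g. projecting a uniform next-state distribution through reward 1.0 keeps total mass ~1:
print(project_distribution(1.0, False, np.full(N_ATOMS, 1.0 / N_ATOMS)).sum())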
Exemplo n.º 29
0
    def __init__(self, dimO, dimA):
        dimA = list(dimA)
        dimO = list(dimO)

        nets = ddpg_nets_dm

        tau = FLAGS.tau
        discount = FLAGS.discount
        pl2norm = FLAGS.pl2norm
        l2norm = FLAGS.l2norm
        plearning_rate = FLAGS.prate
        learning_rate = FLAGS.rate
        outheta = FLAGS.outheta
        ousigma = FLAGS.ousigma

        # init replay memory
        self.rm = ReplayMemory(FLAGS.rmsize, dimO, dimA)
        # start tf session
        self.sess = tf.Session(
            config=tf.ConfigProto(inter_op_parallelism_threads=FLAGS.thread,
                                  log_device_placement=False,
                                  allow_soft_placement=True,
                                  gpu_options=tf.GPUOptions(
                                      per_process_gpu_memory_fraction=0.1)))

        # create tf computational graph
        #
        self.theta_p = nets.theta_p(dimO, dimA, FLAGS.l1size, FLAGS.l2size)
        self.theta_q = nets.theta_q(dimO, dimA, FLAGS.l1size, FLAGS.l2size)
        self.theta_pt, update_pt = exponential_moving_averages(
            self.theta_p, tau)
        self.theta_qt, update_qt = exponential_moving_averages(
            self.theta_q, tau)

        obs = tf.placeholder(tf.float32, [None] + dimO, "obs")
        act_test = nets.policy(obs, self.theta_p)

        # explore
        noise_init = tf.zeros([1] + dimA)
        noise_var = tf.Variable(noise_init)
        self.ou_reset = noise_var.assign(noise_init)
        noise = noise_var.assign_sub((outheta) * noise_var -
                                     tf.random_normal(dimA, stddev=ousigma))
        act_expl = act_test + noise

        # test
        q = nets.qfunction(obs, act_test, self.theta_q)
        # training

        # q optimization
        act_train = tf.placeholder(tf.float32, [FLAGS.bsize] + dimA,
                                   "act_train")
        rew = tf.placeholder(tf.float32, [FLAGS.bsize], "rew")
        obs2 = tf.placeholder(tf.float32, [FLAGS.bsize] + dimO, "obs2")
        term2 = tf.placeholder(tf.bool, [FLAGS.bsize], "term2")

        # policy loss
        act_train_policy = nets.policy(obs, self.theta_p)
        q_train_policy = nets.qfunction(obs, act_train_policy, self.theta_q)
        meanq = tf.reduce_mean(q_train_policy, 0)
        wd_p = tf.add_n([pl2norm * tf.nn.l2_loss(var)
                         for var in self.theta_p])  # weight decay
        loss_p = -meanq + wd_p
        # policy optimization
        optim_p = tf.train.AdamOptimizer(learning_rate=plearning_rate,
                                         epsilon=1e-4)
        grads_and_vars_p = optim_p.compute_gradients(loss_p,
                                                     var_list=self.theta_p)
        optimize_p = optim_p.apply_gradients(grads_and_vars_p)
        with tf.control_dependencies([optimize_p]):
            train_p = tf.group(update_pt)

        # q
        q_train = nets.qfunction(obs, act_train, self.theta_q)
        # q targets
        act2 = nets.policy(obs2, theta=self.theta_pt)
        q2 = nets.qfunction(obs2, act2, theta=self.theta_qt)
        q_target = tf.stop_gradient(tf.select(term2, rew, rew + discount * q2))
        # q_target = tf.stop_gradient(rew + discount * q2)
        # q loss
        td_error = q_train - q_target
        ms_td_error = tf.reduce_mean(tf.square(td_error), 0)
        wd_q = tf.add_n([l2norm * tf.nn.l2_loss(var)
                         for var in self.theta_q])  # weight decay
        loss_q = ms_td_error + wd_q
        # q optimization
        optim_q = tf.train.AdamOptimizer(learning_rate=learning_rate,
                                         epsilon=1e-4)
        grads_and_vars_q = optim_q.compute_gradients(loss_q,
                                                     var_list=self.theta_q)
        optimize_q = optim_q.apply_gradients(grads_and_vars_q)
        with tf.control_dependencies([optimize_q]):
            train_q = tf.group(update_qt)

        summary_writer = tf.train.SummaryWriter(
            os.path.join(FLAGS.outdir, 'board'), self.sess.graph)
        summary_list = []
        summary_list.append(
            tf.scalar_summary('Qvalue', tf.reduce_mean(q_train)))
        summary_list.append(tf.scalar_summary('loss', ms_td_error))
        summary_list.append(tf.scalar_summary('reward', tf.reduce_mean(rew)))

        # tf functions
        with self.sess.as_default():
            self._act_test = Fun(obs, act_test)
            self._act_expl = Fun(obs, act_expl)
            self._reset = Fun([], self.ou_reset)
            self._train = Fun([obs, act_train, rew, obs2, term2],
                              [train_p, train_q, loss_q], summary_list,
                              summary_writer)

        # initialize tf variables
        self.saver = tf.train.Saver(max_to_keep=1)
        ckpt = tf.train.latest_checkpoint(FLAGS.outdir + "/tf")
        if ckpt:
            self.saver.restore(self.sess, ckpt)
        else:
            self.sess.run(tf.initialize_all_variables())

        self.sess.graph.finalize()

        self.t = 0  # global training time (number of observations)
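
The exponential_moving_averages helper used above is not shown in this listing. A
plausible minimal sketch of what such a helper returns for DDPG-style soft target
updates (an assumption, not the project's actual implementation): it creates shadow
copies of the parameters and an op that moves them a fraction tau towards the online
values on every call.

import tensorflow as tf

def exponential_moving_averages_sketch(theta, tau):
    # shadow <- shadow - tau * (shadow - theta), i.e. decay = 1 - tau
    ema = tf.train.ExponentialMovingAverage(decay=1.0 - tau)
    update_op = ema.apply(theta)
    theta_target = [ema.average(v) for v in theta]  # read-only shadow variables
    return theta_target, update_op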
Exemplo n.º 30
0
class Agent:

    def __init__(self, dimO, dimA):
        dimA, dimO = dimA[0], dimO[0]
        self.dimA = dimA
        self.dimO = dimO

        tau = FLAGS.tau
        discount = FLAGS.discount
        l2norm = FLAGS.l2norm
        learning_rate = FLAGS.rate
        outheta = FLAGS.outheta
        ousigma = FLAGS.ousigma

        if FLAGS.icnn_opt == 'adam':
            self.opt = self.adam
        elif FLAGS.icnn_opt == 'bundle_entropy':
            self.opt = self.bundle_entropy
        else:
            raise RuntimeError("Unrecognized ICNN optimizer: "+FLAGS.icnn_opt)

        self.rm = ReplayMemory(FLAGS.rmsize, dimO, dimA)
        self.sess = tf.Session(config=tf.ConfigProto(
            inter_op_parallelism_threads=FLAGS.thread,
            log_device_placement=False,
            allow_soft_placement=True,
            gpu_options=tf.GPUOptions(allow_growth=True)))

        self.noise = np.zeros(self.dimA)

        obs = tf.placeholder(tf.float32, [None, dimO], "obs")
        act = tf.placeholder(tf.float32, [None, dimA], "act")
        rew = tf.placeholder(tf.float32, [None], "rew")
        with tf.variable_scope('q'):
            negQ = self.negQ(obs, act)
        negQ_entr = negQ - entropy(act)
        q = -negQ
        q_entr = -negQ_entr
        act_grad, = tf.gradients(negQ, act)
        act_grad_entr, = tf.gradients(negQ_entr, act)

        obs_target = tf.placeholder(tf.float32, [None, dimO], "obs_target")
        act_target = tf.placeholder(tf.float32, [None, dimA], "act_target")
        term_target = tf.placeholder(tf.bool, [None], "term_target")
        with tf.variable_scope('q_target'):
            negQ_target = self.negQ(obs_target, act_target)
        negQ_entr_target = negQ_target - entropy(act_target)
        act_target_grad, = tf.gradients(negQ_target, act_target)
        act_entr_target_grad, = tf.gradients(negQ_entr_target, act_target)
        q_target = -negQ_target
        q_target_entr = -negQ_entr_target

        if FLAGS.icnn_opt == 'adam':
            y = tf.where(term_target, rew, rew + discount * q_target_entr)
            y = tf.maximum(q_entr - 1., y)
            y = tf.minimum(q_entr + 1., y)
            y = tf.stop_gradient(y)
            td_error = q_entr - y
        elif FLAGS.icnn_opt == 'bundle_entropy':
            q_target = tf.where(term_target, rew, rew + discount * q_target_entr)
            q_target = tf.maximum(q_entr - 1., q_target)
            q_target = tf.minimum(q_entr + 1., q_target)
            q_target = tf.stop_gradient(q_target)
            td_error = q_entr - q_target
        else:
            raise RuntimeError("Needs checking.")
        ms_td_error = tf.reduce_mean(tf.square(td_error), 0)

        regLosses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES, scope='q/')
        loss_q = ms_td_error + l2norm*tf.reduce_sum(regLosses)

        self.theta_ = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='q/')
        self.theta_cvx_ = [v for v in self.theta_
                           if 'proj' in v.name and 'W:' in v.name]
        self.makeCvx = [v.assign(tf.abs(v)) for v in self.theta_cvx_]
        self.proj = [v.assign(tf.maximum(v, 0)) for v in self.theta_cvx_]
        # self.proj = [v.assign(tf.abs(v)) for v in self.theta_cvx_]
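        # Note: constraining these 'proj' weights to be nonnegative (abs at
        # initialization via makeCvx, clipped to >= 0 after every update via proj)
        # is what keeps negQ convex in the action, so the inner optimization over
        # actions (adam / bundle_entropy below) remains a convex problem.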

        self.theta_target_ = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                               scope='q_target/')
        update_target = [theta_target_i.assign_sub(tau*(theta_target_i-theta_i))
                    for theta_i, theta_target_i in zip(self.theta_, self.theta_target_)]

        optim_q = tf.train.AdamOptimizer(learning_rate=learning_rate)
        grads_and_vars_q = optim_q.compute_gradients(loss_q)
        optimize_q = optim_q.apply_gradients(grads_and_vars_q)


        summary_writer = tf.summary.FileWriter(os.path.join(FLAGS.outdir, 'board'),
                                                self.sess.graph)
        if FLAGS.icnn_opt == 'adam':
            tf.summary.scalar('Qvalue', tf.reduce_mean(q))
        elif FLAGS.icnn_opt == 'bundle_entropy':
            tf.summary.scalar('Qvalue', tf.reduce_mean(q_entr))
        tf.summary.scalar('loss', ms_td_error)
        tf.summary.scalar('reward', tf.reduce_mean(rew))
        merged = tf.summary.merge_all()

        # tf functions
        with self.sess.as_default():
            self._train = Fun([obs, act, rew, obs_target, act_target, term_target],
                              [optimize_q, update_target, loss_q],
                              merged, summary_writer)
            self._fg = Fun([obs, act], [negQ, act_grad])
            self._fg_target = Fun([obs_target, act_target], [negQ_target, act_target_grad])
            self._fg_entr = Fun([obs, act], [negQ_entr, act_grad_entr])
            self._fg_entr_target = Fun([obs_target, act_target],
                                       [negQ_entr_target, act_entr_target_grad])

        # initialize tf variables
        self.saver = tf.train.Saver(max_to_keep=1)
        ckpt = tf.train.latest_checkpoint(FLAGS.outdir + "/tf")
        if ckpt:
            self.saver.restore(self.sess, ckpt)
        else:
            self.sess.run(tf.initialize_all_variables())
            self.sess.run(self.makeCvx)
            self.sess.run([theta_target_i.assign(theta_i)
                    for theta_i, theta_target_i in zip(self.theta_, self.theta_target_)])

        self.sess.graph.finalize()

        self.t = 0  # global training time (number of observations)

    def bundle_entropy(self, func, obs):
        act = np.ones((obs.shape[0], self.dimA)) * 0.5
        def fg(x):
            value, grad = func(obs, 2 * x - 1)
            grad *= 2
            return value, grad

        act = bundle_entropy.solveBatch(fg, act)[0]
        act = 2 * act - 1

        return act

    def adam(self, func, obs, plot=False):
        # if npr.random() < 1./20:
        #     plot = True
        b1 = 0.9
        b2 = 0.999
        lam = 0.5
        eps = 1e-8
        alpha = 0.01
        nBatch = obs.shape[0]
        act = np.zeros((nBatch, self.dimA))
        m = np.zeros_like(act)
        v = np.zeros_like(act)

        b1t, b2t = 1., 1.
        act_best, a_diff, f_best = [None]*3
        hist = {'act': [], 'f': [], 'g': []}
        for i in range(1000):
            f, g = func(obs, act)
            if plot:
                hist['act'].append(act.copy())
                hist['f'].append(f)
                hist['g'].append(g)

            if i == 0:
                act_best = act.copy()
                f_best = f.copy()
            else:
                prev_act_best = act_best.copy()
                I = (f < f_best)
                act_best[I] = act[I]
                f_best[I] = f[I]
                a_diff_i = np.mean(np.linalg.norm(act_best - prev_act_best, axis=1))
                a_diff = a_diff_i if a_diff is None \
                         else lam*a_diff + (1.-lam)*a_diff_i
                # print(a_diff_i, a_diff, np.sum(f))
                if a_diff < 1e-3 and i > 5:
                    #print('  + Adam took {} iterations'.format(i))
                    if plot:
                        self.adam_plot(func, obs, hist)
                    return act_best

            m = b1 * m + (1. - b1) * g
            v = b2 * v + (1. - b2) * (g * g)
            b1t *= b1
            b2t *= b2
            mhat = m/(1.-b1t)
            vhat = v/(1.-b2t)

            act -= alpha * mhat / (np.sqrt(vhat) + eps)  # use the bias-corrected second moment
            # act = np.clip(act, -1, 1)
            act = np.clip(act, -1.+1e-8, 1.-1e-8)

        #print('  + Warning: Adam did not converge.')
        if plot:
            self.adam_plot(func, obs, hist)
        return act_best

    def adam_plot(self, func, obs, hist):
        hist['act'] = np.array(hist['act']).T
        hist['f'] = np.array(hist['f']).T
        hist['g'] = np.array(hist['g']).T
        if self.dimA == 1:
            xs = np.linspace(-1.+1e-8, 1.-1e-8, 100)
            ys = [func(obs[[0],:], [[xi]])[0] for xi in xs]
            fig = plt.figure()
            plt.plot(xs, ys)
            plt.plot(hist['act'][0,0,:], hist['f'][0,:], label='Adam')
            plt.legend()
            fname = os.path.join(FLAGS.outdir, 'adamPlt.png')
            #print("Saving Adam plot to {}".format(fname))
            plt.savefig(fname)
            plt.close(fig)
        elif self.dimA == 2:
            assert(False)
        else:
            xs = npr.uniform(-1., 1., (5000, self.dimA))
            ys = np.array([func(obs[[0],:], [xi])[0] for xi in xs])
            epi = np.hstack((xs, ys))
            pca = PCA(n_components=2).fit(epi)
            W = pca.components_[:,:-1]
            xs_proj = xs.dot(W.T)
            fig = plt.figure()

            X = Y = np.linspace(xs_proj.min(), xs_proj.max(), 100)
            Z = griddata(xs_proj[:,0], xs_proj[:,1], ys.ravel(),
                         X, Y, interp='linear')

            plt.contourf(X, Y, Z, 15)
            plt.colorbar()

            adam_x = hist['act'][:,0,:].T
            adam_x = adam_x.dot(W.T)
            plt.plot(adam_x[:,0], adam_x[:,1], label='Adam', color='k')
            plt.legend()

            fname = os.path.join(FLAGS.outdir, 'adamPlt.png')
            #print("Saving Adam plot to {}".format(fname))
            plt.savefig(fname)
            plt.close(fig)

    def reset(self, obs):
        self.noise = np.zeros(self.dimA)
        self.observation = obs  # initial observation

    def act(self, test=False):
        with self.sess.as_default():
            #print('--- Selecting action, test={}'.format(test))
            obs = np.expand_dims(self.observation, axis=0)

            if FLAGS.icnn_opt == 'adam':
                f = self._fg_entr
                # f = self._fg
            elif FLAGS.icnn_opt == 'bundle_entropy':
                f = self._fg
            else:
                raise RuntimeError("Unrecognized ICNN optimizer: "+FLAGS.icnn_opt)

            tflearn.is_training(False)
            action = self.opt(f, obs)
            tflearn.is_training(not test)

            if not test:
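                # Discretized Ornstein-Uhlenbeck exploration noise:
                #   noise <- (1 - outheta) * noise + ousigma * N(0, I)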
                self.noise -= FLAGS.outheta*self.noise - \
                              FLAGS.ousigma*npr.randn(self.dimA)
                action += self.noise
            action = np.clip(action, -1, 1)

            self.action = np.atleast_1d(np.squeeze(action, axis=0))
            return self.action

    def observe(self, rew, term, obs2, test=False):
        obs1 = self.observation
        self.observation = obs2

        # train
        if not test:
            self.t = self.t + 1

            self.rm.enqueue(obs1, term, self.action, rew)

            if self.t > FLAGS.warmup:
                for i in range(FLAGS.iter):
                    loss = self.train()

    def train(self):
        with self.sess.as_default():
            obs, act, rew, ob2, term2, info = self.rm.minibatch(size=FLAGS.bsize)
            if FLAGS.icnn_opt == 'adam':
                # f = self._opt_train_entr
                f = self._fg_entr_target
                # f = self._fg_target
            elif FLAGS.icnn_opt == 'bundle_entropy':
                f = self._fg_target
            else:
                raise RuntimeError("Unrecognized ICNN optimizer: "+FLAGS.icnn_opt)
            #print('--- Optimizing for training')
            tflearn.is_training(False)
            act2 = self.opt(f, ob2)
            tflearn.is_training(True)

            _, _, loss = self._train(obs, act, rew, ob2, act2, term2,
                                     log=FLAGS.summary, global_step=self.t)
            self.sess.run(self.proj)
            return loss

    def negQ(self, x, y, reuse=False):
        szs = [FLAGS.l1size, FLAGS.l2size]
        assert(len(szs) >= 1)
        fc = tflearn.fully_connected
        bn = tflearn.batch_normalization
        lrelu = tflearn.activations.leaky_relu

        if reuse:
            tf.get_variable_scope().reuse_variables()

        nLayers = len(szs)
        us = []
        zs = []
        z_zs = []
        z_ys = []
        z_us = []

        reg = 'L2'

        prevU = x
        for i in range(nLayers):
            with tf.variable_scope('u'+str(i)) as s:
                u = fc(prevU, szs[i], reuse=reuse, scope=s, regularizer=reg)
                if i < nLayers-1:
                    u = tf.nn.relu(u)
                    if FLAGS.icnn_bn:
                        u = bn(u, reuse=reuse, scope=s, name='bn')
            variable_summaries(u, suffix='u{}'.format(i))
            us.append(u)
            prevU = u

        prevU, prevZ = x, y
        for i in range(nLayers+1):
            sz = szs[i] if i < nLayers else 1
            z_add = []
            if i > 0:
                with tf.variable_scope('z{}_zu_u'.format(i)) as s:
                    zu_u = fc(prevU, szs[i-1], reuse=reuse, scope=s,
                              activation='relu', bias=True,
                              regularizer=reg, bias_init=tf.constant_initializer(1.))
                    variable_summaries(zu_u, suffix='zu_u{}'.format(i))
                with tf.variable_scope('z{}_zu_proj'.format(i)) as s:
                    z_zu = fc(tf.multiply(prevZ, zu_u), sz, reuse=reuse, scope=s,
                              bias=False, regularizer=reg)
                    variable_summaries(z_zu, suffix='z_zu{}'.format(i))
                z_zs.append(z_zu)
                z_add.append(z_zu)

            with tf.variable_scope('z{}_yu_u'.format(i)) as s:
                yu_u = fc(prevU, self.dimA, reuse=reuse, scope=s, bias=True,
                          regularizer=reg, bias_init=tf.constant_initializer(1.))
                variable_summaries(yu_u, suffix='yu_u{}'.format(i))
            with tf.variable_scope('z{}_yu'.format(i)) as s:
                z_yu = fc(tf.multiply(y, yu_u), sz, reuse=reuse, scope=s, bias=False,
                          regularizer=reg)
                z_ys.append(z_yu)
                variable_summaries(z_yu, suffix='z_yu{}'.format(i))
            z_add.append(z_yu)

            with tf.variable_scope('z{}_u'.format(i)) as s:
                z_u = fc(prevU, sz, reuse=reuse, scope=s,
                         bias=True, regularizer=reg,
                         bias_init=tf.constant_initializer(0.))
                variable_summaries(z_u, suffix='z_u{}'.format(i))
            z_us.append(z_u)
            z_add.append(z_u)

            z = tf.add_n(z_add)
            variable_summaries(z, suffix='z{}_preact'.format(i))
            if i < nLayers:
                # z = tf.nn.relu(z)
                z = lrelu(z, alpha=FLAGS.lrelu)
                variable_summaries(z, suffix='z{}_act'.format(i))

            zs.append(z)
            prevU = us[i] if i < nLayers else None
            prevZ = z

        z = tf.reshape(z, [-1], name='energies')
        return z


    def __del__(self):
        self.sess.close()
Exemplo n.º 31
0
class DQN(object):
    """
    A starter class to implement the Deep Q Network algorithm

    TODOs specify the main areas where logic needs to be added.

    If you get a Box2D error using the pip version, try installing from source:
    > git clone https://github.com/pybox2d/pybox2d
    > pip install -e .

    """
    def __init__(self, env):

        self.env = env
        self.sess = tf.Session()

        # A few starter hyperparameters
        self.batch_size = 512
        self.gamma = 0.9
        # If using e-greedy exploration
        self.eps_start = 0.9
        self.eps_end = 0.05
        self.eps_decay = 5000  # in learning steps
        # If using a target network
        self.clone_steps = 200
        self.eps_value_list = np.linspace(self.eps_start, self.eps_end,
                                          self.eps_decay)
        self.eps_value = self.eps_start
        # memory
        self.replay_memory = ReplayMemory(100000)
        # Perhaps you want to have some samples in the memory before starting to train?
        self.min_replay_size = 10000

        self.cost_his = []
        self.eps_his = []
        self.reward_his = []
        self.state_space_size = self.env.observation_space.shape[0]
        self.action_space_size = self.env.action_space.n
        self.lr = 0.001

        self.learn_step = 0

        # define your training operations here...
        self.observation_input = tf.placeholder(
            tf.float32, shape=[None, self.state_space_size])
        self.observation_input_ = tf.placeholder(tf.float32,
                                                 [None, self.state_space_size])
        self.build_model(self.observation_input)
        t_params = tf.get_collection('target_net_params')
        e_params = tf.get_collection('eval_net_params')
        self.replace_target_op = [
            tf.assign(t, e) for t, e in zip(t_params, e_params)
        ]

        # define your update operations here...

        self.num_episodes = 5000
        self.num_steps = 0

        self.saver = tf.train.Saver(tf.trainable_variables())
        self.sess.run(tf.global_variables_initializer())

    def build_model(self, observation_input, scope='train'):
        """
        TODO: Define the tensorflow model

        Hint: You will need to define an input placeholder and output Q-values

        Currently returns an op that gives all zeros.
        """
        self.q_target = tf.placeholder(tf.float32,
                                       [None, self.action_space_size])
        with tf.variable_scope('eval_net'):

            collection_names = [
                'eval_net_params', tf.GraphKeys.GLOBAL_VARIABLES
            ]
            l1_size = 10
            w_initializer = tf.random_normal_initializer(0., 0.3)
            b_initializer = tf.constant_initializer(0.1)

            with tf.variable_scope('l1'):
                w1 = tf.get_variable('w1', [self.state_space_size, l1_size],
                                     initializer=w_initializer,
                                     collections=collection_names)
                b1 = tf.get_variable('b1', [1, l1_size],
                                     initializer=b_initializer,
                                     collections=collection_names)
                l1 = tf.nn.relu(tf.matmul(self.observation_input, w1) + b1)

            with tf.variable_scope('l2'):
                w2 = tf.get_variable('w2', [l1_size, self.action_space_size],
                                     initializer=w_initializer,
                                     collections=collection_names)
                b2 = tf.get_variable('b2', [1, self.action_space_size],
                                     initializer=b_initializer,
                                     collections=collection_names)
                self.q_eval = tf.matmul(l1, w2) + b2

        with tf.variable_scope('loss'):
            self.loss = tf.losses.huber_loss(self.q_target, self.q_eval)
        with tf.variable_scope('train'):
            self._train_op = tf.train.RMSPropOptimizer(self.lr).minimize(
                self.loss)

        with tf.variable_scope('target_net'):
            collection_names = [
                'target_net_params', tf.GraphKeys.GLOBAL_VARIABLES
            ]

            with tf.variable_scope('l1'):
                w1 = tf.get_variable('w1', [self.state_space_size, l1_size],
                                     initializer=w_initializer,
                                     collections=collection_names)
                b1 = tf.get_variable('b1', [1, l1_size],
                                     initializer=b_initializer,
                                     collections=collection_names)
                l1 = tf.nn.relu(tf.matmul(self.observation_input_, w1) + b1)

            with tf.variable_scope('l2'):
                w2 = tf.get_variable('w2', [l1_size, self.action_space_size],
                                     initializer=w_initializer,
                                     collections=collection_names)
                b2 = tf.get_variable('b2', [1, self.action_space_size],
                                     initializer=b_initializer,
                                     collections=collection_names)
                self.q_next = tf.matmul(l1, w2) + b2

    def _reshape_state(self, state):
        return state.reshape(1, len(state))

    def select_action(self, obs, evaluation_mode=False):
        """
        TODO: Select an action given an observation using your model. This
        should include any exploration strategy you wish to implement

        If evaluation_mode=True, then this function should behave as if training is
        finished. This may mean reducing exploration, etc.

        Currently returns a random action.
        """
        observation = obs[np.newaxis, :]
        if evaluation_mode:
            actions_value = self.sess.run(
                self.q_eval, feed_dict={self.observation_input: observation})
            action = np.argmax(actions_value)
            return action
        if np.random.uniform() > self.eps_value:
            #print('Exploiting')
            actions_value = self.sess.run(
                self.q_eval, feed_dict={self.observation_input: observation})
            action = np.argmax(actions_value)
        else:
            #print('Exploring')
            action = self.env.action_space.sample()
        return action

    def update(self):
        """
        TODO: Implement the functionality to update the network according to the
        Q-learning rule
        """
        if self.learn_step % self.clone_steps == 0:
            self.sess.run(self.replace_target_op)

        batch_memory = self.replay_memory.sample(self.batch_size)
        observation_input_ = np.concatenate(
            [[transition.next_state for transition in batch_memory]], axis=1)
        observation_input = np.concatenate(
            [[transition.state for transition in batch_memory]], axis=1)

        q_next, q_eval = self.sess.run(
            [self.q_next, self.q_eval],
            feed_dict={
                self.observation_input_: observation_input_,
                self.observation_input: observation_input,
            })

        q_target = q_eval.copy()
        batch_index = np.arange(self.batch_size, dtype=np.int32)
        batch_action = np.asarray(
            [transition.action for transition in batch_memory])
        batch_reward = np.asarray(
            [transition.reward for transition in batch_memory])
        eval_act_index = batch_action
        reward = batch_reward
        q_target[batch_index,
                 eval_act_index] = reward + self.gamma * np.max(q_next, axis=1)
        _, self.cost = self.sess.run([self._train_op, self.loss],
                                     feed_dict={
                                         self.observation_input:
                                         observation_input,
                                         self.q_target: q_target
                                     })
        self.cost_his.append(self.cost)

        if self.learn_step < self.eps_decay:
            self.eps_value = self.eps_value_list[self.learn_step %
                                                 self.eps_decay]
        self.eps_his.append(self.eps_value)
        self.learn_step += 1

    def plot_loss(self):
        import matplotlib.pyplot as plt
        f, axarr = plt.subplots(2, sharex=True)
        axarr[0].plot(np.arange(len(self.cost_his)), self.cost_his)
        axarr[0].set_title('Learning Curve')
        axarr[1].plot(np.arange(len(self.cost_his)), self.eps_his)
        axarr[0].set_ylabel('Cost')
        axarr[1].set_ylabel('Epsilon')
        plt.xlabel('training steps')
        plt.show()

    def train(self):
        """
        The training loop. This runs a single episode.

        TODO: Implement the following as desired:
            1. Storing transitions to the ReplayMemory
            2. Updating the network at some frequency
            3. Backing up the current parameters to a reference, target network
        """
        done = False
        obs = self.env.reset()
        while not done:

            # self.eps_value = self.eps_value_list[eps_counter]
            action = self.select_action(obs, evaluation_mode=False)
            next_obs, reward, done, info = self.env.step(action)
            self.replay_memory.push(obs, action, next_obs, reward, done)
            if (self.num_steps > self.min_replay_size) and (self.num_steps % 50
                                                            == 0):
                self.update()
            obs = next_obs
            self.num_steps += 1

    def eval(self, save_snapshot=True):
        """
        Run an evaluation episode, this will call
        """
        total_reward = 0.0
        ep_steps = 0
        done = False
        obs = self.env.reset()
        print(self.eps_value)
        while not done:
            self.env.render()
            action = self.select_action(obs, evaluation_mode=True)
            obs, reward, done, info = self.env.step(action)
            total_reward += reward
        print("Evaluation episode: ", total_reward)
        if save_snapshot:
            print("Saving state with Saver")
            self.saver.save(self.sess,
                            'models/dqn-model',
                            global_step=self.num_episodes)
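
One detail the update() method above leaves out: the done flag is stored in the
replay memory but never used when forming the target, so terminal transitions still
bootstrap from max q_next. A small standalone sketch of the usual target computation
with the bootstrap term masked (the array names here are hypothetical fields pulled
from such sampled transitions):

import numpy as np

def q_learning_targets(q_eval, q_next, actions, rewards, dones, gamma=0.9):
    # q_eval, q_next: (batch, n_actions) Q-values from the online / target networks
    # actions, rewards, dones: (batch,) arrays taken from the sampled transitions
    q_target = q_eval.copy()
    batch_index = np.arange(q_eval.shape[0])
    bootstrap = (1.0 - dones.astype(np.float32)) * np.max(q_next, axis=1)
    q_target[batch_index, actions] = rewards + gamma * bootstrap
    return q_target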
Exemplo n.º 32
0
adv_ph = tf.placeholder(dtype=tf.float32, shape=(None, ))
ret_ph = tf.placeholder(dtype=tf.float32, shape=(None, ))
logp_old_ph = tf.placeholder(dtype=tf.float32, shape=(None, ))

# Main outputs from computation graph
pi, logp, logp_pi, v = mlp_actor_critic(x_ph, a_ph)

# Need all placeholders in *this* order later (to zip with data from buffer)
all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph]

# Every step, get: action, value, and logprob
get_action_ops = [pi, v, logp_pi]

# Experience buffer
local_steps_per_epoch = int(args.steps / num_procs())
memory = ReplayMemory(obs_dim, act_dim, local_steps_per_epoch, args.gamma,
                      args.lam)

# Count variables
var_counts = tuple(count_vars(scope) for scope in ['pi', 'v'])

# Objective functions
ratio = tf.exp(logp - logp_old_ph)  # pi(a|s) / pi_old(a|s)
min_adv = tf.where(adv_ph > 0, (1 + args.clip_ratio) * adv_ph,
                   (1 - args.clip_ratio) * adv_ph)
actor_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv))
critic_loss = tf.reduce_mean((ret_ph - v)**2)

# Info (useful to watch during learning)
approx_kl = tf.reduce_mean(
    logp_old_ph - logp)  # a sample estimate for KL-divergence, easy to compute
approx_ent = tf.reduce_mean(-logp)  # a sample estimate for entropy, also easy to compute
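
The min_adv construction above is a compact way of writing the PPO clipped surrogate.
An equivalent formulation, reusing the same ratio, adv_ph and args.clip_ratio defined
above, that some readers may find more familiar:

clipped_ratio = tf.clip_by_value(ratio, 1.0 - args.clip_ratio, 1.0 + args.clip_ratio)
actor_loss_alt = -tf.reduce_mean(tf.minimum(ratio * adv_ph, clipped_ratio * adv_ph))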
Exemplo n.º 33
0
def train(active_mv):

    senv = ShapeNetEnv(FLAGS)
    replay_mem = ReplayMemory(FLAGS)

    #### for debug
    #a = np.array([[1,0,1],[0,0,0]])
    #b = np.array([[1,0,1],[0,1,0]])
    #print('IoU: {}'.format(replay_mem.calu_IoU(a, b)))
    #sys.exit()
    #### for debug

    log_string('====== Starting burning in memories ======')
    burn_in(senv, replay_mem)
    log_string('====== Done. {} trajectories burnt in ======'.format(
        FLAGS.burn_in_length))

    #epsilon = FLAGS.init_eps
    K_single = np.asarray([[420.0, 0.0, 112.0], [0.0, 420.0, 112.0],
                           [0.0, 0.0, 1]])
    K_list = np.tile(K_single[None, None, ...],
                     (1, FLAGS.max_episode_length, 1, 1))

    ### burn in(pretrain) for MVnet
    if FLAGS.burn_in_iter > 0:
        for i in xrange(FLAGS.burn_in_iter):
            if not FLAGS.random_pretrain:
                mvnet_input = replay_mem.get_batch_list(FLAGS.batch_size)
            else:
                mvnet_input = replay_mem.get_batch_list_random(
                    senv, FLAGS.batch_size)
            tic = time.time()
            out_stuff = active_mv.run_step(mvnet_input,
                                           mode='burnin',
                                           is_training=True)
            summs_burnin = burnin_log(i, out_stuff, time.time() - tic)
            for summ in summs_burnin:
                active_mv.train_writer.add_summary(summ, i)

    rollout_obj = Rollout(active_mv, senv, replay_mem, FLAGS)

    for i_idx in xrange(FLAGS.max_iter):

        t0 = time.time()

        rollout_obj.go(i_idx, verbose=True, add_to_mem=True, is_train=True)
        t1 = time.time()

        replay_mem.enable_gbl()
        mvnet_input = replay_mem.get_batch_list(FLAGS.batch_size)
        t2 = time.time()

        out_stuff = active_mv.run_step(mvnet_input,
                                       mode='train',
                                       is_training=True)
        replay_mem.disable_gbl()
        t3 = time.time()

        train_log(i_idx, out_stuff, (t0, t1, t2, t3))

        active_mv.train_writer.add_summary(out_stuff.merged_train, i_idx)

        if i_idx % FLAGS.save_every_step == 0 and i_idx > 0:
            save(active_mv, i_idx, i_idx, i_idx)

        if i_idx % FLAGS.test_every_step == 0 and i_idx > 0:
            print('Evaluating active policy')
            evaluate(active_mv,
                     FLAGS.test_episode_num,
                     replay_mem,
                     i_idx,
                     rollout_obj,
                     mode='active')
            print('Evaluating random policy')
            evaluate(active_mv,
                     FLAGS.test_episode_num,
                     replay_mem,
                     i_idx,
                     rollout_obj,
                     mode='random')
Exemplo n.º 34
0
class Driver(object):
    '''
    A driver object for the SCRC
    '''

    def __init__(self, args):
        '''Constructor'''
        self.WARM_UP = 0
        self.QUALIFYING = 1
        self.RACE = 2
        self.UNKNOWN = 3
        self.stage = args.stage
        
        self.parser = msgParser.MsgParser()
        self.state = carState.CarState()
        self.control = carControl.CarControl()

        self.steers = [-1.0, -0.8, -0.6, -0.5, -0.4, -0.3, -0.2, -0.15, -0.1, -0.05, 0.0, 0.05, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.6, 0.8, 1.0]
        self.speeds = [-1.0, -0.5, 0.0, 0.5, 1.0]
        self.num_inputs = 19
        self.num_steers = len(self.steers)
        self.num_speeds = len(self.speeds)
        self.num_actions = self.num_steers + self.num_speeds
        
        self.net = DeepQNetwork(self.num_inputs, self.num_steers, self.num_speeds, args)
        self.mem = ReplayMemory(args.replay_size, self.num_inputs, args)
        self.minibatch_size = args.batch_size

        if args.load_weights:
            self.net.load_weights(args.load_weights)
        self.save_weights_prefix = args.save_weights_prefix
        self.pretrained_network = args.pretrained_network

        self.steer_lock = 0.785398
        self.max_speed = 100

        self.algorithm = args.algorithm
        self.device = args.device
        self.mode = args.mode
        self.maxwheelsteps = args.maxwheelsteps
        
        self.enable_training = args.enable_training
        self.enable_exploration = args.enable_exploration

        self.total_train_steps = 0
        self.exploration_decay_steps = args.exploration_decay_steps
        self.exploration_rate_start = args.exploration_rate_start
        self.exploration_rate_end = args.exploration_rate_end

        self.show_sensors = args.show_sensors
        self.show_qvalues = args.show_qvalues

        self.episode = 0
        self.onRestart()
        
        if self.show_sensors:
            from sensorstats import Stats
            self.stats = Stats(inevery=8)
        
        if self.show_qvalues:
            from plotq import PlotQ
            self.plotq = PlotQ(self.num_steers, self.num_speeds)

        if self.device == 'wheel':
            from wheel import Wheel
            self.wheel = Wheel(args.joystick_nr, args.autocenter, args.gain, args.min_force, args.max_force)

    def init(self):
        '''Return init string with rangefinder angles'''
        self.angles = [0 for x in range(19)]
        
        for i in range(5):
            self.angles[i] = -90 + i * 15
            self.angles[18 - i] = 90 - i * 15
        
        for i in range(5, 9):
            self.angles[i] = -20 + (i-5) * 5
            self.angles[18 - i] = 20 - (i-5) * 5
        
        return self.parser.stringify({'init': self.angles})

    def getState(self):
        #state = np.array([self.state.getSpeedX() / 200.0, self.state.getAngle(), self.state.getTrackPos()])
        #state = np.array(self.state.getTrack() + [self.state.getSpeedX()]) / 200.0
        state = np.array(self.state.getTrack()) / 200.0
        assert state.shape == (self.num_inputs,)
        return state

    def getReward(self, terminal):
        if terminal:
            reward = -1000
        else:
            dist = self.state.getDistFromStart()
            if self.prev_dist is not None:
                reward = max(0, dist - self.prev_dist) * 10
                assert reward >= 0, "reward: %f" % reward
            else:
                reward = 0
            self.prev_dist = dist
            
            #reward -= self.state.getTrackPos()
            #print "reward:", reward
        
        return reward

    def getTerminal(self):
        return np.all(np.array(self.state.getTrack()) == -1)

    def getEpsilon(self):
        # calculate decaying exploration rate
        if self.total_train_steps < self.exploration_decay_steps:
            return self.exploration_rate_start - self.total_train_steps * (self.exploration_rate_start - self.exploration_rate_end) / self.exploration_decay_steps
        else:
            return self.exploration_rate_end
 
    def drive(self, msg):
        # parse incoming message
        self.state.setFromMsg(msg)
        
        # show sensors
        if self.show_sensors:
            self.stats.update(self.state)
        
        # fetch state, calculate reward and terminal indicator  
        state = self.getState()
        terminal = self.getTerminal()
        reward = self.getReward(terminal)
        #print "reward:", reward

        # store new experience in replay memory
        if self.enable_training and self.prev_state is not None and self.prev_steer is not None and self.prev_speed is not None:
            self.mem.add(self.prev_state, self.prev_steer, self.prev_speed, reward, state, terminal)

        # if terminal state (out of track), then restart game
        if terminal:
            print "terminal state, restarting"
            self.control.setMeta(1)
            return self.control.toMsg()
        else:
            self.control.setMeta(0)

        # choose actions for wheel and speed
        if self.enable_exploration and random.random() < self.getEpsilon():
            #print "random move"
            steer = random.randrange(self.num_steers)
            #speed = random.randrange(self.num_speeds)
            speed = random.randint(2, self.num_speeds-1)
        elif self.algorithm == 'network':
            # use broadcasting to efficiently produce minibatch of desired size
            minibatch = state + np.zeros((self.minibatch_size, 1))
            Q = self.net.predict(minibatch)
            assert Q.shape == (self.minibatch_size, self.num_actions), "Q.shape: %s" % str(Q.shape)
            #print "steer Q: ", Q[0,:21]
            #print "speed Q:", Q[0,-5:]
            steer = np.argmax(Q[0, :self.num_steers])
            speed = np.argmax(Q[0, -self.num_speeds:])
            if self.show_qvalues:
                self.plotq.update(Q[0])
        elif self.algorithm == 'hardcoded':
            steer = self.getSteerAction(self.steer())
            speed = self.getSpeedActionAccel(self.speed())
        else:
            assert False, "Unknown algorithm"
        #print "steer:", steer, "speed:", speed

        # gears are always automatic
        gear = self.gear()

        # check for manual override 
        # might be partial, so we always need to choose algorithmic actions first
        events = self.wheel.getEvents()
        if self.mode == 'override' and self.wheel.supportsDrive():
            # wheel
            for event in events:
                if self.wheel.isWheelMotion(event):
                    self.wheelsteps = self.maxwheelsteps

            if self.wheelsteps > 0:
                wheel = self.wheel.getWheel()
                steer = self.getSteerAction(wheel)
                self.wheelsteps -= 1

            # gas pedal
            accel = self.wheel.getAccel()
            if accel > 0:
                speed = self.getSpeedActionAccel(accel)
            
            # brake pedal
            brake = self.wheel.getBrake()
            if brake > 0:
                speed = self.getSpeedActionBrake(brake)

        # check for wheel buttons always, not only in override mode
        for event in events:
            if self.wheel.isButtonDown(event, 2):
                self.algorithm = 'network'
                self.mode = 'override'
                self.wheel.generateForce(0)
                print "Switched to network algorithm"
            elif self.wheel.isButtonDown(event, 3):
                self.net.load_weights(self.pretrained_network)
                self.algorithm = 'network'
                self.mode = 'ff'
                self.enable_training = False
                print "Switched to pretrained network"
            elif self.wheel.isButtonDown(event, 4):
                self.enable_training = not self.enable_training
                print "Switched training", "ON" if self.enable_training else "OFF"
            elif self.wheel.isButtonDown(event, 5):
                self.algorithm = 'hardcoded'
                self.mode = 'ff'
                print "Switched to hardcoded algorithm"
            elif self.wheel.isButtonDown(event, 6):
                self.enable_exploration = not self.enable_exploration
                self.mode = 'override'
                self.wheel.generateForce(0)
                print "Switched exploration", "ON" if self.enable_exploration else "OFF"
            elif self.wheel.isButtonDown(event, 7):
                self.mode = 'ff' if self.mode == 'override' else 'override'
                if self.mode == 'override':
                    self.wheel.generateForce(0)
                print "Switched force feedback", "ON" if self.mode == 'ff' else "OFF"
            elif self.wheel.isButtonDown(event, 0) or self.wheel.isButtonDown(event, 8):
                gear = max(-1, gear - 1)
            elif self.wheel.isButtonDown(event, 1) or self.wheel.isButtonDown(event, 9):
                gear = min(6, gear + 1)

        # set actions
        self.setSteerAction(steer)
        self.setGearAction(gear)
        self.setSpeedAction(speed)

        # turn wheel using force feedback
        if self.mode == 'ff' and self.wheel.supportsForceFeedback():
            wheel = self.wheel.getWheel()
            self.wheel.generateForce(self.control.getSteer()-wheel)

        # remember state and actions 
        self.prev_state = state
        self.prev_steer = steer
        self.prev_speed = speed

        # training
        if self.enable_training and self.mem.count >= self.minibatch_size:
            minibatch = self.mem.getMinibatch()
            self.net.train(minibatch)
            self.total_train_steps += 1
            #print "total_train_steps:", self.total_train_steps

        #print "total_train_steps:", self.total_train_steps, "mem_count:", self.mem.count

        return self.control.toMsg()

    def setSteerAction(self, steer):
        self.control.setSteer(self.steers[steer])

    def setGearAction(self, gear):
        assert -1 <= gear <= 6
        self.control.setGear(gear)

    def setSpeedAction(self, speed):
        accel = self.speeds[speed]
        if accel >= 0:
            #print "accel", accel
            self.control.setAccel(accel)
            self.control.setBrake(0)
        else:
            #print "brake", -accel
            self.control.setAccel(0)
            self.control.setBrake(-accel)

    def getSteerAction(self, wheel):
        steer = np.argmin(np.abs(np.array(self.steers) - wheel))
        return steer

    def getSpeedActionAccel(self, accel):
        speed = np.argmin(np.abs(np.array(self.speeds) - accel))
        return speed

    def getSpeedActionBrake(self, brake):
        speed = np.argmin(np.abs(np.array(self.speeds) + brake))
        return speed

    def steer(self):
        angle = self.state.angle
        dist = self.state.trackPos
        
        steer = (angle - dist*0.5)/self.steer_lock
        return steer
    
    def gear(self):
        rpm = self.state.getRpm()
        gear = self.state.getGear()
        
        if self.prev_rpm is None:
            up = True
        else:
            if (self.prev_rpm - rpm) < 0:
                up = True
            else:
                up = False
        
        if up and rpm > 7000:
            gear += 1
        
        if not up and rpm < 3000:
            gear -= 1
        
        return gear

    def speed(self):
        speed = self.state.getSpeedX()
        accel = self.prev_accel
        
        if speed < self.max_speed:
            accel += 0.1
            if accel > 1:
                accel = 1.0
        else:
            accel -= 0.1
            if accel < 0:
                accel = 0.0
        
        self.prev_accel = accel
        return accel
        
    def onShutDown(self):
        pass
    
    def onRestart(self):
        if self.mode == 'ff':
            self.wheel.generateForce(0)
    
        self.prev_rpm = None
        self.prev_accel = 0
        self.prev_dist = None
        self.prev_state = None
        self.prev_steer = None
        self.prev_speed = None
        self.wheelsteps = 0

        if self.save_weights_prefix and self.episode > 0:
            self.net.save_weights(self.save_weights_prefix + "_" + str(self.episode) + ".pkl")

        self.episode += 1
        print "Episode", self.episode
Exemplo n.º 35
0
    epsilon_by_frame = lambda frame_idx: epsilon_final + (
        epsilon_start - epsilon_final) * math.exp(-1. * frame_idx /
                                                  epsilon_decay)
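    # epsilon_by_frame(0) equals epsilon_start and the value decays exponentially
    # towards epsilon_final with a time constant of epsilon_decay frames.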

    # Worker Process Queues
    output_queue = mp.Queue(maxsize=args.pop)
    params_queue = mp.Queue(maxsize=args.pop)
    elite_queue = mp.Queue(maxsize=int(2 * args.pop))

    # Agent
    agent = SAC(STATE_DIM, ACTION_DIM, args)
    sac_episodes = args.sac_episodes

    # Memory
    memory = ReplayMemory(args.replay_size)
    processes = []
    elite_list = []

    # Training Loop
    total_numsteps = 0
    updates = 0
    time_list = []
    max_rewards = []
    min_rewards = []
    avg_rewards = []
    noise_mut = []
    total_time = 0
    critic_loss = 0

    # Create and start the processes
Exemplo n.º 36
0
env = gym.make('MinitaurBulletEnv-v0')
torch.manual_seed(seed)
np.random.seed(seed)
env.seed(seed)
max_steps = env._max_episode_steps
print('max_steps: ', max_steps)

batch_size=128 ##  512
LEARNING_RATE=0.0001
start_steps=10000 ## Steps sampling random actions
replay_size=1000000 ## size of replay buffer

agent = soft_actor_critic_agent(env.observation_space.shape[0], env.action_space, \
        hidden_size=256, seed=seed, lr=LEARNING_RATE, gamma=0.99, tau=0.005, alpha=0.2)

memory = ReplayMemory(replay_size)

print('device: ', device)
print('learning rate: ', LEARNING_RATE)

state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]

print('state_dim: ',state_dim, ', action_dim: ', action_dim)

threshold = env.spec.reward_threshold
print('threshold: ', threshold)

scores, avg_scores, avg_numm_steps = sac_train(max_steps=max_steps)

reward_round = round(np.max(scores), 2)
Exemplo n.º 37
0
    def __init__(self, dimO, dimA):
        dimA, dimO = dimA[0], dimO[0]
        self.dimA = dimA
        self.dimO = dimO

        tau = FLAGS.tau
        discount = FLAGS.discount
        l2norm = FLAGS.l2norm
        learning_rate = FLAGS.rate
        outheta = FLAGS.outheta
        ousigma = FLAGS.ousigma

        if FLAGS.icnn_opt == 'adam':
            self.opt = self.adam
        elif FLAGS.icnn_opt == 'bundle_entropy':
            self.opt = self.bundle_entropy
        else:
            raise RuntimeError("Unrecognized ICNN optimizer: "+FLAGS.icnn_opt)

        self.rm = ReplayMemory(FLAGS.rmsize, dimO, dimA)
        self.sess = tf.Session(config=tf.ConfigProto(
            inter_op_parallelism_threads=FLAGS.thread,
            log_device_placement=False,
            allow_soft_placement=True,
            gpu_options=tf.GPUOptions(allow_growth=True)))

        self.noise = np.zeros(self.dimA)

        obs = tf.placeholder(tf.float32, [None, dimO], "obs")
        act = tf.placeholder(tf.float32, [None, dimA], "act")
        rew = tf.placeholder(tf.float32, [None], "rew")
        with tf.variable_scope('q'):
            negQ = self.negQ(obs, act)
        negQ_entr = negQ - entropy(act)
        q = -negQ
        q_entr = -negQ_entr
        act_grad, = tf.gradients(negQ, act)
        act_grad_entr, = tf.gradients(negQ_entr, act)

        obs_target = tf.placeholder(tf.float32, [None, dimO], "obs_target")
        act_target = tf.placeholder(tf.float32, [None, dimA], "act_target")
        term_target = tf.placeholder(tf.bool, [None], "term_target")
        with tf.variable_scope('q_target'):
            negQ_target = self.negQ(obs_target, act_target)
        negQ_entr_target = negQ_target - entropy(act_target)
        act_target_grad, = tf.gradients(negQ_target, act_target)
        act_entr_target_grad, = tf.gradients(negQ_entr_target, act_target)
        q_target = -negQ_target
        q_target_entr = -negQ_entr_target

        if FLAGS.icnn_opt == 'adam':
            y = tf.where(term_target, rew, rew + discount * q_target_entr)
            y = tf.maximum(q_entr - 1., y)
            y = tf.minimum(q_entr + 1., y)
            y = tf.stop_gradient(y)
            td_error = q_entr - y
        elif FLAGS.icnn_opt == 'bundle_entropy':
            q_target = tf.where(term_target, rew, rew + discount * q_target_entr)
            q_target = tf.maximum(q_entr - 1., q_target)
            q_target = tf.minimum(q_entr + 1., q_target)
            q_target = tf.stop_gradient(q_target)
            td_error = q_entr - q_target
        else:
            raise RuntimeError("Needs checking.")
        ms_td_error = tf.reduce_mean(tf.square(td_error), 0)

        regLosses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES, scope='q/')
        loss_q = ms_td_error + l2norm*tf.reduce_sum(regLosses)

        self.theta_ = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='q/')
        self.theta_cvx_ = [v for v in self.theta_
                           if 'proj' in v.name and 'W:' in v.name]
        self.makeCvx = [v.assign(tf.abs(v)) for v in self.theta_cvx_]
        self.proj = [v.assign(tf.maximum(v, 0)) for v in self.theta_cvx_]
        # self.proj = [v.assign(tf.abs(v)) for v in self.theta_cvx_]

        self.theta_target_ = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                               scope='q_target/')
        update_target = [theta_target_i.assign_sub(tau*(theta_target_i-theta_i))
                    for theta_i, theta_target_i in zip(self.theta_, self.theta_target_)]

        optim_q = tf.train.AdamOptimizer(learning_rate=learning_rate)
        grads_and_vars_q = optim_q.compute_gradients(loss_q)
        optimize_q = optim_q.apply_gradients(grads_and_vars_q)


        summary_writer = tf.summary.FileWriter(os.path.join(FLAGS.outdir, 'board'),
                                                self.sess.graph)
        if FLAGS.icnn_opt == 'adam':
            tf.summary.scalar('Qvalue', tf.reduce_mean(q))
        elif FLAGS.icnn_opt == 'bundle_entropy':
            tf.summary.scalar('Qvalue', tf.reduce_mean(q_entr))
        tf.summary.scalar('loss', ms_td_error)
        tf.summary.scalar('reward', tf.reduce_mean(rew))
        merged = tf.summary.merge_all()

        # tf functions
        with self.sess.as_default():
            self._train = Fun([obs, act, rew, obs_target, act_target, term_target],
                              [optimize_q, update_target, loss_q],
                              merged, summary_writer)
            self._fg = Fun([obs, act], [negQ, act_grad])
            self._fg_target = Fun([obs_target, act_target], [negQ_target, act_target_grad])
            self._fg_entr = Fun([obs, act], [negQ_entr, act_grad_entr])
            self._fg_entr_target = Fun([obs_target, act_target],
                                       [negQ_entr_target, act_entr_target_grad])

        # initialize tf variables
        self.saver = tf.train.Saver(max_to_keep=1)
        ckpt = tf.train.latest_checkpoint(FLAGS.outdir + "/tf")
        if ckpt:
            self.saver.restore(self.sess, ckpt)
        else:
            self.sess.run(tf.initialize_all_variables())
            self.sess.run(self.makeCvx)
            self.sess.run([theta_target_i.assign(theta_i)
                    for theta_i, theta_target_i in zip(self.theta_, self.theta_target_)])

        self.sess.graph.finalize()

        self.t = 0  # global training time (number of observations)
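
The `update_target` op above performs a Polyak-averaged ("soft") target-network update with step size `tau`. A minimal NumPy sketch of the same rule, kept separate from the TensorFlow graph (the helper name `soft_update` is illustrative, not from the source):

import numpy as np

def soft_update(target_params, online_params, tau=1e-3):
    # theta_target <- theta_target - tau * (theta_target - theta),
    # i.e. an exponential moving average that slowly tracks the online weights.
    for t, p in zip(target_params, online_params):
        t -= tau * (t - p)          # in place; same as t = (1 - tau) * t + tau * p

target = [np.zeros((2, 2)), np.ones(3)]
online = [np.ones((2, 2)), np.zeros(3)]
soft_update(target, online, tau=0.5)
print(target[0])                    # halfway between the old target and online weights
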
Example No. 38
0
class GaussianDQN(Agent):
    def __init__(self,
                 approximator,
                 policy,
                 mdp_info,
                 batch_size,
                 target_update_frequency,
                 initial_replay_size,
                 max_replay_size,
                 fit_params=None,
                 approximator_params=None,
                 clip_reward=True,
                 update_type='weighted',
                 delta=0.1,
                 store_prob=False,
                 q_max=100,
                 max_spread=None):
        self._fit_params = dict() if fit_params is None else fit_params

        self._batch_size = batch_size
        self._clip_reward = clip_reward
        self._target_update_frequency = target_update_frequency
        self.update_type = update_type
        self.delta = delta
        self.standard_bound = norm.ppf(1 - self.delta, loc=0, scale=1)
        self.store_prob = store_prob
        self.q_max = q_max
        self.max_spread = max_spread
        self._replay_memory = ReplayMemory(initial_replay_size,
                                           max_replay_size)

        self._n_updates = 0
        self._epsilon = 1e-7
        apprx_params_train = deepcopy(approximator_params)
        apprx_params_train['name'] = 'train'
        apprx_params_target = deepcopy(approximator_params)
        apprx_params_target['name'] = 'target'
        self.approximator = Regressor(approximator, **apprx_params_train)
        self.target_approximator = Regressor(approximator,
                                             **apprx_params_target)
        policy.set_q(self.approximator)

        self.target_approximator.model.set_weights(
            self.approximator.model.get_weights())

        super(GaussianDQN, self).__init__(policy, mdp_info)

    @staticmethod
    def _compute_prob_max(mean_list, sigma_list):
        n_actions = len(mean_list)
        lower_limit = mean_list - 8 * sigma_list
        upper_limit = mean_list + 8 * sigma_list
        epsilon = 1e2
        n_trapz = 100
        x = np.zeros(shape=(n_trapz, n_actions))
        y = np.zeros(shape=(n_trapz, n_actions))
        integrals = np.zeros(n_actions)
        for j in range(n_actions):
            if sigma_list[j] < epsilon:
                p = 1
                for k in range(n_actions):
                    if k != j:
                        p *= norm.cdf(mean_list[j],
                                      loc=mean_list[k],
                                      scale=sigma_list[k])
                integrals[j] = p
            else:
                x[:, j] = np.linspace(lower_limit[j], upper_limit[j], n_trapz)
                y[:, j] = norm.pdf(x[:, j],
                                   loc=mean_list[j],
                                   scale=sigma_list[j])
                for k in range(n_actions):
                    if k != j:
                        y[:, j] *= norm.cdf(x[:, j],
                                            loc=mean_list[k],
                                            scale=sigma_list[k])
                integrals[j] = (upper_limit[j] - lower_limit[j]) / (
                    2 * (n_trapz - 1)) * (y[0, j] + y[-1, j] +
                                          2 * np.sum(y[1:-1, j]))

        # print(np.sum(integrals))
        # assert np.isclose(np.sum(integrals), 1)
        with np.errstate(divide='raise'):
            try:
                return integrals / np.sum(integrals)
            except FloatingPointError:
                print(integrals)
                print(mean_list)
                print(sigma_list)
                input()

    def fit(self, dataset):
        mask = np.ones((len(dataset), 2))
        self._replay_memory.add(dataset, mask)
        if self._replay_memory.initialized:
            state, action, reward, next_state, absorbing, _, mask = \
                self._replay_memory.get(self._batch_size)

            if self._clip_reward:
                reward = np.clip(reward, -1, 1)

            q_next, sigma_next, prob_explore = self._next_q(
                next_state, absorbing)

            q = reward + self.mdp_info.gamma * q_next
            sigma = self.mdp_info.gamma * sigma_next
            stacked = np.stack([q, sigma])

            self.approximator.fit(state,
                                  action,
                                  stacked,
                                  prob_exploration=prob_explore,
                                  **self._fit_params)

            self._n_updates += 1

            if self._n_updates % self._target_update_frequency == 0:
                self._update_target()

    def _update_target(self):
        """
        Update the target network.

        """
        self.target_approximator.model.set_weights(
            self.approximator.model.get_weights())

    def _next_q(self, next_state, absorbing):
        """
        Args:
            next_state (np.ndarray): the states where next action has to be
                evaluated;
            absorbing (np.ndarray): the absorbing flag for the states in
                `next_state`.

        Returns:
            Maximum action-value for each state in `next_state`.

        """
        q_and_sigma = self.target_approximator.predict(next_state).squeeze()

        q = q_and_sigma[0, :, :]
        sigma = q_and_sigma[1, :, :]
        for i in range(q.shape[0]):
            if absorbing[i]:
                q[i] *= 0
                sigma[i] *= self._epsilon
        max_q = np.zeros((q.shape[0]))
        max_sigma = np.zeros((q.shape[0]))
        probs = []
        prob_explore = np.zeros(q.shape[0])
        for i in range(q.shape[0]):  # for each batch
            means = q[i, :]
            sigmas = sigma[i, :]
            prob = GaussianDQN._compute_prob_max(means, sigmas)
            probs.append(prob)
            prob_explore[i] = 1. - np.max(prob)

        if self.update_type == 'mean':
            best_actions = np.argmax(q, axis=1)
            for i in range(q.shape[0]):
                max_q[i] = q[i, best_actions[i]]
                max_sigma[i] = sigma[i, best_actions[i]]
        elif self.update_type == 'weighted':
            for i in range(q.shape[0]):  # for each batch
                means = q[i, :]
                sigmas = sigma[i, :]
                prob = probs[i]
                max_q[i] = np.sum(means * prob)
                max_sigma[i] = np.sum(sigmas * prob)
        elif self.update_type == 'optimistic':
            for i in range(q.shape[0]):  # for each batch
                means = q[i, :]
                sigmas = sigma[i, :]
                bounds = sigmas * self.standard_bound + means
                bounds = np.clip(bounds, -self.q_max, self.q_max)
                next_index = np.random.choice(
                    np.argwhere(bounds == np.max(bounds)).ravel())
                max_q[i] = q[i, next_index]
                max_sigma[i] = sigma[i, next_index]
        else:
            raise ValueError("Update type not implemented")

        return max_q, max_sigma, np.mean(prob_explore)

    def draw_action(self, state):
        action = super(GaussianDQN, self).draw_action(np.array(state))

        return action

    def episode_start(self):
        return
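
`_compute_prob_max` above approximates, for each action, the probability that its Gaussian action-value is the largest, by trapezoidal integration of pdf_j(x) * prod_{k != j} cdf_k(x). A hedged Monte Carlo cross-check of that quantity, and of the 'weighted' backup built from it, with made-up means and sigmas (not part of the agent):

import numpy as np

def prob_max_mc(means, sigmas, n_samples=200000, seed=0):
    # Estimate P(action j attains the maximum) for independent Gaussians by sampling.
    rng = np.random.default_rng(seed)
    samples = rng.normal(loc=means, scale=sigmas, size=(n_samples, len(means)))
    counts = np.bincount(np.argmax(samples, axis=1), minlength=len(means))
    return counts / float(n_samples)

means = np.array([1.0, 1.2, 0.5])     # illustrative values
sigmas = np.array([0.4, 0.3, 0.1])
p = prob_max_mc(means, sigmas)
print(p, np.sum(means * p))           # probabilities and the 'weighted' target for q
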
Example No. 39
0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--num-envs', type=int, default=1)
    parser.add_argument('--t-max', type=int, default=1)
    parser.add_argument('--learning-rate', type=float, default=0.0002)
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--steps-per-epoch', type=int, default=100000)
    parser.add_argument('--testing', type=int, default=0)
    parser.add_argument('--continue-training', type=int, default=0)
    parser.add_argument('--epoch-num', type=int, default=40)
    parser.add_argument('--start-epoch', type=int, default=20)
    parser.add_argument('--testing-epoch', type=int, default=3)
    parser.add_argument('--save-log', type=str, default='basic/log')
    parser.add_argument('--signal-num', type=int, default=4)
    parser.add_argument('--toxin', type=int, default=0)
    parser.add_argument('--a1-AC-folder', type=str, default='basic/a1_Qnet')
    parser.add_argument('--eps-start', type=float, default=1.0)
    parser.add_argument('--replay-start-size', type=int, default=50000)
    parser.add_argument('--decay-rate', type=int, default=500000)
    parser.add_argument('--replay-memory-size', type=int, default=1000000)
    parser.add_argument('--eps-min', type=float, default=0.05)

    rewards = {
        "positive": 1.0,
        "negative": -1.0,
        "tick": -0.002,
        "loss": -2.0,
        "win": 2.0
    }

    args = parser.parse_args()
    config = Config(args)
    q_ctx = config.ctx
    steps_per_epoch = args.steps_per_epoch
    np.random.seed(args.seed)
    start_epoch = args.start_epoch
    testing_epoch = args.testing_epoch
    save_log = args.save_log
    epoch_num = args.epoch_num
    epoch_range = range(epoch_num)
    toxin = args.toxin
    a1_Qnet_folder = args.a1_AC_folder

    freeze_interval = 10000
    update_interval = 5
    replay_memory_size = args.replay_memory_size
    discount = 0.99
    replay_start_size = args.replay_start_size
    history_length = 1
    eps_start = args.eps_start
    eps_min = args.eps_min
    eps_decay = (eps_start - eps_min) / args.decay_rate
    eps_curr = eps_start
    freeze_interval /= update_interval
    minibatch_size = 32

    testing = args.testing == 1
    continue_training = args.continue_training == 1

    game = HunterWorld(width=256,
                       height=256,
                       num_preys=10,
                       draw=False,
                       num_hunters=2,
                       num_toxins=toxin)

    env = PLE(game,
              fps=30,
              force_fps=True,
              display_screen=False,
              reward_values=rewards,
              resized_rows=80,
              resized_cols=80,
              num_steps=2)

    replay_memory = ReplayMemory(state_dim=(148, ),
                                 action_dim=(2, ),
                                 history_length=history_length,
                                 memory_size=replay_memory_size,
                                 replay_start_size=replay_start_size,
                                 state_dtype='float32')

    action_set = env.get_action_set()
    action_map1 = []
    for action in action_set[0].values():
        action_map1.append(action)

    action_map2 = []
    for action in action_set[1].values():
        action_map2.append(action)
    action_num = len(action_map1)

    target1 = Qnetwork(actions_num=8,
                       q_ctx=q_ctx,
                       isTrain=False,
                       batch_size=1,
                       dir=dir,
                       folder=a1_Qnet_folder)
    target32 = Qnetwork(actions_num=8,
                        q_ctx=q_ctx,
                        isTrain=False,
                        batch_size=32,
                        dir=dir,
                        folder=a1_Qnet_folder)
    Qnet = Qnetwork(actions_num=8,
                    q_ctx=q_ctx,
                    isTrain=True,
                    batch_size=32,
                    dir=dir,
                    folder=a1_Qnet_folder)

    if testing:
        env.force_fps = False
        env.game.draw = True
        env.display_screen = True
        Qnet.load_params(testing_epoch)
    elif continue_training:
        epoch_range = range(start_epoch, epoch_num + start_epoch)
        Qnet.load_params(start_epoch - 1)
        logging_config(logging, dir, save_log, file_name)
    else:
        logging_config(logging, dir, save_log, file_name)

    copyTargetQNetwork(Qnet.model, target1.model)
    copyTargetQNetwork(Qnet.model, target32.model)

    logging.info('args=%s' % args)
    logging.info('config=%s' % config.__dict__)
    print_params(logging, Qnet.model)

    training_steps = 0
    total_steps = 0
    for epoch in epoch_range:
        steps_left = steps_per_epoch
        episode = 0
        epoch_reward = 0
        start = time.time()
        env.reset_game()
        while steps_left > 0:
            episode += 1
            episode_loss = 0.0
            episode_q_value = 0.0
            episode_update_step = 0
            episode_action_step = 0
            episode_reward = 0
            episode_step = 0
            collisions = 0.0
            time_episode_start = time.time()
            env.reset_game()
            while not env.game_over():
                if replay_memory.size >= history_length and replay_memory.size > replay_start_size:
                    do_exploration = (np.random.rand() < eps_curr)
                    eps_curr = max(eps_curr - eps_decay, eps_min)
                    if do_exploration:
                        action1 = np.random.randint(action_num)
                        action2 = np.random.randint(action_num)
                    else:
                        current_state = replay_memory.latest_slice()
                        state = nd.array(
                            current_state.reshape((1, ) + current_state.shape),
                            ctx=q_ctx)
                        target1.model.forward(mx.io.DataBatch([state], []))
                        q_value = target1.model.get_outputs()[0].asnumpy()[0]
                        action1 = np.argmax(q_value[0:4])
                        action2 = np.argmax(q_value[4:8])
                        episode_q_value += q_value[action1]
                        episode_q_value += q_value[action2 + 4]
                        episode_action_step += 1
                else:
                    action1 = np.random.randint(action_num)
                    action2 = np.random.randint(action_num)

                next_ob, reward, terminal_flag = env.act(
                    [action_map1[action1], action_map2[action2]])

                reward = np.sum(reward)
                replay_memory.append(
                    np.array(next_ob).flatten(), [action1, action2], reward,
                    terminal_flag)

                total_steps += 1
                episode_reward += reward
                if reward < 0:
                    collisions += 1
                episode_step += 1

                if total_steps % update_interval == 0 and replay_memory.size > replay_start_size:
                    training_steps += 1

                    state_batch, actions, rewards, nextstate_batch, terminate_flags = replay_memory.sample(
                        batch_size=minibatch_size)
                    state_batch = nd.array(state_batch, ctx=q_ctx)
                    actions_batch1 = nd.array(actions[:, 0], ctx=q_ctx)
                    actions_batch2 = nd.array(actions[:, 1], ctx=q_ctx)

                    target32.model.forward(
                        mx.io.DataBatch([nd.array(nextstate_batch, ctx=q_ctx)],
                                        []))
                    Qvalue = target32.model.get_outputs()[0].asnumpy()

                    y_batch1 = rewards + np.max(Qvalue[:, 0:4], axis=1) * (
                        1.0 - terminate_flags) * discount
                    y_batch2 = rewards + np.max(Qvalue[:, 4:8], axis=1) * (
                        1.0 - terminate_flags) * discount

                    y_batch1 = nd.array(y_batch1, ctx=q_ctx)
                    y_batch2 = nd.array(y_batch2, ctx=q_ctx)

                    Qnet.model.forward(mx.io.DataBatch([
                        state_batch, actions_batch1, y_batch1,
                        actions_batch2 + 4, y_batch2
                    ], []),
                                       is_train=True)
                    Qnet.model.backward()
                    Qnet.model.update()

                    if training_steps % 10 == 0:
                        out = Qnet.model.get_outputs()
                        loss1 = 0.5 * nd.square(
                            nd.choose_element_0index(out[0], actions_batch1) -
                            y_batch1)
                        loss2 = 0.5 * nd.square(
                            nd.choose_element_0index(out[1], actions_batch2) -
                            y_batch2)
                        episode_loss += nd.sum(loss1).asnumpy()
                        episode_loss += nd.sum(loss2).asnumpy()
                        episode_update_step += 1

                    if training_steps % freeze_interval == 0:
                        copyTargetQNetwork(Qnet.model, target1.model)
                        copyTargetQNetwork(Qnet.model, target32.model)

            steps_left -= episode_step
            time_episode_end = time.time()
            epoch_reward += episode_reward
            info_str = "Epoch:%d, Episode:%d, Steps Left:%d/%d/%d, Reward:%f, fps:%f, Exploration:%f" \
                       % (epoch, episode, steps_left, episode_step, steps_per_epoch, episode_reward,
                          episode_step / (time_episode_end - time_episode_start), eps_curr)

            info_str += ", Collision:%f/%d " % (collisions / episode_step,
                                                collisions)

            if episode_update_step > 0:
                info_str += ", Avg Loss:%f/%d" % (
                    episode_loss / episode_update_step, episode_update_step)
            if episode_action_step > 0:
                info_str += ", Avg Q Value:%f/%d " % (
                    episode_q_value / episode_action_step, episode_action_step)

            if episode % 1 == 0:
                print info_str
                logging.info(info_str)

        end = time.time()
        fps = steps_per_epoch / (end - start)
        Qnet.save_params(epoch)
        print "Epoch:%d, FPS:%f, Avg Reward: %f/%d" % (
            epoch, fps, epoch_reward / float(episode), episode)
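
The update above splits the network's eight outputs into two four-action heads, one per hunter, and bootstraps each head with a standard DQN target from the frozen `target32` network. An illustrative NumPy version of that target with made-up numbers:

import numpy as np

gamma = 0.99
rewards = np.array([0.0, 1.0])
terminal = np.array([0.0, 1.0])
q_next = np.array([[0.1, 0.5, 0.2, 0.0, 0.3, 0.4, 0.1, 0.2],
                   [0.6, 0.1, 0.0, 0.2, 0.5, 0.0, 0.3, 0.1]])   # target-net outputs on s'

y1 = rewards + np.max(q_next[:, 0:4], axis=1) * (1.0 - terminal) * gamma   # first head
y2 = rewards + np.max(q_next[:, 4:8], axis=1) * (1.0 - terminal) * gamma   # second head
print(y1, y2)   # -> [0.495 1.] and [0.396 1.]
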
Example No. 40
0
def train(args, net, env):
    # Begin tf session
    with tf.Session() as sess:
        # Initialize variables
        tf.global_variables_initializer().run()
        saver = tf.train.Saver(tf.global_variables(), max_to_keep=5)

        # load from previous save
        if len(args.ckpt_name) > 0:
            saver.restore(sess, os.path.join(args.save_dir, args.ckpt_name))

        # Load data
        shift = sess.run(net.shift)
        scale = sess.run(net.scale)
        shift_u = sess.run(net.shift_u)
        scale_u = sess.run(net.scale_u)

        replay_memory = ReplayMemory(args, shift, scale, shift_u, scale_u, env,
                                     net, sess)

        # Store normalization parameters
        sess.run(tf.assign(net.shift, replay_memory.shift_x))
        sess.run(tf.assign(net.scale, replay_memory.scale_x))
        sess.run(tf.assign(net.shift_u, replay_memory.shift_u))
        sess.run(tf.assign(net.scale_u, replay_memory.scale_u))

        #Function to evaluate loss on validation set
        def val_loss(kl_weight):
            replay_memory.reset_batchptr_val()
            loss = 0.0
            for b in range(replay_memory.n_batches_val):
                # Get inputs
                batch_dict = replay_memory.next_batch_val()
                x = batch_dict["states"]
                u = batch_dict['inputs']

                # Construct inputs for network
                feed_in = {}
                feed_in[net.x] = np.reshape(
                    x, (2 * args.batch_size * args.seq_length, args.state_dim))
                feed_in[net.u] = u
                if args.kl_weight > 0.0:
                    feed_in[net.kl_weight] = kl_weight
                else:
                    feed_in[net.kl_weight] = 1.0

                # Find loss
                feed_out = net.cost
                cost = sess.run(feed_out, feed_in)
                loss += cost

            return loss / replay_memory.n_batches_val

        # Initialize variable to track validation score over time
        old_score = 1e9
        count_decay = 0
        decay_epochs = []

        # Define temperature for annealing kl_weight
        T = args.anneal_time * replay_memory.n_batches_train
        count = 0

        # Loop over epochs
        for e in range(args.num_epochs):
            visualize_predictions(args, sess, net, replay_memory, env, e)

            # Initialize loss
            loss = 0.0
            rec_loss = 0.0
            kl_loss = 0.0
            loss_count = 0
            replay_memory.reset_batchptr_train()

            # Loop over batches
            for b in range(replay_memory.n_batches_train):
                start = time.time()
                count += 1

                # Update kl_weight
                if e < args.start_kl:
                    kl_weight = 1e-3
                else:
                    count += 1
                    kl_weight = min(args.kl_weight,
                                    1e-3 + args.kl_weight * count / float(T))

                # Get inputs
                batch_dict = replay_memory.next_batch_train()
                x = batch_dict["states"]
                u = batch_dict['inputs']

                # Construct inputs for network
                feed_in = {}
                feed_in[net.x] = np.reshape(
                    x, (2 * args.batch_size * args.seq_length, args.state_dim))
                feed_in[net.u] = u
                feed_in[net.kl_weight] = kl_weight

                # Find loss and perform training operation
                feed_out = [
                    net.cost, net.loss_reconstruction, net.kl_loss, net.train
                ]
                out = sess.run(feed_out, feed_in)

                # Update and display cumulative losses
                loss += out[0]
                rec_loss += out[1]
                kl_loss += out[2]
                loss_count += 1

                end = time.time()

                # Print loss
                if (e * replay_memory.n_batches_train +
                        b) % 100 == 0 and b > 0:
                    print("{}/{} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}" \
                      .format(e * replay_memory.n_batches_train + b, args.num_epochs * replay_memory.n_batches_train,
                              e, loss/loss_count, end - start))
                    print("{}/{} (epoch {}), rec_loss = {:.3f}, time/batch = {:.3f}" \
                      .format(e * replay_memory.n_batches_train + b, args.num_epochs * replay_memory.n_batches_train,
                              e, rec_loss/loss_count, end - start))
                    print("{}/{} (epoch {}), kl_loss = {:.3f}, time/batch = {:.3f}" \
                      .format(e * replay_memory.n_batches_train + b, args.num_epochs * replay_memory.n_batches_train,
                              e, kl_loss/loss_count, end - start))

                    print('')
                    loss = 0.0
                    rec_loss = 0.0
                    kl_loss = 0.0
                    loss_count = 0

            # Evaluate loss on validation set
            score = val_loss(args.kl_weight * (e >= args.start_kl))
            print('Validation Loss: {0:f}'.format(score))

            # Set learning rate
            if (old_score - score) < 0.01 and e != args.start_kl:
                count_decay += 1
                decay_epochs.append(e)
                if len(decay_epochs) >= 3 and np.sum(
                        np.diff(decay_epochs)[-2:]) == 2:
                    break
                print('setting learning rate to ',
                      args.learning_rate * (args.decay_rate**count_decay))
                sess.run(
                    tf.assign(
                        net.learning_rate,
                        args.learning_rate * (args.decay_rate**count_decay)))
                if args.learning_rate * (args.decay_rate**count_decay) < 1e-5:
                    break
            print('learning rate is set to ',
                  args.learning_rate * (args.decay_rate**count_decay))
            old_score = score

            # Save model every epoch
            checkpoint_path = os.path.join(args.save_dir,
                                           args.save_name + '.ckpt')
            saver.save(sess, checkpoint_path, global_step=e)
            print("model saved to {}".format(checkpoint_path))
Example No. 41
0
    def _initialize(self, game, network_args=None, actions=None,
                    history_length=4,
                    batchsize=64,
                    update_pattern=(1, 1),
                    replay_memory_size=10000,
                    backprop_start_step=10000, start_epsilon=1.0,
                    end_epsilon=0.1,
                    epsilon_decay_start_step=50000,
                    epsilon_decay_steps=100000,
                    reward_scale=1.0,
                    use_game_variables=True,
                    misc_scale=None,
                    reshaped_x=None,
                    reshaped_y=None,
                    skiprate=4,
                    shaping_on=False,
                    count_states=False,
                    name=None,
                    net_type="cnn", melt_steps=10000, remember_n_actions=0):

        if network_args is None:
            network_args = dict()
        if count_states is not None:
            self._count_states = bool(count_states)

        self.name = name
        self._reward_scale = reward_scale
        self._game = game
        self._batchsize = batchsize
        self._history_length = max(history_length, 1)
        self._update_pattern = update_pattern
        self._epsilon = max(min(start_epsilon, 1.0), 0.0)
        self._end_epsilon = min(max(end_epsilon, 0.0), self._epsilon)
        self._epsilon_decay_steps = epsilon_decay_steps
        self._epsilon_decay_stride = (self._epsilon - end_epsilon) / epsilon_decay_steps
        self._epsilon_decay_start = epsilon_decay_start_step
        self._skiprate = max(skiprate, 0)
        self._shaping_on = shaping_on
        self._steps = 0
        self._melt_steps = melt_steps
        self._backprop_start_step = max(backprop_start_step, batchsize)
        self._use_game_variables = use_game_variables
        self._last_action_index = 0

        if self._shaping_on:
            self._last_shaping_reward = 0

        self.learning_mode = True

        if actions is None:
            self._actions = generate_default_actions(game)
        else:
            self._actions = actions

        self._actions_num = len(self._actions)
        self._actions_stats = np.zeros([self._actions_num], np.int)

        # changes img_shape according to the history size
        self._channels = game.get_screen_channels()
        if self._history_length > 1:
            self._channels *= self._history_length

        if reshaped_x is None:
            x = game.get_screen_width()
            y = game.get_screen_height()
            scale_x = scale_y = 1.0
        else:
            x = reshaped_x
            scale_x = float(x) / game.get_screen_width()

            if reshaped_y is None:
                y = int(game.get_screen_height() * scale_x)
                scale_y = scale_x
            else:
                y = reshaped_y
                scale_y = float(y) / game.get_screen_height()

        img_shape = [self._channels, y, x]

        # TODO check if it is slow (it seems that no)
        if scale_x == 1 and scale_y == 1:
            def convert(img):
                img = img.astype(np.float32) / 255.0
                return img
        else:
            def convert(img):
                img = img.astype(np.float32) / 255.0
                new_image = np.ndarray([img.shape[0], y, x], dtype=img.dtype)
                for i in xrange(img.shape[0]):
                    # new_image[i] = skimage.transform.resize(img[i], (y,x), preserve_range=True)
                    new_image[i] = cv2.resize(img[i], (x, y), interpolation=cv2.INTER_AREA)
                return new_image
        self._convert_image = convert

        if self._use_game_variables:
            single_state_misc_len = game.get_available_game_variables_size() + int(self._count_states)
        else:
            single_state_misc_len = int(self._count_states)
        self._single_state_misc_len = single_state_misc_len

        self._remember_n_actions = remember_n_actions
        if remember_n_actions > 0:
            self._remember_n_actions = remember_n_actions
            self._action_len = len(self._actions[0])
            self._last_n_actions = np.zeros([remember_n_actions * self._action_len], dtype=np.float32)
            self._total_misc_len = single_state_misc_len * self._history_length + len(self._last_n_actions)
            self._last_action_index = 0
        else:
            self._total_misc_len = single_state_misc_len * self._history_length

        if self._total_misc_len > 0:
            self._misc_state_included = True
            self._current_misc_state = np.zeros(self._total_misc_len, dtype=np.float32)
            if single_state_misc_len > 0:
                self._state_misc_buffer = np.zeros(single_state_misc_len, dtype=np.float32)
                if misc_scale is not None:
                    self._misc_scale = np.array(misc_scale, dtype=np.float32)
                else:
                    self._misc_scale = None
        else:
            self._misc_state_included = False

        state_format = dict()
        state_format["s_img"] = img_shape
        state_format["s_misc"] = self._total_misc_len
        self._transitions = ReplayMemory(state_format, replay_memory_size, batchsize)

        network_args["state_format"] = state_format
        network_args["actions_number"] = len(self._actions)

        if net_type in ("dqn", None, ""):
            self._evaluator = DQN(**network_args)
        elif net_type == "duelling":
            self._evaluator = DuellingDQN(**network_args)
        else:
            print "Unsupported evaluator type."
            exit(1)
            # TODO: throw an exception instead?

        self._current_image_state = np.zeros(img_shape, dtype=np.float32)
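
The epsilon fields above define a linear exploration schedule: epsilon stays at `start_epsilon` for `epsilon_decay_start_step` steps, then drops by `_epsilon_decay_stride` per step until it reaches `end_epsilon`. The decay itself runs elsewhere in the agent; a hedged sketch of the schedule these fields imply (illustrative helper, default values taken from the signature above):

def epsilon_at(step, start_eps=1.0, end_eps=0.1,
               decay_start=50000, decay_steps=100000):
    stride = (start_eps - end_eps) / float(decay_steps)
    return max(end_eps, start_eps - max(0, step - decay_start) * stride)

print(epsilon_at(0), epsilon_at(100000), epsilon_at(200000))   # -> 1.0, ~0.55, 0.1
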
Example No. 42
0
    "input_shape": meta_controller_input_shape
}
controller_hparams = {
    "learning_rate": learning_rate,
    "epsilon": 1,
    "action_dim": env.action_space.n,
    "input_shape": controller_input_shape
}

controller = Controller(sess, controller_hparams)
meta_controller = MetaController(sess, meta_controller_hparams)
'''
Initialize the replay buffers
'''
d1 = ReplayMemory(name="controller",
                  buffer_capacity=256,
                  storage_capacity=4096,
                  obs_shape=controller_input_shape)
d2 = ReplayMemory(name="metacontroller",
                  buffer_capacity=256,
                  storage_capacity=4096,
                  obs_shape=meta_controller_input_shape)

#Storing performance
performanceDf = pd.DataFrame(
    columns=["episode", "intrinsic_reward", "goal_x", "goal_y", "training"])
if not os.path.exists("results"):
    os.makedirs("results")
'''
Pre-training step. Iterate over subgoals randomly and train controller to achieve subgoals
'''
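
For reference, a hypothetical way the `performanceDf` frame above could be filled per episode and persisted into the `results` directory (the row values and the file name are made up for illustration):

import os
import pandas as pd

performanceDf = pd.DataFrame(
    columns=["episode", "intrinsic_reward", "goal_x", "goal_y", "training"])
performanceDf.loc[len(performanceDf)] = [0, 1.0, 3, 5, True]    # one illustrative row
os.makedirs("results", exist_ok=True)
performanceDf.to_csv(os.path.join("results", "performance.csv"), index=False)
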
Example No. 43
0
class DQLearner(interfaces.LearningAgent):
    def __init__(self,
                 dqn,
                 num_actions,
                 gamma=0.99,
                 learning_rate=0.00025,
                 replay_start_size=50000,
                 epsilon_start=1.0,
                 epsilon_end=0.01,
                 epsilon_steps=1000000,
                 update_freq=4,
                 target_copy_freq=30000,
                 replay_memory_size=1000000,
                 frame_history=4,
                 batch_size=32,
                 error_clip=1,
                 restore_network_file=None,
                 double=True):
        self.dqn = dqn
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=config)
        self.inp_actions = tf.placeholder(tf.float32, [None, num_actions])
        inp_shape = [None] + list(self.dqn.get_input_shape()) + [frame_history]
        inp_dtype = self.dqn.get_input_dtype()
        assert type(inp_dtype) is str
        self.inp_frames = tf.placeholder(inp_dtype, inp_shape)
        self.inp_sp_frames = tf.placeholder(inp_dtype, inp_shape)
        self.inp_terminated = tf.placeholder(tf.bool, [None])
        self.inp_reward = tf.placeholder(tf.float32, [None])
        self.inp_mask = tf.placeholder(inp_dtype, [None, frame_history])
        self.inp_sp_mask = tf.placeholder(inp_dtype, [None, frame_history])
        self.gamma = gamma
        with tf.variable_scope('online'):
            mask_shape = [-1] + [1] * len(self.dqn.get_input_shape()) + [
                frame_history
            ]
            mask = tf.reshape(self.inp_mask, mask_shape)
            masked_input = self.inp_frames * mask
            self.q_online = self.dqn.construct_q_network(masked_input)
        with tf.variable_scope('target'):
            mask_shape = [-1] + [1] * len(self.dqn.get_input_shape()) + [
                frame_history
            ]
            sp_mask = tf.reshape(self.inp_sp_mask, mask_shape)
            masked_sp_input = self.inp_sp_frames * sp_mask
            self.q_target = self.dqn.construct_q_network(masked_sp_input)

        if double:
            with tf.variable_scope('online', reuse=True):
                self.q_online_prime = self.dqn.construct_q_network(
                    masked_sp_input)
            self.maxQ = tf.gather_nd(
                self.q_target,
                tf.transpose([
                    tf.range(0, 32, dtype=tf.int32),
                    tf.cast(tf.argmax(self.q_online_prime, axis=1), tf.int32)
                ], [1, 0]))
        else:
            self.maxQ = tf.reduce_max(self.q_target, reduction_indices=1)

        self.r = tf.sign(self.inp_reward)
        use_backup = tf.cast(tf.logical_not(self.inp_terminated),
                             dtype=tf.float32)
        self.y = self.r + use_backup * gamma * self.maxQ
        self.delta = tf.reduce_sum(self.inp_actions * self.q_online,
                                   reduction_indices=1) - self.y
        self.error = tf.where(
            tf.abs(self.delta) < error_clip, 0.5 * tf.square(self.delta),
            error_clip * tf.abs(self.delta))
        self.loss = tf.reduce_sum(self.error)
        self.g = tf.gradients(self.loss, self.q_online)
        optimizer = tf.train.RMSPropOptimizer(learning_rate=learning_rate,
                                              decay=0.95,
                                              centered=True,
                                              epsilon=0.01)
        self.train_op = optimizer.minimize(self.loss,
                                           var_list=th.get_vars('online'))
        self.copy_op = th.make_copy_op('online', 'target')
        self.saver = tf.train.Saver(var_list=th.get_vars('online'))

        self.replay_buffer = ReplayMemory(self.dqn.get_input_shape(),
                                          self.dqn.get_input_dtype(),
                                          replay_memory_size, frame_history)
        self.frame_history = frame_history
        self.replay_start_size = replay_start_size
        self.epsilon = epsilon_start
        self.epsilon_min = epsilon_end
        self.epsilon_steps = epsilon_steps
        self.epsilon_delta = (self.epsilon -
                              self.epsilon_min) / self.epsilon_steps
        self.update_freq = update_freq
        self.target_copy_freq = target_copy_freq
        self.action_ticker = 1

        self.num_actions = num_actions
        self.batch_size = batch_size

        self.sess.run(tf.initialize_all_variables())

        if restore_network_file is not None:
            self.saver.restore(self.sess, restore_network_file)
            print('Restored network from file')
        self.sess.run(self.copy_op)

    def update_q_values(self):
        S1, A, R, S2, T, M1, M2 = self.replay_buffer.sample(self.batch_size)
        Aonehot = np.zeros((self.batch_size, self.num_actions),
                           dtype=np.float32)
        Aonehot[list(range(len(A))), A] = 1

        [_, loss, q_online, maxQ, q_target, r, y, error, delta,
         g] = self.sess.run(
             [
                 self.train_op, self.loss, self.q_online, self.maxQ,
                 self.q_target, self.r, self.y, self.error, self.delta, self.g
             ],
             feed_dict={
                 self.inp_frames: S1,
                 self.inp_actions: Aonehot,
                 self.inp_sp_frames: S2,
                 self.inp_reward: R,
                 self.inp_terminated: T,
                 self.inp_mask: M1,
                 self.inp_sp_mask: M2
             })
        return loss

    def run_learning_episode(self, environment, max_episode_steps=100000):
        episode_steps = 0
        total_reward = 0
        for steps in range(max_episode_steps):
            if environment.is_current_state_terminal():
                break

            state = environment.get_current_state()
            if np.random.uniform(0, 1) < self.epsilon:
                action = np.random.choice(
                    environment.get_actions_for_state(state))
            else:
                action = self.get_action(state)

            if self.replay_buffer.size() > self.replay_start_size:
                self.epsilon = max(self.epsilon_min,
                                   self.epsilon - self.epsilon_delta)

            state, action, reward, next_state, is_terminal = environment.perform_action(
                action)
            total_reward += reward
            self.replay_buffer.append(state[-1], action, reward,
                                      next_state[-1], is_terminal)
            if (self.replay_buffer.size() > self.replay_start_size) and (
                    self.action_ticker % self.update_freq == 0):
                loss = self.update_q_values()
            if (self.action_ticker -
                    self.replay_start_size) % self.target_copy_freq == 0:
                self.sess.run(self.copy_op)
            self.action_ticker += 1
            episode_steps += 1
        return episode_steps, total_reward

    def get_action(self, state):
        size = list(np.array(list(range(len(self.dqn.get_input_shape())))) + 1)
        state_input = np.transpose(state, size + [0])

        [q_values] = self.sess.run(
            [self.q_online],
            feed_dict={
                self.inp_frames: [state_input],
                self.inp_mask: np.ones((1, self.frame_history),
                                       dtype=np.float32)
            })
        return np.argmax(q_values[0])

    def save_network(self, file_name):
        self.saver.save(self.sess, file_name)
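
When `double=True`, the graph above selects the next action with the online network, evaluates it with the target network, and clips rewards with `tf.sign`. The same backup written in plain NumPy with made-up values:

import numpy as np

gamma = 0.99
rewards = np.array([1.0, 0.0])
terminated = np.array([False, False])
q_online_next = np.array([[0.2, 0.9], [0.5, 0.1]])    # online net on s'
q_target_next = np.array([[0.3, 0.7], [0.6, 0.2]])    # target net on s'

a_star = np.argmax(q_online_next, axis=1)              # select with the online network
max_q = q_target_next[np.arange(len(a_star)), a_star]  # evaluate with the target network
y = np.sign(rewards) + (~terminated) * gamma * max_q
print(y)   # [1.693 0.594]
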
Example No. 45
0
    def __init__(self, dimO, dimA):
        dimA = list(dimA)
        dimO = list(dimO)

        nets = nets_dm

        # init replay memory
        self.rm = ReplayMemory(rm_size, dimO, dimA, dtype=np.__dict__[rm_dtype])

        # own replay memory
        self.replay_memory = deque(maxlen=rm_size)

        # start tf session
        self.sess = tf.Session(config=tf.ConfigProto(
            inter_op_parallelism_threads=threads,
            log_device_placement=False,
            allow_soft_placement=True))

        # create tf computational graph
        #
        self.theta_p = nets.theta_p(dimO, dimA)
        self.theta_q = nets.theta_q(dimO, dimA)
        self.theta_pt, update_pt = exponential_moving_averages(self.theta_p, tau)
        self.theta_qt, update_qt = exponential_moving_averages(self.theta_q, tau)

        obs = tf.placeholder(tf.float32, [None] + dimO, "obs")
        act_test, sum_p = nets.policy(obs, self.theta_p)

        # explore
        noise_init = tf.zeros([1] + dimA)
        noise_var = tf.Variable(noise_init)
        self.ou_reset = noise_var.assign(noise_init)
        noise = noise_var.assign_sub((ou_theta) * noise_var - tf.random_normal(dimA, stddev=ou_sigma))
        act_expl = act_test + noise

        # test
        q, sum_q = nets.qfunction(obs, act_test, self.theta_q, name= 'q_mu_of_s')
        # training
        # policy loss
        meanq = tf.reduce_mean(q, 0)
        wd_p = tf.add_n([pl2 * tf.nn.l2_loss(var) for var in self.theta_p])  # weight decay
        loss_p = -meanq + wd_p
        # policy optimization
        optim_p = tf.train.AdamOptimizer(learning_rate=lrp)
        grads_and_vars_p = optim_p.compute_gradients(loss_p, var_list=self.theta_p)
        optimize_p = optim_p.apply_gradients(grads_and_vars_p)
        with tf.control_dependencies([optimize_p]):
            train_p = tf.group(update_pt)

        # q optimization
        act_train = tf.placeholder(tf.float32, [FLAGS.bsize] + dimA, "act_train")
        rew = tf.placeholder(tf.float32, [FLAGS.bsize], "rew")
        obs2 = tf.placeholder(tf.float32, [FLAGS.bsize] + dimO, "obs2")
        term2 = tf.placeholder(tf.bool, [FLAGS.bsize], "term2")
        # q
        q_train, sum_qq = nets.qfunction(obs, act_train, self.theta_q, name= 'qs_a')
        # q targets
        act2, sum_p2 = nets.policy(obs2, theta=self.theta_pt)
        q2, sum_q2 = nets.qfunction(obs2, act2, theta=self.theta_qt, name='qsprime_aprime')
        q_target = tf.stop_gradient(tf.select(term2, rew, rew + discount * q2))
        # q_target = tf.stop_gradient(rew + discount * q2)
        # q loss
        td_error = q_train - q_target
        ms_td_error = tf.reduce_mean(tf.square(td_error), 0)
        wd_q = tf.add_n([ql2 * tf.nn.l2_loss(var) for var in self.theta_q])  # weight decay
        loss_q = ms_td_error + wd_q
        # q optimization
        optim_q = tf.train.AdamOptimizer(learning_rate=lrq)
        grads_and_vars_q = optim_q.compute_gradients(loss_q, var_list=self.theta_q)
        optimize_q = optim_q.apply_gradients(grads_and_vars_q)
        with tf.control_dependencies([optimize_q]):
            train_q = tf.group(update_qt)

        # logging
        log_obs = [] if dimO[0] > 20 else [tf.histogram_summary("obs/" + str(i), obs[:, i]) for i in range(dimO[0])]
        log_act = [] if dimA[0] > 20 else [tf.histogram_summary("act/inf" + str(i), act_test[:, i]) for i in
                                           range(dimA[0])]
        log_act2 = [] if dimA[0] > 20 else [tf.histogram_summary("act/train" + str(i), act_train[:, i]) for i in
                                            range(dimA[0])]
        log_misc = [sum_p, sum_qq, tf.histogram_summary("td_error", td_error)]
        log_grad = [grad_histograms(grads_and_vars_p), grad_histograms(grads_and_vars_q)]
        log_noise = [tf.histogram_summary('noise', noise_var)]
        log_train = log_obs + log_act + log_act2 + log_misc + log_grad + log_noise

        merged = tf.merge_all_summaries()
        # initialize tf log writer
        self.writer = tf.train.SummaryWriter(FLAGS.outdir + "/tf", self.sess.graph, flush_secs=20)

        # init replay memory for recording episodes
        max_ep_length = 10000
        self.rm_log = ReplayMemory(max_ep_length, dimO, dimA, rm_dtype)

        # tf functions
        with self.sess.as_default():
            self.act_test = Fun(obs, act_test)
            self._act_expl = Fun(obs, act_expl)
            self._reset = Fun([], self.ou_reset)
            self._train_q = Fun([obs, act_train, rew, obs2, term2], [train_q], log_train, self.writer)
            self._train_p = Fun([obs], [train_p])
            self._train_p = Fun([obs], [train_p], log_obs, self.writer)
            self._train = Fun([obs, act_train, rew, obs2, term2], [train_p, train_q], merged, self.writer)

        # initialize tf variables
        self.saver = tf.train.Saver(max_to_keep=1)
        ckpt = tf.train.latest_checkpoint(FLAGS.outdir + "/tf")
        if ckpt:
            self.saver.restore(self.sess, ckpt)
        else:
            self.sess.run(tf.initialize_all_variables())

        self.sess.graph.finalize()

        self.t = 0  # global training time (number of observations)
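
The exploration branch above adds Ornstein-Uhlenbeck noise to the deterministic policy: the `assign_sub` amounts to `noise <- noise - ou_theta * noise + N(0, ou_sigma)`. A standalone sketch of that recursion with illustrative `theta`/`sigma` values (the source reads them from flags):

import numpy as np

rng = np.random.default_rng(0)
theta, sigma = 0.15, 0.2            # illustrative values
noise = np.zeros(2)
for _ in range(5):
    noise = noise - theta * noise + rng.normal(scale=sigma, size=noise.shape)
print(noise)                        # mean-reverting noise added to act_test at act time
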
Example No. 46
0
from network import Network
from agent import Agent
from replay_memory import ReplayMemory

NUM_EPISODE = 1000
RENDER = False
REWARD_SUM_QUEUE_SIZE = 100

MEMORY_SIZE = 2000
TRAIN_START = 1000

if __name__ == "__main__":
    env = gym.make('CartPole-v0')
    network = Network("cpu:0")
    agent = Agent(network)
    replay_memory = ReplayMemory(MEMORY_SIZE)

    reward_sum_queue = []
    reward_sum_history = []
    reward_sum_avg_history = []
    for n_episode in range(NUM_EPISODE):
        state = env.reset()

        done = False
        reward_sum = 0.0
        while not done:
            if RENDER:
                env.render()

            action = agent.get_action(state)
            next_state, reward, done, _ = env.step(action)
Example No. 47
0
class QEngine:
    def __init__(self, **kwargs):
        self.setup = kwargs
        self._initialize(**kwargs)
        if "game" in kwargs:
            del kwargs["game"]

    def _prepare_for_save(self):
        self.setup["epsilon"] = self.epsilon
        self.setup["steps"] = self.steps
        self.setup["skiprate"] = self.skiprate

    # TODO why isn't it in init?
    # There was some reason but can't remember it now.
    def _initialize(self, game=None, network_args=None, actions=None, name=None,
                    net_type="dqn",  # TODO change to the actual class name?
                    reshaped_x=None,
                    reshaped_y=None,
                    skiprate=3,
                    history_length=4,
                    batchsize=64,
                    update_pattern=(1, 1),
                    replay_memory_size=10000,
                    backprop_start_step=10000,
                    start_epsilon=1.0,
                    end_epsilon=0.1,
                    epsilon_decay_start_step=50000,
                    epsilon_decay_steps=100000,
                    reward_scale=1.0,  # TODO useless?
                    melt_steps=10000,

                    shaping_on=False,
                    count_time=False,
                    one_hot_time=False,
                    count_time_interval=1,
                    count_time_max=2100,

                    use_game_variables=True,
                    rearrange_misc=False,

                    remember_n_actions=4,
                    one_hot_nactions=False,

                    misc_scale=None,  # TODO seems useless
                    results_file=None,
                    params_file=None,
                    config_file=None,

                    no_timeout_terminal=False  # TODO seems useless
                    ):

        if game is not None:
            self.game = game
            self.config_file = None
        elif config_file is not None:
            self.config_file = config_file
            self.game = initialize_doom(self.config_file)
        else:
            raise Exception("No game, no config file. Dunno how to initialize doom.")

        if network_args is None:
            network_args = dict()

        if count_time:
            self.count_time = bool(count_time)
            if self.count_time:
                self.one_hot_time = one_hot_time
                self.count_time_max = int(count_time_max)
                self.count_time_interval = int(count_time_interval)
                if one_hot_time:
                    self.count_time_len = int(self.count_time_max / self.count_time_interval)
                else:
                    self.count_time_len = 1
        else:
            self.count_time_len = 0
            self.count_time = False

        self.name = name
        if reward_scale is not None:
            self.reward_scale = reward_scale
        else:
            self.reward_scale = 1.0
        self.rearrange_misc = rearrange_misc
        self.batchsize = batchsize
        self.history_length = max(history_length, 1)
        self.update_pattern = update_pattern
        self.epsilon = max(min(start_epsilon, 1.0), 0.0)
        self.end_epsilon = min(max(end_epsilon, 0.0), self.epsilon)
        self.epsilon_decay_steps = epsilon_decay_steps
        self.epsilon_decay_stride = (self.epsilon - end_epsilon) / epsilon_decay_steps
        self.epsilon_decay_start = epsilon_decay_start_step
        self.skiprate = max(skiprate, 0)
        self.shaping_on = shaping_on
        self.steps = 0
        self.melt_steps = melt_steps
        self.backprop_start_step = max(backprop_start_step, batchsize)
        self.one_hot_nactions = one_hot_nactions
        self.no_timeout_terminal = no_timeout_terminal
        if results_file:
            self.results_file = results_file
        else:
            self.results_file = "results/" + name + ".res"
        if params_file:
            self.params_file = params_file
        else:
            self.params_file = "params/" + name

        if self.game.get_available_game_variables_size() > 0 and use_game_variables:
            self.use_game_variables = True
        else:
            self.use_game_variables = False

        self.last_shaping_reward = 0

        self.learning_mode = True

        if actions is None:
            self.actions = generate_default_actions(self.game)
        else:
            self.actions = actions
        self.actions_num = len(self.actions)
        self.actions_stats = np.zeros([self.actions_num], np.int)

        # changes img_shape according to the history size
        self.channels = self.game.get_screen_channels()
        if self.history_length > 1:
            self.channels *= self.history_length

        if reshaped_x is None:
            x = self.game.get_screen_width()
            y = self.game.get_screen_height()
            scale_x = scale_y = 1.0
        else:
            x = reshaped_x
            scale_x = float(x) / self.game.get_screen_width()

            if reshaped_y is None:
                y = int(self.game.get_screen_height() * scale_x)
                scale_y = scale_x
            else:
                y = reshaped_y
                scale_y = float(y) / self.game.get_screen_height()

        img_shape = [self.channels, y, x]

        # TODO check if it is slow (it seems that no)
        if scale_x == 1 and scale_y == 1:
            def convert(img):
                img = img.astype(np.float32) / 255.0
                return img
        else:
            def convert(img):
                img = img.astype(np.float32) / 255.0
                new_image = np.ndarray([img.shape[0], y, x], dtype=img.dtype)
                for i in xrange(img.shape[0]):
                    # new_image[i] = skimage.transform.resize(img[i], (y,x), preserve_range=True)
                    new_image[i] = cv2.resize(img[i], (x, y), interpolation=cv2.INTER_AREA)
                return new_image
        self.convert_image = convert

        if self.use_game_variables:
            single_state_misc_len = int(self.game.get_available_game_variables_size() + self.count_time_len)
        else:
            single_state_misc_len = int(self.count_time_len)
        self.single_state_misc_len = single_state_misc_len

        self.remember_n_actions = remember_n_actions
        total_misc_len = int(single_state_misc_len * self.history_length)

        if remember_n_actions > 0:
            self.remember_n_actions = remember_n_actions
            if self.one_hot_nactions:
                self.action_len = int(2 ** floor(log(len(self.actions), 2)))
            else:
                self.action_len = len(self.actions[0])
            self.last_action = np.zeros([self.action_len], dtype=np.float32)
            self.last_n_actions = np.zeros([remember_n_actions * self.action_len], dtype=np.float32)
            total_misc_len += len(self.last_n_actions)

        if total_misc_len > 0:
            self.misc_state_included = True
            self.current_misc_state = np.zeros(total_misc_len, dtype=np.float32)
            if single_state_misc_len > 0:
                if misc_scale is not None:
                    self.misc_scale = np.array(misc_scale, dtype=np.float32)
                else:
                    self.misc_scale = None
        else:
            self.misc_state_included = False

        state_format = dict()
        state_format["s_img"] = img_shape
        state_format["s_misc"] = total_misc_len
        self.replay_memory = ReplayMemory(state_format, replay_memory_size, batchsize)

        network_args["state_format"] = state_format
        network_args["actions_number"] = len(self.actions)

        if net_type in ("dqn", None, ""):
            self.approximator = approximators.DQN(**network_args)
        elif net_type in ["duelling", "dueling"]:
            self.approximator = approximators.DuelingDQN(**network_args)
        else:
            if locate('approximators.' + net_type) is not None:
                self.approximator = locate('approximators.' + net_type)(**network_args)
            else:
                raise Exception("Unsupported approximator type.")

        self.current_image_state = np.zeros(img_shape, dtype=np.float32)

    def _update_state(self):
        raw_state = self.game.get_state()
        img = self.convert_image(raw_state.image_buffer)
        state_misc = None
        if self.single_state_misc_len > 0:
            state_misc = np.zeros(self.single_state_misc_len, dtype=np.float32)
            if self.use_game_variables:
                game_variables = raw_state.game_variables.astype(np.float32)
                state_misc[0:len(game_variables)] = game_variables
                count_time_start = len(game_variables)
            else:
                count_time_start = 0

            if self.count_time:
                raw_time = raw_state.number
                processed_time = int(min(self.count_time_max, raw_time) / self.count_time_interval)
                if self.one_hot_time:
                    num_one_hot = processed_time - 1
                    state_number = np.zeros([self.count_time_len], dtype=np.float32)
                    state_number[num_one_hot] = 1
                    '''
                    # TODO make it available in options
                    # HACK1 that uses health and count as one hot at once
                    hp = int(raw_state.game_variables[0])
                    state = raw_time
                    state_number = np.zeros([self.count_time_len], dtype=np.float32)
                    state_number[hp - 1] = 1
                    state_number[99 + state] = 1
                    # HACK1 ends
                    '''
                    '''
                    # TODO make it available in options
                    # HACK2 that uses health as one hot
                    hp = int(raw_state.game_variables[0])
                    state_number = np.zeros([self.count_time_len], dtype=np.float32)
                    state_number[hp - 1] = 1
                    # HACK2 ends
                     '''
                else:
                    state_number = processed_time

                state_misc[count_time_start:] = state_number

            if self.misc_scale is not None:
                state_misc = state_misc * self.misc_scale

        if self.history_length > 1:
            pure_channels = self.channels / self.history_length
            self.current_image_state[0:-pure_channels] = self.current_image_state[pure_channels:]
            self.current_image_state[-pure_channels:] = img

            if self.single_state_misc_len > 0:
                misc_len = len(state_misc)
                hist_len = self.history_length

                # TODO: don't shift count_time when it's one-hot - shifting it is useless and slightly hurts performance
                if self.rearrange_misc:
                    for i in xrange(misc_len):
                        cms_part = self.current_misc_state[i * hist_len:(i + 1) * hist_len]
                        cms_part[0:hist_len - 1] = cms_part[1:]
                        cms_part[-1] = state_misc[i]
                else:
                    cms = self.current_misc_state
                    cms[0:(hist_len - 1) * misc_len] = cms[misc_len:hist_len * misc_len]
                    cms[(hist_len - 1) * misc_len:hist_len * misc_len] = state_misc

        else:
            self.current_image_state[:] = img
            if self.single_state_misc_len > 0:
                self.current_misc_state[0:len(state_misc)] = state_misc

        if self.remember_n_actions:
            self.last_n_actions[:-self.action_len] = self.last_n_actions[self.action_len:]

            self.last_n_actions[-self.action_len:] = self.last_action
            self.current_misc_state[-len(self.last_n_actions):] = self.last_n_actions

    def new_episode(self, update_state=False):
        self.game.new_episode()
        self.reset_state()
        self.last_shaping_reward = 0
        if update_state:
            self._update_state()

    def set_last_action(self, index):
        if self.one_hot_nactions:
            self.last_action.fill(0)
            self.last_action[index] = 1
        else:
            self.last_action[:] = self.actions[index]

    # Return current state including history
    def _current_state(self):
        if self.misc_state_included:
            s = [self.current_image_state, self.current_misc_state]
        else:
            s = [self.current_image_state]
        return s

    # Return current state's COPY including history.
    def _current_state_copy(self):
        if self.misc_state_included:
            s = [self.current_image_state.copy(), self.current_misc_state.copy()]
        else:
            s = [self.current_image_state.copy()]
        return s

    # Sets the whole state to zeros.
    def reset_state(self):
        self.current_image_state.fill(0.0)

        if self.misc_state_included:
            self.current_misc_state.fill(0.0)
            if self.remember_n_actions > 0:
                self.set_last_action(0)
                self.last_n_actions.fill(0)

    def make_step(self):
        self._update_state()
        # TODO Check if not making the copy still works
        a = self.approximator.estimate_best_action(self._current_state_copy())
        self.actions_stats[a] += 1
        self.game.make_action(self.actions[a], self.skiprate + 1)
        if self.remember_n_actions:
            self.set_last_action(a)

    def make_sleep_step(self, sleep_time=1 / 35.0):
        self._update_state()
        a = self.approximator.estimate_best_action(self._current_state_copy())
        self.actions_stats[a] += 1

        self.game.set_action(self.actions[a])
        if self.remember_n_actions:
            self.set_last_action(a)
        for i in xrange(self.skiprate):
            self.game.advance_action(1, False, True)
            sleep(sleep_time)
        self.game.advance_action()

        sleep(sleep_time)

    def check_timeout(self):
        return (self.game.get_episode_time() - self.game.get_episode_start_time() >= self.game.get_episode_timeout())

    # Performs a learning step according to epsilon-greedy policy.
    # The step spans self.skiprate + 1 actions.
    def make_learning_step(self):
        self.steps += 1
        # epsilon decay
        if self.steps > self.epsilon_decay_start and self.epsilon > self.end_epsilon:
            self.epsilon = max(self.epsilon - self.epsilon_decay_stride, 0)

        # Copy because state will be changed in a second
        s = self._current_state_copy()

        # With probability epsilon choose a random action:
        if self.epsilon >= random.random():
            a = random.randint(0, len(self.actions) - 1)
        else:
            a = self.approximator.estimate_best_action(s)
        self.actions_stats[a] += 1

        # make action and get the reward
        if self.remember_n_actions:
            self.set_last_action(a)

        r = self.game.make_action(self.actions[a], self.skiprate + 1)
        r = np.float32(r)
        if self.shaping_on:
            sr = np.float32(doom_fixed_to_double(self.game.get_game_variable(GameVariable.USER1)))
            r += sr - self.last_shaping_reward
            self.last_shaping_reward = sr

        r *= self.reward_scale

        # update state s2 accordingly and add transition
        if self.game.is_episode_finished():
            if (not self.no_timeout_terminal) or (not self.check_timeout()):
                s2 = None
                self.replay_memory.add_transition(s, a, s2, r, terminal=True)
        else:
            self._update_state()
            s2 = self._current_state()
            self.replay_memory.add_transition(s, a, s2, r, terminal=False)

        # Perform q-learning once for a while
        if self.replay_memory.size >= self.backprop_start_step and self.steps % self.update_pattern[0] == 0:
            for a in xrange(self.update_pattern[1]):
                self.approximator.learn(self.replay_memory.get_sample())

        # Melt the network sometimes
        if self.steps % self.melt_steps == 0:
            self.approximator.melt()

    # Runs a single episode using the current policy (no learning steps are performed here).
    def run_episode(self, sleep_time=0):
        self.new_episode()
        if sleep_time == 0:
            while not self.game.is_episode_finished():
                self.make_step()
        else:
            while not self.game.is_episode_finished():
                self.make_sleep_step(sleep_time)

        return np.float32(self.game.get_total_reward())

    # Utility stuff
    def get_actions_stats(self, clear=False, norm=True):
        stats = self.actions_stats.copy()
        if norm:
            stats = stats / np.float32(self.actions_stats.sum())
            stats[stats == 0.0] = -1
            stats = np.around(stats, 3)

        if clear:
            self.actions_stats.fill(0)
        return stats

    def get_steps(self):
        return self.steps

    def get_epsilon(self):
        return self.epsilon

    def get_network(self):
        return self.approximator.network

    def set_epsilon(self, eps):
        self.epsilon = eps

    def set_skiprate(self, skiprate):
        self.skiprate = max(skiprate, 0)

    def get_skiprate(self):
        return self.skiprate

    def get_mean_loss(self):
        return self.approximator.get_mean_loss()

    # Saves network weights to a file
    def save_params(self, filename, quiet=False):
        if not quiet:
            print "Saving network weights to " + filename + "..."
        self._prepare_for_save()
        params = get_all_param_values(self.approximator.network)
        pickle.dump(params, open(filename, "wb"))
        if not quiet:
            print "Saving finished."

    # Loads network weights from the file
    def load_params(self, filename, quiet=False):
        if not quiet:
            print "Loading network weights from " + filename + "..."
        params = pickle.load(open(filename, "rb"))
        set_all_param_values(self.approximator.network, params)
        set_all_param_values(self.approximator.frozen_network, params)

        if not quiet:
            print "Loading finished."

    def get_network_architecture(self):
        return get_all_param_values(self.get_network())

    def print_setup(self):
        print "\nNetwork architecture:"
        for p in self.get_network_architecture():
            print p.shape
        print "\n*** Engine setup ***"
        for k in self.setup.keys():
            if k == "network_args":
                print"network_args:"
                net_args = self.setup[k]
                for k2 in net_args.keys():
                    print "\t", k2, ":", net_args[k2]
            else:
                print k, ":", self.setup[k]

    # Loads the whole engine with params from a file
    @staticmethod
    def load(filename, game=None, config_file=None, quiet=False):
        if not quiet:
            print "Loading qengine from " + filename + "..."

        params = pickle.load(open(filename, "rb"))

        qengine_args = params[0]
        network_weights = params[1]

        steps = qengine_args["steps"]
        epsilon = qengine_args["epsilon"]
        del (qengine_args["epsilon"])
        del (qengine_args["steps"])
        if game is None:
            if config_file is not None:
                game = initialize_doom(config_file)
                qengine_args["config_file"] = config_file
            elif "config_file" in qengine_args and qengine_args["config_file"] is not None:
                game = initialize_doom(qengine_args["config_file"])
            else:
                raise Exception("No game, no config file. Dunno how to initialize doom.")
        else:
            qengine_args["config_file"] = None

        qengine_args["game"] = game
        qengine = QEngine(**qengine_args)
        set_all_param_values(qengine.approximator.network, network_weights)
        set_all_param_values(qengine.approximator.frozen_network, network_weights)

        if not quiet:
            print "Loading finished."
        # Restore training progress regardless of the quiet flag
        qengine.steps = steps
        qengine.epsilon = epsilon
        return qengine

    # Saves the whole engine with params to a file
    def save(self, filename=None, quiet=False):
        if filename is None:
            filename = self.params_file
        if not quiet:
            print "Saving qengine to " + filename + "..."
        self._prepare_for_save()
        network_params = get_all_param_values(self.approximator.network)
        params = [self.setup, network_params]
        pickle.dump(params, open(filename, "wb"))
        if not quiet:
            print "Saving finished."
Exemplo n.º 48
0
def train(active_mv):
    senv = ShapeNetEnv(FLAGS)
    replay_mem = ReplayMemory(FLAGS)

    log_string('====== Starting burning in memories ======')
    burn_in(senv, replay_mem)
    log_string('====== Done. {} trajectories burnt in ======'.format(
        FLAGS.burn_in_length))

    rollout_obj = Rollout(active_mv, senv, replay_mem, FLAGS)
    # burn in(pretrain) for MVnet
    if FLAGS.burn_in_iter > 0:
        for i in range(FLAGS.burnin_start_iter,
                       FLAGS.burnin_start_iter + FLAGS.burn_in_iter):

            rollout_obj.go(i,
                           verbose=True,
                           add_to_mem=True,
                           mode=FLAGS.burnin_mode,
                           is_train=True)
            mvnet_input = replay_mem.get_batch_list(FLAGS.batch_size)

            tic = time.time()
            out_stuff = run_step(mvnet_input, mode='burnin', is_training=True)

            if (i + 1) % FLAGS.save_every_step == 0 and i > FLAGS.burnin_start_iter:
                save_pretrain(active_mv, i + 1)

            if (((i + 1) % FLAGS.test_every_step == 0
                 and i > FLAGS.burnin_start_iter)
                    or (FLAGS.eval0 and i == FLAGS.burnin_start_iter)):
                evaluate_burnin(
                    active_mv,
                    FLAGS.test_episode_num,
                    replay_mem,
                    i + 1,
                    rollout_obj,
                    mode=FLAGS.burnin_mode,
                    override_mvnet_input=(batch_to_single_mvinput(mvnet_input)
                                          if FLAGS.reproj_mode else None))

    for i_idx in range(FLAGS.max_iter):

        t0 = time.time()

        if np.random.uniform() < FLAGS.epsilon:
            rollout_obj.go(i_idx,
                           verbose=True,
                           add_to_mem=True,
                           mode=FLAGS.explore_mode,
                           is_train=True)
        else:
            rollout_obj.go(i_idx, verbose=True, add_to_mem=True, is_train=True)
        t1 = time.time()

        mvnet_input = replay_mem.get_batch_list(FLAGS.batch_size)
        t2 = time.time()

        out_stuff = active_mv.run_step(mvnet_input,
                                       mode='train',
                                       is_training=True)
        t3 = time.time()

        train_log(i_idx, out_stuff, (t0, t1, t2, t3))

        if (i_idx + 1) % FLAGS.save_every_step == 0 and i_idx > 0:
            save(active_mv, i_idx + 1, i_idx + 1, i_idx + 1)

        if (i_idx + 1) % FLAGS.test_every_step == 0 and i_idx > 0:
            print('Evaluating active policy')
            evaluate(active_mv,
                     FLAGS.test_episode_num,
                     replay_mem,
                     i_idx + 1,
                     rollout_obj,
                     mode='active')
            print('Evaluating random policy')
            evaluate(active_mv,
                     FLAGS.test_episode_num,
                     replay_mem,
                     i_idx + 1,
                     rollout_obj,
                     mode='oneway')
def train(params):
    
    # Load Atari rom and prepare ALE environment 
    atari = GymEnvironment(params.random_start_wait, params.show_game)

    # Initialize two Q-Value Networks one for training and one for target prediction
    dqn_train  = DeepQNetwork(
        params=params,
        num_actions=atari.num_actions,
        network_name="qnetwork-train",
        trainable=True
    )

    # Q-Network for predicting target Q-values
    dqn_target = DeepQNetwork(
        params=params,
        num_actions=atari.num_actions,
        network_name="qnetwork-target",
        trainable=False
    )
    
    # Initialize replay memory for storing experience to sample batches from
    replay_mem = ReplayMemory(params.replay_capacity, params.batch_size)

    # Small structure for storing the last four screens
    history = ScreenHistory(params)

    # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it
    replay_mem_dump   = os.path.abspath(os.path.join(params.output_dir, "replay_memory.hdf5"))
    checkpoint_dir    = os.path.abspath(os.path.join(params.output_dir, "checkpoints"))
    checkpoint_prefix = os.path.join(checkpoint_dir, "model")
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)

    train_step         = 0
    count_actions      = np.zeros(atari.num_actions)   # Count per action (only greedy)
    count_act_random   = 0  # Count of random actions
    count_act_greedy   = 0  # Count of greedy actions

    # Histories of qvalues and loss for running average
    qvalues_hist = collections.deque([0]*params.interval_summary,  maxlen=params.interval_summary)
    loss_hist    = collections.deque([10]*params.interval_summary, maxlen=params.interval_summary)

    # Time measurements
    dt_batch_gen    = collections.deque([0]*10, maxlen=10)
    dt_optimization = collections.deque([0]*10, maxlen=10)
    dt_train_total  = collections.deque([0]*10, maxlen=10)

    # Optionally load pre-initialized replay memory from disk
    if params.replay_mem_dump is not None and params.is_train:
        print("Loading pre-initialized replay memory from HDF5 file.")
        replay_mem.load(params.replay_mem_dump)


    # Initialize a new game and store the screens in the history
    reward, screen, is_terminal = atari.new_random_game()
    for _ in xrange(params.history_length):
        history.add(screen)

    # Initialize the TensorFlow session
    gpu_options = tf.GPUOptions(
       per_process_gpu_memory_fraction=0.4
    )

    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:

        # Initialize the TensorFlow session
        init = tf.initialize_all_variables()
        sess.run(init)

        # Only save trainable variables and the global step to disk
        tf_vars_to_save = tf.trainable_variables() + [dqn_train.global_step]
        saver = tf.train.Saver(tf_vars_to_save, max_to_keep=40)

        if params.model_file is not None:
            # Load pre-trained model from disk
            saver.restore(sess, params.model_file)
            train_step, learning_rate = sess.run([dqn_train.global_step, dqn_train.learning_rate])
            print("Restarted training from model file. Step = %06i, Learning Rate = %.5f" % (train_step, learning_rate))


        # Initialize summary writer
        dqn_train.build_summary_writer(sess)

        # Initialize the target Q-Network fixed with the same weights
        update_target_network(sess, "qnetwork-train", "qnetwork-target")


        for step in xrange(params.num_steps):

            replay_mem_size = replay_mem.num_examples()
            if params.is_train and replay_mem_size < params.train_start and step % 1000 == 0:
                print("Initializing replay memory %i/%i" % (step, params.train_start))

            # Epsilon Greedy Exploration: with the probability of epsilon
            # choose a random action, otherwise go greedy with the action
            # having the maximal Q-value. Note the minimum epsilon of 0.1.
            if params.is_train:
                epsilon = max(0.1, 1.0-float(train_step*params.train_freq) / float(params.epsilon_step))
            else:
                epsilon = 0.05


            ################################################################
            ####################### SELECT A MOVE ##########################
            ################################################################

            # Either choose a random action or predict the action using the Q-network
            do_random_action = (random.random() < epsilon)
            if do_random_action or (replay_mem_size < params.train_start and params.is_train):
                action_id = random.randrange(atari.num_actions)
                count_act_random += 1
            else:

                # Get the last screens from the history and perform
                # feed-forward through the network to compute Q-values
                feed_dict  = { dqn_train.pl_screens: history.get() }
                qvalues    = sess.run(dqn_train.qvalues, feed_dict=feed_dict)

                # Choose the best action based on the approximated Q-values
                qvalue_max = np.max(qvalues[0])
                action_id  = np.argmax(qvalues[0])

                count_act_greedy += 1
                count_actions[action_id] += 1
                qvalues_hist.append(qvalue_max)


            ################################################################
            ####################### PLAY THE MOVE ##########################
            ################################################################

            # Play the selected action (either random or predicted) on the Atari game
            # Note that the action is performed for k = 4 frames (frame skipping)
            cumulative_reward, screen, is_terminal = atari.act(action_id)

            # Perform reward clipping and add the example to the replay memory
            cumulative_reward = min(+1.0, max(-1.0, cumulative_reward))

            # Add the screen to short term history and replay memory
            history.add(screen)

            # Add experience to replay memory
            if params.is_train:
                replay_mem.add(action_id, cumulative_reward, screen, is_terminal)

            # Check if we are game over, and if yes, initialize a new game
            if is_terminal:
                reward, screen, is_terminal = atari.new_random_game()
                replay_mem.add(0, reward, screen, is_terminal)
                history.add(screen)


            ################################################################
            ###################### TRAINING MODEL ##########################
            ################################################################


            if params.is_train and step > params.train_start and step % params.train_freq == 0:

                t1 = time.time()

                # Prepare batch and train the network
                # TODO: set actions with terminal == 1 to reward = -1 ??
                screens_in, actions, rewards, screens_out, terminals = replay_mem.sample_batch()

                dt_batch_gen.append(time.time() - t1)
                t2 = time.time()

                # Compute the target rewards from the previously fixed network
                # Note that the forward run is performed on the output screens.
                qvalues_target = sess.run(
                    dqn_target.qvalues,
                    feed_dict={ dqn_target.pl_screens: screens_out }
                )
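                # (Note: the Bellman target r + gamma * max_a' Q_target(s', a') is presumably
                # assembled inside dqn_train's graph from pl_rewards, pl_terminals and
                # pl_qtargets; only max_a' Q_target(s', a') is fed in below.)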

                # Inputs for trainable Q-network
                feed_dict = {
                    dqn_train.pl_screens   : screens_in,
                    dqn_train.pl_actions   : actions,
                    dqn_train.pl_rewards   : rewards,
                    dqn_train.pl_terminals : terminals,
                    dqn_train.pl_qtargets  : np.max(qvalues_target, axis=1),

                }

                # Actual training operation
                _, loss, train_step = sess.run([dqn_train.train_op,
                                                dqn_train.loss,
                                                dqn_train.global_step],
                                                feed_dict=feed_dict)

                t3 = time.time()
                dt_optimization.append(t3 - t2)
                dt_train_total.append(t3 - t1)

                # Running average of the loss
                loss_hist.append(loss)

                # Check if the returned loss is not NaN
                if np.isnan(loss):
                    print("[%s] Training failed with loss = NaN." %
                          datetime.now().strftime("%Y-%m-%d %H:%M"))


                # Once every n = 10000 frames update the Q-network for predicting targets
                if train_step % params.network_update_rate == 0:
                    print("[%s] Updating target network." % datetime.now().strftime("%Y-%m-%d %H:%M"))
                    update_target_network(sess, "qnetwork-train", "qnetwork-target")


                ################################################################
                ####################### MODEL EVALUATION #######################
                ################################################################

                if params.is_train and train_step % params.eval_frequency == 0:

                    eval_total_reward = 0
                    eval_num_episodes = 0
                    eval_num_rewards = 0
                    eval_episode_max_reward = 0
                    eval_episode_reward = 0
                    eval_actions = np.zeros(atari.num_actions)

                    # Initialize new game without random start moves
                    reward, screen, terminal = atari.new_game()
                    for _ in range(4):
                        history.add(screen)

                    for eval_step in range(params.eval_steps):

                        if random.random() < params.eval_epsilon:
                            # Random action
                            action_id = random.randrange(atari.num_actions)
                        else:
                            # Greedy action
                            # Get the last screens from the history and perform
                            # feed-forward through the network to compute Q-values
                            feed_dict_eval  = { dqn_train.pl_screens: history.get() }
                            qvalues = sess.run(dqn_train.qvalues, feed_dict=feed_dict_eval)

                            # Choose the best action based on the approximated Q-values
                            qvalue_max = np.max(qvalues[0])
                            action_id  = np.argmax(qvalues[0])

                        # Keep track of how many of each action is performed
                        eval_actions[action_id] += 1

                        # Perform the action
                        reward, screen, terminal = atari.act(action_id)
                        history.add(screen)

                        eval_episode_reward += reward
                        if reward > 0:
                            eval_num_rewards += 1

                        if terminal:
                            eval_total_reward += eval_episode_reward
                            eval_episode_max_reward = max(eval_episode_reward, eval_episode_max_reward)
                            eval_episode_reward = 0
                            eval_num_episodes += 1

                            reward, screen, terminal = atari.new_game()
                            for _ in range(4):
                                history.add(screen)

                    # Send statistics about the environment to TensorBoard
                    eval_update_ops = [
                        dqn_train.eval_rewards.assign(eval_total_reward),
                        dqn_train.eval_num_rewards.assign(eval_num_rewards),
                        dqn_train.eval_max_reward.assign(eval_episode_max_reward),
                        dqn_train.eval_num_episodes.assign(eval_num_episodes),
                        dqn_train.eval_actions.assign(eval_actions / np.sum(eval_actions))

                    ]
                    sess.run(eval_update_ops)
                    summaries = sess.run(dqn_train.eval_summary_op, feed_dict=feed_dict)
                    dqn_train.train_summary_writer.add_summary(summaries, train_step)

                    print("[%s] Evaluation Summary" % datetime.now().strftime("%Y-%m-%d %H:%M"))
                    print("  Total Reward: %i" % eval_total_reward)
                    print("  Max Reward per Episode: %i" % eval_episode_max_reward)
                    print("  Num Episodes: %i" % eval_num_episodes)
                    print("  Num Rewards: %i" % eval_num_rewards)


                ################################################################
                ###################### PRINTING / SAVING #######################
                ################################################################

                # Write a training summary to disk
                if params.is_train and train_step % params.interval_summary == 0:

                    avg_dt_batch_gen    = sum(dt_batch_gen)    / float(len(dt_batch_gen))
                    avg_dt_optimization = sum(dt_optimization) / float(len(dt_optimization))
                    avg_dt_total        = sum(dt_train_total)  / float(len(dt_train_total))
                    # print("Avg. Time Batch Preparation: %.3f seconds" % avg_dt_batch_gen)
                    # print("Avg. Time Train Operation:   %.3f seconds" % avg_dt_train_op)
                    # print("Avg. Time Total per Batch:   %.3f seconds (%.2f samples/second)" %
                    #       (avg_dt_total, (1.0/avg_dt_total)*params.batch_size))

                    # Send statistics about the environment to TensorBoard
                    update_game_stats_ops = [
                        dqn_train.avg_reward_per_game.assign(atari.avg_reward_per_episode()),
                        dqn_train.max_reward_per_game.assign(atari.max_reward_per_episode),
                        dqn_train.avg_moves_per_game.assign(atari.avg_steps_per_episode()),
                        dqn_train.total_reward_replay.assign(replay_mem.total_reward()),
                        dqn_train.num_games_played.assign(atari.episode_number),
                        dqn_train.actions_random.assign(count_act_random),
                        dqn_train.actions_greedy.assign(count_act_greedy),
                        dqn_train.runtime_batch.assign(avg_dt_batch_gen),
                        dqn_train.runtime_train.assign(avg_dt_optimization),
                        dqn_train.runtime_total.assign(avg_dt_total),
                        dqn_train.samples_per_second.assign((1.0/avg_dt_total)*params.batch_size)
                    ]
                    sess.run(update_game_stats_ops)

                    # Build and save summaries
                    summaries = sess.run(dqn_train.train_summary_op, feed_dict=feed_dict)
                    dqn_train.train_summary_writer.add_summary(summaries, train_step)

                    avg_qvalue = avg_loss = 0
                    for i in xrange(len(qvalues_hist)):
                        avg_qvalue += qvalues_hist[i]
                        avg_loss   += loss_hist[i]

                    avg_qvalue /= float(len(qvalues_hist))
                    avg_loss   /= float(len(loss_hist))

                    format_str = "[%s] Step %06i, ReplayMemory = %i, Epsilon = %.4f, "\
                                 "Episodes = %i, Avg.Reward = %.2f, Max.Reward = %.2f, Avg.QValue = %.4f, Avg.Loss = %.6f"
                    print(format_str % (datetime.now().strftime("%Y-%m-%d %H:%M"), train_step,
                                        replay_mem.num_examples(), epsilon, atari.episode_number,
                                        atari.avg_reward_per_episode(), atari.max_reward_per_episode,
                                        avg_qvalue, avg_loss))

                    # For debugging purposes, dump the batch to disk
                    #print("[%s] Writing batch images to file (debugging)" %
                    #      datetime.now().strftime("%Y-%m-%d %H:%M"))
                    #batch_output_dir = os.path.join(params.output_dir, "batches/%06i/" % train_step)
                    #replay_mem.write_batch_to_disk(batch_output_dir, screens_in, actions, rewards, screens_out)

                # Write model checkpoint to disk
                if params.is_train and train_step % params.interval_checkpoint == 0:
                    path = saver.save(sess, checkpoint_prefix, global_step=train_step)
                    print("[%s] Saved TensorFlow model checkpoint to %s." %
                          (datetime.now().strftime("%Y-%m-%d %H:%M"), path))

                    # Dump the replay memory to disk
                    # TODO: fix this!
                    # print("[%s] Saving replay memory to disk." %
                    #       datetime.now().strftime("%Y-%m-%d %H:%M"))
                    # replay_mem.save(replay_mem_dump)

                    sum_actions = float(reduce(lambda x, y: x+y, count_actions))
                    action_str = ""
                    for action_id, action_count in enumerate(count_actions):
                        action_perc = action_count/sum_actions if not sum_actions == 0 else 0
                        action_str += "<%i, %s, %i, %.2f> " % \
                                      (action_id, atari.action_to_string(action_id),
                                       action_count, action_perc)

                    format_str = "[%s] Q-Network Actions Summary: NumRandom: %i, NumGreedy: %i, %s"
                    print(format_str % (datetime.now().strftime("%Y-%m-%d %H:%M"),
                                        count_act_random, count_act_greedy, action_str))

        print("Finished training Q-network.")
Exemplo n.º 50
0
class Agent:

  def __init__(self, dimO, dimA,
    nets=nets_dm,
    tau=.001,            # target network soft-update rate (exponential moving average)
    discount=.99,        # reward discount factor
    pl2=.0,              # policy network L2 weight decay
    ql2=.01,             # Q network L2 weight decay
    lrp=.0001,           # policy learning rate
    lrq=.001,            # Q-function learning rate
    ou_theta=0.15,       # Ornstein-Uhlenbeck noise: mean-reversion rate
    ou_sigma=0.2,        # Ornstein-Uhlenbeck noise: stddev
    rm_size=500000,      # replay memory capacity
    rm_dtype='float32',  # replay memory dtype
    mb_size=32,          # minibatch size
    threads=4,           # tf inter-op parallelism threads
    **kwargs):
    dimA = list(dimA)
    dimO = list(dimO)

    # init replay memory
    self.rm = ReplayMemory(rm_size, dimO, dimA, dtype=np.__dict__[rm_dtype])
    self.mb_size = mb_size
    # start tf session
    self.sess = tf.Session(config=tf.ConfigProto(
      inter_op_parallelism_threads=threads,
      log_device_placement=False,
      allow_soft_placement=True))

    # create tf computational graph
    #
    self.theta_p = nets.theta_p(dimO, dimA)
    self.theta_q = nets.theta_q(dimO, dimA)
    self.theta_pt, update_pt = exponential_moving_averages(
      self.theta_p, tau)
    self.theta_qt, update_qt = exponential_moving_averages(
      self.theta_q, tau)

    obs = tf.placeholder(tf.float32, [None] + dimO, "obs")
    act_test, sum_p = nets.policy(obs, self.theta_p)

    # explore
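    # Ornstein-Uhlenbeck exploration noise: each evaluation of `noise` updates
    # noise <- noise - ou_theta * noise + N(0, ou_sigma), which is added to the policy output.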
    noise_init = tf.zeros([1]+dimA)
    noise_var = tf.Variable(noise_init)
    self.ou_reset = noise_var.assign(noise_init)
    noise = noise_var.assign_sub(
      (ou_theta) * noise_var - tf.random_normal(dimA, stddev=ou_sigma))
    act_expl = act_test + noise

    # test
    q, sum_q = nets.qfunction(obs, act_test, self.theta_q)

    # training
    # policy loss
    meanq = tf.reduce_mean(q, 0)
    wd_p = tf.add_n([pl2 * tf.nn.l2_loss(var)
             for var in self.theta_p])  # weight decay
    loss_p = -meanq + wd_p
    # policy optimization
    optim_p = tf.train.AdamOptimizer(learning_rate=lrp)
    grads_and_vars_p = optim_p.compute_gradients(
      loss_p, var_list=self.theta_p)
    optimize_p = optim_p.apply_gradients(grads_and_vars_p)
    with tf.control_dependencies([optimize_p]):
      train_p = tf.group(update_pt)

    # q optimization
    act_train = tf.placeholder(tf.float32, [None] + dimA, "act_train")
    rew = tf.placeholder(tf.float32, [None], "rew")
    obs2 = tf.placeholder(tf.float32, [None] + dimO, "obs2")
    term2 = tf.placeholder(tf.bool, [None], "term2")
    # q
    q, sum_qq = nets.qfunction(obs, act_train, self.theta_q)
    # q targets
    act2, sum_p2 = nets.policy(obs2, theta=self.theta_pt)
    q2, sum_q2 = nets.qfunction(obs2, act2, theta=self.theta_qt)
    q_target = tf.stop_gradient(tf.select(term2,rew,rew + discount*q2))
    # = tf.stop_gradient(rew + discount * q2)
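    # i.e. q_target = rew for terminal transitions, rew + discount * Q'(obs2, pi'(obs2)) otherwise,
    # with gradients blocked so only the online Q network is updated by the TD error below.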
    # q loss
    mb_td_error = tf.square(q - q_target)
    mean_td_error = tf.reduce_mean(mb_td_error, 0)
    wd_q = tf.add_n([ql2 * tf.nn.l2_loss(var)
             for var in self.theta_q])  # weight decay
    loss_q = mean_td_error + wd_q
    # q optimization
    optim_q = tf.train.AdamOptimizer(learning_rate=lrq)
    grads_and_vars_q = optim_q.compute_gradients(
      loss_q, var_list=self.theta_q)
    optimize_q = optim_q.apply_gradients(grads_and_vars_q)
    with tf.control_dependencies([optimize_q]):
      train_q = tf.group(update_qt)

    # logging
    log_obs = [] if dimO[0]>20 else [tf.histogram_summary("obs/"+str(i),obs[:,i]) for i in range(dimO[0])]
    log_act = [] if dimA[0]>20 else [tf.histogram_summary("act/inf"+str(i),act_test[:,i]) for i in range(dimA[0])]
    log_act2 = [] if dimA[0]>20 else [tf.histogram_summary("act/train"+str(i),act_train[:,i]) for i in range(dimA[0])]
    log_misc = [sum_p, sum_qq, tf.histogram_summary("qfunction/td_error", mb_td_error)]
    log_grad = [grad_histograms(grads_and_vars_p), grad_histograms(grads_and_vars_q)]
    log_train = log_obs + log_act + log_act2 + log_misc + log_grad

    # initialize tf log writer
    self.writer = tf.train.SummaryWriter(
      "./tf", self.sess.graph, flush_secs=20)

    # init replay memory for recording episodes
    max_ep_length = 10000
    self.rm_log = ReplayMemory(max_ep_length,dimO,dimA,rm_dtype) 

    # tf functions
    with self.sess.as_default():
      self._act_test = Fun(obs,act_test)
      self._act_expl = Fun(obs,act_expl)
      self._reset = Fun([],self.ou_reset)
      self._train = Fun([obs,act_train,rew,obs2,term2],[train_p,train_q],log_train,self.writer)

    # initialize tf variables
    self.saver = tf.train.Saver(max_to_keep=1)
    ckpt = tf.train.latest_checkpoint("./tf")
    if ckpt:
      self.saver.restore(self.sess,ckpt)
    else:
      self.sess.run(tf.initialize_all_variables())

    self.sess.graph.finalize()

    self.t = 0  # global training time (number of observations)

  def reset(self, obs):
    self._reset()
    self.observation = np.squeeze(obs)  # initial observation

  def act(self, test=False, logging=False):
    obs = np.expand_dims(self.observation, axis=0)
    action = self._act_test(obs) if test else self._act_expl(obs)
    self.action = np.atleast_1d(np.squeeze(action, axis=0)) # TODO: remove this hack
    return self.action

  def observe(self, rew, term, obs2, test=False):
    
    rew = self.reward(rew) # internal reward # TODO: outsource

    if not test:
      self.t = self.t + 1
      self.rm.enqueue(self.observation, term, self.action, rew)

      # save parameters etc.
      if (self.t+45000) % 50000 == 0: # TODO: correct
        s = self.saver.save(self.sess,"./tf/c",self.t)
        print("DDPG Checkpoint: " + s)


    self.observation = np.squeeze(obs2)  # current observation <- obs2
    return rew

  def train(self, logging=False):
    obs, act, rew, obs2, term2, info = self.rm.minibatch(size=self.mb_size)
    self._train(obs,act,rew,obs2,term2,log=logging,global_step=self.t)


  def reward(self,external_reward,logging=False):
    """ calculate internal reward """

    ra = - .1 * np.mean(np.square(self.action))
    rint = external_reward + ra

    if logging:
      self.write_scalar('reward/ext',external_reward)
      self.write_scalar('reward/a',ra)
      self.write_scalar('reward/rint',rint)

    return rint

  def write_scalar(self,tag,val):
    s = tf.Summary(value=[tf.Summary.Value(tag=tag,simple_value=val)])
    self.writer.add_summary(s,self.t)

  def __del__(self):
    self.sess.close()
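
A minimal interaction loop for the Agent above might look like the sketch below. The environment, episode count and warm-up threshold are assumptions for illustration; only the Agent methods (reset, act, observe, train) and the step counter t come from the class itself.

# Hypothetical usage sketch (not part of the original example), assuming an
# OpenAI Gym style environment with a continuous action space.
import gym

env = gym.make("Pendulum-v0")                        # assumed environment
agent = Agent(dimO=list(env.observation_space.shape),
              dimA=list(env.action_space.shape))

warmup = 1000                                        # assumed transitions collected before training
for episode in range(100):                           # made-up episode count
    agent.reset(env.reset())
    done = False
    while not done:
        action = agent.act()
        obs2, reward, done, _ = env.step(action)
        agent.observe(reward, done, obs2)
        if agent.t > warmup:
            agent.train()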