Example #1
def play(args):
    # Create environment
    env = gym.make(args.env)
    num_actions = env.action_space.n

    state_buf = StateBuffer(args)

    # Define input placeholders
    state_ph = tf.placeholder(
        tf.uint8,
        (None, args.frame_height, args.frame_width, args.frames_per_state))

    # Instantiate DQN network
    DQN = DeepQNetwork(num_actions, state_ph, scope='DQN_main')
    DQN_predict_op = DQN.predict()

    # Create session
    config = tf.ConfigProto(allow_soft_placement=True)
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)

    # Load ckpt file
    loader = tf.train.Saver()
    if args.ckpt_file is not None:
        ckpt = args.ckpt_dir + '/' + args.ckpt_file
    else:
        ckpt = tf.train.latest_checkpoint(args.ckpt_dir)

    loader.restore(sess, ckpt)
    print('%s restored.\n\n' % ckpt)

    for ep in range(0, args.num_eps):
        # Reset environment and state buffer for next episode
        reset_env_and_state_buffer(env, state_buf, args)
        step = 0
        ep_done = False
        initial_steps = np.random.randint(1, args.max_initial_random_steps + 1)

        while not ep_done:
            time.sleep(0.05)
            env.render()

            # Choose a random action for the initial steps so every episode has a random start point.
            # Then choose the action with the highest Q-value according to the network's current policy.
            if step < initial_steps:
                action = env.action_space.sample()
            else:
                state = np.expand_dims(state_buf.get_state(), 0)
                action = sess.run(DQN_predict_op, {state_ph: state})

            frame, _, ep_terminal, _ = env.step(action)
            frame = preprocess_image(frame, args.frame_width,
                                     args.frame_height)
            state_buf.add(frame)
            step += 1

            # Episode can finish either by reaching terminal state or max episode steps
            if ep_terminal or step == args.max_ep_length:
                ep_done = True
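
For context, here is a minimal sketch of the argparse namespace this play() function expects. The attribute names come from the code above, while the flag defaults and the environment name are illustrative assumptions (StateBuffer and reset_env_and_state_buffer may require additional attributes):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--env', default='BreakoutDeterministic-v4')  # assumed environment name
parser.add_argument('--frame_width', type=int, default=84)
parser.add_argument('--frame_height', type=int, default=84)
parser.add_argument('--frames_per_state', type=int, default=4)
parser.add_argument('--ckpt_dir', default='./ckpts')
parser.add_argument('--ckpt_file', default=None)
parser.add_argument('--num_eps', type=int, default=10)
parser.add_argument('--max_initial_random_steps', type=int, default=10)
parser.add_argument('--max_ep_length', type=int, default=10000)

play(parser.parse_args())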
Example #2
def test(args):
    # Create environment
    env = gym.make(args.env)
    num_actions = env.action_space.n

    # Set random seeds for reproducibility
    env.seed(args.random_seed)
    np.random.seed(args.random_seed)
    tf.set_random_seed(args.random_seed)

    # Initialise state buffer
    state_buf = StateBuffer(args)

    # Define input placeholders
    state_ph = tf.placeholder(
        tf.uint8,
        (None, args.frame_height, args.frame_width, args.frames_per_state))

    # Instantiate DQN network
    DQN = DeepQNetwork(num_actions, state_ph, scope='DQN_main')
    DQN_predict_op = DQN.predict()

    # Create session
    config = tf.ConfigProto(allow_soft_placement=True)
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)

    # Load ckpt file
    loader = tf.train.Saver()
    if args.ckpt_file is not None:
        ckpt = args.ckpt_dir + '/' + args.ckpt_file
    else:
        ckpt = tf.train.latest_checkpoint(args.ckpt_dir)

    loader.restore(sess, ckpt)
    sys.stdout.write('%s restored.\n\n' % ckpt)
    sys.stdout.flush()

    ckpt_split = ckpt.split('-')
    train_ep = int(ckpt_split[-1])  # training step encoded in the checkpoint name

    # Create summary writer to write summaries to disk
    if not os.path.exists(args.log_dir):
        os.makedirs(args.log_dir)
    summary_writer = tf.summary.FileWriter(args.log_dir, sess.graph)

    # Create summary op to save episode reward to Tensorboard log
    reward_var = tf.Variable(0.0, trainable=False)
    tf.summary.scalar("Average Test Reward", reward_var)
    summary_op = tf.summary.merge_all()

    ## Begin testing

    env.reset()
    rewards = []

    for test_ep in range(args.num_eps_test):
        # Reset environment and state buffer for next episode
        reset_env_and_state_buffer(env, state_buf, args)
        ep_reward = 0
        step = 0
        ep_done = False

        initial_steps = np.random.randint(1, args.max_initial_random_steps + 1)

        sys.stdout.write('\n')
        sys.stdout.flush()

        while not ep_done:
            if args.render:
                env.render()
            else:
                env.render(mode='rgb_array')

            # Choose a random action for the initial steps so every episode has a random start point.
            # Then choose the action with the highest Q-value according to the network's current policy.
            if step < initial_steps:
                test_action = env.action_space.sample()
            else:
                test_state = np.expand_dims(state_buf.get_state(), 0)
                test_action = sess.run(DQN_predict_op, {state_ph: test_state})

            test_frame, test_reward, test_ep_terminal, _ = env.step(
                test_action)

            test_frame = preprocess_image(test_frame, args.frame_width,
                                          args.frame_height)
            state_buf.add(test_frame)

            ep_reward += test_reward
            step += 1

            sys.stdout.write(
                '\x1b[2K\rTest episode {:d}/{:d} \t Steps = {:d} \t Reward = {:.2f}'
                .format(test_ep, args.num_eps_test, step, ep_reward))
            sys.stdout.flush()

            # Episode can finish either by reaching terminal state or max episode steps
            if test_ep_terminal or step == args.max_ep_length:
                rewards.append(ep_reward)
                ep_done = True

    mean_reward = np.mean(rewards)
    error_reward = ss.sem(rewards)

    sys.stdout.write(
        '\n\nTesting complete \t Average reward = {:.2f} +/- {:.2f} /ep \n\n'.
        format(mean_reward, error_reward))
    sys.stdout.flush()

    # Log average episode reward for Tensorboard visualisation
    summary_str = sess.run(summary_op, {reward_var: mean_reward})
    summary_writer.add_summary(summary_str, train_ep)

    # Write results to file
    if args.results_file is not None:
        if not os.path.exists(args.results_dir):
            os.makedirs(args.results_dir)
        output_file = open(args.results_dir + '/' + args.results_file, 'a')
        output_file.write(
            'Training Episode {}: \t Average reward = {:.2f} +/- {:.2f} /ep \n\n'
            .format(train_ep, mean_reward, error_reward))
        output_file.close()
        sys.stdout.write('Results saved to file \n\n')
        sys.stdout.flush()

    env.close()
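
Here ss is presumably scipy.stats (inferred from the sem call). A small standalone illustration of the mean and standard-error summary printed above, using made-up reward values:

import numpy as np
import scipy.stats as ss

rewards = [10.0, 12.0, 9.0, 11.0]   # hypothetical per-episode rewards
mean_reward = np.mean(rewards)      # 10.5
error_reward = ss.sem(rewards)      # standard error of the mean, ~0.65
print('Average reward = {:.2f} +/- {:.2f} /ep'.format(mean_reward, error_reward))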
Example #3
def play(args):
    generate_gifs = args.gifs
    save_images = args.save or args.gifs
    print('Saving step images: {}'.format(save_images))
    render_mode = 'rgb_array'
    scale_image = 16

    ACTION_SPACE = np.array([1, 2, 3, 4], dtype=np.uint8)
    # Function to get a random action
    def sample_action_space():
        return random.choice(ACTION_SPACE)

    # Function to convert actionID (1, 2, 3, 4) to actionQID (0, 1, 2, 3)
    def actionID_to_actionQID(actionID):
        return actionID-1

    # Function to convert actionQID (0, 1, 2, 3) to actionID (1, 2, 3, 4)
    def actionQID_to_actionID(actionQID):
        return actionQID+1

    # Create environment
    env = gym.make(args.env)
    num_actions = 4


    state_buf = StateBuffer(args)
    state_shape = (args.grid_height, args.grid_width, args.num_surfaces, args.grids_per_state)
    load_model_path = None

    # Create the image output directories and move into them if images are to be stored
    if save_images:
        try:
            os.makedirs('images', exist_ok=True)
            os.chdir('images')
            os.makedirs('steps', exist_ok=True)
            os.makedirs('gif', exist_ok=True)
        except OSError:
            print('Error: Creating image target directories.')


    if args.checkpoint_file is not None:    # Load the specified checkpoint
        load_model_path = os.path.join(args.checkpoint_dir, args.checkpoint_file)
        assert os.path.exists(load_model_path+'.index'), 'Path "{}" does not exist!'.format(load_model_path+'.index')

        start_step = args.checkpoint_file.split('/')[-1].split('-')[-1]
        assert len(start_step)>0, "Invalid checkpoint file for extracting start_step"
        start_step = int(start_step)
    else:   # No checkpoint file given
        # Fall back to the run-specific checkpoint directory
        args.checkpoint_dir = os.path.join(args.checkpoint_dir, args.log_filename.split('.')[0])
        start_step = 0

    # Create checkpoint directory
    if not os.path.exists(args.checkpoint_dir):
        os.makedirs(args.checkpoint_dir)

    DQN_target = DQNModel(state_shape, num_actions, load_model_path=load_model_path, name='DQN_target')

    for ep in range(0, args.num_eps):
        # Reset environment and state buffer for next episode
        reset_env_and_state_buffer(env, state_buf, args)
        step = 0
        ep_done = False
        initial_steps = np.random.randint(1, args.max_initial_random_steps+1)

        while not ep_done:
            time.sleep(0.05)
            img = env.render(mode=render_mode)
            plt.imshow(img)
            display.clear_output(wait=True)
            display.display(plt.gcf())
            # Choose a random action for the initial steps so every episode has a random start point.
            # Then choose the action with the highest Q-value according to the network's current policy.
            if step < initial_steps:
                actionID = sample_action_space()
            else:
                if random.random() < args.epsilon_value:   # Take random action
                    actionID = sample_action_space()
                    print("Random Action\n")
                else:   # Take greedy action
                    state = tf.convert_to_tensor(state_buf.get_state(), dtype=tf.float32)
                    state = state[tf.newaxis, ...]      # Add an axis for batch
                    actionQID = DQN_target.predict(state)
                    actionID = actionQID_to_actionID(int(actionQID))    # convert from Tensor to int
                    print("Greedy Action\n")

            observation, reward, terminal, _ = env.step(actionID, observation_mode='tiny_rgb_array')
            grid = preprocess_observation(args, observation)
            state_buf.add(grid)
            step += 1

            if save_images:
                img = Image.fromarray(np.array(env.render(render_mode, scale=scale_image)), 'RGB')
                img.save(os.path.join('steps', 'observation_{}_{}.png'.format(ep, step)))

            # Episode can finish either by reaching terminal state or max episode steps
            if terminal or step == args.max_ep_length:
                ep_done = True

            # Write the GIF only once the episode is done, from the step images saved above
            if generate_gifs and ep_done:
                print('')
                import imageio

                with imageio.get_writer(os.path.join('gif', 'episode_{}.gif'.format(ep)), mode='I', fps=1) as writer:
                    for t in range(args.max_ep_length):
                        try:
                            filename = os.path.join('steps', 'observation_{}_{}.png'.format(ep, t))
                            image = imageio.imread(filename)
                            writer.append_data(image)
                        except FileNotFoundError:
                            pass  # no frame was saved for this step index
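
As an aside, the per-episode GIF could also be assembled by a small helper that reads back only the frames that were actually written. This is a sketch assuming imageio v2 and the observation_{ep}_{step}.png naming used above; calling it once per finished episode keeps the GIF writing out of the inner step loop:

import os
import imageio

def write_episode_gif(ep, max_ep_length, gif_dir='gif', steps_dir='steps', fps=1):
    # Collect the step images that exist for this episode and write them as one GIF.
    frames = []
    for t in range(max_ep_length + 1):
        filename = os.path.join(steps_dir, 'observation_{}_{}.png'.format(ep, t))
        if os.path.exists(filename):
            frames.append(imageio.imread(filename))
    if frames:
        imageio.mimsave(os.path.join(gif_dir, 'episode_{}.gif'.format(ep)), frames, fps=fps)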
Example #4
def train(args):

    # Function to return exploration rate based on current step
    def exploration_rate(current_step, exp_rate_start, exp_rate_end,
                         exp_step_end):
        # Linearly anneal from exp_rate_start to exp_rate_end over exp_step_end steps
        if current_step < exp_step_end:
            exploration_rate = current_step * (
                (exp_rate_end - exp_rate_start) / float(exp_step_end)) + exp_rate_start
        else:
            exploration_rate = exp_rate_end

        return exploration_rate

    # Function to update target network parameters with main network parameters
    def update_target_network(from_scope, to_scope):
        from_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                      from_scope)
        to_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, to_scope)

        op_holder = []

        # Update old network parameters with new network parameters
        for from_var, to_var in zip(from_vars, to_vars):
            op_holder.append(to_var.assign(from_var))

        return op_holder

    # Create environment
    env = gym.make(args.env)
    num_actions = env.action_space.n

    # Initialise replay memory and state buffer
    replay_mem = ReplayMemory(args)
    state_buf = StateBuffer(args)

    # Define input placeholders
    state_ph = tf.placeholder(
        tf.uint8,
        (None, args.frame_height, args.frame_width, args.frames_per_state))
    action_ph = tf.placeholder(tf.int32, (None,))
    target_ph = tf.placeholder(tf.float32, (None,))

    # Instantiate DQN network
    DQN = DeepQNetwork(
        num_actions,
        state_ph,
        action_ph,
        target_ph,
        args.learning_rate,
        scope='DQN_main'
    )  # Note: One scope cannot be the prefix of another scope (e.g. cannot name this scope 'DQN' and
    # target network scope 'DQN_target', as a search for vars in 'DQN' scope will return both networks' vars)
    DQN_predict_op = DQN.predict()
    DQN_train_step_op = DQN.train_step()

    # Instantiate DQN target network
    DQN_target = DeepQNetwork(num_actions, state_ph, scope='DQN_target')

    update_target_op = update_target_network('DQN_main', 'DQN_target')

    # Create session
    config = tf.ConfigProto(allow_soft_placement=True)
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)

    # Add summaries for Tensorboard visualisation
    tf.summary.scalar('Loss', DQN.loss)
    reward_var = tf.Variable(0.0, trainable=False)
    tf.summary.scalar("Episode Reward", reward_var)
    epsilon_var = tf.Variable(args.epsilon_start, trainable=False)
    tf.summary.scalar("Exploration Rate", epsilon_var)
    summary_op = tf.summary.merge_all()

    # Define saver for saving model ckpts
    model_name = 'model.ckpt'
    checkpoint_path = os.path.join(args.ckpt_dir, model_name)
    if not os.path.exists(args.ckpt_dir):
        os.makedirs(args.ckpt_dir)
    saver = tf.train.Saver(max_to_keep=201)

    # Create summary writer to write summaries to disk
    if not os.path.exists(args.log_dir):
        os.makedirs(args.log_dir)
    summary_writer = tf.summary.FileWriter(args.log_dir, sess.graph)

    # Load ckpt file if given
    if args.ckpt_file is not None:
        loader = tf.train.Saver()  # Restore all variables from the ckpt
        ckpt = args.ckpt_dir + '/' + args.ckpt_file
        ckpt_split = ckpt.split('-')
        step_str = ckpt_split[-1]
        start_step = int(step_str)
        loader.restore(sess, ckpt)
    else:
        start_step = 0
        sess.run(tf.global_variables_initializer())
        sess.run(update_target_op)

    ## Begin training

    env.reset()

    ep_steps = 0
    episode_reward = 0
    episode_rewards = []
    duration_values = []

    # Initially populate replay memory by taking random actions
    sys.stdout.write('\nPopulating replay memory with random actions...\n')
    sys.stdout.flush()

    for random_step in range(1, args.initial_replay_mem_size + 1):

        if args.render:
            env.render()
        else:
            env.render(mode='rgb_array')

        action = env.action_space.sample()
        frame, reward, terminal, _ = env.step(action)
        frame = preprocess_image(frame, args.frame_width, args.frame_height)
        replay_mem.add(action, reward, frame, terminal)

        if terminal:
            env.reset()

        sys.stdout.write('\x1b[2K\rStep {:d}/{:d}'.format(
            random_step, args.initial_replay_mem_size))
        sys.stdout.flush()

    # Begin training process
    reset_env_and_state_buffer(env, state_buf, args)
    sys.stdout.write('\n\nTraining...\n\n')
    sys.stdout.flush()

    for train_step in range(start_step + 1, args.num_steps_train + 1):
        start_time = time.time()
        # Run 'train_frequency' iterations in the game for every training step
        for _ in range(0, args.train_frequency):
            ep_steps += 1

            if args.render:
                env.render()
            else:
                env.render(mode='rgb_array')

            # Use an epsilon-greedy policy to select action
            epsilon = exploration_rate(train_step, args.epsilon_start,
                                       args.epsilon_end, args.epsilon_step_end)
            if random.random() < epsilon:
                # Choose a random action
                action = env.action_space.sample()
            else:
                # Choose the action with the highest Q-value according to the network's current policy
                current_state = np.expand_dims(state_buf.get_state(), 0)
                action = sess.run(DQN_predict_op, {state_ph: current_state})

            # Take action and store experience
            frame, reward, terminal, _ = env.step(action)
            frame = preprocess_image(frame, args.frame_width,
                                     args.frame_height)
            state_buf.add(frame)
            replay_mem.add(action, reward, frame, terminal)
            episode_reward += reward

            if terminal or ep_steps == args.max_ep_steps:
                # Collect total reward of episode
                episode_rewards.append(episode_reward)
                # Reset episode reward and episode steps counters
                episode_reward = 0
                ep_steps = 0
                # Reset environment and state buffer for next episode
                reset_env_and_state_buffer(env, state_buf, args)

        ## Training step
        # Get minibatch from replay mem
        states_batch, actions_batch, rewards_batch, next_states_batch, terminals_batch = replay_mem.getMinibatch()
        # Calculate target by passing next states through the target network and finding max future Q
        future_Q = sess.run(DQN_target.output, {state_ph: next_states_batch})
        max_future_Q = np.max(future_Q, axis=1)
        # Q-values of the terminal states are 0 by definition
        max_future_Q[terminals_batch] = 0
        targets = rewards_batch + (max_future_Q * args.discount_rate)

        # Execute training step
        if train_step % args.save_log_step == 0:
            # Train and save logs
            average_reward = sum(episode_rewards) / len(episode_rewards)
            summary_str, _ = sess.run(
                [summary_op, DQN_train_step_op], {
                    state_ph: states_batch,
                    action_ph: actions_batch,
                    target_ph: targets,
                    reward_var: average_reward,
                    epsilon_var: epsilon
                })
            summary_writer.add_summary(summary_str, train_step)
            # Reset rewards buffer
            episode_rewards = []
        else:
            # Just train
            _ = sess.run(
                DQN_train_step_op, {
                    state_ph: states_batch,
                    action_ph: actions_batch,
                    target_ph: targets
                })

        # Update target networks
        if train_step % args.update_target_step == 0:
            sess.run(update_target_op)

        # Calculate time per step and display progress to console
        duration = time.time() - start_time
        duration_values.append(duration)
        ave_duration = sum(duration_values) / float(len(duration_values))

        sys.stdout.write('\x1b[2K\rStep {:d}/{:d} \t ({:.3f} s/step)'.format(
            train_step, args.num_steps_train, ave_duration))
        sys.stdout.flush()

        # Save checkpoint
        if train_step % args.save_ckpt_step == 0:
            saver.save(sess, checkpoint_path, global_step=train_step)
            sys.stdout.write('\n Checkpoint saved\n')
            sys.stdout.flush()

            # Reset time calculation
            duration_values = []
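
The target computation in the training step above is the standard one-step Q-learning target y = r + discount_rate * max_a' Q_target(s', a'), with terminal next states masked to zero. A self-contained numpy illustration with made-up numbers:

import numpy as np

# Toy minibatch: two transitions, the second one ends the episode.
rewards_batch = np.array([1.0, -1.0])
terminals_batch = np.array([False, True])
future_Q = np.array([[0.5, 2.0, 1.0],    # Q_target(s', a) for each action
                     [0.3, 0.1, 0.4]])
discount_rate = 0.99

max_future_Q = np.max(future_Q, axis=1)  # greedy value of each next state
max_future_Q[terminals_batch] = 0        # no future value after a terminal state
targets = rewards_batch + discount_rate * max_future_Q
print(targets)                           # [2.98, -1.0]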
Example #5
def train(args):
    ACTION_SPACE = np.array([1, 2, 3, 4], dtype=np.uint8)
    # Function to get a random actionID
    def sample_action_space():
        return random.choice(ACTION_SPACE)

    # Function to convert actionID (1, 2, 3, 4) to actionQID (0, 1, 2, 3)
    def actionID_to_actionQID(actionID):
        return actionID-1

    # Function to convert actionQID (0, 1, 2, 3) to actionID (1, 2, 3, 4)
    def actionQID_to_actionID(actionQID):
        return actionQID+1

    # Function to return epsilon based on current step
    def get_epsilon(current_step, epsilon_start, epsilon_end, epsilon_decay_step):
        if current_step < epsilon_decay_step:
            return epsilon_start + (epsilon_end - epsilon_start) / float(epsilon_decay_step) * current_step
        else:
            return epsilon_end

    # Get logger for training
    logger = logging.getLogger('train')

    # Check if GPU is available
    logger.info("Num GPUs Available: %d", len(tf.config.experimental.list_physical_devices('GPU')))

    # Create environment
    env = gym.make(args.env)
    num_actions = 4     # Push (up, down, left, right): 1, 2, 3, 4
    env.unwrapped.set_maxsteps(args.max_step)
    env.unwrapped.set_rewards([
        args.env_penalty_for_step,
        args.env_reward_box_on_target,
        args.env_penalty_box_off_target,
        args.env_reward_finished])

    # Set random seeds for reproducibility
    random.seed(args.random_seed)
    env.seed(args.random_seed)
    np.random.seed(args.random_seed)
    tf.random.set_seed(args.random_seed)

    # Initialize replay memory and state buffer
    replay_mem = ReplayMemory(args)
    state_buf = StateBuffer(args)

    # Check if resume from training
    load_model_path = None
    if args.checkpoint_file is not None:    # Resume training
        load_model_path = os.path.join(args.checkpoint_dir, args.checkpoint_file)
        assert os.path.exists(load_model_path+'.index'), 'Path "{}" does not exist!'.format(load_model_path+'.index')

        start_step = args.checkpoint_file.split('/')[-1].split('-')[-1]
        assert len(start_step)>0, "Invalid checkpoint file for extracting start_step"
        start_step = int(start_step)
    else:   # Train from scratch
        # Create another directory for this training
        args.checkpoint_dir = os.path.join(args.checkpoint_dir, args.log_filename.split('.')[0])
        start_step = 0

    # Create checkpoint directory
    if not os.path.exists(args.checkpoint_dir):
        os.makedirs(args.checkpoint_dir)

    # Instantiate DQN and DQN_target
    state_shape = (args.grid_height, args.grid_width, args.num_surfaces, args.grids_per_state)
    DQN = DQNModel(state_shape, num_actions, args.learning_rate, load_model_path=load_model_path, name='DQN')
    DQN_target = DQNModel(state_shape, num_actions, load_model_path=load_model_path, name='DQN_target')

    ## Begin training
    env.reset()

    # Populate replay memory to initial_replay_mem_size
    logger.info("Populating replay memory with random actions...")

    for si in range(args.initial_replay_mem_size):
        if args.render:
            env.render(mode='human')
        else:
            env.render(mode='tiny_rgb_array')

        actionID = sample_action_space()
        observation, reward, terminal, _ = env.step(actionID, observation_mode='tiny_rgb_array')
        grid = preprocess_observation(args, observation)
        replay_mem.add(actionID, reward, grid, terminal)

        if terminal:
            env.reset()

        sys.stdout.write('\x1b[2K\rStep {:d}/{:d}'.format(si+1, args.initial_replay_mem_size))
        sys.stdout.flush()

    # Start training
    reward_one_episode = 0
    reward_episodes = []
    step_one_episode = 0
    step_episodes = []
    Qval_steps = []
    duration_steps = []

    # Create tf summary writer to write summaries to disk
    # ./logs/train/20200318_120026
    tf_train_log_dir = os.path.join(args.log_dir.replace('train', 'tf_train'), args.log_filename.split('.')[0])
    if not os.path.exists(tf_train_log_dir):
        os.makedirs(tf_train_log_dir)
    train_summary_writer = tf.summary.create_file_writer(tf_train_log_dir)
    train_summary_writer.set_as_default()
    if args.save_tb_trace:
        # Model graphs
        tf.summary.trace_on(graph=True, profiler=True)

    reset_env_and_state_buffer(env, state_buf, args)
    logger.info("Start training...")
    for si in range(start_step+1, args.num_steps_train+1):
        start_time = time.time()

        ## Playing Step
        # Perform a step
        if args.render:
            env.render(mode='human')
        else:
            env.render(mode='tiny_rgb_array')

        # Select a random action based on epsilon-greedy algorithm
        epsilon = get_epsilon(si, args.epsilon_start, args.epsilon_end, args.epsilon_decay_step)
        if random.random() < epsilon:   # Take random action
            actionID = sample_action_space()
        else:   # Take greedy action
            state = tf.convert_to_tensor(state_buf.get_state(), dtype=tf.float32)
            state = state[tf.newaxis, ...]      # Add an axis for batch
            actionQID = DQN.predict(state)
            actionID = actionQID_to_actionID(int(actionQID))    # convert from Tensor to int

        # Take the action and store state transition
        observation, reward, terminal, _ = env.step(actionID, observation_mode='tiny_rgb_array')
        grid = preprocess_observation(args, observation)
        state_buf.add(grid)
        replay_mem.add(actionID, reward, grid, terminal)
        # Accumulate reward and increment step
        reward_one_episode += reward
        step_one_episode += 1

        if terminal:
            # Save the accumulated reward for this episode
            reward_episodes.append(reward_one_episode)
            reward_one_episode = 0
            # Save the number of steps for this episode
            step_episodes.append(step_one_episode)
            step_one_episode = 0
            # Reset environment and state buffer
            reset_env_and_state_buffer(env, state_buf, args)

        ## Training Step
        # Sample a random minibatch of transitions from ReplayMemory
        states_batch, actionID_batch, rewards_batch, next_states_batch, terminals_batch = replay_mem.getMinibatch()
        actionQID_batch = actionID_to_actionQID(actionID_batch)
        # Infer DQN_target for Q(S', A)
        next_states_batch = tf.convert_to_tensor(next_states_batch, dtype=tf.float32)
        next_states_Qvals = DQN_target.infer(next_states_batch)
        max_next_states_Qvals = tf.math.reduce_max(next_states_Qvals, axis=1, name='maxQ')
        assert max_next_states_Qvals.shape == (args.batch_size,), "Wrong dimension for predicted next state Q vals"
        # Zero out Q(S', A) for terminal next states S'
        max_next_states_Qvals = tf.math.multiply(max_next_states_Qvals, np.invert(terminals_batch), name='remove_terminals')
        # Save average maximum predicted Q values
        Qval_steps.append(np.mean(max_next_states_Qvals[max_next_states_Qvals != 0]))
        # Calculate the target Q values
        targetQs = rewards_batch + args.discount_rate * max_next_states_Qvals

        # Pass to DQN
        states_batch = tf.cast(states_batch, tf.float32)
        targetQs = tf.cast(targetQs, tf.float32)
        DQN.train_step(states_batch, actionQID_batch, targetQs)

        # Update DQN_target every args.update_target_step steps
        if si % args.update_target_step == 0:
            update_save_path = os.path.join(args.checkpoint_dir, 'DQN_Update')
            DQN.save_model(update_save_path)
            DQN_target.load_model(update_save_path)

        duration = time.time() - start_time
        duration_steps.append(duration)

        # Save log
        if si % args.save_log_step == 0:
            avg_training_loss = DQN.get_training_loss()

            logger.info("{Training Step: %d/%d}", si, args.num_steps_train)
            logger.info("Number of Episodes: %d", len(reward_episodes))
            logger.info("Recent Step Exploration Rate: %.5f", epsilon)
            logger.info("Average Per-Episode Reward: %.5f", sum(reward_episodes)/float(len(reward_episodes)))
            logger.info("Average Per-Episode Step: %.3f", sum(step_episodes)/float(len(step_episodes)))
            logger.info("Average Per-Step Maximum Predicted Q Value: %.8f", sum(Qval_steps)/float(len(Qval_steps)))
            logger.info("Average Per-Step Training Loss: %.8f", avg_training_loss)
            logger.info("Average Per-Step Training Time: %.5f second", sum(duration_steps)/float(len(duration_steps)))

            tf.summary.scalar('Episodes', len(reward_episodes), step=si, description='Number of Episodes')
            tf.summary.scalar('epsilon', epsilon, step=si, description='Recent Step Exploration Rate')
            tf.summary.scalar('avgReward', sum(reward_episodes)/float(len(reward_episodes)), step=si, description='Average Per-Episode Reward')
            tf.summary.scalar('avgStep', sum(step_episodes)/float(len(step_episodes)), step=si, description='Average Per-Episode Step Count')
            tf.summary.scalar('avgQval', sum(Qval_steps)/float(len(Qval_steps)), step=si, description='Average Per-Step Maximum Predicted Q Value')
            tf.summary.scalar('avgTrainLoss', avg_training_loss, step=si, description='Average Per-Step Training Loss')
            tf.summary.scalar('avgTrainTime', sum(duration_steps)/float(len(duration_steps)), step=si, description='Average Per-Step Training Time')

            if args.save_tb_trace:
                # Save computation graph
                tf.summary.trace_export(name="model_trace", step=si, profiler_outdir=tf_train_log_dir)

            # Reset the parameters
            reward_episodes = []
            step_episodes = []
            duration_steps = []
            Qval_steps = []

        # Save checkpoint
        if si % args.save_checkpoint_step == 0:
            save_checkpoint_path = os.path.join(args.checkpoint_dir, 'DQN_Train')
            DQN.save_model(save_checkpoint_path, ckpt_number=si)
            # Duplicate the current logfile
            src_log_filepath = os.path.join(args.log_dir, args.log_filename)
            dst_log_filepath = os.path.join(args.checkpoint_dir, 'DQN_Train_{}.log'.format(si))
            shutil.copyfile(src_log_filepath, dst_log_filepath)

    # Training finished
    logger.info("Finished training...")
    # Save trained network
    save_final_network_path = os.path.join(args.checkpoint_dir, 'DQN_Trained')
    DQN.save_model(save_final_network_path, ckpt_number=args.num_steps_train)
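
The example above keeps DQN_target in sync with DQN by saving the online model to the checkpoint directory and loading it back. If DQNModel wraps a tf.keras.Model (an assumption, since DQNModel is project-specific), the same update can be done in memory, at the cost of not leaving a DQN_Update checkpoint behind:

import tensorflow as tf

def sync_target_network(online_model: tf.keras.Model, target_model: tf.keras.Model):
    # Copy the online network's weights directly into the target network.
    target_model.set_weights(online_model.get_weights())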
Example #6
def test(args):
    ACTION_SPACE = np.array([1, 2, 3, 4], dtype=np.uint8)

    # Function to get a random action
    def sample_action_space():
        return random.choice(ACTION_SPACE)

    # Function to convert actionID (1, 2, 3, 4) to actionQID (0, 1, 2, 3)
    def actionID_to_actionQID(actionID):
        return actionID - 1

    # Function to convert actionQID (0, 1, 2, 3) to actionID (1, 2, 3, 4)
    def actionQID_to_actionID(actionQID):
        return actionQID + 1

    def get_actionID(step, initial_steps, epsilon):
        # Choose a random action for the initial steps so every episode has a random start point.
        # Then choose the action with the highest Q-value according to the network's current policy.
        if step < initial_steps:
            actionID = sample_action_space()
        else:
            if random.random() < epsilon:  # Take random action
                actionID = sample_action_space()
            else:  # Take greedy action
                state = tf.convert_to_tensor(state_buf.get_state(),
                                             dtype=tf.float32)
                state = state[tf.newaxis, ...]  # Add an axis for batch
                actionQID = DQN_target.predict(state)
                actionID = actionQID_to_actionID(
                    int(actionQID))  # convert from Tensor to int
        return actionID

    # Create environment
    env = gym.make(args.env)

    # Set random seeds for reproducibility
    env.seed(args.random_seed)
    np.random.seed(args.random_seed)
    tf.random.set_seed(args.random_seed)

    state_buf = StateBuffer(args)
    state_shape = (args.grid_height, args.grid_width, args.num_surfaces,
                   args.grids_per_state)
    num_actions = 4

    epsilons = [0.1]  # [0.9, 0.5, 0.28, 0.2, 0.15, 0.1, 0.05]
    checkpoint_paths = get_checkpoint_paths(args)
    num_checkpoints = len(checkpoint_paths)

    out_file = open(args.results_file, "a+")
    out_file.write(
        "pathStr, epsilon, mean_reward, error_reward, mean_step, ons, offs, wins\n\r"
    )

    for epsilon in epsilons:
        for cp_id in range(0, num_checkpoints):
            path = checkpoint_paths[cp_id]

            out_str = "Starting Checkpoint test: {} \t {}/{} \t Epsilon: {}\n\r".format(
                path, cp_id + 1, num_checkpoints, epsilon)
            #output(out_str, out_file)
            print(out_str)

            #if args.checkpoint_list is not None:
            DQN_target = DQNModel(state_shape,
                                  num_actions,
                                  load_model_path=path,
                                  name='DQN_target')

            #Begin Testing
            rewards = []
            step_totals = []
            cp_totals = Counts()
            for ep in range(0, args.num_eps):

                # Reset environment and state buffer for next episode
                reset_env_and_state_buffer(env, state_buf, args)
                ep_reward = 0
                ep_totals = Counts()

                step = 0
                ep_done = False
                initial_steps = np.random.randint(
                    1, args.max_initial_random_steps + 1)

                while not ep_done:
                    if args.render:
                        env.render()
                    else:
                        env.render(mode='tiny_rgb_array')

                    actionID = get_actionID(step, initial_steps, epsilon)

                    observation, reward, terminal, _ = env.step(
                        actionID, observation_mode='tiny_rgb_array')

                    grid = preprocess_observation(args, observation)
                    state_buf.add(grid)

                    step += 1
                    ep_reward += reward
                    ep_totals.reward_update(reward)

                    # Episode can finish either by reaching terminal state or max episode steps
                    if terminal or step == args.max_step:
                        cp_totals.update_all(ep_totals)
                        step_totals.append(step)

                        out_str = 'Test ep {:d}/{:d} \t Steps = {:d} \t Reward = {:.2f} \t\n\r'.format(
                            ep + 1, args.num_eps, step, ep_reward)
                        #output(out_str, out_file)
                        print(out_str)

                        out_str = ep_totals.get_str()
                        #output(out_str, out_file)
                        print(out_str)

                        rewards.append(ep_reward)
                        ep_done = True

            mean_step = np.mean(step_totals)
            mean_reward = np.mean(rewards)
            error_reward = ss.sem(rewards)

            if not path:
                pathStr = "Beginning"
            else:
                pathStr = path

            out_str = pathStr + ' Checkpoint Testing complete \n\r'
            #output(out_str, out_file)
            print(out_str)

            out_str = 'Average reward = {:.2f} +/- {:.2f} /ep\t Average steps: {}\n\r'.format(
                mean_reward, error_reward, mean_step)
            #output(out_str, out_file)
            print(out_str)

            out_str = 'Totals: ' + cp_totals.get_str() + '\tEpsilon: ' + str(
                epsilon) + '\n\r\n\r'
            print(out_str)
            #output(out_str, out_file)

            out_str = '{},{},{:.2f},{:.2f},{:.2f},{},{},{},\n\r'.format(
                pathStr, epsilon, mean_reward, error_reward, mean_step,
                cp_totals.on, cp_totals.off, cp_totals.win)
            output(out_str, out_file)
    out_file.close()
    env.close()