Example #1
    def _before_sim_loop(self):
        n_state = self._env.observation_space.shape[0]
        n_action = self._env.action_space.n
        self._algo = DDQN(n_state, n_action, self._algo_params)
        self._algo.update_net()
        self._score = 0.0
        self._score_sum = 0.0
Example #2
class GameManager():
    def __init__(self):
        # Init game state
        self.episode = 0.0
        self.win_counter = 0.0

        self.state = CardGameState(self)
        self.brain = DDQN()

        self.episode_reward = 0
        self.game_history = list()

    def update(self, dt):
        pass

    def auto_play(self):
        while self.episode < MAX_EPISODES:
            action = self.brain.get_action(self.state)

            action_to_store = np.zeros(3)
            action_to_store[action] = 1

            self.state.process(action)
            # receive game result
            reward = self.state.reward
            done = self.state.terminal

            self.episode_reward += reward

            self.brain.train(self.state, self.state.s_t, action_to_store,
                             reward, self.state.s_t1, done)

            self.state.t += 1

            self.state.update()

            if done:
                self.episode += 1
                win_rate = 0.0

                if self.episode_reward == 1:
                    self.game_history.append(1)
                else:
                    self.game_history.append(0)

                if len(self.game_history) < GAME_HISTORY_SIZE:
                    win_rate = np.sum(self.game_history) / float(
                        len(self.game_history)) * 100.0
                else:
                    self.game_history.pop(0)
                    win_rate = np.sum(
                        self.game_history) / GAME_HISTORY_SIZE * 100.0

                print("Episode {} | Win Rate = {}".format(
                    self.episode, win_rate))

                self.brain.write_summary(win_rate, self.episode)

                self.episode_reward = 0
                self.state.reset()
Example #3
class Tester():
    def __init__(self, render_flag):
        self.model = DDQN(36, 36)
        self.render_flag = render_flag
        self.width = 6
        self.height = 6
        self.env = MineSweeper(self.width, self.height, 6)
        if (self.render_flag):
            self.renderer = Render(self.env.state)
        self.load_models(20000)

    def get_action(self, state):
        state = state.flatten()
        mask = (1 - self.env.fog).flatten()
        action = self.model.act(state, mask)
        return action

    def load_models(self, number):
        path = "pre-trained\ddqn_dnn" + str(number) + ".pth"
        dict = torch.load(path)
        self.model.load_state_dict(dict['current_state_dict'])
        self.model.epsilon = 0

    def do_step(self, action):
        i = int(action / self.width)
        j = action % self.width

        if (self.render_flag):
            self.renderer.state = self.env.state
            self.renderer.draw()
            self.renderer.bugfix()
        next_state, terminal, reward = self.env.choose(i, j)
        return next_state, terminal, reward
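
A minimal driver loop for the Tester above might look like the sketch below; the episode loop itself is not part of the original snippet, and it only assumes the interface already shown (Tester(render_flag), get_action, do_step, env.state).

# Hedged sketch, not from the original project: play one episode with the Tester shown above.
if __name__ == "__main__":
    tester = Tester(render_flag=False)
    state = tester.env.state          # initial board state exposed by the environment
    done = False
    total_reward = 0.0
    while not done:
        action = tester.get_action(state)   # the action mask is derived from env.fog inside get_action
        state, done, reward = tester.do_step(action)
        total_reward += reward
    print("episode reward:", total_reward)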
Example #4
    def __init__(self,width,height,bomb_no,render_flag):

        self.width = width
        self.height = height
        self.bomb_no = bomb_no
        self.box_count = width*height
        self.env = MineSweeper(self.width,self.height,self.bomb_no)
        self.current_model = DDQN(self.box_count,self.box_count)
        self.target_model = DDQN(self.box_count,self.box_count)
        self.target_model.eval()
        self.optimizer = torch.optim.Adam(self.current_model.parameters(),lr=0.003,weight_decay=1e-5)
        self.scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer,step_size=2000,gamma=0.95)
        self.target_model.load_state_dict(self.current_model.state_dict())
        self.buffer = Buffer(100000)
        self.gamma = 0.99
        self.render_flag = render_flag
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.90
        self.reward_threshold = 0.12
        self.reward_step = 0.01
        self.batch_size = 4096
        self.tau = 5e-5
        self.log = open("./Logs/ddqn_log.txt",'w')

        if(self.render_flag):
            self.Render = Render(self.env.state)
Example #5
    def __init__(self, render_flag):
        self.model = DDQN(36, 36)
        self.render_flag = render_flag
        self.width = 6
        self.height = 6
        self.env = MineSweeper(self.width, self.height, 6)
        if (self.render_flag):
            self.renderer = Render(self.env.state)
        self.load_models(20000)
Example #6
    def __init__(self):
        # Init game state
        self.episode = 0.0
        self.win_counter = 0.0

        self.state = CardGameState(self)
        self.brain = DDQN()

        self.episode_reward = 0
        self.game_history = list()
Example #7
class DDQNRunner(Runner):
    def __init__(self, env_name, algo_params, runner_params):
        super(DDQNRunner, self).__init__(env_name, 'DDQN', algo_params, runner_params)

    def _before_sim_loop(self):
        n_state = self._env.observation_space.shape[0]
        n_action = self._env.action_space.n
        self._algo = DDQN(n_state, n_action, self._algo_params)
        self._algo.update_net()
        self._score = 0.0
        self._score_sum = 0.0

    def _episode_sim(self, n_epi):
        s = self._env.reset()
        done = False
        self._score = 0.0
        n_step = 0

        if self._train:
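            # Linear epsilon annealing: decay from start_epsilon toward a 0.01 floor as episodes accumulate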
            self._algo.epsilon = max(0.01, self._algo.start_epsilon - 0.01*(n_epi/200))
        else:
            self._algo.epsilon = 0.0

        while not done:
            a = self._algo.sample_action(torch.from_numpy(s).float())
            s_prime, r, done, info = self._step_wrapper(self._env.step(a))
            
            if self._train:
                self._algo.append_data((s,a,r/self._reward_scale,s_prime, done))
            if self._save_step_log:
                self._write_step_log(n_step, n_epi, s, a, r, done)

            s = s_prime
            self._score += r
            n_step += 1
        
        self._score_sum += self._score 

    def _after_sim(self, n_epi, print_log, cond_check):
        super()._after_sim(n_epi, print_log, cond_check)
        
        if not self._done and self._train:
            if self._algo.buffer_size() > self._algo.n_train_start:
                self._algo.train_net()
            if n_epi % self._algo.update_interval==0:
                self._algo.update_net()
                
    def _print_log(self, n_epi, avg_score):
        super()._print_log(n_epi, avg_score)
        print(f"n_buffer : {self._algo.buffer_size()}, "\
                    + f"eps : {self._algo.epsilon*100:.1f}%")
Example #8
def main():
    value_function = Sequential(
            Linear(in_features=4, out_features=128),
            ReLU(),
            Linear(in_features=128, out_features=128),
            ReLU(),
            Linear(in_features=128, out_features=32),
            ReLU(),
            Linear(in_features=32, out_features=2)
        ).to(torch.device("cuda:0"))

    optimizer = RMSprop(params=value_function.parameters(), alpha=0.95, lr=0.0001)

    agent = DDQN(
        value_function=value_function,
        optimizer=optimizer,
        lr_scheduler=LambdaLR(optimizer=optimizer, lr_lambda=lambda e: max(0.9999 ** e, 0.1)),
        gamma=0.95,
        epsilon_fn=lambda x: 0.9999**x,
        replay_buffer_size=10000,
        replay_batch_size=128,
        start_training_at=1024,
        unfreeze_freq=64,
        device=torch.device("cuda:0"),
        verbose=True
    )

    run_ddqn(agent, render=True)
Example #9
    def _initialise_ddqn(self):
        """ Initialise the DDQN

        :return: None
        """
        self._ddqn2 = DDQN(state_size=len(self._columns),
                           action_size=len(self._columns),
                           seed=0,
                           technique=self._rl_technique)
Example #10
    def __init__(self):
        self.last_action = Action()
        self.time_step = 0
        self.total_time_step = 0
        self.episode_step = 0
        self.populating_phase = False

        self.model_save_interval = 30

        # Switch learning phase / evaluation phase
        self.policy_frozen = False

        self.ddqn = DDQN()
        self.state = np.zeros(
            (config.rl_agent_history_length, config.ale_screen_channels,
             config.ale_scaled_screen_size[1],
             config.ale_scaled_screen_size[0]),
            dtype=np.float32)
        self.exploration_rate = self.ddqn.exploration_rate
        self.exploration_rate_for_evaluation = 0.05
        self.last_observed_screen = None
Example #11
def main():
    if len(sys.argv) != 2:
        print('usage: python ' + sys.argv[0] + ' [weights_path]')
        exit(0)

    weights_path = sys.argv[1]

    env = gym.make(GYM)
    env = gym.wrappers.Monitor(env, "./video", force=True)
    input_shape = env.observation_space.shape[0]
    output_shape = env.action_space.n
    print('environment: in: ({}) out: ({})'.format(input_shape, output_shape))

    ddqn = DDQN(input_shape, output_shape)
    if os.path.exists(weights_path):
        ddqn.load_weights(weights_path)

    state = env.reset()
    state = np.expand_dims(state, 0)
    tot_reward = 0
    for _ in range(1000):
        env.render()

        q_values = ddqn.predict(state)
        action = np.argmax(q_values)

        next_state, reward, done, info = env.step(action)
        next_state = np.expand_dims(next_state, 0)
        state = next_state
        tot_reward += reward

        if done:
            break

    env.close()
    print('total reward: {}'.format(tot_reward))
Example #12
def run_ddqn(agent: DDQN, render: bool = True):
    env = gym.make("CartPole-v1")
    draw = env.render if render else lambda: ...

    # Train forever.
    while True:
        next_state = env.reset()
        reward = 0
        done = False
        while True:
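            # train_step receives the reward/done produced by the previous action and returns the next action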
            action = agent.train_step(state=next_state, reward=reward, episode_ended=done)
            if done:
                break
            next_state, reward, done, info = env.step(action)
            draw()
Example #13
	def __init__(self):
		self.last_action = Action()
		self.time_step = 0
		self.total_time_step = 0
		self.episode_step = 0
		self.populating_phase = False

		self.model_save_interval = 30

		# Switch learning phase / evaluation phase
		self.policy_frozen = False

		self.ddqn = DDQN()
		self.state = np.zeros((config.rl_agent_history_length, config.ale_screen_channels, config.ale_scaled_screen_size[1], config.ale_scaled_screen_size[0]), dtype=np.float32)
		self.exploration_rate = self.ddqn.exploration_rate
		self.exploration_rate_for_evaluation = 0.05
		self.last_observed_screen = None
Example #14
def main():
    env = gym.make("VideoPinball-ram-v0")
    state_size = env.observation_space.shape[0]
    num_actions = env.action_space.n

    # Initialize model
    dqn_model = DQN(state_size, num_actions)
    ddqn_model = DDQN(state_size, num_actions)

    # TODO:
    # 1) Train your model for 650 episodes, passing in the environment and the agent.
    # 2) Append the total reward of the episode into a list keeping track of all of the rewards.
    # 3) After training, print the average of the last 50 rewards you've collected.

    dqn_rwds = []
    ddqn_rwds = []

    print('start train')

    num_games = 100
    for i in range(num_games):
        if i % 10 == 0:
            print('step:', i)
        ddqn_rwd = generate_trajectory(env, ddqn_model)
        ddqn_rwds.append(ddqn_rwd)
        dqn_rwd = generate_trajectory(env, dqn_model)
        dqn_rwds.append(dqn_rwd)

    env.close()

    print("DQN rewards")
    print(dqn_rwds)
    print("DDQN Rewards")
    print(ddqn_rwds)

    # TODO: Visualize your rewards.
    visualize_data(np.array(dqn_rwds), np.array(ddqn_rwds))
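
The TODO block in Example #14 asks for a training phase before evaluation; a hedged sketch of that loop is shown below. It assumes a hypothetical train(env, model) helper that plays one episode, updates the agent, and returns the episode's total reward (no such helper appears in the original snippet).

# Hedged sketch of the TODO above; train(env, model) is a hypothetical one-episode training helper.
ddqn_train_rwds = []
for episode in range(650):
    ddqn_train_rwds.append(train(env, ddqn_model))
# (the same loop could be repeated for dqn_model)
print("avg of last 50 training rewards:", np.mean(ddqn_train_rwds[-50:]))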
Example #15
def main():
    if len(sys.argv) != 3:
        print('usage: python ' + sys.argv[0] + ' epochs [weights_path]')
        exit(0)

    epochs = int(sys.argv[1])
    weights_path = sys.argv[2]

    env = gym.make(GYM)
    input_shape = env.observation_space.shape[0]
    output_shape = env.action_space.n
    print('environment: in: ({}) out: ({})'.format(input_shape, output_shape))

    ddqn = DDQN(input_shape, output_shape)
    if os.path.exists(weights_path):
        ddqn.load_weights(weights_path)

    optimizer = tf.train.AdamOptimizer()

    # init training
    replay_buffer = Replay_buffer(REPLAY_BUFFER_SIZE)
    target_network = DDQN(input_shape, output_shape)
    target_network.set_weights(ddqn.get_weights())

    target_reset_count = 0
    train_counter = 0
    e_counter = 0
    epsilon_explore = E_START
    loss_value = 0
    rewards_x_epoch = []
    e_x_time = []
    loss_x_time = []

    for epc in range(epochs):
        state = env.reset()
        state = np.expand_dims(state, 0)
        tot_reward = 0
        for _ in range(1000):
            # action selection
            if random.random() <= epsilon_explore:
                action = random.randint(0, output_shape - 1)
            else:
                q_values = ddqn.predict(state)
                action = np.argmax(q_values)

            # simulation
            next_state, reward, done, info = env.step(action)
            next_state = np.expand_dims(next_state, 0)
            replay_buffer.add((state, action, reward, next_state, 0 if done else 1))
            state = next_state
            tot_reward += reward

            # training
            train_counter += 1
            if train_counter > TRAINING_START and train_counter % TRAINING_FREQ == 0:
                (batch_states, batch_actions, batch_s_t1, batch_rewards, batch_final) = replay_buffer.sample(BATCH_SIZE)

                with tf.GradientTape() as tape:
                    # actual prediction
                    action_indexes = tf.stack([tf.range(BATCH_SIZE, dtype=tf.int64), batch_actions], axis=1)
                    y_prediction = tf.gather_nd(ddqn(batch_states), action_indexes)

                    # targets
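                    # Double DQN target: y = r + DISCOUNT * Q_target(s', argmax_a Q_online(s', a)),
                    # with batch_final = 0 on terminal transitions so no bootstrapping happens there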
                    amax = tf.argmax(ddqn(batch_s_t1), axis=1)
                    amax = tf.stack([tf.range(BATCH_SIZE, dtype=tf.int64), amax], axis=1)
                    batch_target_y = target_network(batch_s_t1)
                    target_expected_rewards = tf.gather_nd(batch_target_y, amax)
                    y_target = batch_rewards + (DISCOUNT * target_expected_rewards * batch_final)

                    # loss
                    loss_value = tf.reduce_mean(tf.pow(y_target - y_prediction, 2))

                grads = tape.gradient(loss_value, ddqn.trainable_variables)

                optimizer.apply_gradients(zip(grads, ddqn.trainable_variables))

            if train_counter > TRAINING_START:
                e_counter += 1
                epsilon_explore = get_epsilon(e_counter)
            if e_counter > E_RESET_FREQ:
                e_counter = 0

            e_x_time.append(epsilon_explore)
            loss_x_time.append(float(loss_value))

            target_reset_count += 1
            if target_reset_count == TARGET_RESET_FREQ:
                target_network.set_weights(ddqn.get_weights())
                target_reset_count = 0

            if done:
                break

        rewards_x_epoch.append(tot_reward)

        if epc % 10 == 0:
            print('[{:2.1f}%], e: {:5.4f} - loss: {:10.6f} - last episode reward: {}'.format((epc * 100) / epochs, epsilon_explore, float(loss_value), tot_reward))

    ddqn.save_weights(weights_path, save_format='h5')

    with open('rewards.csv', 'w') as f:
        for rew in rewards_x_epoch:
            f.write("%s," % rew)

    with open('epsilon.csv', 'w') as f:
        for e in e_x_time:
            f.write("%s," % e)

    with open('loss.csv', 'w') as f:
        for l in loss_x_time:
            f.write("%s," % l)

    # let's try it
    obs = env.reset()
    obs = np.expand_dims(obs, 0)
    for _ in range(1000):
        env.render()
        q_values = ddqn.predict(obs)
        action = np.argmax(q_values)
        obs, reward, done, info = env.step(action)
        obs = np.expand_dims(obs, 0)

        if done:
            break

    env.close()
Example #16
File: test.py Project: davidmkwon/rl
# set up environment tools
USE_GPU = torch.cuda.is_available()
mod_action_space = [2, 3, 4, 5]
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
env = Env(device)
agent = Agent(eps=dum_val,
              eps_min=dum_val,
              eps_max=dum_val,
              eps_decay=dum_val,
              num_actions=len(mod_action_space),
              device=device)
agent.turn_eps_off()
stack = Frstack(initial_frame=env.state)

# create policy net and load saved weights
policy_net = DDQN(NUM_FRAMES, len(mod_action_space))
if USE_GPU:
    policy_net.cuda()


def test():
    policy_net.load_state_dict(torch.load(POLICY_NET_PATH))
    policy_net.eval()

    print("testing...")
    all_rewards = []
    all_images = []

    for episode in range(NUM_TEST_EPISODES):
        env.reset()
        episode_reward = 0
Example #17
        pass

    # Custom method for saving own metrics
    # Creates the writer, records custom metrics and flushes the writer after each write
    def update_stats(self, **stats):
        self._write_logs(stats, self.step)

    def _write_logs(self, logs, index):
        with self.writer.as_default():
            for name, value in logs.items():
                tf.summary.scalar(name, value, step=index)
                self.step += 1
                self.writer.flush()


agent = DDQN(10, (env.OBSERVATION_SPACE_VALUES))
agent.load_weights(MODEL_FILE)
tensorboard = ModifiedTensorBoard(
    log_dir="logs/{}-{}".format(MODEL_NAME, int(time.time())))
# Iterate over episodes
for episode in tqdm(range(1, EPISODES + 1), ascii=True, unit='episodes'):

    # Update tensorboard step every episode

    # Restarting episode - reset episode reward and step number
    episode_reward = 0
    step = 1

    # Reset environment and get initial state
    current_state = env.reset()
Example #18
env = gym.make('Pendulum-v0')
env = env.unwrapped
env.seed(1)

action_space = 11
n_features = 3
memory0 = Memory(n_features, MEMORY_CAPACITY, n_features * 2 + 2, BATCH_SIZE)
memory1 = Memory(n_features, MEMORY_CAPACITY, n_features * 2 + 2, BATCH_SIZE)

sess0 = tf.Session()
sess1 = tf.Session()
dqn = [
    DDQN(action_space,
         n_features,
         memory0,
         name='dqn0',
         learning_rate=LEARNING_RATE,
         e_greedy_increment=0.001,
         double_q=False,
         sess=sess0),
    DDQN(action_space,
         n_features,
         memory1,
         name='dqn1',
         learning_rate=LEARNING_RATE,
         e_greedy_increment=0.001,
         double_q=False,
         sess=sess1)
]

sess0.run(tf.global_variables_initializer())
sess1.run(tf.global_variables_initializer())
Example #19
class Agent(RLGlueAgent):
    def __init__(self):
        self.last_action = Action()
        self.time_step = 0
        self.total_time_step = 0
        self.episode_step = 0
        self.populating_phase = False

        self.model_save_interval = 30

        # Switch learning phase / evaluation phase
        self.policy_frozen = False

        self.ddqn = DDQN()
        self.state = np.zeros(
            (config.rl_agent_history_length, config.ale_screen_channels,
             config.ale_scaled_screen_size[1],
             config.ale_scaled_screen_size[0]),
            dtype=np.float32)
        self.exploration_rate = self.ddqn.exploration_rate
        self.exploration_rate_for_evaluation = 0.05
        self.last_observed_screen = None

    def preprocess_screen(self, observation):
        screen_width = config.ale_screen_size[0]
        screen_height = config.ale_screen_size[1]
        new_width = config.ale_scaled_screen_size[0]
        new_height = config.ale_scaled_screen_size[1]
        if len(observation.intArray) == 100928:
            observation = np.asarray(observation.intArray[128:],
                                     dtype=np.uint8).reshape(
                                         (screen_width, screen_height, 3))
            observation = spm.imresize(observation, (new_height, new_width))
            # Clip the pixel value to be between 0 and 1
            if config.ale_screen_channels == 1:
                # Convert RGB to Luminance
                observation = np.dot(observation[:, :, :],
                                     [0.299, 0.587, 0.114])
                observation = observation.reshape((new_height, new_width, 1))
            observation = observation.transpose(2, 0, 1) / 255.0
            observation /= (np.max(observation) + 1e-5)
        else:
            # Greyscale
            if config.ale_screen_channels == 3:
                raise Exception(
                    "You forgot to add --send_rgb option when you run ALE.")
            observation = np.asarray(observation.intArray[128:]).reshape(
                (screen_width, screen_height))
            observation = spm.imresize(observation, (new_height, new_width))
            # Clip the pixel value to be between 0 and 1
            observation = observation.reshape(
                (1, new_height, new_width)) / 255.0
            observation /= (np.max(observation) + 1e-5)

        observed_screen = observation
        if self.last_observed_screen is not None:
            observed_screen = np.maximum(observation,
                                         self.last_observed_screen)

        self.last_observed_screen = observation
        return observed_screen

    def agent_init(self, taskSpecString):
        pass

    def reshape_state_to_conv_input(self, state):
        return state.reshape(
            (1, config.rl_agent_history_length * config.ale_screen_channels,
             config.ale_scaled_screen_size[1],
             config.ale_scaled_screen_size[0]))

    def dump_result(self, reward, q_max=None, q_min=None):
        if self.time_step % 50 == 0:
            if self.policy_frozen is False:
                print "time_step:", self.time_step,

            print "reward:", reward,
            print "eps:", self.exploration_rate,
            if q_min is None:
                print ""
            else:
                print "Q ::",
                print "max:", q_max,
                print "min:", q_min

    def dump_state(self, state=None, prefix=""):
        if state is None:
            state = self.state
        state = self.reshape_state_to_conv_input(state)
        for h in xrange(config.rl_agent_history_length):
            start = h * config.ale_screen_channels
            end = start + config.ale_screen_channels
            image = state[0, start:end, :, :]
            if config.ale_screen_channels == 1:
                image = image.reshape((image.shape[1], image.shape[2]))
            elif config.ale_screen_channels == 3:
                image = image.transpose(1, 2, 0)
            image = np.uint8(image * 255.0)
            image = Image.fromarray(image)
            image.save(("%sstate-%d.png" % (prefix, h)))

    def learn(self, reward, epsode_ends=False):
        if self.policy_frozen is False:

            self.ddqn.store_transition_in_replay_memory(
                self.reshape_state_to_conv_input(self.last_state),
                self.last_action.intArray[0], reward,
                self.reshape_state_to_conv_input(self.state), epsode_ends)
            if self.total_time_step <= config.rl_replay_start_size:
                # A uniform random policy is run for 'replay_start_size' frames before learning starts
                # (It apparently just moves around randomly to build up experience.)
                print "Initial exploration before learning starts:", "%d/%d" % (
                    self.total_time_step, config.rl_replay_start_size)
                self.populating_phase = True
                self.exploration_rate = config.rl_initial_exploration
            else:
                self.populating_phase = False
                self.ddqn.decrease_exploration_rate()
                self.exploration_rate = self.ddqn.exploration_rate

                if self.total_time_step % (
                        config.rl_action_repeat * config.rl_update_frequency
                ) == 0 and self.total_time_step != 0:
                    self.ddqn.replay_experience()

                if self.total_time_step % config.rl_target_network_update_frequency == 0 and self.total_time_step != 0:
                    print "Target has been updated."
                    self.ddqn.update_target()

    def agent_start(self, observation):
        print "Episode", self.episode_step, "::", "total_time_step:",
        if self.total_time_step > 1000:
            print int(self.total_time_step / 1000), "K"
        else:
            print self.total_time_step
        observed_screen = self.preprocess_screen(observation)
        self.state[0] = observed_screen

        return_action = Action()
        action, q_max, q_min = self.ddqn.eps_greedy(
            self.reshape_state_to_conv_input(self.state),
            self.exploration_rate)
        return_action.intArray = [action]

        self.last_action = copy.deepcopy(return_action)
        self.last_state = self.state.copy()

        return return_action

    def agent_step(self, reward, observation):
        observed_screen = self.preprocess_screen(observation)
        self.state = np.roll(self.state, 1, axis=0)
        self.state[0] = observed_screen

        ########################### DEBUG ###############################
        # if self.total_time_step % 500 == 0 and self.total_time_step != 0:
        # 	self.dump_state()

        self.learn(reward)

        return_action = Action()
        q_max = None
        q_min = None
        if self.time_step % config.rl_action_repeat == 0:
            action, q_max, q_min = self.ddqn.eps_greedy(
                self.reshape_state_to_conv_input(self.state),
                self.exploration_rate)
        else:
            action = self.last_action.intArray[0]
        return_action.intArray = [action]

        self.dump_result(reward, q_max, q_min)

        if self.policy_frozen is False:
            self.last_action = copy.deepcopy(return_action)
            self.last_state = self.state.copy()
            self.time_step += 1
            self.total_time_step += 1

        return return_action

    def agent_end(self, reward):
        self.learn(reward, epsode_ends=True)

        # [Optional]
        ## Visualizing the results
        self.dump_result(reward)

        if self.policy_frozen is False:
            self.time_step = 0
            self.total_time_step += 1
            self.episode_step += 1

    def agent_cleanup(self):
        pass

    def agent_message(self, inMessage):
        if inMessage.startswith("freeze_policy"):
            self.policy_frozen = True
            self.exploration_rate = self.exploration_rate_for_evaluation
            return "The policy was freezed."

        if inMessage.startswith("unfreeze_policy"):
            self.policy_frozen = False
            self.exploration_rate = self.ddqn.exploration_rate
            return "The policy was unfreezed."

        if inMessage.startswith("save_model"):
            if self.populating_phase is False:
                self.ddqn.save()
            return "The model was saved."
Example #20
env.seed(args.seed)
np.random.seed(args.seed)

obs_shape_list = env.observation_space.shape
action_shape = env.action_space.n

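# Replay-buffer field shapes/dtypes depend on the observation type: 80x80 uint8 frames for image
# observations (CNN model) versus flat float32 feature vectors (DNN model)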
if len(obs_shape_list) > 1:
    shapes = [(80, 80), (1, ), (1, ), (1, ), (80, 80)]
    dtypes = [np.uint8, np.uint8, np.float32, np.bool_, np.uint8]
    model_type = "CNN"
else:
    shapes = [(obs_shape_list[0], ), (1, ), (1, ), (1, ),
              (obs_shape_list[0], )]
    dtypes = [np.float32, np.uint8, np.float32, np.bool_, np.float32]
    model_type = "DNN"
qnet = DDQN(shapes[0] + (args.frames, ), action_shape, model_type, args)
if args.predictor:
    pred = Predictor(shapes[0], action_shape, args)
kws = ['obs', 'action', 'reward', 'done', 'new_obs']
memory = FullReplayMemory(args.buffer_size, kws, shapes, dtypes)
writer = tf.summary.create_file_writer("logs/{}_{}".format(
    args.scenario,
    datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")))

total_numsteps = 0
timestep = 0
t_start = time.time()
epsilon = args.epsilon

# total_parameters = np.sum([np.prod(v.get_shape().as_list()) for v in qnet.q1.trainable_variables])
with writer.as_default():
Example #21
File: main.py Project: davidmkwon/rl
We will ignore actions 0 and 1.
'''
mod_action_space = [2, 3, 4, 5]
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
env = Env(device)
agent = Agent(eps=EPS_MAX,
              eps_min=EPS_MIN,
              eps_max=EPS_MAX,
              eps_decay=EPS_DECAY,
              num_actions=len(mod_action_space),
              device=device)
memory = PriorityReplayBuffer(MEMORY_SIZE)
stack = Frstack(initial_frame=env.state)

# initialize policy and target network
policy_net = DDQN(NUM_FRAMES, len(mod_action_space))
target_net = DDQN(NUM_FRAMES, len(mod_action_space))
if USE_GPU:
    policy_net.cuda()
    target_net.cuda()
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()
# TODO: consider RMSProp vs Adam - DeepMind paper uses RMSProp
optimizer = optim.Adam(params=policy_net.parameters(), lr=ALPHA)


def experience_replay():
    # experience tuple - (state, action, next_state, reward, done)
    batch, idxs, is_weights = memory.sample(BATCH_SIZE)
    batch = list(zip(*batch))
Example #22
n_actions = env.action_space.n
n_features = env.state.shape[0]
print('actions=', n_actions, 'n_features=', n_features)

memory0 = Memory(n_features, MEMORY_CAPACITY, n_features * 2 + 2, BATCH_SIZE)
memory1 = Memory(n_features, MEMORY_CAPACITY, n_features * 2 + 2, BATCH_SIZE)

sess0 = tf.Session()
sess1 = tf.Session()

dqn = [
    DDQN(n_actions,
         n_features,
         memory0,
         name='dqn0',
         learning_rate=LEARNING_RATE,
         reward_delay=GAMMA,
         replace_target_iter=200,
         double_q=False,
         sess=sess0),
    DDQN(n_actions,
         n_features,
         memory1,
         name='dqn1',
         learning_rate=LEARNING_RATE,
         reward_delay=GAMMA,
         replace_target_iter=200,
         double_q=False,
         sess=sess1)
]
Example #23
env = gym.make('Pendulum-v0')
env = env.unwrapped
env.seed(1)

action_space = 11
n_features = 3
memory0 = Memory(n_features, MEMORY_CAPACITY, n_features * 2 + 2, BATCH_SIZE)
memory1 = Memory(n_features, MEMORY_CAPACITY, n_features * 2 + 2, BATCH_SIZE)

sess = tf.Session()
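# Both agents share one TF session; 'dqn' keeps double_q=False (standard max-Q targets) while
# 'ddqn' sets double_q=True to decouple action selection from action evaluation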
with tf.variable_scope('dqn'):
    dqn = DDQN(n_actions=action_space,
               n_features=n_features,
               memory=memory0,
               name='dqn',
               learning_rate=LEARNING_RATE,
               e_greedy_increment=0.001,
               double_q=False,
               sess=sess)

with tf.variable_scope('ddqn'):
    ddqn = DDQN(n_actions=action_space,
                n_features=n_features,
                memory=memory1,
                name='ddqn',
                learning_rate=LEARNING_RATE,
                e_greedy_increment=0.001,
                double_q=True,
                sess=sess)

sess.run(tf.global_variables_initializer())
Example #24
class Driver():

    def __init__(self,width,height,bomb_no,render_flag):

        self.width = width
        self.height = height
        self.bomb_no = bomb_no
        self.box_count = width*height
        self.env = MineSweeper(self.width,self.height,self.bomb_no)
        self.current_model = DDQN(self.box_count,self.box_count)
        self.target_model = DDQN(self.box_count,self.box_count)
        self.target_model.eval()
        self.optimizer = torch.optim.Adam(self.current_model.parameters(),lr=0.003,weight_decay=1e-5)
        self.scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer,step_size=2000,gamma=0.95)
        self.target_model.load_state_dict(self.current_model.state_dict())
        self.buffer = Buffer(100000)
        self.gamma = 0.99
        self.render_flag = render_flag
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.90
        self.reward_threshold = 0.12
        self.reward_step = 0.01
        self.batch_size = 4096
        self.tau = 5e-5
        self.log = open("./Logs/ddqn_log.txt",'w')

        if(self.render_flag):
            self.Render = Render(self.env.state)

    
    def load_models(self,number):
        path = "./pre-trained/ddqn_dnn"+str(number)+".pth"
        weights = torch.load(path)
        self.current_model.load_state_dict(weights['current_state_dict'])
        self.target_model.load_state_dict(weights['target_state_dict'])
        self.optimizer.load_state_dict(weights['optimizer_state_dict'])
        self.current_model.epsilon = weights['epsilon']


    ### Gets an action from the DDQN model given the current state and mask
    def get_action(self,state,mask):
        state = state.flatten()
        mask = mask.flatten()
        action = self.current_model.act(state,mask)
        return action

    ### Performs the action and returns the next state, terminal flag, reward, and next mask
    def do_step(self,action):
        i = int(action/self.width)
        j = action%self.width
        if(self.render_flag):
            self.Render.state = self.env.state
            self.Render.draw()
            self.Render.bugfix()
        next_state,terminal,reward = self.env.choose(i,j)
        next_fog = 1-self.env.fog
        return next_state,terminal,reward,next_fog
    
    ### Reward Based Epsilon Decay 
    def epsilon_update(self,avg_reward):
        if(avg_reward>self.reward_threshold):
            self.current_model.epsilon = max(self.epsilon_min,self.current_model.epsilon*self.epsilon_decay)
            self.reward_threshold+= self.reward_step
    
    def TD_Loss(self):
        ### Samples batch from buffer memory
        state,action,mask,reward,next_state,next_mask,terminal = self.buffer.sample(self.batch_size)

        ### Converts the variables to tensors for processing by the DDQN
        state      = Variable(FloatTensor(float32(state)))
        mask      = Variable(FloatTensor(float32(mask)))
        next_state = FloatTensor(float32(next_state))
        action     = LongTensor(float32(action))
        next_mask      = FloatTensor(float32(next_mask))
        reward     = FloatTensor(reward)
        done       = FloatTensor(terminal)


        ### Predicts Q value for present and next state with current and target model
        q_values      = self.current_model(state,mask)
        next_q_values = self.target_model(next_state,next_mask)

        # Calculates Loss:
        #    If not Terminal:
        #        Loss = (reward + gamma*Q_val(next_state)) - Q_val(current_state)
        #    If Terminal:
        #        Loss = reward - Q_val(current_state)

        q_value          = q_values.gather(1, action.unsqueeze(1)).squeeze(1)
        next_q_value     = next_q_values.max(1)[0]
        expected_q_value = reward + self.gamma * next_q_value * (1 - done)
        loss = (q_value - expected_q_value.detach()).pow(2).mean()
        loss_print = loss.item()    

        # Propagates the Loss
        self.optimizer.zero_grad()
        loss.backward()

        self.optimizer.step()
        self.scheduler.step()

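        # Soft (Polyak) update of the target network: target <- tau*current + (1 - tau)*target, with tau = 5e-5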
        for target_param, local_param in zip(self.target_model.parameters(), self.current_model.parameters()):
            target_param.data.copy_(self.tau*local_param.data + (1.0-self.tau)*target_param.data)
        return loss_print

    def save_checkpoints(self,batch_no):
        path = "./pre-trained/ddqn_dnn"+str(batch_no)+".pth"
        torch.save({
            'epoch': batch_no,
            'current_state_dict': self.current_model.state_dict(),
            'target_state_dict' : self.target_model.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
            'epsilon':self.current_model.epsilon
        }, path)

    def save_logs(self,batch_no,avg_reward,loss,wins):
        res = [
                    str(batch_no),
                    "\tAvg Reward: ",
                    str(avg_reward),
                    "\t Loss: ",
                    str(loss),
                    "\t Wins: ", 
                    str(wins),
                    "\t Epsilon: ",
                    str(self.current_model.epsilon)
        ]
        log_line = " ".join(res)
        print(log_line)
        self.log.write(log_line+"\n")
        self.log.flush()
Example #25
def main(_):
    #env = gym.make("Frostbite-v0")
    env = gym.make("MsPacman-v0")
    n_s = env.observation_space.shape[0]
    n_a = env.action_space.n

    
    pre = Preprocessor()
    
    with tf.Session() as sess:
        dqn = DDQN(input_shape=[FLAGS.batch_size, 84, 84, 4], action_n=n_a, N=FLAGS.N)
        #dqn = DQN(input_shape=[FLAGS.batch_size, n_s], action_n=n_a)
        global_step = 0

        saver = tf.train.Saver()
        if FLAGS.restore and os.path.exists("./data/model.ckpt"):
            saver.restore(sess, "./data/model.ckpt")
            #Rs = np.loadtxt("R.csv", delimiter=',')
        else:
            sess.run(tf.global_variables_initializer())
            
        for episode in range(FLAGS.episode):
            
            obs = env.reset()
            #s = env.reset()
            pre.init(obs)
            done = False
            step = 0
            limit = env.spec.tags.get("wrapper_config.TimeLimit.max_episode_steps")
            s = pre.state
            
            while not done and step < limit:

                # epsilon decay
                epsilon = 1.0 if global_step < FLAGS.replay_start_size else \
                          max(FLAGS.min_epsilon, np.interp(
                              global_step, [0, FLAGS.decay], [1.0, FLAGS.min_epsilon]))
                
                # epsilon greedy
                if global_step < FLAGS.replay_start_size or np.random.rand() < epsilon:
                    a = env.action_space.sample()
                else:
                    a = dqn.greedy(s[np.newaxis], sess)

                obs, r, done, _ = env.step(a)
                s_ = pre.get_state(obs)
                #s_, r, done, _ = env.step(a)
                                              
                dqn.set_exp((s, a, r*FLAGS.reward_scale, done, s_))

                s = s_
                
                if global_step >= FLAGS.replay_start_size:
                    dqn.update(sess)

                if global_step % FLAGS.sync_freq == 0:
                    dqn.update_target(sess)
                
                step += 1
                global_step += 1

            if FLAGS.save and episode % FLAGS.save_freq == 0:
                saver.save(sess, "./checkpoint/model.ckpt", global_step=global_step)                

            # Evaluation
            if episode % FLAGS.eval == 0:
                obs = env.reset()
                pre.init(obs)
                done = False
                s = pre.state
                R = 0
                step = 0
                epsilon = 0.01
                while not done and step < limit:
                    if np.random.rand() < epsilon:
                        a = env.action_space.sample()
                    else:
                        a = dqn.greedy(s[np.newaxis], sess)

                    obs, r, done, _ = env.step(a)
                    s_ = pre.get_state(obs)
                    s = s_
                    if FLAGS.render:
                        env.render()

                    R += r
                    step += 1
                    
                print("epoch:{}, step:{}, R:{}".format(episode, global_step, R))
                with open('R.csv', 'a')  as f:
                    
                    f.write("{},".format(R))
                gc.collect()
Example #26
    pre_train_steps = 10000
    max_epLength = args.max_episode_length
    # load previous saved model
    load_model = args.load_model
    # location of model
    path = args.model_path
    # rate to update target network
    tau = 0.001
    reward_exit_arena = args.exit_reward

    tf.reset_default_graph()

    # init DDQNs
    n_actions = [env.action_space[i].n for i in range(env.n)]
    state_sizes = [env.observation_space[i].shape[0] for i in range(env.n)]
    mainQN = [DDQN(n_actions[i], state_sizes[i]) for i in range(env.n)]
    targetQN = [DDQN(n_actions[i], state_sizes[i]) for i in range(env.n)]

    # init tensorflow
    init = tf.global_variables_initializer()
    saver = tf.train.Saver()
    trainables = tf.trainable_variables()
    targetOps = updateTargetGraph(trainables, tau)

    # assign experience buffer for each agent
    experiences = [experience_buffer() for i in range(env.n)]

    # chance of random actions
    if args.testing:
        e = 0.1
        pre_train_steps = 0
Example #27
print(env.observation_space)
print(env.observation_space.high)
print(env.observation_space.low)

# RL = DQN(s_dim = env.observation_space.shape[0],
# 		 a_dim = env.action_space.n,
# 		 learning_rate = 0.01,
# 		 e_greedy = 0.9,
# 		 replace_target_iter = 100,
# 		 memory_size = 2000,
# 		 e_greedy_increment = 0.001)

RL = DDQN(s_dim=env.observation_space.shape[0],
          a_dim=env.action_space.n,
          learning_rate=0.001,
          e_greedy=0.9,
          replace_target_iter=300,
          memory_size=3000,
          e_greedy_increment=0.0002)

total_steps = 0
total_reward = []
for i_episode in range(15):

    s = env.reset()
    ep_r = 0
    while True:
        env.render()
        a = RL.choose_action(s)
        s_, r, done, info = env.step(a)
Example #28
class Agent(RLGlueAgent):
	def __init__(self):
		self.last_action = Action()
		self.time_step = 0
		self.total_time_step = 0
		self.episode_step = 0
		self.populating_phase = False

		self.model_save_interval = 30

		# Switch learning phase / evaluation phase
		self.policy_frozen = False

		self.ddqn = DDQN()
		self.state = np.zeros((config.rl_agent_history_length, config.ale_screen_channels, config.ale_scaled_screen_size[1], config.ale_scaled_screen_size[0]), dtype=np.float32)
		self.exploration_rate = self.ddqn.exploration_rate
		self.exploration_rate_for_evaluation = 0.05
		self.last_observed_screen = None

	def preprocess_screen(self, observation):
		screen_width = config.ale_screen_size[0]
		screen_height = config.ale_screen_size[1]
		new_width = config.ale_scaled_screen_size[0]
		new_height = config.ale_scaled_screen_size[1]
		if len(observation.intArray) == 100928: 
			observation = np.asarray(observation.intArray[128:], dtype=np.uint8).reshape((screen_width, screen_height, 3))
			observation = spm.imresize(observation, (new_height, new_width))
			# Clip the pixel value to be between 0 and 1
			if config.ale_screen_channels == 1:
				# Convert RGB to Luminance
				observation = np.dot(observation[:,:,:], [0.299, 0.587, 0.114])
				observation = observation.reshape((new_height, new_width, 1))
			observation = observation.transpose(2, 0, 1) / 255.0
			observation /= (np.max(observation) + 1e-5)
		else:
			# Greyscale
			if config.ale_screen_channels == 3:
				raise Exception("You forgot to add --send_rgb option when you run ALE.")
			observation = np.asarray(observation.intArray[128:]).reshape((screen_width, screen_height))
			observation = spm.imresize(observation, (new_height, new_width))
			# Clip the pixel value to be between 0 and 1
			observation = observation.reshape((1, new_height, new_width)) / 255.0
			observation /= (np.max(observation) + 1e-5)

		observed_screen = observation
		if self.last_observed_screen is not None:
			observed_screen = np.maximum(observation, self.last_observed_screen)

		self.last_observed_screen = observation
		return observed_screen

	def agent_init(self, taskSpecString):
		pass

	def reshape_state_to_conv_input(self, state):
		return state.reshape((1, config.rl_agent_history_length * config.ale_screen_channels, config.ale_scaled_screen_size[1], config.ale_scaled_screen_size[0]))

	def dump_result(self, reward, q_max=None, q_min=None):
		if self.time_step % 50 == 0:
			if self.policy_frozen is False:
				print "time_step:", self.time_step,
				
			print "reward:", reward,
			print "eps:", self.exploration_rate,
			if q_min is None:
				print ""
			else:
				print "Q ::",
				print "max:", q_max,
				print "min:", q_min

	def dump_state(self, state=None, prefix=""):
		if state is None:
			state = self.state
		state = self.reshape_state_to_conv_input(state)
		for h in xrange(config.rl_agent_history_length):
			start = h * config.ale_screen_channels
			end = start + config.ale_screen_channels
			image = state[0,start:end,:,:]
			if config.ale_screen_channels == 1:
				image = image.reshape((image.shape[1], image.shape[2]))
			elif config.ale_screen_channels == 3:
				image = image.transpose(1, 2, 0)
			image = np.uint8(image * 255.0)
			image = Image.fromarray(image)
			image.save(("%sstate-%d.png" % (prefix, h)))

	def learn(self, reward, epsode_ends=False):
		if self.policy_frozen is False:

			self.ddqn.store_transition_in_replay_memory(self.reshape_state_to_conv_input(self.last_state), self.last_action.intArray[0], reward, self.reshape_state_to_conv_input(self.state), epsode_ends)
			if self.total_time_step <= config.rl_replay_start_size:
				# A uniform random policy is run for 'replay_start_size' frames before learning starts
				# (It apparently just moves around randomly to build up experience.)
				print "Initial exploration before learning starts:", "%d/%d" % (self.total_time_step, config.rl_replay_start_size)
				self.populating_phase = True
				self.exploration_rate = config.rl_initial_exploration
			else:
				self.populating_phase = False
				self.ddqn.decrease_exploration_rate()
				self.exploration_rate = self.ddqn.exploration_rate

				if self.total_time_step % (config.rl_action_repeat * config.rl_update_frequency) == 0 and self.total_time_step != 0:
					self.ddqn.replay_experience()

				if self.total_time_step % config.rl_target_network_update_frequency == 0 and self.total_time_step != 0:
					print "Target has been updated."
					self.ddqn.update_target()

	def agent_start(self, observation):
		print "Episode", self.episode_step, "::", "total_time_step:",
		if self.total_time_step > 1000:
			print int(self.total_time_step / 1000), "K"
		else:
			print self.total_time_step
		observed_screen = self.preprocess_screen(observation)
		self.state[0] = observed_screen

		return_action = Action()
		action, q_max, q_min = self.ddqn.eps_greedy(self.reshape_state_to_conv_input(self.state), self.exploration_rate)
		return_action.intArray = [action]

		self.last_action = copy.deepcopy(return_action)
		self.last_state = self.state.copy()

		return return_action

	def agent_step(self, reward, observation):
		observed_screen = self.preprocess_screen(observation)
		self.state = np.roll(self.state, 1, axis=0)
		self.state[0] = observed_screen

		########################### DEBUG ###############################
		# if self.total_time_step % 500 == 0 and self.total_time_step != 0:
		# 	self.dump_state()

		self.learn(reward)
		
		return_action = Action()
		q_max = None
		q_min = None
		if self.time_step % config.rl_action_repeat == 0:
			action, q_max, q_min = self.ddqn.eps_greedy(self.reshape_state_to_conv_input(self.state), self.exploration_rate)
		else:
			action = self.last_action.intArray[0]
		return_action.intArray = [action]

		self.dump_result(reward, q_max, q_min)

		if self.policy_frozen is False:
			self.last_action = copy.deepcopy(return_action)
			self.last_state = self.state.copy()
			self.time_step += 1
			self.total_time_step += 1

		return return_action

	def agent_end(self, reward):
		self.learn(reward, epsode_ends=True)

		# [Optional]
		## Visualizing the results
		self.dump_result(reward)

		if self.policy_frozen is False:
			self.time_step = 0
			self.total_time_step += 1
			self.episode_step += 1

	def agent_cleanup(self):
		pass

	def agent_message(self, inMessage):
		if inMessage.startswith("freeze_policy"):
			self.policy_frozen = True
			self.exploration_rate = self.exploration_rate_for_evaluation
			return "The policy was freezed."

		if inMessage.startswith("unfreeze_policy"):
			self.policy_frozen = False
			self.exploration_rate = self.ddqn.exploration_rate
			return "The policy was unfreezed."

		if inMessage.startswith("save_model"):
			if self.populating_phase is False:
				self.ddqn.save()
			return "The model was saved."
Example #29
import gym

from dqn import DQN
from ddqn import DDQN
from dueling_ddqn import DuelingDDQN
from noisy_dqn import NoisyDQN
from categorical_dqn import CategoricalDQN
from rainbow import Rainbow

from utils.dqn_runner import vector_train
from utils.dqn_runner import evaluate

if __name__ == "__main__":
    env = gym.vector.make("CartPole-v1", num_envs=4, asynchronous=True)
    agent = DDQN(env.single_observation_space, env.single_action_space)

    returns = vector_train(agent, env, 50000, 450)

    eval_env = gym.make("CartPole-v1")
    evaluate(agent, eval_env, 1, True)