Example #1
def qtrain(model, grid, **opt):
    global epsilon
    n_epoch = opt.get('epochs', 1001)
    max_memory = opt.get('max_memory', 1000)
    data_size = opt.get('data_size', 50)
    actions = ['Left', 'Up', 'Right', 'Down']
    qgrid = Qgrid(grid)
    experience = Experience(model, max_memory=max_memory)

    n_free_cells = len(qgrid.free_cells)
    hsize = qgrid.grid.size // 2
    win_rate = 0.0
    imctr = 1
    all_state = []
    for epoch in range(n_epoch):
        states = []
        all_action = []
        loss = 0.0
        rat_cell = random.choice(qgrid.free_cells)
        rat_cell = (2, 3)  # fixed start cell; overrides the random choice above
        qgrid.reset(rat_cell)
        game_over = False
        envstate = qgrid.observe()
        n_episodes = 0
        while not game_over:
            valid_actions = qgrid.valid_actions()
            if not valid_actions:
                break
            prev_envstate = envstate
            states.append((qgrid.state[0], qgrid.state[1]))
            if np.random.rand() < epsilon:
                action = random.choice(valid_actions)
            else:
                action = np.argmax(experience.predict(prev_envstate))
            all_action.append(actions[action])

            envstate, reward, game_status = qgrid.act(action)
            if game_status == 'win':
                game_over = True
            elif game_status == 'lose':
                game_over = True
            else:
                game_over = False
            episode = [prev_envstate, action, reward, envstate, game_over]
            experience.remember(episode)
            n_episodes += 1
            inputs, targets = experience.get_data(data_size=data_size)
            model.fit(
                inputs,
                targets,
                epochs=8,
                batch_size=16,
                verbose=0,
            )
            loss = model.evaluate(inputs, targets, verbose=0)
        print("epoch", epoch, "actions", all_action)
        all_state.append(states)
    qgrid.draw(all_state)
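Note: the Experience object used in Example #1 (and in Examples #3 and #5 below) is not defined in the snippet. The sketch below is one minimal implementation consistent with the calls shown (remember, predict, get_data), assuming a Keras-style model; the discount factor and buffer handling are illustrative assumptions, not taken from the original code.

import numpy as np

class Experience:
    """Replay memory that also builds (state, Q-target) batches for model.fit()."""

    def __init__(self, model, max_memory=100, discount=0.95):
        self.model = model              # Keras-style model: predict() / fit()
        self.max_memory = max_memory
        self.discount = discount
        self.memory = []
        self.num_actions = model.output_shape[-1]

    def remember(self, episode):
        # episode = [prev_envstate, action, reward, envstate, game_over]
        self.memory.append(episode)
        if len(self.memory) > self.max_memory:
            del self.memory[0]

    def predict(self, envstate):
        # envstate is a 1 x N row vector; return its Q-values
        return self.model.predict(envstate, verbose=0)[0]

    def get_data(self, data_size=10):
        env_size = self.memory[0][0].shape[1]
        mem_size = len(self.memory)
        data_size = min(mem_size, data_size)
        inputs = np.zeros((data_size, env_size))
        targets = np.zeros((data_size, self.num_actions))
        for i, j in enumerate(np.random.choice(range(mem_size), data_size, replace=False)):
            envstate, action, reward, envstate_next, game_over = self.memory[j]
            inputs[i] = envstate
            targets[i] = self.predict(envstate)
            # Standard Q-learning target: r, plus discounted best next-state value if not terminal
            if game_over:
                targets[i, action] = reward
            else:
                targets[i, action] = reward + self.discount * np.max(self.predict(envstate_next))
        return inputs, targets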
Example #2
    def play_episode(self):
        episode_reward = 0
        local_steps = 0
        done = False
        state = self.env.reset()

        while not done:
            action, _ = self.policy.sample_action(np.atleast_2d(state))
            action = action.numpy()[0]
            try:
                next_state, reward, done, _ = self.env.step(action)
            except Exception:
                # surface the offending action before re-raising
                print("DEBUG", action)
                raise
            # reward = np.clip(reward, -5, 5)

            exp = Experience(state, action, reward, next_state, done)
            self.replay_buffer.push(exp)

            state = next_state
            episode_reward += reward
            local_steps += 1
            self.global_steps += 1

            if (len(self.replay_buffer) >= self.MIN_EXPERIENCES
                    and self.global_steps % self.UPDATE_PERIOD == 0):
                self.update_networks()

        return episode_reward, local_steps, tf.exp(self.log_alpha)
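In Example #2, Experience is just a transition record and replay_buffer only needs push() and len(); neither is shown in the snippet. A minimal sketch under those assumptions (the buffer capacity and uniform sampling are illustrative):

import collections
import random

# Transition record matching the positional fields used above.
Experience = collections.namedtuple(
    "Experience", ["state", "action", "reward", "next_state", "done"])

class ReplayBuffer:
    """FIFO buffer with uniform random sampling."""

    def __init__(self, maxlen=100000):
        self.buffer = collections.deque(maxlen=maxlen)

    def push(self, exp):
        self.buffer.append(exp)

    def sample(self, batch_size):
        return random.sample(list(self.buffer), batch_size)

    def __len__(self):
        return len(self.buffer)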
Example #3
def qtrain(model, maze, **opt):
    global epsilon
    n_epoch = opt.get('epochs', 1001)
    max_memory = opt.get('max_memory', 1000)
    data_size = opt.get('data_size', 50)

    qmaze = Qmaze(maze)

    # Initialize experience replay object
    experience = Experience(model, max_memory=max_memory)

    win_history = []  # history of win/lose game
    n_free_cells = len(qmaze.free_cells)
    hsize = qmaze.maze.size // 2  # history window size
    win_rate = 0.0
    imctr = 1

    for epoch in range(n_epoch):
        states = []
        loss = 0.0
        rat_cell = random.choice(qmaze.free_cells)
        if epoch % 10 == 0:
            rat_cell = (2, 3)
        qmaze.reset(rat_cell)
        game_over = False
        # get initial envstate (1d flattened canvas)
        envstate = qmaze.observe()
        n_episodes = 0
        while not game_over:
            valid_actions = qmaze.valid_actions()
            if not valid_actions:
                break
            prev_envstate = envstate
            states.append((qmaze.state[0], qmaze.state[1]))
            # Get next action
            if np.random.rand() < epsilon:
                action = random.choice(valid_actions)
            else:
                action = np.argmax(experience.predict(prev_envstate))

            # Apply action, get reward and new envstate
            envstate, reward, game_status = qmaze.act(action)
            if game_status == 'win':
                win_history.append(1)
                game_over = True
            elif game_status == 'lose':
                win_history.append(0)
                game_over = True
            else:
                game_over = False

            # Store episode (experience)
            episode = [prev_envstate, action, reward, envstate, game_over]
            experience.remember(episode)
            n_episodes += 1

            # Train neural network model
            inputs, targets = experience.get_data(data_size=data_size)
            loss = model.fit(inputs, targets, 1)


#             value= model.predict(inputs)
#             loss = rm.mean_squared_error(value, targets)
#             loss.grad().update(rm.Adam(0.001))

        if len(win_history) > hsize:
            win_rate = sum(win_history[-hsize:]) / hsize
        template = "Epoch: {:03d}/{:d} | Loss: {:.4f} | Episodes: {:d} | Win count: {:d} | Win rate: {:.3f}"
        print(
            template.format(epoch, n_epoch - 1, loss, n_episodes,
                            sum(win_history), win_rate))

    print(qmaze.draw(states))
Example #4
    def learn(self, n_episodes, logdir="log"):

        logdir = Path(__file__).parent / logdir
        if logdir.exists():
            shutil.rmtree(logdir)
        self.summary_writer = tf.summary.create_file_writer(str(logdir))

        for episode in range(1, n_episodes+1):

            env = gym.make(self.env_name)

            frames = collections.deque(maxlen=4)
            frame = frame_preprocess(env.reset())
            for _ in range(self.n_frames):
                frames.append(frame)

            episode_rewards = 0
            episode_steps = 0
            done = False
            lives = 5
            while not done:
                self.steps += 1
                episode_steps += 1
                state = np.stack(frames, axis=2)[np.newaxis, ...]
                action = self.fqf_network.sample_action(state, epsilon=self.epsilon)
                next_frame, reward, done, info = env.step(action)
                episode_rewards += reward
                frames.append(frame_preprocess(next_frame))
                next_state = np.stack(frames, axis=2)[np.newaxis, ...]

                if done:
                    exp = Experience(state, action, reward, next_state, done)
                    self.replay_buffer.push(exp)
                    break
                else:
                    if info["ale.lives"] != lives:
                        #: treat a lost life as the end of an episode for this transition
                        lives = info["ale.lives"]
                        exp = Experience(state, action, reward, next_state, True)
                    else:
                        exp = Experience(state, action, reward, next_state, done)

                    self.replay_buffer.push(exp)

                if (len(self.replay_buffer) > 50000) and (self.steps % self.update_period == 0):

                    loss, loss_fp, entropy = self.update_network()

                    with self.summary_writer.as_default():
                        tf.summary.scalar("loss", loss, step=self.steps)
                        tf.summary.scalar("loss_fp", loss_fp, step=self.steps)
                        tf.summary.scalar("entropy", entropy, step=self.steps)
                        tf.summary.scalar("epsilon", self.epsilon, step=self.steps)
                        tf.summary.scalar("buffer_size", len(self.replay_buffer), step=self.steps)
                        tf.summary.scalar("train_score", episode_rewards, step=self.steps)
                        tf.summary.scalar("train_steps", episode_steps, step=self.steps)

                #: Target update
                if self.steps % self.target_update_period == 0:
                    self.target_fqf_network.set_weights(
                        self.fqf_network.get_weights())

            print(f"Episode: {episode}, score: {episode_rewards}, steps: {episode_steps}")

            if episode % 20 == 0:
                test_scores, test_steps = self.test_play(n_testplay=1)
                with self.summary_writer.as_default():
                    tf.summary.scalar("test_score", test_scores[0], step=self.steps)
                    tf.summary.scalar("test_step", test_steps[0], step=self.steps)

            if episode % 500 == 0:
                self.fqf_network.save_weights("checkpoints/fqfnet")
                print("Model Saved")
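frame_preprocess() is not included in Example #4. Below is a minimal sketch of the usual Atari preprocessing (grayscale, crop, resize to 84x84) that would let np.stack(frames, axis=2) produce an (84, 84, 4) state; the crop offsets are illustrative assumptions.

import tensorflow as tf

def frame_preprocess(frame):
    """Convert a raw (210, 160, 3) Atari frame to an (84, 84) float image in [0, 1]."""
    image = tf.cast(tf.convert_to_tensor(frame), tf.float32)
    image = tf.image.rgb_to_grayscale(image)                 # (210, 160, 1)
    image = tf.image.crop_to_bounding_box(image, 34, 0, 160, 160)
    image = tf.image.resize(image, [84, 84])
    return (tf.squeeze(image) / 255.0).numpy()               # (84, 84)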
Example #5
def qtrain(model, maze, **opt):
    global epsilon
    n_epoch = opt.get('epochs', 1001)
    max_memory = opt.get('max_memory', 1000)
    data_size = opt.get('data_size', 50)

    qmaze = Qmaze(maze)
    experience = Experience(model, max_memory=max_memory)

    win_history = []
    n_free_cells = len(qmaze.free_cells)
    hsize = qmaze.maze.size // 2
    win_rate = 0.0
    imctr = 1

    for epoch in range(n_epoch):
        states = []
        loss = 0.0
        rat_cell = random.choice(qmaze.free_cells)
        if epoch % 10 == 0:
            rat_cell = (2, 3)
        qmaze.reset(rat_cell)
        game_over = False

        envstate = qmaze.observe()
        n_episodes = 0
        while not game_over:
            valid_actions = qmaze.valid_actions()
            if not valid_actions:
                break
            prev_envstate = envstate
            states.append((qmaze.state[0], qmaze.state[1]))

            if np.random.rand() < epsilon:
                action = random.choice(valid_actions)
            else:
                action = np.argmax(experience.predict(prev_envstate))

            envstate, reward, game_status = qmaze.act(action)
            if game_status == 'win':
                win_history.append(1)
                game_over = True
            elif game_status == 'lose':
                win_history.append(0)
                game_over = True
            else:
                game_over = False

            episode = [prev_envstate, action, reward, envstate, game_over]
            experience.remember(episode)
            n_episodes += 1

            inputs, targets = experience.get_data(data_size=data_size)
            loss = model.fit(inputs, targets, 1)

        if len(win_history) > hsize:
            win_rate = sum(win_history[-hsize:]) / hsize
        template = "Epoch: {:03d}/{:d} | Loss: {:.4f} | Episodes: {:d} | Win count: {:d} | Win rate: {:.3f}"
        print(template.format(epoch, n_epoch - 1, loss, n_episodes, sum(win_history), win_rate))

    print(qmaze.draw(states))
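For context, a qtrain() loop like those above might be set up as follows. This assumes everything lives in one script together with the Qmaze and Experience classes these examples rely on, and a Keras-style model as in Example #1; Examples #3 and #5 format the return value of model.fit() as a float, so they appear to expect a wrapper whose fit() returns a scalar loss. The maze layout, build_model helper and hyperparameters below are illustrative, not taken from the snippets.

import numpy as np
from tensorflow import keras

def build_model(maze, num_actions=4):
    # Small dense Q-network over the flattened maze canvas.
    model = keras.Sequential([
        keras.Input(shape=(maze.size,)),
        keras.layers.Dense(maze.size, activation="relu"),
        keras.layers.Dense(maze.size, activation="relu"),
        keras.layers.Dense(num_actions),
    ])
    model.compile(optimizer="adam", loss="mse")
    return model

maze = np.array([
    [1., 0., 1., 1.],
    [1., 1., 1., 0.],
    [0., 1., 0., 1.],
    [1., 1., 1., 1.],
])

epsilon = 0.1  # exploration rate read by qtrain() via `global epsilon`
model = build_model(maze)
qtrain(model, maze, epochs=100, max_memory=512, data_size=32)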