Example #1
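# Thin wrapper around the DQN Agent for an externally driven game loop:
# it caches the latest observation and chosen action, stores each transition,
# learns on every third step, and saves the model when the episode ends.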
class AI:
    def __init__(self, fname):
        lr = 0.0005
        self.agent = Agent(gamma=0.99,
                           epsilon=0.0,
                           alpha=lr,
                           input_dims=6,
                           n_actions=2,
                           mem_size=60000,
                           batch_size=64,
                           epsilon_end=0.0,
                           fname=fname)
        self.observation = []
        self.action = 0
        self.n_step = 0
        self.fname = fname.split("/")[-1]

    def episode_start(self, observation):
        self.observation = observation

    def choose_action(self):
        self.action = self.agent.choose_action(self.observation)
        return self.action

    def step(self, observation_, reward, done):
        self.agent.remember(self.observation, self.action, reward,
                            observation_, int(done))
        self.observation = observation_
        if self.n_step % 3 == 0:
            self.agent.learn()
        self.n_step += 1

    def episode_end(self):
        self.agent.save_model()
Example #2
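# Trains a DQN agent on CartPole-v0 for 1000 episodes of at most 200 steps each,
# giving a -1 reward on termination and tracking a 100-episode running mean score.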
def start():
    env = gym.make('CartPole-v0')

    params = {
        'gamma': 0.8,
        'epsi_high': 0.9,
        'epsi_low': 0.05,
        'decay': 500,
        'lr': 0.001,
        'capacity': 10000,
        'batch_size': 64,
        'state_space_dim': env.observation_space.shape[0],
        'action_space_dim': env.action_space.n
    }
    agent = Agent(**params)

    score = []
    mean = []

    for episode in range(1000):
        s0 = env.reset()
        total_reward = 1
        for i in range(200):
            env.render()
            a0 = agent.act(s0)
            s1, r1, done, _ = env.step(a0)

            if done:
                r1 = -1

            agent.put(s0, a0, r1, s1)

            if done:
                break

            total_reward += r1
            s0 = s1
            agent.learn()

        score.append(total_reward)
        mean.append(sum(score[-100:]) / len(score[-100:]))
        print(total_reward)
Example #3
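# Trains a DQN agent on LunarLander-v2 for 500 games, learning after every
# environment step and logging the score, 100-game average, and epsilon.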
def main():
    #make env and agent
    env = gym.make('LunarLander-v2')
    agent = Agent(gamma=0.99,
                  epsilon=1.0,
                  batch_size=64,
                  n_actions=4,
                  eps_end=0.01,
                  input_dims=[8],
                  lr=0.0001)

    scores, eps_history = [], []
    n_games = 500

    for i in range(n_games):
        score = 0
        done = False
        observation = env.reset()
        while not done:
            #ingame
            #get action from current view of game (observation)
            action = agent.choose_action(observation)
            #next frame
            observation_, reward, done, info = env.step(action)

            score += reward
            #store memory
            agent.store_transisation(observation, action, reward, observation_,
                                     done)
            agent.learn()

            #set next stage to current stage
            observation = observation_
        #append score and eps
        scores.append(score)
        eps_history.append(agent.epsilon)

        #print some nice statements
        avg_score = np.mean(scores[-100:])
        print(
            f'Episode: {i}   Score: {score}   Average Score: {avg_score}   Epsilon: {agent.epsilon}'
        )
Example #4
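# Legacy training loop for a custom TriadGameSession environment, run with
# TensorFlow eager execution disabled; plays 10000 games and reports a
# 100-game moving average score.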
def OldStuff():
    tf.compat.v1.disable_eager_execution()

    lr = 0.001
    numGames = 10000

    session = TriadGameSession()
    observation = session.getState()
    scores = []

    agent = Agent(gamma=0.99,
                  lr=lr,
                  epsilon=1.0,
                  epsilonDec=0.0005,
                  inputSize=[len(observation)],
                  numActions=session.getMaxActions(),
                  memSize=1000000,
                  batchSize=1024)

    for i in range(numGames):
        done = False
        score = 0
        session = TriadGameSession()
        observation = session.getState()
        while not done:
            action = agent.chooseAction(observation)
            observationNext, reward, done = session.step(action)
            score += reward
            agent.store(observation, action, reward, observationNext, done)
            observation = observationNext
            agent.learn()

        scores.append(score)
        avgScore = np.mean(scores[-100:])
        print('game:', i, 'score %.2f' % score, 'avgScore %.2f' % avgScore,
              'epsilon %.2f' % agent.epsilon)

    #agent.save()
    print('Finished!')
Example #5
        if i % 10 == 0 and i > 0:
            avg_score = np.mean(scores[max(0, i - 10):(i + 1)])
            print('episode: ', i, 'score: ', score,
                  ' average score %.3f' % avg_score,
                  'epsilon %.3f' % brain.EPSILON)
        else:
            print('episode: ', i, 'score: ', score)
        eps_history.append(brain.EPSILON)
        done = False
        observation = env.reset()
        score = 0
        while not done:
            action = brain.chooseAction(observation)
            observation_, reward, done, info = env.step(action)
            score += reward
            brain.storeTransition(observation, action, reward, observation_,
                                  done)
            observation = observation_
            brain.learn()

        scores.append(score)

    for i in range(10):
        done = False
        observation = env.reset()
        while not done:
            action = brain.chooseAction(observation)
            observation_, reward, done, info = env.step(action)
            observation = observation_
            env.render()
Example #6
                n_actions=4, batch_size=64)

    scores = []
    eps_history = []

    for i in range(1, n_games+1):
        done = False
        score = 0
        observation = env.reset()
        while not done:
            if show:
                env.render()
            action = agent.choose_action(observation)
            observation_, reward, done, info = env.step(action)
            score += reward
            agent.remember(observation, action, reward, observation_, done)
            observation = observation_
            agent.learn()

        eps_history.append(agent.epsilon)
        scores.append(score)

        avg_score = np.mean(scores[max(0, i-100):i+1])
        print('episode', i, 'score', score, 'avg score', avg_score)
        if i % 10 == 0 and i > 0:
            agent.save_model()

    plt.plot(scores)
    plt.plot(eps_history)
    plt.legend(['score', 'epsilon'], loc='upper left')
    plt.show()
Example #7
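# Trains a DQN agent on a custom 'sky' environment with randomized starting
# points. The hyperparameters (gamma, epsilon, lr, mem_size, batch_size, ...)
# and the checkpoint/plot settings are module-level values defined outside
# this snippet.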
def main():
    scores = []
    eps_history = []
    info_history = []

    # Random starting-points:
    env = sky.make(random=True,
                   xi=(301, 650 - 25),
                   yi=(100, 300 - 25),
                   width=15,
                   height=15,
                   v_initial=14)
    # Fixed starting-point:
    #env = sky.make(xi=550)

    agent = Agent(gamma=gamma,
                  epsilon=epsilon,
                  lr=lr,
                  input_dims=[input_dimensions],
                  n_actions=n_actions,
                  mem_size=mem_size,
                  batch_size=batch_size,
                  epsilon_dec=epsilon_dec)

    if (load_checkpoint):
        agent.load_models()

    for i in range(n_games):
        score = 0
        done = False
        observation = env.reset()
        while not done:
            '''
            one game: ending, when done=True
            '''
            action = agent.choose_action(observation)
            observation_, reward, done, info = env.step(action)
            score += reward
            agent.store_transition(observation, action, reward, observation_,
                                   int(done))
            observation = observation_
            agent.learn()

        if i % 10 == 0 and i > 0:
            avg_score = np.mean(scores[max(0, i - 10):(i + 1)])
            print(i, 'episode', info, '|| score:', score,
                  '| average score: %.3f' % avg_score,
                  '| epsilon: %.3f' % agent.epsilon, '| training done:',
                  round(i / n_games, 2))
        else:
            print(i, 'episode', info, '|| score:', score)

        scores.append(score)
        eps_history.append(agent.epsilon)
        info_history.append(info)

    print('training ended with:',
          [[el, info_history.count(el)] for el in ('crashed', 'goal')])

    if (save_checkpoint):
        agent.save_models()
        print('[+] model saved')

    # -------------------
    # Plotting and output
    # -------------------
    x = [i + 1 for i in range(n_games)]

    # First axis: Scores
    fig, ax1 = plt.subplots()
    color = 'tab:red'
    ax1.set_xlabel('Episode')
    ax1.set_ylabel('score per Episode', color=color)
    ax1.scatter(x, scores, color=color, s=2)
    ax1.tick_params(axis='y', labelcolor=color)

    # Second axis: epsilon
    ax2 = ax1.twinx()  # instantiate a second axes that shares the same x-axis
    color = 'tab:blue'
    ax2.set_ylabel('epsilon',
                   color=color)  # we already handled the x-label with ax1
    ax2.plot(x, eps_history, color=color)
    ax2.tick_params(axis='y', labelcolor=color)

    # Output
    fig.tight_layout()  # otherwise the right y-label is slightly clipped
    plt.savefig(filename)

    return env
Example #8
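            # Fragment of a two-player training loop: each player has its own
            # replay memory; transitions are stored per player, and a player's
            # DQN learns from its own sampled batch once the memory counter
            # exceeds the batch size (only if that player is an 'Agent').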
            # Update memory objects with states for each player
            if train_networks:
                memory_1.store_transition(p1_state, p1_action, p1_reward,
                                          p1_state_, int(done))
                memory_2.store_transition(p2_state, p2_action, p2_reward,
                                          p2_state_, int(done))

            # Train agent DeepQ Networks
            # Assign history values so it doesn't break when mem_cntr is low
            history_1, history_2 = None, None
            if train_networks and memory_1.mem_cntr > batch_size:

                if p1_type == 'Agent':
                    history_1 = agent_1.learn(
                        batch_size, memory_1.sample_buffer(batch_size))

                if p2_type == 'Agent':
                    history_2 = agent_2.learn(
                        batch_size, memory_2.sample_buffer(batch_size))

            # Update state for new step
            p1_state = p1_state_
            p2_state = p2_state_

            p1_tot += p1_reward
            p2_tot += p2_reward

            # Save networks if they are also being trained
            # Moved inside game loop, will save every 1000 frames
            # Games are getting longer and need to be saved in progress