Example #1
def setup_Agent(filename, epsilon):
    """
    Function to initialize the DQN agent
    """
    # one-hot vector (opponent's move) on top of the game board
    input_dims = 7 * 7

    action_space = tuple(range(7))
    n_actions = 7

    h1_dims = 512
    h2_dims = 256

    agent = Agent(lr=0.001,
                  gamma=0.95,
                  epsilon=epsilon,
                  epsilon_dec=0.995,
                  epsilon_min=0.01,
                  input_shape=input_dims,
                  h1_dims=h1_dims,
                  h2_dims=h2_dims,
                  action_space=action_space,
                  training_epochs=2,
                  fname=filename)

    memory = ReplayBuffer(50000, input_dims, n_actions)

    return agent, memory
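
The ReplayBuffer used above is project-specific and not shown in this excerpt. For orientation only, a minimal numpy buffer with the same constructor signature (max_size, input_dims, n_actions) might look like the sketch below; the store_transition and sample_buffer method names are assumptions, not the project's actual API.

import numpy as np

class ReplayBuffer:
    """Minimal sketch of a flat-state replay buffer (assumed API)."""
    def __init__(self, max_size, input_dims, n_actions):
        self.mem_size = max_size
        self.mem_cntr = 0
        self.n_actions = n_actions  # kept only for parity with the call above
        self.state_memory = np.zeros((max_size, input_dims), dtype=np.float32)
        self.new_state_memory = np.zeros((max_size, input_dims), dtype=np.float32)
        self.action_memory = np.zeros(max_size, dtype=np.int64)
        self.reward_memory = np.zeros(max_size, dtype=np.float32)
        self.terminal_memory = np.zeros(max_size, dtype=np.bool_)

    def store_transition(self, state, action, reward, state_, done):
        idx = self.mem_cntr % self.mem_size  # overwrite the oldest entries
        self.state_memory[idx] = state
        self.new_state_memory[idx] = state_
        self.action_memory[idx] = action
        self.reward_memory[idx] = reward
        self.terminal_memory[idx] = done
        self.mem_cntr += 1

    def sample_buffer(self, batch_size):
        max_mem = min(self.mem_cntr, self.mem_size)
        batch = np.random.choice(max_mem, batch_size, replace=False)
        return (self.state_memory[batch], self.action_memory[batch],
                self.reward_memory[batch], self.new_state_memory[batch],
                self.terminal_memory[batch])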
Example #2
def main(env, gamma, epsilon, final_epsilon, final_exp_step, lr, memory_size,
         target_update_freq, gradient_update_freq, batch_size, replay_start,
         val_freq, log_freq_by_step, log_freq_by_ep, val_epsilon, log_dir,
         weight_dir, steps):
    train_env = make_atari(env + "NoFrameskip-v4")
    val_env = make_atari(env + "NoFrameskip-v4", noop=False)

    agent = Agent(train_env,
                  DQN,
                  gamma=gamma,
                  epsilon=epsilon,
                  final_epsilon=final_epsilon,
                  final_exp_step=final_exp_step)
    trainer = Trainer(agent,
                      val_env,
                      lr=lr,
                      memory_size=memory_size,
                      target_update_freq=target_update_freq,
                      gradient_update_freq=gradient_update_freq,
                      batch_size=batch_size,
                      replay_start=replay_start,
                      val_freq=val_freq,
                      log_freq_by_step=log_freq_by_step,
                      log_freq_by_ep=log_freq_by_ep,
                      val_epsilon=val_epsilon,
                      log_dir=log_dir,
                      weight_dir=weight_dir)
    trainer.train(steps)
Example #3
def test_agent():
    print("##################Running agent test##################")
    agent = Agent(state_shape, action_shape)
    state1 = np.array([1,2,3,4]).reshape(-1,4)
    state2 = np.array([2,3,4,5]).reshape(-1,4)
    out1 = agent.model.predict(state1)
    out2 = agent.model.predict(state2)
    print(out1)
    print(out2)
    assert np.all((out1[0]-out2[0])!=0), "Failed agent test - same output for different states"
    print("Agent test passed :)\n\n")
Example #4
File: runDQN.py Project: Anelacu/PySnakeAI
 def main_loop(self):
     agent = Agent()
     numGames = 0
     top = 0
     while numGames < 100:
         food = Food(self.size, self.screen)
         food.food_new()
         snake = Snake(size=self.size)
         while not self.over:
             agent.epsilon = 100 - numGames
             oldState = agent.get_state(snake, food)
             if randint(0, 200) < agent.epsilon:
                 move = to_categorical(randint(0, 2),
                                       num_classes=3,
                                       dtype='int32')
             else:
                 predict = agent.model.predict(oldState.reshape(1, 11))
                 move = to_categorical(np.argmax(predict[0]),
                                       num_classes=3,
                                       dtype='int32')
             if np.array_equal(move, [1, 0, 0]):
                 snake.xVel = 10
                 print('condition1')
             elif np.array_equal(
                     move,
                 [0, 1, 0]) and snake.yVel == 0:  # right - going horizontal
                 snake.yVel = 10
                 print('condition2')
             elif np.array_equal(
                     move,
                 [0, 1, 0]) and snake.xVel == 0:  # right - going vertical
                 snake.xVel = 10
                 print('condition3')
             elif np.array_equal(
                     move,
                 [0, 0, 1]) and snake.yVel == 0:  # left - going horizontal
                 snake.yVel = -10
                 print('condition4')
             elif np.array_equal(
                     move,
                 [0, 0, 1]) and snake.xVel == 0:  # left - going vertical
                 snake.xVel = -10
                 print('condition5')
             snake.snake_move()
             self.check_collisions(snake, food)
             self.update_window(snake, food)
             self.clock.tick(10)
             newState = agent.get_state(snake, food)
             reward = agent.get_reward(self.foodCollide, self.over)
             agent.train_short(oldState, move, reward, newState, self.over)
             agent.write_memory(oldState, move, reward, newState, self.over)
         agent.replay(agent.mem)
         numGames += 1
         print(numGames)
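
For context on the move encoding above: to_categorical turns an action index into a one-hot vector of length 3, so the np.array_equal checks are comparing against the three possible one-hot moves [1, 0, 0], [0, 1, 0] and [0, 0, 1]. A quick illustration (the import path may be keras.utils or tensorflow.keras.utils depending on the project's setup):

from tensorflow.keras.utils import to_categorical

to_categorical(1, num_classes=3, dtype='int32')  # -> array([0, 1, 0], dtype=int32)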
Example #5
def main():
    gym_env = gym.make('custom_gym:Xplane-v0')
    lr = 0.001
    gam = 0.01
    n_games = 1
    # nn_input = obs()
    agent = Agent(learning_rate=lr,
                  gamma=gam,
                  epsilon=1.0,
                  input_dims=(6, ),
                  n_actions=15,
                  batch_size=32,
                  file_name='AI_takeoff/saved_models/dq_model_2.h5')
    scores = []
    total_steps = []
    eps_hist = []
    agent.load_model()

    for i in range(n_games):
        try:
            done = False
            score = 0
            observation = gym_env.reset()
            time.sleep(2)
            observation_checkpoints = np.array([observation[0:2]])
            step_counter = 0
            print("GAME ITERATION ", i)
            while not done:
                action = agent.choose_action(observation)
                new_observation, reward, done = gym_env.step(action)
                step_counter = step_counter + 1
                score = score + reward
                agent.store_transition(observation, action, reward,
                                       new_observation, done)
                observation = new_observation
                # agent.learn()
                # This if statement checks if the airplane is stuck
                observation_checkpoints = np.append(observation_checkpoints,
                                                    [new_observation[0:2]],
                                                    axis=0)
                print(observation_checkpoints)
                print("stepcounter is", step_counter)
                if step_counter % 30 == 0:
                    if np.array_equal(
                            observation_checkpoints[step_counter - 30],
                            observation_checkpoints[step_counter - 1]):
                        done = True
            eps_hist.append(agent.epsilon)
            scores.append(score)
            total_steps.append(step_counter)
        except Exception as e:
            print(str(e))
Example #6
 def __init__(self, fname):
     lr = 0.0005
     self.agent = Agent(gamma=0.99,
                        epsilon=0.0,
                        alpha=lr,
                        input_dims=6,
                        n_actions=2,
                        mem_size=60000,
                        batch_size=64,
                        epsilon_end=0.0,
                        fname=fname)
     self.observation = []
     self.action = 0
     self.n_step = 0
     self.fname = fname.split("/")[-1]
Example #7
def main():
    agent = Agent()
    agent.load()

    total_reward = 0
    obs = env.reset()
    env.render()
    for _ in range(10000):
        act = agent.predict(obs)
        obs, reward, done, _ = env.step(act)
        total_reward += reward
        env.render()
        if done:
            print(f'total_reward: {total_reward}')
            env.close()
            break
Example #8
def main(env_name=None):
    ENV_NAME = 'wumpus-v0'
    
    if env_name: ENV_NAME = env_name

    MODEL_DIR = f'models/{ENV_NAME}-dqn'
    MODEL_FILE = f'{ENV_NAME}-dqn.h5'
    CHECKPOINTS_DIR = f'models/{ENV_NAME}-dqn/checkpoints'
    TEST_IMG_DIR = f'tests/{ENV_NAME}-dqn'

    env = gym.make(ENV_NAME)
    env.reset()

    agent = Agent(learning_rate=0.01, gamma=0.95,
                    state_shape=env.observation_space.shape, actions=7,
                    batch_size=64,
                    epsilon_initial=0.0, epsilon_decay=0, epsilon_final=0.0,
                    replay_buffer_capacity=1000000,
                    model_name=MODEL_FILE, model_dir=MODEL_DIR,
                    ckpt_dir=CHECKPOINTS_DIR)
    agent.load_model()

    done = False
    score = 0
    steps_per_episode = 0
    state = env.reset()
    images = [env.render('rgb_array')]
    while not done:
        # Choose action according to policy, and execute
        action = agent.select_action(state)
        state, reward, done, _ = env.step(action)

        score += reward
        steps_per_episode += 1
        images.append(env.render('rgb_array'))

    # Generate GIF for the execution
    create_gif(
        f'{ENV_NAME}.gif',
        np.array(images),
        fps=1.0
    )

    print(
        f'Model \'{str(ENV_NAME)}\', score {score}, steps {steps_per_episode}')
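
create_gif is a helper from this project and is not part of the excerpt. If an equivalent is needed, a minimal stand-in built on imageio could look roughly like this; the name and signature simply mirror the call above and are assumptions, not the project's code.

import imageio
import numpy as np

def create_gif(filename, frames, fps=1.0):
    """Write a stack of RGB frames (e.g. from env.render('rgb_array')) to a GIF."""
    # imageio v2 accepts an fps keyword for GIFs; newer versions may expect duration instead.
    imageio.mimsave(filename, [np.asarray(f, dtype=np.uint8) for f in frames], fps=fps)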
Example #9
def start():
    env = gym.make('CartPole-v0')

    params = {
        'gamma': 0.8,
        'epsi_high': 0.9,
        'epsi_low': 0.05,
        'decay': 500,
        'lr': 0.001,
        'capacity': 10000,
        'batch_size': 64,
        'state_space_dim': env.observation_space.shape[0],
        'action_space_dim': env.action_space.n
    }
    agent = Agent(**params)

    score = []
    mean = []

    for episode in range(1000):
        s0 = env.reset()
        total_reward = 1
        for i in range(200):
            env.render()
            a0 = agent.act(s0)
            s1, r1, done, _ = env.step(a0)

            if done:
                r1 = -1

            agent.put(s0, a0, r1, s1)

            if done:
                break

            total_reward += r1
            s0 = s1
            agent.learn()

        score.append(total_reward)
        mean.append(sum(score[-100:]) / 100)
        print(total_reward)
Example #10
def main():
    #make env and agent
    env = gym.make('LunarLander-v2')
    agent = Agent(gamma=0.99,
                  epsilon=1.0,
                  batch_size=64,
                  n_actions=4,
                  eps_end=0.01,
                  input_dims=[8],
                  lr=0.0001)

    scores, eps_history = [], []
    n_games = 500

    for i in range(n_games):
        score = 0
        done = False
        observation = env.reset()
        while not done:
            #ingame
            #get action from current view of game (observation)
            action = agent.choose_action(observation)
            #next frame
            observation_, reward, done, info = env.step(action)

            score += reward
            #store memory
            agent.store_transisation(observation, action, reward, observation_,
                                     done)
            agent.learn()

            #set next stage to current stage
            observation = observation_
        #append score and eps
        scores.append(score)
        eps_history.append(agent.epsilon)

        #print some nice statements
        avg_score = np.mean(scores[-100:])
        print(
            f'Episode: {i}   Score: {score}   Average Score: {avg_score}   Epsilon: {agent.epsilon}'
        )
Example #11
 def __init__(self):
     self._load_config()
     # Control parameter used to scale bid price
     self.BETA = [-0.08, -0.03, -0.01, 0, 0.01, 0.03, 0.08]
     self.eps_start = 0.95
     self.eps_end = 0.05
     self.anneal = 0.00005
     self._reset_episode()
     # DQN Network to learn Q function
     self.dqn_agent = Agent(state_size=7, action_size=7, seed=0)
     # Reward network to learn the reward function
     self.reward_net = RewardNet(state_action_size=8, reward_size=1, seed=0)
     self.dqn_state = None
     self.dqn_action = 3  # no scaling
     self.dqn_reward = 0
     # Reward-Dictionary
     self.reward_dict = {}
     self.S = []
     self.V = 0
     self.total_wins = 0
     self.total_rewards = 0.0
Example #12
def OldStuff():
    tf.compat.v1.disable_eager_execution()

    lr = 0.001
    numGames = 10000

    session = TriadGameSession()
    observation = session.getState()
    scores = []

    agent = Agent(gamma=0.99,
                  lr=lr,
                  epsilon=1.0,
                  epsilonDec=0.0005,
                  inputSize=[len(observation)],
                  numActions=session.getMaxActions(),
                  memSize=1000000,
                  batchSize=1024)

    for i in range(numGames):
        done = False
        score = 0
        session = TriadGameSession()
        observation = session.getState()
        while not done:
            action = agent.chooseAction(observation)
            observationNext, reward, done = session.step(action)
            score += reward
            agent.store(observation, action, reward, observationNext, done)
            observation = observationNext
            agent.learn()

        scores.append(score)
        avgScore = np.mean(scores[-100:])
        print('game:', i, 'score %.2f' % score, 'avgScore %.2f' % avgScore,
              'epsilon %.2f' % agent.epsilon)

    #agent.save()
    print('Finished!')
Example #13
def setup_Agent(filename, epsilon):
    """
    Function to initialize the DQN agent
    """
    input_dims = 6 * 7
    action_space = tuple(range(7))
    n_actions = 7

    h1_dims = 512
    h2_dims = 256

    agent = Agent(lr=0.001,
                  gamma=0.95,
                  epsilon=epsilon,
                  epsilon_dec=0.995,
                  epsilon_min=0.01,
                  input_shape=input_dims,
                  h1_dims=h1_dims,
                  h2_dims=h2_dims,
                  action_space=action_space,
                  training_epochs=1,
                  fname=filename)

    return agent
Example #14
File: example.py Project: Exception4U/dqn
import sys
import gym
from dqn import Agent

num_episodes = 5000

env_name = sys.argv[1] if len(sys.argv) > 1 else "MsPacman-v0"
env = gym.make(env_name)

agent = Agent(state_size=env.observation_space.shape,
              number_of_actions=env.action_space.n,
              save_name=env_name)

for e in range(num_episodes):
    observation = env.reset()
    done = False
    agent.new_episode()
    total_cost = 0.0
    total_reward = 0.0
    frame = 0
    while not done:
        frame += 1
        #env.render()
        action, values = agent.act(observation)
        #action = env.action_space.sample()
        observation, reward, done, info = env.step(action)
        total_cost += agent.observe(reward)
        total_reward += reward
    print "total reward", total_reward
    print "mean cost", total_cost / frame
Example #15
File: main.py Project: neesetifa/Malmo
# ------------------------------ Variable Declaration ----------------------------------
NUM_OF_ZOMBIES = 1
NUM_OF_VILLAGERS = 1
agent_host = MalmoPython.AgentHost()
malmoutils.parse_command_line(agent_host)
validate = True

num_reps = 300

#=======core part initialization====================================
#input size 5*5, you can change the size here
memory = MemoryD(5)
network_model, q_values_func = nn_model(input_shape=[5, 5])

agent = Agent(network_model, q_values_func, memory, 'train', 'ddqn')
#set learning rate to be 0.00025
agent.do_compile(optimizer=Adam(lr=0.00025), loss_func=mean_huber_loss)
agent.memoryD.clear()
#===================================================================

for iRepeat in range(num_reps):
    my_mission_record = malmoutils.get_default_recording_object(
        agent_host, "./Mission_{}".format(iRepeat + 1))
    #my_mission_record = MalmoPython.MissionRecordSpec('./' + "Mission_" + str(iRepeat) + ".tgz")
    #my_mission_record.recordRewards()
    #my_mission_record.recordMP4(24,400000)
    #my_mission_record.recordObservations()
    my_mission = MalmoPython.MissionSpec(GetMissionXML(mapblock, agent_host),
                                         validate)
Example #16
scores = []
epsHistory = []
numGames = 1
batch_size = 32
n_actions = 6
input_dims = (185, 95)
crop_start = (15, 30)
crop_end = (200, 125)
starting_epsilon = 0.05 if LOAD_MODEL else 1.0

env = gym.make('SpaceInvaders-v0')
brain = Agent(gamma=0.95,
              epsilon=starting_epsilon,
              lr=0.003,
              input_dims=input_dims,
              batch_size=batch_size,
              n_actions=n_actions,
              max_mem_size=5000,
              save_path='models/')

if LOAD_MODEL:
    brain.load()
else:
    # load memory with random games
    while brain.mem_cntr < brain.mem_size:
        observation = env.reset()
        observation = preprocess(observation, crop_start, crop_end)
        done = False
        while not done:
            # 0 no action, 1 fire, 2 move right, 3 move left, 4 move right fire, 5 move left fire
            action = env.action_space.sample()
Example #17
File: main.py Project: purutu/FFTriadBuddy
from triadgame import TriadGameSession
import numpy as np
import tensorflow as tf

tf.compat.v1.disable_eager_execution()

lr = 0.001
numGames = 10000

session = TriadGameSession()
observation = session.getState()
scores = []

agent = Agent(gamma=0.99, lr=lr, epsilon=1.0, epsilonDec=0.0005,
              inputSize=[len(observation)],
              numActions=session.getMaxActions(),
              memSize=1000000,
              batchSize=64)

for i in range(numGames):
    done = False
    score = 0
    session = TriadGameSession()
    observation = session.getState()
    while not done:
        action = agent.chooseAction(observation)
        observationNext, reward, done = session.step(action)
        score += reward
        agent.store(observation, action, reward, observationNext, done)
        observation = observationNext
        agent.learn()
Example #18
def train(path, env):
    #env = Monitor(env, path, video_callable=video_callable, force=True)
    agent = Agent(env, path=path)
    agent.train()
    return agent
Example #19
if __name__ == '__main__':

    env = gym.make('CartPole-v0')

    params = {
        'gamma': 0.8,
        'epsi_high': 0.9,
        'epsi_low': 0.05,
        'decay': 200,
        'lr': 0.001,
        'capacity': 10000,
        'batch_size': 64,
        'state_space_dim': env.observation_space.shape[0],
        'action_space_dim': env.action_space.n
    }
    agent = Agent(**params)

    score = []
    mean = []

    for episode in range(1000):
        s0 = env.reset()
        total_reward = 1
        while True:
            env.render()
            a0 = agent.act(s0)
            s1, r1, done, _ = env.step(a0)

            if done:
                r1 = -1
Example #20
    env = make_env('BreakoutNoFrameskip-v4')
    #env = make_env('SpaceInvadersNoFrameskip-v4')

    test_rewards, test_qvalue, test_times = [], [], []
    scores, eps_history = [], []
    num_games = 10_000
    number_of_tests = 30  # Number of tests to run
    n_steps, n_test = 0, 1  # Step counter and individual-test counter
    n_test_instance = 1  # Test-instance counter
    test_every_frames = 520_000  # Run a test every n frames
    load_checkpoint = False  # Load model (?)
    render = False

    agent = Agent(gamma=0.99,
                  epsilon=1.0,
                  alpha=0.00025,
                  input_dims=env.observation_space.shape,
                  n_actions=env.action_space.n,
                  mem_size=200_000,
                  eps_min=0.1,
                  batch_size=32,
                  replace=10_000,
                  eps_dec=1e-5,
                  save_name='dqn_model',
                  load_name='dqn_model_5000it.h5')

    if load_checkpoint:
        agent.epsilon = 0.1
        agent.load_models()

    last_ep = 0
    for episode in tqdm(range(num_games)):
Example #21
def main():
    # Initialize environment, agent
    env = gym.make(ENV_NAME)
    summary_writer = tf.summary.create_file_writer(LOG_DIR)
    agent = Agent(learning_rate=0.01, gamma=0.95,
                  state_shape=env.observation_space.shape, actions=7,
                  batch_size=64,
                  epsilon_initial=0.9, epsilon_decay=1e-6, epsilon_final=0.01,
                  replay_buffer_capacity=1000000,
                  model_name=MODEL_FILE, model_dir=MODEL_DIR,
                  ckpt_dir=CHECKPOINTS_DIR, log_dir=LOG_DIR)

    scores = []
    for i in range(1, EPISODES + 1):
        done = False
        score = 0
        state = env.reset()
        steps_per_episode = 0

        # Play one episode
        while not done:
            # Choose action (epsilon greedy), and execute
            action = agent.select_action(state)
            next_state, reward, done, _ = env.step(action)
            score += reward

            # Store in experience replay buffer
            agent.store_experience(state, action,
                                   reward, next_state, done)
            state = next_state
            agent.train()
            steps_per_episode += 1
        if len(scores) == 100:
            scores.pop(0)
        scores.append(score)

        avg_score = np.mean(scores)
        min_score = np.min(scores)
        max_score = np.max(scores)

        print(
            f'Episode: {i}, Score {score:.2f}, Avg_score {avg_score:.2f}, Epsilon {agent.epsilon:.2f}')

        # Summaries for Tensorboard
        write_summaries(summary_writer, {
            'epsilon': agent.epsilon,
            'reward.episode': score,
            'reward.avg': avg_score,
            'reward.min': min_score,
            'reward.max': max_score,
            'steps.count': steps_per_episode
        }, i, ENV_NAME)

        # Save the model
        if i % SAVE_INTERVAL == 0:
            print(f'Saving model to \'{MODEL_FILE}\' [Overwriting]')
            agent.save_model()

        # Save checkpoint
        if i % CHECKPOINT_INTERVAL == 0:
            print(f'Adding checkpoint: \'{CHECKPOINTS_DIR}/episode-{i}.h5\'')
            agent.save_checkpoint(f'episode-{i}')
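
write_summaries is another project helper not shown here. Given the tf.summary file writer created at the top of main, a rough sketch of such a helper could look like the following; the per-environment metric prefix is an assumption.

import tensorflow as tf

def write_summaries(writer, metrics, step, env_name):
    """Log a dict of scalar metrics to TensorBoard at the given step."""
    with writer.as_default():
        for name, value in metrics.items():
            tf.summary.scalar(f'{env_name}/{name}', value, step=step)
        writer.flush()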
Example #22
import numpy as np
from dqn import Agent
from utils import plotLearning, make_env

if __name__ == '__main__':    
    env = make_env('PongNoFrameskip-v4')

    num_games = 500
    load_checkpoint = False
    best_score = -21
    agent = Agent(gamma=0.99, epsilon=1.0, alpha=0.0001,
                  input_dims=(4,80,80), n_actions=6, mem_size=25000,
                  eps_min=0.02, batch_size=32, replace=1000, eps_dec=1e-5)

    if load_checkpoint:
        agent.load_models()

    filename = 'PongNoFrameskip-v4.png'

    scores, eps_history = [], []
    n_steps = 0

    for i in range(num_games):
        done = False
        observation = env.reset()
        score = 0
        while not done:
            action = agent.choose_action(observation)
            observation_, reward, done, info = env.step(action)
            n_steps += 1
            score += reward
Example #23
def main():
    scores = []
    eps_history = []
    info_history = []

    # Random starting-points:
    env = sky.make(random=True,
                   xi=(301, 650 - 25),
                   yi=(100, 300 - 25),
                   width=15,
                   height=15,
                   v_initial=14)
    # Fixed starting-point:
    #env = sky.make(xi=550)

    agent = Agent(gamma=gamma,
                  epsilon=epsilon,
                  lr=lr,
                  input_dims=[imput_dimensions],
                  n_actions=n_actions,
                  mem_size=mem_size,
                  batch_size=batch_size,
                  epsilon_dec=epsilon_dec)

    if (load_checkpoint):
        agent.load_models()

    for i in range(n_games):
        score = 0
        done = False
        observation = env.reset()
        while not done:
            '''
            one game: ending, when done=True
            '''
            action = agent.choose_action(observation)
            observation_, reward, done, info = env.step(action)
            score += reward
            agent.store_transition(observation, action, reward, observation_,
                                   int(done))
            observation = observation_
            agent.learn()

        if i % 10 == 0 and i > 0:
            avg_score = np.mean(scores[max(0, i - 10):(i + 1)])
            print(i, 'episode', info, '|| score:', score,
                  '| average score: %.3f' % avg_score,
                  '| epsilon: %.3f' % agent.epsilon, '| training done:',
                  round(i / n_games, 2))
        else:
            print(i, 'episode', info, '|| score:', score)

        scores.append(score)
        eps_history.append(agent.epsilon)
        info_history.append(info)

    print('training ended with:',
          [[el, info_history.count(el)] for el in ('crashed', 'goal')])

    if (save_checkpoint):
        agent.save_models()
        print('[+] model saved')

    # -------------------
    # Plotting and output
    # -------------------
    x = [i + 1 for i in range(n_games)]

    # First axis: Scores
    fig, ax1 = plt.subplots()
    color = 'tab:red'
    ax1.set_xlabel('Episode')
    ax1.set_ylabel('score per Episode', color=color)
    ax1.scatter(x, scores, color=color, s=2)
    ax1.tick_params(axis='y', labelcolor=color)

    # Second axis: epsilon
    ax2 = ax1.twinx()  # instantiate a second axes that shares the same x-axis
    color = 'tab:blue'
    ax2.set_ylabel('epsilon',
                   color=color)  # we already handled the x-label with ax1
    ax2.plot(x, eps_history, color=color)
    ax2.tick_params(axis='y', labelcolor=color)

    # Output
    fig.tight_layout()  # otherwise the right y-label is slightly clipped
    plt.savefig(filename)

    return env
Example #24
    out2 = agent.model.predict(state2)
    print(out1)
    print(out2)
    assert np.all((out1[0]-out2[0])!=0), "Failed agent test - same output for different states"
    print("Agent test passed :)\n\n")

state_shape=env.observation_space.shape # the state space
action_shape=env.action_space.n # the action space

#Testing Memory storage and sample
state = env.reset()
test_mem()
test_agent()

mem = Memory(10000, state_shape)
agent = Agent(state_shape, action_shape)

epsilon = 1
batch_size = 64

#action = env.action_space.sample()
#next_state, reward, done, info = env.step(action)

for game in range(n_games):

    state = env.reset()
    game_reward = 0

    for step in range(max_steps):
        #Render game
        if game % 10 == 0:
Example #25
def main(argv):
	# Set seeds
	np.random.seed(FLAGS.seed)
	t.manual_seed(FLAGS.seed)

	# Create logfile
	f = create_exp_logfile(os.path.join(FLAGS.exp_log_dir, str(FLAGS.learning_rate), str(FLAGS.seed)))

	# Initialise agent and environment
	env = LunarLander()
	num_actions = env.num_actions()
	agent = Agent(body_type='ff', 
				  obs_num_features_or_obs_in_channels=FLAGS.observation_dimensions, 
				  fc_hidden_layer_size = FLAGS.fc_hidden_layer_size, 
				  output_actions = num_actions, 
				  use_target_net = FLAGS.use_target_net,
				  g = FLAGS.gamma, 
				  lr = FLAGS.learning_rate)

	# Initialise data structures
	c_buf = CircularBuffer(size=FLAGS.cb_size)
	er_buf = ExperienceReplayBuffer(size=FLAGS.er_size, batch_size=FLAGS.batch_size)

	# Initialise sampling range for e-greedy
	interval = t.distributions.uniform.Uniform(t.tensor([0.0]), t.tensor([1.0]))

	# Run
	step = 0
	episode_results = []
	state = env.reset()
	c_buf.append(t.from_numpy(state).float())

	while step < FLAGS.max_steps:
		# Agent select action
		eps = max(FLAGS.init_epsilon - (((FLAGS.init_epsilon - FLAGS.final_epsilon) / FLAGS.epsilon_anneal) * step), FLAGS.final_epsilon)

		if interval.sample() <= eps:
			action = np.random.randint(num_actions)
		else:
			action = agent.greedy_action(c_buf()).item()
		reward, next_state, terminal = env.act(action)
		terminal = 1 if terminal else 0

		er_buf.append(state, action, reward, next_state, terminal)
		state = next_state
		c_buf.append(t.from_numpy(state).float())

		if step > FLAGS.batch_size and step % FLAGS.update_frequency:
			batch_states, batch_actions, batch_rewards, batch_next_states, batch_terminals = \
																				er_buf.sample()
			batch_states = t.from_numpy(batch_states).float()
			batch_actions = np.array(batch_actions)
			batch_rewards = np.array(batch_rewards)
			batch_next_states = t.from_numpy(batch_next_states).float()
			batch_terminals = np.array(batch_terminals)

			agent.optimise(batch_states, batch_actions, batch_rewards, batch_next_states, batch_terminals)

		if step % FLAGS.target_network_update == 0:
			agent.sync()

		if terminal:
			episode_results.append(env.episode_return())
			state = env.reset()
			
		step += 1
	
		if step % FLAGS.evaluate == 0:
			f.write('{}, {}\n'.format(step, performance_avg(episode_results, FLAGS.num_episodes_average)))
			f.flush()

	f.close()
Example #26
def run(env: LlvmEnv) -> None:

    agent = Agent(n_actions=15, input_dims=[69])
    env.observation_space = "InstCountNorm"
    agent.Q_eval.load_state_dict(torch.load("./H10-N4000-INSTCOUNTNORM.pth"))
    rollout(agent, env)
Example #27
import gym
from dqn import DeepQNetwork, Agent
import numpy as np
from gym import wrappers

if __name__ == '__main__':
    env = gym.make('LunarLander-v2')
    brain = Agent(gamma=0.99,
                  epsilon=1.0,
                  n_actions=4,
                  batch_size=128,
                  input_dims=[8],
                  alpha=0.0003,
                  replace=64)

    scores = []
    eps_history = []
    num_games = 500
    score = 0

    for i in range(num_games):
        if i % 10 == 0 and i > 0:
            avg_score = np.mean(scores[max(0, i - 10):(i + 1)])
            print('episode: ', i, 'score: ', score,
                  ' average score %.3f' % avg_score,
                  'epsilon %.3f' % brain.EPSILON)
        else:
            print('episode: ', i, 'score: ', score)
        eps_history.append(brain.EPSILON)
        done = False
        observation = env.reset()
Example #28
from dqn import Agent
import numpy as np
import gym
import matplotlib.pyplot as plt


if __name__ == '__main__':
    env = gym.make('LunarLander-v2')
    n_games = 300
    show = False
    agent = Agent(gamma=0.99, epsilon=1.0, alpha=0.0005, input_dims=8,
                n_actions=4, batch_size=64)

    scores = []
    eps_history = []

    for i in range(1, n_games+1):
        done = False
        score = 0
        observation = env.reset()
        while not done:
            if show:
                env.render()
            action = agent.choose_action(observation)
            observation_, reward, done, info = env.step(action)
            score += reward
            agent.remember(observation, action, reward, observation_, done)
            observation = observation_
            agent.learn()

        eps_history.append(agent.epsilon)
Example #29
import gym
from keras.models import load_model
from dqn import Agent

env_name = 'CartPole-v0'
eps = 0.8
episodes = 5
env = gym.make(env_name)
model = load_model('./model/my_model.h5')
agent = Agent(env)

for episode in range(episodes):
    # initial state
    s = env.reset()

    done = False
    while not done:
        for i in range(50):
            a = agent.act(s, eps)
            env.render()
            s2, r, done, info = env.step(a)
            s = s2
env.close()
Example #30
import gym
from dqn import Agent
from utils import PlotLearning
import numpy as np

if __name__ == '__main__':
    env = gym.make('LunarLander-v2')
    agent = Agent(gamma=0.99,
                  epsilon=1.0,
                  batch_size=64,
                  n_actions=4,
                  eps_end=0.01,
                  inp_dims=[8],
                  lr=0.001)
    scores, eps_history = [], []
    n_games = 500

    for i in range(n_games):
        score = 0
        done = False
        observation = env.reset()
        while not done:
            action = agent.choose_action(observation)
            observation_, reward, done, info = env.step(action)
            score += reward
            agent.store_transition(observation, action, reward, observation_,
                                   done)
            agent.learn()
            observation = observation_
        scores.append(score)
        eps_history.append(agent.epsilon)
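
The excerpt ends before the imported PlotLearning helper is used. A bare-bones stand-in that plots the collected scores and eps_history on twin axes, in the same spirit as the matplotlib code in Example #23, might look like this (purely illustrative, not the project's utils.PlotLearning):

import matplotlib.pyplot as plt

def plot_learning(x, scores, eps_history, filename):
    """Plot score per episode on the left axis and epsilon on the right axis."""
    fig, ax1 = plt.subplots()
    ax1.set_xlabel('Episode')
    ax1.set_ylabel('Score', color='tab:red')
    ax1.scatter(x, scores, color='tab:red', s=2)
    ax2 = ax1.twinx()
    ax2.set_ylabel('Epsilon', color='tab:blue')
    ax2.plot(x, eps_history, color='tab:blue')
    fig.tight_layout()
    plt.savefig(filename)

# Usage after the training loop above (filename is illustrative):
# plot_learning(list(range(1, n_games + 1)), scores, eps_history, 'lunarlander_dqn.png')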