Example #1
def load_agent(dim_state,num_action,step):

    # Load configuration.
    Cfg_filename = CFG_PATH + 'Cfg_' + TRAINING_DATE + '.json'
    with open(Cfg_filename, 'r') as cfg:
        agent_cfg = json.load(cfg)[2]
 
    # Disable epsilon-greedy exploration for validation.
    agent_cfg['epsilon'] = 0

    # Create agent
    agent = DQNAgent(dim_state, num_action, agent_cfg)
    weights_agent = agent.critic.get_weights()
    num_weights = len(weights_agent)

    # Load weights
    Train_folder = CFG_PATH + TRAINING_DATE + '/'
    list_w = []
    for k in range(num_weights):
        W = np.loadtxt(Train_folder + 'Weight_' + str(k) + '_Step_' + str(step) + '.dat')
        list_w.append(W)

    agent.critic.set_weights(list_w)

    return agent
Example #2
def load_agent(filename, path_training, dim_state, num_action, step):

    # Load configuration.
    with open(filename, 'r') as cfg:
        agent_cfg = json.load(cfg)[2]

    # Disable epsilon-greedy exploration for validation.
    agent_cfg['epsilon'] = 0

    # Create agent
    agent = DQNAgent(dim_state, num_action, agent_cfg)
    weights_agent = agent.critic.get_weights()
    num_weights = len(weights_agent)

    # Load weights
    list_w = []
    for k in range(num_weights):
        W = np.loadtxt(path_training + 'Weight_' + str(k) + '_Step_' +
                       str(step) + '.dat')
        list_w.append(W)

    agent.critic.set_weights(list_w)

    return agent
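
Examples #1 and #2 rebuild the critic's weights from per-layer text files named 'Weight_<k>_Step_<step>.dat'. For reference, a minimal sketch of the complementary save step that would produce such files; the helper name save_agent_weights is hypothetical, but np.savetxt is the natural counterpart of the np.loadtxt calls used above.

import numpy as np

def save_agent_weights(agent, path_training, step):
    # Hypothetical counterpart to load_agent: write each critic weight
    # array (kernels and biases) to its own plain-text .dat file.
    for k, W in enumerate(agent.critic.get_weights()):
        np.savetxt(path_training + 'Weight_' + str(k) + '_Step_' +
                   str(step) + '.dat', W)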
Example #3
# Instantiating the agent
memory_size = 3000
state_size = len(state)
gamma = 0.96
epsilon_min = 0.01
batch_size = 64
action_size = len(SpreadTrading._actions)
train_interval = 10
learning_rate = 0.001

if not os.path.isfile("./model." + market + ".h5"):
    agent = DQNAgent(state_size=state_size,
                     action_size=action_size,
                     memory_size=memory_size,
                     episodes=episodes,
                     episode_length=episode_length,
                     train_interval=train_interval,
                     gamma=gamma,
                     learning_rate=learning_rate,
                     batch_size=batch_size,
                     epsilon_min=epsilon_min)

    # Warming up the agent
    for _ in range(memory_size):
        action = agent.act(state)
        next_state, reward, done, _ = environment.step(action)
        agent.observe(state, action, reward, next_state, done, warming_up=True)
    # Training the agent
    for ep in range(episodes):
        state = environment.reset()
        rew = np.float64(0)
        for _ in range(episode_length):
Example #4
# Instantiating the agent
memory_size = 3000
state_size = len(state)
gamma = 0.96
epsilon_min = 0.01
batch_size = 64
action_size = len(environment._actions)
train_interval = 10
learning_rate = 0.001

agent = DQNAgent(state_size=state_size,
                 action_size=action_size,
                 memory_size=memory_size,
                 episodes=episodes,
                 episode_length=episode_length,
                 train_interval=train_interval,
                 gamma=gamma,
                 learning_rate=learning_rate,
                 batch_size=batch_size,
                 epsilon_min=epsilon_min)
agent.brain = model

done = False
actions = ['buy', 'hold', 'sell']

while not done:
    action = agent.act(state)
    state, _, done, info = environment.step(action)
    if 'status' in info and info['status'] == 'Closed plot':
        done = True
    balance_X = api.balance()[market[3:]]
Example #5
    # 4. Configuration of agent in DQL. ------------------------------------
    max_memory = 500        
    gamma = 0
    per_step_eps = 0.5   
    epsilon = 1  
    epsilon_min = 0.1 
    epsilon_decay = epsilon_min ** (1/(per_step_eps*num_episodes*num_steps))    
    learning_rate = 0.0001    

    agent_cfg = {'max_memory':max_memory, 'gamma':gamma, 'epsilon': epsilon,
                    'epsilon_min':epsilon_min, 'epsilon_decay':epsilon_decay, 
                    'learning_rate':learning_rate}

    # Initialization of Agent
    agent = DQNAgent(dim_state, num_action, agent_cfg)
    # ----------------------------------------------------------------------

    # 5. Reward function Parameters ----------------------------------------
    psd_bands = [4, 10, 20, 100, 200]          # [Hz]. Frequency ranges for power calculation. (Check that using np.array does not change the result.)
    plv_bands = [[1, 19], [20, 200]]           # [Hz]. Frequency bands of interest for PLV computation.

    method = 'LfPlvAmpExp'
    constants = [1, -50, -3, -0.25]
    reward_cfg = {'fs':fs, 'psd_bands':psd_bands, 'plv_bands':plv_bands,'constants':constants, 'method':method}

    # Save all configurations 
    save_configurations(CFG_FILENAME, bgtc_cfg, algorithm_cfg, agent_cfg, agent.critic, reward_cfg)

    # ----------------------------------------------------------------------
    # ----------------------------------------------------------------------
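
In Example #5, epsilon_decay is chosen so that multiplying epsilon by it once per step brings epsilon from 1 down to epsilon_min after per_step_eps of all training steps. A self-contained check of that relationship (num_episodes and num_steps are assumed values, not taken from the snippet):

per_step_eps, epsilon_min = 0.5, 0.1
num_episodes, num_steps = 100, 1000      # assumed for illustration only
decay_steps = int(per_step_eps * num_episodes * num_steps)
epsilon_decay = epsilon_min ** (1 / decay_steps)

epsilon = 1.0
for _ in range(decay_steps):
    epsilon *= epsilon_decay
print(round(epsilon, 6))                 # ~0.1, i.e. epsilon_min is reached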
Example #6
from collections import deque

import matplotlib.pyplot as plt

env = UnityEnvironment(file_name="Banana_Windows_x86_64/Banana.exe")
brain_name = env.brain_names[0]
brain = env.brains[brain_name]
observation_state_size = brain.vector_observation_space_size
action_space_size = brain.vector_action_space_size

epsilon = 1.0
eps_decay = 0.99
eps_min = 0.001
gamma = 0.99
training_interval = 4

from dqnagent import DQNAgent
agent = DQNAgent(observation_state_size, action_space_size)
scores = []
last_hundred_scores = deque(maxlen=100)
for episode in range(0, 1000):
    env_info = env.reset(train_mode=True)[brain_name]  # reset the environment
    state = env_info.vector_observations[0]
    # get the current state
    score = 0  # initialize the score
    epsilon = max(epsilon * eps_decay, eps_min)
    t = 0
    while (True):
        t += 1
        action = agent.select_action(state, epsilon)
        env_info = env.step(action)[
            brain_name]  # send the action to the environment
        next_state = env_info.vector_observations[0]  # get the next state
Example #7
import torch

env = UnityEnvironment(file_name="Banana_Windows_x86_64/Banana.exe")
brain_name = env.brain_names[0]
brain = env.brains[brain_name]
observation_state_size = brain.vector_observation_space_size
action_space_size = brain.vector_action_space_size

epsilon = 0
eps_decay = 0.99
eps_min = 0.001
gamma = 0.99
training_interval = 4

from dqnagent import DQNAgent

agent = DQNAgent(observation_state_size, action_space_size)
agent.network1.load_state_dict(torch.load('checkpoint4.pth'))
for episode in range(0, 2):
    env_info = env.reset(train_mode=False)[brain_name]  # reset the environment
    state = env_info.vector_observations[0]
    # get the current state
    score = 0  # initialize the score
    while (True):
        action = agent.select_action(state, 0)
        env_info = env.step(action)[
            brain_name]  # send the action to the environment
        next_state = env_info.vector_observations[0]  # get the next state
        reward = env_info.rewards[0]  # get the reward
        done = env_info.local_done[0]  # see if episode has finished
        score += reward  # update the score
        state = next_state  # roll over the state to next time step
Example #8
dqn_online.to(device)
dqn_target.to(device)
# optimizer = torch.optim.RMSprop(dqn_online.parameters(), lr=LR, momentum=0.95, eps=0.01) # paper used rmsprop
optimizer = torch.optim.Adam(dqn_online.parameters(), lr=LR)
if CKPT_ENABLED and os.path.exists(CKPT_FILENAME):
    progress = load_checkpoint(dqn_online, dqn_target, optimizer,
                               CKPT_FILENAME)
else:
    progress = []

dqn_target.eval()
mem_buffer = ReplayMemory(MEMORY_SIZE, STATE_SHAPE)

loss_fn = torch.nn.SmoothL1Loss()  # huber loss function
agent = DQNAgent(device, mem_buffer, dqn_online, dqn_target, optimizer,
                 loss_fn, GAMMA, BATCH_SIZE, UPDATE_ONLINE_INTERVAL,
                 UPDATE_TARGET_INTERVAL)

# training phase

# adjust these hyperparameters as necessary
num_episodes = 5000  # number of episodes to train for
explore_phase_length = 50000  # number of steps without any exploitation (paper used 50k)
epsilon = 1.0  # initial epsilon value (paper used 1.0)
epsilon_decrement_steps = 1000000  # how many steps to decrement epsilon to min value (paper used 1 million)
intermediate_epsilon = 0.1  # can be used to decay epsilon in two phases as recommended by openai (set equal to min_epsilon to disable)
min_epsilon = 0.01  # smallest possible value of epsilon (paper used 0.1 for dqn, 0.01 for ddqn)
epsilon_dec = (epsilon - intermediate_epsilon) / epsilon_decrement_steps
final_epsilon_decay = (intermediate_epsilon -
                       min_epsilon) / epsilon_decrement_steps
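
Example #8 stops at the hyperparameters of a two-phase linear epsilon schedule; the update rule itself is not part of the snippet. A minimal sketch of the per-step update those names suggest, with defaults mirroring the values above (an illustration under that assumption, not the repository's actual training loop):

def decay_epsilon(epsilon, step,
                  explore_phase_length=50000,
                  intermediate_epsilon=0.1,
                  min_epsilon=0.01,
                  epsilon_dec=(1.0 - 0.1) / 1_000_000,
                  final_epsilon_decay=(0.1 - 0.01) / 1_000_000):
    # Hold epsilon constant during the pure-exploration phase, then decay
    # linearly, first to intermediate_epsilon and then to min_epsilon.
    if step < explore_phase_length:
        return epsilon
    if epsilon > intermediate_epsilon:
        return max(epsilon - epsilon_dec, intermediate_epsilon)
    return max(epsilon - final_epsilon_decay, min_epsilon)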
Example #9
import os
import sys

sys.path.append(os.path.join(os.path.dirname(__file__), "..", "lib"))

import tictactoe.gym as gym
import tictactoe.agent as agent
import tictactoe.test_utils as utils

import tensorflow as tf

from tictactoe_dqn import TicTacToeDQN
from dqnagent import DQNAgent

env = gym.getEnv()
dqn1 = TicTacToeDQN()
dqn2 = TicTacToeDQN()
agent1 = DQNAgent(1, dqn1)
agent2 = DQNAgent(-1, dqn2)

sess = tf.Session()
sess.run(tf.global_variables_initializer())
dqn1.set_session(sess)
dqn2.set_session(sess)
dqn1.load("./models/dqn1.ckpt")
dqn2.load("./models/dqn2.ckpt")

dual = agent.DualAgent(agent1, agent2)

print(utils.play(env, dual, render=True))
print(utils.test_player1(env, agent1))
print(utils.test_player2(env, agent2))
Example #10
def main(_):
    flags = tf.app.flags.FLAGS

    # set up an environment
    env = gym.make(config.environment)

    if config.record:
        env = gym.wrappers.Monitor(env, config.record_path, force=True)

    # set up an agent
    if config.continuous_action:
        agent = A3CAgent(env,
                         PiVNetwork,
                         history_length=1,
                         log_dir=config.log_dir)
    else:
        agent = DQNAgent(env,
                         QNetwork,
                         minibatch_size_limit=32,
                         replay_memory_size=1000000,
                         history_length=1,
                         target_update_step=200,
                         discount_factor=0.99,
                         learning_rate=0.0025,
                         initial_exploration=1.0,
                         final_exploration=0.01,
                         final_exploration_frame=10000,
                         replay_start_size=100,
                         log_dir=config.log_dir)

    print('Observation space: ', env.observation_space, 'Action space: ',
          env.action_space)

    if config.restore_model_path:
        agent.restore_variables(config.restore_model_path)

    total_frames = 0

    # training
    for episode in range(1, config.max_episodes + 1):
        terminal = False
        total_reward = 0
        frames = 0

        while not terminal:
            if config.train:
                a, s, r_t, terminal, info = agent.act_and_train()
            else:
                a, s, r_t, terminal, info = agent.act()
            if config.render:
                env.render()
            total_reward += r_t
            frames += 1
            total_frames += 1

        if episode % config.interval_to_save_model == 0:
            agent.save_variables(episode, config.save_model_dir)

        #print('Episode: ', episode, ' Frames: ', total_frames, ' R: ', total_reward, ' Epsilon: ', info['epsilon'])
        print('Episode: ', episode, ' Frames: ', total_frames, ' R: ',
              total_reward)
        agent.write_summary(episode, total_reward)

        agent.new_episode()
Example #11
def train():
    with tf.Session() as sess:
        DQNbrain = DQNAgent(
            sess,
            OUTPUT,
            INPUT,
            learning_rate=LEARNING_RATE,
            gamma=DISCOUNT,
            batch_size=MINIBATCH_SIZE,
            buffer_size=MEMORY_SIZE,
            target_update_step=TARGET_UPDATE,
            e_greedy=not LOAD_model,
            e_step=1000,
            gradient_norm=None,
        )

        if LOAD_model:
            DQNbrain.load_model(tf.train.latest_checkpoint(path))
        else:
            sess.run(tf.global_variables_initializer())

        all_rewards = []
        frame_rewards = []
        loss_list = []
        loss_frame = []
        recent_rlist = deque(maxlen=15)
        recent_rlist.append(0)
        episode, epoch, frame = 0, 0, 0
        start = timer()
        env.env.masspole = 0.05
        env.env.length = 2.
        #env.env.force_mag = 10.

        while np.mean(recent_rlist) < 499:
            episode += 1

            rall, count = 0, 0
            done = False
            s = env.reset()

            while not done:
                if RENDER:
                    env.render()

                frame += 1
                count += 1

                action, actions_value = DQNbrain.choose_action(s)

                s_, r, done, _ = env.step(action)

                if done and count >= 500:
                    reward = 1
                elif done and count < 500:
                    reward = -10
                else:
                    reward = 0

                DQNbrain.memory_add(s, float(action), reward, s_, int(done))
                s = s_

                rall += r

                if frame > TRAIN_START and TRAIN:
                    loss = DQNbrain.learn()
                    loss_list.append(loss)
                    loss_frame.append(frame)

            recent_rlist.append(rall)
            all_rewards.append(rall)
            frame_rewards.append(frame)

            print(
                "Episode:{} | Frames:{} | Reward:{} | Recent reward:{}".format(
                    episode, frame, rall, np.mean(recent_rlist)))

        if os.path.isdir(path):
            shutil.rmtree(path)
        os.mkdir(path)
        ckpt_path = os.path.join(path, 'DQN.ckpt')
        if SAVE_model:
            DQNbrain.save_model(ckpt_path)

        plt.figure(figsize=(10, 8))
        plt.subplot(211)
        plt.title('Episode %s. Recent_reward: %s. Time: %s' %
                  (len(all_rewards), np.mean(recent_rlist),
                   timedelta(seconds=int(timer() - start))))
        plt.plot(frame_rewards, all_rewards)
        plt.ylim(0, 510)
        plt.subplot(212)
        plt.title('Loss')
        plt.plot(loss_frame, loss_list)
        #plt.ylim(0, 20)
        plt.show()
        plt.close()
Example #12
def test():
    with tf.Session() as sess:
        DQNbrain = DQNAgent(
            sess,
            OUTPUT,
            INPUT,
            learning_rate=LEARNING_RATE,
            gamma=DISCOUNT,
            batch_size=MINIBATCH_SIZE,
            buffer_size=MEMORY_SIZE,
            target_update_step=TARGET_UPDATE,
            e_greedy=not LOAD_model,
            e_step=1000,
            gradient_norm=None,
        )

        DQNbrain.load_model(tf.train.latest_checkpoint(path))

        masspole_list = np.arange(0.01, 0.21, 0.025)
        length_list = np.arange(0.5, 3, 0.25)

        performance_mtx = np.zeros(
            [masspole_list.shape[0], length_list.shape[0]])

        for im in range(masspole_list.shape[0]):
            for il in range(length_list.shape[0]):
                env.env.masspole = masspole_list[im]
                env.env.length = length_list[il]

                all_rewards = []

                for episode in range(5):

                    rall, count = 0, 0
                    done = False
                    s = env.reset()

                    while not done:
                        if RENDER:
                            env.render()

                        action, actions_value = DQNbrain.choose_action(s)

                        s_, r, done, _ = env.step(action)

                        s = s_

                        rall += r

                    all_rewards.append(rall)

                    print("Episode:{} | Reward:{} ".format(episode, rall))

                performance_mtx[im, il] = np.mean(all_rewards)

        fig, ax = plt.subplots()
        ims = ax.imshow(performance_mtx,
                        cmap=cm.gray,
                        interpolation=None,
                        vmin=0,
                        vmax=500)
        ax.set_xticks(
            np.arange(0, length_list.shape[0], length_list.shape[0] - 1))
        ax.set_xticklabels(['0.5', '3'])
        ax.set_yticks(
            np.arange(0, masspole_list.shape[0], masspole_list.shape[0] - 1))
        ax.set_yticklabels(['0.01', '0.20'])
        ax.set_xlabel('Pole length')
        ax.set_ylabel('Pole mass')
        ax.set_title('Robustness test: DQN')
        fig.colorbar(ims, ax=ax)
        plt.show()
        plt.close()
Example #13
configs = [
    pong_dddqn_config0, pong_dddqn_config1, pong_dddqn_config2,
    pong_dddqn_config3, pong_dddqn_config4, pong_dddqn_config5,
    pong_dddqn_config6, pong_dddqn_config7
]
env = make_atari_deepmind(env_name, skip=4)

import argparse

parser = argparse.ArgumentParser(description='Process some integers.')
parser.add_argument('index', metavar='N', type=int, nargs='+', help='an index')

args = parser.parse_args()
print('selected index:', args.index)
print(configs[args.index[0]])
agent = DQNAgent(env, sess, env_name, config=configs[args.index[0]])
env.reset()
agent.train()

import wrappers


#print(env.unwrapped.get_action_meanings())
def evaluate(env, t_max=10000):
    rewards = []
    env._max_episode_steps = 9999
    print('reset')
    s = env.reset()
    reward = 0
    for it in range(t_max):
        qvalues = agent.get_qvalues([s])