Example #1
    def __init__(self, fname):
        lr = 0.0005
        self.agent = Agent(gamma=0.99,
                           epsilon=0.0,
                           alpha=lr,
                           input_dims=6,
                           n_actions=2,
                           mem_size=60000,
                           batch_size=64,
                           epsilon_end=0.0,
                           fname=fname)
        self.observation = []
        self.action = 0
        self.n_step = 0
        self.fname = fname.split("/")[-1]
Example #2
    def __init__(self,
                 bandit,
                 epsilon,
                 alpha,
                 layersize=128,
                 UI=1000,
                 gm=0.99,
                 remember=False,
                 algorithm='DQNxR'):
        self.size = bandit.nvot
        if algorithm == 'DQNxR':
            seed = np.random.rand()  #DOESNT DO ANYTHING

            self.DQN = DQNxR(state_size=self.size,
                             action_size=bandit.N,
                             seed=seed,
                             alpha=alpha,
                             UI=UI,
                             batch_size=10,
                             gamma=gm,
                             tau=1e-3,
                             buffer_size=int(1e5))
            #print(vars(self.DQN))
            self.epsilon = epsilon
            self.last_state = None
            self.remember = remember
        elif algorithm == 'policygrad':
            self.DQN = None
            self.policy = PolicyGrad(state_space=self.size,
                                     action_space=bandit.N,
                                     hidden_layer_size=layersize,
                                     gamma=gm)
            self.optimizer = optim.Adam(self.policy.parameters(), lr=alpha)
            self.update_interval = UI
            self.remember = remember
Example #3
def main(env, gamma, epsilon, final_epsilon, final_exp_step, lr, memory_size,
         target_update_freq, gradient_update_freq, batch_size, replay_start,
         val_freq, log_freq_by_step, log_freq_by_ep, val_epsilon, log_dir,
         weight_dir, steps):
    train_env = make_atari(env + "NoFrameskip-v4")
    val_env = make_atari(env + "NoFrameskip-v4", noop=False)

    agent = Agent(train_env,
                  DQN,
                  gamma=gamma,
                  epsilon=epsilon,
                  final_epsilon=final_epsilon,
                  final_exp_step=final_exp_step)
    trainer = Trainer(agent,
                      val_env,
                      lr=lr,
                      memory_size=memory_size,
                      target_update_freq=target_update_freq,
                      gradient_update_freq=gradient_update_freq,
                      batch_size=batch_size,
                      replay_start=replay_start,
                      val_freq=val_freq,
                      log_freq_by_step=log_freq_by_step,
                      log_freq_by_ep=log_freq_by_ep,
                      val_epsilon=val_epsilon,
                      log_dir=log_dir,
                      weight_dir=weight_dir)
    trainer.train(steps)
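A hypothetical invocation of main() with classic Nature-DQN ballpark hyperparameters; none of these values come from the project itself, they are illustrative only:

if __name__ == '__main__':
    main(env='Breakout',
         gamma=0.99,
         epsilon=1.0,
         final_epsilon=0.1,
         final_exp_step=1_000_000,
         lr=2.5e-4,
         memory_size=1_000_000,
         target_update_freq=10_000,
         gradient_update_freq=4,
         batch_size=32,
         replay_start=50_000,
         val_freq=100_000,
         log_freq_by_step=1_000,
         log_freq_by_ep=10,
         val_epsilon=0.05,
         log_dir='logs',
         weight_dir='weights',
         steps=10_000_000)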
Example #4
def main():
    agent = Agent()
    agent.load()

    total_reward = 0
    obs = env.reset()
    env.render()
    for _ in range(10000):
        act = agent.predict(obs)
        obs, reward, done, _ = env.step(act)
        total_reward += reward
        env.render()
        if done:
            print(f'total_reward: {total_reward}')
            env.close()
            break
Example #5
def setup_Agent(filename, epsilon):
    """
    Function to initialize the DQN agent
    """
    # one hot vector (opponents move) on top of game board
    input_dims = 7 * 7

    action_space = tuple(range(7))
    n_actions = 7

    h1_dims = 512
    h2_dims = 256

    agent = Agent(lr=0.001,
                  gamma=0.95,
                  epsilon=epsilon,
                  epsilon_dec=0.995,
                  epsilon_min=0.01,
                  input_shape=input_dims,
                  h1_dims=h1_dims,
                  h2_dims=h2_dims,
                  action_space=action_space,
                  training_epochs=2,
                  fname=filename)

    memory = ReplayBuffer(50000, input_dims, n_actions)

    return agent, memory
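The comment above describes stacking a one-hot vector for the opponent's last move on top of the game board, which is what makes the input 7 * 7 rather than a bare 6 * 7 board. A minimal sketch of how such an input could be built; the helper name and the (6, 7) board encoding are assumptions, not part of the project:

import numpy as np

def encode_state(board, opponent_col):
    # Hypothetical encoder: `board` is assumed to be a (6, 7) Connect-Four grid.
    one_hot = np.zeros((1, 7), dtype=np.float32)
    one_hot[0, opponent_col] = 1.0           # mark the column the opponent just played
    stacked = np.vstack([one_hot, board])    # shape (7, 7): one-hot row on top of the board
    return stacked.flatten()                 # length 49, matching input_dims = 7 * 7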
Example #6
def main(env_name=None):
    ENV_NAME = 'wumpus-v0'
    
    if env_name: ENV_NAME = env_name

    MODEL_DIR = f'models/{ENV_NAME}-dqn'
    MODEL_FILE = f'{ENV_NAME}-dqn.h5'
    CHECKPOINTS_DIR = f'models/{ENV_NAME}-dqn/checkpoints'
    TEST_IMG_DIR = f'tests/{ENV_NAME}-dqn'

    env = gym.make(ENV_NAME)
    env.reset()

    agent = Agent(learning_rate=0.01, gamma=0.95,
                    state_shape=env.observation_space.shape, actions=7,
                    batch_size=64,
                    epsilon_initial=0.0, epsilon_decay=0, epsilon_final=0.0,
                    replay_buffer_capacity=1000000,
                    model_name=MODEL_FILE, model_dir=MODEL_DIR,
                    ckpt_dir=CHECKPOINTS_DIR)
    agent.load_model()

    done = False
    score = 0
    steps_per_episode = 0
    state = env.reset()
    images = [env.render('rgb_array')]
    while not done:
        # Choose action according to policy, and execute
        action = agent.select_action(state)
        state, reward, done, _ = env.step(action)

        score += reward
        steps_per_episode += 1
        images.append(env.render('rgb_array'))

    # Generate GIF for the execution
    create_gif(
        f'{ENV_NAME}.gif',
        np.array(images),
        fps=1.0
    )

    print(
        f'Model \'{str(ENV_NAME)}\', score {score}, steps {steps_per_episode}')
Example #7
class AI:
    def __init__(self, fname):
        lr = 0.0005
        self.agent = Agent(gamma=0.99,
                           epsilon=0.0,
                           alpha=lr,
                           input_dims=6,
                           n_actions=2,
                           mem_size=60000,
                           batch_size=64,
                           epsilon_end=0.0,
                           fname=fname)
        self.observation = []
        self.action = 0
        self.n_step = 0
        self.fname = fname.split("/")[-1]

    def episode_start(self, observation):
        self.observation = observation

    def choose_action(self):
        self.action = self.agent.choose_action(self.observation)
        return self.action

    def step(self, observation_, reward, done):
        self.agent.remember(self.observation, self.action, reward,
                            observation_, int(done))
        self.observation = observation_
        if self.n_step % 3 == 0:
            self.agent.learn()
        self.n_step += 1

    def episode_end(self):
        self.agent.save_model()
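A hypothetical driver loop showing how the lifecycle methods above fit together; make_env is a made-up factory for a gym-style environment with 6-dimensional observations and 2 actions, and the weights path is illustrative:

env = make_env()                  # hypothetical environment matching the Agent above
ai = AI('models/agent.h5')        # hypothetical weights path
for episode in range(100):
    ai.episode_start(env.reset())
    done = False
    while not done:
        action = ai.choose_action()
        observation_, reward, done, _ = env.step(action)
        ai.step(observation_, reward, done)
    ai.episode_end()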
Example #8
def test_agent():
    print("##################Running agent test##################")
    agent = Agent(state_shape, action_shape)
    state1 = np.array([1,2,3,4]).reshape(-1,4)
    state2 = np.array([2,3,4,5]).reshape(-1,4)
    out1 = agent.model.predict(state1)
    out2 = agent.model.predict(state2)
    print(out1)
    print(out2)
    assert np.all((out1[0]-out2[0])!=0), "Failed agent test - same output different state"
    print("Agent test passed :)\n\n")
Example #9
    def __init__(self):
        self._load_config()
        # Control parameter used to scale bid price
        self.BETA = [-0.08, -0.03, -0.01, 0, 0.01, 0.03, 0.08]
        self.eps_start = 0.95
        self.eps_end = 0.05
        self.anneal = 0.00005
        self._reset_episode()
        # DQN Network to learn Q function
        self.dqn_agent = Agent(state_size=7, action_size=7, seed=0)
        # Reward Network to learn the reward function
        self.reward_net = RewardNet(state_action_size=8, reward_size=1, seed=0)
        self.dqn_state = None
        self.dqn_action = 3  # no scaling
        self.dqn_reward = 0
        # Reward-Dictionary
        self.reward_dict = {}
        self.S = []
        self.V = 0
        self.total_wins = 0
        self.total_rewards = 0.0
Example #10
def main():
    gym_env = gym.make('custom_gym:Xplane-v0')
    lr = 0.001
    gam = 0.01
    n_games = 1
    # nn_input = obs()
    agent = Agent(learning_rate=lr,
                  gamma=gam,
                  epsilon=1.0,
                  input_dims=(6, ),
                  n_actions=15,
                  batch_size=32,
                  file_name='AI_takeoff/saved_models/dq_model_2.h5')
    scores = []
    total_steps = []
    eps_hist = []
    agent.load_model()

    for i in range(n_games):
        try:
            done = False
            score = 0
            observation = gym_env.reset()
            time.sleep(2)
            observation_checkpoints = np.array([observation[0:2]])
            step_counter = 0
            print("GAME ITERATION ", i)
            while not done:
                action = agent.choose_action(observation)
                new_observation, reward, done = gym_env.step(action)
                step_counter = step_counter + 1
                score = score + reward
                agent.store_transition(observation, action, reward,
                                       new_observation, done)
                observation = new_observation
                # agent.learn()
                # This if statement checks if the airplane is stuck
                observation_checkpoints = np.append(observation_checkpoints,
                                                    [new_observation[0:2]],
                                                    axis=0)
                print(observation_checkpoints)
                print("stepcounter is", step_counter)
                if step_counter % 30 == 0:
                    if np.array_equal(
                            observation_checkpoints[step_counter - 30],
                            observation_checkpoints[step_counter - 1]):
                        done = True
            eps_hist.append(agent.epsilon)
            scores.append(score)
            total_steps.append(step_counter)
        except Exception as e:
            print(str(e))
Example #11
def main():
    #make env and agent
    env = gym.make('LunarLander-v2')
    agent = Agent(gamma=0.99,
                  epsilon=1.0,
                  batch_size=64,
                  n_actions=4,
                  eps_end=0.01,
                  input_dims=[8],
                  lr=0.0001)

    scores, eps_history = [], []
    n_games = 500

    for i in range(n_games):
        score = 0
        done = False
        observation = env.reset()
        while not done:
            #ingame
            #get action from current view of game (observation)
            action = agent.choose_action(observation)
            #next frame
            observation_, reward, done, info = env.step(action)

            score += reward
            #store memory
            agent.store_transisation(observation, action, reward, observation_,
                                     done)
            agent.learn()

            #set next stage to current stage
            observation = observation_
        #append score and eps
        scores.append(score)
        eps_history.append(agent.epsilon)

        #print some nice statements
        avg_score = np.mean(scores[-100:])
        print(
            f'Episode: {i}   Score: {score}   Average Score: {avg_score}   Epsilon: {agent.epsilon}'
        )
Example #12
def start():
    env = gym.make('CartPole-v0')

    params = {
        'gamma': 0.8,
        'epsi_high': 0.9,
        'epsi_low': 0.05,
        'decay': 500,
        'lr': 0.001,
        'capacity': 10000,
        'batch_size': 64,
        'state_space_dim': env.observation_space.shape[0],
        'action_space_dim': env.action_space.n
    }
    agent = Agent(**params)

    score = []
    mean = []

    for episode in range(1000):
        s0 = env.reset()
        total_reward = 1
        for i in range(200):
            env.render()
            a0 = agent.act(s0)
            s1, r1, done, _ = env.step(a0)

            if done:
                r1 = -1

            agent.put(s0, a0, r1, s1)

            if done:
                break

            total_reward += r1
            s0 = s1
            agent.learn()

        score.append(total_reward)
        mean.append(sum(score[-100:]) / 100)
        print(total_reward)
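The epsi_high / epsi_low / decay parameters suggest an exponential exploration schedule; one common form consistent with these names (purely an assumption about what this Agent does internally) is:

import math

def epsilon_at(step, epsi_high=0.9, epsi_low=0.05, decay=500):
    # Anneal from epsi_high toward epsi_low with a time constant of `decay` steps.
    return epsi_low + (epsi_high - epsi_low) * math.exp(-step / decay)

# epsilon_at(0) == 0.9, epsilon_at(500) ~= 0.36, epsilon_at(5000) ~= 0.05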
Example #13
def OldStuff():
    tf.compat.v1.disable_eager_execution()

    lr = 0.001
    numGames = 10000

    session = TriadGameSession()
    observation = session.getState()
    scores = []

    agent = Agent(gamma=0.99,
                  lr=lr,
                  epsilon=1.0,
                  epsilonDec=0.0005,
                  inputSize=[len(observation)],
                  numActions=session.getMaxActions(),
                  memSize=1000000,
                  batchSize=1024)

    for i in range(numGames):
        done = False
        score = 0
        session = TriadGameSession()
        observation = session.getState()
        while not done:
            action = agent.chooseAction(observation)
            observationNext, reward, done = session.step(action)
            score += reward
            agent.store(observation, action, reward, observationNext, done)
            observation = observationNext
            agent.learn()

        scores.append(score)
        avgScore = np.mean(scores[-100:])
        print('game:', i, 'score %.2f' % score, 'avgScore %.2f' % avgScore,
              'epsilon %.2f' % agent.epsilon)

    #agent.save()
    print('Finished!')
Example #14
def setup_Agent(filename, epsilon):
    """
    Function to initialize the DQN agent
    """
    input_dims = 6 * 7
    action_space = tuple(range(7))
    n_actions = 7

    h1_dims = 512
    h2_dims = 256

    agent = Agent(lr=0.001,
                  gamma=0.95,
                  epsilon=epsilon,
                  epsilon_dec=0.995,
                  epsilon_min=0.01,
                  input_shape=input_dims,
                  h1_dims=h1_dims,
                  h2_dims=h2_dims,
                  action_space=action_space,
                  training_epochs=1,
                  fname=filename)

    return agent
Example #15
File: RL_trainer.py  Project: xuezzee/-
state_size = 122
action_size = 5

# Add some variables to keep track of the progress
scores_window, steps_window = [deque(maxlen=200) for _ in range(2)]  # a, b = [deque([]), deque([])]
agent_obs = [None] * flags.num_agents  # [None, None]
agent_obs_buffer = [None] * flags.num_agents
agent_action_buffer = [2] * flags.num_agents
max_steps = flags.episode_length
start_time = time.time()

# Load an RL agent and initialize it from checkpoint if necessary
# independent dqn/ppo --> each agent gets a different obs, but they share one model
if flags.agent_type == "dqn":
    agent = DQN_Agent(state_size, action_size, flags.num_agents)
elif flags.agent_type == "ppo":
    agent = PPO_Agent(state_size, action_size, flags.num_agents)

if flags.load_model:
    start, eps = agent.load(project_root / 'checkpoints', 0, 1.0)
else:
    start, eps = 0, 1

if not flags.train:
    eps = 0.0

# Helper function to detect collisions
ACTIONS = {0: "up", 1: "right", 2: "down", 3: "left", 4: "stop"}
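The snippet ends before the collision helper mentioned in the comment above is defined; a purely hypothetical sketch of what such a helper could look like for grid agents using the ACTIONS mapping:

def detect_collisions(positions, moves):
    # Hypothetical helper: `positions` are (row, col) cells, `moves` are indices
    # into ACTIONS; returns the ids of agents whose moves target the same cell.
    deltas = {0: (-1, 0), 1: (0, 1), 2: (1, 0), 3: (0, -1), 4: (0, 0)}
    targets = {}
    for agent_id, (pos, move) in enumerate(zip(positions, moves)):
        cell = (pos[0] + deltas[move][0], pos[1] + deltas[move][1])
        targets.setdefault(cell, []).append(agent_id)
    return {a for ids in targets.values() if len(ids) > 1 for a in ids}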

Example #16
import gym
from dqn import DeepQNetwork, Agent
import numpy as np
from gym import wrappers

if __name__ == '__main__':
    env = gym.make('LunarLander-v2')
    brain = Agent(gamma=0.99,
                  epsilon=1.0,
                  n_actions=4,
                  batch_size=128,
                  input_dims=[8],
                  alpha=0.0003,
                  replace=64)

    scores = []
    eps_history = []
    num_games = 500
    score = 0

    for i in range(num_games):
        if i % 10 == 0 and i > 0:
            avg_score = np.mean(scores[max(0, i - 10):(i + 1)])
            print('episode: ', i, 'score: ', score,
                  ' average score %.3f' % avg_score,
                  'epsilon %.3f' % brain.EPSILON)
        else:
            print('episode: ', i, 'score: ', score)
        eps_history.append(brain.EPSILON)
        done = False
        observation = env.reset()
Example #17
File: example.py  Project: Exception4U/dqn
import sys
import gym
from dqn import Agent

num_episodes = 5000

env_name = sys.argv[1] if len(sys.argv) > 1 else "MsPacman-v0"
env = gym.make(env_name)

agent = Agent(state_size=env.observation_space.shape,
              number_of_actions=env.action_space.n,
              save_name=env_name)

for e in range(num_episodes):
    observation = env.reset()
    done = False
    agent.new_episode()
    total_cost = 0.0
    total_reward = 0.0
    frame = 0
    while not done:
        frame += 1
        #env.render()
        action, values = agent.act(observation)
        #action = env.action_space.sample()
        observation, reward, done, info = env.step(action)
        total_cost += agent.observe(reward)
        total_reward += reward
    print "total reward", total_reward
    print "mean cost", total_cost / frame
Example #18
File: main.py  Project: neesetifa/Malmo
# ------------------------------ Variable Declaration ----------------------------------
NUM_OF_ZOMBIES = 1
NUM_OF_VILLAGERS = 1
agent_host = MalmoPython.AgentHost()
malmoutils.parse_command_line(agent_host)
validate = True

num_reps = 300

#=======core part initialization====================================
#input size 5*5, you can change the size here
memory = MemoryD(5)
network_model, q_values_func = nn_model(input_shape=[5, 5])

agent = Agent(network_model, q_values_func, memory, 'train', 'ddqn')
#set learning rate to be 0.00025
agent.do_compile(optimizer=Adam(lr=0.00025), loss_func=mean_huber_loss)
agent.memoryD.clear()
#===================================================================

for iRepeat in range(num_reps):
    my_mission_record = malmoutils.get_default_recording_object(
        agent_host, "./Mission_{}".format(iRepeat + 1))
    #my_mission_record = MalmoPython.MissionRecordSpec('./' + "Mission_" + str(iRepeat) + ".tgz")
    #my_mission_record.recordRewards()
    #my_mission_record.recordMP4(24,400000)
    #my_mission_record.recordObservations()
    my_mission = MalmoPython.MissionSpec(GetMissionXML(mapblock, agent_host),
                                         validate)
Example #19
scores = []
epsHistory = []
numGames = 1
batch_size = 32
n_actions = 6
input_dims = (185, 95)
crop_start = (15, 30)
crop_end = (200, 125)
starting_epsilon = 0.05 if LOAD_MODEL else 1.0

env = gym.make('SpaceInvaders-v0')
brain = Agent(gamma=0.95,
              epsilon=starting_epsilon,
              lr=0.003,
              input_dims=input_dims,
              batch_size=batch_size,
              n_actions=n_actions,
              max_mem_size=5000,
              save_path='models/')

if LOAD_MODEL:
    brain.load()
else:
    # load memory with random games
    while brain.mem_cntr < brain.mem_size:
        observation = env.reset()
        observation = preprocess(observation, crop_start, crop_end)
        done = False
        while not done:
            # 0 no action, 1 fire, 2 move right, 3 move left, 4 move right fire, 5 move left fire
            action = env.action_space.sample()
Example #20
File: example.py  Project: claymcleod/dqn
import sys
import gym
from dqn import Agent

# Python 3 compatability
try:
    xrange
except NameError:
    xrange = range

num_episodes = 20

env_name = sys.argv[1] if len(sys.argv) > 1 else "MsPacman-v0"
env = gym.make(env_name)

agent = Agent(state_size=env.observation_space.shape,
              number_of_actions=env.action_space.n,
              save_name=env_name)

for e in xrange(num_episodes):
    observation = env.reset()
    done = False
    agent.new_episode()
    total_cost = 0.0
    total_reward = 0.0
    frame = 0
    while not done:
        frame += 1
        #env.render()
        action, values = agent.act(observation)
        #action = env.action_space.sample()
        observation, reward, done, info = env.step(action)
Example #21
def train(path, env):
    #env = Monitor(env, path, video_callable=video_callable, force=True)
    agent = Agent(env, path=path)
    agent.train()
    return agent
Example #22
import gym
from dqn import Agent
from utils import PlotLearning
import numpy as np

if __name__ == '__main__':
    env = gym.make('LunarLander-v2')
    agent = Agent(gamma=0.99,
                  epsilon=1.0,
                  batch_size=64,
                  n_actions=4,
                  eps_end=0.01,
                  inp_dims=[8],
                  lr=0.001)
    scores, eps_history = [], []
    n_games = 500

    for i in range(n_games):
        score = 0
        done = False
        observation = env.reset()
        while not done:
            action = agent.choose_action(observation)
            observation_, reward, done, info = env.step(action)
            score += reward
            agent.store_transition(observation, action, reward, observation_,
                                   done)
            agent.learn()
            observation = observation_
        scores.append(score)
        eps_history.append(agent.epsilon)
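The script imports PlotLearning but the snippet ends before any plotting happens; since that helper's signature isn't shown, here is a plain matplotlib sketch (an assumption, not the project's own utility) that visualizes the collected scores and eps_history:

import matplotlib.pyplot as plt

# Assumed to run after the training loop above has filled scores / eps_history.
fig, ax1 = plt.subplots()
ax1.plot(scores, color='tab:red')
ax1.set_xlabel('episode')
ax1.set_ylabel('score')
ax2 = ax1.twinx()
ax2.plot(eps_history, color='tab:blue')
ax2.set_ylabel('epsilon')
fig.tight_layout()
plt.savefig('lunar_lander_training.png')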
Example #23
def run(env: LlvmEnv) -> None:

    agent = Agent(n_actions=15, input_dims=[69])
    env.observation_space = "InstCountNorm"
    agent.Q_eval.load_state_dict(torch.load("./H10-N4000-INSTCOUNTNORM.pth"))
    rollout(agent, env)
Example #24
from dqn import Agent
import numpy as np
import gym
import matplotlib.pyplot as plt


if __name__ == '__main__':
    env = gym.make('LunarLander-v2')
    n_games = 300
    show = False
    agent = Agent(gamma=0.99, epsilon=1.0, alpha=0.0005, input_dims=8,
                n_actions=4, batch_size=64)

    scores = []
    eps_history = []

    for i in range(1, n_games+1):
        done = False
        score = 0
        observation = env.reset()
        while not done:
            if show:
                env.render()
            action = agent.choose_action(observation)
            observation_, reward, done, info = env.step(action)
            score += reward
            agent.remember(observation, action, reward, observation_, done)
            observation = observation_
            agent.learn()

        eps_history.append(agent.epsilon)
Example #25
def main():
    scores = []
    eps_history = []
    info_history = []

    # Random starting-points:
    env = sky.make(random=True,
                   xi=(301, 650 - 25),
                   yi=(100, 300 - 25),
                   width=15,
                   height=15,
                   v_initial=14)
    # Fixed starting-point:
    #env = sky.make(xi=550)

    agent = Agent(gamma=gamma,
                  epsilon=epsilon,
                  lr=lr,
                  input_dims=[imput_dimensions],
                  n_actions=n_actions,
                  mem_size=mem_size,
                  batch_size=batch_size,
                  epsilon_dec=epsilon_dec)

    if (load_checkpoint):
        agent.load_models()

    for i in range(n_games):
        score = 0
        done = False
        observation = env.reset()
        while not done:
            '''
            one game: ending, when done=True
            '''
            action = agent.choose_action(observation)
            observation_, reward, done, info = env.step(action)
            score += reward
            agent.store_transition(observation, action, reward, observation_,
                                   int(done))
            observation = observation_
            agent.learn()

        if i % 10 == 0 and i > 0:
            avg_score = np.mean(scores[max(0, i - 10):(i + 1)])
            print(i, 'episode', info, '|| score:', score,
                  '| average score: %.3f' % avg_score,
                  '| epsilon: %.3f' % agent.epsilon, '| training done:',
                  round(i / n_games, 2))
        else:
            print(i, 'episode', info, '|| score:', score)

        scores.append(score)
        eps_history.append(agent.epsilon)
        info_history.append(info)

    print('training ended with:',
          [[el, info_history.count(el)] for el in ('crashed', 'goal')])

    if (save_checkpoint):
        agent.save_models()
        print('[+] model saved')

    # -------------------
    # Plotting and output
    # -------------------
    x = [i + 1 for i in range(n_games)]

    # First axis: Scores
    fig, ax1 = plt.subplots()
    color = 'tab:red'
    ax1.set_xlabel('Episode')
    ax1.set_ylabel('score per Episode', color=color)
    ax1.scatter(x, scores, color=color, s=2)
    ax1.tick_params(axis='y', labelcolor=color)

    # Second axis: epsilon
    ax2 = ax1.twinx()  # instantiate a second axes that shares the same x-axis
    color = 'tab:blue'
    ax2.set_ylabel('epsilon',
                   color=color)  # we already handled the x-label with ax1
    ax2.plot(x, eps_history, color=color)
    ax2.tick_params(axis='y', labelcolor=color)

    # Output
    fig.tight_layout()  # otherwise the right y-label is slightly clipped
    plt.savefig(filename)

    return env
Example #26
def main():
    # Initialize environment, agent
    env = gym.make(ENV_NAME)
    summary_writer = tf.summary.create_file_writer(LOG_DIR)
    agent = Agent(learning_rate=0.01, gamma=0.95,
                  state_shape=env.observation_space.shape, actions=7,
                  batch_size=64,
                  epsilon_initial=0.9, epsilon_decay=1e-6, epsilon_final=0.01,
                  replay_buffer_capacity=1000000,
                  model_name=MODEL_FILE, model_dir=MODEL_DIR,
                  ckpt_dir=CHECKPOINTS_DIR, log_dir=LOG_DIR)

    scores = []
    for i in range(1, EPISODES + 1):
        done = False
        score = 0
        state = env.reset()
        steps_per_episode = 0

        # Play one episode
        while not done:
            # Choose action (epsilon greedy), and execute
            action = agent.select_action(state)
            next_state, reward, done, _ = env.step(action)
            score += reward

            # Store in experience replay buffer
            agent.store_experience(state, action,
                                   reward, next_state, done)
            state = next_state
            agent.train()
            steps_per_episode += 1
        if len(scores) == 100:
            scores.pop(0)
        scores.append(score)

        avg_score = np.mean(scores)
        min_score = np.min(scores)
        max_score = np.max(scores)

        print(
            f'Episode: {i}, Score {score:.2f}, Avg_score {avg_score:.2f}, Epsilon {agent.epsilon:.2f}')

        # Summaries for Tensorboard
        write_summaries(summary_writer, {
            'epsilon': agent.epsilon,
            'reward.episode': score,
            'reward.avg': avg_score,
            'reward.min': min_score,
            'reward.max': max_score,
            'steps.count': steps_per_episode
        }, i, ENV_NAME)

        # Save the model
        if i % SAVE_INTERVAL == 0:
            print(f'Saving model to \'{MODEL_FILE}\' [Overwriting]')
            agent.save_model()

        # Save checkpoint
        if i % CHECKPOINT_INTERVAL == 0:
            print(f'Adding checkpoint: \'{CHECKPOINTS_DIR}/episode-{i}.h5\'')
            agent.save_checkpoint(f'episode-{i}')
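The write_summaries helper called above is not part of the snippet; one plausible implementation with the TF2 summary API (an assumption about the project's helper, not its actual code):

import tensorflow as tf

def write_summaries(writer, metrics, step, prefix):
    # Hypothetical helper: log each metric as a scalar under `<prefix>/<name>` at `step`.
    with writer.as_default():
        for name, value in metrics.items():
            tf.summary.scalar(f'{prefix}/{name}', value, step=step)
        writer.flush()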
Example #27
    out2 = agent.model.predict(state2)
    print(out1)
    print(out2)
    assert np.all((out1[0]-out2[0])!=0), "Failed agent test - same output different state"
    print("Agent test passed :)\n\n")

state_shape=env.observation_space.shape # the state space
action_shape=env.action_space.n # the action space

#Testing Memory storage and sample
state = env.reset()
test_mem()
test_agent()

mem = Memory(10000, state_shape)
agent = Agent(state_shape, action_shape)

epsilon = 1
batch_size = 64

#action = env.action_space.sample()
#next_state, reward, done, info = env.step(action)

for game in range(n_games):

    state = env.reset()
    game_reward = 0

    for step in range(max_steps):
        #Render game
        if game % 10 == 0:
Example #28
class RlBidAgent():
    def _load_config(self):
        """
        Parse the config.cfg file
        """
        cfg = configparser.ConfigParser(allow_no_value=True)
        env_dir = os.path.dirname(__file__)
        cfg.read(env_dir + '/config.cfg')
        self.budget = int(cfg['agent']['budget'])
        self.target_value = int(cfg['agent']['target_value'])
        self.T = int(cfg['rl_agent']['T'])  # T number of timesteps
        self.STATE_SIZE = int(cfg['rl_agent']['STATE_SIZE'])
        self.ACTION_SIZE = int(cfg['rl_agent']['ACTION_SIZE'])

    def __init__(self):
        self._load_config()
        # Control parameter used to scale bid price
        self.BETA = [-0.08, -0.03, -0.01, 0, 0.01, 0.03, 0.08]
        self.eps_start = 0.95
        self.eps_end = 0.05
        self.anneal = 0.00005
        self._reset_episode()
        # DQN Network to learn Q function
        self.dqn_agent = Agent(state_size=7, action_size=7, seed=0)
        # Reward Network to learn the reward function
        self.reward_net = RewardNet(state_action_size=8, reward_size=1, seed=0)
        self.dqn_state = None
        self.dqn_action = 3  # no scaling
        self.dqn_reward = 0
        # Reward-Dictionary
        self.reward_dict = {}
        self.S = []
        self.V = 0
        self.total_wins = 0
        self.total_rewards = 0.0

    def _reset_episode(self):
        """
        Function to reset the state when episode changes
        """
        self.t_step = 0  # 1. t: the current time step
        self.budget_spend = 0.0
        self.rem_budget = self.budget  # 2. the remaining budget at time-step t
        self.ROL = self.T  # 3. the number of Lambda regulation opportunities left
        self.prev_budget = self.budget  # Bt-1
        self.BCR = 0  # 4. Budget consumption rate
        #      (self.budget - self.prev_budget) / self.prev_budget
        self.CPM = 0  # 5. Cost per mille of impressions between t-1 and t
        #       (self.prev_budget - self.running_budget) / self.cur_wins
        self.WR = 0  # 6. wins_e / total_impressions
        self._reset_step()  # 7. Total value of the winning impressions 'click_prob'
        self.cur_day = 1
        self.cur_hour = 0
        self.ctl_lambda = 1.0  # Lambda sequential regulation parameter
        self.wins_e = 0
        self.eps = self.eps_start
        self.V = 0

    def _update_step(self):
        """
        Updates the state-modeling variables (remaining budget, ROL, BCR, CPM, WR)
        when the agent progresses to a new time step.
        """
        self.t_step += 1
        self.prev_budget = self.rem_budget
        self.rem_budget -= (self.cost_t / 1e9)
        self.ROL -= 1
        self.BCR = (self.rem_budget - self.prev_budget) / self.prev_budget
        self.CPM = self.cost_t
        self.WR = self.wins_t / self.bids_t

    def _reset_step(self):
        """
        Function to call every time a new time step is entered.
        """
        self.reward_t = 0.
        self.cost_t = 0.
        self.wins_t = 0
        self.bids_t = 0
        self.eps = max(self.eps_start - self.anneal * self.t_step, 0.05)

    def _update_reward_cost(self, reward, cost):
        """
        Internal function to accumulate the reward and cost of each bid,
        tracking the cumulative reward and cost within the current step.
        """
        self.reward_t += reward
        self.cost_t += cost
        self.bids_t += 1
        self.total_rewards += reward

    def _get_state(self):
        """
        Returns the state vector fed to the DQN.
        """
        return np.asarray([
            self.t_step, self.rem_budget, self.ROL, self.BCR, self.CPM,
            self.WR, self.reward_t
        ])

    def act(self, state, reward, cost):
        """
        Called for every bid request. The weekday and hour fields are used to
        progress between time steps and episodes during training.
        Returns the bid amount, i.e. the bid price scaled by the DQN agent's
        output (lambda), capped by the remaining budget.
        """
        episode_done = (state['weekday'] != self.cur_day)
        # within the time step
        if state['hour'] == self.cur_hour and state['weekday'] == self.cur_day:
            self._update_reward_cost(reward, cost)
        # within the episode, changing the time step
        elif state['hour'] != self.cur_hour and state[
                'weekday'] == self.cur_day:
            self._update_step()
            # Sample a mini batch and perform grad-descent step
            self.reward_net.step()
            dqn_next_state = self._get_state()
            a_beta = self.dqn_agent.act(dqn_next_state, eps=self.eps)
            sa = np.append(self.dqn_state, self.dqn_action)
            rnet_r = float(self.reward_net.act(sa))
            # call agent step
            self.dqn_agent.step(self.dqn_state, self.dqn_action, rnet_r,
                                dqn_next_state, episode_done)
            self.dqn_state = dqn_next_state
            self.dqn_action = a_beta
            # print(dqn_next_state, a_beta)
            self.ctl_lambda *= (1 + self.BETA[a_beta])
            self.cur_hour = state['hour']
            self._reset_step()
            self._update_reward_cost(reward, cost)
            self.V += self.reward_t
            self.S.append((self.dqn_state, self.dqn_action))
        # episode changes
        elif state['weekday'] != self.cur_day:
            for (s, a) in self.S:
                sa = tuple(np.append(s, a))
                max_r = max(self.reward_net.get_from_M(sa), self.V)
                self.reward_net.add_to_M(sa, max_r)
                self.reward_net.add(sa, max_r)
            print("Total Impressions won with Budget={} Spend={} wins = {}".
                  format(self.budget, self.budget_spend, self.wins_e))
            self.total_wins += self.wins_e
            self._reset_episode()
            self.cur_day = state['weekday']
            self.cur_hour = state['hour']
            self._update_reward_cost(reward, cost)

        # action = bid amount
        # send the best estimate of the bid
        self.budget_spend += (cost / 1e9)
        if cost > 0:
            self.wins_t += 1
            self.wins_e += 1
        action = min(
            self.ctl_lambda * self.target_value * state['click_prob'] * 1e9,
            (self.budget - self.budget_spend) * 1e9)
        return action

    def done(self):
        return self.budget <= self.budget_spend
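A quick numeric sketch of the bid computed at the end of act(); all figures below are made up for illustration and are not taken from the config:

ctl_lambda = 1.0 * (1 + (-0.03))    # lambda after one scaling step with BETA[1]
target_value = 50                   # assumed config value
click_prob = 0.002                  # assumed pCTR of the incoming bid request
budget, budget_spend = 100.0, 40.0  # in the units the agent divides by 1e9

bid = min(ctl_lambda * target_value * click_prob * 1e9,
          (budget - budget_spend) * 1e9)
print(bid)  # ~9.7e7: the pCTR-scaled bid, well under the 6e10 remaining-budget cap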
Example #29
class Neural_Agent:
    def __init__(self,
                 bandit,
                 epsilon,
                 alpha,
                 layersize=128,
                 UI=1000,
                 gm=0.99,
                 remember=False,
                 algorithm='DQNxR'):
        self.size = bandit.nvot
        if algorithm == 'DQNxR':
            seed = np.random.rand()  #DOESNT DO ANYTHING

            self.DQN = DQNxR(state_size=self.size,
                             action_size=bandit.N,
                             seed=seed,
                             alpha=alpha,
                             UI=UI,
                             batch_size=10,
                             gamma=gm,
                             tau=1e-3,
                             buffer_size=int(1e5))
            #print(vars(self.DQN))
            self.epsilon = epsilon
            self.last_state = None
            self.remember = remember
        elif algorithm == 'policygrad':
            self.DQN = None
            self.policy = PolicyGrad(state_space=self.size,
                                     action_space=bandit.N,
                                     hidden_layer_size=layersize,
                                     gamma=gm)
            self.optimizer = optim.Adam(self.policy.parameters(), lr=alpha)
            self.update_interval = UI
            self.remember = remember

#POLICY GRADIENT

    def select_action(self, state):
        # Select an action by running the policy model and sampling from its output probabilities
        state = torch.from_numpy(state).type(torch.FloatTensor)
        state = self.policy(Variable(state))
        c = Categorical(state)
        action = c.sample()

        # Add log probability of our chosen action to our history
        if self.policy.policy_history.dim() != 0:
            #print(policy.policy_history)
            #print(c.log_prob(action))
            self.policy.policy_history = torch.cat(
                [self.policy.policy_history,
                 c.log_prob(action).unsqueeze(0)])
            #print("DID!")
        else:
            self.policy.policy_history = (c.log_prob(action))
        return action

    def update_policy(self):
        R = 0
        rewards = []

        #print(self.policy.reward_episode)

        # Discount future rewards back to the present using gamma
        for r in self.policy.reward_episode[::-1]:
            R = r + self.policy.gamma * R
            rewards.insert(0, R)

        # Scale rewards
        rewards = torch.FloatTensor(rewards)
        rewards = (rewards - rewards.mean()) / (rewards.std() +
                                                np.finfo(np.float32).eps)

        # Calculate loss
        loss = (torch.sum(
            torch.mul(self.policy.policy_history, Variable(rewards)).mul(-1),
            -1))

        # Update network weights
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        #self.policy.loss_history.append(loss.data.item())
        #self.policy.reward_history.append(np.sum(policy.reward_episode))
        self.policy.policy_history = Variable(torch.Tensor())
        self.policy.reward_episode = []

#UNIVERSAL

    def update_Q(self, action, reward):
        if self.DQN is not None:
            self.AR = (action, reward)
        else:
            if len(self.policy.reward_episode) == self.update_interval:
                self.policy.reward_episode.append(reward)
                self.update_policy()
            else:
                self.policy.reward_episode.append(reward)

    def get_action(self, bandit, actnum, decline, N_episodes):
        if self.remember == False:
            state = np.ones(self.size) / 100
        elif self.remember == "Rewards":
            state_info = bandit.last_rewards
            state = np.array(state_info)
            #print(actnum, state)
        elif self.remember == "Actions":
            state_info = bandit.last_actions
            state = np.array(state_info)
        elif self.remember == "Actions_now":
            state = bandit.partial_result

        if self.DQN is not None:
            if self.last_state is not None:
                #print(actnum,self.last_state,self.AR[0],self.AR[1],state)
                self.DQN.step(self.last_state,
                              self.AR[0],
                              self.AR[1],
                              state,
                              done=False)
                #print(self.last_state,self.AR[0],self.AR[1],state)

            actnum = self.DQN.act(state, self.epsilon).item()
            self.last_state = state
        else:
            actnum = self.select_action(state).item()

            #print(state, actnum)

        return actnum
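A quick check of the backward discounting loop inside update_policy(), using made-up episode rewards:

gamma = 0.99
reward_episode = [1.0, 0.0, 2.0]    # illustrative rewards only
R, rewards = 0.0, []
for r in reward_episode[::-1]:
    R = r + gamma * R
    rewards.insert(0, R)
print(rewards)  # approximately [2.96, 1.98, 2.0]: discounted return from each step onward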
Example #30
def main(argv):
	# Set seeds
	np.random.seed(FLAGS.seed)
	t.manual_seed(FLAGS.seed)

	# Create logfile
	f = create_exp_logfile(os.path.join(FLAGS.exp_log_dir, str(FLAGS.learning_rate), str(FLAGS.seed)))

	# Initialise agent and environment
	env = LunarLander()
	num_actions = env.num_actions()
	agent = Agent(body_type='ff', 
				  obs_num_features_or_obs_in_channels=FLAGS.observation_dimensions, 
				  fc_hidden_layer_size = FLAGS.fc_hidden_layer_size, 
				  output_actions = num_actions, 
				  use_target_net = FLAGS.use_target_net,
				  g = FLAGS.gamma, 
				  lr = FLAGS.learning_rate)

	# Initialise data structures
	c_buf = CircularBuffer(size=FLAGS.cb_size)
	er_buf = ExperienceReplayBuffer(size=FLAGS.er_size, batch_size=FLAGS.batch_size)

	# Initialise sampling range for e-greedy
	interval = t.distributions.uniform.Uniform(t.tensor([0.0]), t.tensor([1.0]))

	# Run
	step = 0
	episode_results = []
	state = env.reset()
	c_buf.append(t.from_numpy(state).float())

	while step < FLAGS.max_steps:
		# Agent select action
		eps = max(FLAGS.init_epsilon - (((FLAGS.init_epsilon - FLAGS.final_epsilon) / FLAGS.epsilon_anneal) * step), FLAGS.final_epsilon)

		if interval.sample() <= eps:
			action = np.random.randint(num_actions)
		else:
			action = agent.greedy_action(c_buf()).item()
		reward, next_state, terminal = env.act(action)
		terminal = 1 if terminal else 0

		er_buf.append(state, action, reward, next_state, terminal)
		state = next_state
		c_buf.append(t.from_numpy(state).float())

		if step > FLAGS.batch_size and step % FLAGS.update_frequency:
			batch_states, batch_actions, batch_rewards, batch_next_states, batch_terminals = \
																				er_buf.sample()
			batch_states = t.from_numpy(batch_states).float()
			batch_actions = np.array(batch_actions)
			batch_rewards = np.array(batch_rewards)
			batch_next_states = t.from_numpy(batch_next_states).float()
			batch_terminals = np.array(batch_terminals)

			agent.optimise(batch_states, batch_actions, batch_rewards, batch_next_states, batch_terminals)

		if step % FLAGS.target_network_update == 0:
			agent.sync()

		if terminal:
			episode_results.append(env.episode_return())
			state = env.reset()
			
		step += 1
	
		if step % FLAGS.evaluate == 0:
			f.write('{}, {}\n'.format(step, performance_avg(episode_results, FLAGS.num_episodes_average)))
			f.flush()

	f.close()
Example #31
import gym
from keras.models import load_model
from dqn import Agent

env_name = 'CartPole-v0'
eps = 0.8
episodes = 5
env = gym.make(env_name)
model = load_model('./model/my_model.h5')
agent = Agent(env)

for episode in range(episodes):
    # initial state
    s = env.reset()

    done = False
    while not done:
        for i in range(50):
            a = agent.act(s, eps)
            env.render()
            s2, r, done, info = env.step(a)
            s = s2
            if done:
                break
env.close()