def main(env, gamma, epsilon, final_epsilon, final_exp_step, lr, memory_size,
         target_update_freq, gradient_update_freq, batch_size, replay_start,
         val_freq, log_freq_by_step, log_freq_by_ep, val_epsilon, log_dir,
         weight_dir, steps):
    train_env = make_atari(env + "NoFrameskip-v4")
    val_env = make_atari(env + "NoFrameskip-v4", noop=False)
    agent = Agent(train_env, DQN, gamma=gamma, epsilon=epsilon,
                  final_epsilon=final_epsilon, final_exp_step=final_exp_step)
    trainer = Trainer(agent, val_env, lr=lr, memory_size=memory_size,
                      target_update_freq=target_update_freq,
                      gradient_update_freq=gradient_update_freq,
                      batch_size=batch_size, replay_start=replay_start,
                      val_freq=val_freq, log_freq_by_step=log_freq_by_step,
                      log_freq_by_ep=log_freq_by_ep, val_epsilon=val_epsilon,
                      log_dir=log_dir, weight_dir=weight_dir)
    trainer.train(steps)
def main():
    agent = Agent()
    agent.load()
    total_reward = 0
    obs = env.reset()
    env.render()
    for _ in range(10000):
        act = agent.predict(obs)
        obs, reward, done, _ = env.step(act)
        total_reward += reward
        env.render()
        if done:
            print(f'total_reward: {total_reward}')
            env.close()
            break
def setup_Agent(filename, epsilon):
    """ Function to initialize the DQN agent """
    # one-hot vector (opponent's move) on top of the game board
    input_dims = 7 * 7
    action_space = tuple(range(7))
    n_actions = 7
    h1_dims = 512
    h2_dims = 256
    agent = Agent(lr=0.001, gamma=0.95, epsilon=epsilon, epsilon_dec=0.995,
                  epsilon_min=0.01, input_shape=input_dims, h1_dims=h1_dims,
                  h2_dims=h2_dims, action_space=action_space,
                  training_epochs=2, fname=filename)
    memory = ReplayBuffer(50000, input_dims, n_actions)
    return agent, memory
def main(env_name=None):
    ENV_NAME = 'wumpus-v0'
    if env_name:
        ENV_NAME = env_name
    MODEL_DIR = f'models/{ENV_NAME}-dqn'
    MODEL_FILE = f'{ENV_NAME}-dqn.h5'
    CHECKPOINTS_DIR = f'models/{ENV_NAME}-dqn/checkpoints'
    TEST_IMG_DIR = f'tests/{ENV_NAME}-dqn'

    env = gym.make(ENV_NAME)
    env.reset()

    agent = Agent(learning_rate=0.01, gamma=0.95,
                  state_shape=env.observation_space.shape, actions=7,
                  batch_size=64, epsilon_initial=0.0, epsilon_decay=0,
                  epsilon_final=0.0, replay_buffer_capacity=1000000,
                  model_name=MODEL_FILE, model_dir=MODEL_DIR,
                  ckpt_dir=CHECKPOINTS_DIR)
    agent.load_model()

    done = False
    score = 0
    steps_per_episode = 0
    state = env.reset()
    images = [env.render('rgb_array')]

    while not done:
        # Choose action according to policy, and execute
        action = agent.select_action(state)
        state, reward, done, _ = env.step(action)
        score += reward
        steps_per_episode += 1
        images.append(env.render('rgb_array'))

    # Generate GIF for the execution
    create_gif(f'{ENV_NAME}.gif', np.array(images), fps=1.0)
    print(f'Model \'{str(ENV_NAME)}\', score {score}, steps {steps_per_episode}')
class AI:
    def __init__(self, fname):
        lr = 0.0005
        self.agent = Agent(gamma=0.99, epsilon=0.0, alpha=lr, input_dims=6,
                           n_actions=2, mem_size=60000, batch_size=64,
                           epsilon_end=0.0, fname=fname)
        self.observation = []
        self.action = 0
        self.n_step = 0
        self.fname = fname.split("/")[-1]

    def episode_start(self, observation):
        self.observation = observation

    def choose_action(self):
        self.action = self.agent.choose_action(self.observation)
        return self.action

    def step(self, observation_, reward, done):
        self.agent.remember(self.observation, self.action, reward,
                            observation_, int(done))
        self.observation = observation_
        if self.n_step % 3 == 0:
            self.agent.learn()
        self.n_step += 1

    def episode_end(self):
        self.agent.save_model()
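# A minimal sketch (not from the original source) of driving the AI wrapper above
# through one episode, assuming a Gym-style environment whose observations match
# input_dims=6 and whose action space matches n_actions=2; `env` and the weights
# path are hypothetical placeholders.
ai = AI("models/dqn_weights.h5")
observation = env.reset()
ai.episode_start(observation)
done = False
while not done:
    action = ai.choose_action()
    observation_, reward, done, _ = env.step(action)
    ai.step(observation_, reward, done)
ai.episode_end()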
def test_agent():
    print("##################Running agent test##################")
    agent = Agent(state_shape, action_shape)
    state1 = np.array([1, 2, 3, 4]).reshape(-1, 4)
    state2 = np.array([2, 3, 4, 5]).reshape(-1, 4)
    out1 = agent.model.predict(state1)
    out2 = agent.model.predict(state2)
    print(out1)
    print(out2)
    assert np.all((out1[0] - out2[0]) != 0), \
        "Failed agent test - same output different state"
    print("Agent test passed :)\n\n")
def main():
    gym_env = gym.make('custom_gym:Xplane-v0')
    lr = 0.001
    gam = 0.01
    n_games = 1
    # nn_input = obs()
    agent = Agent(learning_rate=lr, gamma=gam, epsilon=1.0, input_dims=(6, ),
                  n_actions=15, batch_size=32,
                  file_name='AI_takeoff/saved_models/dq_model_2.h5')
    scores = []
    total_steps = []
    eps_hist = []
    agent.load_model()

    for i in range(n_games):
        try:
            done = False
            score = 0
            observation = gym_env.reset()
            time.sleep(2)
            observation_checkpoints = np.array([observation[0:2]])
            step_counter = 0
            print("GAME ITERATION ", i)
            while not done:
                action = agent.choose_action(observation)
                new_observation, reward, done = gym_env.step(action)
                step_counter = step_counter + 1
                score = score + reward
                agent.store_transition(observation, action, reward,
                                       new_observation, done)
                observation = new_observation
                # agent.learn()

                # This if statement checks if the airplane is stuck
                observation_checkpoints = np.append(
                    observation_checkpoints, [new_observation[0:2]], axis=0)
                print(observation_checkpoints)
                print("stepcounter is", step_counter)
                if step_counter % 30 == 0:
                    if np.array_equal(
                            observation_checkpoints[step_counter - 30],
                            observation_checkpoints[step_counter - 1]):
                        done = True
            eps_hist.append(agent.epsilon)
            scores.append(score)
            total_steps.append(step_counter)
        except Exception as e:
            print(str(e))
def main():
    # make env and agent
    env = gym.make('LunarLander-v2')
    agent = Agent(gamma=0.99, epsilon=1.0, batch_size=64, n_actions=4,
                  eps_end=0.01, input_dims=[8], lr=0.0001)
    scores, eps_history = [], []
    n_games = 500

    for i in range(n_games):
        score = 0
        done = False
        observation = env.reset()
        while not done:
            # in game: get action from the current view of the game (observation)
            action = agent.choose_action(observation)
            # next frame
            observation_, reward, done, info = env.step(action)
            score += reward
            # store memory
            agent.store_transisation(observation, action, reward,
                                     observation_, done)
            agent.learn()
            # set next state to current state
            observation = observation_
        # append score and eps
        scores.append(score)
        eps_history.append(agent.epsilon)
        # print some statistics
        avg_score = np.mean(scores[-100:])
        print(f'Episode: {i} Score: {score} Average Score: {avg_score} '
              f'Epsilon: {agent.epsilon}')
def start():
    env = gym.make('CartPole-v0')
    params = {
        'gamma': 0.8,
        'epsi_high': 0.9,
        'epsi_low': 0.05,
        'decay': 500,
        'lr': 0.001,
        'capacity': 10000,
        'batch_size': 64,
        'state_space_dim': env.observation_space.shape[0],
        'action_space_dim': env.action_space.n
    }
    agent = Agent(**params)

    score = []
    mean = []
    for episode in range(1000):
        s0 = env.reset()
        total_reward = 1
        for i in range(200):
            env.render()
            a0 = agent.act(s0)
            s1, r1, done, _ = env.step(a0)
            if done:
                r1 = -1
            agent.put(s0, a0, r1, s1)
            if done:
                break
            total_reward += r1
            s0 = s1
            agent.learn()
        score.append(total_reward)
        mean.append(sum(score[-100:]) / 100)
        print(total_reward)
def OldStuff():
    tf.compat.v1.disable_eager_execution()
    lr = 0.001
    numGames = 10000
    session = TriadGameSession()
    observation = session.getState()
    scores = []

    agent = Agent(gamma=0.99, lr=lr, epsilon=1.0, epsilonDec=0.0005,
                  inputSize=[len(observation)],
                  numActions=session.getMaxActions(),
                  memSize=1000000, batchSize=1024)

    for i in range(numGames):
        done = False
        score = 0
        session = TriadGameSession()
        observation = session.getState()
        while not done:
            action = agent.chooseAction(observation)
            observationNext, reward, done = session.step(action)
            score += reward
            agent.store(observation, action, reward, observationNext, done)
            observation = observationNext
            agent.learn()
        scores.append(score)
        avgScore = np.mean(scores[-100:])
        print('game:', i, 'score %.2f' % score,
              'avgScore %.2f' % avgScore,
              'epsilon %.2f' % agent.epsilon)
        # agent.save()

    print('Finished!')
def setup_Agent(filename, epsilon):
    """ Function to initialize the DQN agent """
    input_dims = 6 * 7
    action_space = tuple(range(7))
    n_actions = 7
    h1_dims = 512
    h2_dims = 256
    agent = Agent(lr=0.001, gamma=0.95, epsilon=epsilon, epsilon_dec=0.995,
                  epsilon_min=0.01, input_shape=input_dims, h1_dims=h1_dims,
                  h2_dims=h2_dims, action_space=action_space,
                  training_epochs=1, fname=filename)
    return agent
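# A short usage sketch (assumption, not from the original source): build a
# training agent for the 6x7 board with a fully exploratory epsilon; the
# filename is a hypothetical placeholder.
agent = setup_Agent("connect4_dqn.h5", epsilon=1.0)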
state_size = 122
action_size = 5

# Add some variables to keep track of the progress
scores_window, steps_window = [deque(maxlen=200) for _ in range(2)]  # a, b = [deque([]), deque([])]
agent_obs = [None] * flags.num_agents  # [None, None]
agent_obs_buffer = [None] * flags.num_agents
agent_action_buffer = [2] * flags.num_agents

max_steps = flags.episode_length
start_time = time.time()

# Load an RL agent and initialize it from checkpoint if necessary
# independent dqn/ppo --> each agent gets its own observation but shares the same model
if flags.agent_type == "dqn":
    agent = DQN_Agent(state_size, action_size, flags.num_agents)
elif flags.agent_type == "ppo":
    agent = PPO_Agent(state_size, action_size, flags.num_agents)

if flags.load_model:
    start, eps = agent.load(project_root / 'checkpoints', 0, 1.0)
else:
    start, eps = 0, 1

if not flags.train:
    eps = 0.0

# Helper function to detect collisions
ACTIONS = {0: "up", 1: "right", 2: "down", 3: "left", 4: "stop"}
import gym
from dqn import DeepQNetwork, Agent
import numpy as np
from gym import wrappers

if __name__ == '__main__':
    env = gym.make('LunarLander-v2')
    brain = Agent(gamma=0.99, epsilon=1.0, n_actions=4, batch_size=128,
                  input_dims=[8], alpha=0.0003, replace=64)
    scores = []
    eps_history = []
    num_games = 500
    score = 0

    for i in range(num_games):
        if i % 10 == 0 and i > 0:
            avg_score = np.mean(scores[max(0, i - 10):(i + 1)])
            print('episode: ', i, 'score: ', score,
                  ' average score %.3f' % avg_score,
                  'epsilon %.3f' % brain.EPSILON)
        else:
            print('episode: ', i, 'score: ', score)
        eps_history.append(brain.EPSILON)
        done = False
        observation = env.reset()
import sys

import gym

from dqn import Agent

num_episodes = 5000

env_name = sys.argv[1] if len(sys.argv) > 1 else "MsPacman-v0"
env = gym.make(env_name)

agent = Agent(state_size=env.observation_space.shape,
              number_of_actions=env.action_space.n,
              save_name=env_name)

for e in range(num_episodes):
    observation = env.reset()
    done = False
    agent.new_episode()
    total_cost = 0.0
    total_reward = 0.0
    frame = 0
    while not done:
        frame += 1
        # env.render()
        action, values = agent.act(observation)
        # action = env.action_space.sample()
        observation, reward, done, info = env.step(action)
        total_cost += agent.observe(reward)
        total_reward += reward
    print("total reward", total_reward)
    print("mean cost", total_cost / frame)
# ------------------------------ Variable Declaration ----------------------------------
NUM_OF_ZOMBIES = 1
NUM_OF_VILLAGERS = 1

agent_host = MalmoPython.AgentHost()
malmoutils.parse_command_line(agent_host)

validate = True
num_reps = 300

# ======= core part initialization ====================================
# input size 5*5, you can change the size here
memory = MemoryD(5)
network_model, q_values_func = nn_model(input_shape=[5, 5])
agent = Agent(network_model, q_values_func, memory, 'train', 'ddqn')
# set learning rate to be 0.00025
agent.do_compile(optimizer=Adam(lr=0.00025), loss_func=mean_huber_loss)
agent.memoryD.clear()
# ===================================================================

for iRepeat in range(num_reps):
    my_mission_record = malmoutils.get_default_recording_object(
        agent_host, "./Mission_{}".format(iRepeat + 1))
    # my_mission_record = MalmoPython.MissionRecordSpec('./' + "Mission_" + str(iRepeat) + ".tgz")
    # my_mission_record.recordRewards()
    # my_mission_record.recordMP4(24,400000)
    # my_mission_record.recordObservations()
    my_mission = MalmoPython.MissionSpec(GetMissionXML(mapblock, agent_host),
                                         validate)
scores = []
epsHistory = []
numGames = 1
batch_size = 32
n_actions = 6
input_dims = (185, 95)
crop_start = (15, 30)
crop_end = (200, 125)
starting_epsilon = 0.05 if LOAD_MODEL else 1.0

env = gym.make('SpaceInvaders-v0')
brain = Agent(gamma=0.95, epsilon=starting_epsilon, lr=0.003,
              input_dims=input_dims, batch_size=batch_size,
              n_actions=n_actions, max_mem_size=5000, save_path='models/')

if LOAD_MODEL:
    brain.load()
else:
    # load memory with random games
    while brain.mem_cntr < brain.mem_size:
        observation = env.reset()
        observation = preprocess(observation, crop_start, crop_end)
        done = False
        while not done:
            # 0 no action, 1 fire, 2 move right, 3 move left, 4 move right fire, 5 move left fire
            action = env.action_space.sample()
import sys

import gym

from dqn import Agent

# Python 3 compatibility
try:
    xrange
except NameError:
    xrange = range

num_episodes = 20

env_name = sys.argv[1] if len(sys.argv) > 1 else "MsPacman-v0"
env = gym.make(env_name)

agent = Agent(state_size=env.observation_space.shape,
              number_of_actions=env.action_space.n,
              save_name=env_name)

for e in xrange(num_episodes):
    observation = env.reset()
    done = False
    agent.new_episode()
    total_cost = 0.0
    total_reward = 0.0
    frame = 0
    while not done:
        frame += 1
        # env.render()
        action, values = agent.act(observation)
        # action = env.action_space.sample()
        observation, reward, done, info = env.step(action)
def train(path, env):
    # env = Monitor(env, path, video_callable=video_callable, force=True)
    agent = Agent(env, path=path)
    agent.train()
    return agent
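# A minimal usage sketch; the environment id and output path are placeholder
# assumptions, not taken from the original source.
env = gym.make("CartPole-v0")
trained_agent = train("./runs/cartpole", env)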
import gym
from dqn import Agent
from utils import PlotLearning
import numpy as np

if __name__ == '__main__':
    env = gym.make('LunarLander-v2')
    agent = Agent(gamma=0.99, epsilon=1.0, batch_size=64, n_actions=4,
                  eps_end=0.01, inp_dims=[8], lr=0.001)
    scores, eps_history = [], []
    n_games = 500

    for i in range(n_games):
        score = 0
        done = False
        observation = env.reset()
        while not done:
            action = agent.choose_action(observation)
            observation_, reward, done, info = env.step(action)
            score += reward
            agent.store_transition(observation, action, reward,
                                   observation_, done)
            agent.learn()
            observation = observation_
        scores.append(score)
        eps_history.append(agent.epsilon)
def run(env: LlvmEnv) -> None:
    agent = Agent(n_actions=15, input_dims=[69])
    env.observation_space = "InstCountNorm"
    agent.Q_eval.load_state_dict(torch.load("./H10-N4000-INSTCOUNTNORM.pth"))
    rollout(agent, env)
from dqn import Agent
import numpy as np
import gym
import matplotlib.pyplot as plt

if __name__ == '__main__':
    env = gym.make('LunarLander-v2')
    n_games = 300
    show = False
    agent = Agent(gamma=0.99, epsilon=1.0, alpha=0.0005, input_dims=8,
                  n_actions=4, batch_size=64)
    scores = []
    eps_history = []

    for i in range(1, n_games + 1):
        done = False
        score = 0
        observation = env.reset()
        while not done:
            if show:
                env.render()
            action = agent.choose_action(observation)
            observation_, reward, done, info = env.step(action)
            score += reward
            agent.remember(observation, action, reward, observation_, done)
            observation = observation_
            agent.learn()
        eps_history.append(agent.epsilon)
def main():
    scores = []
    eps_history = []
    info_history = []

    # Random starting-points:
    env = sky.make(random=True, xi=(301, 650 - 25), yi=(100, 300 - 25),
                   width=15, height=15, v_initial=14)
    # Fixed starting-point:
    # env = sky.make(xi=550)

    agent = Agent(gamma=gamma, epsilon=epsilon, lr=lr,
                  input_dims=[imput_dimensions], n_actions=n_actions,
                  mem_size=mem_size, batch_size=batch_size,
                  epsilon_dec=epsilon_dec)

    if load_checkpoint:
        agent.load_models()

    for i in range(n_games):
        score = 0
        done = False
        observation = env.reset()
        while not done:
            ''' one game: ending, when done=True '''
            action = agent.choose_action(observation)
            observation_, reward, done, info = env.step(action)
            score += reward
            agent.store_transition(observation, action, reward, observation_,
                                   int(done))
            observation = observation_
            agent.learn()

        if i % 10 == 0 and i > 0:
            avg_score = np.mean(scores[max(0, i - 10):(i + 1)])
            print(i, 'episode', info, '|| score:', score,
                  '| average score: %.3f' % avg_score,
                  '| epsilon: %.3f' % agent.epsilon,
                  '| training done:', round(i / n_games, 2))
        else:
            print(i, 'episode', info, '|| score:', score)

        scores.append(score)
        eps_history.append(agent.epsilon)
        info_history.append(info)

    print('training ended with:',
          [[el, info_history.count(el)] for el in ('crashed', 'goal')])

    if save_checkpoint:
        agent.save_models()
        print('[+] model saved')

    # -------------------
    # Plotting and output
    # -------------------
    x = [i + 1 for i in range(n_games)]

    # First axis: Scores
    fig, ax1 = plt.subplots()
    color = 'tab:red'
    ax1.set_xlabel('Episode')
    ax1.set_ylabel('score per Episode', color=color)
    ax1.scatter(x, scores, color=color, s=2)
    ax1.tick_params(axis='y', labelcolor=color)

    # Second axis: epsilon
    ax2 = ax1.twinx()  # instantiate a second axes that shares the same x-axis
    color = 'tab:blue'
    ax2.set_ylabel('epsilon', color=color)  # we already handled the x-label with ax1
    ax2.plot(x, eps_history, color=color)
    ax2.tick_params(axis='y', labelcolor=color)

    # Output
    fig.tight_layout()  # otherwise the right y-label is slightly clipped
    plt.savefig(filename)

    return env
def main():
    # Initialize environment, agent
    env = gym.make(ENV_NAME)
    summary_writer = tf.summary.create_file_writer(LOG_DIR)
    agent = Agent(learning_rate=0.01, gamma=0.95,
                  state_shape=env.observation_space.shape, actions=7,
                  batch_size=64, epsilon_initial=0.9, epsilon_decay=1e-6,
                  epsilon_final=0.01, replay_buffer_capacity=1000000,
                  model_name=MODEL_FILE, model_dir=MODEL_DIR,
                  ckpt_dir=CHECKPOINTS_DIR, log_dir=LOG_DIR)
    scores = []

    for i in range(1, EPISODES + 1):
        done = False
        score = 0
        state = env.reset()
        steps_per_episode = 0

        # Play one episode
        while not done:
            # Choose action (epsilon greedy), and execute
            action = agent.select_action(state)
            next_state, reward, done, _ = env.step(action)
            score += reward

            # Store in experience replay buffer
            agent.store_experience(state, action, reward, next_state, done)
            state = next_state
            agent.train()
            steps_per_episode += 1

        if len(scores) == 100:
            scores.pop(0)
        scores.append(score)

        avg_score = np.mean(scores)
        min_score = np.min(scores)
        max_score = np.max(scores)
        print(f'Episode: {i}, Score {score:.2f}, Avg_score {avg_score:.2f}, '
              f'Epsilon {agent.epsilon:.2f}')

        # Summaries for Tensorboard
        write_summaries(summary_writer, {
            'epsilon': agent.epsilon,
            'reward.episode': score,
            'reward.avg': avg_score,
            'reward.min': min_score,
            'reward.max': max_score,
            'steps.count': steps_per_episode
        }, i, ENV_NAME)

        # Save the model
        if i % SAVE_INTERVAL == 0:
            print(f'Saving model to \'{MODEL_FILE}\' [Overwriting]')
            agent.save_model()

        # Save checkpoint
        if i % CHECKPOINT_INTERVAL == 0:
            print(f'Adding checkpoint: \'{CHECKPOINTS_DIR}/episode-{i}.h5\'')
            agent.save_checkpoint(f'episode-{i}')
state_shape = env.observation_space.shape  # the state space
action_shape = env.action_space.n  # the action space

# Testing memory storage and sampling
state = env.reset()
test_mem()
test_agent()

mem = Memory(10000, state_shape)
agent = Agent(state_shape, action_shape)
epsilon = 1
batch_size = 64
# action = env.action_space.sample()
# next_state, reward, done, info = env.step(action)

for game in range(n_games):
    state = env.reset()
    game_reward = 0
    for step in range(max_steps):
        # Render game
        if game % 10 == 0:
class RlBidAgent():

    def _load_config(self):
        """
        Parse the config.cfg file
        """
        cfg = configparser.ConfigParser(allow_no_value=True)
        env_dir = os.path.dirname(__file__)
        cfg.read(env_dir + '/config.cfg')
        self.budget = int(cfg['agent']['budget'])
        self.target_value = int(cfg['agent']['target_value'])
        self.T = int(cfg['rl_agent']['T'])  # T number of timesteps
        self.STATE_SIZE = int(cfg['rl_agent']['STATE_SIZE'])
        self.ACTION_SIZE = int(cfg['rl_agent']['ACTION_SIZE'])

    def __init__(self):
        self._load_config()
        # Control parameter used to scale bid price
        self.BETA = [-0.08, -0.03, -0.01, 0, 0.01, 0.03, 0.08]
        self.eps_start = 0.95
        self.eps_end = 0.05
        self.anneal = 0.00005
        self._reset_episode()
        # DQN network to learn the Q function
        self.dqn_agent = Agent(state_size=7, action_size=7, seed=0)
        # Reward network to learn the reward function
        self.reward_net = RewardNet(state_action_size=8, reward_size=1, seed=0)
        self.dqn_state = None
        self.dqn_action = 3  # no scaling
        self.dqn_reward = 0
        # Reward dictionary
        self.reward_dict = {}
        self.S = []
        self.V = 0
        self.total_wins = 0
        self.total_rewards = 0.0

    def _reset_episode(self):
        """
        Function to reset the state when the episode changes
        """
        self.t_step = 0                 # 1. t: the current time step
        self.budget_spend = 0.0
        self.rem_budget = self.budget   # 2. the remaining budget at time-step t
        self.ROL = self.T               # 3. the number of Lambda regulation opportunities left
        self.prev_budget = self.budget  # Bt-1
        self.BCR = 0                    # 4. Budget consumption rate:
                                        #    (self.budget - self.prev_budget) / self.prev_budget
        self.CPM = 0                    # 5. Cost per mille of impressions between t-1 and t:
                                        #    (self.prev_budget - self.running_budget) / self.cur_wins
        self.WR = 0                     # 6. wins_e / total_impressions
        self._reset_step()              # 7. Total value of the winning impressions 'click_prob'
        self.cur_day = 1
        self.cur_hour = 0
        self.ctl_lambda = 1.0           # Lambda sequential regulation parameter
        self.wins_e = 0
        self.eps = self.eps_start
        self.V = 0

    def _update_step(self):
        """
        Function to call to update the state with every bid request received
        for the state modeling
        """
        self.t_step += 1
        self.prev_budget = self.rem_budget
        self.rem_budget -= (self.cost_t / 1e9)
        self.ROL -= 1
        self.BCR = (self.rem_budget - self.prev_budget) / self.prev_budget
        self.CPM = self.cost_t
        self.WR = self.wins_t / self.bids_t

    def _reset_step(self):
        """
        Function to call every time a new time step is entered.
        """
        self.reward_t = 0.
        self.cost_t = 0.
        self.wins_t = 0
        self.bids_t = 0
        self.eps = max(self.eps_start - self.anneal * self.t_step, 0.05)

    def _update_reward_cost(self, reward, cost):
        """
        Internal function to update reward and cost so that the cumulative
        reward and cost within the given step can be computed.
        """
        self.reward_t += reward
        self.cost_t += cost
        self.bids_t += 1
        self.total_rewards += reward

    def _get_state(self):
        """
        Returns the state that will be used as the DQN input.
        """
        return np.asarray([
            self.t_step, self.rem_budget, self.ROL, self.BCR, self.CPM,
            self.WR, self.reward_t
        ])

    def act(self, state, reward, cost):
        """
        This function gets called with every bid request. It uses the weekday
        and hour of the request to progress between time steps and episodes
        during training. Returns the bid amount, based on the bid price scaled
        by the DQN agent's output.
""" episode_done = (state['weekday'] != self.cur_day) # within the time step if state['hour'] == self.cur_hour and state['weekday'] == self.cur_day: self._update_reward_cost(reward, cost) # within the episode, changing the time step elif state['hour'] != self.cur_hour and state[ 'weekday'] == self.cur_day: self._update_step() # Sample a mini batch and perform grad-descent step self.reward_net.step() dqn_next_state = self._get_state() a_beta = self.dqn_agent.act(dqn_next_state, eps=self.eps) sa = np.append(self.dqn_state, self.dqn_action) rnet_r = float(self.reward_net.act(sa)) # call agent step self.dqn_agent.step(self.dqn_state, self.dqn_action, rnet_r, dqn_next_state, episode_done) self.dqn_state = dqn_next_state self.dqn_action = a_beta # print(dqn_next_state, a_beta) self.ctl_lambda *= (1 + self.BETA[a_beta]) self.cur_hour = state['hour'] self._reset_step() self._update_reward_cost(reward, cost) self.V += self.reward_t self.S.append((self.dqn_state, self.dqn_action)) # episode changes elif state['weekday'] != self.cur_day: for (s, a) in self.S: sa = tuple(np.append(s, a)) max_r = max(self.reward_net.get_from_M(sa), self.V) self.reward_net.add_to_M(sa, max_r) self.reward_net.add(sa, max_r) print("Total Impressions won with Budget={} Spend={} wins = {}". format(self.budget, self.budget_spend, self.wins_e)) self.total_wins += self.wins_e self._reset_episode() self.cur_day = state['weekday'] self.cur_hour = state['hour'] self._update_reward_cost(reward, cost) # action = bid amount # send the best estimate of the bid self.budget_spend += (cost / 1e9) if cost > 0: self.wins_t += 1 self.wins_e += 1 action = min( self.ctl_lambda * self.target_value * state['click_prob'] * 1e9, (self.budget - self.budget_spend) * 1e9) return action def done(self): return self.budget <= self.budget_spend
class Neural_Agent:

    def __init__(self, bandit, epsilon, alpha, layersize=128, UI=1000,
                 gm=0.99, remember=False, algorithm='DQNxR'):
        self.size = bandit.nvot
        if algorithm == 'DQNxR':
            seed = np.random.rand()  # DOESN'T DO ANYTHING
            self.DQN = DQNxR(state_size=self.size, action_size=bandit.N,
                             seed=seed, alpha=alpha, UI=UI, batch_size=10,
                             gamma=gm, tau=1e-3, buffer_size=int(1e5))
            # print(vars(self.DQN))
            self.epsilon = epsilon
            self.last_state = None
            self.remember = remember
        elif algorithm == 'policygrad':
            self.DQN = None
            self.policy = PolicyGrad(state_space=self.size,
                                     action_space=bandit.N,
                                     hidden_layer_size=layersize, gamma=gm)
            self.optimizer = optim.Adam(self.policy.parameters(), lr=alpha)
            self.update_interval = UI
            self.remember = remember

    # POLICY GRADIENT
    def select_action(self, state):
        # Select an action by running the policy model and sampling from the
        # action probabilities it outputs for this state
        state = torch.from_numpy(state).type(torch.FloatTensor)
        state = self.policy(Variable(state))
        c = Categorical(state)
        action = c.sample()

        # Add log probability of our chosen action to our history
        if self.policy.policy_history.dim() != 0:
            # print(policy.policy_history)
            # print(c.log_prob(action))
            self.policy.policy_history = torch.cat(
                [self.policy.policy_history, c.log_prob(action).unsqueeze(0)])
            # print("DID!")
        else:
            self.policy.policy_history = (c.log_prob(action))
        return action

    def update_policy(self):
        R = 0
        rewards = []
        # print(self.policy.reward_episode)

        # Discount future rewards back to the present using gamma
        for r in self.policy.reward_episode[::-1]:
            R = r + self.policy.gamma * R
            rewards.insert(0, R)

        # Scale rewards
        rewards = torch.FloatTensor(rewards)
        rewards = (rewards - rewards.mean()) / (rewards.std() +
                                                np.finfo(np.float32).eps)

        # Calculate loss
        loss = (torch.sum(
            torch.mul(self.policy.policy_history, Variable(rewards)).mul(-1),
            -1))

        # Update network weights
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # self.policy.loss_history.append(loss.data.item())
        # self.policy.reward_history.append(np.sum(policy.reward_episode))
        self.policy.policy_history = Variable(torch.Tensor())
        self.policy.reward_episode = []

    # UNIVERSAL
    def update_Q(self, action, reward):
        if self.DQN is not None:
            self.AR = (action, reward)
        else:
            if len(self.policy.reward_episode) == self.update_interval:
                self.policy.reward_episode.append(reward)
                self.update_policy()
            else:
                self.policy.reward_episode.append(reward)

    def get_action(self, bandit, actnum, decline, N_episodes):
        if self.remember == False:
            state = np.ones(self.size) / 100
        elif self.remember == "Rewards":
            state_info = bandit.last_rewards
            state = np.array(state_info)
            # print(actnum, state)
        elif self.remember == "Actions":
            state_info = bandit.last_actions
            state = np.array(state_info)
        elif self.remember == "Actions_now":
            state = bandit.partial_result

        if self.DQN is not None:
            if self.last_state is not None:
                # print(actnum, self.last_state, self.AR[0], self.AR[1], state)
                self.DQN.step(self.last_state, self.AR[0], self.AR[1], state,
                              done=False)
                # print(self.last_state, self.AR[0], self.AR[1], state)
            actnum = self.DQN.act(state, self.epsilon).item()
            self.last_state = state
        else:
            actnum = self.select_action(state).item()
            # print(state, actnum)
        return actnum
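# A rough interaction sketch (assumption, not from the original source): `bandit`
# must expose the attributes the class reads (nvot, N, last_rewards, ...), and
# `bandit.pull(action)` is a hypothetical reward interface; `decline` is passed
# through unchanged here.
agent = Neural_Agent(bandit, epsilon=0.1, alpha=1e-3, remember="Rewards")
action = 0
for episode in range(N_episodes):
    action = agent.get_action(bandit, action, decline=None, N_episodes=N_episodes)
    reward = bandit.pull(action)  # hypothetical
    agent.update_Q(action, reward)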
def main(argv):
    # Set seeds
    np.random.seed(FLAGS.seed)
    t.manual_seed(FLAGS.seed)

    # Create logfile
    f = create_exp_logfile(os.path.join(FLAGS.exp_log_dir,
                                        str(FLAGS.learning_rate),
                                        str(FLAGS.seed)))

    # Initialise agent and environment
    env = LunarLander()
    num_actions = env.num_actions()
    agent = Agent(body_type='ff',
                  obs_num_features_or_obs_in_channels=FLAGS.observation_dimensions,
                  fc_hidden_layer_size=FLAGS.fc_hidden_layer_size,
                  output_actions=num_actions,
                  use_target_net=FLAGS.use_target_net,
                  g=FLAGS.gamma,
                  lr=FLAGS.learning_rate)

    # Initialise data structures
    c_buf = CircularBuffer(size=FLAGS.cb_size)
    er_buf = ExperienceReplayBuffer(size=FLAGS.er_size,
                                    batch_size=FLAGS.batch_size)

    # Initialise sampling range for e-greedy
    interval = t.distributions.uniform.Uniform(t.tensor([0.0]), t.tensor([1.0]))

    # Run
    step = 0
    episode_results = []
    state = env.reset()
    c_buf.append(t.from_numpy(state).float())

    while step < FLAGS.max_steps:
        # Agent selects an action (epsilon-greedy with linearly annealed epsilon)
        eps = max(FLAGS.init_epsilon -
                  (((FLAGS.init_epsilon - FLAGS.final_epsilon) /
                    FLAGS.epsilon_anneal) * step),
                  FLAGS.final_epsilon)
        if interval.sample() <= eps:
            action = np.random.randint(num_actions)
        else:
            action = agent.greedy_action(c_buf()).item()

        reward, next_state, terminal = env.act(action)
        terminal = 1 if terminal else 0
        er_buf.append(state, action, reward, next_state, terminal)
        state = next_state
        c_buf.append(t.from_numpy(state).float())

        if step > FLAGS.batch_size and step % FLAGS.update_frequency:
            batch_states, batch_actions, batch_rewards, batch_next_states, batch_terminals = \
                er_buf.sample()
            batch_states = t.from_numpy(batch_states).float()
            batch_actions = np.array(batch_actions)
            batch_rewards = np.array(batch_rewards)
            batch_next_states = t.from_numpy(batch_next_states).float()
            batch_terminals = np.array(batch_terminals)
            agent.optimise(batch_states, batch_actions, batch_rewards,
                           batch_next_states, batch_terminals)

        if step % FLAGS.target_network_update == 0:
            agent.sync()

        if terminal:
            episode_results.append(env.episode_return())
            state = env.reset()

        step += 1

        if step % FLAGS.evaluate == 0:
            f.write('{}, {}\n'.format(
                step, performance_avg(episode_results,
                                      FLAGS.num_episodes_average)))
            f.flush()

    f.close()
import gym
from keras.models import load_model

from dqn import Agent

env_name = 'CartPole-v0'
eps = 0.8
episodes = 5

env = gym.make(env_name)
model = load_model('./model/my_model.h5')
agent = Agent(env)

for episode in range(episodes):
    # initial state
    s = env.reset()
    done = False
    while not done:
        for i in range(50):
            a = agent.act(s, eps)
            env.render(a)
            s2, r, done, info = env.step(a)
            s = s2

env.close()