import gym


def start():
    env = gym.make('CartPole-v0')
    params = {
        'gamma': 0.8,
        'epsi_high': 0.9,
        'epsi_low': 0.05,
        'decay': 500,
        'lr': 0.001,
        'capacity': 10000,
        'batch_size': 64,
        'state_space_dim': env.observation_space.shape[0],
        'action_space_dim': env.action_space.n
    }
    agent = Agent(**params)  # Agent: DQN agent defined elsewhere in this project

    score = []
    mean = []
    for episode in range(1000):
        s0 = env.reset()
        total_reward = 1
        for i in range(200):
            env.render()
            a0 = agent.act(s0)
            s1, r1, done, _ = env.step(a0)
            if done:
                r1 = -1
            agent.put(s0, a0, r1, s1)
            if done:
                break
            total_reward += r1
            s0 = s1
            agent.learn()
        score.append(total_reward)
        mean.append(sum(score[-100:]) / 100)
        print(total_reward)
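# The loop above assumes an `Agent` exposing act(state), put(s, a, r, s_next)
# and learn(), configured by the `params` dict. The class below is a minimal
# sketch of such an interface for illustration only -- the class name, network
# shape, and hyperparameter handling are assumptions, not the original code.
import random
from collections import deque

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim


class SketchAgent:
    def __init__(self, gamma, epsi_high, epsi_low, decay, lr, capacity,
                 batch_size, state_space_dim, action_space_dim):
        self.gamma = gamma
        self.epsi_high, self.epsi_low, self.decay = epsi_high, epsi_low, decay
        self.batch_size = batch_size
        self.action_dim = action_space_dim
        self.steps = 0
        self.buffer = deque(maxlen=capacity)
        self.net = nn.Sequential(nn.Linear(state_space_dim, 64), nn.ReLU(),
                                 nn.Linear(64, action_space_dim))
        self.opt = optim.Adam(self.net.parameters(), lr=lr)

    def act(self, state):
        # Epsilon-greedy selection with exponentially annealed epsilon
        self.steps += 1
        eps = self.epsi_low + (self.epsi_high - self.epsi_low) * np.exp(-self.steps / self.decay)
        if random.random() < eps:
            return random.randrange(self.action_dim)
        with torch.no_grad():
            q = self.net(torch.as_tensor(state, dtype=torch.float32))
        return int(q.argmax().item())

    def put(self, s0, a0, r1, s1):
        # Store one transition in the replay buffer
        self.buffer.append((s0, a0, r1, s1))

    def learn(self):
        # One TD(0) update on a randomly sampled mini-batch
        if len(self.buffer) < self.batch_size:
            return
        batch = random.sample(self.buffer, self.batch_size)
        s0, a0, r1, s1 = map(np.array, zip(*batch))
        s0 = torch.as_tensor(s0, dtype=torch.float32)
        a0 = torch.as_tensor(a0, dtype=torch.int64).unsqueeze(1)
        r1 = torch.as_tensor(r1, dtype=torch.float32)
        s1 = torch.as_tensor(s1, dtype=torch.float32)
        target = r1 + self.gamma * self.net(s1).max(dim=1).values.detach()
        q = self.net(s0).gather(1, a0).squeeze(1)
        loss = nn.functional.mse_loss(q, target)
        self.opt.zero_grad()
        loss.backward()
        self.opt.step()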
class RlBidAgent():

    def _load_config(self):
        """
        Parse the config.cfg file
        """
        cfg = configparser.ConfigParser(allow_no_value=True)
        env_dir = os.path.dirname(__file__)
        cfg.read(env_dir + '/config.cfg')
        self.budget = int(cfg['agent']['budget'])
        self.target_value = int(cfg['agent']['target_value'])
        self.T = int(cfg['rl_agent']['T'])  # T: number of time steps
        self.STATE_SIZE = int(cfg['rl_agent']['STATE_SIZE'])
        self.ACTION_SIZE = int(cfg['rl_agent']['ACTION_SIZE'])

    def __init__(self):
        self._load_config()
        # Control parameters used to scale the bid price
        self.BETA = [-0.08, -0.03, -0.01, 0, 0.01, 0.03, 0.08]
        self.eps_start = 0.95
        self.eps_end = 0.05
        self.anneal = 0.00005
        self._reset_episode()
        # DQN network to learn the Q function
        self.dqn_agent = Agent(state_size=7, action_size=7, seed=0)
        # Reward network to learn the reward function
        self.reward_net = RewardNet(state_action_size=8, reward_size=1, seed=0)
        self.dqn_state = None
        self.dqn_action = 3  # no scaling
        self.dqn_reward = 0
        # Reward dictionary
        self.reward_dict = {}
        self.S = []
        self.V = 0
        self.total_wins = 0
        self.total_rewards = 0.0

    def _reset_episode(self):
        """
        Function to reset the state when the episode changes
        """
        self.t_step = 0                 # 1. t: the current time step
        self.budget_spend = 0.0
        self.rem_budget = self.budget   # 2. the remaining budget at time step t
        self.ROL = self.T               # 3. the number of lambda regulation opportunities left
        self.prev_budget = self.budget  # B_{t-1}
        self.BCR = 0                    # 4. budget consumption rate:
                                        #    (rem_budget - prev_budget) / prev_budget
        self.CPM = 0                    # 5. cost per mille of impressions won between t-1 and t
        self.WR = 0                     # 6. win rate: wins_t / bids_t
        self._reset_step()              # 7. reward_t: total value ('click_prob') of the winning impressions
        self.cur_day = 1
        self.cur_hour = 0
        self.ctl_lambda = 1.0           # lambda sequential regulation parameter
        self.wins_e = 0
        self.eps = self.eps_start
        self.V = 0

    def _update_step(self):
        """
        Function to call to update the state with every bid request
        received, for the state modeling
        """
        self.t_step += 1
        self.prev_budget = self.rem_budget
        self.rem_budget -= (self.cost_t / 1e9)
        self.ROL -= 1
        self.BCR = (self.rem_budget - self.prev_budget) / self.prev_budget
        self.CPM = self.cost_t
        self.WR = self.wins_t / self.bids_t

    def _reset_step(self):
        """
        Function to call every time a new time step is entered.
        """
        self.reward_t = 0.
        self.cost_t = 0.
        self.wins_t = 0
        self.bids_t = 0
        self.eps = max(self.eps_start - self.anneal * self.t_step, 0.05)

    def _update_reward_cost(self, reward, cost):
        """
        Internal function to accumulate the reward and cost
        within the current time step.
        """
        self.reward_t += reward
        self.cost_t += cost
        self.bids_t += 1
        self.total_rewards += reward

    def _get_state(self):
        """
        Returns the state vector that will be fed to the DQN.
        """
        return np.asarray([
            self.t_step,
            self.rem_budget,
            self.ROL,
            self.BCR,
            self.CPM,
            self.WR,
            self.reward_t
        ])

    def act(self, state, reward, cost):
        """
        This function gets called with every bid request. It looks at the
        weekday and hour of the request to progress between the time steps
        and episodes during training. Returns the bid price based on a
        scaled version of the bid computed from the DQN agent output.
""" episode_done = (state['weekday'] != self.cur_day) # within the time step if state['hour'] == self.cur_hour and state['weekday'] == self.cur_day: self._update_reward_cost(reward, cost) # within the episode, changing the time step elif state['hour'] != self.cur_hour and state[ 'weekday'] == self.cur_day: self._update_step() # Sample a mini batch and perform grad-descent step self.reward_net.step() dqn_next_state = self._get_state() a_beta = self.dqn_agent.act(dqn_next_state, eps=self.eps) sa = np.append(self.dqn_state, self.dqn_action) rnet_r = float(self.reward_net.act(sa)) # call agent step self.dqn_agent.step(self.dqn_state, self.dqn_action, rnet_r, dqn_next_state, episode_done) self.dqn_state = dqn_next_state self.dqn_action = a_beta # print(dqn_next_state, a_beta) self.ctl_lambda *= (1 + self.BETA[a_beta]) self.cur_hour = state['hour'] self._reset_step() self._update_reward_cost(reward, cost) self.V += self.reward_t self.S.append((self.dqn_state, self.dqn_action)) # episode changes elif state['weekday'] != self.cur_day: for (s, a) in self.S: sa = tuple(np.append(s, a)) max_r = max(self.reward_net.get_from_M(sa), self.V) self.reward_net.add_to_M(sa, max_r) self.reward_net.add(sa, max_r) print("Total Impressions won with Budget={} Spend={} wins = {}". format(self.budget, self.budget_spend, self.wins_e)) self.total_wins += self.wins_e self._reset_episode() self.cur_day = state['weekday'] self.cur_hour = state['hour'] self._update_reward_cost(reward, cost) # action = bid amount # send the best estimate of the bid self.budget_spend += (cost / 1e9) if cost > 0: self.wins_t += 1 self.wins_e += 1 action = min( self.ctl_lambda * self.target_value * state['click_prob'] * 1e9, (self.budget - self.budget_spend) * 1e9) return action def done(self): return self.budget <= self.budget_spend
env._render()
steps_taken += 1
update_values = [False] * flags.num_agents
action_dict = {}
# Update the policy of each agent
# TODO UPDATE
# for a in range(flags.num_agents):
#     if info['action_required'][a]:
#         action_dict[a] = agent.act(agent_obs[a], eps=eps)
#         # action_dict[a] = np.random.randint(5)
#         update_values[a] = True
#         steps_taken += 1
#     else:
#         action_dict[a] = 0
action_dict[a] = agent.act(agent_obs[0], eps=eps)
joint_action = action_wrapper(action_dict)
obs, rewards, done, info = env.step(joint_action)
# if rewards[0][0] != 0 or rewards[0][1] != 0:
score += rewards[0][0]
# if score != 0:

# Check for collisions and episode completion
# if step == max_steps - 1:
#     done['__all__'] = True

# Update replay buffer and train agent
for a in range(flags.num_agents):
import sys
import gym
from dqn import Agent

num_episodes = 5000

env_name = sys.argv[1] if len(sys.argv) > 1 else "MsPacman-v0"
env = gym.make(env_name)

agent = Agent(state_size=env.observation_space.shape,
              number_of_actions=env.action_space.n,
              save_name=env_name)

for e in xrange(num_episodes):
    observation = env.reset()
    done = False
    agent.new_episode()
    total_cost = 0.0
    total_reward = 0.0
    frame = 0
    while not done:
        frame += 1
        # env.render()
        action, values = agent.act(observation)
        # action = env.action_space.sample()
        observation, reward, done, info = env.step(action)
        total_cost += agent.observe(reward)
        total_reward += reward
    print "total reward", total_reward
    print "mean cost", total_cost / frame
import gym
from keras.models import load_model
from dqn import Agent

env_name = 'CartPole-v0'
eps = 0.8
episodes = 5

env = gym.make(env_name)
# NOTE: the model is loaded here but never passed to the Agent in this snippet
model = load_model('./model/my_model.h5')
agent = Agent(env)

for episode in range(episodes):
    # initial state
    s = env.reset()
    done = False
    while not done:
        for i in range(50):
            a = agent.act(s, eps)
            env.render()
            s2, r, done, info = env.step(a)
            s = s2
            if done:
                break
env.close()
class Neural_Agent:

    def __init__(self, bandit, epsilon, alpha, layersize=128, UI=1000,
                 gm=0.99, remember=False, algorithm='DQNxR'):
        self.size = bandit.nvot
        if algorithm == 'DQNxR':
            seed = np.random.rand()  # DOESN'T DO ANYTHING
            self.DQN = DQNxR(state_size=self.size,
                             action_size=bandit.N,
                             seed=seed,
                             alpha=alpha,
                             UI=UI,
                             batch_size=10,
                             gamma=gm,
                             tau=1e-3,
                             buffer_size=int(1e5))
            # print(vars(self.DQN))
            self.epsilon = epsilon
            self.last_state = None
            self.remember = remember
        elif algorithm == 'policygrad':
            self.DQN = None
            self.policy = PolicyGrad(state_space=self.size,
                                     action_space=bandit.N,
                                     hidden_layer_size=layersize,
                                     gamma=gm)
            self.optimizer = optim.Adam(self.policy.parameters(), lr=alpha)
            self.update_interval = UI
            self.remember = remember

    # POLICY GRADIENT
    def select_action(self, state):
        # Select an action by running the policy model and sampling from the
        # action probabilities it outputs for this state
        state = torch.from_numpy(state).type(torch.FloatTensor)
        state = self.policy(Variable(state))
        c = Categorical(state)
        action = c.sample()

        # Add log probability of our chosen action to our history
        if self.policy.policy_history.dim() != 0:
            # print(policy.policy_history)
            # print(c.log_prob(action))
            self.policy.policy_history = torch.cat(
                [self.policy.policy_history, c.log_prob(action).unsqueeze(0)])
            # print("DID!")
        else:
            self.policy.policy_history = (c.log_prob(action))
        return action

    def update_policy(self):
        R = 0
        rewards = []
        # print(self.policy.reward_episode)

        # Discount future rewards back to the present using gamma
        for r in self.policy.reward_episode[::-1]:
            R = r + self.policy.gamma * R
            rewards.insert(0, R)

        # Scale rewards
        rewards = torch.FloatTensor(rewards)
        rewards = (rewards - rewards.mean()) / (rewards.std() + np.finfo(np.float32).eps)

        # Calculate loss
        loss = (torch.sum(
            torch.mul(self.policy.policy_history, Variable(rewards)).mul(-1), -1))

        # Update network weights
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # self.policy.loss_history.append(loss.data.item())
        # self.policy.reward_history.append(np.sum(policy.reward_episode))
        self.policy.policy_history = Variable(torch.Tensor())
        self.policy.reward_episode = []

    # UNIVERSAL
    def update_Q(self, action, reward):
        if self.DQN is not None:
            self.AR = (action, reward)
        else:
            if len(self.policy.reward_episode) == self.update_interval:
                self.policy.reward_episode.append(reward)
                self.update_policy()
            else:
                self.policy.reward_episode.append(reward)

    def get_action(self, bandit, actnum, decline, N_episodes):
        if self.remember == False:
            state = np.ones(self.size) / 100
        elif self.remember == "Rewards":
            state_info = bandit.last_rewards
            state = np.array(state_info)
            # print(actnum, state)
        elif self.remember == "Actions":
            state_info = bandit.last_actions
            state = np.array(state_info)
        elif self.remember == "Actions_now":
            state = bandit.partial_result

        if self.DQN is not None:
            if self.last_state is not None:
                # print(actnum, self.last_state, self.AR[0], self.AR[1], state)
                self.DQN.step(self.last_state, self.AR[0], self.AR[1], state,
                              done=False)
                # print(self.last_state, self.AR[0], self.AR[1], state)
            actnum = self.DQN.act(state, self.epsilon).item()
            self.last_state = state
        else:
            actnum = self.select_action(state).item()
            # print(state, actnum)
        return actnum
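# Illustrative usage of Neural_Agent in a bandit loop. The bandit interface
# assumed here (nvot, N, last_rewards, and a pull() method returning a reward)
# is inferred from the constructor and get_action() above; the function name
# and loop below are placeholders, not the original experiment code.
def run_bandit(bandit, n_episodes=1000):
    agent = Neural_Agent(bandit, epsilon=0.1, alpha=1e-3,
                         algorithm='DQNxR', remember="Rewards")
    for episode in range(n_episodes):
        # actnum/decline/N_episodes are passed through unused by the code shown
        action = agent.get_action(bandit, actnum=None, decline=None,
                                  N_episodes=n_episodes)
        reward = bandit.pull(action)  # assumed bandit API
        agent.update_Q(action, reward)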
    'capacity': 10000,
    'batch_size': 64,
    'state_space_dim': env.observation_space.shape[0],
    'action_space_dim': env.action_space.n
}
agent = Agent(**params)

score = []
mean = []
for episode in range(1000):
    s0 = env.reset()
    total_reward = 1
    while True:
        env.render()
        a0 = agent.act(s0)
        s1, r1, done, _ = env.step(a0)
        if done:
            r1 = -1
        agent.put(s0, a0, r1, s1)
        if done:
            break
        total_reward += r1
        s0 = s1
        agent.learn()
    score.append(total_reward)
try:
    xrange
except NameError:
    xrange = range

num_episodes = 20

env_name = sys.argv[1] if len(sys.argv) > 1 else "MsPacman-v0"
env = gym.make(env_name)

agent = Agent(state_size=env.observation_space.shape,
              number_of_actions=env.action_space.n,
              save_name=env_name)

for e in xrange(num_episodes):
    observation = env.reset()
    done = False
    agent.new_episode()
    total_cost = 0.0
    total_reward = 0.0
    frame = 0
    while not done:
        frame += 1
        # env.render()
        action, values = agent.act(observation)
        # action = env.action_space.sample()
        observation, reward, done, info = env.step(action)
        total_cost += agent.observe(reward)
        total_reward += reward
    print("total reward", total_reward)
    print("mean cost", total_cost / frame)
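# The `dqn.Agent` used above is not shown here; from the calls in this loop its
# interface is roughly the stub below. The method names follow the usage, but
# the docstrings describe assumed behaviour and the bodies are placeholders.
class AgentStub(object):
    def __init__(self, state_size, number_of_actions, save_name):
        ...

    def new_episode(self):
        """Reset per-episode bookkeeping (e.g. frame history)."""

    def act(self, observation):
        """Return (action, value_estimates) for the current observation."""

    def observe(self, reward):
        """Store the reward, possibly train, and return the training cost."""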
reward_list = []
grasp_success_list = []
grasp_success_rate_list = []

for e in range(1, EPISODES + 1):
    dist_list = []
    log('###################################################')
    log('#################### EPISODE ' + str(e) + ' ' + '#' * (20 - int(math.log(e, 10))))
    log('###################################################')
    state = env.reset()
    object_state = np.reshape(state[0], [1, state_dim[0]])
    arm_state = np.reshape(state[1], [1, state_dim[1]])
    state = (object_state, arm_state)
    total_reward = 0
    for t in range(1, TIME_STEPS + 1):
        action = dqn_agent.act(state)
        # log('############### ITERATION ' + str(t) + ' ' + '#' * (15 - int(math.log(t, 10))))
        state_next, reward, terminal, info, next_distance, successful_grasping = env.step(action)
        total_reward += reward
        dist_list.append(next_distance)
        object_state_next = np.reshape(state_next[0], [1, state_dim[0]])
        arm_state_next = np.reshape(state_next[1], [1, state_dim[1]])
        state_next = (object_state_next, arm_state_next)
        # log('State: ' + str(state))
        # log('Action: ' + str(action))
        # log('Reward: ' + str(reward))
        # log('Next State: ' + str(state_next))
        # log('Done: ' + str(terminal))
        dqn_agent.remember(state, action, reward, state_next, terminal)
        state = state_next
# plot the scores
fig = plt.figure()
ax = fig.add_subplot(111)
plt.plot(np.arange(len(scores)), scores)
plt.ylabel('Score')
plt.xlabel('Episode #')
plt.show()

# Play the Trained Agent

# load the weights from file
agent.qnetwork_local.load_state_dict(
    torch.load('checkpoint.pth', map_location=lambda storage, loc: storage))

env_info = env.reset(train_mode=False)[brain_name]   # reset the environment
state = env_info.vector_observations[0]              # get the current state
score = 0                                            # initialize the score

while True:
    action = agent.act(state)                        # select an action
    # print(action)
    env_info = env.step(vector_action=[action])      # send the action to the environment
    next_state = env_info[brain_name].vector_observations[0]  # get the next state
    reward = env_info[brain_name].rewards[0]         # get the reward
    done = env_info[brain_name].local_done[0]        # see if the episode has finished
    score += reward                                  # update the score
    state = next_state                               # roll over the state to the next time step
    if done:                                         # exit loop if the episode finished
        break

print("End Score: {}".format(score))
env.close()