Example #1
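# Assumes `import gym` and a DQN `Agent` class from the surrounding project
# (neither is shown in this snippet).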
def start():
    env = gym.make('CartPole-v0')

    params = {
        'gamma': 0.8,
        'epsi_high': 0.9,
        'epsi_low': 0.05,
        'decay': 500,
        'lr': 0.001,
        'capacity': 10000,
        'batch_size': 64,
        'state_space_dim': env.observation_space.shape[0],
        'action_space_dim': env.action_space.n
    }
    agent = Agent(**params)

    score = []
    mean = []

    for episode in range(1000):
        s0 = env.reset()
        total_reward = 1  # starts at 1: the terminal step's reward is replaced with -1 below and never added
        for i in range(200):
            env.render()
            a0 = agent.act(s0)
            s1, r1, done, _ = env.step(a0)

            if done:
                r1 = -1

            agent.put(s0, a0, r1, s1)

            if done:
                break

            total_reward += r1
            s0 = s1
            agent.learn()

        score.append(total_reward)
        mean.append(sum(score[-100:]) / len(score[-100:]))  # running mean over the last (up to) 100 episodes
        print(total_reward)
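Example #2
# Assumed imports for this class (not shown in the snippet): configparser, os,
# numpy as np, and the project's DQN Agent and RewardNet classes.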
class RlBidAgent():
    def _load_config(self):
        """
        Parse the config.cfg file
        """
        cfg = configparser.ConfigParser(allow_no_value=True)
        env_dir = os.path.dirname(__file__)
        cfg.read(env_dir + '/config.cfg')
        self.budget = int(cfg['agent']['budget'])
        self.target_value = int(cfg['agent']['target_value'])
        self.T = int(cfg['rl_agent']['T'])  # T number of timesteps
        self.STATE_SIZE = int(cfg['rl_agent']['STATE_SIZE'])
        self.ACTION_SIZE = int(cfg['rl_agent']['ACTION_SIZE'])

    def __init__(self):
        self._load_config()
        # Control parameter used to scale bid price
        self.BETA = [-0.08, -0.03, -0.01, 0, 0.01, 0.03, 0.08]
        self.eps_start = 0.95
        self.eps_end = 0.05
        self.anneal = 0.00005
        self._reset_episode()
        # DQN Network to learn Q function
        self.dqn_agent = Agent(state_size=7, action_size=7, seed=0)
        # Reward network to learn the reward function
        self.reward_net = RewardNet(state_action_size=8, reward_size=1, seed=0)
        self.dqn_state = None
        self.dqn_action = 3  # no scaling
        self.dqn_reward = 0
        # Reward-Dictionary
        self.reward_dict = {}
        self.S = []
        self.V = 0
        self.total_wins = 0
        self.total_rewards = 0.0

    def _reset_episode(self):
        """
        Function to reset the state when episode changes
        """
        self.t_step = 0  # 1. t: the current time step
        self.budget_spend = 0.0
        self.rem_budget = self.budget  # 2. the remaining budget at time-step t
        self.ROL = self.T  # 3. the number of Lambda regulation opportunities left
        self.prev_budget = self.budget  # Bt-1
        self.BCR = 0  # 4. Budget consumption rate
        #      (self.budget - self.prev_budget) / self.prev_budget
        self.CPM = 0  # 5. Cost per mille of impressions between t-1 and t
        #       (self.prev_budget - self.running_budget) / self.cur_wins
        self.WR = 0  # 6. wins_e / total_impressions
        self._reset_step()  # 7. Total value of the winning impressions ('click_prob')
        self.cur_day = 1
        self.cur_hour = 0
        self.ctl_lambda = 1.0  # Lambda sequential regulation parameter
        self.wins_e = 0
        self.eps = self.eps_start
        self.V = 0

    def _update_step(self):
        """
        Function to call to update the state with every bid request
        received for the state modeling
        """
        self.t_step += 1
        self.prev_budget = self.rem_budget
        self.rem_budget -= (self.cost_t / 1e9)
        self.ROL -= 1
        self.BCR = (self.rem_budget - self.prev_budget) / self.prev_budget
        self.CPM = self.cost_t
        self.WR = self.wins_t / self.bids_t

    def _reset_step(self):
        """
        Function to call every time a new time step is entered.
        """
        self.reward_t = 0.
        self.cost_t = 0.
        self.wins_t = 0
        self.bids_t = 0
        self.eps = max(self.eps_start - self.anneal * self.t_step, 0.05)

    def _update_reward_cost(self, reward, cost):
        """
        Internal helper that accumulates the reward, cost and bid count
        within the current time step.
        """
        self.reward_t += reward
        self.cost_t += cost
        self.bids_t += 1
        self.total_rewards += reward

    def _get_state(self):
        """
        Returns the state vector used as the DQN input.
        """
        return np.asarray([
            self.t_step, self.rem_budget, self.ROL, self.BCR, self.CPM,
            self.WR, self.reward_t
        ])

    def act(self, state, reward, cost):
        """
        Called with every bid request. Uses the request's weekday and hour to
        progress between time steps and episodes during training.
        Returns the bid price: the base bid scaled according to the DQN
        agent's output.
        """
        episode_done = (state['weekday'] != self.cur_day)
        # within the time step
        if state['hour'] == self.cur_hour and state['weekday'] == self.cur_day:
            self._update_reward_cost(reward, cost)
        # within the episode, changing the time step
        elif state['hour'] != self.cur_hour and state['weekday'] == self.cur_day:
            self._update_step()
            # Sample a mini batch and perform grad-descent step
            self.reward_net.step()
            dqn_next_state = self._get_state()
            a_beta = self.dqn_agent.act(dqn_next_state, eps=self.eps)
            sa = np.append(self.dqn_state, self.dqn_action)
            rnet_r = float(self.reward_net.act(sa))
            # call agent step
            self.dqn_agent.step(self.dqn_state, self.dqn_action, rnet_r,
                                dqn_next_state, episode_done)
            self.dqn_state = dqn_next_state
            self.dqn_action = a_beta
            # print(dqn_next_state, a_beta)
            self.ctl_lambda *= (1 + self.BETA[a_beta])
            self.cur_hour = state['hour']
            self._reset_step()
            self._update_reward_cost(reward, cost)
            self.V += self.reward_t
            self.S.append((self.dqn_state, self.dqn_action))
        # episode changes
        elif state['weekday'] != self.cur_day:
            for (s, a) in self.S:
                sa = tuple(np.append(s, a))
                max_r = max(self.reward_net.get_from_M(sa), self.V)
                self.reward_net.add_to_M(sa, max_r)
                self.reward_net.add(sa, max_r)
            print("Total Impressions won with Budget={} Spend={} wins = {}".
                  format(self.budget, self.budget_spend, self.wins_e))
            self.total_wins += self.wins_e
            self._reset_episode()
            self.cur_day = state['weekday']
            self.cur_hour = state['hour']
            self._update_reward_cost(reward, cost)

        # action = bid amount
        # send the best estimate of the bid
        self.budget_spend += (cost / 1e9)
        if cost > 0:
            self.wins_t += 1
            self.wins_e += 1
        action = min(
            self.ctl_lambda * self.target_value * state['click_prob'] * 1e9,
            (self.budget - self.budget_spend) * 1e9)
        return action

    def done(self):
        return self.budget <= self.budget_spend
Example #3
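        # Fragment taken from inside a multi-agent training loop; env, flags,
        # agent, agent_obs, eps, steps_taken, score, and action_wrapper are
        # assumed to be defined earlier in the source file.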
        env._render()
        steps_taken += 1
        update_values = [False] * flags.num_agents
        action_dict = {}

        # Update each agent's algorithm
        # TODO UPDATE
        # for a in range(flags.num_agents):
        #     if info['action_required'][a]:
        #           action_dict[a] = agent.act(agent_obs[a], eps=eps)
        #           # action_dict[a] = np.random.randint(5)
        #           update_values[a] = True
        #           steps_taken += 1
        #     else: action_dict[a] = 0

        action_dict[0] = agent.act(agent_obs[0], eps=eps)  # single-agent shortcut: only agent 0 acts here

        joint_action = action_wrapper(action_dict)

        obs, rewards, done, info = env.step(joint_action)
        # if rewards[0][0] != 0 or rewards[0][1] != 0:

        score += rewards[0][0]
        # if score != 0:

        # Check for collisions and episode completion
        # if step == max_steps - 1:
        #     done['__all__'] = True

        # Update replay buffer and train agent
        for a in range(flags.num_agents):
Example #4
import sys
import gym
from dqn import Agent

num_episodes = 5000

env_name = sys.argv[1] if len(sys.argv) > 1 else "MsPacman-v0"
env = gym.make(env_name)

agent = Agent(state_size=env.observation_space.shape,
              number_of_actions=env.action_space.n,
              save_name=env_name)

for e in range(num_episodes):
    observation = env.reset()
    done = False
    agent.new_episode()
    total_cost = 0.0
    total_reward = 0.0
    frame = 0
    while not done:
        frame += 1
        #env.render()
        action, values = agent.act(observation)
        #action = env.action_space.sample()
        observation, reward, done, info = env.step(action)
        total_cost += agent.observe(reward)
        total_reward += reward
    print "total reward", total_reward
    print "mean cost", total_cost / frame
Example #5
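# Assumes a trained Keras model saved at ./model/my_model.h5 and an Agent class
# provided by the project's local dqn module.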
import gym
from keras.models import load_model
from dqn import Agent

env_name = 'CartPole-v0'
eps = 0.8
episodes = 5
env = gym.make(env_name)
model = load_model('./model/my_model.h5')
agent = Agent(env)

for episode in range(episodes):
    # initial state
    s = env.reset()

    done = False
    while not done:
        for i in range(50):
            a = agent.act(s, eps)
            env.render()
            s2, r, done, info = env.step(a)
            s = s2
            if done:
                break
env.close()
Example #6
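# Bandit agent that wraps either a DQN learner (the project's DQNxR class) or a
# policy-gradient learner (PolicyGrad); torch, numpy as np, torch.optim as optim,
# Variable, and Categorical are assumed to be imported in the source file.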
class Neural_Agent:
    def __init__(self,
                 bandit,
                 epsilon,
                 alpha,
                 layersize=128,
                 UI=1000,
                 gm=0.99,
                 remember=False,
                 algorithm='DQNxR'):
        self.size = bandit.nvot
        if algorithm == 'DQNxR':
            seed = np.random.rand()  # note: this seed value has no effect

            self.DQN = DQNxR(state_size=self.size,
                             action_size=bandit.N,
                             seed=seed,
                             alpha=alpha,
                             UI=UI,
                             batch_size=10,
                             gamma=gm,
                             tau=1e-3,
                             buffer_size=int(1e5))
            #print(vars(self.DQN))
            self.epsilon = epsilon
            self.last_state = None
            self.remember = remember
        elif algorithm == 'policygrad':
            self.DQN = None
            self.policy = PolicyGrad(state_space=self.size,
                                     action_space=bandit.N,
                                     hidden_layer_size=layersize,
                                     gamma=gm)
            self.optimizer = optim.Adam(self.policy.parameters(), lr=alpha)
            self.update_interval = UI
            self.remember = remember

    # ----- policy-gradient methods -----

    def select_action(self, state):
        # Select an action by running the policy network and sampling from the
        # resulting action probabilities.
        state = torch.from_numpy(state).type(torch.FloatTensor)
        state = self.policy(Variable(state))
        c = Categorical(state)
        action = c.sample()

        # Add log probability of our chosen action to our history
        if self.policy.policy_history.dim() != 0:
            #print(policy.policy_history)
            #print(c.log_prob(action))
            self.policy.policy_history = torch.cat(
                [self.policy.policy_history,
                 c.log_prob(action).unsqueeze(0)])
            #print("DID!")
        else:
            self.policy.policy_history = (c.log_prob(action))
        return action

    def update_policy(self):
        R = 0
        rewards = []

        #print(self.policy.reward_episode)

        # Discount future rewards back to the present using gamma
        for r in self.policy.reward_episode[::-1]:
            R = r + self.policy.gamma * R
            rewards.insert(0, R)

        # Scale rewards
        rewards = torch.FloatTensor(rewards)
        rewards = (rewards - rewards.mean()) / (rewards.std() +
                                                np.finfo(np.float32).eps)

        # Calculate loss
        loss = (torch.sum(
            torch.mul(self.policy.policy_history, Variable(rewards)).mul(-1),
            -1))

        # Update network weights
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        #self.policy.loss_history.append(loss.data.item())
        #self.policy.reward_history.append(np.sum(policy.reward_episode))
        self.policy.policy_history = Variable(torch.Tensor())
        self.policy.reward_episode = []

    # ----- universal methods (used by both algorithms) -----

    def update_Q(self, action, reward):
        if self.DQN is not None:
            self.AR = (action, reward)
        else:
            if len(self.policy.reward_episode) == self.update_interval:
                self.policy.reward_episode.append(reward)
                self.update_policy()
            else:
                self.policy.reward_episode.append(reward)

    def get_action(self, bandit, actnum, decline, N_episodes):
        if self.remember == False:
            state = np.ones(self.size) / 100
        elif self.remember == "Rewards":
            state_info = bandit.last_rewards
            state = np.array(state_info)
            #print(actnum, state)
        elif self.remember == "Actions":
            state_info = bandit.last_actions
            state = np.array(state_info)
        elif self.remember == "Actions_now":
            state = bandit.partial_result

        if self.DQN is not None:
            if self.last_state is not None:
                #print(actnum,self.last_state,self.AR[0],self.AR[1],state)
                self.DQN.step(self.last_state,
                              self.AR[0],
                              self.AR[1],
                              state,
                              done=False)
                #print(self.last_state,self.AR[0],self.AR[1],state)

            actnum = self.DQN.act(state, self.epsilon).item()
            self.last_state = state
        else:
            actnum = self.select_action(state).item()

            #print(state, actnum)

        return actnum
Example #7
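        # Fragment: the beginning of this function (gym env creation and the
        # first entries of the params dict) is not shown; Example #1 contains a
        # fuller version of the same training loop.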
        'capacity': 10000,
        'batch_size': 64,
        'state_space_dim': env.observation_space.shape[0],
        'action_space_dim': env.action_space.n
    }
    agent = Agent(**params)

    score = []
    mean = []

    for episode in range(1000):
        s0 = env.reset()
        total_reward = 1
        while True:
            env.render()
            a0 = agent.act(s0)
            s1, r1, done, _ = env.step(a0)

            if done:
                r1 = -1

            agent.put(s0, a0, r1, s1)

            if done:
                break

            total_reward += r1
            s0 = s1
            agent.learn()

        score.append(total_reward)
Example #8
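# Assumed imports (truncated from the snippet): sys, gym, and Agent from the
# project's dqn module.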
try:
    xrange
except NameError:
    xrange = range

num_episodes = 20

env_name = sys.argv[1] if len(sys.argv) > 1 else "MsPacman-v0"
env = gym.make(env_name)

agent = Agent(state_size=env.observation_space.shape,
              number_of_actions=env.action_space.n,
              save_name=env_name)

for e in xrange(num_episodes):
    observation = env.reset()
    done = False
    agent.new_episode()
    total_cost = 0.0
    total_reward = 0.0
    frame = 0
    while not done:
        frame += 1
        #env.render()
        action, values = agent.act(observation)
        #action = env.action_space.sample()
        observation, reward, done, info = env.step(action)
        total_cost += agent.observe(reward)
        total_reward += reward
    print("total reward", total_reward)
    print("mean cost", total_cost/frame)
Example #9
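    # Fragment of what appears to be a robotic-grasping training loop; env,
    # dqn_agent, log, EPISODES, TIME_STEPS, and state_dim are assumed to be
    # defined earlier in the source file, along with imports of math and
    # numpy as np.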
    reward_list = []
    grasp_success_list = []
    grasp_success_rate_list = []

    for e in range(1, EPISODES+1):
        dist_list = []
        log('###################################################')
        log('#################### EPISODE ' + str(e) + ' ' + '#'*(20-int(math.log(e, 10))))
        log('###################################################')
        state = env.reset()
        object_state = np.reshape(state[0], [1, state_dim[0]])
        arm_state = np.reshape(state[1], [1, state_dim[1]])
        state = (object_state, arm_state)
        total_reward = 0
        for t in range(1, TIME_STEPS+1):
            action = dqn_agent.act(state)
            # log('############### ITERATION ' + str(t) + ' ' + '#'*(15-int(math.log(t, 10))))
            state_next, reward, terminal, info, next_distance, successful_grasping = env.step(action)
            total_reward += reward
            dist_list.append(next_distance)
            object_state_next = np.reshape(state_next[0], [1, state_dim[0]])
            arm_state_next = np.reshape(state_next[1], [1, state_dim[1]])
            state_next = (object_state_next, arm_state_next)
            # log('State: ' + str(state))
            # log('Action: ' + str(action))
            # log('Reward: ' + str(reward))
            # log('Next State: ' + str(state_next))
            # log('Done: ' + str(terminal))
            dqn_agent.remember(state, action, reward, state_next, terminal)
            state = state_next
Example #10
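# Fragment that plots training scores and then runs the trained agent in what
# appears to be a Unity ML-Agents environment; env, brain_name, agent, and the
# scores list are assumed to be defined earlier, along with imports of
# matplotlib.pyplot as plt, numpy as np, and torch.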
# plot the scores
fig = plt.figure()
ax = fig.add_subplot(111)
plt.plot(np.arange(len(scores)), scores)
plt.ylabel('Score')
plt.xlabel('Episode #')
plt.show()

# Play the Trained Agent

# load the weights from file
agent.qnetwork_local.load_state_dict(torch.load('checkpoint.pth', map_location=lambda storage, loc: storage))

env_info = env.reset(train_mode=False)[brain_name] # reset the environment
state = env_info.vector_observations[0]            # get the current state
score = 0                                          # initialize the score
while True:
    action = agent.act(state)    # select an action 
    #print(action)        
    env_info = env.step(vector_action=[action])    # send the action to the environment                  
    next_state = env_info[brain_name].vector_observations[0]  # get the next state
    reward = env_info[brain_name].rewards[0]       # get the reward
    done = env_info[brain_name].local_done[0]      # see if episode has finished
    score += reward                                # update the score
    state = next_state                             # roll over the state to next time step
    if done:                                       # exit loop if episode finished
        break
    
print("End Score: {}".format(score))

env.close()