Example #1
def train_dqn(env, args):

    agent = DQN(env, args)
    agent.train()

    total_episodes = args.episodes
    max_steps = 10

    for episode in range(total_episodes):
        print(episode, agent.epsilon, end='\r')

        state = env.reset()
        done = False

        for step in range(max_steps):
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action)

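            # store the transition and run one learning step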
            agent.push(state, action, reward, next_state, done)
            agent.learn(episode)

            state = next_state

            if done:
                break

        if episode % 5 == 0:
            max_steps += 10

    return agent
Example #2
def main(argv):
    env_name = FLAGS.env_name
    env = gym.make(env_name)
    agent = DQN(env, load_path=f'train/{env_name}/')

    for episodes in range(FLAGS.num_episodes):
        done = False
        obs = env.reset()
        episode_reward = 0
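        # roll out one episode with rendering and accumulate the reward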
        while not done:
            env.render()
            action = agent.act(np.expand_dims(obs, axis=0))
            obs, rew, done, info = env.step(action)
            episode_reward += rew
        print(f'Episode Reward:{episode_reward}')
Example #3
                if game_params[curr_task]['pygame']:
                    env.reset_game()
                    state = env.getGameState()
                else:
                    state = env.reset()
                    state = np.reshape(state, [1, sim_params['num_inputs']])
                state = normalise_state(state, game_params[curr_task]['state_means'], game_params[curr_task]['state_stds'],
                                        task_id=arch_params['task_id'], num_tasks=sim_params['num_tasks'],curr_task=curr_task)

                total_r = 0
                done = False

                for t in range(sim_params['episode_length']):
                    # Choose action
                    action, maxQ = agent.act(state, curr_task, test=test)
                    totalq += maxQ
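                    # pygame (PLE-style) environments step via env.act()/getGameState(); gym environments via env.step()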
                    if game_params[curr_task]['pygame']:
                        reward = env.act(game_params[curr_task]['actions'][action])
                        if reward < 0:
                            reward = -1
                        next_state = env.getGameState()
                        done = env.game_over()
                        if t > (sim_params['episode_length']-2):
                            done = True
                        if train_params['catcherscale']:
                            reward = reward * train_params['r_scale']
                    else:
                        next_state, reward, done, _ = env.step(action)
                        next_state = np.reshape(next_state,[1,sim_params['num_inputs']])
                        if not train_params['catcherscale']:
Example #4
    memory = ReplayMemory(env_config['memory_size'])

    # Initialize optimizer used for training the DQN. We use Adam rather than RMSProp.
    optimizer = torch.optim.Adam(dqn.parameters(), lr=env_config['lr'])

    # Keep track of best evaluation mean return achieved so far.
    best_mean_return = -float("Inf")
    for episode in range(env_config['n_episodes']):
        done = False
        obs = preprocess(env.reset(), envID=args.env, env=env).unsqueeze(0)
        obs_stack = torch.cat(env_config['obs_stack_size'] *
                              [obs]).unsqueeze(0).to(device)
        count = 0
        while not done:
            # Get action from DQN.
            action = dqn.act(obs_stack)
            # Act in the true environment.
            #print(env)
            #old_obs = obs
            obs, reward, done, info = env.step(action.item() +
                                               ENV_CONFIGS[args.env]['offset'])
            # Preprocess incoming observation.
            if not done:
                obs = preprocess(obs, envID=args.env, env=env).unsqueeze(0)
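                # slide the frame stack: drop the oldest frame and append the new observation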
                next_obs_stack = torch.cat(
                    (obs_stack[:, 1:, ...], obs.unsqueeze(1)),
                    dim=1).to(device)
            else:
                next_obs_stack = None

            #action = action - ENV_CONFIGS[args.env]['offset']
Example #5
class RlBidAgent():
    def _load_config(self):
        """
        Parse the config.cfg file
        """
        cfg = configparser.ConfigParser(allow_no_value=True)
        env_dir = os.path.dirname(__file__)
        cfg.read(env_dir + '/config.cfg')
        self.exp_type = str(cfg['experiment_type']['type'])
        self.T = int(
            cfg[self.exp_type]['T'])  # Number of timesteps in each episode

    def __init__(self):
        self._load_config()
        # Beta values used to adjust the lambda parameter that regulates the agent's bid amount
        self.BETA = [-0.08, -0.03, -0.01, 0, 0.01, 0.03, 0.08]
        # Starting value of epsilon in the adaptive eps-greedy policy
        self.eps = 0.9
        # Parameter controlling the annealing speed of epsilon
        self.anneal = 2e-5
        if self.exp_type in ('improved_drlb', 'improved_drlb_eval'):
            # DQN Network to learn Q function
            self.dqn_agent = DQN(state_size=6, action_size=7)
            # Reward Network to learn the reward function
            self.reward_net = RewardNet(state_action_size=7, reward_size=1)
        else:
            self.dqn_agent = DQN(state_size=7, action_size=7)
            self.reward_net = RewardNet(state_action_size=8, reward_size=1)
        # Number of timesteps in each episode (4 15min intervals x 24 hours = 96)
        # self.T = 672
        # Initialize the DQN action for t=0 (index 3 corresponds to BETA = 0, i.e. no adjustment of lambda)
        self.dqn_action = 3
        self.ctl_lambda = None
        # Arrays saving the training history
        self.step_memory = []
        self.episode_memory = []
        # Params for tracking the progress
        self.global_T = 0  # Tracking the global time step
        self.episode_budgets = None
        self.budget = None
        self.total_wins = 0
        self.total_rewards = 0
        self.rewards_prev_t = 0
        self.rewards_prev_t_ratio = 0
        self.rnet_r = 0
        self.wins_e = 0
        self.rewards_e = 0
        self.ROL = self.T
        self.ROL_ratio = 1

    def _get_state(self):
        """
        Returns the state that will be used as input in the DQN
        """
        if self.exp_type in ('improved_drlb', 'improved_drlb_eval'):
            return np.asarray([
                self.rem_budget_ratio,      # 2. Ratio of the remaining budget to the total available budget at time-step t
                self.ROL_ratio,             # 3. Ratio of the number of Lambda regulation opportunities left
                self.BCR,                   # 4. Budget consumption rate
                self.CPI,                   # 5. Cost per impression between t-1 and t, relative to the highest cost in the training set (300)
                self.WR,                    # 6. Auction win rate at state t
                self.rewards_prev_t_ratio   # 7. Ratio of acquired/total clicks at time-step t-1
            ])
        else:
            return np.asarray([
                self.t_step,          # 1. Current time step
                self.rem_budget,      # 2. Remaining budget at time-step t
                self.ROL,             # 3. Number of Lambda regulation opportunities left
                self.BCR,             # 4. Budget consumption rate
                self.CPM,             # 5. Cost per mille of impressions between t-1 and t
                self.WR,              # 6. Auction win rate at state t
                self.rewards_prev_t   # 7. Clicks acquired at time-step t-1
            ])

    def _reset_episode(self):
        """
        Function to reset the state when episode changes
        """
        # Reset the count of time steps
        self.t_step = 0
        # Lambda regulation parameter - set according to the greedy approximation algorithm, as suggested by the paper
        if self.exp_type == 'vanilla_drlb':
            self.ctl_lambda = 0.01 if self.budget is None else self.calc_greedy(
                self.greedy_memory, self.budget)
            # Clean up the array holding the information needed to solve the knapsack problem with the greedy approximation algorithm
            self.greedy_memory = []
        elif self.exp_type == 'episode_lambda':
            self.ctl_lambda = 0.01
        else:
            pass
        # Next episode -> next step
        self._reset_step()
        # Set the budget for the episode
        self.budget = self.episode_budgets.pop(0)
        self.rem_budget = self.budget
        self.rem_budget_ratio = 1
        self.budget_spent_t = 0
        self.budget_spent_e = 0
        if self.exp_type not in ('free_lambda', 'free_lambda_eval',
                                 'improved_drlb', 'improved_drlb_eval'):
            self.ROL = self.T  # 3. The number of Lambda regulation opportunities left
            self.ROL_ratio = 1
        self.cur_day = 0
        self.cur_min = 0
        self.total_wins += self.wins_e
        self.total_rewards += self.rewards_e
        # Impressions won in each episode
        self.wins_e = 0
        # Clicks won in each episode
        self.rewards_e = 0
        # Dict and Value necessary for learning the RewardNet
        self.reward_net.V = 0
        self.reward_net.S = []

    def _update_step(self):
        """
        Function that is called before transitioning into step t+1 (updates state t)
        """
        self.global_T += 1
        self.t_step += 1
        self.prev_budget = self.rem_budget
        self.rem_budget = self.prev_budget - self.budget_spent_t
        self.budget_spent_e += self.budget_spent_t
        self.rewards_prev_t = self.reward_t
        self.ROL -= 1
        self.BCR = 0 if self.prev_budget == 0 else -((self.rem_budget - self.prev_budget) / self.prev_budget)
        if self.exp_type in ('improved_drlb', 'improved_drlb_eval'):
            self.CPI = 0 if self.wins_t == 0 else (self.cost_t / self.wins_t) / 300
            self.rewards_prev_t_ratio = 1 if self.possible_clicks_t == 0 else self.reward_t / self.possible_clicks_t
            self.ROL_ratio = self.ROL / self.T
            self.rem_budget_ratio = self.rem_budget / self.budget
        else:
            self.CPM = 0 if self.wins_t == 0 else (self.cost_t / self.wins_t) * 1000
        self.WR = self.wins_t / self.imp_opps_t
        # Adaptive eps-greedy policy
        self.eps = max(0.95 - self.anneal * self.global_T, 0.05)

    def _reset_step(self):
        """
        Function to call every time a new time step is entered.
        """
        self.possible_clicks_t = 0
        self.total_rewards_t = 0
        self.reward_t = 0
        self.cost_t = 0
        self.wins_t = 0
        self.imp_opps_t = 0
        self.BCR = 0
        if self.exp_type in ('improved_drlb', 'improved_drlb_eval'):
            self.CPI = 0
        else:
            self.CPM = 0
        self.WR = 0
        self.budget_spent_t = 0

    def _update_reward_cost(self, bid, reward, potential_reward, cost, win):
        """
        Internal helper that accumulates the reward and cost of each won auction
        to compute the cumulative reward and cost within the current time step.
        """
        self.possible_clicks_t += potential_reward
        if win:
            self.budget_spent_t += cost
            self.wins_t += 1
            self.wins_e += 1
            self.total_wins += 1
            self.reward_t += reward
            self.rewards_e += reward
            self.total_rewards += reward
            self.cost_t += cost

    def _model_upd(self, eval_mode):
        if not eval_mode:
            self.reward_net.step()  # update reward net

        next_state = self._get_state()  # observe state s_t+1 (the state at the beginning of t+1)
        # get action a_t+1 (adjusting lambda_t to lambda_t+1) from the adaptive greedy policy
        a_beta = self.dqn_agent.act(next_state,
                                    eps=self.eps,
                                    eval_mode=eval_mode)
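        # scale the bidding lambda by the selected beta adjustment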
        self.ctl_lambda *= (1 + self.BETA[a_beta])

        if not eval_mode:
            # updates for the RewardNet
            sa = np.append(self.cur_state, self.BETA[self.dqn_action])  # state-action pair for step t
            self.rnet_r = float(self.reward_net.act(sa))  # get reward r_t from RewardNet
            self.reward_net.V += self.reward_t
            self.reward_net.S.append(
                (self.cur_state, self.BETA[self.dqn_action]))

            # Store in D1 and sample a mini batch and perform grad-descent step
            self.dqn_agent.step(self.cur_state, self.dqn_action, self.rnet_r,
                                next_state)

        self.cur_state = next_state  # set state t+1 as state t
        self.dqn_action = a_beta  # analogously with the action t+1

    def act(self, obs, eval_mode):
        """
        This function gets called with every bid request.
        By looking at the weekday and hour to progress between the steps and
        episodes during training.
        Returns the bid decision based on the scaled version of the
        bid price using the DQN agent output.
        """
        # within the time step
        if obs['min'] == self.cur_min and obs['weekday'] == self.cur_day:
            pass
        # within the episode, changing the time step
        elif obs['min'] != self.cur_min and obs['weekday'] == self.cur_day:
            self._update_step()
            self._model_upd(eval_mode)
            self.cur_min = obs['min']
            # save history
            self.step_memory.append([
                self.global_T,
                int(self.rem_budget), self.ctl_lambda, self.eps,
                self.dqn_action, self.dqn_agent.loss, self.rnet_r,
                self.reward_net.loss
            ])
            self._reset_step()
        # transition to next episode
        elif obs['weekday'] != self.cur_day:
            self._update_step()
            self._model_upd(eval_mode)
            self.step_memory.append([
                self.global_T,
                int(self.rem_budget), self.ctl_lambda, self.eps,
                self.dqn_action, self.dqn_agent.loss, self.rnet_r,
                self.reward_net.loss
            ])
            # Updates for the RewardNet at the end of each episode (only when training)
            if not eval_mode:
                for (s, a) in self.reward_net.S:
                    sa = tuple(np.append(s, a))
                    max_r = max(self.reward_net.get_from_M(sa),
                                self.reward_net.V)
                    self.reward_net.add_to_M(sa, max_r)
                    self.reward_net.add(sa, max_r)
            print(
                "Episode Result with Step={} Budget={} Spend={} impressions={} clicks={}"
                .format(self.global_T, int(self.budget),
                        int(self.budget_spent_e), self.wins_e, self.rewards_e))
            # Save history
            self.episode_memory.append([
                self.budget,
                int(self.budget_spent_e), self.wins_e, self.rewards_e
            ])
            self._reset_episode()
            self.cur_day = obs['weekday']
            self.cur_min = obs['min']

        self.imp_opps_t += 1
        bid = self.calc_bid(obs['pCTR'])

        if self.exp_type == 'vanilla_drlb':
            self.greedy_memory.append([
                obs['pCTR'], obs['payprice'],
                obs['pCTR'] / max(obs['payprice'], 1)
            ])

        return bid

    def calc_bid(self, imp_value):
        # Calculate the theoretically optimal bid
        bid_amt = round(imp_value / self.ctl_lambda, 2)

        curr_budget_left = self.rem_budget - self.budget_spent_t

        if bid_amt > curr_budget_left:
            bid_amt = curr_budget_left

        return bid_amt

    def calc_greedy(self, items, budget_limit):
        # Borrowed from: https://bitbucket.org/trebsirk/algorithms/src/master/knapsack.py
        # Greedy approximation algorithm (Dantzig, 1957)
        bids = []
        spending = 0
        ctr = 0
        items_sorted = sorted(items, key=itemgetter(2), reverse=True)
        while len(items_sorted) > 0:
            item = items_sorted.pop()
            if item[1] + spending <= budget_limit:  # item[1] is the pay price; only take the item if it still fits the budget
                bids.append(item)
                spending += bids[-1][1]
                ctr += bids[-1][0]
            else:
                break
        ctrs = np.array(bids)[:, 0]
        costs = np.array(bids)[:, 1]
        # Take the max lambda to be more conservative at the beginning of a time step
        opt_lambda = np.max(np.divide(ctrs, costs))
        return opt_lambda
Example #6
class Generator(nn.Module):
    def __init__(self, args, data, g, level2_parients):
        super(Generator, self).__init__()
        self.args = args
        self.data = data
        self.g = g
        self.level2_parients = level2_parients

        self.cnn = VanillaConv(args, vocab_size=data.size())
        self.pathEncoder = PathEncoder(args, self.g)
        self.pathDecoder = PathDecoder(args)

        self.DQN = DQN(args)

        self.pathHist = []  # stores the paths already selected (only the last ICD of each path is kept)

        self.attn = nn.Linear(args.node_embedding_size * 4,
                              args.node_embedding_size * 3)
        self.attn_combine = nn.Linear(args.node_embedding_size * 2,
                                      args.node_embedding_size)

        #self.atten=selfAttention(hidden_dim=args.node_embedding_size*4)

        # Attentional affine transformation
        # self.r_x = nn.Linear(args.node_embedding_size, args.node_embedding_size * 3)
        # nn.init.normal_(self.r_x.weight, mean=0, std=1)
        # self.b_x = nn.Linear(args.node_embedding_size, args.node_embedding_size * 3)
        # nn.init.normal_(self.b_x.weight, mean=0, std=1)

        self.r_x = nn.Parameter(
            torch.FloatTensor(args.node_embedding_size,
                              args.node_embedding_size * 3))
        # initialize the weights with xavier_uniform_
        nn.init.xavier_uniform_(self.r_x.data)  # (2,8285,16)
        self.b_x = nn.Parameter(
            torch.FloatTensor(args.node_embedding_size,
                              args.node_embedding_size * 3))
        # nn.init.xavier_normal_(self.fc2.weight)
        nn.init.xavier_uniform_(self.b_x.data)  # (95,2)

        self.optimizer = optim.Adam(self.parameters(), lr=args.lr)

    # the input is one batch of data
    # K is the number of most likely actions selected at each hop
    def forward(self, ehrs, hier_labels):
        batchPaths = []

        # sample separately for each example (each EHR) in the batch
        for i in range(len(ehrs)):
            example_states = []
            example_rewards = []
            example_done = []
            example_actionIndexs = []

            # first initialize the hidden state
            hidden = torch.Tensor(np.zeros(
                (1, self.args.path_hidden_size))).to(self.args.device)
            # 1. get the EHR representation
            ehrRrep = self.cnn(ehrs[i])  # placed here so the environment is reset for every sample
            self.sequences = [[[self.args.node2id.get('ROOT')], 1.0]]  # root node

            for hop in range(self.args.hops):  # attempts per sample (one per label, i.e. one per path)
                # in fact different paths should also influence each other: already chosen paths should affect
                # the choice of the next path's start node. How should that be modeled??
                # feed in the EHR and obtain the path generated by the generator network

                if hop != 0:
                    hidden = hidden.sum(dim=1)

                # 2. get the attentive EHR representation
                #atten_weights = F.softmax(self.attn(torch.cat((hidden, ehrRrep), 1))) #[1,300]
                # attn_ehrRrep = torch.mul(atten_weights, ehrRrep)  # [1,300]
                #attn_ehrRrep=self.atten(torch.cat((hidden,ehrRrep),1))
                # print('hidden:',hidden)
                state = F.relu(self.attn(torch.cat((hidden, ehrRrep), 1)))
                # print('ehrRrep:',ehrRrep)
                # print('attn_ehrRrep:', attn_ehrRrep)
                #
                # # 3. get the representation fusing the EHR and path information, i.e. the state representation
                # #state = self.r_x(hidden) * attn_ehrRrep + self.b_x(hidden)  # [32, 300], state:[1,300]
                # state = torch.mm(hidden,self.r_x)+ attn_ehrRrep
                #print('state:',state)

                # 4. first get the action space for the current hop, then the DQN selects an action from that space based on the state
                if hop == 0:
                    children = torch.Tensor(self.level2_parients).long().to(self.args.device)
                    children_len = torch.Tensor([13]).long().to(self.args.device)
                else:
                    # use selected_action as the parent nodes and fetch their corresponding children
                    children, children_len = action_space(
                        selected_action, self.args)  # action:[32]

                # print('children.shape:',children.shape)        #[1, 101]
                # before choosing an action, also check: if children_len contains 0, a leaf node has been selected

                action_values, actions, actionIndexs = self.DQN.act(
                    state, children, children_len)
                print('hop:', hop)
                # print('actions:',actions)

                selected_action, actionList = self.beam_search(
                    action_values, actions)

                # 4. feed the currently selected nodes and the nodes selected in the previous step (stored in self.actionList) into the path encoder to get the path representation
                path = self.pathEncoder(selected_action, actionList)

                # 5. feed the path representation into the path decoder to update the hidden representation
                path = path.unsqueeze(0)

                output = self.pathDecoder(path)
                hidden = self.pathDecoder.hidden

                # execute the selected actions and apply the resulting change to the environment
                reward, done = self.step(actionList, hier_labels[i], hop)

                example_rewards.append(reward)
                example_states.append(state)
                example_done.append(done)
                example_actionIndexs.append(actionIndexs)

            # state obtained after the last hop (used during training)
            hidden = hidden.sum(dim=1)
            state = F.relu(self.attn(torch.cat((hidden, ehrRrep), 1)))
            example_states.append(state)

            # convert the collected data into (state, action, reward, next_state, done) tuples and store them in memory
            for i in range(len(example_states)):
                example_states[i] = example_states[i].data.cpu().numpy()

            for i in range(len(example_rewards)):
                for j in range(len(example_actionIndexs[i])):
                    self.DQN.buffer.push(example_states[i],
                                         example_actionIndexs[i][j][0],
                                         example_rewards[i],
                                         example_states[i + 1],
                                         example_done[i])

            batchPaths.append(actionList)
        return batchPaths

    def beam_search(self, data, actions_store):
        # print('data:',data)
        # print('actions_store:',actions_store)

        # if a selected action is a leaf node, that path has to be terminated

        all_candidates = list()
        for sequence, row, actions in zip(self.sequences, data, actions_store):
            seq, score = sequence

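            # expand every kept sequence with each candidate action and keep the k best by accumulated score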
            for j in range(len(row)):
                candidate = [seq + [actions[j].item()], score + row[j].item()]
                all_candidates.append(candidate)

            # order all candidates by scores
            ordered = sorted(all_candidates,
                             key=lambda tup: tup[1],
                             reverse=True)
            # select k best
            self.sequences = ordered[:self.args.k]
            #print('self.sequences:',self.sequences)
        selected_action = [row[0][-1] for row in self.sequences]
        print('selected_action:', selected_action)
        selected_path = [row[0] for row in self.sequences]
        return selected_action, selected_path

    def step(self, actionList, hier_label, hop):
        # compare the currently predicted paths with the true paths and assign the corresponding reward
        hop_tures = []
        for row in hier_label:
            hop_tures.extend(row)

        for row in actionList:
            if row[hop + 1] in hop_tures:
                reward = 1
            else:
                reward = -1
        if hop == 3:
            done = True
        else:
            done = False

        return reward, done
Example #7
File: run.py Project: bonniesjli/parc
LOG_DIR = '{}/{}_DQN_{}'.format(args.dir, datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"), args.env)
writer = SummaryWriter(logdir=LOG_DIR)

total_numsteps = 0
for i_episode in itertools.count(1):

    episode_reward = 0
    episode_steps = 0
    done = False
    state = env.reset()

    while not done:
        if total_numsteps < args.start_steps:
            action = env.action_space.sample()  # Sample random action
        else:
            action = agent.act(state)  # Sample action from policy

        next_state, reward, done, _ = env.step(action) # Step
        episode_steps += 1
        total_numsteps += 1
        episode_reward += reward
        # Ignore the "done" signal if it comes from hitting the time horizon.
        # (https://github.com/openai/spinningup/blob/master/spinup/algos/sac/sac.py)
        mask = 1 if episode_steps == env._max_episode_steps else float(not done)

        agent.step(state, action, reward, next_state, mask)
        if total_numsteps >= args.start_steps and total_numsteps % args.update_freq == 0:
            q_loss = agent.update()
            writer.add_scalar('loss/q', q_loss, total_numsteps)

        state = next_state
Example #8
target.eval()  # put the target net in eval mode; we don't want to train it
mse = nn.MSELoss()
optimizer = optim.RMSprop(policy.parameters())
replay_buffer = ReplayBuffer(BUFFER_SIZE)

# training phase
total_game_step = 0
for current_episode in range(EPISODE):
    state = env.reset() # get the initial observation
    game_step = 0
    total_reward = 0
    state = torch.tensor([state]).float().to(DEVICE)
    while True:
        game_step += 1
        total_game_step += 1
        action = policy.act(state, total_game_step, isTrain = True).to(DEVICE) # sample an action
        next_state, reward, done, _ = env.step(action.item()) # take action in environment
        total_reward += reward
        reward = torch.tensor([reward]).float().to(DEVICE)
        
        if done:  # whether this episode has terminated (game over)
            next_state = None
        else:
            next_state = torch.tensor([next_state]).float().to(DEVICE)
        
        replay_buffer.store(state, action, reward, next_state)
        state = next_state

        # optimize the model with a batch of samples from the buffer

        if replay_buffer.lenth() > BATCH_SIZE: # only optimize when the replay buffer has enough data
Example #9
import gym
from gym.wrappers import Monitor

from dqn import DQN

env = gym.make('LunarLander-v2')
#env = Monitor(env, 'videos', video_callable=lambda episode_id: True)
dqn = DQN(env)
dqn.load()


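# run 20 evaluation episodes with rendering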
for x in range(20):
    state = env.reset()
    done = False

    while not done:
        env.render()
        action = dqn.act(state, False)
        next_state, reward, done, info = env.step(action)
        state = next_state

env.close()
Example #10
    # Create a DQN with a replay buffer capacity up to 150000 experiences
    agent = DQN(state_size, action_size, 150000)
    # Initialize episode counter
    e = 0
    while True:
        # Make a new episode
        game.new_episode()
        episode_rewards = []

        # Get the current environment state and add it to the previously stacked frames
        state = game.get_state()
        state, stacked_frames = preprocessor.stack_frames(
            stacked_frames, state, True)
        for time in range(max_steps):
            # Get next action from the DQN
            action = agent.act(state)
            # Perform that action and receive its reward
            reward = game.make_action(possible_actions[action])
            episode_rewards.append(reward)
            # Check whether the episode is finished or not
            done = game.is_episode_finished() or time == max_steps - 1

            if done:
                # Episode finished
                agent.update_target_model()
                print("Episode: {}, score: {}, e: {:.2}".format(
                    e, np.sum(episode_rewards), agent.epsilon))
                # exit episode loop
                break
            else:
                # Get the next environment state and stack it to the previously stacked frames
Example #11
policy = DQN(POLICY_ARGS).to(DEVICE)
policy.load_state_dict(torch.load(PATH))
policy.eval()
env = gym.make('SpaceInvaders-ram-v0').unwrapped
print('play 10 episode')

for episode in range(10):
    state = env.reset()
    game_step = 0
    total_reward = 0
    state = torch.FloatTensor([state]).to(DEVICE)
    while True:
        env.render()
        time.sleep(0.05)
        game_step += 1
        action = policy.act(state, 1, isTrain=False).to(DEVICE)
        # print(action)
        # print(state.squeeze()[:20])
        # print(action.item())
        # i = game_step%4
        next_state, reward, done, _ = env.step(
            action.item())  # take action in environment
        total_reward += reward
        reward = torch.FloatTensor([reward]).to(DEVICE)
        if done:
            print('--------------------')
            print('episode: {episode}, game_step: {game_step}, total_reward: {total_reward}' \
            .format(episode=episode, game_step=game_step, total_reward=total_reward))
            # wandb.log({'total_reward': total_reward})
            break
        else:
Example #12
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('--episodes',
                        type=int,
                        default=1000,
                        help='No of episodes')
    parser.add_argument('--episode_len',
                        type=int,
                        default=500,
                        help='length of episode')
    parser.add_argument('--openai_env',
                        type=str,
                        required=True,
                        help='env like MountainCar-v0, CartPole-v0 etc')
    parser.add_argument('--epsilon',
                        type=float,
                        default=1,
                        help='exploration parameter')
    parser.add_argument('--epsilon_decay',
                        type=float,
                        default=0.995,
                        help='epsilon decay rate')
    parser.add_argument('--gamma',
                        type=float,
                        default=0.95,
                        help='discount factor')
    parser.add_argument('--learning_rate',
                        type=float,
                        default=0.01,
                        help='learning_rate')
    parser.add_argument('--batch_size',
                        type=int,
                        default=64,
                        help='batch size')

    args = parser.parse_args()
    parameters = {}
    for key, value in vars(args).items():
        parameters[key] = value

    env = gym.make(args.openai_env)
    model = DQN(env, parameters)
    model.build_model()
    saver = tf.train.Saver(max_to_keep=1)

    total_reward = 0
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        for i in range(args.episodes):
            curr_state = env.reset().reshape(-1,
                                             env.observation_space.shape[0])
            j = 0
            done = False

            if model.epsilon > 0.15:
                model.epsilon *= model.epsilon_decay
                print(model.epsilon)

            #for j in range(args.episode_len):
            while not done:
                print("episode:{} trial:{}".format(i, j))
                env.render()
                _, action = model.act(sess, curr_state)
                next_state, reward, done, _ = env.step(action)
                total_reward += reward
                print("action:{} next_state:{} ".format(action, next_state))

                next_state = next_state.reshape(-1,
                                                env.observation_space.shape[0])
                model.add_to_memory(curr_state, action, reward, next_state,
                                    done)
                model.replay(sess)

                curr_state = next_state
                j += 1
            if j < 199:
                print("Comleted in {} episodes".format(i))
                saver.save(sess,
                           "checkpoint/ckpt-" + str(i),
                           write_meta_graph=False)
                break
            else:
                saver.save(sess,
                           "checkpoint/ckpt-" + str(i),
                           write_meta_graph=False)
Example #13
    best_mean_return = -float("Inf")

    # Used for eps annealing
    step_number = 0

    # Used for the plot
    all_means = []

    for episode in range(env_config['n_episodes']):
        done = False

        obs = preprocess(env.reset(), env=args.env)

        while not done:
            # Get action from DQN.
            action = dqn.act(obs, step_number)

            # Act in the true environment.
            next_obs, reward, done, _ = env.step(action.item())

            reward = preprocess(reward, env=args.env)

            step_number += 1

            # Preprocess incoming observation.
            if done:
                next_obs = None
            else:
                next_obs = preprocess(next_obs, env=args.env)

            # Add the transition to the replay memory
Example #14
target = 270
sample_size = 50

env = gym.make('LunarLander-v2')
dqn = DQN(env)

e = 1
scores_list = []
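# training loop: runs until interrupted, tracking the mean score of the last sample_size episodes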
while True:
    state = env.reset()
    done = False
    score = 0

    while not done:
        env.render()
        action = dqn.act(state)
        next_state, reward, done, info = env.step(action)
        dqn.remember(state, action, reward, next_state, done)
        dqn.replay()
        state = next_state
        score += reward

    scores_list.append(score)
    last_rewards_mean = np.mean(scores_list[-sample_size:])
    print(
        str(e) + ": \t\t" + str(round(float(score))) + "\t\t" +
        str(round(float(last_rewards_mean))) + "\t\t" +
        str(round(dqn.epsilon, 3)))

    dqn.adjust_epsilon()
    e += 1
Example #15
def main():
    # Our environment
    env = gym.make("MountainCar-v0")

    trials = 200
    trial_len = 500

    updateTargetNetwork = 1000
    #Initialize our DQN agent
    dqn_agent = DQN(env=env, tau=1, file_name=file_name)
    steps = []
    max_show = -50
    # Re-run environment [trial] times
    for trial in range(trials):
        print("Trial {}".format(trial))
        # Reset the car position at the start of every trial
        cur_state = env.reset().reshape(1, 2)
        # Local variables
        local_max = -50
        max_position = -0.4

        for step in range(trial_len):
            #Predict action using our DQN action function
            action, temp_max = dqn_agent.act(cur_state)
            max_show = max(temp_max, max_show)
            local_max = max(temp_max, local_max)
            env.render()

            # Make a move in env using predicted action
            new_state, reward, done, _ = env.step(action)
            new_state = new_state.reshape(1, 2)

            # Adjust reward - i.e. Give more reward if max position reached!
            if cur_state[0][0] > max_position:
                max_position = cur_state[0][0]
                normalized_max = max_position + 0.6  # Reward range: 0 to 1
                reward = reward + 11 * (
                    normalized_max**3
                )  # incentivize closer to flag! Max reward of 10. n^3 reward
            # if done:
            #     reward = 20
            # elif step == 199:
            #     reward = reward - 1
            # print("Reward: {}".format(reward))

            # Now remember, train, and reorient goals
            dqn_agent.remember(cur_state, action, reward, new_state, done)
            if done:  # remember terminal transitions twice because they are important
                dqn_agent.remember(cur_state, action, reward, new_state, done)
            dqn_agent.replay()
            if step % 20 == 0:
                dqn_agent.target_train(False)
                # print("Retraining")

            cur_state = new_state
            if done:
                break
        if step >= 199:
            print("Failed to complete trial, best q: {}, max-Pos: {}".format(
                local_max, max_position))
        else:
            print("Completed in {} trials, best q: {}, max-Pos: {}".format(
                trial, local_max, max_position))
            print(
                "_______________!!!!!!!!!!!!!!!!!_______________!!!!!!!!!!!!!!!!!"
            )
            # Need to save model, so can reuse and get better over time
            dqn_agent.save_model(file_name)
Example #16
def main():
	original_size = (782, 600)
	env = ENV(actions, (original_size[0]/6, original_size[1]/6))
	gamma = 0.9
	epsilon = .95
	model_ph = 'models'
	if not os.path.exists(model_ph):
		os.mkdir(model_ph)
	trials = 500
	trial_len = 1000
	rewards = []
	q_values = []

	dqn_agent = DQN(env=env)
	success_num = 0
	rewards = []
	q_values = []
	Q = []
	for trial in range(1, trials):
		t_reward = []
		t_qvalue = []
		cur_state = env.reset()
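		# interact with the environment for at most trial_len steps per trial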
		for step in range(trial_len):
			action = dqn_agent.act(cur_state)
			new_state, reward, done, success = env.step(action)
			t_reward.append(reward)
			
			# reward = reward if not done else -20
			dqn_agent.remember(cur_state, action, reward, new_state, done)
	
			q_value = dqn_agent.replay()  # internally iterates default (prediction) model
			if q_value:
				t_qvalue.append(q_value)
				Q.append(q_value)
			else:
				t_qvalue.append(0.0)
				Q.append(0.0)
			dqn_agent.target_train()  # iterates target model
			cur_state = new_state

			dqn_agent.log_result()

			save_q(Q)

			if success:
				success_num += 1
				dqn_agent.step = 100
				print("Completed in {} trials".format(trial))
				dqn_agent.save_model(os.path.join(model_ph, "success-model.h5"))
				break
			if done:
				print("Failed to complete in trial {}, step {}".format(trial, step))
				dqn_agent.save_model(os.path.join(model_ph, "trial-{}-model.h5").format(trial))
				break
		rewards.append(np.sum(t_reward) if t_reward else 0.0)
		q_values.append(np.mean(t_qvalue) if t_qvalue else 0.0)
		
		with open('reward_and_Q/reward.txt', 'wb') as f:
			pickle.dump(rewards, f)
		with open('reward_and_Q/qvalue.txt', 'wb') as f:
			pickle.dump(q_values, f)
		print('trial: {}, success acc: {}'.format(trial, success_num / float(trial)))
Example #17
def train(model_dict):

    def update_current_state(current_state, state, channels):
        # current_state: [processes, channels*stack, height, width]
        state = torch.from_numpy(state).float()  # (processes, channels, height, width)
        # if num_stack > 1:
        # slide the stack forward: drop the oldest frame's channels and shift the rest
        current_state[:, :-channels] = current_state[:, channels:]
        current_state[:, -channels:] = state  # the last slot now holds the new frame

        return current_state


    def update_rewards(reward, done, final_rewards, episode_rewards, current_state):
        # Reward, Done: [P], [P]
        # final_rewards, episode_rewards: [P,1]. [P,1]
        # current_state: [P,C*S,H,W]
        reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() #[P,1]
        episode_rewards += reward #keeps track of current episode cumulative reward
        masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) #[P,1]
        final_rewards *= masks #erase the ones that are done
        final_rewards += (1 - masks) * episode_rewards  #set it to the cumulative episode reward
        episode_rewards *= masks #erase the done ones
        masks = masks.type(dtype) #cuda
        if current_state.dim() == 4:  # if state is a frame/image
            current_state *= masks.unsqueeze(2).unsqueeze(2)  #[P,1,1,1]
        else:
            current_state *= masks   #restart the done ones, by setting the state to zero
        return reward, masks, final_rewards, episode_rewards, current_state



    num_frames = model_dict['num_frames']
    cuda = model_dict['cuda']
    which_gpu = model_dict['which_gpu']
    num_steps = model_dict['num_steps']
    num_processes = model_dict['num_processes']
    seed = model_dict['seed']
    env_name = model_dict['env']
    save_dir = model_dict['save_to']
    num_stack = model_dict['num_stack']
    algo = model_dict['algo']
    save_interval = model_dict['save_interval']
    log_interval = model_dict['log_interval']


    save_params = model_dict['save_params']
    vid_ = model_dict['vid_']
    gif_ = model_dict['gif_']
    ls_ = model_dict['ls_']
    vae_ = model_dict['vae_']
    grad_var_ = model_dict['grad_var_']

    os.environ['OMP_NUM_THREADS'] = '1'
    os.environ['CUDA_VISIBLE_DEVICES'] = str(which_gpu)

    if cuda:
        torch.cuda.manual_seed(seed)
        dtype = torch.cuda.FloatTensor
        model_dict['dtype']=dtype
    else:
        torch.manual_seed(seed)
        dtype = torch.FloatTensor
        model_dict['dtype']=dtype


    # Create environments
    print (num_processes, 'processes')
    monitor_rewards_dir = os.path.join(save_dir, 'monitor_rewards')
    if not os.path.exists(monitor_rewards_dir):
        os.makedirs(monitor_rewards_dir)
        print ('Made dir', monitor_rewards_dir) 
    envs = SubprocVecEnv([make_env(env_name, seed, i, monitor_rewards_dir) for i in range(num_processes)])


    if vid_:
        print ('env for video')
        envs_video = make_env_monitor(env_name, save_dir)

    if gif_:
        print ('env for gif')
        envs_gif = make_env_basic(env_name)

    # if ls_:
    #     print ('env for ls')
    #     envs_ls = make_env_basic(env_name)

    # if vae_:
    #     print ('env for vae')
    #     envs_vae = make_env_basic(env_name)

    # if grad_var_:
    #     print ('env for grad_var_')
    #     envs_grad_var = make_env_basic(env_name)



    obs_shape = envs.observation_space.shape  # (channels, height, width)
    obs_shape = (obs_shape[0] * num_stack, *obs_shape[1:])  # (channels*stack, height, width)
    shape_dim0 = envs.observation_space.shape[0]  #channels

    model_dict['obs_shape']=obs_shape
    model_dict['shape_dim0']=shape_dim0
    model_dict['action_size'] = envs.action_space.n
    print (envs.action_space.n, 'actions')



    # Create agent
    if algo == 'a2c':
        agent = a2c(envs, model_dict)
        print ('init a2c agent')

    elif algo == 'dqn':
        agent = DQN(envs, model_dict)
        print ('init DQN agent')  
        print (agent.q_net)   



    # Init state
    state = envs.reset()  # (processes, channels, height, width)
    current_state = torch.zeros(num_processes, *obs_shape)  # (processes, channels*stack, height, width)
    current_state = update_current_state(current_state, state, shape_dim0).type(dtype) #add the new frame, remove oldest, since its a stack
    # agent.insert_first_state(current_state) #storage has states: (num_steps + 1, num_processes, *obs_shape), set first step 

    # These are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([num_processes, 1]) #keeps track of current episode cumulative reward
    final_rewards = torch.zeros([num_processes, 1])

    num_updates = int(num_frames) // num_steps // num_processes
    save_interval_num_updates = int(save_interval /num_processes/num_steps)


    # dqn_epsilon = .1 #lower means less likely to do random .9 # .1

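    # exponentially anneal epsilon from epsilon_start to epsilon_final over roughly epsilon_decay updates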
    epsilon_start = 1.0
    epsilon_final = 0.01
    epsilon_decay = 50000
    epsilon_by_frame = lambda frame_idx: epsilon_final + (epsilon_start - epsilon_final) * math.exp(-1. * frame_idx / epsilon_decay)

    #Begin training
    # count =0
    start = time.time()
    start2 = time.time()
    for j in range(num_updates):

        dqn_epsilon = epsilon_by_frame(j)

        #Num steps till agent update
        # for step in range(num_steps):

        # Act, [P,1], [P,1], [P,1], [P]
        # state_pytorch = Variable(agent.rollouts.states[step])
        state_pytorch = Variable(current_state)
        # value, action, action_log_probs, dist_entropy = agent.act(state_pytorch, epsilon=dqn_epsilon)#, volatile=True))
        action = agent.act(state_pytorch, epsilon=dqn_epsilon)#, volatile=True))
        
        # Apply to Environment, S:[P,C,H,W], R:[P], D:[P]
        # cpu_actions = action.data.squeeze(1).cpu().numpy() #[P]
        frame, reward, done, info = envs.step(action) 

        # Record rewards and update state
        reward, masks, final_rewards, episode_rewards, current_state = update_rewards(reward, done, final_rewards, episode_rewards, current_state)
        new_current_state = update_current_state(current_state, frame, shape_dim0)


        agent.replay_buffer.push(current_state, action, reward, new_current_state, done.astype(int))

        current_state = new_current_state


        if len(agent.replay_buffer) > 100:
            agent.update()
            # agent.update()
            # agent.update()
            # agent.update()







        # print ('save_interval_num_updates', save_interval_num_updates)
        # print ('num_updates', num_updates)
        # print ('j', j)
        total_num_steps = (j + 1) * num_processes * num_steps
        
        # if total_num_steps % save_interval == 0 and save_dir != "":
        if j % save_interval_num_updates == 0 and save_dir != "" and j != 0:

            #Save model
            if save_params:
                do_params(save_dir, agent, total_num_steps, model_dict)
            #make video
            if vid_:
                do_vid(envs_video, update_current_state, shape_dim0, dtype, agent, model_dict, total_num_steps)
            #make gif
            if gif_:
                do_gifs(envs_gif, agent, model_dict, update_current_state, update_rewards, total_num_steps)
            #make vae prob gif
            if vae_:
                do_prob_state(envs_vae, agent, model_dict, vae, update_current_state, total_num_steps)
            # #make vae prob gif
            # if grad_var_:
            #     do_grad_var(envs_grad_var, agent, model_dict, update_current_state, total_num_steps)

        #Print updates
        if j % log_interval == 0:# and j!=0:
            end = time.time()


            to_print_info_string = "{}, {}, {:.1f}/{:.1f}/{:.1f}/{:.1f}, {}, {:.1f}, {:.2f}, {:.2f}, {:.5f}".format(j, total_num_steps,
                                       final_rewards.min(),
                                       final_rewards.median(),
                                       final_rewards.mean(),
                                       final_rewards.max(),
                                       int(total_num_steps / (end - start)),
                                       end - start,
                                       end - start2,
                                       dqn_epsilon,
                                       agent.loss.data.cpu().numpy()[0])
                                       # torch.mean(discrim_errors).data.cpu().numpy()[0])

            print(to_print_info_string)


            # if vae_:
            #     elbo =  "{:.2f}".format(elbo.data.cpu().numpy()[0])


            # if next_state_pred_:
            #     state_pred_error_print =  "{:.2f}".format(agent.state_pred_error.data.cpu().numpy()[0])
            #     print(to_print_info_string+' '+state_pred_error_print+' '+elbo)
            #     to_print_legend_string = "Upts, n_timesteps, min/med/mean/max, FPS, total_T, step_T, pred_error, elbo"

            # else:
            # if vae_:
            #     print(to_print_info_string+' '+elbo)
            # else:
            # print(to_print_info_string)


            to_print_legend_string = "Upts, n_timesteps, min/med/mean/max, FPS, total_T, step_T, discrim_E"#, elbo"
            start2 = time.time()

            if j % (log_interval*30) == 0:
            
                if ls_:
                    do_ls(envs_ls, agent, model_dict, total_num_steps, update_current_state, update_rewards)

                # if grad_var_  and j % (log_interval*300) == 0:
                if grad_var_  and j % (log_interval*30) == 0:
                    #writes to file
                    do_grad_var(envs_grad_var, agent, model_dict, total_num_steps, update_current_state, update_rewards)






                # print("Upts, n_timesteps, min/med/mean/max, FPS, Time, Plot updated, LS updated")
                # print(to_print_info_string + ' LS recorded')#, agent.current_lr)
                # else:
                #update plots
                try:
                    if ls_:
                        update_ls_plot(model_dict)

                    # if grad_var_ and j % (log_interval*300) == 0:
                    if grad_var_ and j % (log_interval*30) == 0:
                        update_grad_plot(model_dict)
                        to_print_legend_string += ' grad_var_plot updated '

                    make_plots(model_dict)
                    print(to_print_legend_string + " Plot updated")

                    # print (len(agent.replay_buffer))
                except:
                    raise #pass
                    print(to_print_legend_string + " problem with plot")



    try:
        make_plots(model_dict)
    except:
        print ()
Example #18
def train_dqn(episode,
              rand_obs=0,
              rand_act=0,
              noise_obs_level=0.01,
              noise_act_level=0.1):
    loss = []
    agent = DQN(env.action_space.n, env.observation_space.shape[0])
    all_actions = []
    all_rand_acts = []
    all_rewards = []
    for e in range(episode):
        curr_acts = []
        curr_rand_acts = []
        curr_rewards = []
        state = env.reset()
        state = np.reshape(state, (1, 8))
        score = 0
        max_steps = 5000
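        # run one episode for at most max_steps steps, optionally perturbing observations and actions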
        for i in range(max_steps):
            if rand_obs == 1:
                state = get_observation(state,
                                        option=0,
                                        noise_obs_level=noise_obs_level)
            action = agent.act(state)
            if rand_act == 1:
                action, is_rand = get_action(action)
            else:
                action, is_rand = action, 0
            curr_acts.append(action)
            curr_rand_acts.append(is_rand)
            # env.render()
            next_state, reward, done, _ = env.step(action)
            curr_rewards.append(reward)
            score += reward
            next_state = np.reshape(next_state, (1, 8))
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            agent.replay()
            if done:
                print("episode: {}/{}, score: {}".format(e, episode, score))
                break
        loss.append(score)
        all_actions.append(np.array(curr_acts))
        all_rand_acts.append(np.array(curr_rand_acts))
        all_rewards.append(np.array(curr_rewards))
        # Average score of the last 100 episodes
        is_solved = np.mean(loss[-100:])
        # if is_solved > 50:
        #     print('\n Task Completed! \n')
        #     break
        print("Average over last 100 episode: {0:.2f} \n".format(is_solved))
    # np.savez("./saved/dqn_rand_act_" + str(rand_act) + "_rand_obs_" + str(rand_obs) + ".npz",
    #                       acts=np.array(all_actions),
    #                       rand_actions=np.array(all_rand_acts),
    #                       rewards=np.array(all_rewards),
    #                       scores=np.array(loss))
    # np.savez("./saved_dqn/dqn_rand_act_" + str(rand_act) + "_rand_obs_" + str(rand_obs) + "_noise_obs_lvl_" + str(noise_obs_level) + ".npz",
    #                       acts=np.array(all_actions),
    #                       rand_actions=np.array(all_rand_acts),
    #                       rewards=np.array(all_rewards),
    #                       scores=np.array(loss))
    np.savez("./saved_dqn/dqn_rand_act_" + str(rand_act) + "_rand_obs_" +
             str(rand_obs) + "_noise_act_lvl_" + str(noise_act_level) + ".npz",
             acts=np.array(all_actions),
             rand_actions=np.array(all_rand_acts),
             rewards=np.array(all_rewards),
             scores=np.array(loss))
    return loss