Example #1
    def __init__(self,
                 chance_coeff=1,
                 chance_coeff_hl=False,
                 mem_size=1000,
                 mem_batch_size=4,
                 penalize_if_repeats=True,
                 *args,
                 **kwargs):
        super().__init__(*args, **kwargs)

        self.loss_func = torch.nn.KLDivLoss()
        # chance_coeff: 0.0 means always choose the max-probability event, 1.0
        # means always sample from the probability distribution produced by
        # PolicyNet. It's a float, so it controls how much the agent explores.
        # chance_coeff_hl: if set, chance_coeff decays exponentially with this
        # half-life (in steps).
        self.chance_coeff = chance_coeff
        self.chance_coeff_hl = chance_coeff_hl
        self.penalize_if_repeats = penalize_if_repeats

        # memory buffer
        self.memory = MemoryBuffer(size=mem_size,
                                   batch_size=mem_batch_size,
                                   replace=False)

        self.curr_state = self.get_game_state()
        self.prev_state = self.curr_state

        self.max_event = ""
        self._reward = 0
        self._loss = 0
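
These agent examples all construct a MemoryBuffer, but its implementation is not part of the excerpts. As a point of reference, here is a minimal sketch of a buffer matching how it is used in Examples #1, #2, #6 and #7 (add(prev_state, action, reward, curr_state), get_batch() returning a dict of batched fields, and the buffer/size attributes checked before learning). The actual class may differ, and the bias_prob variant used by DQAgent is not sketched here, so treat the names and behaviour below as assumptions.

import random
from collections import deque

import torch


class MemoryBuffer:
    """Minimal replay-buffer sketch (assumed interface, not the original)."""

    def __init__(self, size, batch_size, replace=True):
        self.size = size                  # learning starts once len(buffer) >= size
        self.batch_size = batch_size
        self.replace = replace            # sample with or without replacement
        self.buffer = deque(maxlen=size)

    def add(self, prev_state, action, reward, curr_state):
        self.buffer.append((prev_state, action, reward, curr_state))

    def get_batch(self):
        picks = (random.choices(self.buffer, k=self.batch_size) if self.replace
                 else random.sample(self.buffer, self.batch_size))
        prev_states, actions, rewards, curr_states = zip(*picks)
        return {
            "prev_state": torch.cat(prev_states),   # assumes states are (1, ...) tensors
            "prev_action": list(actions),
            "reward": list(rewards),
            "curr_state": torch.cat(curr_states),
        }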
Example #2
    def __init__(
            self,
            target_model=None,  # model used to compute target q-values; same architecture as the main model
            update_target_model_every=1000,  # sync the target model with the main model every this many steps
            learn_epsilon_half_life=3000,
            discount_factor=0.9,
            mem_size=1000,
            mem_batch_size=4,
            mem_bias_prob=0.9,
            *args,
            **kwargs):
        super().__init__(*args, **kwargs)

        self.target_model = target_model
        self.update_target_model_every = update_target_model_every

        self.discount_factor = discount_factor
        self.learn_epsilon = 1  # for epsilon-greedy search
        self.learn_epsilon_half_life = learn_epsilon_half_life  # time it takes to fall to half
        self.memory = MemoryBuffer(size=mem_size,
                                   batch_size=mem_batch_size,
                                   bias_prob=mem_bias_prob)
        self.prev_state = self.get_game_state()
        self.prev_score = 0

        # info vars
        self._real_event = ""
        self._reward = 0
        self._last_q = 0
        self._loss = 0
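
The learn_epsilon_half_life comment above ("time it takes to fall to half") corresponds to the exponential decay applied in Example #7's step(): epsilon = 2 ** (-(step / half_life)). A quick check of that formula:

import math

half_life = 3000
for step in (0, 3000, 6000, 9000):
    print(step, math.pow(2, -(step / half_life)))
# 0 -> 1.0, 3000 -> 0.5, 6000 -> 0.25, 9000 -> 0.125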
Example #3
def train(env, model, args):
    model.optim = torch.optim.Adam(islice(model.parameters(), 20), lr=0.0005)
    model.pos_optim = torch.optim.Adam(model.eval_mlp.parameters(), lr=0.0005)
    replay_buffer = MemoryBuffer(int(args.batch))

    agent = RandomAgent(env.action_spec())
    max_steps = args.num_steps
    env.reset()

    step = 0
    sub_trajectory = SubTrajectory(100)

    pbar = tqdm(total=max_steps)

    while step < max_steps:
        action = agent.step()
        obs = env.observations()
        rgb, pos, orientation = obs['RGB'], obs['DEBUG.POS.TRANS'], obs['DEBUG.POS.ROT']
        reward = env.step(action)
        if not env.is_running():
            # Episode finished: reset and skip this transition so observations
            # from different episodes are never paired.
            env.reset()
            continue
        new_obs = env.observations()
        new_rgb, new_pos, new_orientation = new_obs['RGB'], new_obs['DEBUG.POS.TRANS'], new_obs['DEBUG.POS.ROT']
        
        if sub_trajectory.len == 100:
            tmp = copy.deepcopy(sub_trajectory)
            # Send initial belief to replay buffer
            o_0 = torch.from_numpy(tmp.new_rgb[0]).to(dtype=torch.float32).unsqueeze(0).to(device)
            a_0 = torch.from_numpy(tmp.action[0]).to(dtype=torch.float32).unsqueeze(0).to(device)
            z_0 = model.conv(o_0)
            bgru_input = torch.cat((z_0, a_0), dim=1)
            _, tmp.belief = model.belief_gru.gru1(torch.unsqueeze(bgru_input, 1))
            replay_buffer.add(tmp)
            sub_trajectory.clear()

        sub_trajectory.add(rgb, pos, orientation, action, new_rgb, new_pos, new_orientation)

        # Train using replay_buffer
        if step >= args.batch * 100:
            train_batch = replay_buffer.sample(64)
            if None in train_batch:
                raise Exception("Training batch contains None object")
            model.update(train_batch)

        step += 1
        pbar.update(1)
    
    pbar.close()
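
The loop above also depends on a SubTrajectory container that is not shown. Below is a minimal sketch consistent with how it is used here: a fixed capacity of 100 steps, a .len counter, .add()/.clear(), per-field access such as .new_rgb[0] and .action[0], and a .belief attribute attached to the deep copy before it is stored. The real class may look different; this is only an assumed interface.

class SubTrajectory:
    """Fixed-capacity container of environment transitions (assumed interface)."""

    def __init__(self, capacity):
        self.capacity = capacity
        self.belief = None  # filled in by the training loop before storage
        self.clear()

    def clear(self):
        self.rgb, self.pos, self.orientation = [], [], []
        self.action = []
        self.new_rgb, self.new_pos, self.new_orientation = [], [], []
        self.len = 0

    def add(self, rgb, pos, orientation, action, new_rgb, new_pos, new_orientation):
        if self.len >= self.capacity:
            return  # full; the caller flushes and clears at capacity
        self.rgb.append(rgb)
        self.pos.append(pos)
        self.orientation.append(orientation)
        self.action.append(action)
        self.new_rgb.append(new_rgb)
        self.new_pos.append(new_pos)
        self.new_orientation.append(new_orientation)
        self.len += 1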
Example #4
def train(rank, device, args):
    current_time = datetime.now().strftime('%b%d_%H-%M')
    LOGGER_DIR = os.path.join(args.log_dir, args.env, current_time, 'Agent:{}'.format(rank))
    writer = SummaryWriter(LOGGER_DIR)
    MODEL_DIR = os.path.join(LOGGER_DIR, 'models')
    if not os.path.exists(MODEL_DIR):
        os.makedirs(MODEL_DIR)

    env = create_env(args.env, args)

    if args.pri:
        ram = PrioMemoryBuffer(args.buffer_size)
    else:
        ram = MemoryBuffer(args.buffer_size)

    player = DDPGAgent(env.observation_space, env.action_space, ram, writer, device, args)
    if args.model_dir is not None:
        player.load_models(args.model_dir)
    steps_done = 0
    episode_rewards = []
    max_score = -9999
    count_eps = 0
    for _ep in range(1, args.max_eps):
        observation = env.reset()
        total_reward = 0
        count_eps += 1
        for r in range(10000):
            if 'img' in args.obs:
                state = np.expand_dims(observation, axis=0)
            else:
                state = np.float32(observation)
            action, action_rescale = player.get_exploration_action(state)
            new_observation, reward, done, info = env.step(action_rescale)
            steps_done += 1
            total_reward += reward
            ram.add(observation, np.expand_dims(action, axis=0), reward, new_observation)
            observation = new_observation
            # perform optimization
            if steps_done > args.start_learning:
                player.optimize()
            if done:
                break

        # logger
        writer.add_scalar('episode/reward', total_reward, steps_done)
        writer.add_scalar('episode/length', r, steps_done)
        episode_rewards.append(total_reward)
        if _ep % args.eval_eps == 0:
            reward_ave = np.array(episode_rewards).mean()
            print('Train, episode %d, steps: %d, reward: %.3f, ave_reward: %.3f' % (count_eps, steps_done, episode_rewards[-1], reward_ave))
            if reward_ave > max_score:
                player.save_models(os.path.join(MODEL_DIR, 'best'))
                max_score = reward_ave
                print('Save Best!')
            else:
                player.save_models(os.path.join(MODEL_DIR, 'new'))
            episode_rewards = []
        # check memory consumption and clear memory
        gc.collect()
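
train() reads its configuration from args. The argparse sketch below wires up the fields this function touches; the flag spellings, types and defaults are illustrative assumptions, not taken from the original project. test() in the next example additionally reads args.render and args.ar and reuses args.eval_eps and args.model_dir.

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--env', type=str, default='Pendulum-v0')   # passed to create_env(args.env, args)
parser.add_argument('--obs', type=str, default='img')           # 'img' selects image observations
parser.add_argument('--log-dir', dest='log_dir', default='runs')
parser.add_argument('--model-dir', dest='model_dir', default=None)
parser.add_argument('--pri', action='store_true')               # use the prioritized replay buffer
parser.add_argument('--buffer-size', dest='buffer_size', type=int, default=100000)
parser.add_argument('--start-learning', dest='start_learning', type=int, default=1000)
parser.add_argument('--max-eps', dest='max_eps', type=int, default=10000)
parser.add_argument('--eval-eps', dest='eval_eps', type=int, default=10)
args = parser.parse_args()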
Example #5
def test(device, args):

    env = create_env(args.env, args)
    ram = MemoryBuffer(1)
    player = DDPGAgent(env.observation_space, env.action_space, ram, None,
                       device, args)
    if args.model_dir is not None:
        player.load_models(args.model_dir, test=True)
    steps_done = 0
    count_eps = 0
    count_success = 0
    while True:
        episode_rewards = []
        episode_lengths = []
        for _ep in range(1, args.eval_eps):
            if args.ar:
                env.seed(True)
            observation = env.reset()
            total_reward = 0
            episode_action = []
            for steps in range(1000):
                if 'img' in args.obs:
                    state = np.expand_dims(observation, axis=0)
                else:
                    state = np.float32(observation)

                action, action_rescale = player.get_exploitation_action(state)
                episode_action.append(action)
                new_observation, reward, done, info = env.step(action_rescale)
                observation = new_observation
                total_reward += reward
                steps_done += 1

                if args.render:
                    env.render()
                if done:
                    episode_rewards.append(total_reward)
                    count_eps += 1
                    episode_lengths.append(steps)
                    if reward > 1:
                        count_success += 1.0
                    break
            # check memory consumption and clear memory
            gc.collect()

        reward_ave = np.array(episode_rewards).mean()
        length_ave = np.array(episode_lengths).mean()
        print(
            'Test, episode %d, steps: %d, Success_rate: %.3f ave_reward: %.3f, ave_length: %.3f'
            % (count_eps, steps_done, count_success / count_eps, reward_ave,
               length_ave))

    env.close()
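
DDPGAgent.get_exploration_action and get_exploitation_action are not shown in these excerpts. In a typical DDPG setup the only difference between them is exploration noise added on top of the deterministic policy; the sketch below reflects that assumption, and the (action, action_rescale) pair is inferred from how the return values are used above (the rescaled action goes to env.step, the raw action into the buffer). All names here (select_action, actor, low, high, noise_sigma) are hypothetical.

import numpy as np

def select_action(actor, state, low, high, noise_sigma=0.0):
    """Return (raw action in [-1, 1], action rescaled to the env bounds)."""
    action = np.asarray(actor(state))                 # deterministic policy output, assumed in [-1, 1]
    if noise_sigma > 0:                               # exploration: add Gaussian noise, then clip
        action = np.clip(action + np.random.normal(0.0, noise_sigma, action.shape), -1.0, 1.0)
    action_rescale = low + (action + 1.0) * 0.5 * (high - low)
    return action, action_rescale

# get_exploration_action(state)  ~ select_action(actor, state, low, high, noise_sigma=0.2)
# get_exploitation_action(state) ~ select_action(actor, state, low, high, noise_sigma=0.0)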
Example #6
class PAgent(Agent):
    def __init__(self,
                 chance_coeff=1,
                 chance_coeff_hl=False,
                 mem_size=1000,
                 mem_batch_size=4,
                 penalize_if_repeats=True,
                 *args,
                 **kwargs):
        super().__init__(*args, **kwargs)

        self.loss_func = torch.nn.KLDivLoss()
        # chance_coeff: 0.0 means always choose the max-probability event, 1.0
        # means always sample from the probability distribution produced by
        # PolicyNet. It's a float, so it controls how much the agent explores.
        # chance_coeff_hl: if set, chance_coeff decays exponentially with this
        # half-life (in steps).
        self.chance_coeff = chance_coeff
        self.chance_coeff_hl = chance_coeff_hl
        self.penalize_if_repeats = penalize_if_repeats

        # memory buffer
        self.memory = MemoryBuffer(size=mem_size,
                                   batch_size=mem_batch_size,
                                   replace=False)

        self.curr_state = self.get_game_state()
        self.prev_state = self.curr_state

        self.max_event = ""
        self._reward = 0
        self._loss = 0

    def step(self):
        if self.chance_coeff_hl:
            hl_ratio = self.step_counter / self.chance_coeff_hl
            self.chance_coeff = math.pow(2, -hl_ratio)

        self.count_step()
        reward = self.compute_reward()
        self.curr_state = self.get_game_state()

        probs = self.model(self.curr_state)
        # p is the probability distribution set by policy net
        probs_list = probs.detach().numpy()[0]
        self.max_idx = np.argmax(probs_list)
        self._real_event = globals.EVENTS[self.max_idx]

        if random.random() > self.chance_coeff:
            chosen_idx = self.max_idx
        else:
            chosen_idx = np.random.choice(range(LEN_EVENTS), p=probs_list)

        chosen_event = globals.EVENTS[chosen_idx]
        if chosen_event != self.last_event:
            self.vm.reset_keys()
            self.vm.keyDown(chosen_event)

        self.memory.add(self.prev_state, chosen_idx, reward, self.curr_state)

        self.last_event = chosen_event
        self._reward = reward  # for info
        if len(self.memory.buffer) >= self.memory.size:
            self.learning_step()

        self.prev_state = self.curr_state

    def learning_step(self):
        self.optimizer.zero_grad()
        batch = self.memory.get_batch()
        probs = self.model(batch["prev_state"])  # (b,8)
        target = probs.clone().detach()  # copy without grad

        for i, p in enumerate(probs):
            prev_a = batch["prev_action"][i]
            if batch["reward"][i] > 0:
                target[i] = torch.zeros(LEN_EVENTS)
                target[i][prev_a] = 1.0
            else:
                target[i][prev_a] = 0.0

        # KLDivLoss expects log-probabilities as its input.
        loss = self.loss_func(probs.log(), target)
        self._loss = float(loss)
        loss.backward()
        self.optimizer.step()

    def __learning_step(self):
        if len(self.event_buffer) > 0:
            self.optimizer.zero_grad()

            target = torch.zeros((len(self.event_buffer), LEN_EVENTS))
            probs_list = []
            for i, event_probs in enumerate(self.event_buffer):
                event, probs = event_probs
                event_idx = globals.EVENTS_IDX[event]
                target[i][event_idx] = 1
                probs_list.append(probs)

            probs_batch = torch.cat(probs_list)
            loss = self.loss_func(probs_batch.log(), target)  # input in log-space for KLDivLoss
            loss.backward()
            self.optimizer.step()
            # end learning step
            self.clear_event_past()
            self.end_learning_step()

            self.positive_step_counter += 1
            print("POSITIVE STEP +++")

    def negative_step_for_repeating(self):
        # Used to penalize the agent if it repeats itself.
        self.optimizer.zero_grad()
        for event, probs in self.event_buffer:
            event_idx = globals.EVENTS_IDX[event]
            # probability distribution where event we're penalizing is 0
            target = probs.clone().detach()  # detached copy
            # Spread the penalized event's probability over the other events so
            # the distribution still sums to 1 after we zero that event out.
            target += target[0][event_idx] / (LEN_EVENTS - 1)
            target[0][event_idx] = 0

            # KLDivLoss expects log-probabilities as its input.
            loss = self.loss_func(probs.log(), target)

            loss.backward()

            self.negative_step_counter += 1
            print("NEGATIVE STEP ---")
            self.clear_event_past()
        self.optimizer.step()

    def clear_event_past(self):
        self.event_buffer = []
        self.event_history = []
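
As a quick sanity check of the target distributions built in learning_step() and negative_step_for_repeating(), they are reproduced below on a toy 4-event distribution (LEN_EVENTS = 4 is only for illustration). Note that the repeat-penalty redistribution keeps the target summing to 1, which is what the original comment claims.

import torch

LEN_EVENTS = 4
probs = torch.tensor([[0.1, 0.2, 0.3, 0.4]])
taken = 2  # index of the action that was taken

# Positive reward: target becomes one-hot on the taken action.
pos_target = torch.zeros(1, LEN_EVENTS)
pos_target[0][taken] = 1.0

# Repeat penalty: spread the taken action's probability over the other events,
# then zero it out; the distribution still sums to 1 (up to float precision).
neg_target = probs.clone()
neg_target += neg_target[0][taken] / (LEN_EVENTS - 1)
neg_target[0][taken] = 0
print(neg_target, float(neg_target.sum()))  # roughly [[0.2, 0.3, 0.0, 0.5]], summing to 1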
Example #7
class DQAgent(Agent):
    def __init__(
            self,
            target_model=None,  # model used to compute target q-values; same architecture as the main model
            update_target_model_every=1000,  # sync the target model with the main model every this many steps
            learn_epsilon_half_life=3000,
            discount_factor=0.9,
            mem_size=1000,
            mem_batch_size=4,
            mem_bias_prob=0.9,
            *args,
            **kwargs):
        super().__init__(*args, **kwargs)

        self.target_model = target_model
        self.update_target_model_every = update_target_model_every

        self.discount_factor = discount_factor
        self.learn_epsilon = 1  # for epsilon-greedy search
        self.learn_epsilon_half_life = learn_epsilon_half_life  # time it takes to fall to half
        self.memory = MemoryBuffer(size=mem_size,
                                   batch_size=mem_batch_size,
                                   bias_prob=mem_bias_prob)
        self.prev_state = self.get_game_state()
        self.prev_score = 0

        # info vars
        self._real_event = ""
        self._reward = 0
        self._last_q = 0
        self._loss = 0

    def step(self):
        if self.learn_epsilon_half_life:
            hl_ratio = self.step_counter / self.learn_epsilon_half_life
            self.learn_epsilon = math.pow(2, -hl_ratio)
        # update target model
        if self.step_counter % self.update_target_model_every == 0:
            self.target_model.load_state_dict(self.model.state_dict())

        self.count_step()  # computes score and counts step

        # compute reward
        reward = self.compute_reward()

        qvals_prev = self.model(self.prev_state)
        qvals_prev = qvals_prev.detach()
        if self.learn_epsilon_half_life and random.random() < self.learn_epsilon:  # random action
            chosen_idx = random.randrange(len(globals.EVENTS))
        else:  # pick best action
            chosen_idx = np.argmax(qvals_prev.numpy())

        self.curr_state = self.get_game_state()
        self.memory.add(self.prev_state, chosen_idx, reward, self.curr_state)

        chosen_event = globals.EVENTS[chosen_idx]
        self._real_event = globals.EVENTS[np.argmax(
            qvals_prev.numpy())]  # for info

        if chosen_event != self.last_event:
            self.vm.reset_keys()
            self.vm.keyDown(chosen_event)

        self.last_event = chosen_event
        self._reward = reward  # for info

        if len(self.memory.buffer) >= self.memory.size:
            self.learning_step()

        self.prev_state = self.curr_state

    def learning_step(self):
        self.optimizer.zero_grad()
        batch = self.memory.get_batch()

        # implement pseudo-Double-DQN
        # |Q|(s',a')
        with torch.no_grad():
            tqvals_curr = self.target_model(
                batch["curr_state"])  # qvals for all possible actions for curr

            # Q(s',a')
            qvals_curr = self.model(batch["curr_state"])
            argmax_qval_curr = torch.argmax(qvals_curr.detach(), dim=1)

        self._last_q = float(qvals_curr[0].max())  # for info
        # Q(s,a)
        qvals_prev = self.model(batch["prev_state"])  # Shape = (b,8)

        # don't touch actions that weren't activated
        target = qvals_prev.clone().detach()

        for i, prev_a in enumerate(batch["prev_action"]):
            argmax_curr = argmax_qval_curr[i]
            target[i][prev_a] = (batch["reward"][i] +
                                 self.discount_factor * tqvals_curr[i][argmax_curr])

        loss = self.loss_func(qvals_prev, target)
        self._loss = float(loss)
        loss.backward()

        if batch["reward"][0] > 0:
            self.positive_step_counter += 1

        self.optimizer.step()
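
The per-sample loop in learning_step() builds the double-DQN style target y_i = r_i + discount_factor * Q_target(s'_i, argmax_a Q_online(s'_i, a)). For reference, the same target written in vectorized form; this is a sketch with an illustrative helper name, and it assumes actions and rewards arrive as 1-D tensors rather than the Python lists used above.

import torch

def double_dqn_targets(qvals_prev, qvals_curr, tqvals_curr, actions, rewards, gamma):
    """qvals_*: (B, n_actions) tensors; actions: (B,) long tensor; rewards: (B,) float tensor."""
    target = qvals_prev.clone().detach()
    best_next = torch.argmax(qvals_curr, dim=1)                       # argmax_a Q_online(s', a)
    boot = tqvals_curr.gather(1, best_next.unsqueeze(1)).squeeze(1)   # Q_target(s', best_next)
    target[torch.arange(len(actions)), actions] = rewards + gamma * boot
    return target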