Example #1
def tester(model,
           device,
           n=5,
           task_config_file="config_files/config_example.json"):
    episode_reward = []
    rnn_size = 128
    env = AI2ThorEnv(config_file=task_config_file)
    for _ in range(n):
        # run one evaluation episode
        total_r = 0.
        d = False
        x = reset(env, rnn_size, device)
        while not d:
            with torch.no_grad():
                a_t, _, _, _, state_t = model(x)
                # interact with environment
                o, r, d, _ = env.step(a_t.data.item())
                total_r += r  # accumulate reward within one rollout.
                # prepare inputs for next step
                x["observation"] = torch.Tensor(o / 255.).to(device)
                x["memory"]["state"] = state_t
                x["memory"]["mask"] = torch.tensor(
                    (d + 1) % 2, dtype=torch.float32).to(device)
                x["memory"]["action"] = a_t

        episode_reward.append(total_r)
        print("Episode reward:", total_r)

    env.close()
    print(f"Average episode reward: {np.mean(episode_reward)}")
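The tester above relies on a reset(env, rnn_size, device) helper that is not shown. A minimal sketch of what it might look like, assuming the model input is the same dict of observation and recurrent memory that the loop above keeps updating (the initial state, mask and action values are assumptions):

import torch

def reset(env, rnn_size, device):
    # Hypothetical helper: build the input dict the rollout loop above expects.
    o = env.reset()
    return {
        "observation": torch.Tensor(o / 255.).to(device),
        "memory": {
            # assumed: a single zero recurrent state of width rnn_size
            "state": torch.zeros(1, rnn_size).to(device),
            # assumed: mask 0 so the first step ignores the empty previous state
            "mask": torch.tensor(0., dtype=torch.float32).to(device),
            # assumed: a dummy initial action
            "action": torch.zeros(1, dtype=torch.long).to(device),
        },
    }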
Example #2
    def test_environments_runs(self):
        """
        Checks that the environment still runs and nothing breaks. Useful for continuous
        deployment and keeping master stable. Also checks how long 10 steps take within the
        environment. The final assert checks that the expected number of steps was taken,
        with no off-by-one errors.

        Prints the execution time at the end of the test as a performance check.
        """
        num_steps = 10
        env = AI2ThorEnv()
        start = time.time()
        all_step_times = []
        env.reset()
        for step_num in range(num_steps):
            start_of_step = time.time()
            action = env.action_space.sample()
            state, reward, done, _ = env.step(action)

            time_for_step = time.time() - start_of_step
            print(
                'Step: {}. env.task.step_num: {}. Time taken for step: {:.3f}'.
                format(step_num, env.task.step_num, time_for_step))
            all_step_times.append(time_for_step)

            if done:
                break

        print('Time taken altogether: {}\nAverage time taken per step: {:.3f}'.
              format(time.time() - start,
                     sum(all_step_times) / len(all_step_times)))

        self.assertEqual(len(all_step_times), num_steps)
        env.close()
Example #3
    def test_config_override(self):
        """
        Check that reading both a config file and a config dict at the same time works and that
        the correct warning is raised for the overwrite. Afterwards, check that scene_id was
        correctly changed by the override.
        """
        with warnings.catch_warnings(record=True) as warning_objs:
            env = AI2ThorEnv(config_dict={'scene_id': 'FloorPlan27'})
            # checking if correct warning appears (there could be multiple depending on user)
            self.assertTrue([
                w for w in warning_objs
                if 'Key: scene_id already in config file' in w.message.args[0]
            ])

        self.assertEqual(env.scene_id, 'FloorPlan27')
        env.close()
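The same override mechanism should also apply when a config file is given explicitly; a minimal sketch, assuming the constructor accepts config_file and config_dict together (the file path is reused from the other examples and the value is illustrative):

env = AI2ThorEnv(config_file="config_files/config_example.json",
                 config_dict={'scene_id': 'FloorPlan27'})
print(env.scene_id)  # the dict value overrides the one read from the file
env.close()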
Example #4
    def test_cup_task_and_interaction_actions(self):
        """
        Check that picking up and putting down the cup works and that the agent receives a reward
        of 2 for doing it twice. To put the cup down, the agent places it in the microwave and then
        picks it up again. This also implicitly checks that there is no random initialisation and
        that the same actions in the same environment achieve the same reward each time.
        """

        # actions_to_look_at_cup = ['RotateRight', 'RotateRight', 'MoveAhead', 'MoveAhead',
        #     'RotateRight', 'MoveAhead', 'MoveAhead', 'RotateLeft', 'MoveAhead', 'MoveAhead',
        #     'MoveAhead', 'RotateLeft', 'LookDown', 'PickupObject', 'PutObject', 'LookUp',
        #     'MoveRight', 'OpenObject', 'PutObject', 'PickupObject', 'CloseObject']

        actions_to_look_at_cup = [
            'MoveAhead', 'MoveBack', 'RotateRight', 'RotateLeft', 'LookUp',
            'LookDown', 'Stop'
        ]

        env = AI2ThorEnv(
            config_dict={
                'scene_id': 'FloorPlan28',
                'gridSize': 0.25,
                'acceptable_receptacles': [
                    'Microwave'  # the used receptacle below
                ],
                'target_objects': {
                    'Mug': 1
                }
            })
        movement_penalty = len(
            actions_to_look_at_cup) * env.task.movement_reward

        for episode in range(2):  # twice to make sure no random initialisation
            env.reset()
            rewards = []
            for action_str in actions_to_look_at_cup:
                action = env.action_names.index(action_str)
                state, reward, done, _ = env.step(action)
                rewards.append(reward)
                if done:
                    break
            # self.assertAlmostEqual(sum(rewards), 2 + movement_penalty)
        env.close()
Example #5
def main(args=None):
    # Parse arguments
    if args is None:
        args = sys.argv[1:]
    args = parse_args(args)
    # Check if a GPU ID was set
    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
    set_session(get_session())

    # Environment Initialization
    if args.is_ai2thor:
        config_dict = {'max_episode_length': 2000}
        env = AI2ThorEnv(config_dict=config_dict)
        state = env.reset()
        state_dim = state.shape
        action_dim = env.action_space.n
    elif args.is_atari:
        # Atari Environment Wrapper
        env = AtariEnvironment(args)
        state_dim = env.get_state_size()
        action_dim = env.get_action_size()
    else:
        # Standard Environments
        env = Environment(gym.make(args.env), args.consecutive_frames)
        env.reset()
        state_dim = env.get_state_size()
        action_dim = gym.make(args.env).action_space.n

    algo = A3C(action_dim, state_dim, args.consecutive_frames, is_atari=args.is_atari, is_ai2thor=args.is_ai2thor)
    algo.load_weights(args.actor_path, args.critic_path)

    # Display agent
    old_state, time = env.reset(), 0
    while True:
        a = algo.policy_action(old_state)
        old_state, r, done, _ = env.step(a)
        time += 1
        if done:
            print('----- done, resetting env ----')
            old_state = env.reset()
Example #6
    def test_all_task_init(self):
        """
        Test that the creation of all tasks still works by taking a few random steps after
        resetting environment
        """
        param_list = [{
            'pickup_objects': ['Mug', 'Apple'],
            'task': {
                'task_name': 'PickUpTask',
                'target_objects': {
                    'Mug': 1,
                    'Apple': 5
                }
            }
        }]

        for params in param_list:
            env = AI2ThorEnv(config_dict=params)
            state = env.reset()
            for i in range(5):
                action = env.action_space.sample()
                state, reward, done, _ = env.step(action)
            env.close()
Example #7
def train(rank, args, shared_model, counter, lock, optimizer):
    torch.manual_seed(args.seed + rank)
    env = AI2ThorEnv(config_dict=args.config_dict)
    env.seed(args.seed + rank)

    if args.point_cloud_model:
        model = ActorCritic(env.action_space.n)
    else:
        args.frame_dim = env.config['resolution'][-1]
        model = ActorCritic(env.action_space.n, env.observation_space.shape[0],
                            args.frame_dim)

    if args.cuda:
        model = model.cuda()
    model.train()

    state = env.reset()
    done = True

    # monitoring
    total_reward_for_num_steps_list = []
    episode_total_rewards_list = []
    avg_reward_for_num_steps_list = []

    total_length = 0
    episode_length = 0
    n_episode = 0
    total_reward_for_episode = 0
    all_rewards_in_episode = []
    while True:
        # Sync with the shared model
        model.load_state_dict(shared_model.state_dict())

        if done:
            cx = torch.zeros(1, 256)
            hx = torch.zeros(1, 256)
        else:
            cx = cx.detach()
            hx = hx.detach()

        values = []
        log_probs = []
        rewards = []
        entropies = []

        for step in range(args.num_steps):
            episode_length += 1
            total_length += 1
            if args.cuda:
                if args.point_cloud_model:
                    state = (state[0].cuda(), state[1].cuda())
                else:
                    state = state.cuda()
                cx = cx.cuda()
                hx = hx.cuda()
            value, logit, (hx, cx) = model((state, (hx, cx)))
            prob = F.softmax(logit, dim=-1)
            log_prob = F.log_softmax(logit, dim=-1)
            entropy = -(log_prob * prob).sum(1, keepdim=True)
            entropies.append(entropy)

            action = prob.multinomial(num_samples=1).detach()
            log_prob = log_prob.gather(1, action)
            log_probs.append(log_prob)

            action_int = action.cpu().numpy()[0][0].item()

            state, reward, done, _ = env.step(action_int, verbose=False)
            done = done or episode_length >= args.max_episode_length

            with lock:
                counter.value += 1

            if done:
                total_length -= 1
                total_reward_for_episode = sum(all_rewards_in_episode)
                episode_total_rewards_list.append(total_reward_for_episode)
                all_rewards_in_episode = []
                state = env.reset()
                print(
                    'Process {} Episode {} Over with Length: {} and Reward: {: .2f}. Total Trained Length: {}'
                    .format(rank, n_episode, episode_length,
                            total_reward_for_episode, total_length))
                sys.stdout.flush()
                episode_length = 0
                n_episode += 1

            values.append(value)
            rewards.append(reward)
            all_rewards_in_episode.append(reward)

            if done:
                break

        if args.synchronous:
            if total_reward_for_episode >= args.solved_reward:
                print("Process {} Solved with Reward {}".format(
                    rank, total_reward_for_episode))
                env.close()
                break

        total_reward_for_num_steps = sum(rewards)
        total_reward_for_num_steps_list.append(total_reward_for_num_steps)
        avg_reward_for_num_steps = total_reward_for_num_steps / len(rewards)
        avg_reward_for_num_steps_list.append(avg_reward_for_num_steps)

        # Backprop and optimisation
        R = torch.zeros(1, 1)
        gae = torch.zeros(1, 1)
        if args.cuda:
            if args.point_cloud_model:
                state = (state[0].cuda(), state[1].cuda())
            else:
                state = state.cuda()
            R = R.cuda()
            gae = gae.cuda()
        if not done:  # bootstrap the return from the value estimate when the rollout did not terminate
            value, _, _ = model((state, (hx, cx)))
            R = value.detach()

        values.append(R)
        policy_loss = 0
        value_loss = 0
        # import pdb;pdb.set_trace() # good place to breakpoint to see training cycle

        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = rewards[i] + args.gamma * values[i + 1] - values[i]
            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - log_probs[i] * gae.detach() - \
                          args.entropy_coef * entropies[i]

        optimizer.zero_grad()

        (policy_loss + args.value_loss_coef * value_loss).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

        ensure_shared_grads(model, shared_model)
        optimizer.step()
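ensure_shared_grads is called above but not defined in this snippet. A common minimal implementation (an assumption here, not necessarily this repo's exact version) points the shared model's gradients at the worker's gradients once per update:

def ensure_shared_grads(model, shared_model):
    # Copy local gradients into the shared model; return early if they are
    # already hooked up, since the tensors are then reused on later updates.
    for param, shared_param in zip(model.parameters(), shared_model.parameters()):
        if shared_param.grad is not None:
            return
        shared_param._grad = param.grad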
Example #8
from gym_ai2thor.envs.ai2thor_env import AI2ThorEnv
import numpy as np



if __name__ == '__main__':
    env = AI2ThorEnv()
    n = env.action_space.n
    env.reset()
    episodes = []
    for i in range(5):
        env.reset()
        d = False
        total_r = 0.
        while not d:
            a = np.random.choice(n)
            o, r, d, _ = env.step(a)
            total_r += r

        episodes.append(total_r)

        print(f'Total reward in episode {i} is {total_r}')

    print("AVG episode rewards:", episodes, np.mean(episodes))
Example #9
    mp.set_start_method("spawn")
    os.environ['OMP_NUM_THREADS'] = '1'
    # os.environ['CUDA_VISIBLE_DEVICES'] = ""

    args = parser.parse_args()
    args.cuda = not args.no_cuda and torch.cuda.is_available()
    if args.cuda:
        print('Using', torch.cuda.get_device_name(0))
        torch.cuda.init()

    torch.manual_seed(args.seed)
    args.config_dict = {
        'max_episode_length': args.max_episode_length,
        'point_cloud_model': args.point_cloud_model
    }
    env = AI2ThorEnv(config_dict=args.config_dict)

    if args.point_cloud_model:
        shared_model = ActorCritic(env.action_space.n)
    else:
        args.frame_dim = env.config['resolution'][-1]
        shared_model = ActorCritic(env.action_space.n,
                                   env.observation_space.shape[0],
                                   args.frame_dim)

    if args.cuda:
        shared_model = shared_model.cuda()
    shared_model.share_memory()

    env.close()  # above env initialisation was only to find certain params needed
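What typically follows this setup is spawning the worker processes; a sketch under assumptions: the train/test signatures from the other examples, a shared counter and lock, and an optimizer built over shared_model (A3C implementations often use a process-safe SharedAdam rather than plain Adam):

    # (assumed) shared bookkeeping objects, matching the train()/test() signatures above
    counter = mp.Value('i', 0)
    lock = mp.Lock()
    optimizer = torch.optim.Adam(shared_model.parameters(), lr=args.lr)

    processes = []
    p = mp.Process(target=test, args=(args.num_processes, args, shared_model, counter))
    p.start()
    processes.append(p)
    for rank in range(args.num_processes):
        p = mp.Process(target=train,
                       args=(rank, args, shared_model, counter, lock, optimizer))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()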
Example #10
    if torch.cuda.is_available() and not args.disable_cuda:
        args.device = torch.device('cuda')
        torch.cuda.manual_seed(np.random.randint(1, 10000))
        # Disable non deterministic ops (not sure if critical but better safe than sorry)
        torch.backends.cudnn.enabled = False
    else:
        args.device = torch.device('cpu')

    # Simple ISO 8601 timestamped logger
    def log(s):
        print('[' + str(datetime.now().strftime('%Y-%m-%dT%H:%M:%S')) + '] ' +
              s)

    # Environment selection
    if args.game == 'ai2thor':
        env = FrameStackEnv(AI2ThorEnv(config_file=args.config_file),
                            args.history_length, args.device)
        args.resolution = env.config['resolution']
        args.img_channels = env.observation_space.shape[0]
    else:
        env = Env(args)
        env.train()
        args.resolution = (84, 84)
        args.img_channels = 1
    action_space = env.action_space

    # Agent
    dqn = Agent(args, env)
    mem = ReplayMemory(args, args.memory_capacity)
    """ Priority weights are linearly annealed and increase every step by priority_weight_increase
    from args.priority_weight to 1. """
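The snippet is cut off here; the increment the docstring refers to is typically a single per-step constant, along these lines (an assumption; the attribute names T_max and learn_start are illustrative and not confirmed by this snippet):

    priority_weight_increase = (1 - args.priority_weight) / (args.T_max - args.learn_start)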
Example #11
def test(rank, args, shared_model, counter):
    torch.manual_seed(args.seed + rank)

    if args.atari:
        env = create_atari_env(args.atari_env_name)
    else:
        args.config_dict = {'max_episode_length': args.max_episode_length}
        env = AI2ThorEnv(config_dict=args.config_dict)
    env.seed(args.seed + rank)

    model = ActorCritic(env.observation_space.shape[0], env.action_space.n, args.frame_dim)
    if args.cuda:
        model = model.cuda()

    model.eval()

    state = env.reset()
    state = torch.from_numpy(state)
    reward_sum = 0
    done = True

    start_time = time.time()

    # a quick hack to prevent the agent from getting stuck
    actions = deque(maxlen=100)
    episode_length = 0
    while True:
        episode_length += 1
        if args.atari and args.atari_render:
            env.render()
        # Sync with the shared model
        if done:
            model.load_state_dict(shared_model.state_dict())
            cx = torch.zeros(1, 64)
            hx = torch.zeros(1, 64)
        else:
            cx = cx.detach()
            hx = hx.detach()

        with torch.no_grad():
            if args.cuda:
                state = state.cuda()
                cx = cx.cuda()
                hx = hx.cuda()
            value, logit, (hx, cx) = model((state.unsqueeze(0).float(), (hx, cx)))
        prob = F.softmax(logit, dim=-1)
        # log_prob = F.log_softmax(logit, dim=-1)
        # print(prob)
        # entropy = -(log_prob * prob).sum(1, keepdim=True)
        # print(prob.max(1, keepdim=True)[0].cpu().numpy())
        # print(entropy)

        action = prob.max(1, keepdim=True)[1].cpu().numpy()
        state, reward, done, _ = env.step(action[0, 0], verbose=False)
        done = done or episode_length >= args.max_episode_length
        reward_sum += reward

        # a quick hack to prevent the agent from getting stuck,
        # i.e. in test mode an agent can repeat an action ad infinitum
        actions.append(action[0, 0])
        if actions.count(actions[0]) == actions.maxlen:
            print('In test. Episode over because agent repeated action {} times'.format(
                                                                                actions.maxlen))
            done = True

        if done:
            print("Time {}, num steps over all threads {}, FPS {:.0f}, episode reward {}, episode length {}".format(
                time.strftime("%Hh %Mm %Ss",
                              time.gmtime(time.time() - start_time)),
                counter.value, counter.value / (time.time() - start_time),
                reward_sum, episode_length))

            if reward_sum >= args.solved_reward:
                print("Solved Testing with Reward {}".format(reward_sum))
                torch.save(model.state_dict(), "solved_{}.pth".format("atari" if args.atari else "ai2thor"))
                env.close()
                break

            reward_sum = 0
            episode_length = 0
            actions.clear()
            state = env.reset()
            time.sleep(args.test_sleep_time)

        state = torch.from_numpy(state)
Example #12
def worker(worker_id,
           policy,
           storage,
           ready_to_work,
           queue,
           exit_flag,
           use_priors=False,
           task_config_file="config_files/config_example.json"):
    '''
    Worker function to collect experience based on the policy and store it in storage.
    :param worker_id: id used to store this worker's experience in storage
    :param policy: function/actor-critic returning action, log-prob, value and recurrent state
    :param storage: shared rollout storage
    :param ready_to_work: event used to synchronise workers and the learner
    :param queue: message queue used to send episode rewards to the learner
    :param exit_flag: flag set by the learner to exit the job
    :param use_priors: if True, pass the raw observation through instead of the normalised tensor
    :param task_config_file: the task configuration file
    :return:
    '''

    print(f"Worker with Id:{worker_id} pid ({os.getpid()}) starts ...")

    steps_per_epoch = storage.block_size
    state_size = storage.h_buf.shape[1]
    device = storage.device

    env = AI2ThorEnv(config_file=task_config_file)
    x = reset(env, state_size, device)
    episode_rewards, episode_steps = [], []
    r_sum, step_sum = 0., 0

    # Wait for start job
    print('waiting>>>>>>>>>>>>>>>>>>')
    ready_to_work.wait()
    print('waiting<<<<<<<<<<<<<<<<<<')
    while exit_flag.value != 1:
        for i in range(steps_per_epoch):
            with torch.no_grad():
                a_t, logp_t, _, v_t, state_t = policy(x)

            # interact with environment
            o, r, d, _ = env.step(a_t.item())
            # print('o.shape', o.shape, type(o.shape))

            r_sum += r  # accumulate reward within one rollout.
            step_sum += 1
            r_t = torch.tensor(r, dtype=torch.float32).to(device)
            # save experience
            storage.store(worker_id, x["observation"], a_t, r_t, v_t, logp_t,
                          x["memory"]["state"], x["memory"]["mask"])
            # prepare inputs for next step
            if use_priors:
                x["observation"] = o
            else:
                x["observation"] = torch.Tensor(o / 255.).to(
                    device)  # 128x128 -> 1x128x128
            # print('x["observation"]', x["observation"].shape)

            x["memory"]["state"] = state_t
            x["memory"]["mask"] = torch.tensor((d + 1) % 2,
                                               dtype=torch.float32).to(device)
            x["memory"]["action"] = a_t
            # check terminal state
            if d:  # calculate the returns and GAE and reset environment
                storage.finish_path(worker_id, 0)
                # print(f"Worker:{worker_id} {device} pid:{os.getpid()} finishes goal at steps :{i}")
                episode_rewards.append(r_sum)
                episode_steps.append(step_sum)
                x = reset(env, state_size, device)
                r_sum, step_sum = 0., 0
        # the rollout ended without reaching a terminal state; bootstrap with the value estimate
        if not d:
            _, _, _, last_val, _ = policy(x)
            storage.finish_path(worker_id, last_val)
        # print(f"Worker:{worker_id} {device} pid:{os.getpid()} begins to notify Learner Episode done")
        queue.put((episode_rewards, episode_steps, worker_id))
        # print(f"Worker:{worker_id} waits for next episode")
        episode_rewards, episode_steps = [], []
        # x = reset(env, state_size)
        # r_sum, step_sum = 0., 0
        # Wait for next job
        ready_to_work.clear()
        ready_to_work.wait()
        # print(f"Worker:{worker_id} {device} pid:{os.getpid()} starts new episode")

    env.close()
    print(f"Worker with pid ({os.getpid()})  finished job")
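For context, a minimal sketch of the learner-side handshake implied by the event/queue usage above (the epoch and worker counts are illustrative and the policy update itself is elided):

def learner_loop(queue, ready_to_work, exit_flag, num_workers, n_epochs):
    # Hypothetical counterpart to the worker above: release the workers, collect
    # their episode statistics, run a policy update, and repeat.
    for epoch in range(n_epochs):
        ready_to_work.set()  # let every worker collect one block of steps
        for _ in range(num_workers):
            episode_rewards, episode_steps, worker_id = queue.get()
        # ... update the policy from the shared storage here ...
    exit_flag.value = 1
    ready_to_work.set()  # unblock waiting workers so they can see the flag and exit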
Example #13
def test(rank, args, shared_model, counter):
    torch.manual_seed(args.seed + rank)

    if args.atari:
        env = create_atari_env(args.atari_env_name)
    else:
        args.config_dict = {'max_episode_length': args.max_episode_length}
        env = AI2ThorEnv(config_dict=args.config_dict)
    env.seed(args.seed + rank)

    model = ActorCritic(env.observation_space.shape[0], env.action_space.n,
                        args.frame_dim)

    model.eval()

    state = env.reset()
    state = torch.from_numpy(state)
    reward_sum = 0
    done = True  # True at the start so the first iteration syncs the model and resets the LSTM state

    start_time = time.time()

    # a quick hack to prevent the agent from getting stuck
    actions = deque(maxlen=100)
    episode_length = 0
    episodes = 0

    vis = Visdom()
    assert vis.check_connection()
    vis.close()
    win = vis.line(X=[0.],
                   Y=[0.],
                   win='testing_Rewards',
                   opts=dict(title='testing_Rewards'))

    while True:
        episode_length += 1
        if args.atari and args.atari_render:
            env.render()
        # Sync with the shared model
        if done:
            model.load_state_dict(shared_model.state_dict())
            cx = torch.zeros(1, 256)
            hx = torch.zeros(1, 256)
        else:
            cx = cx.detach()
            hx = hx.detach()

        with torch.no_grad():
            value, logit, (hx, cx) = model(
                (state.unsqueeze(0).float(), (hx, cx)))

        prob = F.softmax(logit, dim=-1)
        action = prob.max(1, keepdim=True)[1].numpy()

        state, reward, done, _ = env.step(action[0, 0])
        done = done or episode_length >= args.max_episode_length
        reward_sum += reward

        # a quick hack to prevent the agent from getting stuck,
        # i.e. in test mode an agent can repeat an action ad infinitum
        actions.append(action[0, 0])
        if actions.count(actions[0]) == actions.maxlen:
            print(
                'In test. Episode over because agent repeated action {} times'.
                format(actions.maxlen))
            done = True

        if done:
            print(
                "In test. Time {}, num steps over all threads {}, FPS {:.0f}, episode reward {}, episode length {}"
                .format(
                    time.strftime("%Hh %Mm %Ss",
                                  time.gmtime(time.time() - start_time)),
                    counter.value, counter.value / (time.time() - start_time),
                    reward_sum, episode_length))
            vis.line(X=[episodes],
                     Y=[reward_sum],
                     win='testing_Rewards',
                     update='append')
            episodes += 1

            reward_sum = 0
            episode_length = 0
            actions.clear()
            state = env.reset()
            time.sleep(args.test_sleep_time)  # wasting resource...
            print('testing...')

        state = torch.from_numpy(state)
Example #14
def test(rank, args, shared_model, counter):
    torch.manual_seed(args.seed + rank)
    env = AI2ThorEnv(config_dict=args.config_dict)
    env.seed(args.seed + rank)

    if args.point_cloud_model:
        model = ActorCritic(env.action_space.n)
    else:
        args.frame_dim = env.config['resolution'][-1]
        model = ActorCritic(env.action_space.n, env.observation_space.shape[0],
                            args.frame_dim)

    if args.cuda:
        model = model.cuda()

    model.eval()

    state = env.reset()
    reward_sum = 0
    done = True

    save = '{}-steps{}-process{}-lr{}-entropy_coef{}'.format(
        "point" if args.point_cloud_model else "conv", args.num_steps,
        args.num_processes, args.lr, args.entropy_coef)
    save = os.path.join('logs', save)
    os.makedirs(save, exist_ok=True)

    if args.model:
        shared_model.load_state_dict(
            torch.load(os.path.join(save, "solved_ai2thor.pth")))
    else:
        logger = CSVLogger(os.path.join(save, 'test.csv'))
        fields = ['episode_reward', 'frames_rendered']
        logger.log(fields)

    start_time = time.time()

    # a quick hack to prevent the agent from getting stuck
    # actions = deque(maxlen=100)
    episode_length = 0
    while True:
        episode_length += 1
        # Sync with the shared model
        if done:
            model.load_state_dict(deepcopy(shared_model.state_dict()))
            cx = torch.zeros(1, 256)
            hx = torch.zeros(1, 256)
        else:
            cx = cx.detach()
            hx = hx.detach()

        with torch.no_grad():
            if args.cuda:
                if args.point_cloud_model:
                    state = (state[0].cuda(), state[1].cuda())
                else:
                    state = state.cuda()
                cx = cx.cuda()
                hx = hx.cuda()
            value, logit, (hx, cx) = model((state, (hx, cx)))
        prob = F.softmax(logit, dim=-1)
        # log_prob = F.log_softmax(logit, dim=-1)
        # print(prob)
        # entropy = -(log_prob * prob).sum(1, keepdim=True)
        # print(prob.max(1, keepdim=True)[0].cpu().numpy())
        # print(entropy)

        action = prob.max(1, keepdim=True)[1].cpu().numpy()
        state, reward, done, _ = env.step(action[0, 0], verbose=False)
        done = done or episode_length >= args.max_episode_length
        reward_sum += reward

        # a quick hack to prevent the agent from getting stuck,
        # i.e. in test mode an agent can repeat an action ad infinitum
        # actions.append(action[0, 0])
        # if actions.count(actions[0]) == actions.maxlen:
        #     print('In test. Episode over because agent repeated action {} times'.format(
        #                                                                         actions.maxlen))
        #     done = True

        if done:
            print(
                "Time {}, num steps over all threads {}, FPS {:.0f}, episode reward {: .2f}, episode length {}"
                .format(
                    time.strftime("%Hh %Mm %Ss",
                                  time.gmtime(time.time() - start_time)),
                    counter.value, counter.value / (time.time() - start_time),
                    reward_sum, episode_length))
            if not args.model:
                logger.log(["{: .2f}".format(reward_sum), counter.value])

            if reward_sum >= args.solved_reward:
                print("Solved Testing with Reward {}".format(reward_sum))
                torch.save(model.state_dict(),
                           os.path.join(save, "solved_ai2thor.pth"))
                env.close()
                logger.close()
                break

            reward_sum = 0
            episode_length = 0
            # actions.clear()
            state = env.reset()
            time.sleep(args.test_sleep_time)
Example #15
def train(rank, args, shared_model, counter, lock, device, optimizer=None):
    torch.manual_seed(args.seed + rank)

    if args.atari:
        env = create_atari_env(args.atari_env_name)
    else:
        args.config_dict = {'max_episode_length': args.max_episode_length}
        env = AI2ThorEnv(config_dict=args.config_dict)
    env.seed(args.seed + rank)

    model = ActorCritic(env.observation_space.shape[0], env.action_space.n,
                        args.frame_dim)
    model = model.to(device)

    if optimizer is None:
        optimizer = optim.Adam(shared_model.parameters(), lr=args.lr)

    model.train()

    state = env.reset()
    state = torch.from_numpy(state)
    done = True

    # monitoring
    total_reward_for_num_steps_list = []
    episode_total_rewards_list = []
    all_rewards_in_episode = []
    avg_reward_for_num_steps_list = []

    total_length = 0
    episode_length = 0
    episodes = 0

    vis = Visdom()
    assert vis.check_connection()
    vis.close()
    vis.line(X=[0.],
             Y=[0.],
             win='training_Rewards' + str(rank),
             opts=dict(title='training_Rewards' + str(rank)))

    while True:
        episodes += 1
        # Sync with the shared model
        model.load_state_dict(shared_model.state_dict())
        if done:
            cx = torch.zeros(1, 256)
            hx = torch.zeros(1, 256)
        else:
            cx = cx.detach()
            hx = hx.detach()

        values = []
        log_probs = []
        rewards = []
        entropies = []

        for step in range(args.num_steps):
            episode_length += 1
            total_length += 1
            value, logit, (hx, cx) = model(
                (state.unsqueeze(0).float().cuda(), (hx.cuda(), cx.cuda())))
            value = value.cpu()
            logit = logit.cpu()
            hx = hx.cpu()
            cx = cx.cpu()
            prob = F.softmax(logit, dim=-1)
            log_prob = F.log_softmax(logit, dim=-1)
            entropy = -(log_prob * prob).sum(1, keepdim=True)
            entropies.append(entropy)

            action = prob.multinomial(num_samples=1).detach()
            log_prob = log_prob.gather(1, action)

            action_int = action.numpy()[0][0].item()
            state, reward, done, _ = env.step(action_int)

            done = done or episode_length >= args.max_episode_length

            with lock:
                counter.value += 1

            if done:
                total_length -= 1
                total_reward_for_episode = sum(all_rewards_in_episode)
                episode_total_rewards_list.append(total_reward_for_episode)
                all_rewards_in_episode = []

                vis.line(X=[episodes],
                         Y=[total_reward_for_episode],
                         win='training_Rewards' + str(rank),
                         update='append')
                print(
                    'In Train. Episode Over. Total Length: {}. Total reward for episode: {}'
                    .format(total_length, total_reward_for_episode))
                print('In Train. Step no: {}. total length: {}'.format(
                    episode_length, total_length))

                episode_length = 0
                state = env.reset()

            state = torch.from_numpy(state)
            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward)
            all_rewards_in_episode.append(reward)

            if done:
                break

        # No interaction with environment below.
        # Monitoring
        total_reward_for_num_steps = sum(rewards)  # accumulate at each step
        total_reward_for_num_steps_list.append(total_reward_for_num_steps)
        avg_reward_for_num_steps = total_reward_for_num_steps / len(rewards)
        avg_reward_for_num_steps_list.append(avg_reward_for_num_steps)

        # Backprop and optimisation
        R = torch.zeros(1, 1)
        if not done:  # bootstrap the return from the value estimate when the rollout did not terminate
            value, _, _ = model(
                (state.unsqueeze(0).float().cuda(), (hx.cuda(), cx.cuda())))
            R = value.detach().cpu()

        values.append(R)
        policy_loss = 0
        value_loss = 0
        # import pdb;pdb.set_trace() # good place to breakpoint to see training cycle
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = rewards[i] + args.gamma * values[i + 1] - values[i]
            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - log_probs[i] * gae.detach() - \
                          args.entropy_coef * entropies[i]

        optimizer.zero_grad()

        (policy_loss + args.value_loss_coef * value_loss).cuda().backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

        ensure_shared_grads(model, shared_model)
        optimizer.step()
Example #16
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--lr', type=float, default=3e-4)
    parser.add_argument('--clip-param', type=float, default=0.2)
    parser.add_argument('--value_loss_coef', type=float, default=0.2)
    parser.add_argument('--entropy_coef', type=float, default=0.001)
    parser.add_argument('--max-kl', type=float, default=0.01)

    parser.add_argument('--use-priors', type=bool, default=False)
    parser.add_argument('--use-attention', type=bool, default=True)
    parser.add_argument('--attention', type=str, default='CBAM')

    args = parser.parse_args()
    torch.multiprocessing.set_start_method('spawn')

    # get observation dimension
    env = AI2ThorEnv(config_file="config_files/multiMugTaskTrain.json")
    env.reset()
    obs_dim = env.observation_space.shape
    # Share information about action space with policy architecture
    ac_kwargs = dict()
    ac_kwargs['action_space'] = env.action_space
    ac_kwargs['state_size'] = args.state_size
    ac_kwargs['use_attention'] = args.use_attention
    ac_kwargs['attention'] = args.attention
    env.close()
    # Main model
    print("Initialize Model...")
    # Construct Model
    ac_model = ActorCritic(obs_shape=obs_dim, **ac_kwargs)
    if args.model_path:
        ac_model.load_state_dict(torch.load(args.model_path))
Example #17
    config_dict = {
        'cameraY': -0.85,
        'gridSize': 0.1,  # 0.01
        'continuous_movement': True,
        'build_file_name': args.build_file_name,
        'task': {
            'task_name': 'PickUpTask',
            'target_objects': {
                'Mug': 1,
                'Bowl': 5
            }
        }
    }

    # The config_dict passed to the env overwrites a few values from the default config_file,
    # so a few warnings will occur.
    env = AI2ThorEnv(config_dict=config_dict)
    max_episode_length = env.task.max_episode_length
    N_EPISODES = 3
    for episode in range(N_EPISODES):
        start = time.time()
        state = env.reset()
        for step_num in range(max_episode_length):
            action = env.action_space.sample()
            state, reward, done, _ = env.step(action)
            if done:
                break

            if step_num + 1 > 0 and (step_num + 1) % 100 == 0:
                print('Episode: {}. Step: {}/{}. Time taken: {:.3f}s'.format(
                    episode + 1, (step_num + 1), max_episode_length,
                    time.time() - start))
Example #18
def main(args=None):
    # Parse arguments
    if args is None:
        args = sys.argv[1:]
    args = parse_args(args)

    # Check if a GPU ID was set
    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
    # Environment Initialization
    if args.is_ai2thor:
        config_dict = {'max_episode_length': 500}
        env = AI2ThorEnv(config_dict=config_dict)
        state = env.reset()
        state_dim = state.shape
        action_dim = env.action_space.n
        args.env = 'ai2thor'
    elif args.is_atari:
        # Atari Environment Wrapper
        env = AtariEnvironment(args)
        state_dim = env.get_state_size()
        action_dim = env.get_action_size()
        print(state_dim)
        print(action_dim)
    else:
        # Standard Environments
        env = Environment(gym.make(args.env), args.consecutive_frames)
        env.reset()
        state_dim = env.get_state_size()
        action_dim = gym.make(args.env).action_space.n
    set_session(get_session())
    summary_writer = tf.summary.FileWriter(args.type + "/tensorboard_" +
                                           args.env)
    algo = A3C(action_dim,
               state_dim,
               args.consecutive_frames,
               is_atari=args.is_atari,
               is_ai2thor=args.is_ai2thor)

    # Train
    stats = algo.train(env, args, summary_writer)

    # Export results to CSV
    if args.gather_stats:
        df = pd.DataFrame(np.array(stats))
        df.to_csv(args.type + "/logs.csv",
                  header=['Episode', 'Mean', 'Stddev'],
                  float_format='%10.5f')

    # Save weights and close environments
    exp_dir = '{}/models/'.format(args.type)
    if not os.path.exists(exp_dir):
        os.makedirs(exp_dir)

    export_path = '{}{}_ENV_{}_NB_EP_{}_BS_{}'.format(exp_dir, args.type,
                                                      args.env,
                                                      args.nb_episodes,
                                                      args.batch_size)

    algo.save_weights(export_path)
    env.close()