Example #1
def train(
    rank, args, shared_model, shared_curiosity,
    counter, lock, pids, optimizer, train_policy_losses,
    train_value_losses, train_rewards
):
    pids.append(os.getpid())

    torch.manual_seed(args.seed + rank)

    if args.game == 'doom':
        env = create_doom_env(
            args.env_name, rank,
            num_skip=args.num_skip, num_stack=args.num_stack)
    elif args.game == 'atari':
        env = create_atari_env(args.env_name)
    elif args.game == 'picolmaze':
        env = create_picolmaze_env(args.num_rooms)
    env.seed(args.seed + rank)

    model = ActorCritic(
        # env.observation_space.shape[0],
        args.num_stack,
        env.action_space)
    curiosity = IntrinsicCuriosityModule(  # ICM
        # env.observation_space.shape[0],
        args.num_stack,
        env.action_space)

    if optimizer is None:
        # optimizer = optim.Adam(shared_model.parameters(), lr=args.lr)
        optimizer = optim.Adam(  # ICM
            chain(shared_model.parameters(), shared_curiosity.parameters()),
            lr=args.lr)

    model.train()
    curiosity.train()  # ICM

    state = env.reset()
    state = torch.from_numpy(state)
    done = True

    episode_length = 0

    killer = Killer()
    while not killer.kill_now:
        # Sync with the shared model
        model.load_state_dict(shared_model.state_dict())
        curiosity.load_state_dict(shared_curiosity.state_dict())  # ICM

        if done:
            cx = torch.zeros(1, 256)
            hx = torch.zeros(1, 256)
        else:
            cx = cx.detach()
            hx = hx.detach()

        values = []
        log_probs = []
        rewards = []
        entropies = []

        inv_loss = torch.tensor(0.0)   # ICM
        forw_loss = torch.tensor(0.0)  # ICM

        for step in range(args.num_steps):
            if done:
                episode_length = 0
                state = env.reset()
                state = torch.from_numpy(state)
            episode_length += 1

            value, logit, (hx, cx) = model(state.unsqueeze(0),
                                           hx, cx)
            prob = F.softmax(logit, dim=-1)
            log_prob = F.log_softmax(logit, dim=-1)
            entropy = -(log_prob * prob).sum(1, keepdim=True)

            # Entropy trick
            if 'sparse' in args.env_name.lower():
                max_entropy = torch.log(
                    torch.tensor(logit.size()[1], dtype=torch.float))
                entropy = entropy \
                    if entropy <= args.max_entropy_coef * max_entropy \
                    else torch.tensor(0.0)

            entropies.append(entropy)

            action = prob.multinomial(num_samples=1).flatten().detach()
            log_prob = log_prob.gather(1, action.view(1, 1))

            state_old = state  # ICM

            state, external_reward, done, _ = env.step(action)
            state = torch.from_numpy(state)

            # external reward = 0 in ICM-only mode
            external_reward = external_reward * (1 - args.icm_only)

            # <---ICM---
            inv_out, forw_out, curiosity_reward = \
                curiosity(
                    state_old.unsqueeze(0), action,
                    state.unsqueeze(0))
            # In noreward-rl:
            # self.invloss = tf.reduce_mean(
            #     tf.nn.sparse_softmax_cross_entropy_with_logits(logits, aindex),
            #     name="invloss")
            # self.forwardloss = 0.5 * tf.reduce_mean(tf.square(tf.subtract(f, phi2)), name='forwardloss')
            # self.forwardloss = self.forwardloss * 288.0 # lenFeatures=288. Factored out to make hyperparams not depend on it.
            current_inv_loss = F.nll_loss(F.log_softmax(inv_out, dim=-1), action)
            current_forw_loss = curiosity_reward
            inv_loss += current_inv_loss
            forw_loss += current_forw_loss

            curiosity_reward = args.eta * curiosity_reward

            reward = max(min(external_reward, args.clip), -args.clip) + \
                max(min(curiosity_reward.detach(), args.clip), -args.clip)
            # ---ICM--->

            done = done or episode_length >= args.max_episode_length

            with lock:
                counter.value += 1

            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward)

            if done:
                break

        train_rewards[rank - 1] = sum(rewards)

        # <---ICM---
        inv_loss = inv_loss / episode_length
        forw_loss = forw_loss * (32 * 3 * 3) * 0.5 / episode_length

        curiosity_loss = args.lambda_1 * (
            (1 - args.beta) * inv_loss + args.beta * forw_loss)
        # ---ICM--->

        R = torch.zeros(1, 1)
        if not done:
            value, _, _ = model(state.unsqueeze(0), hx, cx)
            R = value.detach()

        values.append(R)
        policy_loss = 0
        value_loss = 0
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = rewards[i] + args.gamma * \
                values[i + 1] - values[i]
            gae = gae * args.gamma * args.gae_lambda + delta_t

            policy_loss = policy_loss - \
                log_probs[i] * gae.detach() - args.entropy_coef * entropies[i]

        optimizer.zero_grad()

        train_policy_losses[rank - 1] = float((policy_loss).detach().item())
        train_value_losses[rank - 1] = float((value_loss).detach().item())

        (policy_loss + args.value_loss_coef * value_loss +
            curiosity_loss).backward()  # ICM
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
        torch.nn.utils.clip_grad_norm_(curiosity.parameters(), args.max_grad_norm)

        ensure_shared_grads(model, shared_model)
        ensure_shared_grads(curiosity, shared_curiosity)
        optimizer.step()

    env.close()
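# Hypothetical launcher sketch (not part of the original source): one way the
# train() worker above could be spawned with torch.multiprocessing. The shared
# counter, lock, PID list and per-worker loss/reward slots mirror the arguments
# train() expects; names such as args.num_processes are assumptions.
import torch.multiprocessing as mp


def launch_training(args, shared_model, shared_curiosity, optimizer=None):
    # optimizer=None lets each worker build its own Adam over the shared
    # parameters, as handled inside train() above.
    counter = mp.Value('i', 0)   # global step counter incremented by every worker
    lock = mp.Lock()             # guards the counter updates
    manager = mp.Manager()
    pids = manager.list()        # each worker appends its own PID here
    train_policy_losses = manager.list([0.0] * args.num_processes)
    train_value_losses = manager.list([0.0] * args.num_processes)
    train_rewards = manager.list([0.0] * args.num_processes)

    processes = []
    # train() indexes the shared lists with rank - 1, so ranks start at 1
    for rank in range(1, args.num_processes + 1):
        p = mp.Process(
            target=train,
            args=(rank, args, shared_model, shared_curiosity, counter, lock,
                  pids, optimizer, train_policy_losses, train_value_losses,
                  train_rewards))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()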
def train_curiosity(rank, args, shared_model, shared_curiosity, counter, lock,
                    pids, optimizer):
    pids.append(os.getpid())

    torch.manual_seed(args.seed + rank)

    if args.game == 'doom':
        env = create_doom_env(args.env_name,
                              rank,
                              num_skip=args.num_skip,
                              num_stack=args.num_stack)
    elif args.game == 'atari':
        env = create_atari_env(args.env_name)
    elif args.game == 'picolmaze':
        env = create_picolmaze_env(args.num_rooms)
    env.seed(args.seed + rank)

    model = ActorCritic(
        # env.observation_space.shape[0],
        args.num_stack,
        env.action_space)
    curiosity = IntrinsicCuriosityModule(  # ICM
        # env.observation_space.shape[0],
        args.num_stack,
        env.action_space)

    if optimizer is None:
        # optimizer = optim.Adam(shared_model.parameters(), lr=args.lr)
        optimizer = optim.Adam(  # ICM
            chain(shared_model.parameters(), shared_curiosity.parameters()),
            lr=args.lr)

    model.train()
    curiosity.train()  # ICM

    model.load_state_dict(shared_model.state_dict())

    state = env.reset()
    state = torch.from_numpy(state)
    done = True

    episode_length = 0

    killer = Killer()
    while not killer.kill_now:
        # Sync with the shared model
        curiosity.load_state_dict(shared_curiosity.state_dict())  # ICM

        if done:
            cx = torch.zeros(1, 256)
            hx = torch.zeros(1, 256)
        else:
            cx = cx.detach()
            hx = hx.detach()

        inv_loss = torch.tensor(0.0)  # ICM
        forw_loss = torch.tensor(0.0)  # ICM

        for step in range(args.num_steps):
            if done:
                episode_length = 0
                state = env.reset()
                state = torch.from_numpy(state)
            episode_length += 1

            value, logit, (hx, cx) = model(state.unsqueeze(0), hx, cx)
            prob = F.softmax(logit, dim=-1)

            action = prob.multinomial(num_samples=1).flatten().detach()

            state_old = state  # ICM

            state, external_reward, done, _ = env.step(action)
            state = torch.from_numpy(state)

            # external reward = 0 in ICM-only mode
            external_reward = external_reward * (1 - args.icm_only)

            # <---ICM---
            inv_out, forw_out, curiosity_reward = \
                curiosity(
                    state_old.unsqueeze(0), action,
                    state.unsqueeze(0))
            # In noreward-rl:
            # self.invloss = tf.reduce_mean(
            #     tf.nn.sparse_softmax_cross_entropy_with_logits(logits, aindex),
            #     name="invloss")
            # self.forwardloss = 0.5 * tf.reduce_mean(tf.square(tf.subtract(f, phi2)), name='forwardloss')
            # self.forwardloss = self.forwardloss * 288.0 # lenFeatures=288. Factored out to make hyperparams not depend on it.
            current_inv_loss = F.nll_loss(F.log_softmax(inv_out, dim=-1),
                                          action)
            current_forw_loss = curiosity_reward
            inv_loss += current_inv_loss
            forw_loss += current_forw_loss
            # ---ICM--->

            done = done or episode_length >= args.max_episode_length

            with lock:
                counter.value += 1

            if done:
                break

        # <---ICM---
        inv_loss = inv_loss / episode_length
        forw_loss = forw_loss * (32 * 3 * 3) * 0.5 / episode_length

        curiosity_loss = args.lambda_1 * (
            (1 - args.beta) * inv_loss + args.beta * forw_loss)
        # ---ICM--->

        optimizer.zero_grad()

        curiosity_loss.backward()  # ICM
        torch.nn.utils.clip_grad_norm_(curiosity.parameters(),
                                       args.max_grad_norm)

        ensure_shared_grads(curiosity, shared_curiosity)
        optimizer.step()

    env.close()
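For reference, the quantities accumulated in the two training loops above mirror the curiosity objective of Pathak et al. (2017). In that paper's notation (a rough mapping, not taken from this repository),

    r^i_t = \frac{\eta}{2} \lVert \hat{\phi}(s_{t+1}) - \phi(s_{t+1}) \rVert^2
    L_{\text{curiosity}} = \lambda_1 \left[ (1 - \beta)\, L_{\text{inv}} + \beta\, L_{\text{forw}} \right]

where L_inv is the cross-entropy of the inverse model's action prediction (the F.nll_loss term) and L_forw is the forward-model error. In the code, the module's forward error is scaled by args.eta and clipped to ±args.clip before being added to the external reward, while forw_loss is rescaled by 32 * 3 * 3 = 288 (the feature size, as in noreward-rl) times 0.5, and both loss terms are averaged over episode_length.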
Example #3
    if args.random_seed:
        random_seed = torch.randint(0, 1000, (1, ))
        logging.info(f"Seed: {int(random_seed)}")
        torch.manual_seed(random_seed)
    else:
        torch.manual_seed(args.seed)

    if args.game == 'doom':
        env = create_doom_env(args.env_name,
                              0,
                              num_skip=args.num_skip,
                              num_stack=args.num_stack)
    elif args.game == 'atari':
        env = create_atari_env(args.env_name)
    elif args.game == 'picolmaze':
        env = create_picolmaze_env(args.num_rooms)

    cx = torch.zeros(1, 256)
    hx = torch.zeros(1, 256)
    state = env.reset()
    state = torch.from_numpy(state)

    shared_model = ActorCritic(
        # env.observation_space.shape[0], env.action_space)
        args.num_stack,
        env.action_space)
    shared_model.share_memory()

    if not args.no_curiosity:
        # <---ICM---
        shared_curiosity = IntrinsicCuriosityModule(
            # env.observation_space.shape[0],
            args.num_stack,
            env.action_space)
        shared_curiosity.share_memory()
        # ---ICM--->
Example #4
    if args.game == 'doom':
        env = create_doom_env(args.env_name,
                              0,
                              num_skip=args.num_skip,
                              num_stack=args.num_stack)
        if args.record:
            env.set_recordings_dir(recordings_dir)
            logging.info("Set recordings dir")
        env.seed(args.seed + 0)
    elif args.game == 'atari':
        env_to_wrap = create_atari_env(args.env_name)
        env_to_wrap.seed(args.seed + 0)
        env = env_to_wrap
    elif args.game == 'picolmaze':
        env_to_wrap = create_picolmaze_env(args.num_rooms)
        env_to_wrap.seed(args.seed + 0)
        env = env_to_wrap

    env.step(0)

    model = ActorCritic(
        # env.observation_space.shape[0],
        args.num_stack,
        env.action_space)

    model.eval()

    external_reward_sum = 0
    done = True
Example #5
def test(rank, args, shared_model, shared_curiosity, counter, pids, optimizer,
         train_policy_losses, train_value_losses, train_rewards):
    models_dir = os.path.join(args.sum_base_dir, 'models')
    if not os.path.exists(models_dir):
        logging.info("Created models dir")
        os.makedirs(models_dir)

    recordings_dir = os.path.join(args.sum_base_dir, 'recordings')
    if (not os.path.exists(recordings_dir)) and (args.game == 'doom'):
        logging.info("Created recordings dir")
        os.makedirs(recordings_dir)

    videos_dir = os.path.join(args.sum_base_dir, 'videos')
    if (not os.path.exists(videos_dir)) and (args.game == 'atari'):
        logging.info("Created videos dir")
        os.makedirs(videos_dir)

    torch.manual_seed(args.seed + rank)

    if args.game == 'doom':
        env = create_doom_env(args.env_name,
                              rank,
                              num_skip=args.num_skip,
                              num_stack=args.num_stack)
        env.set_recordings_dir(recordings_dir)
        logging.info("Set recordings dir")
        env.seed(args.seed + rank)
    elif args.game == 'atari':
        env_to_wrap = create_atari_env(args.env_name)
        env_to_wrap.seed(args.seed + rank)
        env = env_to_wrap
    elif args.game == 'picolmaze':
        env_to_wrap = create_picolmaze_env(args.num_rooms)
        env_to_wrap.seed(args.seed + rank)
        env = env_to_wrap

    env.step(0)

    model = ActorCritic(
        # env.observation_space.shape[0],
        args.num_stack,
        env.action_space)
    curiosity = IntrinsicCuriosityModule(  # ICM
        # env.observation_space.shape[0],
        args.num_stack,
        env.action_space)

    model.eval()
    curiosity.eval()  # ICM

    external_reward_sum = 0
    curiosity_reward_sum = 0  # ICM
    curiosity_reward_sum_clipped = 0  # ICM
    inv_loss = torch.tensor(0.0)  # ICM
    forw_loss = torch.tensor(0.0)  # ICM
    curiosity_loss = 0  # ICM
    done = True

    count_done = 0

    start_time = time.time()

    passed_time = 0
    current_counter = 0

    # a quick hack to prevent the agent from getting stuck
    # actions = deque(maxlen=100)
    actions = deque(maxlen=args.max_episode_length_test)
    episode_length = 0
    while True:
        episode_length += 1

        if done:
            passed_time = time.time() - start_time
            current_counter = counter.value

            # Sync with the shared model
            model.load_state_dict(shared_model.state_dict())
            curiosity.load_state_dict(shared_curiosity.state_dict())  # ICM
            cx = torch.zeros(1, 256)
            hx = torch.zeros(1, 256)

            if count_done % args.save_video_again_eps == 0:
                if args.game == 'atari':
                    video_dir = os.path.join(
                        videos_dir,
                        'video_' + time.strftime('%Y.%m.%d-%H.%M.%S_') +
                        str(current_counter))
                    if not os.path.exists(video_dir):
                        os.makedirs(video_dir)
                    logging.info("Created new video dir")
                    env = wrappers.Monitor(env_to_wrap, video_dir, force=False)
                    logging.info("Created new wrapper")
                elif args.game == 'doom':
                    env.set_current_counter(current_counter)
                    env.set_record()
                    logging.info("Set new recording")

            state = env.reset()
            state = torch.from_numpy(state)
        else:
            cx = cx.detach()
            hx = hx.detach()

        with torch.no_grad():
            value, logit, (hx, cx) = model(state.unsqueeze(0), hx, cx)
        prob = F.softmax(logit, dim=-1)
        action = prob.max(1, keepdim=True)[1].flatten().detach()

        state_old = state  # ICM

        state, external_reward, done, _ = env.step(action)
        state = torch.from_numpy(state)

        # external reward = 0 in ICM-only mode
        # external_reward = external_reward * (1 - args.icm_only)
        external_reward_sum += external_reward

        # <---ICM---
        inv_out, forw_out, curiosity_reward = \
            curiosity(
                state_old.unsqueeze(0), action,
                state.unsqueeze(0))
        # In noreward-rl:
        # self.invloss = tf.reduce_mean(
        #     tf.nn.sparse_softmax_cross_entropy_with_logits(logits, aindex),
        #     name="invloss")
        # self.forwardloss = 0.5 * tf.reduce_mean(tf.square(tf.subtract(f, phi2)), name='forwardloss')
        # self.forwardloss = self.forwardloss * 288.0 # lenFeatures=288. Factored out to make hyperparams not depend on it.
        current_inv_loss = F.nll_loss(F.log_softmax(inv_out, dim=-1), action)
        current_forw_loss = curiosity_reward
        inv_loss += current_inv_loss
        forw_loss += current_forw_loss

        curiosity_reward = args.eta * curiosity_reward
        curiosity_reward_sum += curiosity_reward.detach()
        curiosity_reward_sum_clipped += \
            max(min(curiosity_reward.detach(), args.clip), -args.clip)
        # ---ICM--->

        done = done or episode_length >= args.max_episode_length

        # a quick hack to prevent the agent from getting stuck
        actions.append(action)
        if actions.count(actions[0]) == actions.maxlen:
            done = True

        if done:
            # <---ICM---
            inv_loss = inv_loss / episode_length
            forw_loss = forw_loss * (32 * 3 * 3) * 0.5 / episode_length

            curiosity_loss = args.lambda_1 * (
                (1 - args.beta) * inv_loss + args.beta * forw_loss)
            # ---ICM--->

            train_policy_loss_mean = sum(train_policy_losses) / \
                len(train_policy_losses)
            train_value_loss_mean = sum(train_value_losses) / \
                len(train_value_losses)
            train_rewards_mean = sum(train_rewards) / \
                len(train_rewards)
            logging.info(
                "\n\nEp {:3d}: time {}, num steps {}, FPS {:.0f}, len {},\n"
                "        total R {:.6f}, train policy loss {:.6f}, train value loss {:.6f},\n"
                "        train mean R {:.6f}, curiosity R {:.3f}, curiosity R clipped {:.3f},\n"
                "        inv loss {:.3f}, forw loss {:.3f}, curiosity loss {:.3f}.\n"
                "".format(
                    count_done,
                    time.strftime("%Hh %Mm %Ss",
                                  time.gmtime(passed_time)), current_counter,
                    current_counter / passed_time, episode_length,
                    external_reward_sum, train_policy_loss_mean,
                    train_value_loss_mean, train_rewards_mean,
                    curiosity_reward_sum, curiosity_reward_sum_clipped,
                    inv_loss, forw_loss, curiosity_loss))

            if ((count_done % args.save_model_again_eps == 0)
                    and (optimizer is not None)):
                torch.save(
                    model.state_dict(), models_dir + '/model_' +
                    time.strftime('%Y.%m.%d-%H.%M.%S') +
                    f'_{current_counter}.pth')
                torch.save(
                    curiosity.state_dict(), models_dir + '/curiosity_' +
                    time.strftime('%Y.%m.%d-%H.%M.%S') +
                    f'_{current_counter}.pth')
                torch.save(
                    optimizer.state_dict(), models_dir + '/optimizer_' +
                    time.strftime('%Y.%m.%d-%H.%M.%S') +
                    f'_{current_counter}.pth')
                logging.info("Saved the model")

            tb.log_value('steps_second', current_counter / passed_time,
                         current_counter)
            tb.log_value('reward', external_reward_sum, current_counter)
            tb.log_value('reward_icm', curiosity_reward_sum, current_counter)
            tb.log_value('reward_icm_clipped', curiosity_reward_sum_clipped,
                         current_counter)
            tb.log_value('loss_inv', inv_loss, current_counter)
            tb.log_value('loss_forw', forw_loss, current_counter)
            tb.log_value('loss_curiosity', curiosity_loss, current_counter)
            tb.log_value('loss_train_policy_mean', train_policy_loss_mean,
                         current_counter)
            tb.log_value('loss_train_value_mean', train_value_loss_mean,
                         current_counter)
            tb.log_value('reward_train_mean', train_rewards_mean,
                         current_counter)

            if args.game == 'atari':
                env.close()  # Close the window after the rendering session
                env_to_wrap.close()
            logging.info("Episode done, close all")

            episode_length = 0
            external_reward_sum = 0
            curiosity_reward_sum = 0  # ICM
            curiosity_reward_sum_clipped = 0  # ICM
            inv_loss = torch.tensor(0.0)  # ICM
            forw_loss = torch.tensor(0.0)  # ICM
            curiosity_loss = 0  # ICM
            actions.clear()

            if count_done >= args.max_episodes:
                for pid in pids:
                    os.kill(pid, signal.SIGTERM)
                env.close()
                os.kill(os.getpid(), signal.SIGKILL)

            count_done += 1
            time.sleep(args.time_sleep)
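The test loop above periodically saves the model, curiosity and optimizer state dicts under timestamped names. A minimal restore sketch, assuming such a checkpoint exists (the file names below are only examples of the naming scheme):

# Hypothetical restore sketch (not in the original source).
import torch

model = ActorCritic(args.num_stack, env.action_space)
curiosity = IntrinsicCuriosityModule(args.num_stack, env.action_space)

model.load_state_dict(torch.load(
    'models/model_2020.01.01-00.00.00_1000000.pth', map_location='cpu'))
curiosity.load_state_dict(torch.load(
    'models/curiosity_2020.01.01-00.00.00_1000000.pth', map_location='cpu'))

model.eval()
curiosity.eval()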
Example #6
    setup_logs(args)

    if args.random_seed:
        random_seed = torch.randint(0, 1000, (1, ))
        logging.info(f"Seed: {int(random_seed)}")
        torch.manual_seed(random_seed)
    else:
        torch.manual_seed(args.seed)

    if args.env_folder is not None:
        env_file = os.path.join(args.env_folder, 'env.pkl')
        if os.path.exists(env_file):
            logging.info("Load environment from curiosity folder")
            env = pkl.load(open(env_file, 'rb'))
    else:
        env = create_picolmaze_env(args.num_rooms, args.colors, args.periodic)
    env.save(os.path.join(args.sum_base_dir, 'env.pkl'))

    cx = torch.zeros(1, 256)
    hx = torch.zeros(1, 256)
    state = env.reset()
    state = torch.from_numpy(state)

    # <---ICM---
    shared_curiosity = IntrinsicCuriosityModule2(args.num_stack,
                                                 env.action_space,
                                                 args.epsilon)
    shared_curiosity.share_memory()
    # ---ICM--->

    if args.no_shared:
def test_no_curiosity(rank, args, shared_model, counter, pids, optimizer,
                      train_policy_losses, train_value_losses, train_rewards):
    models_dir = os.path.join(args.sum_base_dir, 'models')
    if not os.path.exists(models_dir):
        logging.info("Created models dir")
        os.makedirs(models_dir)

    recordings_dir = os.path.join(args.sum_base_dir, 'recordings')
    if (not os.path.exists(recordings_dir)) and (args.game == 'doom'):
        logging.info("Created recordings dir")
        os.makedirs(recordings_dir)

    videos_dir = os.path.join(args.sum_base_dir, 'videos')
    if (not os.path.exists(videos_dir)) and (args.game == 'atari'):
        logging.info("Created videos dir")
        os.makedirs(videos_dir)

    torch.manual_seed(args.seed + rank)

    if args.game == 'doom':
        env = create_doom_env(args.env_name,
                              rank,
                              num_skip=args.num_skip,
                              num_stack=args.num_stack)
        env.set_recordings_dir(recordings_dir)
        logging.info("Set recordings dir")
        env.seed(args.seed + rank)
    elif args.game == 'atari':
        env_to_wrap = create_atari_env(args.env_name)
        env_to_wrap.seed(args.seed + rank)
        env = env_to_wrap
    elif args.game == 'picolmaze':
        env_to_wrap = create_picolmaze_env(args.num_rooms)
        env_to_wrap.seed(args.seed + rank)
        env = env_to_wrap

    env.step(0)

    model = ActorCritic(
        # env.observation_space.shape[0],
        args.num_stack,
        env.action_space)

    model.eval()

    external_reward_sum = 0
    done = True

    count_done = 0

    start_time = time.time()

    passed_time = 0
    current_counter = 0

    # a quick hack to prevent the agent from getting stuck
    # actions = deque(maxlen=100)
    actions = deque(maxlen=args.max_episode_length_test)
    episode_length = 0
    while True:
        episode_length += 1

        if done:
            passed_time = time.time() - start_time
            current_counter = counter.value

            # Sync with the shared model
            model.load_state_dict(shared_model.state_dict())
            cx = torch.zeros(1, 256)
            hx = torch.zeros(1, 256)

            if count_done % args.save_video_again_eps == 0:
                if args.game == 'atari':
                    video_dir = os.path.join(
                        videos_dir,
                        'video_' + time.strftime('%Y.%m.%d-%H.%M.%S_') +
                        str(current_counter))
                    if not os.path.exists(video_dir):
                        os.makedirs(video_dir)
                    logging.info("Created new video dir")
                    env = wrappers.Monitor(env_to_wrap, video_dir, force=False)
                    logging.info("Created new wrapper")
                elif args.game == 'doom':
                    env.set_current_counter(current_counter)
                    env.set_record()
                    logging.info("Set new recording")

            state = env.reset()
            state = torch.from_numpy(state)
        else:
            cx = cx.detach()
            hx = hx.detach()

        with torch.no_grad():
            value, logit, (hx, cx) = model(state.unsqueeze(0), hx, cx)
        prob = F.softmax(logit, dim=-1)
        action = prob.max(1, keepdim=True)[1].flatten().detach()

        state, external_reward, done, _ = env.step(action)
        state = torch.from_numpy(state)

        # external reward = 0 in ICM-only mode
        external_reward = external_reward * (1 - args.icm_only)
        external_reward_sum += external_reward

        done = done or episode_length >= args.max_episode_length

        # a quick hack to prevent the agent from getting stuck
        actions.append(action)
        if actions.count(actions[0]) == actions.maxlen:
            done = True

        if done:
            train_policy_loss_mean = sum(train_policy_losses) / \
                len(train_policy_losses)
            train_value_loss_mean = sum(train_value_losses) / \
                len(train_value_losses)
            train_rewards_mean = sum(train_rewards) / \
                len(train_rewards)
            logging.info(
                "\n\nEp {:3d}: time {}, num steps {}, FPS {:.0f}, len {},\n"
                "        total R {:.6f}, train policy loss {:.6f}, train value loss {:.6f},\n"
                "        train rewards {:.6f}.\n"
                "".format(
                    count_done,
                    time.strftime("%Hh %Mm %Ss",
                                  time.gmtime(passed_time)), current_counter,
                    current_counter / passed_time, episode_length,
                    external_reward_sum, train_policy_loss_mean,
                    train_value_loss_mean, train_rewards_mean))

            if ((count_done % args.save_model_again_eps == 0)
                    and (optimizer is not None)):
                torch.save(
                    model.state_dict(), models_dir + '/model_' +
                    time.strftime('%Y.%m.%d-%H.%M.%S') +
                    f'_{current_counter}.pth')
                torch.save(
                    optimizer.state_dict(), models_dir + '/optimizer_' +
                    time.strftime('%Y.%m.%d-%H.%M.%S') +
                    f'_{current_counter}.pth')
                logging.info("Saved the model")

            tb.log_value('steps_second', current_counter / passed_time,
                         current_counter)
            tb.log_value('reward', external_reward_sum, current_counter)
            tb.log_value('loss_train_policy_mean', train_policy_loss_mean,
                         current_counter)
            tb.log_value('loss_train_value_mean', train_value_loss_mean,
                         current_counter)
            tb.log_value('reward_train_mean', train_rewards_mean,
                         current_counter)

            if args.game == 'atari':
                env.close()  # Close the window after the rendering session
                env_to_wrap.close()
            logging.info("Episode done, close all")

            episode_length = 0
            external_reward_sum = 0
            actions.clear()

            if count_done >= args.max_episodes:
                for pid in pids:
                    os.kill(pid, signal.SIGTERM)
                env.close()
                os.kill(os.getpid(), signal.SIGKILL)

            count_done += 1
            time.sleep(args.time_sleep)
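Both test loops end an episode early when the last maxlen actions are identical (the "stuck agent" hack). A standalone illustration of that check, with a small hypothetical maxlen:

# Illustration only (not in the original source); the test loops use
# deque(maxlen=args.max_episode_length_test).
from collections import deque

actions = deque(maxlen=5)
for a in [3, 3, 3, 3, 3]:
    actions.append(a)
    # count(actions[0]) can only equal maxlen once the deque is full,
    # i.e. the same action has been taken maxlen times in a row.
    if actions.count(actions[0]) == actions.maxlen:
        print("agent looks stuck -> done = True")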
Example #8
def train_no_curiosity(rank, args, shared_model, counter, lock, pids,
                       optimizer, train_policy_losses, train_value_losses,
                       train_rewards):
    pids.append(os.getpid())

    torch.manual_seed(args.seed + rank)

    if args.game == 'doom':
        env = create_doom_env(args.env_name,
                              rank,
                              num_skip=args.num_skip,
                              num_stack=args.num_stack)
    elif args.game == 'atari':
        env = create_atari_env(args.env_name)
    elif args.game == 'picolmaze':
        env = create_picolmaze_env(args.num_rooms)
    env.seed(args.seed + rank)

    model = ActorCritic(
        # env.observation_space.shape[0],
        args.num_stack,
        env.action_space)

    if optimizer is None:
        optimizer = optim.Adam(shared_model.parameters(), lr=args.lr)

    model.train()

    state = env.reset()
    state = torch.from_numpy(state)
    done = True

    episode_length = 0

    killer = Killer()
    while not killer.kill_now:
        # Sync with the shared model
        model.load_state_dict(shared_model.state_dict())

        if done:
            cx = torch.zeros(1, 256)
            hx = torch.zeros(1, 256)
        else:
            cx = cx.detach()
            hx = hx.detach()

        values = []
        log_probs = []
        rewards = []
        entropies = []

        for step in range(args.num_steps):
            if done:
                episode_length = 0
                state = env.reset()
                state = torch.from_numpy(state)
            episode_length += 1

            value, logit, (hx, cx) = model(state.unsqueeze(0), hx, cx)
            prob = F.softmax(logit, dim=-1)
            log_prob = F.log_softmax(logit, dim=-1)
            entropy = -(log_prob * prob).sum(1, keepdim=True)

            # Entropy trick
            if 'sparse' in args.env_name.lower():
                max_entropy = torch.log(
                    torch.tensor(logit.size()[1], dtype=torch.float))
                entropy = entropy \
                    if entropy <= args.max_entropy_coef * max_entropy \
                    else torch.tensor(0.0)

            entropies.append(entropy)

            action = prob.multinomial(num_samples=1).flatten().detach()
            log_prob = log_prob.gather(1, action.view(1, 1))

            state, reward, done, _ = env.step(action)
            state = torch.from_numpy(state)

            done = done or episode_length >= args.max_episode_length
            reward = max(min(reward, 1), -1)

            with lock:
                counter.value += 1

            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward)

            if done:
                break

        train_rewards[rank - 1] = sum(rewards)

        R = torch.zeros(1, 1)
        if not done:
            value, _, _ = model(state.unsqueeze(0), hx, cx)
            R = value.detach()

        values.append(R)
        policy_loss = 0
        value_loss = 0
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = rewards[i] + args.gamma * \
                values[i + 1] - values[i]
            gae = gae * args.gamma * args.gae_lambda + delta_t

            policy_loss = policy_loss - \
                log_probs[i] * gae.detach() - args.entropy_coef * entropies[i]

        optimizer.zero_grad()

        train_policy_losses[rank - 1] = float((policy_loss).detach().item())
        train_value_losses[rank - 1] = float((value_loss).detach().item())

        (policy_loss + args.value_loss_coef * value_loss).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

        ensure_shared_grads(model, shared_model)
        optimizer.step()

    env.close()
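The backward pass over each rollout in train() and train_no_curiosity() is n-step advantage actor-critic with Generalized Advantage Estimation (Schulman et al., 2015). In that notation the loop computes

    \delta_t = r_t + \gamma V(s_{t+1}) - V(s_t)
    \hat{A}_t = \sum_{l \ge 0} (\gamma \lambda)^l \, \delta_{t+l}
    R_t = r_t + \gamma R_{t+1}, \quad R_T = V(s_T) \text{ if the rollout was truncated, else } 0

with \lambda = args.gae_lambda (not to be confused with the ICM weight lambda_1), and then

    \text{policy\_loss} = -\sum_t \left[ \log \pi(a_t \mid s_t) \, \hat{A}_t + c_{\text{ent}} \, H(\pi(\cdot \mid s_t)) \right]
    \text{value\_loss} = \tfrac{1}{2} \sum_t \left( R_t - V(s_t) \right)^2

with c_ent = args.entropy_coef and the value term weighted by args.value_loss_coef in the total gradient.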