Example #1
        state = next_state
        episode_reward += reward

        # Train agent after collecting sufficient data
        if t >= int(args.start_timesteps):
            policy.train(_replay_buffer, args.batch_size)

        if done:
            print(
                f"Total T: {t+1} Episode Num: {episode_num+1} Episode T: {episode_timesteps} Reward: {episode_reward:.3f}"
            )
            logger.store(EpRet=episode_reward, EpLen=episode_timesteps)
            # Reset environment
            state, done = env.reset(), False
            episode_reward = 0
            episode_timesteps = 0
            episode_num += 1

        if (t + 1) % args.eval_freq == 0:
            test_agent(policy, eval_env, args.seed, logger)
            if args.save_model:
                policy.save(f"./models/{file_name}")
            logger.log_tabular("EpRet", with_min_and_max=True)
            logger.log_tabular("TestEpRet", with_min_and_max=True)
            logger.log_tabular("EpLen", average_only=True)
            logger.log_tabular("TestEpLen", average_only=True)
            logger.log_tabular("TotalEnvInteracts", t + 1)
            logger.log_tabular("Time", time.time() - start_time)
            logger.dump_tabular()
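The loop above relies on a test_agent helper and a logger that are defined elsewhere in the script and are not part of the excerpt. Below is a minimal sketch of what such an evaluation helper could look like, assuming the old Gym reset/step API used above and a deterministic policy.select_action(state); the episode count and the seed offset are illustrative assumptions, not the project's actual values.

# Sketch of a test_agent helper matching the call above (an assumption, not
# the project's actual implementation). It runs a few deterministic episodes
# on a separate evaluation env and stores their returns/lengths under the
# keys the training loop later dumps (TestEpRet / TestEpLen).
def test_agent(policy, eval_env, seed, logger, eval_episodes=10):
    eval_env.seed(seed + 100)  # decorrelate evaluation from training episodes
    for _ in range(eval_episodes):
        state, done = eval_env.reset(), False
        ep_ret, ep_len = 0.0, 0
        while not done:
            action = policy.select_action(state)  # deterministic action at eval time
            state, reward, done, _ = eval_env.step(action)
            ep_ret += reward
            ep_len += 1
        logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)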
Example #2
                    print(
                        f"Warning: trajectory cut off by local epoch at {episode_timesteps} steps.",
                        flush=True)
                if timeout_done or epoch_done:
                    _, _, v = policy.select_action(state)
                else:
                    v = 0
                _replay_buffer.finish_path(v)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=episode_reward, EpLen=episode_timesteps)
                # Reset environment
                state, done = env.reset(), False
                episode_reward = 0
                episode_timesteps = 0

        # perform VPG update
        policy.train(_replay_buffer)

        test_agent(policy, eval_env, args.seed, logger)
        if args.save_model:
            policy.save(f"./models/{file_name}")
        logger.log_tabular("EpRet", with_min_and_max=True)
        logger.log_tabular("TestEpRet", with_min_and_max=True)
        logger.log_tabular("EpLen", average_only=True)
        logger.log_tabular("TestEpLen", average_only=True)
        logger.log_tabular("TotalEnvInteracts",
                           (epoch + 1) * args.steps_per_epoch)
        logger.log_tabular("Time", time.time() - start_time)
        logger.dump_tabular()
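When a trajectory ends in this example, the loop calls _replay_buffer.finish_path(v), bootstrapping with the critic's value estimate if the path was cut off (timeout or epoch boundary) and with 0 on a true terminal state. The buffer itself is not shown in the excerpt; the sketch below illustrates what finish_path typically computes in an on-policy buffer of this kind, in the style of Spinning Up's VPG buffer. The field names, GAE parameters, and use of scipy are assumptions rather than the project's actual code.

import numpy as np
import scipy.signal

def discount_cumsum(x, discount):
    # cumulative discounted sums, e.g. [x0 + d*x1 + d^2*x2, x1 + d*x2, x2]
    return scipy.signal.lfilter([1], [1, float(-discount)], x[::-1], axis=0)[::-1]

class OnPolicyBuffer:
    # Minimal sketch of the part of the buffer that finish_path touches.
    # Field names and GAE(lambda) coefficients are assumptions based on a
    # Spinning-Up-style buffer, not the project's exact implementation.
    def __init__(self, size, gamma=0.99, lam=0.95):
        self.rew_buf = np.zeros(size, dtype=np.float32)
        self.val_buf = np.zeros(size, dtype=np.float32)
        self.adv_buf = np.zeros(size, dtype=np.float32)
        self.ret_buf = np.zeros(size, dtype=np.float32)
        self.gamma, self.lam = gamma, lam
        self.ptr, self.path_start_idx = 0, 0

    def finish_path(self, last_val=0):
        # Called at the end of a trajectory (or when one is cut off).
        # last_val is 0 for a true terminal state and V(s_T) for a cutoff,
        # which is exactly how Example #2 chooses v above.
        path_slice = slice(self.path_start_idx, self.ptr)
        rews = np.append(self.rew_buf[path_slice], last_val)
        vals = np.append(self.val_buf[path_slice], last_val)
        # GAE-Lambda advantage estimates
        deltas = rews[:-1] + self.gamma * vals[1:] - vals[:-1]
        self.adv_buf[path_slice] = discount_cumsum(deltas, self.gamma * self.lam)
        # rewards-to-go, used as targets for the value function
        self.ret_buf[path_slice] = discount_cumsum(rews, self.gamma)[:-1]
        self.path_start_idx = self.ptr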
Example #3
File: main.py Project: LQNew/LWDRLD
            step += 1

        eps_schedule.update(step)  # anneal epsilon for the exploration strategy
        states = next_states

        # Update the policy once the replay buffer has warmed up (start_timesteps,
        # e.g. 50K transitions) and only every update_freq steps (e.g. 4, for speed)
        if (step >= args.start_timesteps) and (step % args.update_freq == 0):
            policy.train(_replay_buffer, batch_size=args.batch_size)

        # print log and save model
        if t % args.eval_freq == 0:
            if args.save_model:
                policy.save(f"./models/{file_name}")
            # check time interval
            time_interval = round(time.time() - start_time, 2)
            # calculate the mean return over the most recent episodes
            mean_100_ep_return = round(
                np.mean([epinfo['r'] for epinfo in epinfobuf]), 2)
            print(
                f"Used Step: {step} | Epsilon: {round(eps_schedule.value, 3)} "
                f"| Mean ep 100 return: {mean_100_ep_return} "
                f"| Used Time: {time_interval}")
            # store the logger
            logger.log_tabular("MeanEpReward", mean_100_ep_return)
            logger.log_tabular("TotalEnvInteracts", step)
            logger.log_tabular("Time", time_interval)
            logger.dump_tabular()

    print("The training is done!")