Example #1
    def optimize_params(self, trial, n_prune_evals_per_trial: int = 2, n_tests_per_eval: int = 1):
        train_provider, test_provider = self.data_provider.split_data_train_test(self.train_split_percentage)
        train_provider, validation_provider = train_provider.split_data_train_test(self.train_split_percentage)

        del test_provider

        train_env = SubprocVecEnv([make_env(train_provider, i) for i in range(1)])
        validation_env = SubprocVecEnv([make_env(validation_provider, i) for i in range(1)])

        model_params = self.optimize_agent_params(trial)
        model = self.Model(self.Policy, train_env, verbose=self.model_verbose, nminibatches=1,
                           tensorboard_log=self.tensorboard_path, **model_params)

        last_reward = -np.finfo(np.float16).max
        n_steps_per_eval = int(len(train_provider.data_frame) / n_prune_evals_per_trial)

        for eval_idx in range(n_prune_evals_per_trial):
            model.learn(n_steps_per_eval)

            rewards = []
            n_episodes, reward_sum = 0, 0.0

            trades = train_env.get_attr('trades')

            if len(trades[0]) < 1:
                self.logger.info('Pruning trial for not making any trades: %s', eval_idx)
                raise optuna.structs.TrialPruned()

            state = None
            obs = validation_env.reset()
            while n_episodes < n_tests_per_eval:
                action, state = model.predict(obs, state=state)
                obs, reward, done, _ = validation_env.step([action])

                reward_sum += reward[0]

                if all(done):
                    rewards.append(reward_sum)
                    reward_sum = 0.0
                    n_episodes += 1
                    obs = validation_env.reset()

            last_reward = np.mean(rewards)
            trial.report(-1 * last_reward, eval_idx)

            if trial.should_prune(eval_idx):
                raise optuna.structs.TrialPruned()

        return -1 * last_reward
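Example #1 calls a make_env factory that is not shown. A minimal sketch of what such a helper typically looks like for SubprocVecEnv, assuming a hypothetical TradingEnv class built from the data provider:

from stable_baselines.common import set_global_seeds


def make_env(data_provider, rank, seed=0):
    """Return a thunk that builds one environment for worker `rank`."""
    def _init():
        env = TradingEnv(data_provider)  # hypothetical environment class
        env.seed(seed + rank)            # give each worker its own seed
        return env
    set_global_seeds(seed)
    return _init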
Example #2
def evaluate_model_on_set(
    set_path,
    model,
    config_path=None,
    config_kw=None,
    metrics=("success", "control_variation", "rise_time", "overshoot",
             "settling_time"),
    norm_data_path=None,
    num_envs=1,
    turbulence_intensity="none",
    use_pid=False,
    writer=None,
    timestep=None,
):
    """
    :param set_path: (str) path to test set file
    :param model: (PPO2 object or [PIDController]) the controller to be evaluated
    :param config_path: (str) path to gym environment configuration file
    :param config_kw: (dict) dictionary of key value pairs to override settings in the configuration file of the gym environment
    :param metrics: ([str]) list of metrics to be computed and recorded
    :param norm_data_path: (str) path to folder containing normalization statistics
    :param num_envs: (int) number of gym environments to run in parallel using multiprocessing
    :param turbulence_intensity: (str) the intensity setting of the wind turbulence
    :param use_pid: (bool) Whether the evaluated controller is a PID controller or not
    :param writer: (tensorboard writer) If supplied, evaluation results will be written to tensorboard log, if not, results are printed to standard output
    :param timestep: (int) What timestep results are written to when using tensorboard logging
    :return: (dict) the metrics computed for the evaluated controller on the test set
    """
    scenarios = list(np.load(set_path, allow_pickle=True))
    scenario_count = len(scenarios)

    if config_kw is None:
        config_kw = {}

    config_kw.update({
        "steps_max": 1500,
        "target": {
            "on_success": "done",
            "success_streak_fraction": 1,
            "success_streak_req": 100,
            "states": {
                0: {
                    "bound": 5
                },
                1: {
                    "bound": 5
                },
                2: {
                    "bound": 2
                }
            },
        },
    })

    if use_pid:
        config_kw["action"] = {"scale_space": False}

    sim_config_kw = {
        "turbulence": turbulence_intensity != "None",
        "turbulence_intensity": turbulence_intensity,
    }

    test_env = SubprocVecEnv([
        make_env(config_path,
                 i,
                 config_kw=config_kw,
                 sim_config_kw=sim_config_kw) for i in range(num_envs)
    ])
    if use_pid:
        dt = test_env.get_attr("simulator")[0].dt
        for pid in model:
            pid.dt = dt
        env_cfg = test_env.get_attr("cfg")[0]
        obs_states = [var["name"] for var in env_cfg["observation"]["states"]]
        try:
            phi_i, theta_i, Va_i = (
                obs_states.index("roll"),
                obs_states.index("pitch"),
                obs_states.index("Va"),
            )
            omega_i = [
                obs_states.index("omega_p"),
                obs_states.index("omega_q"),
                obs_states.index("omega_r"),
            ]
        except ValueError:
            print(
                "When using PID, roll, pitch, Va, omega_p, omega_q and omega_r must be part of the observation vector."
            )
            raise
    else:
        test_env = VecNormalize(test_env)
        if model.env is not None:
            test_env.obs_rms = model.env.obs_rms
            test_env.ret_rms = model.env.ret_rms
        else:
            assert norm_data_path is not None
            test_env.load_running_average(norm_data_path)
        test_env.training = False

    res = {metric: {} for metric in metrics}
    res["rewards"] = [[] for i in range(scenario_count)]
    active_envs = [i < scenario_count for i in range(num_envs)]
    env_scen_i = [i for i in range(num_envs)]
    test_done = False
    obs = np.array(
        [np.zeros(test_env.observation_space.shape) for i in range(num_envs)])
    done = [True for i in range(num_envs)]
    info = None

    while not test_done:
        for i, env_done in enumerate(done):
            if env_done:
                if len(scenarios) > 0 or active_envs[i]:
                    if len(scenarios) > 0:
                        print("{}/{} scenarios left".format(
                            len(scenarios), scenario_count))
                        scenario = scenarios.pop(0)
                        env_scen_i[i] = (scenario_count - 1) - len(scenarios)
                        obs[i] = test_env.env_method("reset",
                                                     indices=i,
                                                     **scenario)[0]
                        if use_pid:
                            model[i].reset()
                            model[i].set_reference(
                                scenario["target"]["roll"],
                                scenario["target"]["pitch"],
                                scenario["target"]["Va"],
                            )
                    else:
                        active_envs[i] = False
                    if info is not None:
                        for metric in metrics:
                            if isinstance(info[i][metric], dict):
                                for state, value in info[i][metric].items():
                                    if state not in res[metric]:
                                        res[metric][state] = []
                                    res[metric][state].append(value)
                            else:
                                if "all" not in res[metric]:
                                    res[metric]["all"] = []
                                res[metric]["all"].append(info[i][metric])

        if len(scenarios) == 0:
            test_done = not any(active_envs)
        if use_pid:
            actions = []
            for i, pid in enumerate(model):
                roll, pitch, Va = obs[i, phi_i], obs[i, theta_i], obs[i, Va_i]
                omega = obs[i, omega_i]
                if info is not None and "target" in info[i]:
                    pid.set_reference(
                        phi=info[i]["target"]["roll"],
                        theta=info[i]["target"]["pitch"],
                        va=info[i]["target"]["Va"],
                    )
                actions.append(pid.get_action(roll, pitch, Va, omega))
            actions = np.array(actions)
        else:
            actions, _ = model.predict(obs, deterministic=True)
        obs, rew, done, info = test_env.step(actions)
        for i, env_rew in enumerate(rew):
            res["rewards"][env_scen_i[i]].append(env_rew)

    if writer is not None:
        summaries = []
        for metric, metric_v in res.items():
            if isinstance(metric_v, dict):
                for state, v in metric_v.items():
                    summaries.append(
                        tf.Summary.Value(
                            tag="test_set/{}_{}".format(metric, state),
                            simple_value=np.nanmean(v),
                        ))
        writer.add_summary(tf.Summary(value=summaries), timestep)
    else:
        print_results(res)

    return res
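A possible way to call evaluate_model_on_set, with placeholder paths and settings that are not taken from the original project:

from stable_baselines import PPO2

model = PPO2.load("models/attitude_ppo2.pkl")        # placeholder checkpoint path
results = evaluate_model_on_set(
    "test_sets/wind_none.npy",                       # placeholder test-set path
    model,
    config_path="gym_config.json",                   # placeholder config path
    norm_data_path="models/norm_stats",              # placeholder normalization folder
    num_envs=4,
    turbulence_intensity="moderate",                 # placeholder intensity value
)
print(sorted(results.keys()))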
Example #3
def main():

    args = get_configuration()
    args.state_dim = util.get_state_dim(args)

    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir, exist_ok=True)

    if args.graph_embedding:

        class MyPolicy(EmbeddingPolicy):
            def __init__(self,
                         sess,
                         ob_space,
                         ac_space,
                         n_env,
                         n_steps,
                         n_batch,
                         reuse=True,
                         **_kwargs):
                super().__init__(sess,
                                 ob_space,
                                 ac_space,
                                 n_env,
                                 n_steps,
                                 n_batch,
                                 args,
                                 reuse=reuse,
                                 **_kwargs)
    else:

        class MyPolicy(EnigmaPolicy):
            def __init__(self,
                         sess,
                         ob_space,
                         ac_space,
                         n_env,
                         n_steps,
                         n_batch,
                         reuse=True,
                         **_kwargs):
                super().__init__(sess,
                                 ob_space,
                                 ac_space,
                                 n_env,
                                 n_steps,
                                 n_batch,
                                 args,
                                 reuse=reuse,
                                 **_kwargs)

    t0 = time.time()

    from mpi4py import MPI as mpi
    comm = mpi.COMM_WORLD
    rank = comm.Get_rank()
    world_size = comm.Get_size()

    gpus = os.environ["CUDA_VISIBLE_DEVICES"].split(',')
    gpu_count = len(gpus)
    gpu = gpus[rank % gpu_count]
    os.environ["CUDA_VISIBLE_DEVICES"] = gpu
    print("My rank is {} out of {}, using GPU {}".format(rank, all, gpu))

    if args.model_type == "ppo2":
        from stable_baselines import PPO2 as PPO
        env = SubprocVecEnv([(lambda: ProofEnv.ProofEnv(args))
                             for _ in range(args.parallel_envs)
                             ])  #, start_method="spawn")
    elif args.model_type == "ppo1":
        args.parallel_envs = 1
        env = DummyVecEnv([lambda: ProofEnv.ProofEnv(args)])
        # from stable_baselines import PPO1 as PPO
        from ppo import PPO1 as PPO

    if args.saved_model is None:
        myPolicy = MyPolicy
        if args.model_type == "ppo2":
            model = PPO(
                policy=myPolicy,
                env=env,
                n_steps=args.actorbatch,
                # nminibatches=args.optim_stepsize,
                lam=0.95,
                gamma=args.gamma,
                noptepochs=4,
                ent_coef=args.entcoeff,
                learning_rate=lambda f: f * 2.5e-4,
                cliprange=lambda f: f * 0.1,
                verbose=1)
        elif args.model_type == "ppo1":
            model = PPO(myPolicy,
                        env,
                        verbose=2,
                        timesteps_per_actorbatch=args.actorbatch,
                        schedule=args.lr_schedule,
                        optim_stepsize=args.optim_stepsize,
                        entcoeff=args.entcoeff,
                        optim_batchsize=args.optim_batchsize,
                        gamma=args.gamma)
    else:
        print("Loading model from {}".format(args.saved_model))
        model = PPO.load(args.saved_model)
        model.set_env(env)

    counter = 0

    env.env_method("set_model",
                   model,
                   indices=list(range(args.parallel_envs)))

    modelfiles = []
    for train_timestep, train_dir in zip(args.train_timesteps,
                                         args.train_dirs):
        problem_files = sorted(util.list_problems(train_dir))
        problem_files = util.split_list(problem_files, world_size)[rank]
        problem_files_splitted = util.split_list(problem_files,
                                                 args.parallel_envs,
                                                 extensible=False)

        if args.add_repeating_pretraining:
            for ind in range(args.parallel_envs):
                env.env_method("set_source",
                               problem_files_splitted[ind],
                               indices=[ind],
                               generator_type="repeating")
            # all_thread_timestep = train_timestep * world_size
            print("PRETRAINING")
            model.learn(total_timesteps=train_timestep)
            print("Pretraining on {} finished in {}".format(
                train_dir, util.format_time(time.time() - t0)))

        for ind in range(args.parallel_envs):
            env.env_method("set_source",
                           problem_files_splitted[ind],
                           indices=[ind])
        # all_thread_timestep = train_timestep * world_size
        model.learn(total_timesteps=train_timestep)

        modelfile = "{}/ppo1_fcop_train_{}".format(args.outdir, counter)
        modelfiles.append(modelfile)
        if rank == 0:
            model.save(modelfile)
            # logger.logkv("finished_train_problems", counter)
        counter += 1

        print("Training on {} finished in {}".format(
            train_dir, util.format_time(time.time() - t0)))
        statistics_list = env.get_attr("statistics",
                                       indices=list(range(args.parallel_envs)))
        blacklist_list = env.get_attr("blacklist",
                                      indices=list(range(args.parallel_envs)))
        for i, statistics in enumerate(statistics_list):
            print("ENV {} - {} - blacklist: {}\n".format(
                rank, i, blacklist_list[i]))
            util.print_problemdict(statistics, rank)

            # for f in statistics:
            #     statistics[f]["mcts"].display_tree([0])

        # util.print_problemdict(env.envs[0].statistics)

    if len(args.train_dirs) > 0 and len(
            args.train_timesteps) > 0:  # we did training
        print("We have finished training, rank {}".format(rank))

        # for p in problem_files:
        #     vis_policy.vis_policy(env.envs[0], model, p)

        env.close()
        del env
        del model

    # here we wait for everyone
    comm.Barrier()
    print("We have started evaluation, rank {}".format(rank))

    # evaluation without training
    if (args.saved_model is not None) and (len(
            args.train_dirs) == 0):  # no training, just evaluation
        modelfiles = [args.saved_model]

    for evaldir in args.evaldirs:
        for model_index, modelfile in enumerate(modelfiles):
            eval.eval_mpi(args, evaldir, modelfile, model_index)

            # here we wait for everyone
            comm.Barrier()
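util.split_list is used here to shard the problem files, first across MPI ranks and then across each rank's parallel environments, but its implementation is not shown. A plausible round-robin version (purely an assumption about the helper) could look like:

def split_list(items, n_chunks, extensible=True):
    """Distribute items round-robin into n_chunks lists."""
    # The original helper also takes an `extensible` flag whose behaviour
    # is not shown in this example; it is ignored in this sketch.
    chunks = [[] for _ in range(n_chunks)]
    for i, item in enumerate(items):
        chunks[i % n_chunks].append(item)
    return chunks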
Example #4
class Agent:
    
    def __init__(self, version, envs, hours = 0, verbose = False, weights = None):
        
        self.version = version
        self.name = "football-ppo{}".format(version) + "-e{}"
        self.path = "models/football-ppo-{}/".format(version)
        
        self.defaults = {
            "env_name": "",
            "representation": "simple115",
            "rewards": "scoring",
            "render": False,
            "write_video": False,
            "dump_frequency": 1,
            "extra_players": None,
            "number_of_left_players_agent_controls": 1,
            "number_of_right_players_agent_controls": 0,
            "enable_sides_swap": False,
            "parallel": 1
        }
        
        self.configs = list(map(lambda b: dict(map(lambda a: (a[0], a[1] if a[0] not in b.keys() else b[a[0]]), self.defaults.items())), envs))
        
        self.training = SubprocVecEnv(reduce(lambda a, b: a + b, list(map(lambda config: [
        
            lambda: football.create_environment(
                env_name = config["env_name"],
                representation = config["representation"],
                rewards = config["rewards"],
                render = config["render"],
                write_video = config["write_video"],
                dump_frequency = config["dump_frequency"],
                extra_players = config["extra_players"],
                number_of_left_players_agent_controls = config["number_of_left_players_agent_controls"],
                number_of_right_players_agent_controls = config["number_of_right_players_agent_controls"],
                enable_sides_swap = config["enable_sides_swap"]
            ) for _ in range(config["parallel"])
        
        ], self.configs)), []))
        
        self.inputs = self.training.get_attr("observation_space")[0].shape[0]
        self.outputs = self.training.get_attr("action_space")[0].n
        
        self.verbose = verbose
        
        if not verbose:
            os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" 
            deprecation._PRINT_DEPRECATION_WARNINGS = False
            logger = logging.getLogger()
            logger.setLevel(logging.ERROR)
        
        if weights is None:
            self.model = PPO2(policy = MlpPolicy, env = self.training, verbose = int(self.verbose))
        else:
            self.model = PPO2.load(weights, env = self.training, learning_rate = 0.002)
    
        self.experience = hours * 60
    
    def duration(self, time):
        
        return "{:02}:{:02}".format(time // 60, time % 60)
            
    def progress(self, current, total):
        
        return "{}{}".format("#" * int(current / (total / 10)), " " * (10 - int(current / (total / 10))))
    
    def blank(self):
        
        return [("", "")]
    
    def separator(self, width = 36, inset = "   "):
        
        return [inset + ("-" * width)]
    
    def section(self, *, values, width = 36, inset = "   "):
        
        rows = []
        reserved = 5
        
        space = lambda a: len(str(a[0])) + len(str(a[1]))
        lengths = list(map(lambda a: space(a), values))
        
        width = max(lengths + [width - reserved])
        
        for (name, value), length in zip(values, lengths):
            rows.append(inset + "| " + name + (" " * (width - length)) + " " + str(value) + " |")
        
        return rows
    
    def dump(self, lines):
        
        with open(os.path.join(self.path, "results.txt"), "a+") as dump:
            for line in lines: dump.write(line + ("\r\n" if line != "\r\n" else ""))
         
    def train(self, *, epoch, episodes, verbose):
        
        inset = "   "
        start = datetime.datetime.now()
        
        counts = list(map(lambda a: a["parallel"], self.configs))
        stochastics = ["11_vs_11_stochastic", "11_vs_11_easy_stochastic", "11_vs_11_hard_stochastic"]
        expand = lambda values, counts: reduce(lambda a, b: a + b, map(lambda c: [c[0]] * c[1], zip(values, counts)), [])
        
        results = Results(indexes = expand(list(map(lambda a: a["env_name"] in stochastics, self.configs)), counts))
        
        parallel = sum(counts)
        
        self.model.set_env(self.training)
        
        with output(initial_len = 4 if not verbose else 20 + results.count, interval = 0) as lines:
            
            lines[0] = "\n"
            lines[3] = "\n"
            
            lines[1] = "{}Epoch {}".format(inset, epoch)
            
            def callback(a, b):
                
                matches = self.training.get_attr("last_observation")
                results.temps(matches)
                
                update(
                    clock = int((3000 - matches[0][0]["steps_left"]) * 1.8), 
                    scores = list(map(lambda score: "{}:{}".format(score[0], score[1]), results.scores(matches)))
                )
            
            def update(*, clock, scores = None):
                
                if not verbose: return
                
                if scores is None:
                    scores = ["0:0"] * results.count
                
                matches = list(map(lambda a: "Match {}".format(a), range(1, results.count + 1)))
                
                table = reduce(lambda a, b: a + b, [
                    self.separator(),
                    self.section(values = (results.results() + self.blank() + results.goals() + self.blank() + [("Time", self.duration((datetime.datetime.now() - start).seconds)), ("Experience", self.duration(self.experience + int((clock / 60) * parallel))), ("Match Clock", self.duration(clock))])),
                    self.separator(),
                    self.section(values = list(zip(matches, scores))),
                    self.separator()
                ], [])
                
                for index, row in enumerate(table): 
                    lines[4 + index] = row
                
            for episode in range(1, episodes + 1):
                
                lines[2] = "{}Episode {} of {} - [{}]".format(inset, episode, episodes, self.progress(episode, episodes))
                
                update(clock = 0)
                
                self.model.learn(total_timesteps = 3000 * parallel, callback = callback)
                
                matches = self.training.get_attr("last_observation")
                results.record(matches = matches)
        
                update(clock = 5400, scores = list(map(lambda a: "{}:{}".format(a[0], a[1]), results.scores(matches))))
                self.experience += parallel * 90
                
                time.sleep(1)
        
            self.dump(lines)
    
    def watch(self, *, env, matches, weights, record):
        
        environment = SubprocVecEnv([
        
            lambda: football.create_environment(
                env_name = "11_vs_11_easy_stochastic",
                representation = self.configs[0]["representation"],
                rewards = self.configs[0]["rewards"],
                enable_goal_videos = False,
                enable_full_episode_videos = True,
                render = True,
                write_video = record,
                dump_frequency = 1,
                logdir = "/home/charlie/Projects/Python/Football/videos/",
                extra_players = self.configs[0]["extra_players"],
                number_of_left_players_agent_controls = self.configs[0]["number_of_left_players_agent_controls"],
                number_of_right_players_agent_controls = self.configs[0]["number_of_right_players_agent_controls"],
                enable_sides_swap = self.configs[0]["enable_sides_swap"]
            ) for _ in range(1)
        
        ])
        
        # self.model.set_env(environment)
        
        watch = PPO2.load(weights, env = environment)
        
        for match in range(matches):

            watch.learn(total_timesteps = 3100)

    def run(self, *, epochs, episodes, verbose = True):
        
        if os.path.exists(self.path):
    
            if len(os.listdir(self.path)) > 0:
                print("Directory: {} is not empty. Please make sure you are not overwriting existing models and try again.".format(self.path))
                return
        else:
            os.mkdir(self.path)
        
        for epoch in range(1, epochs):
            
            self.train(epoch = epoch, episodes = episodes, verbose = verbose)
            self.model.save(os.path.join(self.path, self.name.format(epoch)))
            self.watch(env = "11_vs_11_stochastic", matches = 1, weights = os.path.join(self.path, self.name.format(epoch)), record = True)
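A hedged sketch of how the Agent class above might be driven; the scenario names, version tag and episode counts are placeholders rather than values from the original project:

if __name__ == "__main__":
    agent = Agent(
        version="v1",                                                 # placeholder version tag
        envs=[
            {"env_name": "academy_empty_goal_close", "parallel": 4},  # placeholder scenario
            {"env_name": "11_vs_11_easy_stochastic", "parallel": 2},  # placeholder scenario
        ],
        hours=0,
        verbose=False,
    )
    agent.run(epochs=10, episodes=50, verbose=True)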
Example #5
def main(args):
    log_dir = args.log_path if (args.log_path is not None) else \
        "/tmp/stable_baselines_" + time.strftime('%Y-%m-%d-%H-%M-%S')
    configure_logger(log_dir)

    set_global_seeds(args.seed)

    n_cpu = get_num_workers(args.env) if not args.play else 1
    env_kwargs = get_env_kwargs(args.env, args.random_ratio, args.sequential,
                                args.reward_type, args.n_object,
                                args.curriculum)

    def make_thunk(rank):
        return lambda: make_env(env_id=args.env,
                                rank=rank,
                                log_dir=log_dir,
                                flatten_dict=True,
                                kwargs=env_kwargs)

    env = SubprocVecEnv([make_thunk(i) for i in range(n_cpu)])

    eval_env_kwargs = env_kwargs.copy()
    eval_env_kwargs['random_ratio'] = 0.0
    if "use_cu" in eval_env_kwargs:
        eval_env_kwargs['use_cu'] = False
    eval_env = make_env(env_id=args.env,
                        rank=0,
                        flatten_dict=True,
                        kwargs=eval_env_kwargs)
    print(eval_env)
    if not args.play:
        os.makedirs(log_dir, exist_ok=True)
        train_kwargs = get_train_kwargs("ppo",
                                        args,
                                        parsed_action_noise=None,
                                        eval_env=eval_env)

        # policy = 'MlpPolicy'
        from utils.attention_policy import AttentionPolicy
        register_policy('AttentionPolicy', AttentionPolicy)
        policy_kwargs = get_policy_kwargs("ppo", args)
        print(policy_kwargs)

        model = PPO2(args.policy,
                     env,
                     verbose=1,
                     nminibatches=32,
                     lam=0.95,
                     noptepochs=10,
                     ent_coef=0.01,
                     learning_rate=3e-4,
                     cliprange=0.2,
                     policy_kwargs=policy_kwargs,
                     **train_kwargs)
        print(model.get_parameter_list())

        def callback(_locals, _globals):
            num_update = _locals["update"]
            if 'FetchStack' in args.env:
                mean_eval_reward = stack_eval_model(eval_env, _locals["self"])
            else:
                mean_eval_reward = eval_model(eval_env, _locals["self"])
            log_eval(num_update, mean_eval_reward)
            if num_update % 10 == 0:
                model_path = os.path.join(log_dir,
                                          'model_' + str(num_update // 10))
                model.save(model_path)
                print('model saved to', model_path)
            return True

        model.learn(total_timesteps=int(args.num_timesteps),
                    callback=callback,
                    seed=args.seed,
                    log_interval=1)
        model.save(os.path.join(log_dir, 'final'))

    else:
        assert args.load_path is not None
        model = PPO2.load(args.load_path)
        fig, ax = plt.subplots(1, 1, figsize=(8, 8))
        obs = env.reset()
        goal_dim = env.get_attr('goal')[0].shape[0]
        if 'FetchStack' in args.env:
            while env.get_attr('current_nobject')[0] != env.get_attr('n_object')[0] or \
                    env.get_attr('task_mode')[0] != 1:
                obs = env.reset()
        elif 'FetchPush' in args.env:
            while not (1.25 < obs[0][6] < 1.33 and obs[0][7] < 0.61
                       and 0.7 < obs[0][4] < 0.8):
                obs = env.reset()
            env.env_method('set_goal', np.array([1.2, 0.75, 0.425, 1, 0]))
            obs = env.env_method('get_obs')
            obs[0] = np.concatenate([
                obs[0][key]
                for key in ['observation', 'achieved_goal', 'desired_goal']
            ])
        else:
            while np.argmax(obs[0][-goal_dim + 3:]) != 0:
                obs = env.reset()
        print('achieved_goal', obs[0][-2 * goal_dim:-goal_dim], 'goal',
              obs[0][-goal_dim:])
        episode_reward = 0.0
        num_episode = 0
        frame_idx = 0
        images = []
        if 'max_episode_steps' not in env_kwargs.keys():
            env_kwargs['max_episode_steps'] = 100
        for i in range(env_kwargs['max_episode_steps'] * 10):
            img = env.render(mode='rgb_array')
            ax.cla()
            ax.imshow(img)
            if env.get_attr('goal')[0].shape[0] <= 3:
                ax.set_title('episode ' + str(num_episode) + ', frame ' +
                             str(frame_idx))
            else:
                ax.set_title('episode ' + str(num_episode) + ', frame ' +
                             str(frame_idx) + ', goal idx ' +
                             str(np.argmax(env.get_attr('goal')[0][3:])))
                if 'FetchStack' in args.env:
                    tasks = ['pick and place', 'stack']
                    ax.set_title('episode ' + str(num_episode) + ', frame ' +
                                 str(frame_idx) + ', task: ' +
                                 tasks[np.argmax(obs[0][-2 * goal_dim - 2:-2 *
                                                        goal_dim])])
            images.append(img)
            action, _ = model.predict(obs)
            obs, reward, done, _ = env.step(action)
            episode_reward += reward
            frame_idx += 1
            if not args.export_video:
                plt.pause(0.1)
            else:
                plt.imsave(
                    os.path.join(os.path.dirname(args.load_path),
                                 'tempimg%d.png' % i), img)
            if done:
                print('episode_reward', episode_reward)
                if 'FetchStack' in args.env:
                    while env.get_attr('current_nobject')[0] != env.get_attr('n_object')[0] or \
                            env.get_attr('task_mode')[0] != 1:
                        obs = env.reset()
                else:
                    while np.argmax(obs[0][-goal_dim + 3:]) != 0:
                        obs = env.reset()
                print('goal', obs[0][-goal_dim:])
                episode_reward = 0.0
                frame_idx = 0
                num_episode += 1
                if num_episode >= 10:
                    break
        if args.export_video:
            os.system('ffmpeg -r 5 -start_number 0 -i ' +
                      os.path.dirname(args.load_path) +
                      '/tempimg%d.png -c:v libx264 -pix_fmt yuv420p ' +
                      os.path.join(os.path.dirname(args.load_path), args.env +
                                   '.mp4'))
            for i in range(env_kwargs['max_episode_steps'] * 10):
                try:
                    os.remove(
                        os.path.join(os.path.dirname(args.load_path),
                                     'tempimg' + str(i) + '.png'))
                except OSError:
                    pass
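Example #5 wraps each environment constructor in make_thunk(rank) instead of using a bare lambda inside the list comprehension. That sidesteps Python's late-binding closures, where every lambda would otherwise capture the final loop value. A toy illustration of the difference (not part of the original code):

# Late binding: every thunk sees the final value of i.
bad = [lambda: i for i in range(3)]
print([f() for f in bad])         # [2, 2, 2]

# Binding the value at definition time through a factory, as make_thunk does.
def make_thunk(rank):
    return lambda: rank

good = [make_thunk(i) for i in range(3)]
print([f() for f in good])        # [0, 1, 2]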
Example #6
    #os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

    results = []

    config["max_speed"] = 5
    config["max_obs_range"] = 3
    n_cpu = 32

    model = PPO2.load(
        "./obs_range/ppo2_default_{}.zip".format(config["max_obs_range"])
    )
    subpolicies = ["obs_range/ppo2_default_{}".format(config["max_obs_range"])]
    env = SubprocVecEnv(
        [lambda: NavigationEnvDefault(**config) for _ in range(n_cpu)])
    # model = PPO2(policy="MlpPolicy", env=env)
    scores = []
    obs = env.reset()

    for j in range(10000):
        actions, _ = model.predict(obs)
        obs, reward, done, info = env.step(actions)
        if j % 100 == 0:
            print(j, "/", 10000)
    for i in range(n_cpu):
        scores = scores + env.get_attr("last_score", i)
    with open("./obs_range/scores.csv", "a") as f:
        f.writelines("local" + "," + str(np.mean(scores)) + "\n")

    env.close()
    del env
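Since VecEnv.get_attr queries every worker when no indices are given, the per-index collection loop above could be written as a single call; this sketch assumes last_score holds one value per environment:

# Hypothetical equivalent of the collection loop: one round trip instead of
# n_cpu calls, returning a list with one entry per worker.
scores = list(env.get_attr("last_score"))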
Example #7
    episode_reward = 0

    while True:

        # env.set_attr("keyboard_u", keyboard_u)

        env.render()
        action, _states = model.predict(obs, deterministic=True)
        action[0] = 0
        obs, rewards, dones, info = env.step(action)
        episode_reward += rewards[0]
        if dones[0]:
            performance[cnt, 0] = episode_reward
            episode_reward = 0

            performance[cnt, 1] = env.get_attr("record_count")[0]
            # print(env.get_attr("record_count"))
            performance[cnt, 2] = env.env_method("why_done")[0]
            # print(env.env_method("why_done"))
            if int(performance[cnt, 2]) != 0:
                performance[cnt, 1] = np.inf
            cnt += 1

            break
    print(performance)
    print(np.mean(performance[:, 0]),
          np.min(performance[:, 1]) * 0.1,
          np.max(performance[:, 1]) * 0.1,
          np.mean(performance[:, 1]) * 0.1,
          len(performance[performance[:, 2] == 0]),
          len(performance[performance[:, 2] == 1]),