def main():
    logger.configure(
        'E:\\Project\\Toyota RL\\Toyata 2018\\Toyata RL 4th quarter\\log')
    # 'F:\\GuanYang\\toyota2018_4\\log'
    parser = common_arg_parser()
    parser.add_argument('--load_model_path', default=None)
    parser.set_defaults(num_timesteps=int(2e7))

    args = parser.parse_args()
    env = environment.Env(N=6,
                          pattern=[0, 2, 4, 8, 9, 10],
                          height=30,
                          width=30)

    if not args.play:
        # train the model
        train(env=env,
              num_timesteps=args.num_timesteps,
              load_model_path=args.load_model_path)
    else:
        # construct the model object, load pre-trained model and render
        pi = train(env=env, num_timesteps=1)
        U.load_state(args.load_model_path)
        ob = env.manualSet(modelList=env.pattern)
        while True:
            action = pi.act(stochastic=False, ob=ob)[0]
            # ob, _, done, _ =  env.step(action)
            ob, rew, done, _ = env.updateEnv(action)
            env.showEnv()
            if done:
                ob = env.manualSet(modelList=env.pattern)
Example #2
def policy_run(env, policy_fn, load_model_path, number_rollouts,
               stochastic_policy):
    # Setup network
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_fn("pi", ob_space, ac_space)
    U.initialize()
    # Prepare for rollouts
    # ----------------------------------------
    weight_file = tf.train.latest_checkpoint(load_model_path)
    if weight_file is None:
        print("error: no weight file")
        return
    U.load_state(weight_file)

    ep_rewards = []
    for _ in range(number_rollouts):
        ob = env.reset()
        done = False
        cur_ep_ret = 0
        while not done:
            env.render()
            time.sleep(0.1)
            ac, vpred = pi.act(stochastic_policy, ob)
            ob, rew, done, _ = env.step(ac)
            cur_ep_ret += rew
        ep_rewards.append(cur_ep_ret)

        # Running mean of episode return over the rollouts completed so far.
        ep_reward_mean = np.mean(ep_rewards)

        print("ep_reward_mean: {}".format(ep_reward_mean))
Example #3
    def load_model(model_path):
        if ScoutExploreTaskRL.act is not None:
            return

        class FakeEnv(object):
            def __init__(self):
                low = np.zeros(6)
                high = np.ones(6)
                self.observation_space = Box(low, high)
                self.action_space = Discrete(8)

        def make_obs_ph(name):
            return ObservationInput(env.observation_space, name=name)

        env = FakeEnv()
        network = deepq.models.mlp([64, 32])
        act_params = {
            'make_obs_ph': make_obs_ph,
            'q_func': network,
            'num_actions': env.action_space.n,
        }

        act = deepq.build_act(**act_params)
        sess = tf.Session()
        sess.__enter__()
        print("load_model path=", model_path)
        load_state(model_path)
        ScoutExploreTaskRL.act = ActWrapper(act, act_params)
        print("load_model ok")
Example #4
def eval(env, model_dir):
    from baselines.ppo1 import mlp_policy

    # Create a TF session
    U.make_session(num_cpu=1).__enter__()
    ob_space = env.observation_space
    ac_space = env.action_space

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name,
                                    ob_space=ob_space,
                                    ac_space=ac_space,
                                    hid_size=64,
                                    num_hid_layers=2)

    pi = policy_fn("pi", ob_space,
                   ac_space)  # Construct network for new policy

    # Load variables
    U.load_state(osp.join(model_dir, "model"))

    ob = env.reset()
    while True:
        # print ("Obs: ", ob)
        # print (type(ob))
        ac, vpred = pi.act(True, ob)

        ob, rew, new, _ = env.step(ac)

        if new:
            ob = env.reset()
    env.close()
def submit_round2(walker_env, submit_env, policy_fn, load_model_path,
                  stochastic, actions):
    ob_space = walker_env.observation_space
    ac_space = walker_env.action_space

    pi = policy_fn("pi", ob_space,
                   ac_space)  # Construct network for new policy

    U.initialize()
    U.load_state(load_model_path)

    while True:
        obs = walker_env.reset()
        stepno = 0
        if obs is False:
            break
        done = False
        while not done:
            action, _ = pi.act(stochastic, obs, np.int32(stepno))
            obs, rew, done, info = walker_env.step(action)
            stepno += 1
            if done:
                break

    submit_env.submit()
    def load(path, q_func, env, num_cpu=16):
        with open(path, "rb") as f:
            model_data = dill.load(f)

        def make_obs_ph(name):
            return U.BatchInput(env.observation_space.shape, name=name)
        act_params = {
            'make_obs_ph': make_obs_ph,
            'q_func': q_func,
            'num_actions': env.action_space.n,
        }
        act = deepq.build_act(**act_params)

        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.6666667)

        sess = U.make_session(num_cpu=num_cpu, gpu_opt=gpu_options)
        #sess = U.make_session(num_cpu=num_cpu)

        sess.__enter__()
        with tempfile.TemporaryDirectory() as td:
            arc_path = os.path.join(td, "packed.zip")
            with open(arc_path, "wb") as f:
                f.write(model_data)

            zipfile.ZipFile(arc_path, 'r', zipfile.ZIP_DEFLATED).extractall(td)
            U.load_state(os.path.join(td, "model"))

        return ActWrapper(act, act_params)
Example #7
def predict_action(img, load_model_path):
    ob_space1 = spaces.Box(low=-np.inf,
                           high=np.inf,
                           shape=(5, ),
                           dtype=np.float32)
    ob_space2 = spaces.Box(low=0, high=255, shape=(84, 84, 1), dtype=np.uint8)
    ob_space = (ob_space1, ob_space2)
    ac_space = spaces.Box(low=-np.ones(2), high=np.ones(2))

    pi = cnn_lstm_policy.CnnPhyLSTMPolicy("pi",
                                          ob_space,
                                          ac_space,
                                          hid_size=64,
                                          num_hid_layers=1)

    U.initialize()
    assert load_model_path is not None
    U.load_state(load_model_path)

    ob1 = np.array([np.cos(1), np.sin(1), 0, 0, 0])
    ob2 = process_img(img)

    ac = pi.act(True, (ob1, ob2), pi.get_initial_state())[0]
    print(pi.get_initial_state())
    return ac
Example #8
 def load_model(load_model_path, var_list=None):
     if os.path.isdir(load_model_path):
         ckpt_path = tf.train.latest_checkpoint(load_model_path)
     else:
         ckpt_path = load_model_path
     logger.info("Load checkpoint: %s", ckpt_path)
     U.load_state(ckpt_path, var_list)
Example #9
def evaluate(env,
             policy_func,
             load_model_path,
             timesteps_per_batch,
             number_trajs=10,
             stochastic_policy=False):

    from tqdm import tqdm
    # Setup network
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space, ac_space, reuse=False)
    U.initialize()
    # Prepare for rollouts
    # ----------------------------------------
    ep_gen = traj_episode_generator(pi,
                                    env,
                                    timesteps_per_batch,
                                    stochastic=stochastic_policy)
    U.load_state(load_model_path)

    len_list = []
    ret_list = []
    for _ in tqdm(range(number_trajs)):
        traj = ep_gen.__next__()
        ep_len, ep_ret = traj['ep_len'], traj['ep_ret']
        len_list.append(ep_len)
        ret_list.append(ep_ret)
    if stochastic_policy:
        print('stochastic policy:')
    else:
        print('deterministic policy:')
    print("Average length:", sum(len_list) / len(len_list))
    print("Average return:", sum(ret_list) / len(ret_list))
Example #10
def main():
    logger.configure()
    parser = mujoco_arg_parser()
    parser.add_argument('--model-path',
                        default='checkpoints_best/Humanoid-v2-6914')
    parser.set_defaults(num_timesteps=int(2e8))

    args = parser.parse_args()

    if not args.play:
        # train the model
        train(num_timesteps=args.num_timesteps,
              seed=args.seed,
              model_path=args.model_path)
    else:
        # construct the model object, load pre-trained model and render
        pi = train(num_timesteps=1, seed=args.seed)
        U.load_state(args.model_path)
        env = make_mujoco_env('Humanoid-v2', seed=123)

        ob = env.reset()
        while True:
            action = pi.act(stochastic=False, ob=ob)[0]
            ob, _, done, _ = env.step(action)
            env.render()
            time.sleep(0.01)
            if done:
                ob = env.reset()
Example #11
def main():
    logger.configure()
    parser = mujoco_arg_parser()
    parser.add_argument('--model-path', default=os.path.join(logger.get_dir(), 'policy'))
    parser.set_defaults(num_timesteps=int(2e7))
   
    args = parser.parse_args()
    
    if not args.play:
        # train the model
        train(args.env, num_timesteps=args.num_timesteps, seed=args.seed, model_path=args.model_path)
    else:       
        # construct the model object, load pre-trained model and render
        pi = train(args.env, num_timesteps=1, seed=args.seed)
        U.load_state(args.model_path)
        env = make_mujoco_env(args.env, seed=0)

        ob = env.reset()        
        while True:
            action = pi.act(stochastic=False, ob=ob)[0]
            ob, _, done, _ = env.step(action)
            print(ob, action)
            #env.render()
            if done:
                ob = env.reset()
Example #12
def restore_act_and_value(env,
                          path,
                          num_cpu=4,
                          scope="saved/deepq",
                          reuse=None):
    # pdb.set_trace()
    qfunc_path = os.path.join(path, 'model.pkl')
    with open(qfunc_path, "rb") as f:
        q_func = dill.load(f)

    def make_obs_ph(name):
        return U.BatchInput(env.observation_space.shape, name=name)

    act = build_act(make_obs_ph, q_func, env.action_space.n, scope, reuse)
    value = build_value_function(make_obs_ph, q_func, env.action_space.n,
                                 scope, True)
    sess = U.make_session(num_cpu=num_cpu)
    sess.__enter__()
    # for debugging
    # for var in tf.global_variables():
    #     print(var.name)
    U.load_state(tf.train.latest_checkpoint(path))
    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': env.action_space.n
    }
    return ActWrapper(act, act_params), value
Example #13
def main():
    """
    Runs the test
    """
    logger.configure()
    parser = mujoco_arg_parser()
    parser.add_argument('--model-path',
                        default=os.path.join(logger.get_dir(),
                                             'humanoid_policy'))
    parser.set_defaults(num_timesteps=int(2e7))

    args = parser.parse_args()

    if not args.play:
        # train the model
        train(num_timesteps=args.num_timesteps,
              seed=args.seed,
              model_path=args.model_path)
    else:
        # construct the model object, load pre-trained model and render
        policy = train(num_timesteps=1, seed=args.seed)
        tf_util.load_state(args.model_path)
        env = make_mujoco_env('Humanoid-v2', seed=0)

        obs = env.reset()
        while True:
            action = policy.act(stochastic=False, obs=obs)[0]
            obs, _, done, _ = env.step(action)
            env.render()
            if done:
                obs = env.reset()
Example #14
    def evaluate_proximity_predictor(self, var_list):
        config = self._config

        if config.evaluate_all_ckpts:
            from glob import glob
            import pandas as pd
            from tqdm import tqdm

            files = glob(os.path.join(config.log_dir, "*.index"))
            files.sort()
            max_step = max([int(os.path.basename(f).split('.')[0]) for f in files])

            results = {}
            for proximity in self.proximity_predictors:
                results[proximity.env_name] = {'mean': [], 'std': [], 'step': []}
            for i in tqdm(range(0, max_step, 25)):
                logger.log('*** evaluate ckpt {}'.format(i))
                U.load_state(os.path.join(config.log_dir, '%.5d' % i), var_list)
                info = self._evaluate_proximity_predictor()
                for proximity_name, proximity_info in info.items():
                    for key, value in proximity_info.items():
                        results[proximity_name][key].append(value)
                    results[proximity_name]['step'].append(i)
            df = pd.DataFrame(results)
            df.to_pickle('proximity_predictor_evaluation.pkl')
        else:
            self._evaluate_proximity_predictor()
Example #15
 def load_model(load_model_path, var_list=None):
     if os.path.isdir(load_model_path):
         ckpt_path = tf.train.latest_checkpoint(load_model_path)
     else:
         ckpt_path = load_model_path
     if ckpt_path:
         U.load_state(ckpt_path, var_list)
     return ckpt_path
    def load_model(self, dirname, iteration=None):
        if iteration is not None:
            dirname = os.path.join(dirname, 'iter_%d' % iteration)
        else:
            dirname = os.path.join(dirname, 'trained_model')

        print('Loading model from %s' % dirname)
        U.load_state(dirname)
        print('Loaded!')
Example #17
def evaluate_ppo(num_eps, is_gui):
    sumoseed = 0
    randomseed = 0

    model_dir = '../tf_models/trial9'
    latest_checkpoint = tf.train.latest_checkpoint(model_dir)
    model_path = latest_checkpoint
    pi = train(max_iters=1, callback=None)
    U.load_state(model_path)

    env = LaneChangeEnv(is_train=False)
    ret_eval = 0
    ret_det_eval = 0  # not an integer; will be broadcast
    danger_num = 0
    crash_num = 0
    level_1_danger = []
    level_2_danger = []
    collision_num = 0
    ep_len_list = []
    success_num = 0
    for i in range(num_eps):
        ep_eval = episode_generator(pi,
                                    env,
                                    is_gui=is_gui,
                                    sumoseed=sumoseed,
                                    randomseed=randomseed)

        ret_eval += ep_eval['ep_ret']
        ret_det_eval += ep_eval['ep_rets_detail']
        danger_num += ep_eval['ep_num_danger']
        crash_num += ep_eval['ep_num_crash']
        level_1_danger.append(1 if ep_eval['ep_num_danger'] > 0 else 0)
        level_2_danger.append((1 if ep_eval['ep_num_crash'] > 0 else 0))
        collision_num += ep_eval['ep_is_collision']
        success_num += int(ep_eval['ep_is_success'])
        if ep_eval['ep_is_success']:
            ep_len_list.append(ep_eval['ep_len'])
        sumoseed += 1
        randomseed += 1

    ret_eval /= float(num_eps)
    ret_det_eval /= float(num_eps)
    danger_rate = danger_num / num_eps
    crash_rate = crash_num / num_eps
    level_1_danger_rate = np.mean(level_1_danger)
    level_2_danger_rate = np.mean(level_2_danger)
    coll_rate = collision_num / num_eps
    success_rate = success_num / float(num_eps)
    success_len = np.mean(ep_len_list)
    print('reward_detail: ', ret_det_eval)
    print('reward: ', ret_eval, '\ndanger_rate: ', danger_rate,
          '\ncrash_rate: ', crash_rate, '\nlevel-1-danger_rate: ',
          level_1_danger_rate, '\nlevel-2-danger_rate: ', level_2_danger_rate,
          '\ncollision_rate: ', coll_rate, '\nsuccess_rate: ', success_rate,
          '\nsuccess_len: ', success_len)
    return ret_eval, danger_rate, crash_rate, level_1_danger_rate, level_2_danger_rate, coll_rate, success_rate, success_len
Example #18
def runner(env,
           policy_func,
           load_model_path,
           timesteps_per_batch,
           number_trajs,
           stochastic_policy,
           args,
           save=False,
           reuse=False):

    # Setup network
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space, ac_space, reuse=reuse)
    U.initialize()
    # Prepare for rollouts
    # ----------------------------------------
    U.load_state(load_model_path)

    obs_list = []
    acs_list = []
    len_list = []
    ret_list = []
    total_success = 0
    for _ in tqdm(range(number_trajs)):
        traj = traj_1_generator(pi,
                                env,
                                timesteps_per_batch,
                                stochastic=stochastic_policy)
        obs, acs, ep_len, ep_ret = traj['ob'], traj['ac'], traj[
            'ep_len'], traj['ep_ret']
        if traj['is_success']:
            total_success += 1
        obs_list.append(obs)
        acs_list.append(acs)
        len_list.append(ep_len)
        ret_list.append(ep_ret)
    if stochastic_policy:
        print('stochastic policy:')
    else:
        print('deterministic policy:')
    if save:
        filename = load_model_path.split(
            '/')[-1] + '.' + env.spec.id + "seed_{0}".format(args.seed)
        np.savez(filename,
                 obs=np.array(obs_list),
                 acs=np.array(acs_list),
                 lens=np.array(len_list),
                 rets=np.array(ret_list))
    avg_len = sum(len_list) / len(len_list)
    avg_ret = sum(ret_list) / len(ret_list)
    print("Average length:", avg_len)
    print("Average return:", avg_ret)
    return avg_len, avg_ret
Example #19
def runner(env,
           policy_func,
           load_model_path,
           timesteps_per_batch,
           number_trajs,
           stochastic_policy,
           save=False,
           reuse=False):

    # Setup network
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space, ac_space, reuse=reuse)
    U.initialize()
    # Prepare for rollouts
    # ----------------------------------------
    U.load_state(load_model_path)

    obs_list = []
    acs_list = []
    len_list = []
    ret_list = []
    max_x_pos_list = []
    for _ in tqdm(range(number_trajs)):
        traj, max_x_pos = traj_1_generator(pi,
                                           env,
                                           timesteps_per_batch,
                                           stochastic=stochastic_policy)
        obs, acs, ep_len, ep_ret = traj['ob'], traj['ac'], traj[
            'ep_len'], traj['ep_ret']
        obs_list.append(obs)
        acs_list.append(acs)
        len_list.append(ep_len)
        ret_list.append(ep_ret)
        max_x_pos_list.append(max_x_pos)
    if stochastic_policy:
        print('stochastic policy:')
    else:
        print('deterministic policy:')
    if save:
        filename = load_model_path.split('/')[-1] + '.' + env.spec.id
        np.savez(filename,
                 obs=np.array(obs_list),
                 acs=np.array(acs_list),
                 lens=np.array(len_list),
                 rets=np.array(ret_list))
    avg_len = sum(len_list) / len(len_list)
    avg_ret = sum(ret_list) / len(ret_list)
    avg_max_x_pos = np.mean(max_x_pos_list)
    print("Average length:", avg_len)
    print("Average return:", avg_ret)
    print("Average max_x_pos:", avg_max_x_pos)
    print("Std max_x_pos:", np.std(max_x_pos_list))
    return avg_len, avg_ret
Example #20
    def reload(path):
        with open(path, "rb") as f:
            model_data, act_params = dill.load(f)

        with tempfile.TemporaryDirectory() as td:
            arc_path = os.path.join(td, "packed.zip")
            with open(arc_path, "wb") as f:
                f.write(model_data)

            zipfile.ZipFile(arc_path, 'r', zipfile.ZIP_DEFLATED).extractall(td)
            U.load_state(os.path.join(td, "model"))
Example #21
def main():
    set_global_seeds(1)
    args = parse_args()
    with U.make_session(4):  # noqa
        _, env = make_env(args.env)
        act = deepq.build_act(make_obs_ph=lambda name: U.Uint8Input(
            env.observation_space.shape, name=name),
                              q_func=dueling_model if args.dueling else model,
                              num_actions=env.action_space.n)

        U.load_state(os.path.join(args.model_dir, "saved"))
        wang2015_eval(args.env, act, stochastic=args.stochastic)
Example #22
def load(path):
    with open(path, "rb") as f:
        model_data = cloudpickle.load(f)
    sess = U.get_session()
    sess.__enter__()
    with tempfile.TemporaryDirectory() as td:
        arc_path = os.path.join(td, "packed.zip")
        with open(arc_path, "wb") as f:
            f.write(model_data)

        zipfile.ZipFile(arc_path, 'r', zipfile.ZIP_DEFLATED).extractall(td)
        U.load_state(os.path.join(td, "model"))
Example #23
def main():
    set_global_seeds(1)
    args = parse_args()
    with U.make_session(4) as sess:  # noqa
        _, env = make_env(args.env)
        act = deepq.build_act(
            make_obs_ph=lambda name: U.Uint8Input(env.observation_space.shape, name=name),
            q_func=dueling_model if args.dueling else model,
            num_actions=env.action_space.n)

        U.load_state(os.path.join(args.model_dir, "saved"))
        wang2015_eval(args.env, act, stochastic=args.stochastic)
def evaluate(env, policy_func, load_model_path, video_prefix, record, render, *,
        timesteps_per_batch # what to train on
        ):

    nworkers = MPI.COMM_WORLD.Get_size()
    rank = MPI.COMM_WORLD.Get_rank()
    np.set_printoptions(precision=3)
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space, ac_space)
    atarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    U.initialize()
    U.load_state(load_model_path)

    ep_gen = traj_episode_generator(pi, env, timesteps_per_batch, stochastic=False, record=record, render=render)
    ep_lens = []
    ep_rets = []
    visual_obs = []
    if record:
        record_dir = os.path.join(os.path.dirname(load_model_path), 'video')
        os.makedirs(record_dir, exist_ok=True)
    for _ in tqdm(range(10)):
        ep_traj = ep_gen.__next__()
        ep_lens.append(ep_traj["ep_len"])
        ep_rets.append(ep_traj["ep_ret"])

        # Video recording
        if _ % 2 == 0 and record:
            visual_obs = ep_traj["visual_obs"]
            if video_prefix is None:
                video_path = os.path.join(record_dir, '{}.mp4'.format(_))
            else:
                video_path = os.path.join(record_dir, '{}-{}.mp4'.format(video_prefix, _))

            fps = 15.
            def f(t):
                frame_length = len(visual_obs)
                new_fps = 1./(1./fps + 1./frame_length)
                idx = min(int(t*new_fps), frame_length-1)
                return visual_obs[idx]
            video = mpy.VideoClip(f, duration=len(visual_obs)/fps+2)
            video.write_videofile(video_path, fps, verbose=False)

    print('Episode Length: {}'.format(sum(ep_lens)/10.))
    print('Episode Rewards: {}'.format(sum(ep_rets)/10.))
Example #25
def main():
    """
    restore latest model from ckpt
    """
    model_dir = '../tf_models/trial9'
    latest_checkpoint = tf.train.latest_checkpoint(model_dir)
    model_path = latest_checkpoint

    EP_MAX = 20
    EP_LEN_MAX = 1000

    # no training here: animate the trained results
    pi = train(max_iters=1, callback=None)
    U.load_state(model_path)

    env = LaneChangeEnv(gui=True, label='1', is_train=False)
    sumoseed = 45  #44
    randomseed = 45  # 6 9

    for ep in range(EP_MAX):
        # if env.is_collision:
        #     print('sumoseed:', sumoseed, 'randomseed:', randomseed)
        #     break
        sumoseed += 0
        randomseed += 0
        print('sumoseed:', sumoseed, 'randomseed:', randomseed)
        ob = env.reset(tlane=0,
                       tfc=2,
                       is_gui=True,
                       sumoseed=sumoseed,
                       randomseed=randomseed)
        # ob = env.reset(tlane=0, tfc=2, is_gui=True, sumoseed=None, randomseed=None)

        traci.vehicle.setColor(env.egoID, (255, 69, 0))
        ob_np = np.asarray(ob).flatten()
        speed_list = []
        lat_speed_list = []
        for t in range(EP_LEN_MAX):
            ac = pi.act(stochastic=False, ob=ob_np)[0]
            ob, reward, done, info = env.step(ac)  # need modification
            speed_list.append(env.ego.speed)
            lat_speed_list.append(env.ego.speed_lat)
            ob_np = np.asarray(ob).flatten()
            if done:
                break
        np_array = np.vstack([
            np.linspace(0, len(speed_list) - 1, num=len(speed_list)),
            speed_list, lat_speed_list
        ]).T
        if ep == 1:
            np.savetxt('../data/final.csv', np_array, delimiter=",")
Example #26
def maybe_load_model(savedir):
    """Load model if present at the specified path."""
    if savedir is None:
        return
    state_path = os.path.join(savedir, 'training_state.pkl.zip')
    found_model = os.path.exists(state_path)
    if found_model:
        state = pickle_load(state_path, compression=True)
        model_dir = "model-{}".format(state["num_iters"])
        U.load_state(os.path.join(savedir, model_dir, "saved"))
        logger.log("Loaded models checkpoint at {} iterations".format(
            state["num_iters"]))
        return state
Example #27
def load_wrapper(load_path=None, checkpoint_path=None):
    with tempfile.TemporaryDirectory() as td:
        td = checkpoint_path or td

        if tf.train.latest_checkpoint(td) is not None:
            model_file = os.path.join(td, "model")
            load_state(model_file)

        elif load_path is not None:
            load_state(load_path)

        else:
            raise Warning("Baselines DQN: no model file found")
Example #28
def main():

    parser = mujoco_arg_parser()
    parser.add_argument('--model-path')
    parser.add_argument('--sim', default=False, action='store_true')
    parser.add_argument('--hessians', default=False, action='store_true')
    parser.add_argument('--logdir', type=str, default=None)
    args = parser.parse_args()
    logger.configure(args.logdir)

    if not args.model_path:
        raise ValueError('You have to provide a model path.')

    if not args.play:
        # train the model
        train(args.env,
              num_timesteps=args.num_timesteps,
              seed=args.seed,
              model_path=args.model_path,
              target1=args.target1,
              target2=args.target2,
              target3=args.target3,
              output_prefix=args.output_prefix,
              input_file=args.input_file,
              sim=args.sim,
              hessians=args.hessians)
    else:
        # construct the model object, load pre-trained model and render
        pi = train(args.env,
                   num_timesteps=1,
                   seed=args.seed,
                   target1=args.target1,
                   target2=args.target2,
                   target3=args.target3,
                   output_prefix=args.output_prefix,
                   input_file=args.input_file,
                   sim=False)
        U.load_state('models/' + args.model_path)
        env = make_pareto_mujoco_env(args.env,
                                     seed=0,
                                     target1=args.target1,
                                     target2=args.target2,
                                     target3=args.target3)

        ob = env.reset()
        while True:
            action = pi.act(stochastic=False, ob=ob)[0]
            ob, _, done, _ = env.step(action)
            env.render()
            if done:
                ob = env.reset()
Example #29
    def load(path):
        with open(path, "rb") as f:
            model_data, act_params = cloudpickle.load(f)
        act = deepqn.build_act(**act_params)
        sess = tf.Session()
        sess.__enter__()
        with tempfile.TemporaryDirectory() as td:
            arc_path = os.path.join(td, "packed.zip")
            with open(arc_path, "wb") as f:
                f.write(model_data)

            zipfile.ZipFile(arc_path, 'r', zipfile.ZIP_DEFLATED).extractall(td)
            U.load_state(os.path.join(td, "model"))
        return ActWrapper(act, act_params)
Example #30
def main():
    parser = argparse.ArgumentParser()
    logger.configure()
    parser.add_argument('--env',
                        type=str,
                        help="The Gym environement ID",
                        default="AttFC_GyroErr-MotorVel_M4_Con-v0")
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    parser.add_argument('--model-path',
                        default=os.path.join(
                            '/root/code/nti/gymfc/humanoid_policy', 'hum'))
    parser.add_argument('--play', action="store_true", default=False)
    parser.add_argument('--num-timesteps', type=int, default=int(2e6))
    current_dir = os.path.dirname(__file__)
    config_path = os.path.join(current_dir, "../configs/iris.config")
    print("Loading config from ", config_path)
    os.environ["GYMFC_CONFIG"] = config_path
    args = parser.parse_args()

    if not args.play:
        # train the model
        train(num_timesteps=args.num_timesteps,
              seed=args.seed,
              model_path=args.model_path,
              env_id=args.env)
    else:
        print(" Making env=", args.env)
        # construct the model object, load pre-trained model and render
        pi = train(num_timesteps=1, seed=args.seed, env_id=args.env)
        U.load_state(args.model_path)

        env = gym.make(args.env)
        # env.render()
        ob = env.reset()
        actuals = []
        desireds = []
        while True:
            desired = env.omega_target  # [0., 0., 0.]
            actual = env.omega_actual  # -[0., 0., 0.]
            actuals.append(actual)
            desireds.append(desired)
            print("sp=", desired, " rate=", actual)
            action = pi.act(stochastic=False, ob=ob)[0]
            ob, _, done, _ = env.step(action)
            if done:
                break
        print(np.array(desireds))
        print(np.array(actuals))
        plot_step_response(np.array(desireds), np.array(actuals))
Example #32
  def load(path, act_params, num_cpu=16):
    with open(path, "rb") as f:
      model_data = dill.load(f)
    act = deepq.build_act(**act_params)
    sess = U.make_session(num_cpu=num_cpu)
    sess.__enter__()
    with tempfile.TemporaryDirectory() as td:
      arc_path = os.path.join(td, "packed.zip")
      with open(arc_path, "wb") as f:
        f.write(model_data)

      zipfile.ZipFile(arc_path, 'r', zipfile.ZIP_DEFLATED).extractall(td)
      U.load_state(os.path.join(td, "model"))

    return ActWrapper(act)
Example #33
    def load(path):
        with open(path, "rb") as f:
            model_data, act_params = cloudpickle.load(f)
        act = deepq.build_act(**act_params)
        sess = tf.Session()
        sess.__enter__()
        with tempfile.TemporaryDirectory() as td:
            arc_path = os.path.join(td, "packed.zip")
            with open(arc_path, "wb") as f:
                f.write(model_data)

            zipfile.ZipFile(arc_path, 'r', zipfile.ZIP_DEFLATED).extractall(td)
            load_state(os.path.join(td, "model"))

        return ActWrapper(act, act_params)
Example #34
def maybe_load_model(savedir, container):
    """Load model if present at the specified path."""
    if savedir is None:
        return

    state_path = os.path.join(savedir, 'training_state.pkl.zip')
    if container is not None:
        logger.log("Attempting to download model from Azure")
        found_model = container.get(savedir, 'training_state.pkl.zip')
    else:
        found_model = os.path.exists(state_path)
    if found_model:
        state = pickle_load(state_path, compression=True)
        model_dir = "model-{}".format(state["num_iters"])
        if container is not None:
            container.get(savedir, model_dir)
        U.load_state(os.path.join(savedir, model_dir, "saved"))
        logger.log("Loaded models checkpoint at {} iterations".format(state["num_iters"]))
        return state
Example #35
def runner(env, policy_func, load_model_path, timesteps_per_batch, number_trajs,
           stochastic_policy, save=False, reuse=False):

    # Setup network
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space, ac_space, reuse=reuse)
    U.initialize()
    # Prepare for rollouts
    # ----------------------------------------
    U.load_state(load_model_path)

    obs_list = []
    acs_list = []
    len_list = []
    ret_list = []
    for _ in tqdm(range(number_trajs)):
        traj = traj_1_generator(pi, env, timesteps_per_batch, stochastic=stochastic_policy)
        obs, acs, ep_len, ep_ret = traj['ob'], traj['ac'], traj['ep_len'], traj['ep_ret']
        obs_list.append(obs)
        acs_list.append(acs)
        len_list.append(ep_len)
        ret_list.append(ep_ret)
    if stochastic_policy:
        print('stochastic policy:')
    else:
        print('deterministic policy:')
    if save:
        filename = load_model_path.split('/')[-1] + '.' + env.spec.id
        np.savez(filename, obs=np.array(obs_list), acs=np.array(acs_list),
                 lens=np.array(len_list), rets=np.array(ret_list))
    avg_len = sum(len_list)/len(len_list)
    avg_ret = sum(ret_list)/len(ret_list)
    print("Average length:", avg_len)
    print("Average return:", avg_ret)
    return avg_len, avg_ret
Example #36
def main():
    logger.configure()
    parser = mujoco_arg_parser()
    parser.add_argument('--model-path', default=os.path.join(logger.get_dir(), 'humanoid_policy'))
    parser.set_defaults(num_timesteps=int(2e7))
   
    args = parser.parse_args()
    
    if not args.play:
        # train the model
        train(num_timesteps=args.num_timesteps, seed=args.seed, model_path=args.model_path)
    else:       
        # construct the model object, load pre-trained model and render
        pi = train(num_timesteps=1, seed=args.seed)
        U.load_state(args.model_path)
        env = make_mujoco_env('Humanoid-v2', seed=0)

        ob = env.reset()        
        while True:
            action = pi.act(stochastic=False, ob=ob)[0]
            ob, _, done, _ = env.step(action)
            env.render()
            if done:
                ob = env.reset()
Example #37
def learn(env,
          q_func,
          lr=5e-4,
          max_timesteps=100000,
          buffer_size=50000,
          exploration_fraction=0.1,
          exploration_final_eps=0.02,
          train_freq=1,
          batch_size=32,
          print_freq=1,
          checkpoint_freq=10000,
          learning_starts=1000,
          gamma=1.0,
          target_network_update_freq=500,
          prioritized_replay=False,
          prioritized_replay_alpha=0.6,
          prioritized_replay_beta0=0.4,
          prioritized_replay_beta_iters=None,
          prioritized_replay_eps=1e-6,
          num_cpu=16,
          param_noise=False,
          callback=None):
    """Train a deepq model.

    Parameters
    -------
    env: gym.Env
        environment to train on
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    lr: float
        learning rate for adam optimizer
    max_timesteps: int
        number of env steps to optimize for
    buffer_size: int
        size of the replay buffer
    exploration_fraction: float
        fraction of entire training period over which the exploration rate is annealed
    exploration_final_eps: float
        final value of random action probability
    train_freq: int
        update the model every `train_freq` steps.
    batch_size: int
        size of a batch sampled from the replay buffer for training
    print_freq: int
        how often to print out training progress
        set to None to disable printing
    checkpoint_freq: int
        how often to save the model. This is so that the best version is restored
        at the end of the training. If you do not wish to restore the best version at
        the end of the training set this variable to None.
    learning_starts: int
        how many steps of the model to collect transitions for before learning starts
    gamma: float
        discount factor
    target_network_update_freq: int
        update the target network every `target_network_update_freq` steps.
    prioritized_replay: bool
        if True prioritized replay buffer will be used.
    prioritized_replay_alpha: float
        alpha parameter for prioritized replay buffer
    prioritized_replay_beta0: float
        initial value of beta for prioritized replay buffer
    prioritized_replay_beta_iters: int
        number of iterations over which beta will be annealed from initial value
        to 1.0. If set to None, this defaults to max_timesteps.
    prioritized_replay_eps: float
        epsilon to add to the TD errors when updating priorities.
    num_cpu: int
        number of cpus to use for training
    callback: (locals, globals) -> None
        function called at every step with the state of the algorithm.
        If callback returns true, training stops.

    Returns
    -------
    act: ActWrapper
        Wrapper over act function. Adds ability to save it and load it.
        See header of baselines/deepq/categorical.py for details on the act function.
    """
    # Create all the functions necessary to train the model

    sess = U.make_session(num_cpu=num_cpu)
    sess.__enter__()

    def make_obs_ph(name):
        return U.BatchInput(env.observation_space.shape, name=name)

    act, train, update_target, debug = deepq.build_train(
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=10,
        param_noise=param_noise
    )
    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': env.action_space.n,
    }

    # Create the replay buffer
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None
    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()

    episode_rewards = [0.0]
    saved_mean_reward = None
    obs = env.reset()
    reset = True
    with tempfile.TemporaryDirectory() as td:
        model_saved = False
        model_file = os.path.join(td, "model")
        for t in range(max_timesteps):
            if callback is not None:
                if callback(locals(), globals()):
                    break
            # Take action and update exploration to the newest value
            kwargs = {}
            if not param_noise:
                update_eps = exploration.value(t)
                update_param_noise_threshold = 0.
            else:
                update_eps = 0.
                # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                # for detailed explanation.
                update_param_noise_threshold = -np.log(1. - exploration.value(t) + exploration.value(t) / float(env.action_space.n))
                kwargs['reset'] = reset
                kwargs['update_param_noise_threshold'] = update_param_noise_threshold
                kwargs['update_param_noise_scale'] = True
            action = act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0]
            reset = False
            new_obs, rew, done, _ = env.step(action)
            # Store transition in the replay buffer.
            replay_buffer.add(obs, action, rew, new_obs, float(done))
            obs = new_obs

            episode_rewards[-1] += rew
            if done:
                obs = env.reset()
                episode_rewards.append(0.0)
                reset = True

            if t > learning_starts and t % train_freq == 0:
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                if prioritized_replay:
                    experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(t))
                    (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience
                else:
                    obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)
                    weights, batch_idxes = np.ones_like(rewards), None
                td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights)
                if prioritized_replay:
                    new_priorities = np.abs(td_errors) + prioritized_replay_eps
                    replay_buffer.update_priorities(batch_idxes, new_priorities)

            if t > learning_starts and t % target_network_update_freq == 0:
                # Update target network periodically.
                update_target()

            mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
            num_episodes = len(episode_rewards)
            if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", num_episodes)
                logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
                logger.record_tabular("% time spent exploring", int(100 * exploration.value(t)))
                logger.dump_tabular()

            if (checkpoint_freq is not None and t > learning_starts and
                    num_episodes > 100 and t % checkpoint_freq == 0):
                if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                    if print_freq is not None:
                        logger.log("Saving model due to mean reward increase: {} -> {}".format(
                                   saved_mean_reward, mean_100ep_reward))
                    U.save_state(model_file)
                    model_saved = True
                    saved_mean_reward = mean_100ep_reward
        if model_saved:
            if print_freq is not None:
                logger.log("Restored model with mean reward: {}".format(saved_mean_reward))
            U.load_state(model_file)

    return ActWrapper(act, act_params)
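A minimal, hypothetical driver for the `learn` function above (Example #37), illustrating the API described in its docstring. It is not taken from any of the listed repositories: it assumes `learn` is importable from the module containing this example, that the legacy `baselines.deepq.models.mlp` helper is available to build `q_func`, and that the returned `ActWrapper` is callable like the `act` function from `deepq.build_act`; the environment and hyperparameters are illustrative only.

import gym
import numpy as np
from baselines import deepq

def run_cartpole_training():
    # Illustrative environment; any discrete-action gym.Env works here.
    env = gym.make("CartPole-v0")
    # deepq.models.mlp builds a q_func with the
    # (observation_in, num_actions, scope, reuse) signature that learn() expects.
    q_func = deepq.models.mlp([64])
    act = learn(
        env,
        q_func=q_func,
        lr=1e-3,
        max_timesteps=100000,
        exploration_fraction=0.1,
        exploration_final_eps=0.02,
        print_freq=10,
    )
    # Greedy rollout with the returned ActWrapper (assumed to accept a batched
    # observation and a `stochastic` flag, as in upstream baselines).
    obs, done = env.reset(), False
    while not done:
        action = act(np.array(obs)[None], stochastic=False)[0]
        obs, _, done, _ = env.step(action)

if __name__ == "__main__":
    run_cartpole_training()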
Example #38
 def load(self, load_path):
     tf_util.load_state(load_path, sess=self.sess)
Example #39
    video_recorder = VideoRecorder(
        env, video_path, enabled=video_path is not None)
    obs = env.reset()
    while True:
        env.unwrapped.render()
        video_recorder.capture_frame()
        action = act(np.array(obs)[None], stochastic=stochastic)[0]
        obs, rew, done, info = env.step(action)
        if done:
            obs = env.reset()
        if len(info["rewards"]) > num_episodes:
            if len(info["rewards"]) == 1 and video_recorder.enabled:
                # save video of first episode
                print("Saved video.")
                video_recorder.close()
                video_recorder.enabled = False
            print(info["rewards"][-1])
            num_episodes = len(info["rewards"])


if __name__ == '__main__':
    with U.make_session(4) as sess:
        args = parse_args()
        env = make_env(args.env)
        act = deepq.build_act(
            make_obs_ph=lambda name: U.Uint8Input(env.observation_space.shape, name=name),
            q_func=dueling_model if args.dueling else model,
            num_actions=env.action_space.n)
        U.load_state(os.path.join(args.model_dir, "saved"))
        play(env, act, args.stochastic, args.video)
Example #40
def learn(env, policy_func, reward_giver, expert_dataset, rank,
          pretrained, pretrained_weight, *,
          g_step, d_step, entcoeff, save_per_iter,
          ckpt_dir, log_dir, timesteps_per_batch, task_name,
          gamma, lam,
          max_kl, cg_iters, cg_damping=1e-2,
          vf_stepsize=3e-4, d_stepsize=3e-4, vf_iters=3,
          max_timesteps=0, max_episodes=0, max_iters=0,
          callback=None
          ):

    nworkers = MPI.COMM_WORLD.Get_size()
    rank = MPI.COMM_WORLD.Get_rank()
    np.set_printoptions(precision=3)
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space, ac_space, reuse=(pretrained_weight != None))
    oldpi = policy_func("oldpi", ob_space, ac_space)
    atarg = tf.placeholder(dtype=tf.float32, shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    entbonus = entcoeff * meanent

    vferr = tf.reduce_mean(tf.square(pi.vpred - ret))

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # advantage * pnew / pold
    surrgain = tf.reduce_mean(ratio * atarg)

    optimgain = surrgain + entbonus
    losses = [optimgain, meankl, entbonus, surrgain, meanent]
    loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"]

    dist = meankl

    all_var_list = pi.get_trainable_variables()
    var_list = [v for v in all_var_list if v.name.startswith("pi/pol") or v.name.startswith("pi/logstd")]
    vf_var_list = [v for v in all_var_list if v.name.startswith("pi/vff")]
    assert len(var_list) == len(vf_var_list) + 1
    d_adam = MpiAdam(reward_giver.get_trainable_variables())
    vfadam = MpiAdam(vf_var_list)

    get_flat = U.GetFlat(var_list)
    set_from_flat = U.SetFromFlat(var_list)
    klgrads = tf.gradients(dist, var_list)
    flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan")
    shapes = [var.get_shape().as_list() for var in var_list]
    start = 0
    tangents = []
    for shape in shapes:
        sz = U.intprod(shape)
        tangents.append(tf.reshape(flat_tangent[start:start+sz], shape))
        start += sz
    gvp = tf.add_n([tf.reduce_sum(g*tangent) for (g, tangent) in zipsame(klgrads, tangents)])  # pylint: disable=E1111
    fvp = U.flatgrad(gvp, var_list)

    assign_old_eq_new = U.function([], [], updates=[tf.assign(oldv, newv)
                                                    for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())])
    compute_losses = U.function([ob, ac, atarg], losses)
    compute_lossandgrad = U.function([ob, ac, atarg], losses + [U.flatgrad(optimgain, var_list)])
    compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp)
    compute_vflossandgrad = U.function([ob, ret], U.flatgrad(vferr, vf_var_list))

    @contextmanager
    def timed(msg):
        if rank == 0:
            print(colorize(msg, color='magenta'))
            tstart = time.time()
            yield
            print(colorize("done in %.3f seconds" % (time.time() - tstart), color='magenta'))
        else:
            yield

    def allmean(x):
        assert isinstance(x, np.ndarray)
        out = np.empty_like(x)
        MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM)
        out /= nworkers
        return out

    U.initialize()
    th_init = get_flat()
    MPI.COMM_WORLD.Bcast(th_init, root=0)
    set_from_flat(th_init)
    d_adam.sync()
    vfadam.sync()
    if rank == 0:
        print("Init param sum", th_init.sum(), flush=True)

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi, env, reward_giver, timesteps_per_batch, stochastic=True)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=40)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=40)  # rolling buffer for episode rewards
    true_rewbuffer = deque(maxlen=40)

    assert sum([max_iters > 0, max_timesteps > 0, max_episodes > 0]) == 1

    g_loss_stats = stats(loss_names)
    d_loss_stats = stats(reward_giver.loss_name)
    ep_stats = stats(["True_rewards", "Rewards", "Episode_length"])
    # if provide pretrained weight
    if pretrained_weight is not None:
        U.load_state(pretrained_weight, var_list=pi.get_variables())

    while True:
        if callback: callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break

        # Save model
        if rank == 0 and iters_so_far % save_per_iter == 0 and ckpt_dir is not None:
            fname = os.path.join(ckpt_dir, task_name)
            os.makedirs(os.path.dirname(fname), exist_ok=True)
            saver = tf.train.Saver()
            saver.save(tf.get_default_session(), fname)

        logger.log("********** Iteration %i ************" % iters_so_far)

        def fisher_vector_product(p):
            return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p
        # ------------------ Update G ------------------
        logger.log("Optimizing Policy...")
        for _ in range(g_step):
            with timed("sampling"):
                seg = seg_gen.__next__()
            add_vtarg_and_adv(seg, gamma, lam)
            # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
            ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"]
            vpredbefore = seg["vpred"]  # predicted value function before udpate
            atarg = (atarg - atarg.mean()) / atarg.std()  # standardized advantage function estimate

            if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob)  # update running mean/std for policy

            args = seg["ob"], seg["ac"], atarg
            fvpargs = [arr[::5] for arr in args]

            assign_old_eq_new()  # set old parameter values to new parameter values
            with timed("computegrad"):
                *lossbefore, g = compute_lossandgrad(*args)
            lossbefore = allmean(np.array(lossbefore))
            g = allmean(g)
            if np.allclose(g, 0):
                logger.log("Got zero gradient. not updating")
            else:
                with timed("cg"):
                    stepdir = cg(fisher_vector_product, g, cg_iters=cg_iters, verbose=rank == 0)
                assert np.isfinite(stepdir).all()
                shs = .5*stepdir.dot(fisher_vector_product(stepdir))
                lm = np.sqrt(shs / max_kl)
                # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g))
                fullstep = stepdir / lm
                expectedimprove = g.dot(fullstep)
                surrbefore = lossbefore[0]
                stepsize = 1.0
                thbefore = get_flat()
                for _ in range(10):
                    thnew = thbefore + fullstep * stepsize
                    set_from_flat(thnew)
                    meanlosses = surr, kl, *_ = allmean(np.array(compute_losses(*args)))
                    improve = surr - surrbefore
                    logger.log("Expected: %.3f Actual: %.3f" % (expectedimprove, improve))
                    if not np.isfinite(meanlosses).all():
                        logger.log("Got non-finite value of losses -- bad!")
                    elif kl > max_kl * 1.5:
                        logger.log("violated KL constraint. shrinking step.")
                    elif improve < 0:
                        logger.log("surrogate didn't improve. shrinking step.")
                    else:
                        logger.log("Stepsize OK!")
                        break
                    stepsize *= .5
                else:
                    logger.log("couldn't compute a good step")
                    set_from_flat(thbefore)
                if nworkers > 1 and iters_so_far % 20 == 0:
                    paramsums = MPI.COMM_WORLD.allgather((thnew.sum(), vfadam.getflat().sum()))  # list of tuples
                    assert all(np.allclose(ps, paramsums[0]) for ps in paramsums[1:])
            with timed("vf"):
                for _ in range(vf_iters):
                    for (mbob, mbret) in dataset.iterbatches((seg["ob"], seg["tdlamret"]),
                                                             include_final_partial_batch=False, batch_size=128):
                        if hasattr(pi, "ob_rms"):
                            pi.ob_rms.update(mbob)  # update running mean/std for policy
                        g = allmean(compute_vflossandgrad(mbob, mbret))
                        vfadam.update(g, vf_stepsize)

        g_losses = meanlosses
        for (lossname, lossval) in zip(loss_names, meanlosses):
            logger.record_tabular(lossname, lossval)
        logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret))
        # ------------------ Update D ------------------
        logger.log("Optimizing Discriminator...")
        logger.log(fmt_row(13, reward_giver.loss_name))
        ob_expert, ac_expert = expert_dataset.get_next_batch(len(ob))
        batch_size = len(ob) // d_step
        d_losses = []  # list of tuples, each of which gives the loss for a minibatch
        for ob_batch, ac_batch in dataset.iterbatches((ob, ac),
                                                      include_final_partial_batch=False,
                                                      batch_size=batch_size):
            ob_expert, ac_expert = expert_dataset.get_next_batch(len(ob_batch))
            # update running mean/std for reward_giver
            if hasattr(reward_giver, "obs_rms"): reward_giver.obs_rms.update(np.concatenate((ob_batch, ob_expert), 0))
            *newlosses, g = reward_giver.lossandgrad(ob_batch, ac_batch, ob_expert, ac_expert)
            d_adam.update(allmean(g), d_stepsize)
            d_losses.append(newlosses)
        logger.log(fmt_row(13, np.mean(d_losses, axis=0)))

        lrlocal = (seg["ep_lens"], seg["ep_rets"], seg["ep_true_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews, true_rets = map(flatten_lists, zip(*listoflrpairs))
        true_rewbuffer.extend(true_rets)
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)

        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpTrueRewMean", np.mean(true_rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1

        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)

        if rank == 0:
            logger.dump_tabular()

def learn(env,
          q_func,
          num_actions=4,
          lr=5e-4,
          max_timesteps=100000,
          buffer_size=50000,
          exploration_fraction=0.1,
          exploration_final_eps=0.02,
          train_freq=1,
          batch_size=32,
          print_freq=1,
          checkpoint_freq=10000,
          learning_starts=1000,
          gamma=1.0,
          target_network_update_freq=500,
          prioritized_replay=False,
          prioritized_replay_alpha=0.6,
          prioritized_replay_beta0=0.4,
          prioritized_replay_beta_iters=None,
          prioritized_replay_eps=1e-6,
          num_cpu=16,
          param_noise=False,
          param_noise_threshold=0.05,
          callback=None):
  """Train a deepq model.

Parameters
-------
env: pysc2.env.SC2Env
    environment to train on
q_func: (tf.Variable, int, str, bool) -> tf.Variable
    the model that takes the following inputs:
        observation_in: object
            the output of observation placeholder
        num_actions: int
            number of actions
        scope: str
        reuse: bool
            should be passed to outer variable scope
    and returns a tensor of shape (batch_size, num_actions) with values of every action.
lr: float
    learning rate for adam optimizer
max_timesteps: int
    number of env steps to optimize for
buffer_size: int
    size of the replay buffer
exploration_fraction: float
    fraction of entire training period over which the exploration rate is annealed
exploration_final_eps: float
    final value of random action probability
train_freq: int
    update the model every `train_freq` steps.
batch_size: int
    size of a batch sampled from the replay buffer for training
print_freq: int
    how often to print out training progress
    set to None to disable printing
checkpoint_freq: int
    how often to save the model. This is so that the best version is restored
    at the end of training. If you do not wish to restore the best version
    at the end of training, set this variable to None.
learning_starts: int
    how many env steps to collect transitions for before learning starts
gamma: float
    discount factor
target_network_update_freq: int
    update the target network every `target_network_update_freq` steps.
prioritized_replay: bool
    if True prioritized replay buffer will be used.
prioritized_replay_alpha: float
    alpha parameter for prioritized replay buffer
prioritized_replay_beta0: float
    initial value of beta for prioritized replay buffer
prioritized_replay_beta_iters: int
    number of iterations over which beta will be annealed from initial value
    to 1.0. If set to None, it defaults to max_timesteps.
prioritized_replay_eps: float
    epsilon to add to the TD errors when updating priorities.
num_cpu: int
    number of cpus to use for training
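param_noise: bool
    whether to use parameter space noise for exploration instead of epsilon-greedy
param_noise_threshold: float
    fixed distance threshold for adaptive parameter noise; if negative, the threshold
    is derived from the current value of the exploration schedule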
callback: (locals, globals) -> None
    function called at every step with the state of the algorithm.
    If the callback returns true, training stops.

Returns
-------
act: ActWrapper
    Wrapper over act function. Adds ability to save it and load it.
    See header of baselines/deepq/categorical.py for details on the act function.
"""
  # Create all the functions necessary to train the model

  sess = U.make_session(num_cpu=num_cpu)
  sess.__enter__()

  def make_obs_ph(name):
    return U.BatchInput((32, 32), name=name)

  act, train, update_target, debug = deepq.build_train(
    make_obs_ph=make_obs_ph,
    q_func=q_func,
    num_actions=num_actions,
    optimizer=tf.train.AdamOptimizer(learning_rate=lr),
    gamma=gamma,
    grad_norm_clipping=10,
    scope="deepq")
  #
  # act_y, train_y, update_target_y, debug_y = deepq.build_train(
  #   make_obs_ph=make_obs_ph,
  #   q_func=q_func,
  #   num_actions=num_actions,
  #   optimizer=tf.train.AdamOptimizer(learning_rate=lr),
  #   gamma=gamma,
  #   grad_norm_clipping=10,
  #   scope="deepq_y"
  # )

  act_params = {
    'make_obs_ph': make_obs_ph,
    'q_func': q_func,
    'num_actions': num_actions,
  }

  # Create the replay buffer
  if prioritized_replay:
    replay_buffer = PrioritizedReplayBuffer(
      buffer_size, alpha=prioritized_replay_alpha)
    # replay_buffer_y = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha)

    if prioritized_replay_beta_iters is None:
      prioritized_replay_beta_iters = max_timesteps
    beta_schedule = LinearSchedule(
      prioritized_replay_beta_iters,
      initial_p=prioritized_replay_beta0,
      final_p=1.0)

    # beta_schedule_y = LinearSchedule(prioritized_replay_beta_iters,
    #                                  initial_p=prioritized_replay_beta0,
    #                                  final_p=1.0)
  else:
    replay_buffer = ReplayBuffer(buffer_size)
    # replay_buffer_y = ReplayBuffer(buffer_size)

    beta_schedule = None
    # beta_schedule_y = None
  # Create the schedule for exploration starting from 1.
  exploration = LinearSchedule(
    schedule_timesteps=int(exploration_fraction * max_timesteps),
    initial_p=1.0,
    final_p=exploration_final_eps)

  # Initialize the parameters and copy them to the target network.
  U.initialize()
  update_target()
  # update_target_y()

  episode_rewards = [0.0]
  saved_mean_reward = None

  obs = env.reset()
  # Select all marines first
  obs = env.step(
    actions=[sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL])])

  player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE]

  screen = (player_relative == _PLAYER_NEUTRAL).astype(int)  #+ path_memory

  player_y, player_x = (player_relative == _PLAYER_FRIENDLY).nonzero()
  player = [int(player_x.mean()), int(player_y.mean())]
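  # Recenter the 32x32 screen on the marines' mean position so the agent always
  # observes itself at (16, 16); the same shift is reapplied after every env step.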

  if (player[0] > 16):
    screen = shift(LEFT, player[0] - 16, screen)
  elif (player[0] < 16):
    screen = shift(RIGHT, 16 - player[0], screen)

  if (player[1] > 16):
    screen = shift(UP, player[1] - 16, screen)
  elif (player[1] < 16):
    screen = shift(DOWN, 16 - player[1], screen)

  reset = True
  with tempfile.TemporaryDirectory() as td:
    model_saved = False
    model_file = os.path.join("model/", "mineral_shards")
    print(model_file)

    for t in range(max_timesteps):
      if callback is not None:
        if callback(locals(), globals()):
          break
      # Take action and update exploration to the newest value
      kwargs = {}
      if not param_noise:
        update_eps = exploration.value(t)
        update_param_noise_threshold = 0.
      else:
        update_eps = 0.
        if param_noise_threshold >= 0.:
          update_param_noise_threshold = param_noise_threshold
        else:
          # Compute the threshold such that the KL divergence between perturbed and non-perturbed
          # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
          # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
          # for detailed explanation.
          update_param_noise_threshold = -np.log(
            1. - exploration.value(t) +
            exploration.value(t) / float(num_actions))
        kwargs['reset'] = reset
        kwargs[
          'update_param_noise_threshold'] = update_param_noise_threshold
        kwargs['update_param_noise_scale'] = True

      action = act(
        np.array(screen)[None], update_eps=update_eps, **kwargs)[0]

      # action_y = act_y(np.array(screen)[None], update_eps=update_eps, **kwargs)[0]

      reset = False

      coord = [player[0], player[1]]
      rew = 0
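      # Discrete actions 0-3 move the selected marines 8 pixels up/down/left/right,
      # with the target coordinate clamped to the 32x32 screen bounds.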

      if (action == 0):  #UP

        if (player[1] >= 8):
          coord = [player[0], player[1] - 8]
          #path_memory_[player[1] - 16 : player[1], player[0]] = -1
        elif (player[1] > 0):
          coord = [player[0], 0]
          #path_memory_[0 : player[1], player[0]] = -1
          #else:
          #  rew -= 1

      elif (action == 1):  #DOWN

        if (player[1] <= 23):
          coord = [player[0], player[1] + 8]
          #path_memory_[player[1] : player[1] + 16, player[0]] = -1
        elif (player[1] > 23):
          coord = [player[0], 31]
          #path_memory_[player[1] : 63, player[0]] = -1
          #else:
          #  rew -= 1

      elif (action == 2):  #LEFT

        if (player[0] >= 8):
          coord = [player[0] - 8, player[1]]
          #path_memory_[player[1], player[0] - 16 : player[0]] = -1
        elif (player[0] < 8):
          coord = [0, player[1]]
          #path_memory_[player[1], 0 : player[0]] = -1
          #else:
          #  rew -= 1

      elif (action == 3):  #RIGHT

        if (player[0] <= 23):
          coord = [player[0] + 8, player[1]]
          #path_memory_[player[1], player[0] : player[0] + 16] = -1
        elif (player[0] > 23):
          coord = [31, player[1]]
          #path_memory_[player[1], player[0] : 63] = -1

      if _MOVE_SCREEN not in obs[0].observation["available_actions"]:
        obs = env.step(actions=[
          sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL])
        ])

      new_action = [
        sc2_actions.FunctionCall(_MOVE_SCREEN, [_NOT_QUEUED, coord])
      ]

      # else:
      #   new_action = [sc2_actions.FunctionCall(_NO_OP, [])]

      obs = env.step(actions=new_action)

      player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE]
      new_screen = (player_relative == _PLAYER_NEUTRAL).astype(
        int)  #+ path_memory

      player_y, player_x = (
        player_relative == _PLAYER_FRIENDLY).nonzero()
      player = [int(player_x.mean()), int(player_y.mean())]

      if (player[0] > 16):
        new_screen = shift(LEFT, player[0] - 16, new_screen)
      elif (player[0] < 16):
        new_screen = shift(RIGHT, 16 - player[0], new_screen)

      if (player[1] > 16):
        new_screen = shift(UP, player[1] - 16, new_screen)
      elif (player[1] < 16):
        new_screen = shift(DOWN, 16 - player[1], new_screen)

      rew = obs[0].reward

      done = obs[0].step_type == environment.StepType.LAST

      # Store transition in the replay buffer.
      replay_buffer.add(screen, action, rew, new_screen, float(done))
      # replay_buffer_y.add(screen, action_y, rew, new_screen, float(done))

      screen = new_screen

      episode_rewards[-1] += rew
      reward = episode_rewards[-1]

      if done:
        obs = env.reset()
        player_relative = obs[0].observation["screen"][
          _PLAYER_RELATIVE]

        screen = (player_relative == _PLAYER_NEUTRAL).astype(
          int)  #+ path_memory

        player_y, player_x = (
          player_relative == _PLAYER_FRIENDLY).nonzero()
        player = [int(player_x.mean()), int(player_y.mean())]

        # Select all marines first
        env.step(actions=[
          sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL])
        ])
        episode_rewards.append(0.0)
        #episode_minerals.append(0.0)

        reset = True

      if t > learning_starts and t % train_freq == 0:
        # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
        if prioritized_replay:

          experience = replay_buffer.sample(
            batch_size, beta=beta_schedule.value(t))
          (obses_t, actions, rewards, obses_tp1, dones, weights,
           batch_idxes) = experience

          # experience_y = replay_buffer.sample(batch_size, beta=beta_schedule.value(t))
          # (obses_t_y, actions_y, rewards_y, obses_tp1_y, dones_y, weights_y, batch_idxes_y) = experience_y
        else:

          obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
            batch_size)
          weights, batch_idxes = np.ones_like(rewards), None

          # obses_t_y, actions_y, rewards_y, obses_tp1_y, dones_y = replay_buffer_y.sample(batch_size)
          # weights_y, batch_idxes_y = np.ones_like(rewards_y), None

        td_errors = train(obses_t, actions, rewards, obses_tp1, dones,
                          weights)

        # td_errors_y = train_x(obses_t_y, actions_y, rewards_y, obses_tp1_y, dones_y, weights_y)

        if prioritized_replay:
          new_priorities = np.abs(td_errors) + prioritized_replay_eps
          # new_priorities = np.abs(td_errors) + prioritized_replay_eps
          replay_buffer.update_priorities(batch_idxes,
                                          new_priorities)
          # replay_buffer.update_priorities(batch_idxes, new_priorities)

      if t > learning_starts and t % target_network_update_freq == 0:
        # Update target network periodically.
        update_target()
        # update_target_y()

      mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
      num_episodes = len(episode_rewards)
      if done and print_freq is not None and len(
          episode_rewards) % print_freq == 0:
        logger.record_tabular("steps", t)
        logger.record_tabular("episodes", num_episodes)
        logger.record_tabular("reward", reward)
        logger.record_tabular("mean 100 episode reward",
                              mean_100ep_reward)
        logger.record_tabular("% time spent exploring",
                              int(100 * exploration.value(t)))
        logger.dump_tabular()

      if (checkpoint_freq is not None and t > learning_starts
          and num_episodes > 100 and t % checkpoint_freq == 0):
        if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
          if print_freq is not None:
            logger.log(
              "Saving model due to mean reward increase: {} -> {}".
                format(saved_mean_reward, mean_100ep_reward))
          U.save_state(model_file)
          model_saved = True
          saved_mean_reward = mean_100ep_reward
    if model_saved:
      if print_freq is not None:
        logger.log("Restored model with mean reward: {}".format(
          saved_mean_reward))
      U.load_state(model_file)

  return ActWrapper(act)
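
# A minimal usage sketch (not part of the original example). It assumes a pysc2
# version whose SC2Env still accepts screen_size_px, and it builds q_func with
# baselines' deepq.models.mlp, which flattens the (32, 32) observation placeholder
# created by make_obs_ph above. Map name, step_mul and layer sizes are illustrative.
#
# from pysc2.env import sc2_env
#
# def demo_train():
#   env = sc2_env.SC2Env(map_name="CollectMineralShards",
#                        step_mul=8,
#                        screen_size_px=(32, 32),
#                        minimap_size_px=(32, 32),
#                        visualize=False)
#   q_func = deepq.models.mlp([256, 128])  # two hidden layers on the flattened screen
#   act = learn(env,
#               q_func=q_func,
#               num_actions=4,
#               max_timesteps=100000,
#               print_freq=10)
#   return act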