def learn(policy, env, seed, nsteps=20, nstack=4, total_timesteps=int(80e6), q_coef=0.5, ent_coef=0.01,
          max_grad_norm=10, lr=7e-4, lrschedule='linear', rprop_epsilon=1e-5, rprop_alpha=0.99, gamma=0.99,
          log_interval=100, buffer_size=50000, replay_ratio=4, replay_start=10000, c=10.0,
          trust_region=True, alpha=0.99, delta=1):
    print("Running Acer Simple")
    print(locals())
    tf.reset_default_graph()
    set_global_seeds(seed)

    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    num_procs = len(env.remotes)  # HACK
    model = Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nenvs=nenvs, nsteps=nsteps,
                  nstack=nstack, num_procs=num_procs, ent_coef=ent_coef, q_coef=q_coef, gamma=gamma,
                  max_grad_norm=max_grad_norm, lr=lr, rprop_alpha=rprop_alpha, rprop_epsilon=rprop_epsilon,
                  total_timesteps=total_timesteps, lrschedule=lrschedule, c=c,
                  trust_region=trust_region, alpha=alpha, delta=delta)

    runner = Runner(env=env, model=model, nsteps=nsteps, nstack=nstack)
    if replay_ratio > 0:
        buffer = Buffer(env=env, nsteps=nsteps, nstack=nstack, size=buffer_size)
    else:
        buffer = None
    nbatch = nenvs * nsteps
    acer = Acer(runner, model, buffer, log_interval)
    acer.tstart = time.time()
    for acer.steps in range(0, total_timesteps, nbatch):  # nbatch samples, 1 on_policy call and multiple off-policy calls
        acer.call(on_policy=True)
        if replay_ratio > 0 and buffer.has_atleast(replay_start):
            n = np.random.poisson(replay_ratio)
            for _ in range(n):
                acer.call(on_policy=False)  # no simulation steps in this

    env.close()
def __init__(self, env, nsteps, nstack, size):
    self.env = env
    self.nsteps = nsteps
    self.nstack = nstack
    self.size = size
    self.buffer = Buffer(env=env, nsteps=nsteps, nstack=nstack, size=size)
    self.file_dir = None
    self.flag = 3
def learn(network, env, seed=None, nsteps=20, total_timesteps=int(80e6), q_coef=0.5, ent_coef=0.01,
          max_grad_norm=10, lr=7e-4, lrschedule='linear', rprop_epsilon=1e-5, rprop_alpha=0.99, gamma=0.99,
          log_interval=100, buffer_size=50000, replay_ratio=4, replay_start=10000, c=10.0,
          trust_region=True, alpha=0.99, delta=1, load_path=None, **network_kwargs):
    '''
    Main entrypoint for ACER (Actor-Critic with Experience Replay) algorithm (https://arxiv.org/pdf/1611.01224.pdf)
    Train an agent with given network architecture on a given environment using ACER.

    Parameters:
    ----------

    network:            policy network architecture. Either a string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small,
                        conv_only - see baselines.common/models.py for the full list) specifying a standard network
                        architecture, or a function that takes a tensorflow tensor as input and returns a tuple
                        (output_tensor, extra_feed), where output_tensor is the last network layer output and
                        extra_feed is None for feed-forward nets or a dictionary describing how to feed state into
                        the network for recurrent nets. See baselines.common/policies.py/lstm for more details on
                        using recurrent nets in policies.

    env:                environment. Needs to be vectorized for parallel environment simulation.
                        The environments produced by gym.make can be wrapped using the
                        baselines.common.vec_env.DummyVecEnv class.

    nsteps:             int, number of steps of the vectorized environment per update (i.e. batch size is
                        nsteps * nenv, where nenv is the number of environment copies simulated in parallel)
                        (default: 20)

    nstack:             int, size of the frame stack, i.e. the number of frames passed to the step model.
                        Frames are stacked along the channel dimension (last image dimension) (default: 4)

    total_timesteps:    int, number of timesteps (i.e. number of actions taken in the environment) (default: 80M)

    q_coef:             float, value function loss coefficient in the optimization objective (analog of vf_coef
                        for other actor-critic methods)

    ent_coef:           float, policy entropy coefficient in the optimization objective (default: 0.01)

    max_grad_norm:      float, gradient norm clipping coefficient. If set to None, no clipping (default: 10)

    lr:                 float, learning rate for RMSProp (the current implementation has RMSProp hardcoded in)
                        (default: 7e-4)

    lrschedule:         schedule of learning rate. Can be 'linear', 'constant', or a function [0..1] -> [0..1]
                        that takes the fraction of training progress as input and returns the fraction of the
                        learning rate (specified as lr) as output.

    rprop_epsilon:      float, RMSProp epsilon (stabilizes the square root computation in the denominator of the
                        RMSProp update) (default: 1e-5)

    rprop_alpha:        float, RMSProp decay parameter (default: 0.99)

    gamma:              float, reward discounting factor (default: 0.99)

    log_interval:       int, number of updates between logging events (default: 100)

    buffer_size:        int, size of the replay buffer (default: 50k)

    replay_ratio:       int, how many (on average) batches of data to sample from the replay buffer after each
                        batch is collected from the environment (default: 4)

    replay_start:       int, sampling from the replay buffer does not start until the replay buffer has at least
                        that many samples (default: 10k)

    c:                  float, importance weight clipping factor (default: 10)

    trust_region:       bool, whether or not the algorithm estimates the KL divergence between the old and updated
                        policy and uses it to determine the step size (default: True)

    delta:              float, max KL divergence between the old policy and the updated policy (default: 1)

    alpha:              float, momentum factor in the Polyak (exponential moving average) averaging of the model
                        parameters (default: 0.99)

    load_path:          str, path to load the model from (default: None)

    **network_kwargs:   keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy
                        and the arguments to a particular type of network. For instance, the 'mlp' network
                        architecture has arguments num_hidden and num_layers.

    '''

    print("Running Acer Simple")
    print(locals())
    set_global_seeds(seed)

    if not isinstance(env, VecFrameStack):
        env = VecFrameStack(env, 1)

    policy = build_policy(env, network, estimate_q=True, **network_kwargs)
    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space

    nstack = env.nstack
    model = Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nenvs=nenvs, nsteps=nsteps,
                  ent_coef=ent_coef, q_coef=q_coef, gamma=gamma, max_grad_norm=max_grad_norm,
                  lr=lr, rprop_alpha=rprop_alpha, rprop_epsilon=rprop_epsilon,
                  total_timesteps=total_timesteps, lrschedule=lrschedule, c=c,
                  trust_region=trust_region, alpha=alpha, delta=delta)

    runner = Runner(env=env, model=model, nsteps=nsteps)
    if replay_ratio > 0:
        buffer = Buffer(env=env, nsteps=nsteps, size=buffer_size)
    else:
        buffer = None
    nbatch = nenvs * nsteps
    acer = Acer(runner, model, buffer, log_interval)
    acer.tstart = time.time()

    for acer.steps in range(0, total_timesteps, nbatch):  # nbatch samples, 1 on_policy call and multiple off-policy calls
        acer.call(on_policy=True)
        if replay_ratio > 0 and buffer.has_atleast(replay_start):
            n = np.random.poisson(replay_ratio)
            for _ in range(n):
                acer.call(on_policy=False)  # no simulation steps in this

    return model
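# Illustrative sketch (not part of the original source): the docstring above states that lrschedule
# may be a function mapping a training-progress fraction in [0, 1] to a learning-rate fraction in
# [0, 1]. A custom schedule can therefore be a plain Python callable; the call below is commented
# out because it assumes a vectorized environment `venv` built elsewhere.
def sqrt_decay_schedule(progress):
    # Assumes the baselines-style convention where `progress` is the elapsed fraction of training
    # (the built-in 'linear' schedule returns 1 - progress); this variant decays more slowly.
    return (1.0 - progress) ** 0.5

# model = learn(network='cnn', env=venv, lrschedule=sqrt_decay_schedule)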
def learn(policy, env, flags):
    """
    :param policy:
    :param baselines.common.vec_env.VecEnv env:
    :param baselines.acer.flags.AcerFlags flags:
    """
    print("Running Acer Simple")
    print(flags)

    flags.total_timesteps = int(flags.total_timesteps)

    # disable gpu before creating any tensor
    if not flags.use_gpu:
        tf_util.disable_gpu()

    tf.reset_default_graph()
    set_global_seeds(flags.seed)

    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    model = Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nenvs=nenvs, num_procs=nenvs, flags=flags)

    runner = Runner(env=env, model=model, nsteps=flags.nsteps, nstack=flags.nstack)
    if flags.replay_ratio > 0:
        buffer = Buffer(env=env, nsteps=flags.nsteps, nstack=flags.nstack, size=flags.buffer_size)
    else:
        buffer = None
    nbatch = nenvs * flags.nsteps
    acer = Acer(runner, model, buffer, flags.log_interval, flags.stats_interval)

    saver = tf.train.Saver(max_to_keep=3, keep_checkpoint_every_n_hours=flags.permanent_save_hours)
    checkpoint_dir = os.path.join(flags.save_dir, 'checkpoints')
    checkpoint_path = os.path.join(checkpoint_dir, 'model')
    os.makedirs(checkpoint_dir, exist_ok=True)

    # load checkpoint
    latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
    if latest_checkpoint:
        print("Loading model checkpoint: {}".format(latest_checkpoint))
        saver.restore(model.sess, latest_checkpoint)
        start_steps = model.GSwrapper.get(model.sess)
        if hasattr(env, 'restore_state'):
            env.restore_state(checkpoint_dir, start_steps)
    else:
        start_steps = 0

    coordinator = tf.train.Coordinator()

    def signal_handler(sig, frame):
        if not coordinator.should_stop():
            coordinator.request_stop()
            print("Stopping training...")
        else:
            print("Stop already requested, please wait...")

    signal.signal(signal.SIGINT, signal_handler)
    print("Press CTRL+C to stop")

    acer.tstart = time.time()
    for acer.steps in range(start_steps, flags.total_timesteps, nbatch):
        # on policy training
        acer.call(on_policy=True)

        # off policy training
        if flags.replay_ratio > 0 and buffer.has_atleast(flags.replay_start):
            n = np.random.poisson(flags.replay_ratio)
            for _ in range(n):
                acer.call(on_policy=False)  # no simulation steps in this

        # saving
        do_save = (((acer.steps // nbatch) + 1) % flags.save_interval == 0) or coordinator.should_stop()
        if do_save:
            save_steps = acer.steps + nbatch
            print("Saving at t=%s" % save_steps)
            model.GSwrapper.set(model.sess, save_steps)
            saver.save(model.sess, save_path=checkpoint_path, global_step=save_steps)
            if hasattr(env, 'save_state'):
                env.save_state(checkpoint_dir, save_steps)

        if coordinator.should_stop():
            break

    env.close()
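# Illustrative sketch (assumption, not from the original source): the attributes this flags-based
# learn() variant reads from its `flags` argument. The real object is a baselines.acer.flags.AcerFlags
# instance; a SimpleNamespace with the same fields is shown here only to document them, the values are
# placeholders, and Model() may read further fields not listed.
from types import SimpleNamespace

example_flags = SimpleNamespace(
    seed=0,
    use_gpu=True,
    total_timesteps=int(80e6),
    nsteps=20,
    nstack=4,
    replay_ratio=4,
    replay_start=10000,
    buffer_size=50000,
    log_interval=100,
    stats_interval=1000,
    save_dir='./acer_run',
    save_interval=1000,
    permanent_save_hours=4,
)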
def learn(policy, env, seed, env_id, learn_time, expert_buffer_size, perform=False, use_expert=False,
          save_networks=False, network_saving_dir=None, total_timesteps=int(80e6), nsteps=20, nstack=4,
          q_coef=0.5, ent_coef=0.01, max_grad_norm=10, lr=7e-4, lrschedule='linear', rprop_epsilon=1e-5,
          rprop_alpha=0.99, gamma=0.99, log_interval=10, buffer_size=50000, replay_ratio=4,
          replay_start=10000, c=10.0, trust_region=True, alpha=0.99, delta=1):
    print(locals())
    tf.reset_default_graph()
    set_global_seeds(seed)

    nenvs = env.num_envs
    ob_space = env.observation_space  # Box(84, 84, 1)
    ac_space = env.action_space  # Discrete(4)

    if use_expert:
        expert = Expert(env=env, nsteps=nsteps, nstack=nstack, size=expert_buffer_size)  # Exp1: 50000; Exp2: 25000; Exp3: 10000
        expert_dir = os.path.join('./expert') + '/expert.pkl'
        file_dir = '/home/zhangxiaoqin/Projects/conda/atari_v1/'
        # expert.load_file_human(file_dir)
        expert.load_file(expert_dir)
    else:
        expert = None

    num_procs = len(env.remotes)  # HACK
    model = Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nenvs=nenvs, nsteps=nsteps,
                  nstack=nstack, num_procs=num_procs, ent_coef=ent_coef, q_coef=q_coef, gamma=gamma,
                  max_grad_norm=max_grad_norm, lr=lr, rprop_alpha=rprop_alpha, rprop_epsilon=rprop_epsilon,
                  total_timesteps=total_timesteps, lrschedule=lrschedule, c=c, trust_region=trust_region,
                  alpha=alpha, delta=delta, network_saving_dir=network_saving_dir,
                  use_expert=use_expert, expert=expert)

    runner = Runner(env=env, model=model, nsteps=nsteps, nstack=nstack, env_id=env_id)
    if replay_ratio > 0:
        buffer = Buffer(env=env, nsteps=nsteps, nstack=nstack, size=buffer_size)
    else:
        buffer = None

    if perform:
        model.load()

    nbatch = nenvs * nsteps
    acer = Acer(runner, model, buffer, log_interval)
    acer.tstart = time.time()
    for acer.steps in range(0, total_timesteps, nbatch):  # nbatch samples, 1 on_policy call and multiple off-policy calls
        # once learn_time steps have elapsed, stop using the expert demonstrations and train normally
        if acer.steps > learn_time and use_expert:
            print('-------------------------')
            print('Reuse the normal networks')
            print('-------------------------')
            use_expert = False
            expert = None

        acer.call(perform, save_networks, use_expert, expert, on_policy=True)
        if replay_ratio > 0 and buffer.has_atleast(replay_start) and not perform:
            n = np.random.poisson(replay_ratio)
            for _ in range(n):
                acer.call(perform, save_networks, use_expert, expert, on_policy=False)  # no simulation steps in this

    # dir = os.path.join('./models/', 'test.m')
    # model.save('./models/test_2.pkl')

    env.close()
class Expert:
    def __init__(self, env, nsteps, nstack, size):
        self.env = env
        self.nsteps = nsteps
        self.nstack = nstack
        self.size = size
        self.buffer = Buffer(env=env, nsteps=nsteps, nstack=nstack, size=size)
        self.file_dir = None
        self.flag = 3

    def load_file(self, file_dir):
        # load pickled expert transitions and push them into the replay buffer
        self.file_dir = file_dir
        expert_file = open(self.file_dir, 'rb')
        expert_data = pickle.load(expert_file)
        expert_file.close()
        for step_sample in expert_data:
            # print('----------')
            # print(step_sample[5].shape)
            # print('----------')
            self.buffer.put(step_sample[0], step_sample[1], step_sample[2],
                            step_sample[3], step_sample[4], step_sample[5])
            # if self.flag > 0:
            #     print(self.flag, '**************************************')
            #     print(step_sample[0], step_sample[1], step_sample[2], step_sample[3], step_sample[4], step_sample[5])
            #     self.flag = self.flag - 1
        del expert_data
        gc.collect()

    def update_obs(self, obs, dones=None):
        if dones is not None:
            self.obs *= (1 - dones.astype(np.uint8))[:, None, None, None]
        self.obs = np.roll(self.obs, shift=-self.nc, axis=3)
        self.obs[:, :, :, -self.nc:] = obs[:, :, :, :]

    def load_file_human(self, file_dir='/home/zhangxiaoqin/atari_v1/'):
        # NOTE: actions/mus/states/rewards/dones below are expected to come from a policy step;
        # the corresponding calls are commented out, so this loader is incomplete as written.
        import agc.dataset as ds
        import agc.util as util
        import cv2

        env_name = 'spaceinvaders'
        nsteps = 20
        next_file_point = 1
        file_point = np.arange(16, dtype=int)
        frame_point = np.zeros((16), dtype=int)  # f_p[0][0] first_line->file_num, sec_line->frame_num
        dataset = ds.AtariDataset(file_dir)
        all_trajectories = dataset.trajectories
        screenshoot_dir = os.path.join(file_dir, 'screens/spaceinvaders')
        flag = 1

        # pick the first 16 trajectory ids that exist in the dataset
        i = 1
        k = 0
        while k < 16:
            if i in dataset.trajectories['spaceinvaders']:
                file_point[k] = i
                k = k + 1
            i = i + 1

        init_obs = np.zeros((16, 84, 84, 4), dtype=np.uint8)
        enc_obs = np.split(init_obs, 4, axis=3)  # so now list of obs steps
        mb_obs, mb_actions, mb_mus, mb_dones, mb_rewards = [], [], [], [], []

        while flag:
            for _ in range(nsteps):
                # actions, mus, states = self.model.step(self.obs, state=self.states, mask=self.dones)
                obs = np.zeros([16, 84, 84, 1], dtype=np.uint8)
                for i in np.arange(16):
                    pic_path = os.path.join(screenshoot_dir, str(file_point[i]), str(frame_point[i])) + '.png'
                    pic = cv2.imread(pic_path)
                    pic = cv2.cvtColor(pic, cv2.COLOR_RGB2GRAY)
                    pic = cv2.resize(pic, (84, 84), interpolation=cv2.INTER_AREA)
                    obs[i, :, :, :] = pic[:, :, None]
                    if frame_point[i] < all_trajectories['spaceinvaders'][file_point[i]][-1]['frame']:
                        frame_point[i] = frame_point[i] + 1
                    else:
                        # trajectory exhausted: restart frame counter and move on to the next valid trajectory
                        frame_point[i] = 0
                        file_point[i] = next_file_point
                        next_file_point = next_file_point + 1
                        while next_file_point not in dataset.trajectories['spaceinvaders'] and next_file_point <= 514:
                            next_file_point = next_file_point + 1
                        if next_file_point > 514:
                            flag = False

                mb_obs.append(np.copy(self.obs))
                mb_actions.append(actions)
                mb_mus.append(mus)
                mb_dones.append(self.dones)
                # obs, rewards, dones, _ = self.env.step(actions)
                # env.render()
                # aa, bb, cc, dd = self.env_s.step(actions[0])
                # self.env_s.render()
                # if cc == True:
                #     self.env_s.reset()
                # states information for statefull models like LSTM
                self.states = states
                self.dones = dones
                # self.update_obs(obs, dones)
                mb_rewards.append(rewards)
                enc_obs.append(obs)

            mb_obs.append(np.copy(self.obs))
            mb_dones.append(self.dones)

        enc_obs = np.asarray(enc_obs, dtype=np.uint8).swapaxes(1, 0)
        mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0)
        mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
        mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
        mb_mus = np.asarray(mb_mus, dtype=np.float32).swapaxes(1, 0)
        mb_dones = np.asarray(mb_dones, dtype=bool).swapaxes(1, 0)
        mb_masks = mb_dones  # Used for statefull models like LSTM's to mask state when done
        mb_dones = mb_dones[:, 1:]  # Used for calculating returns. The dones array is now aligned with rewards

    def get(self):
        return self.buffer.get()

    def set_tf(self, sess, expert_train_model, ob_space, ac_space, nenvs, nsteps):
        nact = ac_space.n
        nbatch = nenvs * nsteps

        self.A = tf.placeholder(tf.int32, [nbatch])  # actions
        self.D = tf.placeholder(tf.float32, [nbatch])  # dones
        self.R = tf.placeholder(tf.float32, [nbatch])  # rewards, not returns
        self.MU = tf.placeholder(tf.float32, [nbatch, nact])  # mu's
        self.LR = tf.placeholder(tf.float32, [])
        eps = 1e-6

        # step_model = policy(sess, ob_space, ac_space, nenvs, 1, nstack, reuse=False)
        # params = find_trainable_variables("model")
        # print("Params {}".format(len(params)))
        # for var in params:
        #     print(var)

        # create polyak averaged model
        # ema = tf.train.ExponentialMovingAverage(alpha)
        # ema_apply_op = ema.apply(params)

        # Notation: (var) = batch variable, (var)s = sequence variable, (var)_i = variable indexed by action at step i
        v = tf.reduce_sum(tf.stop_gradient(expert_train_model.pi) * expert_train_model.q, axis=-1)  # shape is [nenvs * (nsteps + 1)]
        s_v = tf.reduce_sum(expert_train_model.pi * tf.stop_gradient(expert_train_model.q), axis=-1)
        v = strip(v, nenvs, nsteps, True)
        s_v = strip(s_v, nenvs, nsteps, True)

        # strip off last step
        # f, f_pol, q = map(lambda var: strip(var, nenvs, nsteps), [expert_train_model.pi, expert_polyak_model.pi, expert_train_model.q])
        fq = lambda var: strip(var, nenvs, nsteps)
        q_i = get_by_index(fq(expert_train_model.q), self.A)
        # v = tf.reduce_max(fq(expert_train_model.q), axis=1)

        # one_hot_A = tf.one_hot(self.A, nact)
        # pi = fq(expert_train_model.pi)
        # loss_policy = tf.reduce_mean(tf.square(pi - one_hot_A))

        # Get pi and q values for actions taken
        # v = strip(v, nenvs, nsteps, True)
        # loss_q = -tf.reduce_mean(q_i - tf.reshape(v, [nenvs * nsteps, 1]))
        loss_q = tf.nn.relu(tf.reduce_mean(v - q_i))
        loss_policy = -tf.reduce_mean(s_v - tf.stop_gradient(q_i))

        self.expert_loss = loss_q + loss_policy
        # self.expert_loss = loss_policy
        self.loss_q = loss_q
        self.loss_policy = loss_policy


# batch/sequence helper used by Expert.set_tf: drops the last step of each sequence
def strip(var, nenvs, nsteps, flat=False):
    vars = batch_to_seq(var, nenvs, nsteps + 1, flat)
    return seq_to_batch(vars[:-1], flat)
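# Illustrative sketch (assumption, not from the original source): the per-frame preprocessing that
# load_file_human applies to each Atari Grand Challenge screenshot, isolated into a standalone helper
# so the grayscale + 84x84 downsampling step can be tested on its own. Requires OpenCV (cv2).
import cv2
import numpy as np

def preprocess_screenshot(frame_rgb):
    """Convert an HxWx3 uint8 RGB screenshot to the (84, 84, 1) uint8 observation format used above."""
    gray = cv2.cvtColor(frame_rgb, cv2.COLOR_RGB2GRAY)
    small = cv2.resize(gray, (84, 84), interpolation=cv2.INTER_AREA)
    return small[:, :, None].astype(np.uint8)

# quick self-check on a synthetic Atari-sized frame
_dummy = np.zeros((210, 160, 3), dtype=np.uint8)
assert preprocess_screenshot(_dummy).shape == (84, 84, 1)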
def learn(network, env, seed=None, nsteps=20, total_timesteps=int(80e6), q_coef=0.5, ent_coef=0.01,
          max_grad_norm=10, lr=7e-4, lrschedule='linear', rprop_epsilon=1e-5, rprop_alpha=0.99, gamma=0.99,
          log_interval=100, buffer_size=50000, replay_ratio=4, replay_start=10000, c=10.0,
          trust_region=True, alpha=0.99, delta=1, load_path=None, **network_kwargs):
    '''
    Main entrypoint for ACER (Actor-Critic with Experience Replay) algorithm (https://arxiv.org/pdf/1611.01224.pdf)
    Train an agent with given network architecture on a given environment using ACER.

    Parameters:
    ----------

    network:            policy network architecture. Either a string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small,
                        conv_only - see baselines.common/models.py for the full list) specifying a standard network
                        architecture, or a function that takes a tensorflow tensor as input and returns a tuple
                        (output_tensor, extra_feed), where output_tensor is the last network layer output and
                        extra_feed is None for feed-forward nets or a dictionary describing how to feed state into
                        the network for recurrent nets. See baselines.common/policies.py/lstm for more details on
                        using recurrent nets in policies.

    env:                environment. Needs to be vectorized for parallel environment simulation.
                        The environments produced by gym.make can be wrapped using the
                        baselines.common.vec_env.DummyVecEnv class.

    nsteps:             int, number of steps of the vectorized environment per update (i.e. batch size is
                        nsteps * nenv, where nenv is the number of environment copies simulated in parallel)
                        (default: 20)

    nstack:             int, size of the frame stack, i.e. the number of frames passed to the step model.
                        Frames are stacked along the channel dimension (last image dimension) (default: 4)

    total_timesteps:    int, number of timesteps (i.e. number of actions taken in the environment) (default: 80M)

    q_coef:             float, value function loss coefficient in the optimization objective (analog of vf_coef
                        for other actor-critic methods)

    ent_coef:           float, policy entropy coefficient in the optimization objective (default: 0.01)

    max_grad_norm:      float, gradient norm clipping coefficient. If set to None, no clipping (default: 10)

    lr:                 float, learning rate for RMSProp (the current implementation has RMSProp hardcoded in)
                        (default: 7e-4)

    lrschedule:         schedule of learning rate. Can be 'linear', 'constant', or a function [0..1] -> [0..1]
                        that takes the fraction of training progress as input and returns the fraction of the
                        learning rate (specified as lr) as output.

    rprop_epsilon:      float, RMSProp epsilon (stabilizes the square root computation in the denominator of the
                        RMSProp update) (default: 1e-5)

    rprop_alpha:        float, RMSProp decay parameter (default: 0.99)

    gamma:              float, reward discounting factor (default: 0.99)

    log_interval:       int, number of updates between logging events (default: 100)

    buffer_size:        int, size of the replay buffer (default: 50k)

    replay_ratio:       int, how many (on average) batches of data to sample from the replay buffer after each
                        batch is collected from the environment (default: 4)

    replay_start:       int, sampling from the replay buffer does not start until the replay buffer has at least
                        that many samples (default: 10k)

    c:                  float, importance weight clipping factor (default: 10)

    trust_region:       bool, whether or not the algorithm estimates the KL divergence between the old and updated
                        policy and uses it to determine the step size (default: True)

    delta:              float, max KL divergence between the old policy and the updated policy (default: 1)

    alpha:              float, momentum factor in the Polyak (exponential moving average) averaging of the model
                        parameters (default: 0.99)

    load_path:          str, path to load the model from (default: None)

    **network_kwargs:   keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy
                        and the arguments to a particular type of network. For instance, the 'mlp' network
                        architecture has arguments num_hidden and num_layers.

    '''

    print("Running Acer Simple")

    learn_params = {
        "network": network,
        "seed": seed,
        "nsteps": nsteps,
        "total_timesteps": total_timesteps,
        "q_coef": q_coef,
        "ent_coef": ent_coef,
        "max_grad_norm": max_grad_norm,
        "lr": lr,
        "lrschedule": lrschedule,
        "rprop_epsilon": rprop_epsilon,
        "rprop_alpha": rprop_alpha,
        "gamma": gamma,
        "log_interval": log_interval,
        "buffer_size": buffer_size,
        "replay_ratio": replay_ratio,
        "replay_start": replay_start,
        "c": c,
        "trust_region": trust_region,
        "alpha": alpha,
        "delta": delta,
        "load_path": load_path,
        **network_kwargs
    }

    with open("params.json") as f:
        params = json.load(f)

    params["replay_start"] = min(params["replay_start"], params["buffer_size"])
    params["buffer_size"] = min(params["buffer_size"], params["disk_buffer_size"])
    nsteps, buffer_size, disk_buffer_size = params['nsteps'], params['buffer_size'], params['disk_buffer_size']

    # values from params.json override the keyword arguments
    for k, v in params.items():
        if k in learn_params:
            learn_params[k] = v

    # print(locals())

    with open("model_params.pkl", "wb") as f:
        pkl.dump(learn_params, f)

    env, policy, nenvs, ob_space, ac_space, nstack, model = create_model(**learn_params)

    # *** UNCOMMENT IF YOU WANT TO LOAD OLD VARIABLES
    load_variables("actor.ckpt")
    # ***

    # runner = HaliteRunner(model=model, env=env, gamma=gamma, nsteps=nsteps)
    runner = HaliteRunner(model)  # reads the params json now

    if replay_ratio > 0:
        buffer = Buffer(env=env, nsteps=nsteps, size=buffer_size, disk_size=disk_buffer_size)
    else:
        buffer = None
    nbatch = nenvs * nsteps
    acer = Acer(runner, model, buffer, log_interval, nsteps)
    acer.tstart = time.time()
    for acer.steps in range(0, total_timesteps, nbatch):  # nbatch samples, 1 on_policy call and multiple off-policy calls
        acer.call(on_policy=True)
        if replay_ratio > 0 and buffer.has_atleast(replay_start):
            n = replay_ratio  # np.random.poisson(replay_ratio)
            for _ in range(n):
                acer.call(on_policy=False)  # no simulation steps in this

    return model
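# Illustrative sketch (assumption, not from the original source): the body above expects a params.json
# file whose keys override the keyword arguments of learn(); at minimum it must contain the keys the
# code reads directly ("nsteps", "buffer_size", "disk_buffer_size", "replay_start"). Values are placeholders.
example_params = {
    "nsteps": 20,
    "buffer_size": 50000,
    "disk_buffer_size": 100000,  # on-disk capacity; buffer_size is clamped to this value above
    "replay_start": 10000,
    "replay_ratio": 4,
}
# with open("params.json", "w") as f:
#     json.dump(example_params, f, indent=2)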
def learn(policy, env, seed, n_steps=20, n_stack=4, total_timesteps=int(80e6), q_coef=0.5, ent_coef=0.01,
          max_grad_norm=10, learning_rate=7e-4, lr_schedule='linear', rprop_epsilon=1e-5, rprop_alpha=0.99,
          gamma=0.99, log_interval=100, buffer_size=50000, replay_ratio=4, replay_start=10000,
          correction_term=10.0, trust_region=True, alpha=0.99, delta=1):
    """
    Train an ACER model.

    :param policy: (ACERPolicy) The policy model to use (MLP, CNN, LSTM, ...)
    :param env: (Gym environment) The environment to learn from
    :param seed: (int) The initial seed for training
    :param n_steps: (int) The number of steps to run for each environment
    :param n_stack: (int) The number of stacked frames
    :param total_timesteps: (int) The total number of samples
    :param q_coef: (float) Q function coefficient for the loss calculation
    :param ent_coef: (float) Entropy coefficient for the loss calculation
    :param max_grad_norm: (float) The maximum value for the gradient clipping
    :param learning_rate: (float) The learning rate
    :param lr_schedule: (str) The type of scheduler for the learning rate update
        ('linear', 'constant', 'double_linear_con', 'middle_drop' or 'double_middle_drop')
    :param rprop_epsilon: (float) RMSProp optimizer epsilon
    :param rprop_alpha: (float) RMSProp optimizer decay
    :param gamma: (float) Discount factor
    :param log_interval: (int) The number of timesteps before logging.
    :param buffer_size: (int) The buffer size in number of steps
    :param replay_ratio: (float) The average number of replay learning updates per on-policy update,
        drawn from a Poisson distribution
    :param replay_start: (int) The minimum number of steps in the buffer before replay learning starts
    :param correction_term: (float) The correction term for the importance weights
    :param trust_region: (bool) Enable trust region policy optimization loss
    :param alpha: (float) The decay rate for the exponential moving average of the parameters
    :param delta: (float) Trust region delta value
    """
    print("Running Acer Simple")
    print(locals())
    set_global_seeds(seed)

    n_envs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    num_procs = len(env.remotes)  # HACK
    model = Model(policy=policy, ob_space=ob_space, ac_space=ac_space, n_envs=n_envs, n_steps=n_steps,
                  n_stack=n_stack, num_procs=num_procs, ent_coef=ent_coef, q_coef=q_coef, gamma=gamma,
                  max_grad_norm=max_grad_norm, learning_rate=learning_rate, rprop_alpha=rprop_alpha,
                  rprop_epsilon=rprop_epsilon, total_timesteps=total_timesteps, lr_schedule=lr_schedule,
                  correction_term=correction_term, trust_region=trust_region, alpha=alpha, delta=delta)

    runner = Runner(env=env, model=model, n_steps=n_steps, n_stack=n_stack)
    if replay_ratio > 0:
        buffer = Buffer(env=env, n_steps=n_steps, n_stack=n_stack, size=buffer_size)
    else:
        buffer = None
    n_batch = n_envs * n_steps
    acer = Acer(runner, model, buffer, log_interval)
    acer.t_start = time.time()
    for acer.steps in range(0, total_timesteps, n_batch):  # n_batch samples, 1 on_policy call and multiple off-policy calls
        acer.call(on_policy=True)
        if replay_ratio > 0 and buffer.has_atleast(replay_start):
            samples_number = np.random.poisson(replay_ratio)
            for _ in range(samples_number):
                acer.call(on_policy=False)  # no simulation steps in this

    env.close()
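# Illustrative sketch (not from the original source): in every variant above, replay_ratio is the mean
# of the Poisson draw that decides how many off-policy (replay) updates follow each on-policy update,
# so the expected number of replay updates per environment batch equals replay_ratio.
import numpy as np

replay_ratio = 4
draws = np.random.poisson(lam=replay_ratio, size=100_000)
print(draws.mean())  # close to 4.0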