Example No. 1
def tests():
    parser_tests()
    a = Actor(global_env,
              Script([Method('method1', [], 'Hurary'),
                      Method('method2:', ['x'], 'x')]))
    n = Number(3)
    env = Env(None, {'a': a, 'n': n})
    test_expr(env, 'a', a)
    should_raise(env, 'b', 'Unbound')
    should_raise(env, 'a method1', 'Unbound')
    should_raise(env, 'a foo', 'No matching method')
    test_expr(env, 'a method2: a', a)
    test_expr(env, 'n * n', Number(9))
    test_expr(env, 'a. n', n)
    test_expr(env, '', void_actor)
    test_expr(env, '42', Number(42))
    test_expr(env, '(a method2: 3) * 2', Number(6))

    s1 = Script([Method('run:', ['x'], 'make Foo')])
    s2 = Script([Method('multiply_by:', ['y'], 'x * y')])
    s1.get_method('run:').set_inner('Foo', s2)
    a1 = Actor(global_env, s1)
    env1 = Env(None, {'a1': a1})
    test_expr(env1, '(a1 run: 4) multiply_by: 5', Number(20))

    assert Method('f', [], 'ab. ab').mark_up_body(global_env) == 'ab. ab'
    assert Method('foo:', ['x'], 'x').get_signature() == 'foo: x'

    s = String('Hello, world!')
    env2 = Env(None, {'s': s})
    test_expr(env2, 's', String('Hello, ' + 'world!'))
    test_expr(env2, 's length', Number(13))
    test_expr(env2, 's from: 1 to: 5', String('Hello'))
    test_expr(env2, "'Hello, world!'", s)
    test_expr(env2, 'let x = 2 * 3. x * x', Number(36))
Example No. 2
def my_if(env, args):
    cur_env = Env(env.name + '_0', env)
    cond = doeval(args[0], cur_env)
    cur_env = Env(env.name + '_1', env)
    if cond == true:
        rs = doeval(args[1], cur_env)
    else:
        rs = doeval(args[2], cur_env)
    return rs
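
The function above implements if as a special form: the condition is evaluated first, and only the branch it selects is then evaluated. A minimal self-contained sketch of that behaviour, using plain Python thunks in place of the interpreter's AST/Env objects (all names below are illustrative, not part of the original project):

def lazy_if(cond_thunk, then_thunk, else_thunk):
    # evaluate the condition, then only the chosen branch
    return then_thunk() if cond_thunk() else else_thunk()

print(lazy_if(lambda: 1 < 2, lambda: 'then', lambda: 1 / 0))  # -> 'then'; the else branch is never evaluated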
Example No. 3
def dvd(env, args):
    idx = 0
    cur_env = Env(env.name + '_' + str(idx), env)
    rs = doeval(args[0], cur_env)
    idx += 1
    for item in args[1:]:
        cur_env = Env(env.name + '_' + str(idx), env)
        tmp = doeval(item, cur_env).get_value()
        rs /= tmp
        idx += 1
    return rs
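
dvd left-folds division over its evaluated operands, i.e. (/ a b c) evaluates to (a / b) / c. The same fold on plain floats, as a quick illustration (not the project's API):

from functools import reduce
print(reduce(lambda acc, x: acc / x, [24.0, 2.0, 3.0]))  # (/ 24 2 3) -> 4.0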
Example No. 4
def compare(env, args, func):
    idx = 0
    cur_env = Env(env.name + '_' + str(idx), env)
    pre = doeval(args[0], cur_env)
    idx += 1
    while idx < len(args):
        item = args[idx]
        cur_env = Env(env.name + '_' + str(idx), env)
        tmp = doeval(item, cur_env)
        if not func(float(pre.get_value()), float(tmp.get_value())):
            return false
        pre = tmp
        idx += 1
    return true
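
compare folds a chained comparison such as (< a b c) over consecutive pairs of operands, returning false as soon as one pair fails. A self-contained sketch of the same idea on plain floats, with functions from the operator module standing in for the func argument (illustrative only):

import operator

def chained(func, values):
    # true only if func holds for every consecutive pair
    return all(func(a, b) for a, b in zip(values, values[1:]))

assert chained(operator.lt, [1.0, 2.0, 3.0])       # (< 1 2 3)  -> true
assert not chained(operator.le, [1.0, 3.0, 2.0])   # (<= 1 3 2) -> false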
Example No. 5
def my_not(env, args):
    idx = 0
    cur_env = Env(env.name + '_' + str(idx), env)
    rs = doeval(args[0], cur_env)
    # Is this correct?
    if rs.get_value(): return false
    return true
Example No. 6
def setup_replay(args: argparse.Namespace, env: Env) -> ExperienceReplay:
    D = ExperienceReplay(
        args.experience_size,
        env.observation_size,
        env.action_size,
        args.device
    )
    # Initialise dataset D with random seed episodes
    for _ in range(1, args.seed_episodes + 1):
        observation, done = env.reset(), False
        while not done:
            action = env.sample_random_action()
            next_observation, _, done, info = env.step(action)
            D.append(observation, action, info["reward_dist"], info["reward_coll"], done)
            observation = next_observation

    return D
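
setup_replay seeds the dataset with a few episodes of purely random actions before any learning starts. A self-contained toy version of that loop, with a dummy environment and an in-memory list standing in for ExperienceReplay (all names are illustrative, not the project's API):

import random

class ToyEnv:
    def reset(self):
        self.t = 0
        return 0.0
    def sample_random_action(self):
        return random.choice([-1.0, 1.0])
    def step(self, action):
        self.t += 1
        obs, done = float(self.t), self.t >= 5
        return obs, 0.0, done, {"reward_dist": -abs(obs), "reward_coll": 0.0}

replay, env = [], ToyEnv()
for _ in range(2):                       # two random seed episodes
    observation, done = env.reset(), False
    while not done:
        action = env.sample_random_action()
        next_observation, _, done, info = env.step(action)
        replay.append((observation, action, info["reward_dist"], info["reward_coll"], done))
        observation = next_observation
print(len(replay))                       # -> 10 transitions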
Example No. 7
def my_or(env, args):
    idx = 0
    for item in args:
        cur_env = Env(env.name + '_' + str(idx), env)
        tmp = doeval(item, cur_env)
        if tmp.get_value(): return true
        idx += 1
    return false
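
Like the logical or it mimics, my_or short-circuits: operands are evaluated one at a time and evaluation stops at the first truthy value. A minimal sketch with plain Python thunks (illustrative names only):

def short_circuit_or(thunks):
    # any() consumes the generator lazily, so later thunks are never called
    return any(t() for t in thunks)

print(short_circuit_or([lambda: 0, lambda: 5, lambda: 1 / 0]))  # -> True; the third operand is never evaluated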
Example No. 8
def add(env, args):
    rs = 0
    idx = 0
    for item in args:
        cur_env = Env(env.name + '_' + str(idx), env)
        tmp = doeval(item, cur_env)
        rs += tmp.get_value()
        idx += 1
    return rs
Example No. 9
def my_list(env, args):
    if len(args) == 0: return nil
    # print args
    rs_args = []
    for idx, item in enumerate(args):
        cur_env = Env(env.name + '_' + str(idx), env)
        tmp = doeval(item, cur_env)
        rs_args.append(tmp)
    return My_List(rs_args)
Example No. 10
def evaluate(conf):
    agent_func = _import_module(ModelBase, 'models.', conf['model']['name'])
    agent = agent_func(EasyDict(conf['model']))

    envs = []
    for i in range(conf['env'].get('count', 1)):
        envs.append(Env(agent, None, EasyDict(conf['env'])))

    envs[0].play()
    envs[0].close()
Example No. 11
 def apply(self, env, args):
     if len(args) > len(self.args):
         raise Exception('too many args!')
     cur_env = Env(env.name + '_0', self.env)
     for symbol, target in zip(self.args, args):
         define(cur_env, [symbol, target])
     if len(args) < len(self.args):
         new_args = copy.copy(self.args[len(args):])
         new_func = Func(new_args, self.body, cur_env)
         return new_func
     rs = nil
     for tmp_body in self.body:
         rs = doeval(tmp_body, cur_env)
     return rs
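
apply above supports partial application: when fewer arguments arrive than the function expects, it returns a new Func closing over the ones already bound. A self-contained sketch of that behaviour with plain Python closures in place of Func/Env (names are illustrative):

def make_func(params, body):
    def apply(*args):
        if len(args) > len(params):
            raise Exception('too many args!')
        if len(args) < len(params):
            # not enough arguments yet: return a new function closing over them
            return make_func(params[len(args):], lambda *rest: body(*args, *rest))
        return body(*args)
    return apply

add3 = make_func(['a', 'b', 'c'], lambda a, b, c: a + b + c)
add_1_2 = add3(1, 2)   # partially applied: still waiting for 'c'
print(add_1_2(3))      # -> 6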
Example No. 12
def train(conf):
    agent_func = _import_module(ModelBase, 'models.', conf['model']['name'])
    agent = agent_func(EasyDict(conf['model']))
    coach_func = _import_module(TrainBase, 'train.', conf['train']['name'])
    coach = coach_func(agent, EasyDict(conf['train']))

    assert (agent is not None)
    assert (coach is not None)

    envs = []
    for i in range(conf['env'].get('count', 1)):
        envs.append(Env(agent, coach, EasyDict(conf['env'])))

    envs[0].play()
    envs[0].close()
Example No. 13
def doeval(ele, env):
    if type(ele) == str:
        ele = AST([ele])

    # print env, ele
    cmd = ele.get_cmd()
    args = ele.get_args()
    if isinstance(cmd, AST):
        cur_env = Env(env.name + '_0', env)
        opt = doeval(cmd, cur_env)
    else:
        opt = env.search_symbol(cmd)
        if opt is None:
            print('symbol %s not found!' % cmd)
            return None
    rs = opt.apply(env, args)
    # print 'eval result: %s' % str(rs)
    return rs
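
doeval is the central evaluate/apply loop: the operator is looked up (or recursively evaluated), then handed the raw argument expressions so that special forms can decide how to evaluate them. A stripped-down, eagerly evaluating sketch of the same shape, with plain dicts and lists in place of Env and AST (illustrative only):

def mini_eval(expr, env):
    if isinstance(expr, str):                 # symbol lookup
        return env[expr]
    op, *args = expr                          # (op arg1 arg2 ...)
    func = mini_eval(op, env)
    return func(*[mini_eval(a, env) for a in args])

env = {'add': lambda *xs: sum(xs), 'x': 3}
print(mini_eval(['add', 'x', ['add', 'x', 'x']], env))  # -> 9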
Example No. 14
    # TODO: Figure out why after a lot of steps the meta
    # file becomes so large, for now set max_to_keep=1.
    saver = tf.train.Saver(max_to_keep=1)
    if args.resume and args.checkpoint_dir:
        load_checkpoint(saver, args.checkpoint_dir, sess)

    replay = SimpleExperienceReplay(args.replay_capacity, args.batch_size,
                                    args.history_window,
                                    (args.height, args.width))
    buf = Buffer(args.history_window, (args.height, args.width))
    obs_preprocess = lambda o: preprocess(o, args.height, args.width)
    reward_clipper = lambda r: np.clip(r, -1.0, 1.0)

    # wrap Gym Env so we can easily process observations
    # and rewards
    env = Env(gym_env, obs_preprocess, reward_clipper)

    # Initialize replay with some random experiences
    print("Initializing replay with {} experiences".format(args.random_start))
    random_start(env, replay, args.random_start)

    print("Training DDQN Agent")
    if args.monitor_dir:
        if args.resume:
            env.monitor.start(args.monitor_dir, resume=True)
        else:
            env.monitor.start(args.monitor_dir, force=True)

    ql = DDQN(main_model, target_model, args.batch_size, n_actions, args.gamma)
    ql.update_target_weights()
Example No. 15
tf.set_random_seed(args.seed)
gym_env.seed(args.seed)

network_input_shape = (args.history_window, args.height, args.width)
n_actions = gym_env.action_space.n
observation_shape = gym_env.observation_space.shape

config = tf.ConfigProto()
config.gpu_options.allow_growth = True
with tf.Graph().as_default() as g, tf.Session(config=config) as sess:
    K.set_session(sess)

    main = nn(network_input_shape, n_actions)
    target = nn(network_input_shape, n_actions)
    main.compile(optimizer='rmsprop', loss='mse')

    saver = tf.train.Saver()
    load_checkpoint(saver, args.checkpoint_dir, sess)

    buf = Buffer(args.history_window, (args.height, args.width))
    obs_preprocess = lambda i: preprocess(i, args.height, args.width)
    reward_clip = lambda r: np.clip(r, -1.0, 1.0)
    env = Env(gym_env, obs_preprocess, reward_clip)

    ql = DDQN(main, target, args.batch_size, n_actions, args.gamma)

    print("Playing {} games ...".format(args.games))
    for _ in range(args.games):
        play(ql, env, buf, epsilon=args.epsilon)
Example No. 16
def cdr(env, args):
    cur_env = Env(env.name + '_0', env)
    pair = doeval(args[0], cur_env)
    if not isinstance(pair, Pair):
        raise Exception('cdr should be used to a pair!')
    return pair.cdr()
Example No. 17
    sess.run(tf.initialize_variables([t]))

    # TODO: Figure out why after a lot of steps the meta
    # file becomes so large, for now set max_to_keep=1.
    saver = tf.train.Saver(max_to_keep=1)
    if args.resume and args.checkpoint_dir:
        load_checkpoint(saver, args.checkpoint_dir, sess)

    replay = SimpleExperienceReplay(args.replay_capacity, args.batch_size, args.history_window, (args.height, args.width))
    buf = Buffer(args.history_window, (args.height, args.width))
    obs_preprocess = lambda o: preprocess(o, args.height, args.width)
    reward_clipper = lambda r: np.clip(r, -1.0, 1.0)

    # wrap Gym Env so we can easily process observations
    # and rewards
    env = Env(gym_env, obs_preprocess, reward_clipper)

    # Initialize replay with some random experiences
    print("Initializing replay with {} experiences".format(args.random_start))
    random_start(env, replay, args.random_start)

    print("Training DDQN Agent")
    if args.monitor_dir:
        if args.resume:
            env.monitor.start(args.monitor_dir, resume=True)
        else:
            env.monitor.start(args.monitor_dir, force=True)

    ql = DDQN(main_model, target_model, args.batch_size, n_actions, args.gamma)
    ql.update_target_weights()
Example No. 18
import tensorflow as tf
import numpy as np
import time

from trainers.ppo_rnn_trainer_test import Trainer
from envs import Pong as Env

env = Env(stacks=4, skips=1, return_seq=True)
trainer = Trainer(env.action_space.n)
running_reward = -21
steps = 0
for e in range(100000):
    s = env.reset()
    ep_score = 0
    done = False
    start = 0.

    h = trainer.agent.initial_hidden()

    while not done:
        a, v, l, h = trainer.action(np.array(s[-1]), h)

        n_s, r, done, info = env.step(a)
        trainer.add(start, s, a, l, v, n_s, r, done)
        trainer.update(h)

        start = 1.
        s = n_s
        ep_score += r
        steps += 1
Example No. 19
import tensorflow as tf
import numpy as np
import time

from trainers.ppo_trainer import Trainer
from envs import Atari as Env
from envs import Pong

# env_name = "LunarLanderContinuous-v2"
env_name = "LunarLander-v2"
# env_name = "BipedalWalkerHardcore-v2"
# env_name = "BipedalWalker-v2"


env = Env(env_name)
# env = Pong(img_size=64)
trainer = Trainer(env.action_space.n, env.observation_space)
running_reward = -21
steps = 0
for e in range(100000):
    s = env.reset()
    ep_score = 0
    done = False
    
    while not done:
        a, v, l = trainer.action(s)

        n_s, r, done, info = env.step(a)
        trainer.add(s, a, l, v, n_s, r, done)
        trainer.update()

        s = n_s
        ep_score += r
        steps += 1
Example No. 20
def collect_experience(args: argparse.Namespace,
                       env: Env,
                       models: Tuple[nn.Module, nn.Module, nn.Module, nn.Module],
                       planner: nn.Module,
                       explore: bool = True,
                       desc: str = "Collecting episode") -> Dict[str, List[torch.Tensor]]:
    """collect an episode by applying policy on the real env.
    """
    # unpack models
    transition_model, _, _, encoder = models
    # storage
    experience = {
        "belief": [],
        "state": [],
        "action": [],
        "observation": [],
        "reward_dist": [],
        "reward_coll": [],
        "done": []
    }
    with torch.no_grad():
        # h[-1], s[-1], a[-1], o[0]
        belief = torch.zeros(1, args.belief_size, device=args.device)
        posterior_state = torch.zeros(1, args.state_size, device=args.device)
        action = torch.zeros(1, env.action_size, device=args.device)
        observation = env.reset()

        for _ in trange(args.max_episode_length // args.action_repeat, leave=False, desc=desc):
            # h[t] = f(h[t-1], a[t-1])
            # s[t] ~ Prob(s|h[t])
            # action and observation need extra time dimension because transition model uses batch operation
            belief, _, _, _, posterior_state, _, _ = transition_model.forward(
                posterior_state,
                action.unsqueeze(dim=0),
                belief,
                encoder(observation.to(device=args.device)).unsqueeze(dim=0))
            belief, posterior_state = belief.squeeze(dim=0), posterior_state.squeeze(dim=0)

            # a[t] = pi(h[t], s[t]) + noise
            # action is bounded by action range
            action = planner(belief, posterior_state)
            if explore:
                action += args.action_noise * torch.randn_like(action)
            action.clamp_(min=env.action_range[0], max=env.action_range[1])

            # o[t+1] ~ Prob(o|x[t], a[t]), r[t+1], z[t+1]
            next_observation, _, done, info = env.step(action[0].cpu())

            # save h[t], s[t], a[t], o[t], r[t+1], z[t+1]
            experience["belief"].append(belief)
            experience["state"].append(posterior_state)
            experience["action"].append(action.cpu())
            experience["observation"].append(observation)
            experience["reward_dist"].append(info["reward_dist"])
            experience["reward_coll"].append(info["reward_coll"])
            experience["done"].append(done)

            if done:
                break
            else:
                observation = next_observation

    return experience
Example No. 21
def is_null(env, args):
    cur_env = Env(env.name + '_0', env)
    rs = doeval(args[0], cur_env)
    if rs == nil: return true
    return false
Example No. 22
 def __call__(self, *args):
     return evaluate(self.body, Env(self.parms, args, self.env))
Example No. 23
def cons(env, args):
    cur_env = Env(env.name + '_0', env)
    first = doeval(args[0], cur_env)
    cur_env = Env(env.name + '_1', env)
    second = doeval(args[1], cur_env)
    return Pair(first, second)
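
cons here and cdr in Example No. 16 assume a Pair object exposing car/cdr accessors. A minimal self-contained Pair, shown for illustration (not the project's actual class):

class Pair:
    def __init__(self, first, second):
        self._first, self._second = first, second
    def car(self):
        return self._first
    def cdr(self):
        return self._second

p = Pair(1, Pair(2, None))        # the two-element list (1 2)
print(p.cdr().car())              # -> 2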
Example No. 24
def define(env, args):
    symbol = args[0]
    cur_env = Env(env.name + '_0', env)
    target = doeval(args[1], cur_env)
    env.add_symbol(symbol, target)
Example No. 25
import tensorflow as tf
import numpy as np
import time

from trainers.ppo_rnn_trainer import Trainer
from envs import Mario as Env

env = Env(stacks=1, skips=2)
trainer = Trainer(env.action_space.n)
running_reward = -21
steps = 0
for e in range(100000):
    s = env.reset()
    ep_score = 0
    done = False
    start = 0.

    h = trainer.agent.initial_hidden()

    while not done:
        a, v, l, h = trainer.action(s, h)

        n_s, r, done, info = env.step(a)
        trainer.add(start, s, h, a, l, v, n_s, r, done)
        trainer.update()

        start = 1.
        s = n_s
        ep_score += r
        steps += 1
Example No. 26
File: actors.py Project: void4/hmph
 def call(self, actor, selector, arguments):
     return self.body.eval(
         Env(actor.get_env(), bind(self.parameters, arguments)), actor)