Example #1
    def test1(self):
        
        env = RunEnv(visualize=False)
        observation = env.reset()

        action = env.action_space.sample()
        action[5] = np.NaN
        self.assertRaises(ValueError, env.step, action)
Example #2
class OsimEnv(Env):
    def __init__(self,
                 visualize=True,
                 test=False,
                 step_size=0.01,
                 processor=None,
                 timestep_limit=1000):
        self.visualize = visualize
        self._osim_env = RunEnv(visualize=visualize)
        self._osim_env.stepsize = step_size
        self._osim_env.spec.timestep_limit = timestep_limit
        self._osim_env.horizon = timestep_limit
        # self._osim_env.integration_accuracy = 1e-1
        if test:
            self._osim_env.timestep_limit = 1000
        self.processor = processor
        print "stepsize: " + str(self._osim_env.stepsize)

    def reset(self, seed=None, difficulty=2):
        observation = self._osim_env.reset(seed=seed, difficulty=difficulty)
        if self.processor:
            observation, reward, done, info = self.processor.process_step(
                observation, 0.0, False, dict())

        return observation

    def step(self, action):
        if self.processor:
            action = self.processor.process_action(action)

        observation, reward, done, info = self._osim_env.step(action)

        if self.processor:
            observation, reward, done, info = self.processor.process_step(
                observation, reward, done, info)

        return observation, reward, done, info

    def get_observation_dim(self):
        return len(self.reset())

    def get_action_dim(self):
        nb_actions = self._osim_env.action_space.shape[0]
        return nb_actions

    # FOR PICKLING
    def __setstate__(self, state):
        self.__init__(visualize=state['visualize'])

    def __getstate__(self):
        state = {'visualize': self.visualize}
        return state
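
A minimal sketch of what the pickling hooks above enable (not part of the original example): only the visualize flag is serialized, and __setstate__ re-runs __init__ on load, rebuilding the OpenSim simulator from scratch.

import pickle

env = OsimEnv(visualize=False)
blob = pickle.dumps(env)        # __getstate__ stores only {'visualize': False}
restored = pickle.loads(blob)   # __setstate__ calls __init__ again, recreating RunEnv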
Example #3
def main():

    env = RunEnv(visualize=True)
    env.close()

    with open('save.p', 'r') as f:
        population = pickle.load(f)

    nn = population[0][0]
    total_reward = 0
    observation = env.reset()
    for i in range(200):
        step = nn.compute(i)
        observation, reward, done, info = env.step(step)

        total_reward += reward
        if done:
            break

    print total_reward
Example #4
class LTR(BasicTask):
    name = 'LearningToRun'
    success_threshold = 2000
    def __init__(self):
        BasicTask.__init__(self)
        self.env = RunEnv(visualize=False)

    def step(self, action):
        action = np.clip(action, 0, 1)
        next_state, reward, done, info = self.env.step(action)
        return np.asarray(next_state) / math.pi, reward, done, info

    def reset(self):
        state = self.env.reset(difficulty=0, seed=np.random.randint(0, 10000000))
        return np.asarray(state) / math.pi
Example #5
class OsimAdapter:
    def __init__(self):
        self.env = RunEnv(visualize=False)
        self.reset()

    def reset(self, difficulty=2):
        self.reward = 0
        self.total_reward = 0
        self.timestamp = 0.
        self.features = np.array(
            (self.env.reset(difficulty=difficulty))).reshape((1, -1))
        self.last_obs = np.zeros(shape=(1, 41))
        self.features = np.concatenate([self.features, self.last_obs], axis=1)
        self.done = False
        return self.features

    def get_action_space(self):
        space = [1] * 18
        return space

    def get_observation_space(self):
        return 41 * 2

    def step(self, actions):
        mean_possible = (np.array(self.env.action_space.low) +
                         np.array(self.env.action_space.high)) / 2.
        actions = np.array(actions) + mean_possible
        actions *= (np.array(self.env.action_space.high) -
                    np.array(self.env.action_space.low))
        actions = np.clip(actions, self.env.action_space.low,
                          self.env.action_space.high)
        obs, reward1, done, _ = self.env.step(actions)
        reward2 = 0
        if not done:
            obs, reward2, done, _ = self.env.step(actions)
        self.features = np.array(obs).reshape((1, -1))
        self.features = np.concatenate(
            [self.features, self.features - self.last_obs], axis=1)
        self.last_obs = np.array(obs).reshape((1, -1))
        self.reward = reward1 + reward2
        self.total_reward += self.reward
        self.done = done
        self.timestamp += 1

    def get_total_reward(self):
        return self.total_reward
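
For reference, a short driver loop one might write against this adapter (illustrative only; the zero action is just a placeholder): step() stores its results in the adapter's attributes instead of returning them, and each call advances the underlying simulation by two frames.

import numpy as np

adapter = OsimAdapter()
adapter.reset(difficulty=0)
while not adapter.done:
    adapter.step(np.zeros(18))       # actions are re-centered, rescaled and clipped inside step()
    features = adapter.features      # shape (1, 82): observation concatenated with its delta
print(adapter.get_total_reward())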
Example #6
def test(args):
    print('start testing')

    ddpg = DDPG()
    ddpg.load_model(args.model, load_memory=False)
    env = RunEnv(visualize=args.visualize, max_obstacles=args.max_obs)

    np.random.seed(args.seed)
    for i in range(1):
        step = 0
        state = env.reset(difficulty=2)
        fg = FeatureGenerator()

        state = fg.gen(state)
        #obs = fg.traj[0]
        #print(obs.left_knee_r, obs.right_knee_r)

        ep_reward = 0
        ep_memories = []
        while True:
            action = ddpg.select_action(list(state))
            next_state, reward, done, info = env.step(action.tolist())
            next_state = fg.gen(next_state)

            #obs = fg.traj[0]
            #print(obs.left_knee_r, obs.right_knee_r)

            print('step: {0:03d}'.format(step), end=', action: ')
            for act in action:
                print('{0:.3f}'.format(act), end=', ')
            print()

            state = next_state
            ep_reward += reward
            step += 1

            print('reward:', ep_reward)

            if done:
                break

        print('\nEpisode: {} Reward: {}, n_steps: {}'.format(
            i, ep_reward, step))
Example #7
def standalone_headless_isolated(conn,
                                 visualize,
                                 n_obstacles,
                                 run_logs_dir,
                                 additional_info,
                                 higher_pelvis=0.65):
    try:
        e = RunEnv(visualize=visualize, max_obstacles=n_obstacles)
        if higher_pelvis != 0.65:
            bind_alternative_pelvis_judgement(e, higher_pelvis)
        e = MyRunEnvLogger(e,
                           log_dir=run_logs_dir,
                           additional_info=additional_info)

        while True:
            msg = conn.recv()

            # messages should be tuples,
            # msg[0] should be string

            if msg[0] == 'reset':
                o = e.reset(difficulty=msg[1], seed=msg[2])
                conn.send(o)
            elif msg[0] == 'step':
                ordi = e.step(msg[1])
                conn.send(ordi)
            elif msg[0] == 'close':
                e.close()
                conn.send(None)

                import psutil
                current_process = psutil.Process()
                children = current_process.children(recursive=True)
                for child in children:
                    child.terminate()
                return
    except Exception as e:
        import traceback
        print(traceback.format_exc())
        conn.send(e)
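
The function above is intended to run in a child process and speak a small tuple protocol over a pipe ('reset', 'step', 'close'). A parent-side sketch, assuming the usual multiprocessing setup (names and argument values here are illustrative, not from the original code):

import multiprocessing as mp

parent_conn, child_conn = mp.Pipe()
p = mp.Process(target=standalone_headless_isolated,
               args=(child_conn, False, 3, '/tmp/run_logs', {}))
p.start()

parent_conn.send(('reset', 2, 1234))        # ('reset', difficulty, seed)
observation = parent_conn.recv()
parent_conn.send(('step', [0.0] * 18))      # ('step', action)
observation, reward, done, info = parent_conn.recv()
parent_conn.send(('close',))
parent_conn.recv()
p.join()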
Example #8
class GameManager:
    def __init__(self, game_name, display):
        self.game_name = game_name
        self.display = display

        # self.env = gym.make(game_name)
        self.env = RunEnv(self.display)
        self.reset()

    def reset(self):
        observation = self.env.reset()
        return observation

    def step(self, action):
        self._update_display()
        observation, reward, done, info = self.env.step(action)
        return observation, reward, done, info

    def _update_display(self):
        # if self.display:
        #     self.env.render()
        return
Example #9
def test(actor, critic, args, act_update_fn):
    act_fn, _, _ = act_update_fn(actor, critic, None, None, args)
    env = RunEnv(visualize=args.visualize, max_obstacles=args.max_obstacles)

    all_episode_metrics = []
    for episode in range(args.num_episodes):
        episode_metrics = {
            "reward": 0.0,
            "step": 0,
        }

        observation_handler = create_observation_handler(args)
        action_handler = create_action_handler(args)
        observation = env.reset(difficulty=2, seed=SEEDS[episode % len(SEEDS)])
        action = np.zeros(ACTION_SHAPE, dtype=np.float32)
        observation = observation_handler(observation, action)

        done = False
        while not done:
            print(episode_metrics["reward"])
            action = act_fn(observation)

            observation, reward, done, _ = env.step(action_handler(action))

            episode_metrics["reward"] += reward
            episode_metrics["step"] += 1

            if done:
                break

            observation = observation_handler(observation, action)

        all_episode_metrics.append(episode_metrics)

    df = pd.DataFrame(all_episode_metrics)
    pprint(df.describe())
Example #10
class keras_model(object):
    def __init__(self, shared_object):

        log_info('setting keras_model main parameters')
        self.shared_object = shared_object
        self.model_class = shared_object.get('model_class', None)
        self.name = shared_object.get('model_name', None)
        self.network_name = shared_object.get('network', None)
        self.train_bool = shared_object.get('train', True)
        self.test_bool = shared_object.get('test', True)
        self.load_bool = not (self.train_bool)
        self.submit_bool = shared_object.get('submit', False)
        self.tokenId = shared_object.get('submit_token', None)
        self.visualize = shared_object.get('visualize', False)
        self.save_bool = shared_object.get('save', True)
        self.save_path = shared_object.get(
            'save_path',
            os.path.join('model_weights', self.model_class, self.name,
                         self.network_name + '.h5f'))
        self.save_folder = os.path.dirname(self.save_path)
        self.env = shared_object.get('env', None)
        if self.env is None:
            self.env = RunEnv(self.visualize)
            self.shared_object['env'] = self.env

        self.env.reset()
        self.nb_actions = self.env.action_space.shape[0]
        self.metrics = shared_object.get('metrics', ['mae'])
        self.optimizer_name = shared_object.get('optimizer', 'Adam')
        self.optimizer_params = shared_object.get('optimizer_params', None)

        log_info("setting keras_model's training parameters")
        self.train_parameters = {}
        self.train_parameters['nb_steps'] = shared_object.get('nb_steps', 0)
        self.train_parameters['action_repetition'] = shared_object.get(
            'action_repetition', 0)
        self.train_parameters['callback_names'] = shared_object.get(
            'callback_names', None)
        self.train_parameters['callbacks'] = load_callbacks(
            self.train_parameters['callback_names'])
        self.train_parameters['verbose'] = shared_object.get('verbose', 0)
        self.train_parameters['nb_max_start_steps'] = shared_object.get(
            'nb_max_start_steps', 0)
        self.train_parameters['start_step_policy'] = shared_object.get(
            'start_step_policy', None)
        self.train_parameters['log_interval'] = shared_object.get(
            'log_interval', 1)
        self.train_parameters['nb_max_episode_steps'] = shared_object.get(
            'nb_max_episode_steps', self.env.timestep_limit)

        log_info("setting keras_model's testing parameters")

        self.test_parameters = {}
        self.test_parameters['nb_episodes'] = shared_object.get(
            'test_nb_episodes', 1)
        self.test_parameters['nb_max_episode_steps'] = shared_object.get(
            'test_nb_max_episode_steps', 1)

        log_info('loading networks : {}'.format(self.network_name))
        self.load_networks()
        log_info('loading networks done')

        log_info('building optimizer : {}'.format(self.optimizer_name))
        self.build_optimizer()
        log_info('optimizer built successfully')

        log_info('building the agent')
        self.build_agent()
        log_info('agent successfully built')

    def train(self):
        """
        # Arguments
            nb_steps (integer): Number of training steps to be performed.
            action_repetition (integer): Number of times the agent repeats the same action without
                observing the environment again. Setting this to a value > 1 can be useful
                if a single action only has a very small effect on the environment.
            callbacks (list of `keras.callbacks.Callback` or `rl.callbacks.Callback` instances):
                List of callbacks to apply during training. See [callbacks](/callbacks) for details.
            verbose (integer): 0 for no logging, 1 for interval logging (compare `log_interval`), 2 for episode logging
            visualize (boolean): If `True`, the environment is visualized during training. However,
                this is likely going to slow down training significantly and is thus intended to be
                a debugging instrument.
            nb_max_start_steps (integer): Number of maximum steps that the agent performs at the beginning
                of each episode using `start_step_policy`. Notice that this is an upper limit since
                the exact number of steps to be performed is sampled uniformly from [0, max_start_steps]
                at the beginning of each episode.
            start_step_policy (`lambda observation: action`): The policy
                to follow if `nb_max_start_steps` > 0. If set to `None`, a random action is performed.
            log_interval (integer): If `verbose` = 1, the number of steps that are considered to be an interval.
            nb_max_episode_steps (integer): Number of steps per episode that the agent performs before
                automatically resetting the environment. Set to `None` if each episode should run
                (potentially indefinitely) until the environment signals a terminal state.
        # Returns
            A `keras.callbacks.History` instance that recorded the entire training process.
        """

        callback_history = self.agent.fit(
            self.env,
            nb_steps=self.train_parameters['nb_steps'],
            action_repetition=self.train_parameters['action_repetition'],
            callbacks=self.train_parameters['callbacks'],
            verbose=self.train_parameters['verbose'],
            visualize=self.visualize,
            nb_max_start_steps=self.train_parameters['nb_max_start_steps'],
            start_step_policy=self.train_parameters['start_step_policy'],
            log_interval=self.train_parameters['log_interval'],
            nb_max_episode_steps=self.train_parameters['nb_max_episode_steps'])

        if self.save_bool:
            if not os.path.exists(self.save_folder):
                os.makedirs(self.save_folder)
            self.agent.save_weights(self.save_path, overwrite=True)

        return callback_history

    def load(self):
        log_info('loading model : {}'.format(self.name))
        self.agent.load_weights(self.save_path)

    def test(self):
        self.agent.test(
            self.env,
            nb_episodes=self.test_parameters['nb_episodes'],
            visualize=self.visualize,
            nb_max_episode_steps=self.test_parameters['nb_max_episode_steps'])

    def submit(self):

        remote_base = 'http://grader.crowdai.org:1729'
        env = RunEnv(visualize=self.visualize)
        client = Client(remote_base)

        # Create environment
        observation = client.env_create(self.tokenId)

        # Run a single step
        #
        # The grader runs 3 simulations of at most 1000 steps each. We stop after the last one
        while True:
            [observation, reward, done,
             info] = client.env_step(self.agent.forward(observation))
            if done:
                observation = client.env_reset()
                if not observation:
                    break

        client.submit()

    def build_optimizer(self):
        log_info('loading optimizer class : {}'.format(self.optimizer_name))
        optimizer_class = import_class('keras.optimizers.{}'.format(
            self.optimizer_name))
        self.optimizer = optimizer_class(**self.optimizer_params)

    def run(self):
        if self.train_bool:
            log_info('starting to train the model...')
            self.train()
        if self.load_bool:
            log_info('starting to load the model...')
            self.load()
        if self.test_bool:
            log_info('starting to test the model...')
            self.test()
        if self.submit_bool:
            log_info('starting to submit the model')
            self.submit()

    def load_networks(self):
        network_class = import_class('models.nn.{}.{}'.format(
            self.model_class, self.network_name))
        self.networks = network_class(self.shared_object)

    def build_agent(self):
        raise NotImplementedError
Example #11
        callback=on_iteration_start,
        verbose=args.verbose,
    )

    env.close()

    if MPI.COMM_WORLD.Get_rank() == 0:
        plot_history(history)
        save_model()

    if args.repeat:
        cmd = 'python run_osim.py --repeat --train --model %s --steps %s --size %s' % (args.model, args.steps, args.size)
        subprocess.call(cmd.split(' '))

if args.test:
    observation = env.reset()
    observation = preprocess(observation, step=1, verbose=args.verbose)
    pi = policy_fn('pi', env.observation_space, env.action_space)

    if not load_model():
        exit(0)

    done = False
    total = 0
    steps = 0
    while not done:
        action = pi.act(True, observation)[0]
        observation, reward, done, info = env.step(action)
        if args.visualize:
            vis.pointCameraAt(opensim.Vec3(observation[1], 0, 0), opensim.Vec3(0, 1, 0))
        observation = preprocess(observation, step=steps + 2, verbose=args.verbose)
Example #12
'''
Script to print observation values of osim Running Environment
'''

import numpy as np
from envs.diffEnv import diffEnv
from osim.env import RunEnv

env = RunEnv(max_obstacles = 10)
obs = env.reset(difficulty = 2, seed=47)

def print_obs(obs):
    #iterate = range(len(obs))
    iterate = [29,31,33,35]
    for i in iterate:
        print (str(i) + ": " + str(obs[i]))

try:
    print_obs(obs)
    for t in range(200):
        action= np.zeros(18)
        obs, _, done, _ = env.step(action)
        print("")
        print_obs(obs)
        if done: break
    
    while True:
        pass
except KeyboardInterrupt:
    pass
Example #13
def test(rank, args, shared_model, opt_ac):
    best_result = -1000
    torch.manual_seed(args.seed + rank)
    torch.set_default_tensor_type('torch.DoubleTensor')
    num_inputs = args.feature
    num_actions = 9
    last_state = numpy.zeros(41)

    if args.render:
        env = RunEnv(visualize=True)
    else:
        env = RunEnv(visualize=False)

    running_state = ZFilter((num_inputs, ), clip=5)
    running_reward = ZFilter((1, ), demean=False, clip=10)
    episode_lengths = []

    PATH_TO_MODEL = '../models/' + str(args.bh)

    ac_net = ActorCritic(num_inputs, num_actions)

    start_time = time.time()

    for i_episode in count(1):
        memory = Memory()
        ac_net.load_state_dict(shared_model.state_dict())

        num_steps = 0
        reward_batch = 0
        num_episodes = 0
        while num_steps < args.batch_size:
            #state = env.reset()
            #print(num_steps)
            state = env.reset(difficulty=0)
            state = numpy.array(state)
            #global last_state
            #last_state = state
            #last_state,_ = update_observation(last_state,state)
            #last_state,state = update_observation(last_state,state)
            #print(state.shape[0])
            #print(state[41])
            state = running_state(state)

            reward_sum = 0
            for t in range(10000):  # Don't infinite loop while learning
                #print(t)
                #timer = time.time()
                if args.use_sep_pol_val:
                    action = select_action(state)
                else:
                    action = select_action_actor_critic(state, ac_net)

                #print(action)
                action = action.data[0].numpy()
                if numpy.any(numpy.isnan(action)):
                    print(action)
                    puts('ERROR')
                    return
                #print('NN take:')
                #print(time.time()-timer)
                #print(action)
                #print("------------------------")

                #timer = time.time()
                if args.skip:
                    #env.step(action)
                    _, reward, _, _ = env.step(action)
                    reward_sum += reward
                next_state, reward, done, _ = env.step(action)
                next_state = numpy.array(next_state)
                reward_sum += reward

                #print('env take:')
                #print(time.time()-timer)

                #timer = time.time()

                #last_state ,next_state = update_observation(last_state,next_state)
                next_state = running_state(next_state)
                #print(next_state[41:82])

                mask = 1
                if done:
                    mask = 0

                #print('update take:')
                #print(time.time()-timer)

                #timer = time.time()

                memory.push(state, np.array([action]), mask, next_state,
                            reward)

                #print('memory take:')
                #print(time.time()-timer)

                #if args.render:
                #    env.render()
                if done:
                    break

                state = next_state

            num_steps += (t - 1)
            num_episodes += 1
            #print(num_episodes)
            reward_batch += reward_sum

        #print(num_episodes)
        reward_batch /= num_episodes
        batch = memory.sample()

        #update_params_actor_critic(batch,args,shared_model,ac_net,opt_ac)
        time.sleep(60)

        if i_episode % args.log_interval == 0:
            File = open(PATH_TO_MODEL + '/record.txt', 'a+')
            File.write("Time {}, episode reward {}, Average reward {}".format(
                time.strftime("%Hh %Mm %Ss",
                              time.gmtime(time.time() - start_time)),
                reward_sum, reward_batch))
            File.close()
            #print('TestEpisode {}\tLast reward: {}\tAverage reward {:.2f}'.format(
            #    i_episode, reward_sum, reward_batch))
            print("Time {}, episode reward {}, Average reward {}".format(
                time.strftime("%Hh %Mm %Ss",
                              time.gmtime(time.time() - start_time)),
                reward_sum, reward_batch))
            #print('!!!!')

        epoch = i_episode
        if reward_batch > best_result:
            best_result = reward_batch
            save_model(
                {
                    'epoch': epoch,
                    'bh': args.bh,
                    'state_dict': shared_model.state_dict(),
                    'optimizer': opt_ac.state_dict(),
                }, PATH_TO_MODEL, 'best')

        if epoch % 30 == 1:
            save_model(
                {
                    'epoch': epoch,
                    'bh': args.bh,
                    'state_dict': shared_model.state_dict(),
                    'optimizer': opt_ac.state_dict(),
                }, PATH_TO_MODEL, epoch)
Example #14
from osim.env import RunEnv

env = RunEnv(visualize=True)
observation = env.reset(difficulty=0)
for i in range(200):
    observation, reward, done, info = env.step(env.action_space.sample())
    print(reward)
    if done:
        break
Example #15
def standalone_headless_isolated(pq, cq, plock):
    # locking to prevent mixed-up printing.
    plock.acquire()
    print('starting headless...',pq,cq)
    try:
        import traceback
        from osim.env import RunEnv
        e = RunEnv(visualize=True,max_obstacles=0)
        # bind_alternative_pelvis_judgement(e)
        # use_alternative_episode_length(e)
    except Exception as e:
        print('error on start of standalone')
        traceback.print_exc()

        plock.release()
        return
    else:
        plock.release()

    def report(e):
        # a way to report errors ( since you can't just throw them over a pipe )
        # e should be a string
        print('(standalone) got error!!!')
        # conn.send(('error',e))
        # conn.put(('error',e))
        cq.put(('error',e))

    def floatify(n_p):
        return [float(n_p[i]) for i in range(len(n_p))]

    try:
        previous_o = None
        while True:
            # msg = conn.recv()
            # msg = conn.get()
            msg = pq.get()
            # messages should be tuples,
            # msg[0] should be string

            # isinstance is dangerous, commented out
            # if not isinstance(msg,tuple):
            #     raise Exception('pipe message received by headless is not a tuple')

            if msg[0] == 'reset': #or (previous_o==None and msg[0]=='step'):
                o = e.reset(difficulty=0)
                o = floatify(o)
                o_processed = generate_observation(o, o)
                previous_o = o
                cq.put(o_processed)

            elif msg[0] == 'step':
                actions = msg[1]
                o,r,d,i = e.step(np.array(actions))
                o = floatify(o) # floatify the observation
                o_processed = generate_observation(o, previous_o)
                previous_o = o
                cq.put((o_processed, r, d, i))
            elif msg[0] == 'action_space':
                a_s = e.action_space
                r_a_s = (a_s.low.tolist(), a_s.high.tolist(), a_s.shape)
                cq.put(r_a_s)
            elif msg[0] == 'observation_space':
                o_s = get_observation_space()
                r_o_s = (o_s['low'].tolist(), o_s['high'].tolist(),o_s['shape'])
                cq.put(r_o_s)
            else:
                cq.close()
                pq.close()
                del e
                break
    except Exception as e:
        traceback.print_exc()
        report(str(e))

    return # end process
Example #16
    def test_first_obs(self):
        env = RunEnv(visualize=False)
        observation_start = env.reset()
        observation, reward, done, info = env.step(env.action_space.sample())
        self.assertAlmostEqual(observation_start[-1], observation[-1])
        self.assertAlmostEqual(observation_start[-2], observation[-2])
Example #17
class Actor(multiprocessing.Process):
    def __init__(self, args, task_q, result_q, actor_id, monitor):
        multiprocessing.Process.__init__(self)
        self.task_q = task_q
        self.result_q = result_q
        self.args = args
        self.monitor = False


    def act(self, obs):
        obs = np.expand_dims(obs, 0)
        action_dist_mu, action_dist_logstd = self.session.run([self.action_dist_mu, self.action_dist_logstd], feed_dict={self.obs: obs})
        # sample from the Gaussian distribution
        act = action_dist_mu + np.exp(action_dist_logstd)*np.random.randn(*action_dist_logstd.shape)
        return act.ravel(), action_dist_mu, action_dist_logstd

    def run(self):
        
        self.env = RunEnv(visualize=False)
        self.env.reset(difficulty = 2, seed = int(time.time()))
        if self.monitor:
            self.env.monitor.start('monitor/', force=True)

        # tensorflow variables (same as in model.py)
        self.observation_size = 55+7
        self.action_size = np.prod(self.env.action_space.shape)
        self.hidden_size = 128
        weight_init = tf.random_uniform_initializer(-0.05, 0.05)
        bias_init = tf.constant_initializer(0)
        # tensorflow model of the policy
        self.obs = tf.placeholder(tf.float32, [None, self.observation_size])
        self.debug = tf.constant([2,2])
        with tf.variable_scope("policy-a"):
            h1 = fully_connected(self.obs, self.observation_size, self.hidden_size, weight_init, bias_init, "policy_h1")
            h1 = tf.nn.relu(h1)
            h2 = fully_connected(h1, self.hidden_size, self.hidden_size, weight_init, bias_init, "policy_h2")
            h2 = tf.nn.relu(h2)
            h3 = fully_connected(h2, self.hidden_size, self.action_size, weight_init, bias_init, "policy_h3_1")
            h3 = tf.nn.tanh(h3,name="policy_h3")
            action_dist_logstd_param = tf.Variable((.01*np.random.randn(1, self.action_size)).astype(np.float32), name="policy_logstd")
        self.action_dist_mu = h3
        self.action_dist_logstd = tf.tile(action_dist_logstd_param, tf.stack((tf.shape(self.action_dist_mu)[0], 1)))

        config = tf.ConfigProto(
            device_count = {'CPU': 0}
        )
        self.session = tf.Session()
        self.session.run(tf.initialize_all_variables())
        var_list = tf.trainable_variables()

        self.set_policy = SetPolicyWeights(self.session, var_list)

        while True:
            # get a task, or wait until it gets one
            next_task = self.task_q.get(block=True)
            if next_task == 1:
                # the task is an actor request to collect experience
                path = self.rollout()
                self.task_q.task_done()
                self.result_q.put(path)
            elif next_task == 2:
                print "kill message"
                if self.monitor:
                    self.env.monitor.close()
                self.task_q.task_done()
                break
            else:
                # the task is to set parameters of the actor policy
                self.set_policy(next_task)
                # super hacky method to make sure when we fill the queue with set parameter tasks,
                # an actor doesn't finish updating before the other actors can accept their own tasks.
                time.sleep(0.1)
                self.task_q.task_done()
        return

    def rollout(self):
        obs, actions, rewards, action_dists_mu, action_dists_logstd = [], [], [], [], []
        self.old_observation = None
        plain_obs = self.env.reset(difficulty = 2, seed = int(time.time()))
        processed_observation, self.old_observation = go(plain_obs, self.old_observation, step=1)
        ob = filter(processed_observation)
        for i in xrange(self.args.max_pathlength - 1):
            obs.append(ob)
            action, action_dist_mu, action_dist_logstd = self.act(ob)
            action = np.clip(action,a_max=1.0,a_min=0.0)
            actions.append(action)
            action_dists_mu.append(action_dist_mu)
            action_dists_logstd.append(action_dist_logstd)
            res = self.env.step(action)
            processed_observation, self.old_observation = go(res[0], self.old_observation, step=1)
            ob = filter(processed_observation)
            rewards.append((res[1]))
            if res[2] or i == self.args.max_pathlength - 2:
                path = {"obs": np.concatenate(np.expand_dims(obs, 0)),
                             "action_dists_mu": np.concatenate(action_dists_mu),
                             "action_dists_logstd": np.concatenate(action_dists_logstd),
                             "rewards": np.array(rewards),
                             "actions":  np.array(actions)}
                return path
Example #18
running_state = ZFilter((num_inputs, ), clip=5)
running_reward = ZFilter((1, ), demean=False, clip=10)
episode_lengths = []
last_state = 41 * [0]

for i_episode in count(1):
    memory = Memory()

    num_steps = 0
    reward_batch = 0
    num_episodes = 0
    while num_steps < args.batch_size:
        #state = env.reset()
        #print(num_steps)
        state = env.reset(difficulty=0)

        last_state, state = process_observation(last_state, state)
        #print(len(state))

        state = numpy.array(state)
        state = running_state(state)

        reward_sum = 0
        for t in range(10000):  # Don't infinite loop while learning
            #print(t)
            if args.use_sep_pol_val:
                action = select_action(state)
            else:
                action = select_action_actor_critic(state)
            #print(action)
Example #19
def train(rank, params, traffic_light, counter, shared_model,
          shared_grad_buffers, shared_obs_stats, test_n):
    torch.manual_seed(params.seed)
    #env = gym.make(params.env_name)
    env = RunEnv(visualize=False)
    #num_inputs = env.observation_space.shape[0]
    #num_outputs = env.action_space.shape[0]
    num_inputs = params.num_inputs
    num_outputs = params.num_outputs
    model = Model(num_inputs, num_outputs)
    last_state = []

    memory = ReplayMemory(params.exploration_size)

    #state = env.reset()
    state = env.reset(difficulty=0)

    last_state, state = process_observation(last_state, state)

    state = numpy.array(state)
    state = Variable(torch.Tensor(state).unsqueeze(0))
    done = True

    episode_length = 0
    while True:
        episode_length += 1
        model.load_state_dict(shared_model.state_dict())

        w = -1
        av_reward = 0
        nb_runs = 0
        reward_0 = 0
        t = -1
        while w < params.exploration_size:
            t += 1
            states = []
            actions = []
            rewards = []
            values = []
            returns = []
            advantages = []
            av_reward = 0
            cum_reward = 0
            cum_done = 0

            # Perform K steps
            for step in range(params.num_steps):
                w += 1
                shared_obs_stats.observes(state)
                state = shared_obs_stats.normalize(state)
                states.append(state)
                mu, sigma_sq, v = model(state)
                eps = torch.randn(mu.size())
                action = (mu + sigma_sq.sqrt() * Variable(eps))
                actions.append(action)
                values.append(v)
                env_action = action.data.squeeze().numpy()

                state, reward, done, _ = env.step(env_action)

                last_state, state = process_observation(last_state, state)
                state = numpy.array(state)

                done = (done or episode_length >= params.max_episode_length)
                cum_reward += reward
                reward = max(min(reward, 1), -1)
                rewards.append(reward)
                if done:
                    cum_done += 1
                    av_reward += cum_reward
                    cum_reward = 0
                    episode_length = 0
                    state = env.reset(difficulty=0)
                    last_state = []
                    last_state, state = process_observation(last_state, state)
                    state = numpy.array(state)
                state = Variable(torch.Tensor(state).unsqueeze(0))
                if done:
                    break

            # one last step
            R = torch.zeros(1, 1)
            if not done:
                _, _, v = model(state)
                R = v.data
            # compute returns and GAE(lambda) advantages:
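            # (for reference, the recurrences implemented below, in reverse time order:
            #  delta_t = r_t + gamma * V_{t+1} - V_t
            #  A_t     = delta_t + gamma * lambda * A_{t+1}   (lambda = params.gae_param)
            #  R_t     = A_t + V_t)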
            values.append(Variable(R))
            R = Variable(R)
            A = Variable(torch.zeros(1, 1))
            for i in reversed(range(len(rewards))):
                td = rewards[i] + params.gamma * values[i + 1].data[
                    0, 0] - values[i].data[0, 0]
                A = float(td) + params.gamma * params.gae_param * A
                advantages.insert(0, A)
                R = A + values[i]
                returns.insert(0, R)
            # store useful info:
            memory.push([states, actions, returns, advantages])

        # policy grad updates:
        av_reward /= float(cum_done + 1)
        model_old = Model(num_inputs, num_outputs)
        model_old.load_state_dict(model.state_dict())
        if t == 0:
            reward_0 = av_reward - (1e-2)
        #batch_states, batch_actions, batch_returns, batch_advantages = memory.sample(params.batch_size)
        for k in range(params.num_epoch):
            # load new model
            model.load_state_dict(shared_model.state_dict())
            model.zero_grad()
            # get initial signal
            signal_init = traffic_light.get()
            # new mini_batch
            batch_states, batch_actions, batch_returns, batch_advantages = memory.sample(
                params.batch_size)
            # old probas
            mu_old, sigma_sq_old, v_pred_old = model_old(batch_states.detach())
            probs_old = normal(batch_actions, mu_old, sigma_sq_old)
            # new probas
            mu, sigma_sq, v_pred = model(batch_states)
            probs = normal(batch_actions, mu, sigma_sq)
            # ratio
            ratio = probs / (1e-10 + probs_old)
            # clip loss
            surr1 = ratio * torch.cat(
                [batch_advantages] * num_outputs,
                1)  # surrogate from conservative policy iteration
            surr2 = ratio.clamp(1 - params.clip, 1 + params.clip) * torch.cat(
                [batch_advantages] * num_outputs, 1)
            loss_clip = -torch.mean(torch.min(surr1, surr2))
            # value loss
            vfloss1 = (v_pred - batch_returns)**2
            v_pred_clipped = v_pred_old + (v_pred - v_pred_old).clamp(
                -params.clip, params.clip)
            vfloss2 = (v_pred_clipped - batch_returns)**2
            loss_value = 0.5 * torch.mean(torch.max(vfloss1, vfloss2))
            # entropy
            loss_ent = -params.ent_coeff * torch.mean(
                probs * torch.log(probs + 1e-5))
            # total
            total_loss = (loss_clip + loss_value + loss_ent)
            #print(total_loss.data[0])
            # before step, update old_model:
            model_old.load_state_dict(model.state_dict())
            # prepare for step
            total_loss.backward(retain_variables=True)
            #ensure_shared_grads(model, shared_model)
            #shared_model.cum_grads()
            shared_grad_buffers.add_gradient(model)

            counter.increment()

            # wait for a new signal to continue
            while traffic_light.get() == signal_init:
                pass

        test_n += 1
        memory.clear()
Example #20
class WrapperEnv():
    def __init__(self,
                 game='l2r',
                 visualize=False,
                 max_obstacles=10,
                 skip_count=1):
        self.env = RunEnv(visualize=visualize, max_obstacles=max_obstacles)
        self.step_count = 0
        self.old_observation = None
        self.skip_count = 1  # skip_count  # 4
        self.last_x = 0
        self.current_x = 0
        self.observation_space_shape = (76, )
        self.action_space = self.env.action_space
        self.difficulty = 2

    def obg(self, plain_obs):
        # observation generator
        # derivatives of observations extracted here.
        processed_observation, self.old_observation = go(plain_obs,
                                                         self.old_observation,
                                                         step=self.step_count)
        return np.array(processed_observation)

    def process_action(self, action):
        processed_action = [(v + 1.0) / 2 for v in action]
        return processed_action

    def step(self, action):
        action = [float(action[i]) for i in range(len(action))]
        action = self.process_action(action)

        import math
        for num in action:
            if math.isnan(num):
                print('NaN met', action)
                raise RuntimeError('this is bullshit')

        sr = 0
        sp = 0
        o, oo = [], []
        d, i = 0, 0
        self.last_x = self.current_x
        for j in range(self.skip_count):
            self.step_count += 1
            oo, r, d, i = self.env.step(action)
            self.current_x = oo[1]
            headx = oo[22]
            px = oo[1]
            py = oo[2]
            kneer = oo[7]
            kneel = oo[10]
            lean = min(0.3, max(0, px - headx - 0.15)) * 0.05
            joint = sum([max(0, k - 0.1)
                         for k in [kneer, kneel]]) * 0.03  # * 0.03
            penalty = lean + joint

            o = self.obg(oo)
            sr += r
            sp += penalty

            if d is True:
                break
        res = [o, sr, d, sp]
        # res = [o, sr, d, i]
        return res

    def reset(self, difficulty=2):
        self.difficulty = difficulty
        self.step_count = 0
        self.old_observation = None
        oo = self.env.reset(difficulty=difficulty)
        self.last_x = oo[1]
        self.current_x = oo[1]
        o = self.obg(oo)
        return o

    def seed(self, s):
        self.env.seed(s)
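
Note the action convention: process_action maps each component from [-1, 1] to [0, 1], so a policy driving this wrapper should emit actions in [-1, 1]. A small illustrative loop (random actions, purely a sketch):

import numpy as np

env = WrapperEnv(visualize=False)
obs = env.reset(difficulty=0)       # 76-dimensional processed observation
done = False
while not done:
    action = np.random.uniform(-1.0, 1.0, size=18)    # rescaled to [0, 1] inside step()
    obs, reward, done, penalty = env.step(action)     # step() returns a penalty term in place of info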
Example #21
from osim.env import RunEnv
import opensim

env = RunEnv(visualize=True)
observation = env.reset(seed=0)

s = 0
for s in range(50000):
    d = False

    if s == 30:
        state_old = opensim.State(env.osim_model.state)
        print("State stored")
        print(state_old)
    if s % 50 == 49:
        env.osim_model.revert(state_old)
        state_old = opensim.State(state_old)
        print("Rollback")
        print(state_old)

    o, r, d, i = env.step(env.action_space.sample())
Example #22
def train(rank, args, traffic_light, counter, shared_model,
          shared_grad_buffers, shared_obs_stats, opt_ac):
    best_result = -1000
    torch.manual_seed(args.seed + rank)
    torch.set_default_tensor_type('torch.DoubleTensor')
    num_inputs = args.feature
    num_actions = 9
    last_state = [0] * 41
    last_v = [0] * 10
    #last_state = numpy.zeros(48)

    env = RunEnv(visualize=False)

    #running_state = ZFilter((num_inputs,), clip=5)
    #running_reward = ZFilter((1,), demean=False, clip=10)
    episode_lengths = []

    PATH_TO_MODEL = '../models/' + str(args.bh)

    ac_net = ActorCritic(num_inputs, num_actions)

    #running_state = ZFilter((num_inputs,), clip=5)

    start_time = time.time()

    for i_episode in range(args.start_epoch + 1, 999999):
        #print(shared_obs_stats.n[0])
        #print('hei')
        #if rank == 0:
        #    print(running_state.rs._n)

        signal_init = traffic_light.get()
        memory = Memory()
        ac_net.load_state_dict(shared_model.state_dict())

        num_steps = 0
        reward_batch = 0
        num_episodes = 0
        #Tot_loss = 0
        #Tot_num =
        while num_steps < args.batch_size:
            #state = env.reset()
            #print(num_steps)
            state = env.reset(difficulty=0)
            #state = numpy.array(state)

            last_state, last_v, state = process_observation(
                last_state, last_v, state)

            state = numpy.array(state)

            #state = running_state(state)

            state = Variable(torch.Tensor(state).unsqueeze(0))
            shared_obs_stats.observes(state)
            state = shared_obs_stats.normalize(state)
            state = state.data[0].numpy()

            #print(state)
            #return

            #print(AA)

            #print(type(AA))
            #print(type(state))
            #print(AA.shape)
            #print(state.shape)

            reward_sum = 0
            #timer = time.time()
            for t in range(10000):  # Don't infinite loop while learning
                #print(t)
                if args.use_sep_pol_val:
                    action = select_action(state)
                else:
                    action = select_action_actor_critic(state, ac_net)
                #print(action)
                action = action.data[0].numpy()
                if numpy.any(numpy.isnan(action)):
                    print(state)
                    print(action)
                    print(ac_net.affine1.weight)
                    print(ac_net.affine1.weight.data)
                    print('ERROR')
                    #action = select_action_actor_critic(state,ac_net)
                    #action = action.data[0].numpy()
                    #state = state + numpy.random.rand(args.feature)*0.001

                    raise RuntimeError('action NaN problem')
                #print(action)
                #print("------------------------")
                #timer = time.time()
                reward = 0
                if args.skip:
                    #env.step(action)
                    _, A, _, _ = env.step(action)
                    reward += A
                    _, A, _, _ = env.step(action)
                    reward += A
                BB = numpy.append(action, action)
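                # (presumably the 9-dimensional action is mirrored onto both legs,
                # giving the 18 muscle excitations RunEnv expects)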
                next_state, A, done, _ = env.step(BB)
                reward += A
                #print(next_state)
                #last_state = process_observation(state)
                last_state, last_v, next_state = process_observation(
                    last_state, last_v, next_state)

                next_state = numpy.array(next_state)
                #print(next_state)
                #print(next_state.shape)
                #return
                reward_sum += reward
                #print('env:')
                #print(time.time()-timer)

                #last_state ,next_state = update_observation(last_state,next_state)

                #next_state = running_state(next_state)

                next_state = Variable(torch.Tensor(next_state).unsqueeze(0))
                shared_obs_stats.observes(next_state)
                next_state = shared_obs_stats.normalize(next_state)
                next_state = next_state.data[0].numpy()

                #print(next_state[41:82])

                mask = 1
                if done:
                    mask = 0

                memory.push(state, np.array([action]), mask, next_state,
                            reward)

                #if args.render:
                #    env.render()
                if done:
                    break

                state = next_state
            num_steps += (t - 1)
            num_episodes += 1

            reward_batch += reward_sum

        reward_batch /= num_episodes
        batch = memory.sample()

        #print('env:')
        #print(time.time()-timer)

        #timer = time.time()
        update_params_actor_critic(batch, args, ac_net, opt_ac)
        shared_grad_buffers.add_gradient(ac_net)

        counter.increment()

        epoch = i_episode
        if (i_episode % args.log_interval == 0) and (rank == 0):

            print(
                'TrainEpisode {}\tTime{}\tLast reward: {}\tAverage reward {:.2f}'
                .format(
                    i_episode,
                    time.strftime("%Hh %Mm %Ss",
                                  time.gmtime(time.time() - start_time)),
                    reward_sum, reward_batch))

            epoch = i_episode
            if reward_batch > best_result:
                best_result = reward_batch
                save_model(
                    {
                        'epoch': epoch,
                        'bh': args.bh,
                        'state_dict': shared_model.state_dict(),
                        'optimizer': opt_ac.state_dict(),
                        'obs': shared_obs_stats,
                    }, PATH_TO_MODEL, 'best')

            if epoch % 30 == 1:
                save_model(
                    {
                        'epoch': epoch,
                        'bh': args.bh,
                        'state_dict': shared_model.state_dict(),
                        'optimizer': opt_ac.state_dict(),
                        'obs': shared_obs_stats,
                    }, PATH_TO_MODEL, epoch)
        # wait for a new signal to continue
        while traffic_light.get() == signal_init:
            pass
Example #23
from osim.env import RunEnv
import numpy as np
import time
from Preprocessing import Preprocessing

import sys
#sys.path.insert(0, sys.path[0] + '/DDPG/DDPG.py')

from DDPG.DDPG import DDPG

#env = RunEnv(visualize=False)
env = RunEnv(visualize=True)
observation = env.reset(difficulty=0)

episodes = 100000
agent = DDPG(.9, 2000, 54, 18, .0001, criticpath='critic', actorpath='actor')

for episode in range(0, episodes):

    #env.step(action)
    # action is a list of length 18. values between [0,1]
    ## specifics: 9 muscles per leg, 2 legs = 18.
    action = env.action_space.sample()
    observation, reward, done, info = env.step(action)
    observation = np.array(observation)
    Preprocess = Preprocessing(observation, delta=0.01)
    prevState = Preprocess.GetState(observation)
    agent.step = 0
    agent.OUprocess(.312, 0.15, 0.0)
    for i in range(1, 1000):
        if i > 1:
Example #24
from osim.env import RunEnv
import argparse
import numpy as np

parser = argparse.ArgumentParser(
    description='Train or test neural net motor controller')
parser.add_argument('--seed', type=int, default=None)
args = parser.parse_args()

env = RunEnv(visualize=True)
if args.seed is None:
    seed = np.random.randint(2**32 - 1)
else:
    seed = args.seed
print("Seed = %d" % seed)
observation = env.reset(difficulty=2, seed=seed)
observation, reward, done, info = env.step(env.action_space.sample())

raw_input()
Example #25
from osim.env import RunEnv
import numpy as np
import copy
import pickle

env = RunEnv(visualize=False)
observation = env.reset(difficulty = 0)
sin=np.sin
file_Name = "w_best"

array=np.array

T=4


alpha=0.01
alpha_0=0.01
#TODO: we should exploit the Fourier property that higher-harmonic weights tend to decay as 1/x^n for smooth and continuous functions

#I initialize the weights list to 0, 4 weights for each muscle (I compose the periodic function from 4 elements of a Fourier series)
#I define weights only for 9 periodic functions, as I assume that the legs move symmetrically in time.

w=[]

for i in range(9):
    w.append(np.array([0.,0.,0.,0.,0.,0.,0.,0.]))



def output(a,T,t):
    # Output of a 4th degree Fourier Series of sin.
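
The example is cut off here, so the body of output is not shown. A possible sketch of a 4-harmonic evaluation, assuming the 8 weights per muscle are 4 sine and 4 cosine amplitudes of a series with fundamental period T (an assumption, not the original implementation):

def output_sketch(a, T, t):
    # assumed layout: a[0:4] sine amplitudes, a[4:8] cosine amplitudes of harmonics 1..4
    value = 0.0
    for n in range(4):
        value += a[n] * np.sin(2 * np.pi * (n + 1) * t / T)
        value += a[n + 4] * np.cos(2 * np.pi * (n + 1) * t / T)
    return float(np.clip(value, 0.0, 1.0))   # muscle excitations must lie in [0, 1]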
Example #26
            return rvel

        rvel = relative_vel(vel)
        left_rvel = relative_vel(left_vel)
        right_rvel = relative_vel(right_vel)

        central += [v * 10 for v in rvel]
        left += [v * 10 for v in left_rvel]
        right += [v * 10 for v in right_rvel]

        left += [np.clip(0.0 - obs.left_toe_y, 0.0, 0.05) * 20]
        left += [np.clip(0.05 - obs.left_talus_y, 0.0, 0.05) * 20]

        right += [np.clip(0.0 - obs.right_toe_y, 0.0, 0.05) * 20]
        right += [np.clip(0.05 - obs.right_talus_y, 0.0, 0.05) * 20]

        extero = self.draw_balls(obs.pelvis_x)

        self.step += 1

        #print(len(central), len(left), len(right), len(extero))
        return central + left + right + extero


if __name__ == '__main__':
    from osim.env import RunEnv
    env = RunEnv(visualize=False)
    state = env.reset()
    fg = FeatureGenerator()
    fg.gen(state)
Example #27
def test(rank, params, shared_model, shared_obs_stats, test_n):
    PATH_TO_MODEL = '../models/' + params.bh
    torch.manual_seed(params.seed + rank)
    best_result = -1000
    work_dir = mkdir('exp', 'ppo')
    monitor_dir = mkdir(work_dir, 'monitor')
    last_state = []
    #env = gym.make(params.env_name)
    if params.render:
        env = RunEnv(visualize=True)
    else:
        env = RunEnv(visualize=False)
    #env = wrappers.Monitor(env, monitor_dir, force=True)
    #num_inputs = env.observation_space.shape[0]
    #num_outputs = env.action_space.shape[0]
    num_inputs = params.num_inputs
    num_outputs = params.num_outputs
    model = Model(num_inputs, num_outputs)

    #state = env.reset()
    state = env.reset(difficulty=0)

    last_state, state = process_observation(last_state, state)
    state = numpy.array(state)

    state = Variable(torch.Tensor(state).unsqueeze(0))
    reward_sum = 0
    done = True

    start_time = time.time()

    episode_length = 0
    epoch = 0
    while True:
        #print(episode_length)
        episode_length += 1
        model.load_state_dict(shared_model.state_dict())
        shared_obs_stats.observes(state)
        #print(shared_obs_stats.n[0])
        state = shared_obs_stats.normalize(state)
        mu, sigma_sq, _ = model(state)
        eps = torch.randn(mu.size())
        action = mu + sigma_sq.sqrt() * Variable(eps)
        env_action = action.data.squeeze().numpy()
        state, reward, done, _ = env.step(env_action)

        last_state, state = process_observation(last_state, state)
        state = numpy.array(state)

        reward_sum += reward

        if done:
            print("Time {}, epoch {} ,episode reward {}, episode length {}".
                  format(
                      time.strftime("%Hh %Mm %Ss",
                                    time.gmtime(time.time() - start_time)),
                      epoch, reward_sum, episode_length))
            epoch = epoch + 1
            if reward_sum > best_result:
                best_result = reward_sum

                save_model(
                    {
                        'epoch': epoch,
                        'bh': params.bh,
                        'state_dict': model.state_dict(),
                        #'optimizer' : shared_obs_stats.state_dict(),
                    },
                    PATH_TO_MODEL,
                    'best')

            if epoch % 100 == 1:
                save_model(
                    {
                        'epoch': epoch,
                        'bh': params.bh,
                        'state_dict': model.state_dict(),
                        #'optimizer' : shared_obs_stats.state_dict(),
                    },
                    PATH_TO_MODEL,
                    epoch)

            reward_sum = 0
            episode_length = 0
            state = env.reset(difficulty=0)

            last_state = []
            last_state, state = process_observation(last_state, state)
            state = numpy.array(state)
            time.sleep(10)

        state = Variable(torch.Tensor(state).unsqueeze(0))
Example #28
def train(rank, args, shared_model, opt_ac, can_save, shared_obs_stats):
    best_result = -1000
    torch.manual_seed(args.seed + rank)
    torch.set_default_tensor_type('torch.DoubleTensor')
    num_inputs = args.feature
    num_actions = 9
    last_state = [1] * 48

    if args.render and can_save:
        env = RunEnv(visualize=True)
    else:
        env = RunEnv(visualize=False)

    #running_state = ZFilter((num_inputs,), clip=5)
    #running_reward = ZFilter((1,), demean=False, clip=10)
    episode_lengths = []

    PATH_TO_MODEL = '../models/' + str(args.bh)

    ac_net = ActorCritic(num_inputs, num_actions)

    start_time = time.time()

    for i_episode in count(1):
        memory = Memory()
        ac_net.load_state_dict(shared_model.state_dict())
        ac_net.zero_grad()

        num_steps = 0
        reward_batch = 0
        num_episodes = 0
        #Tot_loss = 0
        #Tot_num =
        while num_steps < args.batch_size:
            #state = env.reset()
            #print(num_steps)
            state = env.reset(difficulty=0)
            last_state = process_observation(state)
            state = process_observation(state)
            last_state, state = transform_observation(last_state, state)

            state = numpy.array(state)
            #global last_state
            #last_state,_ = update_observation(last_state,state)
            #last_state,state = update_observation(last_state,state)
            #print(state.shape[0])
            #print(state[41])
            state = Variable(torch.Tensor(state).unsqueeze(0))
            shared_obs_stats.observes(state)
            state = shared_obs_stats.normalize(state)
            state = state.data[0].numpy()
            #state = running_state(state)

            reward_sum = 0
            #timer = time.time()
            for t in range(10000):  # Don't infinite loop while learning
                #print(t)
                if args.use_sep_pol_val:
                    action = select_action(state)
                else:
                    action = select_action_actor_critic(state, ac_net)
                #print(action)
                action = action.data[0].numpy()
                if numpy.any(numpy.isnan(action)):
                    print(state)
                    print(action)
                    print('ERROR')
                    raise RuntimeError('action NaN problem')
                #print(action)
                #print("------------------------")
                #timer = time.time()

                # duplicate the 9-dim action so both legs receive the same activations (RunEnv expects 18)
                BB = numpy.append(action, action)
                #print(BB)

                reward = 0
                # frame skip: repeat the same action for two extra steps and accumulate the reward
                if args.skip:
                    #env.step(action)
                    _, A, _, _ = env.step(BB)
                    reward += A
                    _, A, _, _ = env.step(BB)
                    reward += A

                next_state, A, done, _ = env.step(BB)
                reward += A
                next_state = process_observation(next_state)
                last_state, next_state = transform_observation(
                    last_state, next_state)

                next_state = numpy.array(next_state)
                reward_sum += reward
                #print('env:')
                #print(time.time()-timer)

                #last_state ,next_state = update_observation(last_state,next_state)
                #next_state = running_state(next_state)
                next_state = Variable(torch.Tensor(next_state).unsqueeze(0))
                shared_obs_stats.observes(next_state)
                next_state = shared_obs_stats.normalize(next_state)
                next_state = next_state.data[0].numpy()
                #print(next_state[41:82])

                mask = 1
                if done:
                    mask = 0

                memory.push(state, np.array([action]), mask, next_state,
                            reward)

                #if args.render:
                #    env.render()
                if done:
                    break

                state = next_state
            num_steps += (t - 1)
            num_episodes += 1

            reward_batch += reward_sum

        reward_batch /= num_episodes
        batch = memory.sample()

        #print('env:')
        #print(time.time()-timer)

        #timer = time.time()
        update_params_actor_critic(batch, args, shared_model, ac_net, opt_ac)
        #print('backpropagate:')
        #print(time.time()-timer)

        epoch = i_episode
        if (i_episode % args.log_interval == 0) and (rank == 0):

            print('TrainEpisode {}\tLast reward: {}\tAverage reward {:.2f}'.
                  format(i_episode, reward_sum, reward_batch))
            if reward_batch > best_result:
                best_result = reward_batch
                save_model(
                    {
                        'epoch': epoch,
                        'bh': args.bh,
                        'state_dict': ac_net.state_dict(),
                        'optimizer': opt_ac,
                        'obs': shared_obs_stats,
                    }, PATH_TO_MODEL, 'best')

            if epoch % 30 == 1:
                save_model(
                    {
                        'epoch': epoch,
                        'bh': args.bh,
                        'state_dict': ac_net.state_dict(),
                        'optimizer': opt_ac,
                        'obs': shared_obs_stats,
                    }, PATH_TO_MODEL, epoch)
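The shared_obs_stats object used above (observes / normalize) is constructed elsewhere in that project and is shared across worker processes. The stand-in below is only a minimal sketch of such a running observation normalizer, assuming Welford-style running mean/variance over tensors of shape (1, num_inputs); the class name and interface details are assumptions, not the project's actual code.

import torch

class RunningObsStats:
    """Hypothetical stand-in for shared_obs_stats: tracks a running mean/std of
    observations and standardizes new ones. Not the implementation used above,
    and not process-safe like the shared object in the training code."""
    def __init__(self, num_inputs, eps=1e-8):
        self.n = 0
        self.mean = torch.zeros(num_inputs)
        self.m2 = torch.zeros(num_inputs)   # running sum of squared deviations (Welford)
        self.eps = eps

    def observes(self, obs):
        # obs: tensor of shape (1, num_inputs), as passed in the training loop above
        x = obs.view(-1)
        self.n += 1
        delta = x - self.mean
        self.mean += delta / self.n
        self.m2 += delta * (x - self.mean)

    def normalize(self, obs):
        std = (self.m2 / max(self.n - 1, 1)).sqrt() + self.eps
        return (obs - self.mean) / std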
Exemplo n.º 29
# Plots the action sample space to see the range

from osim.env import RunEnv
import matplotlib.pyplot as plt

env = RunEnv(visualize=False)
env.reset(difficulty=0)
samples = [env.action_space.sample() for _ in range(300)]
plt.plot(samples)  # one series per action dimension shows the sampled range
plt.show()
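A quick numeric complement to the plot, assuming RunEnv exposes a gym-style Box action space as the NIPS 2017 osim-rl release did:

from osim.env import RunEnv

env = RunEnv(visualize=False)
env.reset(difficulty=0)
space = env.action_space
print(space.shape)                          # expected: (18,) muscle excitations
print(space.low.min(), space.high.max())    # expected: 0.0 and 1.0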
Exemplo n.º 30
def standalone_headless_isolated(pq, cq, plock):
    # locking to prevent mixed-up printing.
    plock.acquire()
    print('starting headless...', pq, cq)
    try:
        import traceback
        from osim.env import RunEnv
        e = RunEnv(visualize=False, max_obstacles=10)
        # bind_alternative_pelvis_judgement(e)
        # use_alternative_episode_length(e)
    except Exception as e:
        print('error on start of standalone')
        traceback.print_exc()

        plock.release()
        return
    else:
        plock.release()

    def report(e):
        # a way to report errors ( since you can't just throw them over a pipe )
        # e should be a string
        print('(standalone) got error!!!')
        # conn.send(('error',e))
        # conn.put(('error',e))
        cq.put(('error', e))

    def floatify(x):
        # convert a numpy array into a plain list of floats so it can be pickled through the queue
        return [float(x[i]) for i in range(len(x))]

    try:
        while True:
            # msg = conn.recv()
            # msg = conn.get()
            msg = pq.get()
            # messages should be tuples,
            # msg[0] should be string

            # isinstance is dangerous, commented out
            # if not isinstance(msg,tuple):
            #     raise Exception('pipe message received by headless is not a tuple')

            if msg[0] == 'reset':
                o = e.reset(difficulty=2)
                # conn.send(floatify(o))
                cq.put(floatify(o))
                # conn.put(floatify(o))
            elif msg[0] == 'step':
                o, r, d, i = e.step(msg[1])
                o = floatify(o)  # floatify the observation
                cq.put((o, r, d, i))
                # conn.put(ordi)
                # conn.send(ordi)
            else:
                # conn.close()
                cq.close()
                pq.close()
                del e
                break
    except Exception as ex:  # don't rebind `e`, which still names the RunEnv instance
        traceback.print_exc()
        report(str(ex))

    return  # end process
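The parent-side counterpart of standalone_headless_isolated is not part of this excerpt. The sketch below shows one way the queue protocol above ('reset' -> observation, 'step' -> (obs, reward, done, info), anything else -> shut down) might be driven, assuming the worker function lives in the same module and plain multiprocessing Queues are used; make_remote_env and the message payloads are illustrative assumptions.

import multiprocessing as mp

def make_remote_env():
    # hypothetical helper: spawn the headless worker and hand back its two queues
    pq, cq = mp.Queue(1), mp.Queue(1)          # parent->child, child->parent
    plock = mp.Lock()
    proc = mp.Process(target=standalone_headless_isolated,
                      args=(pq, cq, plock), daemon=True)
    proc.start()
    return pq, cq, proc

if __name__ == '__main__':
    pq, cq, proc = make_remote_env()
    pq.put(('reset',))
    obs = cq.get()                             # first observation from the child
    for _ in range(5):
        pq.put(('step', [0.5] * 18))           # RunEnv expects 18 muscle activations
        obs, reward, done, info = cq.get()
        if done:
            break
    pq.put(('close',))                         # any unknown message tells the child to exit
    proc.join()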
Exemplo n.º 31
class OpenSim(Env):  # low dimensional observations
    """ Class to setup the OpenSim-RL environment (https://github.com/praveen-palanisamy/pytorch-rl.git) Where the agent has to learn to run! Continuous (18 dim) action space."""
    def __init__(self, args, env_ind=0):
        super(OpenSim, self).__init__(args, env_ind)

        assert self.env_type == "opensim"
        try:
            from osim.env import RunEnv
        except ImportError:
            self.logger.warning("WARNING: opensim not found")
            raise  # RunEnv is used right below, so fail here rather than with a NameError later

        self.env = RunEnv(visualize=True)
        #self.env.seed(self.seed)    # NOTE: so each env would be different

        # action space setup
        self.actions = range(self.action_dim)
        self.logger.warning("Action Space: %s", self.env.action_space)

        # state space setup
        self.logger.warning("State  Space: %s", self.state_shape)

        # continuous space
        #if args.agent_type == "a3c":
        self.enable_continuous = True  #args.enable_continuous

    def _preprocessState(self, state):  # NOTE: here no preprecessing is needed
        return state

    @property
    def action_dim(self):
        return self.env.action_space.shape[0]

    @property
    def state_shape(self):
        return self.env.observation_space.shape[0]

    def render(self):
        #if self.mode == 2:
        #    frame = self.env.render(mode='rgb_array')
        #    frame_name = self.img_dir + "frame_%04d.jpg" % self.frame_ind
        #    self.imsave(frame_name, frame)
        #    self.logger.warning("Saved  Frame    @ Step: " + str(self.frame_ind) + " To: " + frame_name)
        #    self.frame_ind += 1
        #    return frame
        #else:
        #    return self.env.render()
        return

    def visual(self):
        pass

    def sample_random_action(self):
        return self.env.action_space.sample()

    def reset(self):
        self._reset_experience()
        self.exp_state1 = self.env.reset()
        return self._get_experience()

    def step(self, action):
        self.exp_action = action
        if self.enable_continuous:
            self.exp_state1, self.exp_reward, self.exp_terminal1, _ = self.env.step(
                self.exp_action)
        return self._get_experience()