Example No. 1
import logging
import os
import time
from random import randint

import psutil
import ray

# `db` and `worker` are defined elsewhere in the original project:
# `db` exposes a setup() helper and `worker` is a @ray.remote task.


def main():
    timeout = 30
    num_cpus = psutil.cpu_count()
    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s',
                        filename='app.log',
                        level=logging.DEBUG)
    print(os.getpid())
    print(os.getppid())
    if not ray.is_initialized():
        # include_webui was renamed include_dashboard in Ray 1.0+.
        ray.init(include_webui=True)

    files = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M']

    db.setup()
    print(os.getpid())
    print(os.getppid())
    with ray.profile('Event'):

        for i in range(10):
            time.sleep(randint(0, 4))
            try:
                ray.get(worker.remote(i))
            except Exception as e:
                # Log the error before re-raising; the original printed after
                # `raise`, which was unreachable, and used the Python 2-only
                # `e.message` attribute.
                print(e)
                raise
            finally:
                print('finally')
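Example No. 1 calls a `worker` remote function and a `db` helper that are not shown in the source. A minimal, purely hypothetical sketch of a compatible `worker` task (the real body may be entirely different) could look like this:

import time

import ray


@ray.remote
def worker(i):
    # Hypothetical stand-in for the project's real `worker` task; it records
    # its own profiling span so each invocation shows up in the timeline.
    with ray.profile("worker", extra_data={"index": str(i)}):
        time.sleep(1)
        return i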
Example No. 2
def gradient_worker(ps, X, y, batch_size):
    # `ps` is a handle to a parameter-server actor and `calc_grad` is defined
    # elsewhere in the original project.
    n_batches = X.shape[0] // batch_size
    start_idx = 0

    for batch_idx in range(n_batches):
        # Slice out the next mini-batch.
        X_b = X[start_idx:start_idx + batch_size]
        y_b = y[start_idx:start_idx + batch_size]
        # Pull the current parameters, compute the gradient inside a profiled
        # span, and push the update back to the parameter server.
        cur_theta = ray.get(ps.get_params.remote())
        with ray.profile("Calculate Grad"):
            cur_grad = calc_grad(X_b, y_b, cur_theta)
        ps.update_params.remote(cur_grad)

        start_idx += batch_size
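The parameter-server actor that `gradient_worker` talks to is not shown. The sketch below is a hypothetical minimal version, assuming `get_params` returns the current parameter vector and `update_params` applies a plain SGD step; the real class may handle the learning rate and synchronization differently:

import numpy as np
import ray


@ray.remote
class ParameterServer:
    # Hypothetical parameter server compatible with gradient_worker above.
    def __init__(self, n_features, lr=0.01):
        self.theta = np.zeros(n_features)
        self.lr = lr

    def get_params(self):
        return self.theta

    def update_params(self, grad):
        # Plain gradient-descent step; the original update rule is not shown.
        self.theta -= self.lr * grad

With such an actor in place, `ps = ParameterServer.remote(X.shape[1])` would give `gradient_worker` the handle it expects.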
Example No. 3
def f():
    with ray.profile("custom_event", extra_data={"name": "custom name"}):
        pass
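Spans recorded with `ray.profile` end up in Ray's timeline. A minimal way to inspect them, assuming a pre-1.0 Ray where `ray.profile` is still available, is to dump a Chrome-tracing file with `ray.timeline` and open it in chrome://tracing:

import ray

ray.init()

# ... run the tasks/actors that call ray.profile(...) ...

# Write every recorded profiling event to a Chrome trace file.
ray.timeline(filename="ray_timeline.json")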
Example No. 4
    def collect_experience(self):

        with ray.profile("Actor collection loop",
                         extra_data={'Actor id': str(self.id)}):
            # Collection loop: collects episodes of experience until training_done.
            while True:

                cassieEnv = True

                if self.actor_timesteps % self.load_freq == 0:
                    # Putting a wait on this should make it exactly the same as
                    # the non-distributed version when using a single actor.
                    # Query the learner for the latest model and the termination flag.

                    self.policy, self.training_done = ray.get(
                        self.learner_id.get_global_policy.remote())

                    #global_policy_state_dict, training_done = ray.get(self.learner_id.get_global_policy.remote())
                    #self.policy.load_state_dict(global_policy_state_dict)
                    print("loaded global model")

                # self.policy, self.training_done = ray.get(self.learner_id.get_global_policy.remote())
                # print("loaded global model")

                if self.training_done:
                    break

                obs = self.env.reset()
                done = False
                episode_reward = 0
                episode_timesteps = 0

                # Nested collection loop: collects timesteps of experience until the episode is over.
                while episode_timesteps < self.max_traj_len and not done:

                    #self.env.render()

                    # Select action randomly or according to policy
                    if self.actor_timesteps < self.start_timesteps:
                        #print("selecting action randomly {}".format(done_bool))
                        action = torch.randn(
                            self.env.action_space.shape[0]
                        ) if cassieEnv is True else self.env.action_space.sample(
                        )
                        action = action.numpy()
                    else:
                        #print("selecting from policy")
                        action = select_action(self.policy, np.array(obs),
                                               device)
                        if self.act_noise != 0:
                            action = (action + np.random.normal(
                                0,
                                self.act_noise,
                                size=self.env.action_space.shape[0])).clip(
                                    self.env.action_space.low,
                                    self.env.action_space.high)

                    # Perform action
                    new_obs, reward, done, _ = self.env.step(action)
                    done_bool = 1.0 if episode_timesteps + 1 == self.max_traj_len else float(
                        done)
                    episode_reward += reward

                    # Store data in replay buffer
                    self.memory_id.add.remote(
                        (obs, new_obs, action, reward, done_bool))

                    # call update from model server
                    self.learner_id.update_and_evaluate.remote()

                    # update state
                    obs = new_obs

                    # increment step counts
                    episode_timesteps += 1
                    self.actor_timesteps += 1

                    # increment global step count
                    self.learner_id.increment_step_count.remote()

                # episode is over, increment episode count and plot episode info
                self.episode_num += 1

                # pass episode details to visdom logger on memory server
                self.memory_id.plot_actor_results.remote(
                    self.id, self.actor_timesteps, episode_reward)

                ray.wait([self.learner_id.increment_episode_count.remote()],
                         num_returns=1)

                if self.taper_load_freq and self.taper_timesteps >= 2000:
                    self.load_freq = self.load_freq // 2
                    print("Increased load frequency")
Example No. 5
    def update_eval_model(self, policy_noise=0.2, noise_clip=0.5, policy_freq=2):
        with ray.profile("Learner optimization loop", extra_data={'Episode count': str(self.episode_count)}):

            start_time = time.time()

            if ray.get(self.memory.storage_size.remote()) < self.batch_size:
                print("not enough experience yet")
                return

            # randomly sample a mini-batch transition from memory_server
            x, y, u, r, d = ray.get(self.memory.sample.remote(self.batch_size))
            state = torch.FloatTensor(x).to(self.device)
            action = torch.FloatTensor(u).to(self.device)
            next_state = torch.FloatTensor(y).to(self.device)
            done = torch.FloatTensor(1 - d).to(self.device)
            reward = torch.FloatTensor(r).to(self.device)

            # Select action according to policy and add clipped noise
            noise = torch.FloatTensor(u).data.normal_(
                0, policy_noise).to(self.device)
            noise = noise.clamp(-noise_clip, noise_clip)
            next_action = (self.actor_target(next_state) +
                           noise).clamp(-self.max_action, self.max_action)

            # Compute the target Q value
            target_Q1, target_Q2 = self.critic_target(next_state, next_action)
            target_Q = torch.min(target_Q1, target_Q2)
            target_Q = reward + (done * self.discount * target_Q).detach()

            # Get current Q estimates
            current_Q1, current_Q2 = self.critic(state, action)

            # Compute critic loss
            critic_loss = F.mse_loss(
                current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)

            # Optimize the critic
            self.critic_optimizer.zero_grad()
            critic_loss.backward()
            self.critic_optimizer.step()

            self.update_counter += 1

            # Delayed policy updates
            if self.update_counter % policy_freq == 0:

                print("optimizing at timestep {} | time = {} | replay size = {} | episode count = {} | update count = {} ".format(self.step_count, time.time()-start_time, ray.get(self.memory.storage_size.remote()), self.episode_count, self.update_counter))

                # Compute actor loss
                actor_loss = -self.critic.Q1(state, self.actor(state)).mean()

                # Optimize the actor
                self.actor_optimizer.zero_grad()
                actor_loss.backward()
                self.actor_optimizer.step()

                # Update the frozen target models
                for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
                    target_param.data.copy_(
                        self.tau * param.data + (1 - self.tau) * target_param.data)

                for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
                    target_param.data.copy_(
                        self.tau * param.data + (1 - self.tau) * target_param.data)