Example #1
    def make_rollout(self):
        """Collect rollout paths from the environment, syncing normalization
        statistics and policy weights from the Redis server when available."""
        variables_server = Redis(port=12000)
        if self.scale != 'off':
            try:
                # Normalization statistics may not have been published yet;
                # skip them if the keys are missing or unreadable.
                means = hlp.load_object(variables_server.get("means"))
                stds = hlp.load_object(variables_server.get("stds"))
                self.sess.run(self.norm_set_op,
                              feed_dict=dict(zip(self.norm_phs,
                                                 [means, stds])))
            except Exception:
                pass
        try:
            # Pull the latest policy weights pushed by the trainer; fall back
            # to the current local weights if none are available yet.
            weights = [
                hlp.load_object(variables_server.get("weight_{}".format(i)))
                for i in range(len(self.weights))
            ]
            self.set_weights(weights)
        except Exception:
            pass
        env = self.env
        if self.test_mode:
            # In test mode, run a fixed number of evaluation episodes with an
            # effectively unlimited timestep budget.
            n_tasks = self.n_tests
            timesteps_per_worker = 100000000
        else:
            n_tasks = 10000
            timesteps_per_worker = self.timesteps_per_batch // self.n_workers

        timestep = 0
        i_task = 0

        paths = []
        while timestep < timesteps_per_worker and i_task < n_tasks:
            path = {}
            observations, action_tuples, rewards, dist_tuples, timestamps = [], [], [], [], []
            sums = np.zeros((1, env.get_observation_space()))
            sumsqrs = np.zeros(sums.shape)

            env.reset()
            while not env.done and env.timestamp < self.timesteps_per_launch:
                sums += env.features
                sumsqrs += np.square(env.features)
                observations.append(env.features[0])
                timestamps.append(env.timestamp)

                if not self.test_mode:
                    actions, dist_tuple = self.act(env.features,
                                                   return_dists=True)
                    dist_tuples.append(dist_tuple)
                else:
                    actions = self.act(env.features, exploration=False)
                env.step(actions)
                timestep += 1

                action_tuples.append(actions)
                rewards.append(env.reward)

            path["observations"] = np.array(observations)
            path["action_tuples"] = np.array(action_tuples)
            path["rewards"] = np.array(rewards)
            if not self.test_mode:
                path["dist_tuples"] = np.array(dist_tuples)
            path["timestamps"] = np.array(timestamps)
            path["sumobs"] = sums
            path["sumsqrobs"] = sumsqrs
            path["terminated"] = env.done
            path["total"] = env.get_total_reward()
            paths.append(path)
            i_task += 1

        if self.distributed:
            variables_server.set("paths_{}".format(self.id_worker),
                                 hlp.dump_object(paths))
        else:
            self.paths = paths
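
The rollout above moves NumPy arrays and lists of path dictionaries through Redis with hlp.dump_object and hlp.load_object. The helper module is not shown in these examples; a minimal sketch of such a pair, assuming plain pickle serialization, could look like this:

import pickle

def dump_object(obj):
    # Serialize an arbitrary Python object (a NumPy array, a list of path
    # dicts, ...) into bytes that can be stored as a Redis value.
    return pickle.dumps(obj, protocol=pickle.HIGHEST_PROTOCOL)

def load_object(blob):
    # Inverse of dump_object: rebuild the object from the raw bytes that
    # redis-py returns for a GET.
    return pickle.loads(blob)

Note that if a key such as "means" has not been set yet, variables_server.get(...) returns None and pickle.loads(None) raises a TypeError, which is one reason the rollout wraps these reads in try/except blocks.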
Example #2
    def train(self):
        """Run the synchronous training loop: collect rollouts, perform a
        trust-region policy update, then evaluate the policy in test mode."""
        # Start a local Redis server used to exchange weights and rollout
        # paths with the worker processes.
        cmd_server = 'redis-server --port 12000'
        p = subprocess.Popen(cmd_server, shell=True, preexec_fn=os.setsid)
        self.variables_server = Redis(port=12000)
        means = "-"
        stds = "-"
        if self.scale != 'off':
            if self.timestep == 0:
                print("Time to measure features!")
                if self.distributed:
                    worker_args = \
                        {
                            'config': self.config,
                            'test_mode': False,
                        }
                    hlp.launch_workers(worker_args, self.n_workers)
                    paths = []
                    for i in range(self.n_workers):
                        paths += hlp.load_object(
                            self.variables_server.get("paths_{}".format(i)))
                else:
                    self.test_mode = False
                    self.make_rollout()
                    paths = self.paths

                for path in paths:
                    self.sums += path["sumobs"]
                    self.sumsqrs += path["sumsqrobs"]
                    self.sumtime += path["observations"].shape[0]

            stds = np.sqrt(
                (self.sumsqrs - np.square(self.sums) / self.sumtime) /
                (self.sumtime - 1))
            means = self.sums / self.sumtime
            print("Init means: {}".format(means))
            print("Init stds: {}".format(stds))
            self.variables_server.set("means", hlp.dump_object(means))
            self.variables_server.set("stds", hlp.dump_object(stds))
            self.sess.run(self.norm_set_op,
                          feed_dict=dict(zip(self.norm_phs, [means, stds])))
        while True:
            print("Iteration {}".format(self.timestep))
            start_time = time.time()

            if self.distributed:
                weights = self.get_weights()
                for i, weight in enumerate(weights):
                    self.variables_server.set("weight_" + str(i),
                                              hlp.dump_object(weight))
                worker_args = \
                    {
                        'config': self.config,
                        'test_mode': False,
                    }
                hlp.launch_workers(worker_args, self.n_workers)
                paths = []
                for i in range(self.n_workers):
                    paths += hlp.load_object(
                        self.variables_server.get("paths_{}".format(i)))
            else:
                self.test_mode = False
                self.make_rollout()
                paths = self.paths

            observations = np.concatenate(
                [path["observations"] for path in paths])
            actions = np.concatenate([path["action_tuples"] for path in paths])
            action_dists = []
            for _ in range(len(self.n_actions)):
                action_dists.append([])
            returns = []
            advantages = []
            for path in paths:
                self.sums += path["sumobs"]
                self.sumsqrs += path["sumsqrobs"]
                self.sumtime += path["rewards"].shape[0]
                dists = path["dist_tuples"]

                for i in range(len(self.n_actions)):
                    action_dists[i] += [dist[i][0] for dist in dists]
                returns += hlp.discount(path["rewards"], self.gamma,
                                        path["timestamps"]).tolist()
                values = self.sess.run(
                    self.value,
                    feed_dict={self.state_input: path["observations"]})
                # Bootstrap with the last value estimate if the episode was
                # cut off rather than terminated.
                values = np.append(values,
                                   0 if path["terminated"] else values[-1])
                # One-step TD residuals; their discounted sum gives the
                # advantage estimates for this path.
                deltas = (path["rewards"] + self.gamma * values[1:] -
                          values[:-1])
                advantages += hlp.discount(deltas, self.gamma,
                                           path["timestamps"]).tolist()
            returns = np.array(returns)
            advantages = np.array(advantages)

            if self.normalize == 'ranks':
                ranks = np.zeros_like(advantages)
                ranks[np.argsort(advantages)] = np.arange(
                    ranks.shape[0], dtype=np.float32) / (ranks.shape[0] - 1)
                ranks -= 0.5
                advantages = ranks[:]
            elif self.normalize == 'center':
                advantages -= np.mean(advantages)
                advantages /= (np.std(advantages, ddof=1) + 0.001)

            feed_dict = {
                self.state_input: observations,
                self.targets["return"]: returns,
                self.targets["advantage"]: advantages
            }

            for i in range(len(self.n_actions)):
                feed_dict[self.targets["old_dist_{}".format(i)]] = np.array(
                    action_dists[i])
                feed_dict[self.targets["action_{}".format(i)]] = actions[:, i]

            for i in range(self.value_updates):
                self.sess.run(self.value_train_op, feed_dict)

            train_rewards = np.array([path["rewards"].sum() for path in paths])
            train_lengths = np.array([len(path["rewards"]) for path in paths])

            thprev = self.get_flat()

            def fisher_vector_product(p):
                # Fisher-vector product with a small damping term added for
                # numerical stability of the conjugate gradient solve.
                feed_dict[self.targets["flat_tangent"]] = p
                return self.sess.run(self.fisher_vector_product,
                                     feed_dict) + 0.1 * p

            g = self.sess.run(self.policy_grad, feed_dict)
            stepdir = hlp.conjugate_gradient(fisher_vector_product, -g)

            # Scale the search direction so that the quadratic estimate of the
            # KL divergence for the full step is at most max_kl.
            shs = .5 * stepdir.dot(fisher_vector_product(stepdir))
            lm = np.sqrt(shs / self.max_kl)
            fullstep = stepdir / (lm + 1e-18)

            def loss_kl(th):
                self.set_from_flat(th)
                return self.sess.run([self.loss, self.KL], feed_dict=feed_dict)

            theta = hlp.linesearch(loss_kl, thprev, fullstep, self.max_kl)
            self.set_from_flat(theta)

            lossafter, kloldnew = self.sess.run([self.loss, self.KL],
                                                feed_dict=feed_dict)

            print("Time for testing!")

            if self.distributed:
                weights = self.get_weights()
                for i, weight in enumerate(weights):
                    self.variables_server.set("weight_" + str(i),
                                              hlp.dump_object(weight))
                worker_args = \
                    {
                        'config': self.config,
                        'test_mode': True,
                    }
                hlp.launch_workers(worker_args, self.n_workers)
                paths = []
                for i in range(self.n_workers):
                    paths += hlp.load_object(
                        self.variables_server.get("paths_{}".format(i)))
            else:
                self.test_mode = True
                self.make_rollout()
                paths = self.paths

            total_rewards = np.array([path["total"] for path in paths])
            eplens = np.array([len(path["rewards"]) for path in paths])

            if self.scale != 'full':
                stds = np.sqrt(
                    (self.sumsqrs - np.square(self.sums) / self.sumtime) /
                    (self.sumtime - 1))
                means = self.sums / self.sumtime
                self.variables_server.set("means", hlp.dump_object(means))
                self.variables_server.set("stds", hlp.dump_object(stds))
                self.sess.run(self.norm_set_op,
                              feed_dict=dict(zip(self.norm_phs,
                                                 [means, stds])))

            print("""
-------------------------------------------------------------
Mean test score:           {test_scores}
Mean train score:          {train_scores}
Mean test episode length:  {test_eplengths}
Mean train episode length: {train_eplengths}
Max test score:            {max_test}
Max train score:           {max_train}
KL between old and new     {kl}
Loss after update          {loss}
Mean of features:          {means}
Std of features:           {stds}
-------------------------------------------------------------
                """.format(means=means,
                           stds=stds,
                           test_scores=np.mean(total_rewards),
                           test_eplengths=np.mean(eplens),
                           train_scores=np.mean(train_rewards),
                           train_eplengths=np.mean(train_lengths),
                           max_test=np.max(total_rewards),
                           max_train=np.max(train_rewards),
                           kl=kloldnew,
                           loss=lossafter))
            self.timestep += 1
            self.train_scores.append(np.mean(train_rewards))
            self.test_scores.append(np.mean(total_rewards))
            if self.timestep % self.save_every == 0:
                self.save(self.config[:-5])
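
The policy update in this example relies on hlp.conjugate_gradient and hlp.linesearch, which are not included in the listing. They appear to follow the standard TRPO recipe; a minimal sketch under that assumption (the exact signatures, iteration counts and tolerances in hlp may differ) is:

import numpy as np

def conjugate_gradient(f_Ax, b, cg_iters=10, residual_tol=1e-10):
    # Approximately solve A x = b, where A is available only through the
    # matrix-vector product f_Ax (here: the damped Fisher-vector product).
    x = np.zeros_like(b)
    r = b.copy()
    p = b.copy()
    rdotr = r.dot(r)
    for _ in range(cg_iters):
        Ap = f_Ax(p)
        alpha = rdotr / p.dot(Ap)
        x += alpha * p
        r -= alpha * Ap
        new_rdotr = r.dot(r)
        p = r + (new_rdotr / rdotr) * p
        rdotr = new_rdotr
        if rdotr < residual_tol:
            break
    return x

def linesearch(loss_kl, theta_old, fullstep, max_kl,
               backtrack_ratio=0.8, max_backtracks=10):
    # Backtrack along the proposed step until the surrogate loss improves
    # while the KL constraint holds; otherwise keep the old parameters.
    loss_old, _ = loss_kl(theta_old)
    for step_frac in backtrack_ratio ** np.arange(max_backtracks):
        theta_new = theta_old + step_frac * fullstep
        loss_new, kl_new = loss_kl(theta_new)
        if loss_new < loss_old and kl_new <= max_kl:
            return theta_new
    return theta_old

This matches how the example calls the helpers: conjugate_gradient receives the damped Fisher-vector product and -g, and linesearch receives the loss_kl closure, the flattened parameters thprev, the scaled fullstep and max_kl.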
Example #3
    def train(self):
        """Run the asynchronous training loop: publish the initial weights and
        optimizer state, launch persistent workers, then test periodically."""
        # Start a local Redis server used to exchange weights, optimizer state
        # and rollout paths with the worker processes.
        cmd_server = 'redis-server --port 12000'
        p = subprocess.Popen(cmd_server, shell=True, preexec_fn=os.setsid)
        self.variables_server = Redis(port=12000)
        means = "-"
        stds = "-"
        if self.scale != 'off':
            if self.timestep == 0:
                print("Time to measure features!")
                if self.distributed:
                    worker_args = \
                        {
                            'config': self.config,
                            'test_mode': False,
                        }
                    hlp.launch_workers(worker_args, self.n_workers)
                    paths = []
                    for i in range(self.n_workers):
                        paths += hlp.load_object(self.variables_server.get("paths_{}".format(i)))
                else:
                    self.test_mode = False
                    self.make_rollout()
                    paths = self.paths

                for path in paths:
                    self.sums += path["sumobs"]
                    self.sumsqrs += path["sumsqrobs"]
                    self.sumtime += path["observations"].shape[0]

            stds = np.sqrt((self.sumsqrs - np.square(self.sums) / self.sumtime) / (self.sumtime - 1))
            means = self.sums / self.sumtime
            print("Init means: {}".format(means))
            print("Init stds: {}".format(stds))
            self.variables_server.set("means", hlp.dump_object(means))
            self.variables_server.set("stds", hlp.dump_object(stds))
            self.sess.run(self.norm_set_op, feed_dict=dict(zip(self.norm_phs, [means, stds])))

        # Publish the initial weights together with zero-initialized momentum
        # and velocity buffers (shared optimizer state) and an update counter.
        weights = self.get_weights()
        for i, weight in enumerate(weights):
            self.variables_server.set("weight_" + str(i), hlp.dump_object(weight))
            self.variables_server.set('momentum_{}'.format(i), hlp.dump_object(np.zeros(weight.shape)))
            self.variables_server.set('velocity_{}'.format(i), hlp.dump_object(np.zeros(weight.shape)))
        self.variables_server.set('update_steps', hlp.dump_object(0))

        worker_args = \
            {
                'config': self.config,
                'test_mode': False,
            }
        hlp.launch_workers(worker_args, self.n_workers, command='work', wait=False)

        while True:
            time.sleep(self.test_every)
            print("Time for testing!")
            if self.distributed:
                worker_args = \
                    {
                        'config': self.config,
                        'test_mode': True,
                    }
                hlp.launch_workers(worker_args, self.n_workers)
                paths = []
                for i in range(self.n_workers):
                    paths += hlp.load_object(self.variables_server.get("paths_{}".format(i)))
            else:
                self.test_mode = True
                self.make_rollout()
                paths = self.paths

            total_rewards = np.array([path["total"] for path in paths])
            eplens = np.array([len(path["rewards"]) for path in paths])

            print("""
-------------------------------------------------------------
Mean test score:           {test_scores}
Mean test episode length:  {test_eplengths}
Max test score:            {max_test}
Number of train episodes:  {number}
Mean of features:          {means}
Std of features:           {stds}
-------------------------------------------------------------
                """.format(
                means=means,
                stds=stds,
                test_scores=np.mean(total_rewards),
                test_eplengths=np.mean(eplens),
                max_test=np.max(total_rewards),
                number=self.variables_server.llen('results')
            ))
            self.timestep += 1
            self.train_scores = [hlp.load_object(res) for res in self.variables_server.lrange('results', 0, -1)][::-1]

            self.test_scores.append(np.mean(total_rewards))
            if self.timestep % self.save_every == 0:
                self.save(self.config[:-5])
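
All three examples delegate process management to hlp.launch_workers, whose implementation is also not shown. A plausible sketch, assuming it spawns one subprocess per worker and optionally waits for them to finish (the worker.py entry point, the JSON argument passing and the default command value are illustrative assumptions, not confirmed by the source):

import json
import subprocess

def launch_workers(worker_args, n_workers, command='rollout', wait=True):
    # Spawn n_workers subprocesses, each receiving its worker id plus the
    # shared arguments; optionally block until all of them have finished.
    processes = []
    for worker_id in range(n_workers):
        args = dict(worker_args, id_worker=worker_id, command=command)
        processes.append(subprocess.Popen(
            ['python', 'worker.py', json.dumps(args)]))
    if wait:
        for proc in processes:
            proc.wait()
    return processes

A blocking default is consistent with Examples #1 and #2, where the trainer reads paths_{i} from Redis immediately after launch_workers returns, while Example #3 passes wait=False so its persistent workers keep training in the background.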