def update_step(self, t, lr, batch_data):
        """
        Performs an update of parameters by sampling from replay_buffer

        Args:
            t: number of iteration (episode and move)
            replay_buffer: ReplayBuffer instance .sample() gives batches
            lr: (float) learning rate
        Returns:
            loss: (Q - Q_target)^2
        """

        fd = {
            # inputs
            self.s: batch_data.observations,
            self.target_action: batch_data.target,
            self.pred_flag: batch_data.mask,
            self.slen: batch_data.seqlen,
            self.hs: DNC.zero_state(self.config, self.config.batch_size),
            self.lr: lr,
            # extra info
            self.eval_acc_placeholder: self.eval_acc
        }

        loss_eval, grad_norm_eval, summary, _ = self.sess.run(
            [self.loss, self.grad_norm, self.merged, self.train_op],
            feed_dict=fd)

        # tensorboard stuff
        self.file_writer.add_summary(summary, t)
        
        return loss_eval, grad_norm_eval
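
Below is a minimal sketch of how this supervised variant could be driven. DatasetTensors is assumed to be the four-field tuple used by the evaluation code further down (observations, target, mask, seqlen); the sizes and the final call are illustrative only.

from collections import namedtuple
import numpy as np

# assumed field order, matching how the evaluation code constructs the batch
DatasetTensors = namedtuple('DatasetTensors',
                            ['observations', 'target', 'mask', 'seqlen'])

batch_size, max_len, input_dim, n_actions = 4, 12, 16, 5   # illustrative sizes
batch = DatasetTensors(
    observations=np.zeros([batch_size, max_len, input_dim]),  # padded input sequences
    target=np.zeros([batch_size, max_len, n_actions]),        # per-step target actions
    mask=np.zeros([batch_size, max_len]),                     # 1 where a prediction is scored
    seqlen=np.full([batch_size], max_len, dtype='int32'))     # true sequence lengths

# loss, grad_norm = model.update_step(t, lr, batch)   # model, t and lr come from the training loop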
Example #2
    def update_step(self, t, replay_buffer, lr):
        """
        Performs an update of parameters by sampling from replay_buffer

        Args:
            t: iteration number (episode and move)
            replay_buffer: ReplayBuffer instance; .sample_batch() gives batches
            lr: (float) learning rate
        Returns:
            loss_eval: evaluated loss (Q - Q_target)^2
            grad_norm_eval: evaluated gradient norm
        """

        s_batch, slen_batch, a_batch, past_a_batch, r_batch, done_mask_batch, seq_mask_batch, sp_batch, splen_batch = replay_buffer.sample_batch(
            self.config.batch_size)

        fd = {
            # inputs
            self.s: s_batch,
            self.slen: slen_batch,
            self.hs: DNC.zero_state(self.config, self.config.batch_size),
            self.a: a_batch,
            self.past_a: past_a_batch,
            self.r: r_batch,
            self.sp: sp_batch,
            self.splen: splen_batch,
            self.hsp: DNC.zero_state(self.config, self.config.batch_size),
            self.done_mask: done_mask_batch,
            self.seq_mask: seq_mask_batch,
            self.lr: lr,
            # extra info
            self.avg_reward_placeholder: self.avg_reward,
            self.max_reward_placeholder: self.max_reward,
            self.std_reward_placeholder: self.std_reward,
            self.avg_q_placeholder: self.avg_q,
            self.max_q_placeholder: self.max_q,
            self.std_q_placeholder: self.std_q,
            self.eval_reward_placeholder: self.eval_reward,
        }

        loss_eval, grad_norm_eval, summary, _ = self.sess.run(
            [self.loss, self.grad_norm, self.merged, self.train_op],
            feed_dict=fd)

        # tensorboard stuff
        self.file_writer.add_summary(summary, t)

        return loss_eval, grad_norm_eval
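
self.loss is built at graph-construction time and is not shown in these snippets; below is a minimal numpy sketch of the masked (Q - Q_target)^2 error it presumably encodes, with gamma, the action indices and the target-network outputs as assumed inputs.

import numpy as np

def masked_bellman_loss(q, q_target_next, a, r, done_mask, seq_mask, gamma=0.99):
    """q, q_target_next: [batch, time, n_actions]; a (int), r, done_mask, seq_mask: [batch, time].
    Shapes and gamma are illustrative assumptions, not taken from the source."""
    q_sa = np.take_along_axis(q, a[..., None], axis=-1).squeeze(-1)        # Q(s, a)
    q_target = r + gamma * (1.0 - done_mask) * q_target_next.max(axis=-1)  # bootstrap target
    # average the squared error over valid (unpadded) timesteps only
    return (seq_mask * (q_sa - q_target) ** 2).sum() / np.maximum(seq_mask.sum(), 1.0)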
    def evaluate(self, model_i, curri_idx=None, env=None, num_episodes=None):
        """
        Evaluation with same procedure as the training
        """
        if curri_idx is None:
            curri_idx = -1
        # log our activity only if default call
        if num_episodes is None:
            self.logger.info("Evaluating...")

        # arguments defaults
        if num_episodes is None:
            num_episodes = self.config.num_episodes_test

        if env is None:
            env = self.env

        accs = []

        for i in range(num_episodes):
            encoding_batch = []
            predflag_batch = []
            target_action_batch = []
            slen_batch = []
            max_len = 0
            for j in range(self.config.batch_size):
                #config = self.config
                #config.n_node, config.k_ring, config.p_rewiring, config.path_len_limit, config.planning_len = cr_schedule[curri_idx]
                #self.env.reset(config) # h x w x c
                encoding, target_action, predflag = model_i.gen_sample_seq(self.config.ndigits, self.config.nway)
                encoding_batch.append(encoding[None])
                predflag_batch.append(predflag[None])
                target_action_batch.append(target_action[None])
                slen_batch.append(encoding.shape[0])
                if encoding.shape[0] > max_len:
                    max_len = encoding.shape[0]

            # zero-pad each sequence in the batch to max_len along the time axis
            observations = np.concatenate(
                [np.concatenate([x, np.zeros([1, max_len - x.shape[1], x.shape[2]])], axis=1)
                 for x in encoding_batch], axis=0)
            targets = np.concatenate(
                [np.concatenate([x, np.zeros([1, max_len - x.shape[1], x.shape[2]])], axis=1)
                 for x in target_action_batch], axis=0)
            masks = np.concatenate(
                [np.concatenate([x, np.zeros([1, max_len - x.shape[1]])], axis=1)
                 for x in predflag_batch], axis=0)
            batch_data = DatasetTensors(observations, targets, masks,
                                        np.array(slen_batch).astype('int32'))

            h_state = DNC.zero_state(self.config, batch_size=self.config.batch_size)
            pred_action, h_state = self.sess.run(
                [self.q, self.hs_out],
                feed_dict={
                    self.s: batch_data.observations,
                    self.hs: h_state,
                    self.slen: batch_data.seqlen
                })
            for j in range(self.config.batch_size):
                pred_masked = pred_action[j] * np.expand_dims(batch_data.mask[j], 1)
                target_masked = batch_data.target[j] * np.expand_dims(batch_data.mask[j], 1)
                accs.append((pred_masked == target_masked).reshape(-1).all())

        avg_acc = np.mean(accs)
        if num_episodes > 1:
            msg = "Average acc: {:04.2f}".format(avg_acc)
            self.logger.info(msg)
        return avg_acc
Example #4
    def update_step(self, t, lr, batch_data):
        """
        Performs an update of parameters by sampling from replay_buffer

        Args:
            t: number of iteration (episode and move)
            replay_buffer: ReplayBuffer instance .sample() gives batches
            lr: (float) learning rate
        Returns:
            loss: (Q - Q_target)^2
        """

        fd = {
            # inputs
            self.s: batch_data.observations,
            self.target_action: batch_data.target,
            self.pred_flag: batch_data.mask,
            self.slen: batch_data.seqlen,
            self.hs: DNC.zero_state(self.config, self.config.batch_size),
            self.lr: lr,
            # extra info
            self.eval_acc_placeholder: self.eval_acc
        }

        loss_eval, grad_norm_eval, summary, _ = self.sess.run(
            [self.loss, self.grad_norm, self.merged, self.train_op],
            feed_dict=fd)
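        # Optional profiling path (kept disabled): traces one sess.run into a Chrome-trace JSON.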
        '''
        run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
        run_metadata = tf.RunMetadata()

        loss_eval, grad_norm_eval, summary, _ = self.sess.run([self.loss, self.grad_norm, 
                                                 self.merged, self.train_op], feed_dict=fd, options=run_options, run_metadata=run_metadata)

        # Create the Timeline object, and write it to a json
        tl = timeline.Timeline(run_metadata.step_stats)
        ctf = tl.generate_chrome_trace_format()
        with open('timeline.json', 'w') as f:
            f.write(ctf)
        '''

        # tensorboard stuff
        self.file_writer.add_summary(summary, t)

        return loss_eval, grad_norm_eval
Example #5
    def evaluate(self, env=None, num_episodes=None):
        """
        Evaluation with same procedure as the training
        """
        # log our activity only if default call
        if num_episodes is None:
            self.logger.info("Evaluating...")

        # arguments defaults
        if num_episodes is None:
            num_episodes = self.config.num_episodes_test

        if env is None:
            env = self.env
        env.state.is_render_image = self.config.render_test

        # replay memory to play
        replay_buffer = ReplayBuffer(self.config.buffer_size,
                                     self.config.state_history)
        rewards = []

        for i in range(num_episodes):
            total_reward = 0
            state = env.reset()
            h_state = DNC.zero_state(self.config, batch_size=1)
            slen = np.ones(1).astype('int32')
            action = 0
            for j in range(50):
                if self.config.render_test: env.render()

                #### for replay_buffer
                # store last state in buffer
                idx = replay_buffer.store_frame(state)
                q_input = replay_buffer.encode_recent_observation()

                action, action_q, h_state = self.get_action([q_input], h_state,
                                                            slen, [action])
                #print(action, action_q)

                # perform action in env
                new_state, reward, done = env.step(action)

                # store in replay memory
                replay_buffer.store_effect(idx, action, reward, done)
                state = new_state

                # count reward
                total_reward += reward
                if done:
                    break

            # updates to perform at the end of an episode
            rewards.append(total_reward)

        avg_reward = np.mean(rewards)
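        # standard error of the mean over the evaluation episodes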
        sigma_reward = np.sqrt(np.var(rewards) / len(rewards))

        if num_episodes > 1:
            msg = "Average reward: {:04.2f} +/- {:04.2f}".format(
                avg_reward, sigma_reward)
            self.logger.info(msg)

        return avg_reward
Example #6
    def train(self, exp_schedule, lr_schedule):
        """
        Performs training of Q

        Args:
            exp_schedule: Exploration instance s.t.
                exp_schedule.get_action(best_action) returns an action
            lr_schedule: Schedule for learning rate
        """

        # initialize replay buffer and variables
        replay_buffer = ReplayBuffer(self.config.buffer_size,
                                     self.config.state_history)
        rewards = deque(maxlen=self.config.num_episodes_test)
        max_q_values = deque(maxlen=1000)
        q_values = deque(maxlen=1000)
        self.init_averages()

        t = last_eval = last_record = 0  # time control of nb of steps
        scores_eval = []  # evaluation scores collected over training
        #scores_eval += [self.evaluate()]

        prog = Progbar(target=self.config.nsteps_train)

        self.env.state.is_render_image = self.config.render_train

        # interact with environment
        while t < self.config.nsteps_train:
            total_reward = 0
            state = self.env.reset()  # h x w x c
            h_state = DNC.zero_state(self.config, batch_size=1)
            slen = np.ones(1).astype('int32')
            action = 0
            for i in range(200):
                t += 1
                last_eval += 1
                last_record += 1
                if self.config.render_train: self.env.render()

                #### for replay_buffer
                # replay memory stuff
                idx = replay_buffer.store_frame(state)
                q_input = replay_buffer.encode_recent_observation()

                # chose action according to current Q and exploration
                best_action, q_vals, h_state = self.get_best_action(
                    [q_input], h_state, slen, [action])
                action = exp_schedule.get_action(best_action)

                # store q values
                max_q_values.append(max(q_vals))
                q_values += list(q_vals)
                # perform action in env
                new_state, reward, done = self.env.step(action)

                # store the transition
                replay_buffer.store_effect(idx, action, reward, done)
                state = new_state

                # perform a training step
                loss_eval, grad_eval = self.train_step(t, replay_buffer,
                                                       lr_schedule.epsilon)

                # logging stuff
                if ((t > self.config.learning_start)
                        and (t % self.config.log_freq == 0)
                        and (t % self.config.learning_freq == 0)):
                    self.update_averages(rewards, max_q_values, q_values,
                                         scores_eval)
                    exp_schedule.update(t)
                    lr_schedule.update(t)
                    if len(rewards) > 0:
                        prog.update(t + 1,
                                    exact=[("Loss", loss_eval),
                                           ("Avg R", self.avg_reward),
                                           ("Max R", np.max(rewards)),
                                           ("eps", exp_schedule.epsilon),
                                           ("Grads", grad_eval),
                                           ("Max Q", self.max_q),
                                           ("lr", lr_schedule.epsilon)])

                elif (t < self.config.learning_start) and (
                        t % self.config.log_freq == 0):
                    sys.stdout.write("\rPopulating the memory {}/{}...".format(
                        t, self.config.learning_start))
                    sys.stdout.flush()

                # count reward
                total_reward += reward
                if done or t >= self.config.nsteps_train:
                    break

            # updates to perform at the end of an episode
            rewards.append(total_reward)

            if (t > self.config.learning_start) and (last_eval >
                                                     self.config.eval_freq):
                # evaluate our policy
                last_eval = 0
                print("")
                self.logger.info("Global step: %d" % (t))
                scores_eval += [self.evaluate()]

        # last words
        self.logger.info("- Training done.")
        self.save(t)
        scores_eval += [self.evaluate()]
        export_plot(scores_eval, "Scores", self.config.plot_output)
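
exp_schedule above only needs to expose epsilon, update(t) and get_action(best_action); here is a minimal linear epsilon-greedy sketch under that assumption (the class name and the num_actions argument are illustrative, not taken from the source).

import numpy as np

class LinearExploration(object):
    """Linearly anneals epsilon and replaces the greedy action with a random one
    with probability epsilon. Illustrative only."""

    def __init__(self, num_actions, eps_begin, eps_end, nsteps):
        self.num_actions = num_actions
        self.epsilon = eps_begin
        self.eps_begin, self.eps_end, self.nsteps = eps_begin, eps_end, nsteps

    def update(self, t):
        # interpolate from eps_begin to eps_end over nsteps, then stay at eps_end
        frac = min(float(t) / self.nsteps, 1.0)
        self.epsilon = self.eps_begin + frac * (self.eps_end - self.eps_begin)

    def get_action(self, best_action):
        # explore with probability epsilon, otherwise act greedily
        if np.random.random() < self.epsilon:
            return np.random.randint(self.num_actions)
        return best_action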
Example #7
    def evaluate(self,
                 cr_schedule,
                 curri_idx=None,
                 env=None,
                 num_episodes=None):
        """
        Evaluation with same procedure as the training
        """
        if curri_idx is None:
            curri_idx = -1
        # log our activity only if default call
        if num_episodes is None:
            self.logger.info("Evaluating...")

        # arguments defaults
        if num_episodes is None:
            num_episodes = self.config.num_episodes_test

        if env is None:
            env = self.env

        accs = []
        gt_len = []

        for i in range(num_episodes):
            config = self.config
            config.n_node, config.k_ring, config.p_rewiring, config.path_len_limit, config.planning_len = cr_schedule[
                curri_idx]
            env.reset(config)  # h x w x c
            h_state = DNC.zero_state(config, batch_size=1)
            encoding, predflag, target_action = env.prepare_seq()
            slen = np.array(encoding.shape[0]).astype('int32')
            # describe graph, query and planning
            h_state = self.sess.run(self.hs_out,
                                    feed_dict={
                                        self.s: encoding[None],
                                        self.hs: h_state,
                                        self.slen: slen
                                    })
            past_state = -1
            past_action_onehot = -1
            path_len = 0
            for j in range(config.max_step_len):
                gt_action = env.get_gt_action()
                next_state = env.next_state(gt_action)
                if self.config.use_transition_only_during_answering:
                    current_encoding = GraphWorld.convert_triplets_to_encoding(
                        np.array([[-1, -1,
                                   past_action_onehot]]).astype('int32'),
                        config.ndigits, config.nway)
                else:
                    current_encoding = GraphWorld.convert_triplets_to_encoding(
                        np.array([[env.current_state, next_state,
                                   -1]]).astype('int32'), config.ndigits,
                        config.nway)
                    #current_encoding = GraphWorld.convert_triplets_to_encoding(np.array([[env.current_state, env.target_state, past_action_onehot]]).astype('int32'), config.ndigits, config.nway)
                current_encoding = np.concatenate(
                    [current_encoding, np.array([[0, 1]])], axis=1)
                pred_action, h_state = self.sess.run(
                    [self.q, self.hs_out],
                    feed_dict={
                        self.s: current_encoding[None],
                        self.hs: h_state,
                        self.slen: np.ones(1).astype('int32')
                    })
                past_state = env.current_state
                _, done, past_action_onehot = env.step(pred_action.reshape(-1))
                path_len += 1
                if done:
                    break

            accs.append(
                len(env.path[env.src_state]) == path_len
                and env.current_state == env.target_state)
            gt_len.append(len(env.path[env.src_state]))

        avg_acc = np.mean(accs)
        if num_episodes > 1:
            msg = "Average acc: {:04.2f}".format(avg_acc)
            self.logger.info(msg)
        return avg_acc
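
cr_schedule is indexed by curri_idx and unpacked into five curriculum parameters per lesson; a hedged sketch of what such a schedule could look like (all concrete values are made up for illustration).

# each lesson is (n_node, k_ring, p_rewiring, path_len_limit, planning_len),
# matching the unpacking in this evaluate(); values are illustrative only
cr_schedule = [
    (6, 2, 0.1, 2, 4),    # lesson 0: small graph, short paths
    (10, 3, 0.2, 3, 6),   # lesson 1
    (16, 4, 0.3, 4, 8),   # hardest lesson; curri_idx=-1 evaluates on this one
]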
Example #8
    def train(self, beta_schedule, lr_schedule, cr_schedule):
        """
        Performs training of Q

        Args:
            beta_schedule: schedule controlling how often the ground-truth action
                is taken instead of the predicted one (via get_action)
            lr_schedule: schedule for the learning rate
            cr_schedule: curriculum schedule of graph/task parameters, indexed by lesson
        """
        self.init_averages()

        t = last_eval = curri_idx = 0  # time control of nb of steps
        scores_eval = []  # evaluation scores collected over training

        prog = Progbar(target=self.config.nsteps_train)

        # interact with environment
        while t < self.config.nsteps_train:
            t += 1
            last_eval += 1
            config = self.config
            config.n_node, config.k_ring, config.p_rewiring, config.path_len_limit = cr_schedule[
                curri_idx]
            self.env.reset(config)  # h x w x c
            h_state = DNC.zero_state(config, batch_size=1)
            encoding, predflag, target_action = self.env.prepare_seq()
            slen = np.array(encoding.shape[0]).astype('int32')
            # describe graph, query and planning
            h_state = self.sess.run(self.hs_out,
                                    feed_dict={
                                        self.s: encoding[None],
                                        self.hs: h_state,
                                        self.slen: slen
                                    })
            past_state = -1
            past_action_onehot = -1
            encoding_a = np.zeros([config.max_step_len, encoding.shape[1]])
            predflag_a = np.zeros(config.max_step_len)
            target_action_a = np.zeros(
                [config.max_step_len, target_action.shape[1]])
            for i in range(config.max_step_len):
                if self.config.use_transition_only_during_answering:
                    current_encoding = GraphWorld.convert_triplets_to_encoding(
                        np.array([[-1, -1,
                                   past_action_onehot]]).astype('int32'),
                        config.ndigits, config.nway)
                else:
                    current_encoding = GraphWorld.convert_triplets_to_encoding(
                        np.array([[
                            past_state, self.env.current_state,
                            past_action_onehot
                        ]]).astype('int32'), config.ndigits, config.nway)
                    #current_encoding = GraphWorld.convert_triplets_to_encoding(np.array([[self.env.current_state, self.env.target_state, past_action_onehot]]).astype('int32'), config.ndigits, config.nway)
                current_encoding = np.concatenate(
                    [current_encoding, np.array([[0, 1]])], axis=1)
                gt_action = self.env.get_gt_action()
                encoding_a[i, :] = current_encoding[0]
                predflag_a[i] = 1
                target_action_a[i, :] = gt_action

                pred_action, h_state = self.sess.run(
                    [self.q, self.hs_out],
                    feed_dict={
                        self.s: current_encoding[None],
                        self.hs: h_state,
                        self.slen: np.ones(1).astype('int32')
                    })
                action = self.get_action(pred_action.reshape(-1), gt_action,
                                         beta_schedule.epsilon)
                past_state = self.env.current_state
                _, done, past_action_onehot = self.env.step(action)
                slen += 1
                if done:
                    break

            batch_data = DatasetTensors(
                np.concatenate([encoding, encoding_a], axis=0)[None],
                np.concatenate([target_action, target_action_a], axis=0)[None],
                np.concatenate([predflag, predflag_a], axis=0)[None], slen)

            # perform a training step
            loss_eval, grad_eval = self.train_step(t, lr_schedule.epsilon,
                                                   batch_data)

            # logging stuff
            if ((t % config.log_freq == 0)
                    and (t % config.learning_freq == 0)):
                self.update_averages(scores_eval)
                beta_schedule.update(t)
                lr_schedule.update(t)
                prog.update(t + 1,
                            exact=[("Loss", loss_eval), ("Grads", grad_eval),
                                   ("lr", lr_schedule.epsilon)])

            if t >= config.nsteps_train:
                break

            if last_eval >= config.eval_freq:
                # evaluate our policy
                last_eval = 0
                print("")
                self.logger.info("Global step: %d" % (t))
                scores_eval += [self.evaluate(cr_schedule, curri_idx)]
                if scores_eval[-1] > 0.8:
                    curri_idx += 1
                    msg = "Upgrade to lesson {:d}".format(int(curri_idx))
                    self.logger.info(msg)
                    self.logger.info(
                        "----------Start Computing Final Score----------")
                    scores_eval += [self.evaluate(cr_schedule)]
                    self.logger.info(
                        "----------Finish Computing Final Score----------")

        # last words
        self.logger.info("- Training done.")
        self.save(t)
        scores_eval += [self.evaluate(cr_schedule)]
        export_plot(scores_eval, "Scores", self.config.plot_output)
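
self.get_action above mixes the predicted action with the ground-truth action according to beta_schedule.epsilon; a minimal sketch of that mixing rule, under the assumption that beta is the probability of following the ground truth (the argument semantics are not confirmed by the source).

import numpy as np

def get_action(pred_action, gt_action, beta):
    """With probability beta follow the ground-truth action, otherwise follow the
    model's prediction; both are kept in whatever encoding env.step() expects."""
    if np.random.random() < beta:
        return gt_action
    return pred_action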