Example #1
    def add_placeholders_op(self):
        """
        Adds placeholders to the graph

        These placeholders are used as inputs by the rest of the model building and will be fed
        data during training.
        """
        # here, typically, a state shape is (5,3,22)

        state_shape = list([
            self.env.args.visible_radius_unit_front + 1,
            2 * self.env.args.visible_radius_unit_side + 1,
            len(self.env.state.xmap.item_class_id) + 1
        ])
        self.s = tf.placeholder(tf.bool,
                                shape=(None, None, state_shape[0],
                                       state_shape[1], state_shape[2]))
        self.hs = DNC.state_placeholder(self.config)
        self.slen = tf.placeholder(tf.int32, shape=(None,))  # (nb,)
        self.sp = tf.placeholder(tf.bool,
                                 shape=(None, None, state_shape[0],
                                        state_shape[1], state_shape[2]))
        self.hsp = DNC.state_placeholder(self.config)
        self.splen = tf.placeholder(tf.int32, shape=(None,))  # (nb,)

        self.a = tf.placeholder(tf.int32, shape=(None,))  # (nb*state_history,)
        self.past_a = tf.placeholder(tf.int32,
                                     shape=(None,))  # (nb*state_history,)
        self.r = tf.placeholder(tf.float32,
                                shape=(None,))  # (nb*state_history,)
        self.done_mask = tf.placeholder(tf.bool,
                                        shape=(None,))  # (nb*state_history,)
        self.seq_mask = tf.placeholder(tf.bool,
                                       shape=(None,))  # (nb*state_history,)
        self.lr = tf.placeholder(tf.float32, shape=())  # scalar learning rate

    def update_step(self, t, lr, batch_data):
        """
        Performs a parameter update on the provided batch

        Args:
            t: iteration number (episode and move), used to tag the summary
            lr: (float) learning rate
            batch_data: DatasetTensors batch with observations, target, mask and seqlen
        Returns:
            loss: (Q - Q_target)^2
        """

        fd = {
            # inputs
            self.s: batch_data.observations,
            self.target_action: batch_data.target,
            self.pred_flag: batch_data.mask,
            self.slen: batch_data.seqlen,
            self.hs: DNC.zero_state(self.config, self.config.batch_size),
            self.lr: lr, 
            # extra info
            self.eval_acc_placeholder: self.eval_acc
        }

        loss_eval, grad_norm_eval, summary, _ = self.sess.run(
            [self.loss, self.grad_norm, self.merged, self.train_op],
            feed_dict=fd)

        # tensorboard stuff
        self.file_writer.add_summary(summary, t)
        
        return loss_eval, grad_norm_eval

    def get_q_values_op(self, state, seq_len, h_state, scope, reuse=False):
        """
        Returns Q values for all actions

        Args:
            state: (tf tensor) 
                shape = (batch_size, seq_len, img_w, img_h, nchannel)
            seq_len: (tf tensor)
                shape = (batch_size,)
            h_state: (tf tensor) 
                shape = (batch_size, h_size)
            scope: (string) scope name, that specifies if target network or not
            reuse: (bool) reuse of variables in the scope

        Returns:
            out: (tf tensor) of shape = (batch_size, seq_len, 2*(num_classes-1)*ndigits*nway)
            h_state_out: (tf tensor) of shape = (batch_size, h_size)
        """
        num_classes = len(self.env.state.xmap.item_class_id)
        ndigits = self.config.ndigits
        nway = self.config.nway
        dnc_h_size = self.config.dnc_h_size
        out = state
        with tf.variable_scope(scope, reuse=reuse):
            dnc_cell = DNC(config=self.config, output_size=dnc_h_size,
                           clip_value=self.config.dnc_clip_val)
            out, h_state_out = tf.nn.dynamic_rnn(
                inputs=out, cell=dnc_cell, sequence_length=seq_len,
                dtype=tf.float32, initial_state=h_state)
            # out has shape (nb, seq_len, dnc_h_size)
            out = layers.fully_connected(out, 256, activation_fn=tf.nn.relu,
                                         weights_initializer=layers.xavier_initializer(),
                                         biases_initializer=tf.zeros_initializer())
            out = layers.fully_connected(out, 2 * (num_classes - 1) * ndigits * nway,
                                         activation_fn=None,
                                         weights_initializer=layers.xavier_initializer(),
                                         biases_initializer=tf.zeros_initializer())
            # out has shape (nb, seq_len, 2*(num_classes-1)*ndigits*nway)
        return out, h_state_out
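
Example #1's get_q_values_op follows a common TF 1.x pattern: run the input sequence through a recurrent cell with tf.nn.dynamic_rnn (passing sequence_length so padded steps are ignored) and apply per-time-step fully connected layers to the outputs. The sketch below reproduces that pattern in a self-contained form; a stock LSTMCell stands in for the project-specific DNC cell, and all sizes are illustrative assumptions.

import tensorflow as tf
from tensorflow.contrib import layers

# Illustrative sizes only (not taken from the examples above).
feat_dim, rnn_size, out_dim = 32, 64, 10

s = tf.placeholder(tf.float32, shape=(None, None, feat_dim))  # (batch, time, feat)
seq_len = tf.placeholder(tf.int32, shape=(None,))             # true length of each sequence

with tf.variable_scope("q_sketch"):
    # A standard LSTMCell stands in for the custom DNC cell.
    cell = tf.nn.rnn_cell.LSTMCell(rnn_size)
    rnn_out, final_state = tf.nn.dynamic_rnn(
        cell, s, sequence_length=seq_len, dtype=tf.float32)
    # Per-time-step head, mirroring the example: a ReLU layer, then a linear layer.
    hidden = layers.fully_connected(rnn_out, 256, activation_fn=tf.nn.relu)
    q_out = layers.fully_connected(hidden, out_dim, activation_fn=None)
# q_out has shape (batch, time, out_dim); final_state can seed the next call.
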
Example #4
    def update_step(self, t, replay_buffer, lr):
        """
        Performs an update of parameters by sampling from replay_buffer

        Args:
            t: number of iteration (episode and move)
            replay_buffer: ReplayBuffer instance .sample() gives batches
            lr: (float) learning rate
        Returns:
            loss: (Q - Q_target)^2
        """

        s_batch, slen_batch, a_batch, past_a_batch, r_batch, done_mask_batch, seq_mask_batch, sp_batch, splen_batch = replay_buffer.sample_batch(
            self.config.batch_size)

        fd = {
            # inputs
            self.s: s_batch,
            self.slen: slen_batch,
            self.hs: DNC.zero_state(self.config, self.config.batch_size),
            self.a: a_batch,
            self.past_a: past_a_batch,
            self.r: r_batch,
            self.sp: sp_batch,
            self.splen: splen_batch,
            self.hsp: DNC.zero_state(self.config, self.config.batch_size),
            self.done_mask: done_mask_batch,
            self.seq_mask: seq_mask_batch,
            self.lr: lr,
            # extra info
            self.avg_reward_placeholder: self.avg_reward,
            self.max_reward_placeholder: self.max_reward,
            self.std_reward_placeholder: self.std_reward,
            self.avg_q_placeholder: self.avg_q,
            self.max_q_placeholder: self.max_q,
            self.std_q_placeholder: self.std_q,
            self.eval_reward_placeholder: self.eval_reward,
        }

        loss_eval, grad_norm_eval, summary, _ = self.sess.run(
            [self.loss, self.grad_norm, self.merged, self.train_op],
            feed_dict=fd)

        # tensorboard stuff
        self.file_writer.add_summary(summary, t)

        return loss_eval, grad_norm_eval

    def evaluate(self, model_i, curri_idx=None, env=None, num_episodes=None):
        """
        Evaluation with same procedure as the training
        """
        if curri_idx is None:
            curri_idx = -1
        # log our activity only if default call
        if num_episodes is None:
            self.logger.info("Evaluating...")

        # arguments defaults
        if num_episodes is None:
            num_episodes = self.config.num_episodes_test

        if env is None:
            env = self.env

        accs = []

        for i in range(num_episodes):
            encoding_batch = []
            predflag_batch = []
            target_action_batch = []
            slen_batch = []
            max_len = 0
            for j in range(self.config.batch_size):
                #config = self.config
                #config.n_node, config.k_ring, config.p_rewiring, config.path_len_limit, config.planning_len = cr_schedule[curri_idx]
                #self.env.reset(config) # h x w x c
                encoding, target_action, predflag = model_i.gen_sample_seq(self.config.ndigits, self.config.nway)
                encoding_batch.append(encoding[None])
                predflag_batch.append(predflag[None])
                target_action_batch.append(target_action[None])
                slen_batch.append(encoding.shape[0])
                if encoding.shape[0] > max_len:
                    max_len = encoding.shape[0]

            # pad every sequence to max_len with zeros, then stack into one batch
            batch_data = DatasetTensors(
                np.concatenate([np.concatenate([x, np.zeros([1, max_len - x.shape[1], x.shape[2]])], axis=1)
                                for x in encoding_batch], axis=0),
                np.concatenate([np.concatenate([x, np.zeros([1, max_len - x.shape[1], x.shape[2]])], axis=1)
                                for x in target_action_batch], axis=0),
                np.concatenate([np.concatenate([x, np.zeros([1, max_len - x.shape[1]])], axis=1)
                                for x in predflag_batch], axis=0),
                np.array(slen_batch).astype('int32'))

            h_state = DNC.zero_state(self.config, batch_size=self.config.batch_size)
            pred_action, h_state = self.sess.run(
                [self.q, self.hs_out],
                feed_dict={self.s: batch_data.observations,
                           self.hs: h_state,
                           self.slen: batch_data.seqlen})
            for j in range(self.config.batch_size):
                mask_j = np.expand_dims(batch_data.mask[j], 1)
                accs.append((pred_action[j] * mask_j ==
                             batch_data.target[j] * mask_j).reshape(-1).all())

        avg_acc = np.mean(accs)
        if num_episodes > 1:
            msg = "Average acc: {:04.2f}".format(avg_acc)
            self.logger.info(msg)
        return avg_acc
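
The evaluation above pads every sequence to the longest one in the batch and counts an episode as correct only if all masked predictions match the targets exactly; since the padding is zeros, the mask is what keeps padded steps from affecting the comparison. A self-contained NumPy illustration of that masked exact-match check, with made-up shapes and values:

import numpy as np

# Toy batch: 2 sequences, 4 time steps, 3 output units (shapes chosen for illustration).
pred = np.random.randint(0, 2, size=(2, 4, 3))
target = pred.copy()
target[1, 3, 0] ^= 1                       # plant one mismatch at a masked step of sequence 1
mask = np.array([[1., 1., 1., 0.],         # only the first three steps of sequence 0 count
                 [1., 1., 1., 1.]])

accs = []
for j in range(pred.shape[0]):
    m = np.expand_dims(mask[j], 1)         # broadcast the step mask over the output units
    accs.append((pred[j] * m == target[j] * m).reshape(-1).all())

print(accs, np.mean(accs))                 # [True, False] 0.5
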
Example #6
    def update_step(self, t, lr, batch_data):
        """
        Performs a parameter update on the provided batch

        Args:
            t: iteration number (episode and move), used to tag the summary
            lr: (float) learning rate
            batch_data: DatasetTensors batch with observations, target, mask and seqlen
        Returns:
            loss: (Q - Q_target)^2
        """

        fd = {
            # inputs
            self.s: batch_data.observations,
            self.target_action: batch_data.target,
            self.pred_flag: batch_data.mask,
            self.slen: batch_data.seqlen,
            self.hs: DNC.zero_state(self.config, self.config.batch_size),
            self.lr: lr,
            # extra info
            self.eval_acc_placeholder: self.eval_acc
        }

        loss_eval, grad_norm_eval, summary, _ = self.sess.run(
            [self.loss, self.grad_norm, self.merged, self.train_op],
            feed_dict=fd)
        '''
        run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
        run_metadata = tf.RunMetadata()

        loss_eval, grad_norm_eval, summary, _ = self.sess.run([self.loss, self.grad_norm, 
                                                 self.merged, self.train_op], feed_dict=fd, options=run_options, run_metadata=run_metadata)

        # Create the Timeline object, and write it to a json
        tl = timeline.Timeline(run_metadata.step_stats)
        ctf = tl.generate_chrome_trace_format()
        with open('timeline.json', 'w') as f:
            f.write(ctf)
        '''

        # tensorboard stuff
        self.file_writer.add_summary(summary, t)

        return loss_eval, grad_norm_eval

    def add_placeholders_op(self):
        """
        Adds placeholders to the graph

        These placeholders are used as inputs by the rest of the model building and will be fed
        data during training.
        """
        # here, typically, a state shape is (2*4*3+2)

        num_classes = len(self.env.state.xmap.item_class_id)
        ndigits = self.config.ndigits
        nway = self.config.nway
        self.s = tf.placeholder(tf.float32, shape=(None, None, 2*(num_classes-1)*ndigits*nway+2))
        self.hs = DNC.state_placeholder(self.config)
        self.slen = tf.placeholder(tf.int32, shape=(None))
        self.pred_flag = tf.placeholder(tf.float32, shape=(None, None)) # (nb, state_history)
        self.target_action = tf.placeholder(
            tf.float32,
            shape=(None, None, 2*(num_classes-1)*ndigits*nway))  # (nb, state_history, num_actions)
        self.lr = tf.placeholder(tf.float32, shape=(None))
Example #8
    def evaluate(self, env=None, num_episodes=None):
        """
        Evaluation with same procedure as the training
        """
        # log our activity only if default call
        if num_episodes is None:
            self.logger.info("Evaluating...")

        # arguments defaults
        if num_episodes is None:
            num_episodes = self.config.num_episodes_test

        if env is None:
            env = self.env
        env.state.is_render_image = self.config.render_test

        # replay memory to play
        replay_buffer = ReplayBuffer(self.config.buffer_size,
                                     self.config.state_history)
        rewards = []

        for i in range(num_episodes):
            total_reward = 0
            state = env.reset()
            h_state = DNC.zero_state(self.config, batch_size=1)
            slen = np.ones(1).astype('int32')
            action = 0
            for j in range(50):
                if self.config.render_test: env.render()

                #### for replay_buffer
                # store last state in buffer
                idx = replay_buffer.store_frame(state)
                q_input = replay_buffer.encode_recent_observation()

                action, action_q, h_state = self.get_action([q_input], h_state,
                                                            slen, [action])
                #print(action, action_q)

                # perform action in env
                new_state, reward, done = env.step(action)

                # store in replay memory
                replay_buffer.store_effect(idx, action, reward, done)
                state = new_state

                # count reward
                total_reward += reward
                if done:
                    break

            # updates to perform at the end of an episode
            rewards.append(total_reward)

        avg_reward = np.mean(rewards)
        sigma_reward = np.sqrt(np.var(rewards) / len(rewards))

        if num_episodes > 1:
            msg = "Average reward: {:04.2f} +/- {:04.2f}".format(
                avg_reward, sigma_reward)
            self.logger.info(msg)

        return avg_reward
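
Example #8 summarises the evaluation episodes as the mean reward plus/minus the standard error of the mean, sqrt(Var/N). For reference, the same computation in isolation, on toy returns:

import numpy as np

rewards = [1.0, 0.5, 2.0, 1.5]                           # toy episode returns
avg_reward = np.mean(rewards)
sigma_reward = np.sqrt(np.var(rewards) / len(rewards))   # standard error of the mean
print("Average reward: {:04.2f} +/- {:04.2f}".format(avg_reward, sigma_reward))

Note that np.var defaults to the population variance (ddof=0), which is exactly what the example computes.
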
Example #9
    def train(self, exp_schedule, lr_schedule):
        """
        Performs training of Q

        Args:
            exp_schedule: Exploration instance s.t.
                exp_schedule.get_action(best_action) returns an action
            lr_schedule: Schedule for learning rate
        """

        # initialize replay buffer and variables
        replay_buffer = ReplayBuffer(self.config.buffer_size,
                                     self.config.state_history)
        rewards = deque(maxlen=self.config.num_episodes_test)
        max_q_values = deque(maxlen=1000)
        q_values = deque(maxlen=1000)
        self.init_averages()

        t = last_eval = last_record = 0  # time control of nb of steps
        scores_eval = []  # list of scores computed at iteration time
        #scores_eval += [self.evaluate()]

        prog = Progbar(target=self.config.nsteps_train)

        self.env.state.is_render_image = self.config.render_train

        # interact with environment
        while t < self.config.nsteps_train:
            total_reward = 0
            state = self.env.reset()  # h x w x c
            h_state = DNC.zero_state(self.config, batch_size=1)
            slen = np.ones(1).astype('int32')
            action = 0
            for i in range(200):
                t += 1
                last_eval += 1
                last_record += 1
                if self.config.render_train: self.env.render()

                #### for replay_buffer
                # replay memory stuff
                idx = replay_buffer.store_frame(state)
                q_input = replay_buffer.encode_recent_observation()

                # choose action according to current Q and exploration
                best_action, q_vals, h_state = self.get_best_action(
                    [q_input], h_state, slen, [action])
                action = exp_schedule.get_action(best_action)

                # store q values for logging
                max_q_values.append(max(q_vals))
                q_values += list(q_vals)
                # perform action in env
                new_state, reward, done = self.env.step(action)

                # store the transition
                replay_buffer.store_effect(idx, action, reward, done)
                state = new_state

                # perform a training step
                loss_eval, grad_eval = self.train_step(t, replay_buffer,
                                                       lr_schedule.epsilon)

                # logging stuff
                if ((t > self.config.learning_start)
                        and (t % self.config.log_freq == 0)
                        and (t % self.config.learning_freq == 0)):
                    self.update_averages(rewards, max_q_values, q_values,
                                         scores_eval)
                    exp_schedule.update(t)
                    lr_schedule.update(t)
                    if len(rewards) > 0:
                        prog.update(t + 1,
                                    exact=[("Loss", loss_eval),
                                           ("Avg R", self.avg_reward),
                                           ("Max R", np.max(rewards)),
                                           ("eps", exp_schedule.epsilon),
                                           ("Grads", grad_eval),
                                           ("Max Q", self.max_q),
                                           ("lr", lr_schedule.epsilon)])

                elif (t < self.config.learning_start) and (
                        t % self.config.log_freq == 0):
                    sys.stdout.write("\rPopulating the memory {}/{}...".format(
                        t, self.config.learning_start))
                    sys.stdout.flush()

                # count reward
                total_reward += reward
                if done or t >= self.config.nsteps_train:
                    break

            # updates to perform at the end of an episode
            rewards.append(total_reward)

            if (t > self.config.learning_start) and (last_eval >
                                                     self.config.eval_freq):
                # evaluate our policy
                last_eval = 0
                print("")
                self.logger.info("Global step: %d" % (t))
                scores_eval += [self.evaluate()]

        # last words
        self.logger.info("- Training done.")
        self.save(t)
        scores_eval += [self.evaluate()]
        export_plot(scores_eval, "Scores", self.config.plot_output)
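
The update_step docstrings describe the training loss as (Q - Q_target)^2, with the reward r and the done_mask fed from the replay buffer. The target construction itself is not shown in these snippets; the sketch below is the standard DQN form of it, assuming a discount factor gamma, and is only meant to make the docstring concrete.

import numpy as np

gamma = 0.99                                    # assumed discount factor
r = np.array([1.0, 0.0, -1.0])                  # sampled rewards
done = np.array([False, False, True])           # done_mask for the sampled transitions
q = np.array([0.8, 0.2, 0.5])                   # Q(s, a) for the actions that were taken
q_next_max = np.array([1.0, 0.6, 0.9])          # max over actions of the target network at s'

# Bellman target: bootstrap from the next state only for non-terminal transitions.
q_target = r + gamma * q_next_max * (1.0 - done.astype(np.float32))
loss = np.mean((q - q_target) ** 2)             # the (Q - Q_target)^2 loss from the docstring
print(q_target, loss)
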
Example #10
    def get_q_values_op(self,
                        state,
                        past_a,
                        seq_len,
                        h_state,
                        scope,
                        reuse=False):
        """
        Returns Q values for all actions

        Args:
            state: (tf tensor)
                shape = (batch_size, seq_len, img_w, img_h, nchannel)
            past_a: (tf tensor) indices of previous actions
                shape = (batch_size * seq_len,)
            seq_len: (tf tensor)
                shape = (batch_size,)
            h_state: (tf tensor)
                shape = (batch_size, h_size)
            scope: (string) scope name, that specifies if target network or not
            reuse: (bool) reuse of variables in the scope

        Returns:
            out: (tf tensor) of shape = (batch_size * seq_len, num_actions)
            h_state_out: (tf tensor) of shape = (batch_size, h_size)
        """
        num_actions = self.env.agent.num_actions
        dnc_h_size = self.config.dnc_h_size
        out = state
        past_a = tf.one_hot(past_a, num_actions)
        state_shape = list([
            self.env.args.visible_radius_unit_front + 1,
            2 * self.env.args.visible_radius_unit_side + 1,
            len(self.env.state.xmap.item_class_id) + 1
        ])

        with tf.variable_scope(scope, reuse=reuse):
            dnc_cell = DNC(config=self.config,
                           output_size=dnc_h_size,
                           clip_value=self.config.dnc_clip_val)
            out = tf.reshape(
                out,
                shape=[-1, state_shape[0] * state_shape[1] * state_shape[2]])
            out = tf.concat([out, past_a], axis=1)
            out = tf.reshape(
                out,
                shape=[
                    -1, state_shape[0] * state_shape[1] * state_shape[2] +
                    num_actions
                ])
            out = layers.fully_connected(
                out,
                200,
                activation_fn=tf.nn.relu,
                weights_initializer=layers.xavier_initializer(),
                biases_initializer=tf.zeros_initializer())
            out = layers.fully_connected(
                out,
                100,
                activation_fn=tf.nn.relu,
                weights_initializer=layers.xavier_initializer(),
                biases_initializer=tf.zeros_initializer())
            out = tf.reshape(out, shape=[-1, tf.shape(state)[1], 100])
            out, h_state_out = tf.nn.dynamic_rnn(inputs=out,
                                                 cell=dnc_cell,
                                                 sequence_length=seq_len,
                                                 dtype=tf.float32,
                                                 initial_state=h_state)
            # out here has shape (batch_size, state_history, dnc_h_size)
            #out = layers.fully_connected(tf.reshape(out, shape=[-1,dnc_h_size]), num_actions, activation_fn = None, weights_initializer=layers.xavier_initializer(), biases_initializer=tf.zeros_initializer())
            out = tf.reshape(out, shape=[-1, dnc_h_size])
            streamA, streamV = tf.split(out, 2, axis=1)
            advantage = layers.fully_connected(
                streamA,
                num_actions,
                activation_fn=None,
                weights_initializer=layers.xavier_initializer(),
                biases_initializer=tf.zeros_initializer())
            value = layers.fully_connected(
                streamV,
                1,
                activation_fn=None,
                weights_initializer=layers.xavier_initializer(),
                biases_initializer=tf.zeros_initializer())
            out = value + tf.subtract(
                advantage, tf.reduce_mean(advantage, axis=1, keep_dims=True))

        return out, h_state_out
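
The head in Example #10 is a dueling architecture: the recurrent output is split into an advantage stream and a value stream, and the Q-values are recombined as Q = V + (A - mean_a A). Subtracting the mean advantage keeps the value/advantage decomposition identifiable. The recombination on toy numbers:

import numpy as np

value = np.array([[0.5], [1.0]])                            # (batch, 1)
advantage = np.array([[1.0, 2.0, 3.0], [0.0, 1.0, -1.0]])   # (batch, num_actions)

q = value + (advantage - advantage.mean(axis=1, keepdims=True))
print(q)
# [[-0.5  0.5  1.5]
#  [ 1.   2.   0. ]]
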
Example #11
    def evaluate(self,
                 cr_schedule,
                 curri_idx=None,
                 env=None,
                 num_episodes=None):
        """
        Evaluation with same procedure as the training
        """
        if curri_idx is None:
            curri_idx = -1
        # log our activity only if default call
        if num_episodes is None:
            self.logger.info("Evaluating...")

        # arguments defaults
        if num_episodes is None:
            num_episodes = self.config.num_episodes_test

        if env is None:
            env = self.env

        accs = []
        gt_len = []

        for i in range(num_episodes):
            config = self.config
            config.n_node, config.k_ring, config.p_rewiring, config.path_len_limit, config.planning_len = cr_schedule[
                curri_idx]
            env.reset(config)  # h x w x c
            h_state = DNC.zero_state(config, batch_size=1)
            encoding, predflag, target_action = env.prepare_seq()
            slen = np.array(encoding.shape[0]).astype('int32')
            # describe graph, query and planning
            h_state = self.sess.run(self.hs_out,
                                    feed_dict={
                                        self.s: encoding[None],
                                        self.hs: h_state,
                                        self.slen: slen
                                    })
            past_state = -1
            past_action_onehot = -1
            path_len = 0
            for _ in range(config.max_step_len):
                gt_action = env.get_gt_action()
                next_state = env.next_state(gt_action)
                if self.config.use_transition_only_during_answering:
                    current_encoding = GraphWorld.convert_triplets_to_encoding(
                        np.array([[-1, -1,
                                   past_action_onehot]]).astype('int32'),
                        config.ndigits, config.nway)
                else:
                    current_encoding = GraphWorld.convert_triplets_to_encoding(
                        np.array([[env.current_state, next_state,
                                   -1]]).astype('int32'), config.ndigits,
                        config.nway)
                    #current_encoding = GraphWorld.convert_triplets_to_encoding(np.array([[env.current_state, env.target_state, past_action_onehot]]).astype('int32'), config.ndigits, config.nway)
                current_encoding = np.concatenate(
                    [current_encoding, np.array([[0, 1]])], axis=1)
                pred_action, h_state = self.sess.run(
                    [self.q, self.hs_out],
                    feed_dict={
                        self.s: current_encoding[None],
                        self.hs: h_state,
                        self.slen: np.ones(1).astype('int32')
                    })
                past_state = env.current_state
                _, done, past_action_onehot = env.step(pred_action.reshape(-1))
                path_len += 1
                if done:
                    break

            accs.append(
                len(env.path[env.src_state]) == path_len
                and env.current_state == env.target_state)
            gt_len.append(len(env.path[env.src_state]))

        avg_acc = np.mean(accs)
        if num_episodes > 1:
            msg = "Average acc: {:04.2f}".format(avg_acc)
            self.logger.info(msg)
        return avg_acc
Example #12
    def train(self, beta_schedule, lr_schedule, cr_schedule):
        """
        Performs training of Q

        Args:
            beta_schedule: schedule for beta, used by get_action to trade off
                ground-truth and predicted actions
            lr_schedule: schedule for the learning rate
            cr_schedule: curriculum schedule; cr_schedule[curri_idx] gives the
                graph parameters (n_node, k_ring, p_rewiring, path_len_limit)
        """
        self.init_averages()

        t = last_eval = curri_idx = 0  # time control of nb of steps
        scores_eval = []  # list of scores computed at iteration time

        prog = Progbar(target=self.config.nsteps_train)

        # interact with environment
        while t < self.config.nsteps_train:
            t += 1
            last_eval += 1
            config = self.config
            config.n_node, config.k_ring, config.p_rewiring, config.path_len_limit = cr_schedule[
                curri_idx]
            self.env.reset(config)  # h x w x c
            h_state = DNC.zero_state(config, batch_size=1)
            encoding, predflag, target_action = self.env.prepare_seq()
            slen = np.array(encoding.shape[0]).astype('int32')
            # describe graph, query and planning
            h_state = self.sess.run(self.hs_out,
                                    feed_dict={
                                        self.s: encoding[None],
                                        self.hs: h_state,
                                        self.slen: slen
                                    })
            past_state = -1
            past_action_onehot = -1
            encoding_a = np.zeros([config.max_step_len, encoding.shape[1]])
            predflag_a = np.zeros(config.max_step_len)
            target_action_a = np.zeros(
                [config.max_step_len, target_action.shape[1]])
            for i in range(config.max_step_len):
                if self.config.use_transition_only_during_answering:
                    current_encoding = GraphWorld.convert_triplets_to_encoding(
                        np.array([[-1, -1,
                                   past_action_onehot]]).astype('int32'),
                        config.ndigits, config.nway)
                else:
                    current_encoding = GraphWorld.convert_triplets_to_encoding(
                        np.array([[
                            past_state, self.env.current_state,
                            past_action_onehot
                        ]]).astype('int32'), config.ndigits, config.nway)
                    #current_encoding = GraphWorld.convert_triplets_to_encoding(np.array([[self.env.current_state, self.env.target_state, past_action_onehot]]).astype('int32'), config.ndigits, config.nway)
                current_encoding = np.concatenate(
                    [current_encoding, np.array([[0, 1]])], axis=1)
                gt_action = self.env.get_gt_action()
                encoding_a[i, :] = current_encoding[0]
                predflag_a[i] = 1
                target_action_a[i, :] = gt_action

                pred_action, h_state = self.sess.run(
                    [self.q, self.hs_out],
                    feed_dict={
                        self.s: current_encoding[None],
                        self.hs: h_state,
                        self.slen: np.ones(1).astype('int32')
                    })
                action = self.get_action(pred_action.reshape(-1), gt_action,
                                         beta_schedule.epsilon)
                past_state = self.env.current_state
                _, done, past_action_onehot = self.env.step(action)
                slen += 1
                if done:
                    break

            batch_data = DatasetTensors(
                np.concatenate([encoding, encoding_a], axis=0)[None],
                np.concatenate([target_action, target_action_a], axis=0)[None],
                np.concatenate([predflag, predflag_a], axis=0)[None], slen)

            # perform a training step
            loss_eval, grad_eval = self.train_step(t, lr_schedule.epsilon,
                                                   batch_data)

            # logging stuff
            if ((t % config.log_freq == 0)
                    and (t % config.learning_freq == 0)):
                self.update_averages(scores_eval)
                beta_schedule.update(t)
                lr_schedule.update(t)
                prog.update(t + 1,
                            exact=[("Loss", loss_eval), ("Grads", grad_eval),
                                   ("lr", lr_schedule.epsilon)])

            if t >= config.nsteps_train:
                break

            if last_eval >= config.eval_freq:
                # evaluate our policy
                last_eval = 0
                print("")
                self.logger.info("Global step: %d" % (t))
                scores_eval += [self.evaluate(cr_schedule, curri_idx)]
                if scores_eval[-1] > 0.8:
                    curri_idx += 1
                    msg = "Upgrade to lesson {:d}".format(int(curri_idx))
                    self.logger.info(msg)
                    self.logger.info(
                        "----------Start Computing Final Score----------")
                    scores_eval += [self.evaluate(cr_schedule)]
                    self.logger.info(
                        "----------Finish Computing Final Score----------")

        # last words
        self.logger.info("- Training done.")
        self.save(t)
        scores_eval += [self.evaluate(cr_schedule)]
        export_plot(scores_eval, "Scores", self.config.plot_output)
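
During the answer phase in Example #12 the executed action comes from self.get_action(pred_action, gt_action, beta_schedule.epsilon), i.e. the model's prediction is mixed with the ground-truth action according to a scheduled beta. The implementation of get_action is not part of these snippets; the following is only one plausible reading of that call, written as a hypothetical NumPy helper.

import numpy as np

def mix_action(pred_action, gt_action, beta):
    # Hypothetical mixing rule: follow the ground-truth (teacher) action with
    # probability beta, otherwise follow the model's own prediction.
    return gt_action if np.random.rand() < beta else pred_action

# Early in training beta is near 1, so mostly teacher actions are executed;
# as beta_schedule decays, the agent increasingly follows its own predictions.
action = mix_action(pred_action=np.array([0, 1, 0]),
                    gt_action=np.array([1, 0, 0]),
                    beta=0.9)
print(action)
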