Example #1
                           num_masks=0,
                           device=info['DEVICE'])

    if args.model_loadpath != '':
        # what about random states - they will be wrong now???
        # TODO - what about target net update cnt
        target_net.load_state_dict(model_dict['target_net_state_dict'])
        policy_net.load_state_dict(model_dict['policy_net_state_dict'])
        opt.load_state_dict(model_dict['optimizer'])
        print("loaded model state_dicts")
        # TODO cant load buffer yet
        if args.buffer_loadpath == '':
            args.buffer_loadpath = args.model_loadpath.replace(
                '.pkl', '_train_buffer.pkl')
            print("auto loading buffer from:%s" % args.buffer_loadpath)
            rbuffer.load(args.buffer_loadpath)
    info['args'] = args
    write_info_file(info, model_base_filepath, total_steps)
    random_state = np.random.RandomState(info["SEED"])

    board_logger = TensorBoardLogger(model_base_filedir)
    last_target_update = 0
    print("Starting training")
    all_rewards = []

    epsilon_by_frame = lambda frame_idx: info['EPSILON_MIN'] + (
        info['EPSILON_MAX'] - info['EPSILON_MIN']) * math.exp(
            -1. * frame_idx / info['EPSILON_DECAY'])
    for epoch_num in range(epoch_start, info['N_EPOCHS']):
        ep_reward, total_steps, etime = run_training_episode(
            epoch_num, total_steps)
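
The epsilon_by_frame lambda above implements an exponentially decaying exploration rate. A minimal standalone sketch of that schedule, assuming illustrative values for the EPSILON_MAX, EPSILON_MIN, and EPSILON_DECAY entries that live in `info` in the original:

import math

EPSILON_MAX, EPSILON_MIN, EPSILON_DECAY = 1.0, 0.01, 30000  # illustrative values, not from the source

def epsilon_by_frame(frame_idx):
    # exploration rate decays exponentially from EPSILON_MAX towards EPSILON_MIN
    return EPSILON_MIN + (EPSILON_MAX - EPSILON_MIN) * math.exp(
        -1. * frame_idx / EPSILON_DECAY)

for frame in (0, 10000, 50000, 200000):
    print(frame, round(epsilon_by_frame(frame), 4))
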
Example #2
class dqnRunner():
    def __init__(self, sess, params, out_dir=None, agentB_sess=None):
        self.params = params
        self.sess = sess
        self.agentB_sess = agentB_sess

        self.lock = threading.Lock()
        self.modelStoreIntv = 150
        self.bufferStoreIntv = 150
        self.annealSteps = params['annealSteps']

        self.state_dim = params['pxRes']
        if self.params['verbose']:
            printT("tensorflow version: {}".format(tf.__version__))

        # create environment
        self.env = Environment(sess, params, self)
        self.numActions = self.env.numActions

        # load classifier for reward calculation
        if self.params['classNN'] is not None:
            with tf.device("/device:CPU:0"):
                self.rewardClassNet = ClassConvNetEval(self.sess, params)
                self.env.rewardClassNet = self.rewardClassNet

        # just gets or resets global_step
        self.global_step = None
        variables = tf.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
        for v in variables:
            if "global_step" in v.name:
                self.global_step = v
        if self.global_step is None:
            self.global_step = tf.Variable(0,
                                           name='global_step',
                                           trainable=False)
        self.resetGlStep = tf.assign(self.global_step, 0)

        # load actual dqn
        self.q = DQN(self.sess, self.params['out_dir'], self.global_step,
                     self.params, self.numActions)

        self.evalMethods = ["agent", "random"]
        self.evalMethod = "agent"
        self.qAgentB = None
        if (self.params['agentB'] is not None) and self.params['interEval']:
            self.qAgentB = DQN(self.agentB_sess,
                               self.params['out_dir'],
                               self.global_step,
                               self.params,
                               self.numActions,
                               agentB=True)
            self.evalMethod = "agentA"
            self.evalMethods = ["agentA", "random", "fixed", "agentB"]
            self.sess.as_default()

        # replay buffer (size and type)
        if self.params['replaySz'] is None:
            self.replayBufferSize = 1000000
        else:
            self.replayBufferSize = self.params['replaySz']
        self.replay = ReplayBuffer(self.replayBufferSize)

        # variables for exploration decay
        self.action_step = tf.Variable(0,
                                       name='action_step',
                                       trainable=False,
                                       dtype=tf.int32)
        self.increment_ac_step_op = tf.assign(self.action_step,
                                              self.action_step + 1)
        self.global_action_step = tf.Variable(0,
                                              name='global_action_step',
                                              trainable=False,
                                              dtype=tf.int32)
        self.increment_gac_step_op = tf.assign(self.global_action_step,
                                               self.global_action_step + 1)
        self.episode_step = tf.Variable(0,
                                        name='episode_step',
                                        trainable=False,
                                        dtype=tf.int32)
        self.increment_ep_step_op = tf.assign(self.episode_step,
                                              self.episode_step + 1)
        self.resetEpStep = tf.assign(self.episode_step, 0)
        self.resetAcStep = tf.assign(self.action_step, 0)
        self.resetGAcStep = tf.assign(self.global_action_step, 0)

        # save state
        self.saver = tf.train.Saver(
            max_to_keep=self.params['keepNewestModels'])

        fn = os.path.join(self.params['out_dir'], "mainLoopTime.txt")
        self.mainLoopTimeFile = open(fn, "a")

        fn_ = os.path.join(self.params['out_dir'], "learnLoopTime.txt")
        self.learnLoopTimeFile = open(fn_, "a")

    # main function, runs the learning process
    def run(self):
        # debugging variables, for tensorboard
        if self.params['evaluation']:
            # evaluation episodes, no exploration
            eval_reward = tf.Variable(0., name="evalReward")
            eval_reward_op = tf.summary.scalar("Eval-Reward", eval_reward)
            eval_disc_reward = tf.Variable(0., name="evalDiscReward")
            eval_disc_reward_op = tf.summary.scalar("Eval-Reward_discounted",
                                                    eval_disc_reward)
            eval_stepCount = tf.Variable(0., name="evalStepCount")
            eval_stepCount_op = tf.summary.scalar("Eval-StepCount",
                                                  eval_stepCount)
            eval_sum_vars = [eval_reward, eval_disc_reward, eval_stepCount]
            eval_sum_op = tf.summary.merge(
                [eval_reward_op, eval_disc_reward_op, eval_stepCount_op])

        # (discounted) reward per episode
        episode_reward = tf.Variable(0., name="episodeReward")
        episode_reward_op = tf.summary.scalar("Reward", episode_reward)
        episode_disc_reward = tf.Variable(0., name="episodeDiscReward")
        episode_disc_reward_op = tf.summary.scalar("Reward_discounted",
                                                   episode_disc_reward)

        # average (max q)
        episode_ave_max_q = tf.Variable(0., name='episodeAvgMaxQ')
        episode_ave_max_q_op = tf.summary.scalar("Qmax_Value",
                                                 episode_ave_max_q)

        # number of steps for episode
        stepCount = tf.Variable(0., name="stepCount")
        stepCount_op = tf.summary.scalar("StepCount", stepCount)

        # number of learning iterations(total number of mini batches so far)
        global_step_op = tf.summary.scalar("GlobalStep", self.global_step)

        # current exploration epsilon
        epsilonVar = tf.Variable(0., name="epsilon")
        epsilonVar_op = tf.summary.scalar("Epsilon", epsilonVar)

        summary_vars = [
            episode_reward, episode_disc_reward, episode_ave_max_q, stepCount,
            epsilonVar
        ]
        summary_ops = tf.summary.merge([
            episode_reward_op, episode_disc_reward_op, episode_ave_max_q_op,
            stepCount_op, epsilonVar_op
        ])
        self.writer = tf.summary.FileWriter(
            os.path.join(self.params['out_dir'], "train"), self.sess.graph)

        self.action_vars = []
        self.action_ops = []
        for a in range(self.numActions):
            action = tf.Variable(0., name="qval_action_" + str(a))
            action_op = tf.summary.scalar("Q-Value_Action_" + str(a), action)
            self.action_vars.append(action)
            self.action_ops.append(action_op)
        self.action_ops = tf.summary.merge(self.action_ops)

        # initialize all tensorflow variables
        # and finalize graph (cannot be modified anymore)
        self.sess.run(tf.global_variables_initializer())
        self.sess.graph.finalize()

        # for debugging, variable values before and after
        if self.params['veryveryverbose']:
            variables = tf.get_collection(ops.GraphKeys.GLOBAL_VARIABLES,
                                          scope="DQN")
            for v in variables:
                if v.name.endswith("conv1_2/weights:0"):
                    print(v.name, self.sess.run(v))

        # do we want to use pretrained weights for the dqn
        # from the classifier or a pretrained agent?
        if self.params['resume']:
            pass
        elif self.params['useClassNN']:
            print("restoring dqn net from classNN: {}".format(
                self.params['classNN']))
            if "ckpt" in self.params['classNN']:
                self.q.saver.restore(self.sess, self.params['classNN'])
            else:
                self.q.saver.restore(
                    self.sess,
                    tf.train.latest_checkpoint(self.params['classNN']))
        elif self.params['dqnNN'] is not None:
            print("restoring dqn net from dqnNN: {}".format(
                self.params['dqnNN']))
            if "ckpt" in self.params['dqnNN']:
                self.q.saver.restore(self.sess, self.params['dqnNN'])
            else:
                self.q.saver.restore(
                    self.sess,
                    tf.train.latest_checkpoint(self.params['dqnNN']))

        # main network weights are set, now run target init op
        self.sess.run(self.q.target_nn_init_op)

        if (self.params['agentB'] is not None) and self.params['interEval']:
            print("restoring agentB net from {}".format(self.params['agentB']))
            if "ckpt" in self.params['agentB']:
                self.qAgentB.saver.restore(self.agentB_sess,
                                           self.params['agentB'])
            else:
                self.qAgentB.saver.restore(
                    self.agentB_sess,
                    tf.train.latest_checkpoint(self.params['agentB']))

        # for debugging, variable values before and after
        if self.params['veryveryverbose']:
            variables = tf.get_collection(ops.GraphKeys.GLOBAL_VARIABLES,
                                          scope="DQN")
            for v in variables:
                if v.name.endswith("conv1_2/weights:0"):
                    print(v.name, self.sess.run(v))

        print("initialize classifier network")
        if self.params['classNN'] is not None:
            print("restoring reward class net from classNN: {}".format(
                self.params['classNN']))
            if "ckpt" in self.params['classNN']:
                self.rewardClassNet.saver.restore(self.sess,
                                                  self.params['classNN'])
            else:
                self.rewardClassNet.saver.restore(
                    self.sess,
                    tf.train.latest_checkpoint(self.params['classNN']))

        # load previously trained model
        if not self.params['resume'] and self.params['loadModel']:
            if "ckpt" in self.params['loadModel']:
                self.saver.restore(self.sess, self.params['loadModel'])
            else:
                self.saver.restore(
                    self.sess,
                    tf.train.latest_checkpoint(self.params['loadModel']))
            printT("Model {} restored.".format(self.params['loadModel']))

        # load previously filled replay buffer
        if not self.params['resume'] and self.params['loadReplay'] is not None:

            self.replay.load(self.params['loadReplay'])

            printT("Buffer {} restored.".format(self.params['loadReplay']))

        # resume old run
        if self.params['resume']:
            self.saver.restore(
                self.sess,
                tf.train.latest_checkpoint(
                    os.path.join(self.params['out_dir'], "models")))
            printT("Model {} restored.".format(
                tf.train.latest_checkpoint(
                    os.path.join(self.params['out_dir'], "models"))))
            # if not self.params['interEval'] :
            self.replay.load(
                os.path.join(self.params['out_dir'], "replayBuffer"))
            printT("Buffer {} restored.".format(self.params['out_dir']))
        else:
            self.sess.run(self.resetGlStep)

        # start immediately for interactive test runs
        try:
            if os.environ['IS_INTERACTIVE'] == 'true' \
               and \
               not self.params['sleep']:
                self.params['startLearning'] = 1
        except KeyError:
            pass

        # exploration variables
        self.startEpsilon = self.params['epsilonStart']
        self.endEpsilon = self.params['epsilonStop']
        self.epsilon = self.sess.run(epsilonVar)

        # evaluation/learning/exploration
        self.evalEp = False
        self.learning = True
        self.pauseLearning = False
        self.pauseExploring = False
        self.stopLearning = False
        self.stopExploring = False

        self.qValFileExpl = open(
            os.path.join(self.params['out_dir'], "qValExpl.txt"), "a")
        self.qValFileEval = open(
            os.path.join(self.params['out_dir'], "qValEval.txt"), "a")

        self.actionLogFile = open(
            os.path.join(self.params['out_dir'], "actionLog.txt"), "a")
        self.episodeLogFile = open(
            os.path.join(self.params['out_dir'], "episodeLog.txt"), "a")
        self.episodeEvalLogFile = open(
            os.path.join(self.params['out_dir'], "episodeEvalLog.txt"), "a")

        # remove stop/termination file
        if os.path.exists("stop"):
            os.remove(os.path.join(params['out_dir'], "stop"))

        # reset step counters and run pure learning mode
        if self.params['onlyLearn']:
            self.sess.run(self.resetEpStep)
            self.sess.run(self.resetAcStep)
            self.learn()
            exit()

        # multi-threaded
        # learning and exploration threads act independently?
        if self.params['async']:
            t = threading.Thread(target=self.learnWrap)
            t.daemon = True
            t.start()

        if self.params['evaluation']:
            # evaluate this often
            evalEpReward = 0
            evalEpDiscReward = 0
            evalEpStepCount = 0
            evalIntv = 25
            evalCnt = 40
            evalOc = 0

        # start exploration
        self.episode = self.sess.run(self.episode_step)
        if self.params['verbose']:
            printT("start Episode: {}".format(self.episode))
        acs = self.sess.run(self.action_step)
        if self.params['verbose']:
            printT("start action step: {}".format(acs))
        self.globActStep = acs
        gacs = self.sess.run(self.global_action_step)
        if self.params['verbose']:
            printT("start global action step: {}".format(gacs))
        self.gac = gacs
        while self.episode < self.params['numEpisodes']:
            self.episode = self.sess.run(self.episode_step)
            self.sess.run(self.increment_ep_step_op)
            if self.params['verbose']:
                print("STARTING NEW EPISODE:" + str(self.episode))
            # do we want to explore/gather samples?
            while self.stopExploring:
                time.sleep(1)
            # evaluation episode (no exploration?)
            if self.params['evaluation'] and self.episode % (
                    evalIntv + evalCnt) < evalCnt:
                self.evalEp = True
                if self.episode % (evalIntv + evalCnt) == 0:
                    if self.params['verbose']:
                        printT("Start Eval Episodes!")
                    evalOc += 1
            elif self.params['onlyLearn'] or \
               (self.params['limitExploring'] is not None \
                and self.replay.size() >= self.params['limitExploring']):
                self.pauseExploring = True
                self.evalEp = False
            else:
                self.evalEp = False

            # reset simulation/episode state
            terminal = False
            ep_reward = 0
            ep_disc_reward = 0
            ep_ave_max_q = 0
            self.inEpStep = 0

            if self.params['interEval']:
                self.evalMethod = self.evalMethods[self.episode %
                                                   (len(self.evalMethods))]

            # reset environment
            # set start state and allowed actions
            nextState, allowedActions, terminal = self.env.reset(
                self.episode, self.evalEp, globActStep=self.globActStep)
            allowedV = self.calcAllowedActionsVector(allowedActions)

            if nextState is None:
                # unable to get state
                # restart with new episode
                continue

            lastTime = time.time()
            # step forward until terminal
            while not terminal:
                if os.path.exists(os.path.join(self.params['out_dir'], "stop")):
                    self.terminate()

                if self.params['async']:
                    if not t.is_alive():
                        printT("alive {}".format(t.is_alive()))
                        printT("Exception in user code:")
                        printT('-' * 60)
                        traceback.print_exc(file=sys.stdout)
                        printT('-' * 60)
                        sys.stdout.flush()
                        t.join(timeout=None)
                        os._exit(-1)

                # state <- nextstate
                state = nextState

                # choose action
                # random or according to dqn (depending on epsilon)
                self.inEpStep += 1
                if not self.evalEp:
                    self.sess.run(self.increment_ac_step_op)
                    self.globActStep += 1
                self.sess.run(self.increment_gac_step_op)
                self.gac += 1
                epsStep = max(
                    0, self.globActStep - (self.params['startLearning'] / 4.0))
                tmp_step = min(epsStep, self.annealSteps)
                self.epsilon = (self.startEpsilon - self.endEpsilon) * \
                               (1 - tmp_step / self.annealSteps) + \
                               self.endEpsilon

                action = self.getActionID(state, allowedV)

                if self.evalMethod == "fixed":
                    action = self.params['fixedAction']

                # We choose a random action in these cases
                rnm = np.random.rand()
                if self.params['veryveryverbose']:
                    printT("rnm:" + str(rnm) + " self.epsilon:" +
                           str(self.epsilon) + " |self.params['randomEps']:" +
                           str(self.params['randomEps']) + " e:" +
                           str(self.episode))
                if (self.evalMethod == "random") or \
                   ((not self.pauseExploring) and (not self.evalEp) and
                    (self.episode < self.params['randomEps']
                     or rnm < self.epsilon)):
                    if self.params['verbose']:
                        printT("randomly selecting action")
                    action = np.random.choice(allowedActions)

                    if self.params['verbose']:
                        printT(
                            "\nEpisode: {}, Step: {}, Time:{}, Next action (e-greedy {}): {}"
                            .format(self.episode, self.globActStep,
                                    time.ctime(), self.epsilon, action))
                else:  # We let the DQN choose the action
                    if self.params['verbose']:
                        printT("Greedyly selecting action:")
                    if self.params['verbose']:
                        printT(
                            "\nEpisode: {}, Step: {}, Time:{}, Next action: {}"
                            .format(self.episode, self.globActStep,
                                    time.ctime(), action))

                # perform selected action and
                # get new state, reward, and termination-info
                nextState, reward, terminal, terminalP, allowedActions = self.env.act(
                    action, self.episode, self.inEpStep, self.globActStep,
                    self.evalEp)
                if self.params['veryveryverbose']:
                    print('ACTIONLOG:',
                          str(self.globActStep), str(self.episode),
                          str(self.inEpStep), action, self.evalEp, terminal,
                          terminalP, reward, self.epsilon, self.evalMethod)
                self.actionLogFile.write(
                    "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(
                        time.time(), str(self.globActStep), str(self.episode),
                        str(self.inEpStep), action, self.evalEp, terminal,
                        terminalP, reward, self.epsilon, self.evalMethod))
                self.actionLogFile.flush()

                allowedV = self.calcAllowedActionsVector(allowedActions)

                # accumulate episode reward
                ep_disc_reward += pow(self.params['gamma'],
                                      self.inEpStep - 1) * reward
                ep_reward += reward

                if (self.evalMethod == "agent"
                    ) and not self.evalEp and not self.pauseExploring:
                    self.insertSamples(np.copy(state), action, reward,
                                       terminal, np.copy(nextState),
                                       np.copy(allowedV))

                # do logging inside of one episode
                # we do not want to lose any data
                if self.params['storeModel'] and \
                  ((self.globActStep+1) % self.modelStoreIntv) == 0:
                    logDqn.logModel(self)
                if self.params['storeBuffer'] and \
                  ((self.globActStep+1) % self.bufferStoreIntv) == 0:
                    logDqn.logBuffer(self)

                # if training/exploration not decoupled, do one learning step
                if not self.params['async']:
                    for i in range(8):
                        self.learn()

                sys.stdout.flush()

                cTime = time.time()
                usedTime = cTime - lastTime

                # do we want to pause exploration thread?
                # (to simulate slower stm)
                if not self.pauseExploring and \
                   not self.evalEp and \
                   self.params['sleep'] and \
                   self.params['async'] and \
                   (self.replay.size() >= self.params['startLearning']) and \
                   (self.replay.size() >= self.params['miniBatchSize']):
                    if self.params['sleepA'] is not None:
                        sleepingTime = self.params['sleepA'] - usedTime
                        if sleepingTime > 0:
                            time.sleep(sleepingTime)
                    else:
                        time.sleep(60)

                cTime = time.time()
                usedTime = cTime - lastTime
                lastTime = cTime
                self.mainLoopTimeFile.write(
                    str(cTime) + " " + str(usedTime) + "\n")
                self.mainLoopTimeFile.flush()

                # terminate episode after x steps
                # even if no good state has been reached
                if self.inEpStep == self.params['stepsTillTerm']:
                    self.env.switchApproachArea()
                    break
            # end episode

            # otherwise store episode summaries and print log
            if self.evalEp:
                evalEpReward += ep_reward
                evalEpDiscReward += ep_disc_reward
                evalEpStepCount += self.inEpStep
                if self.episode % (evalIntv + evalCnt) == (evalCnt - 1):
                    summary_str = self.sess.run(
                        eval_sum_op,
                        feed_dict={
                            eval_sum_vars[0]: evalEpReward / float(evalCnt),
                            eval_sum_vars[1]:
                            evalEpDiscReward / float(evalCnt),
                            eval_sum_vars[2]: evalEpStepCount / float(evalCnt)
                        })
                    self.writer.add_summary(summary_str, evalOc - 1)
                    evalEpReward = 0.0
                    evalEpDiscReward = 0.0
                    evalEpStepCount = 0.0
                if self.params['veryveryverbose']:
                    printT("step count-eval: {}".format(self.inEpStep))
                if self.params['veryverbose']:
                    printT(
                        'Time: {} | Reward: {} | Discounted Reward: {} | Eval-Episode {}'
                        .format(time.ctime(), ep_reward, ep_disc_reward,
                                self.episode))

                self.episodeEvalLogFile.write(
                    "{}\t{}\t{}\t{}\t{}\t{}\n".format(time.time(),
                                                      self.episode, ep_reward,
                                                      ep_disc_reward,
                                                      self.inEpStep,
                                                      self.epsilon))
                self.episodeEvalLogFile.flush()
            else:
                if self.params['evaluation']:
                    et = self.episode - (evalOc * evalCnt)
                else:
                    et = self.episode
                summary_str = self.sess.run(summary_ops,
                                            feed_dict={
                                                summary_vars[0]:
                                                ep_reward,
                                                summary_vars[1]:
                                                ep_disc_reward,
                                                summary_vars[2]:
                                                ep_ave_max_q /
                                                float(max(self.inEpStep, 1)),
                                                summary_vars[3]:
                                                self.inEpStep,
                                                summary_vars[4]:
                                                self.epsilon
                                            })
                self.writer.add_summary(summary_str, et)
                self.writer.flush()
                if self.params['veryveryverbose']:
                    printT("step count: {}".format(self.inEpStep))
                if self.params['veryveryverbose']:
                    printT(
                        'Time: {} | Reward: {} | Discounted Reward: {} | Episode {} | Buffersize: {}'
                        .format(time.ctime(), ep_reward, ep_disc_reward,
                                self.episode, self.replay.size()))

                self.episodeLogFile.write(
                    "{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(
                        time.time(), self.episode, ep_reward, ep_disc_reward,
                        self.inEpStep, self.epsilon, self.evalMethod))
                self.episodeLogFile.flush()

            # log some stuff
            if self.params['storeModel'] and \
               ((self.episode+1) % self.modelStoreIntv) == 0:
                logDqn.logModel(self)
            if self.params['storeBuffer'] and \
               ((self.episode+1) % self.bufferStoreIntv) == 0:
                logDqn.logBuffer(self)
            statsIntv = 100

            sys.stdout.flush()

        # stop learning after last episode
        self.learning = False
        sys.stdout.flush()

    def terminate(self):
        printT("terminating...........")
        sys.stdout.flush()
        self.logStuff()
        sys.stdout.flush()
        printT("EXIT NOW!")
        sys.stdout.flush()
        exit(0)

    def learnWrap(self):
        try:
            self.learn()
        except:
            printT("learn wrap failed")
            printT("Exception in user code:")
            printT('-' * 60)
            traceback.print_exc(file=sys.stdout)
            printT('-' * 60)
            sys.stdout.flush()
            os._exit(-1)

    def learn(self):
        y_batch = np.zeros((self.params['miniBatchSize'], 1))

        tmp = np.zeros((self.params['miniBatchSize'], self.numActions))
        lastTime = time.time()
        count = 0

        while self.learning:
            # Throttling to allow the other thread a chance
            count += 1

            cTime = time.time()
            loopTime = cTime - lastTime
            lastTime = cTime
            self.learnLoopTimeFile.write(
                str(cTime) + " " + str(loopTime) + "\n")
            self.learnLoopTimeFile.flush()

            if self.stopLearning:
                time.sleep(5.0)
                continue

            if self.replay.size() < self.params['startLearning'] or \
               self.replay.size() < self.params['miniBatchSize'] or \
               self.evalEp:
                if self.params['async']:
                    time.sleep(5.0)
                    continue
                else:
                    return

            s_batch, a_batch, r_batch, t_batch, ns_batch, allowed_batch = \
                self.replay.sample_batch(self.params['miniBatchSize'])

            if self.params['doubleDQN']:
                qValsNewState = self.estimate_ddqn(ns_batch,
                                                   allowed_batch,
                                                   p=False,
                                                   mem=tmp)
            else:
                qValsNewState = self.predict_target_nn(ns_batch)

            for i in range(self.params['miniBatchSize']):
                if t_batch[i]:
                    y_batch[i] = r_batch[i]
                else:
                    y_batch[i] = r_batch[i] + \
                        self.params['gamma'] * qValsNewState[i]

            gS, qs, delta = self.update(s_batch, a_batch, y_batch)

            if self.params['noHardResetDQN']:
                self.update_targets()
            elif (gS + 1) % self.params['resetFreq'] == 0:
                self.update_targets()

            if not self.params['async']:
                return

            if self.params['onlyLearn']:
                if (gS + 1) % 1000 == 0:
                    logDqn.logModel(self)

    # Returns vector of length 'self.numActions' containing
    # Zeros for allowed actions
    # '-inf' for forbidden actions
    def calcAllowedActionsVector(self, allowedActions):
        allowedV = np.zeros(shape=(self.numActions))
        allowedV[:] = float("-inf")  # init all actions as forbidden
        for i in allowedActions:
            allowedV[i] = 0  # mark actions as allowed
        return allowedV

    # get action id for max q
    def getActionID(self, state, allowedActionsV):
        if self.params['interEval'] and self.evalMethod == 'agentB':
            if self.params['verbose']:
                print("PREDICTING WITH AGENTB:")
            qs = self.qAgentB.run_predict(state)
            print(qs)
        else:
            if self.params['verbose']:
                print("PREDICTING WITH AGENT:")
            qs = self.q.run_predict(state)
        if self.evalEp:
            self.qValFileEval.write("{}\t{}\t{}\t{}\t{}\t{}\n".format(
                time.time(), str(self.globActStep), str(self.episode),
                str(self.inEpStep), qs[0], allowedActionsV))
            self.qValFileEval.flush()
        else:
            self.qValFileExpl.write("{}\t{}\t{}\t{}\t{}\t{}\n".format(
                time.time(), str(self.globActStep), str(self.episode),
                str(self.inEpStep), qs[0], allowedActionsV))
            self.qValFileExpl.flush()

        var_dict = {}
        for a in range(self.numActions):
            var_dict[self.action_vars[a]] = qs[0][a]
        summary_str = self.sess.run(self.action_ops, feed_dict=var_dict)
        self.writer.add_summary(summary_str, self.gac)
        self.writer.flush()
        printT("Q-values:" + str(qs))
        qs = qs + allowedActionsV
        return np.argmax(qs, axis=1)[0]

    # update dqn main network
    def update(self, states, actionIDs, targets):
        step, out, delta, loss = self.q.run_train(states, actionIDs, targets)
        # network diverged?
        if np.isnan(loss):
            printT("ABORT: NaN")
            sys.stdout.flush()
            os._exit(-1)
        return step, out, delta

    # update dqn target network
    def update_targets(self):
        self.q.run_update_target_nn()

    # estimate q values using double dqn
    # get values of target network for actions where main network is max
    def estimate_ddqn(self, states, allowedActionsV, p=False, mem=None):
        qs = self.q.run_predict(states)
        if p:
            if self.params['veryveryverbose']:
                print("allowedActionsV.shape" + str(allowedActionsV.shape))
                print("qs.shape" + str(qs.shape))
        qs += allowedActionsV  # add '-inf' to the q values of forbidden actions
        if p:
            if self.params['veryveryverbose']:
                print(states)
                print(qs.shape)
                print(states.shape)
                printT("qs: {}".format(qs))
        maxA = np.argmax(qs, axis=1)

        qs = self.q.run_predict_target(states)
        mem.fill(0)
        mem[np.arange(maxA.size), maxA] = 1
        mem = mem * qs
        mem = np.sum(mem, axis=1)
        return mem

    # predict dqns
    def predict_target_nn(self, states):
        qs = self.q.run_predict_target(states)
        return np.max(qs, axis=1)

    def predict_nn(self, states):
        qs = self.q.run_predict(states)
        return np.max(qs, axis=1)

    # insert samples into replay buffer
    def insertSamples(self, stateScaled, action, reward, terminal,
                      newStateScaled, allowedActionsV):

        stateScaled.shape = (stateScaled.shape[1], stateScaled.shape[2],
                             stateScaled.shape[3])
        newStateScaled.shape = (newStateScaled.shape[1],
                                newStateScaled.shape[2],
                                newStateScaled.shape[3])

        states = (stateScaled, np.rot90(stateScaled, 2),
                  np.fliplr(stateScaled), np.flipud(stateScaled))
        newStates = (newStateScaled, np.rot90(newStateScaled, 2),
                     np.fliplr(newStateScaled), np.flipud(newStateScaled))

        if self.params['fullAugmentation']:
            self.lock.acquire()
            for i in range(4):
                for j in range(4):
                    self.replay.add(states[i], action, reward, terminal,
                                    allowedActionsV, newStates[j])
            self.lock.release()
        else:
            self.lock.acquire()
            self.replay.add(stateScaled, action, reward, terminal,
                            allowedActionsV, newStateScaled)
            self.replay.add(np.ascontiguousarray(np.rot90(stateScaled, 2)),
                            action, reward, terminal, allowedActionsV,
                            np.ascontiguousarray(np.rot90(newStateScaled, 2)))
            self.replay.add(np.ascontiguousarray(np.fliplr(stateScaled)),
                            action, reward, terminal, allowedActionsV,
                            np.ascontiguousarray(np.fliplr(newStateScaled)))
            self.replay.add(np.ascontiguousarray(np.flipud(stateScaled)),
                            action, reward, terminal, allowedActionsV,
                            np.ascontiguousarray(np.flipud(newStateScaled)))
            self.lock.release()

        # if we want to stop if buffer is full
        # or limit exploration
        if not self.pauseExploring and \
           self.replay.size() == self.replayBufferSize:
            if self.params['termAtFull']:
                printT("Buffer FULL!")
                self.logStuff()
                self.pauseExploring = True
                # exit()
        elif not self.pauseExploring and \
             self.params['limitExploring'] is not None and \
             self.replay.size() >= self.params['limitExploring']:
            if self.params['termAtFull']:
                printT("Buffer FULL!")
                self.logStuff()
                self.pauseExploring = True

    def logStuff(self):
        logDqn.logModel(self)
        logDqn.logBuffer(self)
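
The estimate_ddqn and calcAllowedActionsVector methods above combine forbidden-action masking with the double-DQN rule: the main network selects the greedy allowed action, the target network evaluates it. A minimal NumPy sketch of that computation on a toy batch (the random arrays stand in for the two networks' outputs; this is not the class's actual API):

import numpy as np

rng = np.random.RandomState(0)
batch, num_actions = 4, 3
q_main = rng.rand(batch, num_actions)    # main-network Q-values for next states
q_target = rng.rand(batch, num_actions)  # target-network Q-values for next states

allowed = np.zeros((batch, num_actions))
allowed[:, 2] = float("-inf")            # forbid action 2 in every sample

# main network picks the greedy allowed action ...
max_a = np.argmax(q_main + allowed, axis=1)
# ... the target network evaluates that action (double DQN)
q_next = q_target[np.arange(batch), max_a]

gamma = 0.99
rewards = rng.rand(batch)
terminal = np.array([0., 0., 1., 0.])
y = rewards + (1. - terminal) * gamma * q_next
print(y)
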
Example #3
def main():
    args = parser.parse_args()

    # Seq sequence length & visualization_num
    args.seq_len = args.target_num if args.seq_len is None else args.seq_len
    args.visualization_dir = os.path.join('exp', args.exp, 'visualization')
    utils.mkdir(args.visualization_dir)

    # Set exp directory and tensorboard writer
    writer_dir = os.path.join('exp', args.exp)
    utils.mkdir(writer_dir)
    writer = SummaryWriter(writer_dir)

    # Save arguments
    str_list = []
    for key in vars(args):
        print('[{0}] = {1}'.format(key, getattr(args, key)))
        str_list.append('--{0}={1} \\'.format(key, getattr(args, key)))
    with open(os.path.join('exp', args.exp, 'args.txt'), 'w+') as f:
        f.write('\n'.join(str_list))

    # Set directory. e.g. replay buffer, visualization, model snapshot
    args.replay_buffer_dir = os.path.join('exp', args.exp, 'replay_buffer')
    args.model_dir = os.path.join('exp', args.exp, 'models')
    utils.mkdir(args.model_dir)

    # Reset random seeds
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    # Set device
    device = torch.device('cpu') if args.gpu == '-1' else torch.device(
        f'cuda:{args.gpu}')

    # Set replay buffer
    replay_buffer = ReplayBuffer(args.replay_buffer_dir,
                                 args.replay_buffer_size)
    if args.load_replay_buffer is not None:
        print(f'==> Loading replay buffer from {args.load_replay_buffer}')
        replay_buffer.load(
            os.path.join('exp', args.load_replay_buffer, 'replay_buffer'))
        print(
            f'==> Loaded replay buffer from {args.load_replay_buffer} [size = {replay_buffer.length}]'
        )

    # Set model and optimizer
    if args.model_type == 'adagrasp':
        model = GraspingModel(num_rotations=args.num_rotations,
                              gripper_final_state=True)
    elif args.model_type == 'adagrasp_init_only':
        model = GraspingModel(num_rotations=args.num_rotations,
                              gripper_final_state=False)
    elif args.model_type == 'scene_only':
        model = GraspingModelSceneOnly(num_rotations=args.num_rotations,
                                       gripper_final_state=True)
    else:
        raise NotImplementedError(f'Does not support {args.model_type}')
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=args.learning_rate,
                                 betas=(0.9, 0.95))
    model = model.to(device)

    #check cuda memory allocation
    if args.gpu != '-1':
        bytes_allocated = torch.cuda.memory_allocated(device)
        print("Model size: {:.3f} MB".format(bytes_allocated / (1024**2)))

    # load checkpoint
    if args.load_checkpoint is not None:
        print(f'==> Loading checkpoint from {args.load_checkpoint}')
        if args.load_checkpoint.endswith('.pth'):
            checkpoint = torch.load(args.load_checkpoint, map_location=device)
        else:
            checkpoint = torch.load(os.path.join('exp', args.load_checkpoint,
                                                 'models', 'latest.pth'),
                                    map_location=device)
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        start_epoch = (checkpoint['epoch']
                       if args.load_replay_buffer is not None else 0)
        print(f'==> Loaded checkpoint from {args.load_checkpoint}')
    else:
        start_epoch = 0

    # launch processes for each env
    args.num_envs = 1 if args.gui else args.num_envs
    processes, conns = [], []
    ctx = mp.get_context('spawn')
    for rank in range(args.num_envs):
        conn_main, conn_env = ctx.Pipe()
        reset_args = {
            'num_open_scale': args.num_open_scale,
            'max_open_scale': args.max_open_scale,
            'min_open_scale': args.min_open_scale,
            'gripper_final_state': args.model_type == 'adagrasp',
            'target_num': args.target_num,
            'obstacle_num': args.obstacle_num
        }
        p = ctx.Process(target=env_process,
                        args=(rank, start_epoch + args.seed + rank, conn_env,
                              args.gui, args.num_cam, args.seq_len,
                              reset_args))
        p.daemon = True
        p.start()
        processes.append(p)
        conns.append(conn_main)

    # Initialize exit signal handler (for graceful exits)
    def save_and_exit(signal, frame):
        print('Warning: keyboard interrupt! Cleaning up...')
        for p in processes:
            p.terminate()
        replay_buffer.dump()
        writer.close()
        print('Finished. Now exiting gracefully.')
        sys.exit(0)

    signal.signal(signal.SIGINT, save_and_exit)

    for epoch in range(start_epoch, args.epoch):
        print(f'---------- epoch-{epoch + 1} ----------')
        timestamp = time.time()

        assert args.min_epsilon <= args.max_epsilon
        m1, m2 = args.min_epsilon, args.max_epsilon
        epsilon = max(m1, m2 - (m2 - m1) * epoch / args.exploration_epoch)
        # Data collection
        data = collect_data(
            conns,
            model,
            device,
            n_steps=1,
            epsilon=epsilon,
            gripper_final_state=(args.model_type == 'adagrasp'))

        for d in data.values():
            replay_buffer.save_data(d)

        average_reward = np.mean([d['reward'] for d in data.values()])
        average_score = np.mean([d['score'] for d in data.values()])
        print(
            f'Mean reward = {average_reward:.3f}, Mean score = {average_score:.3f}'
        )
        writer.add_scalar('Data Collection/Reward', average_reward, epoch + 1)
        writer.add_scalar('Data Collection/Score', average_score, epoch + 1)

        time_data_collection = time.time() - timestamp

        # Replay buffer statistic
        reward_data = np.array(replay_buffer.scalar_data['reward'])
        print(
            f'Replay buffer size = {len(reward_data)} (positive = {len(np.argwhere(reward_data == 1))}, negative = {len(np.argwhere(reward_data == 0))})'
        )

        # Policy training
        model.train()
        torch.set_grad_enabled(True)
        sum_loss = 0
        score_statics = {'positive': list(), 'negative': list()}
        for _ in range(args.iter_per_epoch):
            iter_loss, iter_score_statics = train(
                model,
                device,
                replay_buffer,
                optimizer,
                args.batch_size,
                gripper_final_state=(args.model_type == 'adagrasp'))
            sum_loss += iter_loss
            score_statics['positive'].append(iter_score_statics[1])
            score_statics['negative'].append(iter_score_statics[0])
        average_loss = sum_loss / args.iter_per_epoch
        positive_score_prediction = np.mean(score_statics['positive'])
        negative_score_prediction = np.mean(score_statics['negative'])
        print(
            f'Training loss = {average_loss:.5f}, positive_mean = {positive_score_prediction:.3f}, negative_mean = {negative_score_prediction:.3f}'
        )
        writer.add_scalar('Policy Training/Loss', average_loss, epoch + 1)
        writer.add_scalar('Policy Training/Positive Score Prediction',
                          positive_score_prediction, epoch + 1)
        writer.add_scalar('Policy Training/Negative Score Prediction',
                          negative_score_prediction, epoch + 1)

        # Save model and optimizer
        if (epoch + 1) % args.snapshot_gap == 0:
            model.eval()
            torch.set_grad_enabled(False)

            # Visualization
            for conn in conns:
                conn.send("reset")
            data = collect_data(
                conns,
                model,
                device,
                n_steps=args.seq_len,
                epsilon=0,
                gripper_final_state=(args.model_type == 'adagrasp'))

            vis_path = os.path.join(args.visualization_dir,
                                    'epoch_%06d' % (epoch + 1))
            utils.visualization(data, args.num_envs, args.seq_len,
                                args.num_open_scale, args.num_rotations,
                                args.num_vis, vis_path)

            save_state = {
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'epoch': epoch + 1
            }
            torch.save(save_state, os.path.join(args.model_dir, 'latest.pth'))
            shutil.copyfile(
                os.path.join(args.model_dir, 'latest.pth'),
                os.path.join(args.model_dir, 'epoch_%06d.pth' % (epoch + 1)))

            # Save replay buffer
            replay_buffer.dump()

        # Print elapsed time for an epoch
        time_all = time.time() - timestamp
        time_training = time_all - time_data_collection
        print(
            f'Elapsed time = {time_all:.2f}: (collect) {time_data_collection:.2f} + (train) {time_training:.2f}'
        )

    save_and_exit(None, None)
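
The per-env worker processes above communicate with the main loop through multiprocessing pipes in a 'spawn' context. A stripped-down sketch of that pattern (the trivial worker below is only a stand-in for the real env_process, not its actual signature):

import multiprocessing as mp

def worker(rank, conn):
    # stand-in for env_process: wait for commands, reply with results
    while True:
        cmd = conn.recv()
        if cmd == 'reset':
            conn.send({'rank': rank, 'obs': 0})
        elif cmd == 'close':
            break

if __name__ == '__main__':
    ctx = mp.get_context('spawn')
    conns, procs = [], []
    for rank in range(2):
        conn_main, conn_env = ctx.Pipe()
        p = ctx.Process(target=worker, args=(rank, conn_env))
        p.daemon = True
        p.start()
        procs.append(p)
        conns.append(conn_main)
    for conn in conns:
        conn.send('reset')
    print([conn.recv() for conn in conns])
    for conn in conns:
        conn.send('close')
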
Example #4
def trainer(env,
            outdir,
            epochs=100,
            MINIBATCH_SIZE=64,
            GAMMA=0.99,
            epsilon=0.01,
            min_epsilon=0.01,
            BUFFER_SIZE=10000,
            train_indicator=False,
            render=False):
    tf.reset_default_graph()
    with tf.Session(config=config) as sess:

        # configuring environment
        #env = gym.make(ENV_NAME)
        # configuring the random processes
        np.random.seed(RANDOM_SEED)
        tf.set_random_seed(RANDOM_SEED)
        env.seed(RANDOM_SEED)
        # info of the environment to pass to the agent
        state_dim = env.observation_space
        action_dim = env.action_space
        # I choose this value since the MountainCar continuous env
        # does not have an action boundary
        action_bound = np.float64(1)
        # Creating agent

        # FOR the RNN
        #tf.contrib.rnn.core_rnn_cell.BasicLSTMCell from https://github.com/tensorflow/tensorflow/issues/8771
        #cell = tf.contrib.rnn.BasicLSTMCell(num_units=300,state_is_tuple=True, reuse = None)
        #cell_target = tf.contrib.rnn.BasicLSTMCell(num_units=300,state_is_tuple=True, reuse = None)
        ruido = OUNoise(action_dim,
                        mu=0.4)  # this is the Ornstein-Uhlenbeck Noise
        actor = ActorNetwork(sess, state_dim, action_dim, action_bound,
                             ACTOR_LEARNING_RATE, TAU, outdir)
        critic = CriticNetwork(sess, state_dim, action_dim,
                               CRITIC_LEARNING_RATE, TAU,
                               actor.get_num_trainable_vars(), outdir)

        #sess.run(tf.global_variables_initializer())

        # Initialize target network weights
        actor.update_target_network()
        critic.update_target_network()
        # Initialize replay memory
        replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)
        replay_buffer.load()

        #goal = 0
        max_state = -1.
        try:
            critic.recover_critic()
            actor.recover_actor()
            print('********************************')
            print('models restored successfully')
            print('********************************')
        except Exception as e:
            print('********************************')
            print(e)
            print('********************************')
        #critic.recover_critic()
        #actor.recover_actor()

        for i in range(epochs):
            state = env.reset()
            #state = np.hstack(state)
            ep_reward = 0
            ep_ave_max_q = 0
            done = False
            step = 0
            max_state_episode = -1
            epsilon -= epsilon / EXPLORE
            if epsilon < min_epsilon:
                epsilon = min_epsilon
            while (not done):

                if render:
                    env.render()

                #print('step', step)
                # 1. get action with actor, and add noise

                np.set_printoptions(precision=4)
                # remove comment if you want to see a step by step update
                #print(step,'a',action_original, action,'s', state[0], 'max state', max_state_episode)

                # 2. take action, see next state and reward :
                action_original = actor.predict(
                    np.reshape(state, (1, actor.s_dim)))
                # optional exploration noise (disabled):
                #   + (10. / (10. + i)) * np.random.randn(1)
                #   + max(epsilon, 0) * ruido.noise()
                action = action_original
                '''
                for j in range(action.shape[1]):
                    if abs(action[0,j]) > 1:
                        act=action[0,j]
                        action[0,j]=act/abs(act)
                    else:
                        continue
                '''
                action = np.reshape(action, (actor.a_dim, ))
                next_state, reward, done, info = env.step(action)
                if train_indicator:
                    # 3. Save in replay buffer:
                    replay_buffer.add(np.reshape(state, (actor.s_dim, )),
                                      np.reshape(action, (actor.a_dim, )),
                                      reward, done,
                                      np.reshape(next_state, (actor.s_dim, )))

                    # Keep adding experience to the memory until
                    # there are at least minibatch size samples
                    if replay_buffer.size() > MINIBATCH_SIZE:

                        # 4. sample random minibatch of transitions:
                        s_batch, a_batch, r_batch, t_batch, s2_batch = replay_buffer.sample_batch(
                            MINIBATCH_SIZE)

                        # Calculate targets

                        # 5. Train critic Network (states,actions, R + gamma* V(s', a')):
                        # 5.1 Get critic prediction = V(s', a')
                        # the a' is obtained using the actor prediction! or in other words : a' = actor(s')
                        target_q = critic.predict_target(
                            s2_batch, actor.predict_target(s2_batch), 20)

                        # 5.2 get y_t where:
                        y_i = []
                        for k in range(MINIBATCH_SIZE):
                            if t_batch[k]:
                                y_i.append(r_batch[k])
                            else:
                                y_i.append(r_batch[k] + GAMMA * target_q[k])

                        # 5.3 Train Critic!
                        predicted_q_value, _ = critic.train(
                            s_batch, a_batch,
                            np.reshape(y_i, (MINIBATCH_SIZE, 1)), 20)

                        ep_ave_max_q += np.amax(predicted_q_value)

                        # 6 Compute Critic gradient (depends on states and actions)
                        # 6.1 therefore I first need to calculate the actions the current actor would take.
                        a_outs = actor.predict(s_batch)
                        # 6.2 I calculate the gradients
                        grads = critic.action_gradients(s_batch, a_outs, 20)
                        c = np.array(grads)
                        #print(c.shape)
                        #print('...')
                        #print('...',c[0].shape)
                        #print('...')
                        actor.train(s_batch, grads[0])

                        # Update target networks
                        actor.update_target_network()
                        critic.update_target_network()
                state = next_state
                if next_state[0] > max_state_episode:
                    max_state_episode = next_state[0]

                ep_reward = ep_reward + reward
                step += 1

            if max_state_episode > max_state:
                max_state = max_state_episode

            print('th', i + 1, 'Step', step, 'Reward:', ep_reward, 'Pos',
                  next_state[0], next_state[1], 'epsilon', epsilon)
            print('*************************')
            print('now we save the model')
            critic.save_critic()
            actor.save_actor()
            print('model saved successfully')
            print('*************************')
            replay_buffer.save()
            #proc = Popen(['rosclean','purge'],stdout=PIPE, stdin=PIPE, stderr=PIPE,universal_newlines=True)
            #out,err = proc.communicate(input="{}\n".format("y"))
            #print('maxmimum state reach', max_state)
            #print('the reward at the end of the episode,', reward)
            #print('Efficiency', 100.*((goal)/(i+1.)))
        '''
        print('*************************')
        print('now we save the model')
        critic.save_critic()
        actor.save_actor()
        print('model saved succesfuly')
        print('*************************')
        replay_buffer.save()
        #env.close()
        '''
        sess.close()
    return 0
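
The exploration noise commented out above comes from an OUNoise object, i.e. an Ornstein-Uhlenbeck process, which produces temporally correlated noise that suits continuous control. A generic sketch of such a process (this is not the repository's OUNoise class, just the standard update rule with assumed theta/sigma values):

import numpy as np

class OrnsteinUhlenbeckNoise:
    def __init__(self, action_dim, mu=0.4, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(action_dim)
        self.theta, self.sigma = theta, sigma
        self.x = self.mu.copy()

    def noise(self):
        # mean-reverting random walk: dx = theta*(mu - x) + sigma*N(0, 1)
        dx = self.theta * (self.mu - self.x) + \
            self.sigma * np.random.randn(*self.x.shape)
        self.x = self.x + dx
        return self.x

ou = OrnsteinUhlenbeckNoise(action_dim=1)
for _ in range(3):
    print(ou.noise())
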
Example #5
def trainer(epochs=1000,
            MINIBATCH_SIZE=32,
            GAMMA=0.99,
            save=1,
            save_image=1,
            epsilon=1.0,
            min_epsilon=0.05,
            BUFFER_SIZE=15000,
            train_indicator=True,
            render=True):
    with tf.Session() as sess:

        # configuring the random processes
        np.random.seed(RANDOM_SEED)
        tf.set_random_seed(RANDOM_SEED)

        # environment

        env = gym.make('CartPole-v1')
        print('action ', env.action_space)
        print('obs ', env.observation_space)
        observation_space = 4
        action_space = 2
        '''
        env = gym.make('FrozenLake8x8-v0') 
        print('action ', env.action_space)
        print('obs ', env.observation_space)
        observation_space = 64
        action_space = 4
        '''
        # agent
        agent = Network(sess,
                        observation_space,
                        action_space,
                        LEARNING_RATE,
                        DEVICE,
                        layer_norm=False)

        # worker_summary = tf.Summary()
        writer = tf.summary.FileWriter('./train', sess.graph)

        # TENSORFLOW init session
        sess.run(tf.global_variables_initializer())

        # Initialize target network weights
        agent.update_target_network()
        # Initialize replay memory
        replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)
        replay_buffer.load()
        print('buffer size is now', replay_buffer.count)
        # this is for loading the net

        if save:
            try:
                agent.recover()
                print('********************************')
                print('models restored successfully')
                print('********************************')
            except:
                print('********************************')
                print('Failed to restore models')
                print('********************************')
        loss = 0.
        j = 0
        for i in range(epochs):

            if (i % 500 == 0) and (i != 0):
                print('*************************')
                print('now we save the model')
                agent.save()
                #replay_buffer.save()
                print('model saved successfully')
                print('*************************')

            if i % 200 == 0:
                agent.update_target_network()
                print('update_target_network')

            state = env.reset()
            # state = to_one_hot(state, observation_space)
            # print('state', state)
            q0 = np.zeros(action_space)
            ep_reward = 0.
            done = False
            step = 0
            loss_vector = deque()
            lr = 0.
            while not done:
                j = j + 1
                epsilon -= 0.0000051
                epsilon = np.maximum(min_epsilon, epsilon)

                # Get action with e greedy

                if np.random.random_sample() < epsilon:
                    #Explore!
                    action = np.random.randint(0, action_space)
                else:
                    # Just stick to what you know bro
                    q0 = agent.predict(
                        np.reshape(state, (1, observation_space)))
                    action = np.argmax(q0)

                next_state, reward, done, info = env.step(action)
                # next_state = to_one_hot(next_state, observation_space)

                # I made a change to the reward
                reward = np.cos(2 * next_state[3])

                if train_indicator:

                    # Keep adding experience to the memory until
                    # there are at least minibatch size samples
                    if replay_buffer.size() > MINIBATCH_SIZE:
                        # 4. sample random minibatch of transitions:
                        s_batch, a_batch, r_batch, t_batch, s2_batch = replay_buffer.sample_batch(
                            MINIBATCH_SIZE)
                        q_eval = agent.predict_target(
                            np.reshape(s2_batch,
                                       (MINIBATCH_SIZE, observation_space)))
                        q_target = np.zeros(MINIBATCH_SIZE)
                        # q_target = q_eval.copy()
                        for k in range(MINIBATCH_SIZE):
                            if t_batch[k]:
                                q_target[k] = r_batch[k]
                            else:
                                q_target[k] = r_batch[k] + GAMMA * np.max(
                                    q_eval[k])

                        #5.3 Train agent!
                        summary, loss, _ = agent.train(
                            np.reshape(a_batch, (MINIBATCH_SIZE, 1)),
                            np.reshape(q_target, (MINIBATCH_SIZE, 1)),
                            np.reshape(s_batch,
                                       (MINIBATCH_SIZE, observation_space)))
                        loss_vector.append(loss)
                        writer.add_summary(summary, j)
                        # this function is there so you can see the gradients and the updates for debugging
                        #actiones, action_one_hot, out, target_q_t, q_acted_0, q_acted, delta, loss, _ = agent.train_v2(np.reshape(a_batch,(MINIBATCH_SIZE,1)),np.reshape(q_target,(MINIBATCH_SIZE, 1)), np.reshape(s_batch,(MINIBATCH_SIZE,observation_space)) )
                        #print('action',actiones, 'action one hot', action_one_hot, 'out', out,'q acted 0', q_acted_0,  'q acted', q_acted, 'target', target_q_t, 'loss',loss, 'delta', delta)
                # 3. Save in replay buffer:
                replay_buffer.add(state, action, reward, done, next_state)

                # prepare for next state
                state = next_state
                ep_reward = ep_reward + reward
                step += 1

            print('th', i + 1, 'Step', step, 'Reward:', round(ep_reward, 0),
                  'epsilon', round(epsilon, 3), 'loss',
                  round(np.mean(loss_vector), 3), lr)

        print('*************************')
        print('now we save the model')
        agent.save()
        #replay_buffer.save()
        print('model saved successfully')
        print('*************************')
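
The per-sample loop above that fills q_target from the target network's predictions can also be written in vectorized form under the same Bellman-target rule. A minimal NumPy sketch, with toy arrays standing in for the sampled minibatch:

import numpy as np

GAMMA = 0.99
r_batch = np.array([1.0, 1.0, 0.5, 1.0])         # sampled rewards
t_batch = np.array([False, False, True, False])  # terminal flags
q_eval = np.random.rand(4, 2)                    # target-net Q(s', .)

# q_target = r                           if s' is terminal
# q_target = r + GAMMA * max_a Q(s', a)  otherwise
q_target = r_batch + (~t_batch) * GAMMA * np.max(q_eval, axis=1)
print(q_target)
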