Example #1
def test_training_flag():
    obs_size = (3, 4)

    obs0 = np.random.random(obs_size)
    terminal0 = False

    obs1 = np.random.random(obs_size)
    terminal1 = True

    obs2 = np.random.random(obs_size)
    terminal2 = False

    for training in (True, False):
        memory = SequentialMemory(3, window_length=2)

        state = memory.get_recent_state(obs0)
        assert state.shape == (2,) + obs_size
        assert np.allclose(state[0], 0.)
        assert np.all(state[1] == obs0)
        assert memory.nb_entries == 0
        
        memory.append(obs0, 0, 0., terminal1, training=training)
        state = memory.get_recent_state(obs1)
        assert state.shape == (2,) + obs_size
        assert np.all(state[0] == obs0)
        assert np.all(state[1] == obs1)
        if training:
            assert memory.nb_entries == 1
        else:
            assert memory.nb_entries == 0

        memory.append(obs1, 0, 0., terminal2, training=training)
        state = memory.get_recent_state(obs2)
        assert state.shape == (2,) + obs_size
        assert np.allclose(state[0], 0.)
        assert np.all(state[1] == obs2)
        if training:
            assert memory.nb_entries == 2
        else:
            assert memory.nb_entries == 0
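Note: the `training` flag above controls whether `append` actually stores an entry or only feeds the short-term window used by `get_recent_state`. A minimal sketch of the same behaviour outside a test, assuming the `SequentialMemory` in these examples is keras-rl's `rl.memory.SequentialMemory`:

# Sketch only: the effect of the `training` flag (assumes keras-rl's SequentialMemory).
import numpy as np
from rl.memory import SequentialMemory

memory = SequentialMemory(limit=1000, window_length=2)
obs = np.random.random((3, 4))

# Evaluation rollout: the observation is available to get_recent_state(),
# but nothing is added to the replay buffer.
memory.append(obs, action=0, reward=0., terminal=False, training=False)
assert memory.nb_entries == 0

# Training rollout (the default): the entry is stored and becomes sampleable.
memory.append(obs, action=0, reward=0., terminal=False, training=True)
assert memory.nb_entries == 1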
Example #2
               enable_dueling_network=True,
               dueling_type='avg',
               target_model_update=1e-2,
               policy=policy)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])

# load memory if possible
if args.load_memory == "agent":
    if not os.path.isfile(MEMORY_FILE):
        env.close()
        del env
        exit(0)
    with open(MEMORY_FILE, 'rb') as handle:
        memory_list = pickle.load(handle)
        for obs, a, r, done, training in memory_list:
            memory.append(obs, a, r, done, training)

# Okay, now it's time to learn something! Visualization is disabled here because rendering
# slows down training quite a lot. You can always safely abort the training prematurely using
# Ctrl + C.
dqn.fit(env, nb_steps=args.fit_step, visualize=False, verbose=2)

# After training is done, we save the final weights.
dqn.save_weights(args.out_dir +
                 '/duel_dqn_{}_weights.h5f'.format(CONFIG_FILE[7:-5]),
                 overwrite=True)

env.close()
del env

df = pd.DataFrame()
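Note: this script assumes MEMORY_FILE holds a pickled list of (obs, action, reward, done, training) tuples. The saving side is not part of the example; a hypothetical sketch of how such a file could be produced from a rollout (env and MEMORY_FILE as above):

# Hypothetical sketch of the saving side assumed above: dump tuples that
# memory.append() can later replay.
import pickle

memory_list = []
obs = env.reset()
done = False
while not done:
    action = env.action_space.sample()  # any behaviour policy
    next_obs, reward, done, info = env.step(action)
    memory_list.append((obs, action, reward, done, True))
    obs = next_obs

with open(MEMORY_FILE, 'wb') as handle:
    pickle.dump(memory_list, handle)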
Example #3
def test_get_recent_state_with_episode_boundaries():
    memory = SequentialMemory(3, window_length=2, ignore_episode_boundaries=False)
    obs_size = (3, 4)
    
    obs0 = np.random.random(obs_size)
    terminal0 = False

    obs1 = np.random.random(obs_size)
    terminal1 = False

    obs2 = np.random.random(obs_size)
    terminal2 = False

    obs3 = np.random.random(obs_size)
    terminal3 = True

    obs4 = np.random.random(obs_size)
    terminal4 = False

    obs5 = np.random.random(obs_size)
    terminal5 = True

    obs6 = np.random.random(obs_size)
    terminal6 = False

    state = memory.get_recent_state(obs0)
    assert state.shape == (2,) + obs_size
    assert np.allclose(state[0], 0.)
    assert np.all(state[1] == obs0)

    # memory.append takes the current observation, the action, the reward received after taking
    # that action, and whether the *new* observation is terminal; thus pairing `obs0` with
    # `terminal1` is correct.
    memory.append(obs0, 0, 0., terminal1)
    state = memory.get_recent_state(obs1)
    assert state.shape == (2,) + obs_size
    assert np.all(state[0] == obs0)
    assert np.all(state[1] == obs1)

    memory.append(obs1, 0, 0., terminal2)
    state = memory.get_recent_state(obs2)
    assert state.shape == (2,) + obs_size
    assert np.all(state[0] == obs1)
    assert np.all(state[1] == obs2)

    memory.append(obs2, 0, 0., terminal3)
    state = memory.get_recent_state(obs3)
    assert state.shape == (2,) + obs_size
    assert np.all(state[0] == obs2)
    assert np.all(state[1] == obs3)

    memory.append(obs3, 0, 0., terminal4)
    state = memory.get_recent_state(obs4)
    assert state.shape == (2,) + obs_size
    assert np.all(state[0] == np.zeros(obs_size))
    assert np.all(state[1] == obs4)

    memory.append(obs4, 0, 0., terminal5)
    state = memory.get_recent_state(obs5)
    assert state.shape == (2,) + obs_size
    assert np.all(state[0] == obs4)
    assert np.all(state[1] == obs5)

    memory.append(obs5, 0, 0., terminal6)
    state = memory.get_recent_state(obs6)
    assert state.shape == (2,) + obs_size
    assert np.all(state[0] == np.zeros(obs_size))
    assert np.all(state[1] == obs6)
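As the comment in this test points out, each call to `append` pairs the *current* observation with the terminal flag of the *next* one. A minimal environment-loop sketch of that convention, assuming keras-rl's SequentialMemory and a gym-style `env`:

# Sketch only: the append convention verified above (assumes a gym-style env).
from rl.memory import SequentialMemory

memory = SequentialMemory(limit=1000, window_length=2,
                          ignore_episode_boundaries=False)

obs = env.reset()
done = False
while not done:
    action = env.action_space.sample()
    next_obs, reward, done, info = env.step(action)
    # `obs` is stored together with the terminal flag of the *new* observation.
    memory.append(obs, action, reward, done)
    obs = next_obs

# With ignore_episode_boundaries=False the window is zero-padded at the start
# of each episode, as the assertions above check.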
Example #4
def test_sampling():
    memory = SequentialMemory(100, window_length=2, ignore_episode_boundaries=False)
    obs_size = (3, 4)
    actions = range(5)
    
    obs0 = np.random.random(obs_size)
    terminal0 = False
    action0 = np.random.choice(actions)
    reward0 = np.random.random()
    
    obs1 = np.random.random(obs_size)
    terminal1 = False
    action1 = np.random.choice(actions)
    reward1 = np.random.random()
    
    obs2 = np.random.random(obs_size)
    terminal2 = False
    action2 = np.random.choice(actions)
    reward2 = np.random.random()
    
    obs3 = np.random.random(obs_size)
    terminal3 = True
    action3 = np.random.choice(actions)
    reward3 = np.random.random()

    obs4 = np.random.random(obs_size)
    terminal4 = False
    action4 = np.random.choice(actions)
    reward4 = np.random.random()

    obs5 = np.random.random(obs_size)
    terminal5 = False
    action5 = np.random.choice(actions)
    reward5 = np.random.random()

    obs6 = np.random.random(obs_size)
    terminal6 = False
    action6 = np.random.choice(actions)
    reward6 = np.random.random()
    
    # memory.append takes the current observation, the action, the reward received after taking
    # that action, and whether the *new* observation is terminal; thus pairing `obs0` with
    # `terminal1` is correct.
    memory.append(obs0, action0, reward0, terminal1)
    memory.append(obs1, action1, reward1, terminal2)
    memory.append(obs2, action2, reward2, terminal3)
    memory.append(obs3, action3, reward3, terminal4)
    memory.append(obs4, action4, reward4, terminal5)
    memory.append(obs5, action5, reward5, terminal6)
    assert memory.nb_entries == 6

    experiences = memory.sample(batch_size=5, batch_idxs=[0, 1, 2, 3, 4])
    assert len(experiences) == 5

    assert_allclose(experiences[0].state0, np.array([np.zeros(obs_size), obs0]))
    assert_allclose(experiences[0].state1, np.array([obs0, obs1]))
    assert experiences[0].action == action0
    assert experiences[0].reward == reward0
    assert experiences[0].terminal1 is False

    assert_allclose(experiences[1].state0, np.array([obs0, obs1]))
    assert_allclose(experiences[1].state1, np.array([obs1, obs2]))
    assert experiences[1].action == action1
    assert experiences[1].reward == reward1
    assert experiences[1].terminal1 is False

    assert_allclose(experiences[2].state0, np.array([obs1, obs2]))
    assert_allclose(experiences[2].state1, np.array([obs2, obs3]))
    assert experiences[2].action == action2
    assert experiences[2].reward == reward2
    assert experiences[2].terminal1 is True

    # The next experience has been re-sampled, since state0 would otherwise be terminal, in which
    # case we cannot have a meaningful transition because the environment gets reset. We therefore
    # only ensure that state0 is not terminal.
    assert not np.all(experiences[3].state0 == np.array([obs2, obs3]))

    assert_allclose(experiences[4].state0, np.array([np.zeros(obs_size), obs4]))
    assert_allclose(experiences[4].state1, np.array([obs4, obs5]))
    assert experiences[4].action == action4
    assert experiences[4].reward == reward4
    assert experiences[4].terminal1 is False
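Each sampled experience is a namedtuple-like object with state0, action, reward, state1 and terminal1 fields. A short sketch of the usual way to turn a sample into numpy batches (the same pattern appears in Examples #6 and #8), reusing the `memory` filled above:

# Sketch only: converting sampled Experience objects into numpy batches.
import numpy as np

experiences = memory.sample(batch_size=4)
state0_batch = np.array([e.state0 for e in experiences])
action_batch = np.array([e.action for e in experiences])
reward_batch = np.array([e.reward for e in experiences])
state1_batch = np.array([e.state1 for e in experiences])
# 0. for terminal transitions so the bootstrapped term can be masked out:
terminal1_batch = np.array([0. if e.terminal1 else 1. for e in experiences])
# One-step TD targets would then be:
#   targets = reward_batch + gamma * terminal1_batch * max_a Q_target(state1_batch, a)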
Example #5
# warm up
pi = None
for p in policy_iteration_iterator(10,
                                   0.5,
                                   file_path="/tmp/state_table.csv",
                                   save_path="/tmp/OSCAR/"):
    pi = p

for i in range(20):
    obs = env.reset()
    while True:
        s = state_from_obs(obs)
        a = pi[s.id()]
        old_obs = obs
        obs, r, done, debug_dict = env.step(a)
        memory.append(old_obs, a, r, done, False)
        if done:
            break

env.close()
env = GeneralLearningEnv(CONFIG_FILE,
                         False,
                         log_file_path=LOG_FILE,
                         publish_stats=False)

# Okay, now it's time to learn something! Visualization is disabled here because rendering
# slows down training quite a lot. You can always safely abort the training prematurely using
# Ctrl + C.
dqn.fit(env, nb_steps=500000, visualize=False, verbose=2)

# After training is done, we save the final weights.
Example #6
class MACE(Agent):
    def __init__(self, env: gym.Env, **kwargs):

        super(MACE, self).__init__(**kwargs)
        self.nb_actions = env.action_space.shape[0]

        obs_input_actor = Input(shape=(1, ) + env.observation_space.shape,
                                name='observation_input')
        x_ac = Flatten()(obs_input_actor)
        x_ac = Dense(units=256, activation='relu')(x_ac)

        obs_input_critic = Input(shape=(1, ) + env.observation_space.shape,
                                 name='observation_input')
        x_cr = Flatten()(obs_input_critic)
        x_cr = Dense(units=256, activation='relu')(x_cr)

        x_critic = Dense(units=128, activation='relu')(x_cr)
        value = Dense(units=1)(x_critic)

        x_actor = Dense(units=128, activation='relu')(x_ac)
        action = Dense(units=self.nb_actions, activation='tanh')(x_actor)

        actor = Model(inputs=obs_input_actor, outputs=action)
        critic = Model(inputs=obs_input_critic, outputs=value)

        critic_metrics = [mean_q]

        critic_optimizer = Adam(lr=1e-3)
        actor_optimizer = Adam(lr=1e-3)

        #        critic_optimizer = SGD(lr=1e-4, momentum=0.9)
        #        actor_optimizer = SGD(lr=1e-3, momentum=0.9)

        self.actor = actor
        self.critic = critic

        self.target_actor = clone_model(self.actor)
        self.target_actor.compile(optimizer='sgd', loss='mse')
        self.target_critic = clone_model(self.critic)
        self.target_critic.compile(optimizer='sgd', loss='mse')

        self.target_model_update = 1e-3
        #self.target_model_update=500

        if self.target_model_update < 1.:
            # We use the `AdditionalUpdatesOptimizer` to efficiently soft-update the target model.
            critic_updates = get_soft_target_model_updates(
                self.target_critic, self.critic, self.target_model_update)
            critic_optimizer = AdditionalUpdatesOptimizer(
                critic_optimizer, critic_updates)
            actor_updates = get_soft_target_model_updates(
                self.target_actor, self.actor, self.target_model_update)
            actor_optimizer = AdditionalUpdatesOptimizer(
                actor_optimizer, actor_updates)

        self.delta_clip = np.inf

        def clipped_error(y_true, y_pred):
            return K.mean(huber_loss(y_true, y_pred, self.delta_clip), axis=-1)

        actor.compile(actor_optimizer, loss='mse')
        critic.compile(critic_optimizer, loss='mse', metrics=critic_metrics)

        self.compiled = True

        self.memory = SequentialMemory(limit=100000, window_length=1)
        self.memory_interval = 1
        self.memory_actor = SequentialMemory(limit=100000, window_length=1)
        self.memory_critic = SequentialMemory(limit=100000, window_length=1)

        self.nb_steps_warmup = 50000

        self.train_interval = 4
        self.batch_size = 64
        self.gamma = 0.99

        self.processor = None
        self.random_process = OrnsteinUhlenbeckProcess(theta=.15,
                                                       mu=0.,
                                                       sigma=.3,
                                                       size=self.nb_actions)
        self.eps = 0.9

    def process_state_batch(self, batch):
        batch = np.array(batch)
        if self.processor is None:
            return batch
        return self.processor.process_state_batch(batch)

    def select_action(self, state):
        batch = [state]
        action = self.actor.predict_on_batch(np.asarray(batch)).flatten()
        # Apply noise, if a random process is set.
        if self.training and self.random_process is not None:
            # Bernoulli exploration variant, kept here but disabled:
            # rd = np.random.rand()
            # if rd < self.eps:
            #     noise = self.random_process.sample()
            #     action += noise
            #     self.action_exploration = True
            # else:
            #     self.action_exploration = False
            noise = self.random_process.sample()
            assert noise.shape == action.shape
            action += noise
        return action

    def forward(self, observation):
        # Select an action.

        state = self.memory.get_recent_state(observation)
        action = self.select_action(state)  # TODO: move this into policy

        # Book-keeping.
        self.recent_observation = observation
        self.recent_action = action

        return action

    def backward(self, reward, terminal=False):
        # Store most recent experience in memory.
        if self.step % self.memory_interval == 0:
            self.memory.append(self.recent_observation,
                               self.recent_action,
                               reward,
                               terminal,
                               training=self.training)

        metrics = [np.nan for _ in self.metrics_names]
        if not self.training:
            # We're done here. No need to update the experience memory since we only use the working
            # memory to obtain the state over the most recent observations.
            return metrics

        # Train the network on a single stochastic batch.
        can_train_either = self.step > self.nb_steps_warmup
        if can_train_either and self.step % self.train_interval == 0:
            experiences = self.memory.sample(self.batch_size)
            assert len(experiences) == self.batch_size

            # Start by extracting the necessary parameters (we use a vectorized implementation).
            state0_batch = []
            reward_batch = []
            action_batch = []
            terminal1_batch = []
            state1_batch = []
            for e in experiences:
                state0_batch.append(e.state0)
                state1_batch.append(e.state1)
                reward_batch.append(e.reward)
                action_batch.append(e.action)
                terminal1_batch.append(0. if e.terminal1 else 1.)

            # Prepare and validate parameters.
            state0_batch = self.process_state_batch(state0_batch)
            state1_batch = self.process_state_batch(state1_batch)
            terminal1_batch = np.array(terminal1_batch)
            reward_batch = np.array(reward_batch)
            action_batch = np.array(action_batch)
            assert reward_batch.shape == (self.batch_size, )
            assert terminal1_batch.shape == reward_batch.shape
            assert action_batch.shape == (self.batch_size, self.nb_actions)

            # Update actor and critic, if warm up is over.
            if self.step > self.nb_steps_warmup:
                if len(self.critic.inputs) >= 3:
                    state1_batch_with_action = state1_batch[:]
                else:
                    state1_batch_with_action = [state1_batch]
                target_q_values = self.target_critic.predict_on_batch(
                    state1_batch_with_action).flatten()
                assert target_q_values.shape == (self.batch_size, )

                # Compute r_t + gamma * max_a Q(s_t+1, a) and update the target ys accordingly,
                # but only for the affected output units (as given by action_batch).
                discounted_reward_batch = self.gamma * target_q_values
                discounted_reward_batch *= terminal1_batch
                assert discounted_reward_batch.shape == reward_batch.shape
                targets = (reward_batch + discounted_reward_batch).reshape(
                    self.batch_size, 1)

                # Perform a single batch update on the critic network.
                if len(self.critic.inputs) >= 3:
                    state0_batch_with_action = state0_batch[:]
                else:
                    state0_batch_with_action = [state0_batch]
                #state0_batch_with_action.insert(self.critic_action_input_idx, action_batch)
                metrics = self.critic.train_on_batch(state0_batch_with_action,
                                                     targets)
                if self.processor is not None:
                    metrics += self.processor.metrics

            #Actor
            experiences = self.memory.sample(self.batch_size)
            assert len(experiences) == self.batch_size

            # Start by extracting the necessary parameters (we use a vectorized implementation).
            state0_batch = []
            reward_batch = []
            action_batch = []
            terminal1_batch = []
            state1_batch = []
            for e in experiences:
                state0_batch.append(e.state0)
                state1_batch.append(e.state1)
                reward_batch.append(e.reward)
                action_batch.append(e.action)
                terminal1_batch.append(0. if e.terminal1 else 1.)

            # Prepare and validate parameters.
            state0_batch = self.process_state_batch(state0_batch)
            state1_batch = self.process_state_batch(state1_batch)
            terminal1_batch = np.array(terminal1_batch)
            reward_batch = np.array(reward_batch)
            action_batch = np.array(action_batch)
            assert reward_batch.shape == (self.batch_size, )
            assert terminal1_batch.shape == reward_batch.shape
            assert action_batch.shape == (self.batch_size, self.nb_actions)

            if self.step > self.nb_steps_warmup:
                # Actor update: only imitate actions with a positive temporal-difference error.
                # Rebuild the critic inputs from the freshly sampled batch; the earlier
                # `state*_batch_with_action` variables refer to the critic's batch.
                if len(self.critic.inputs) >= 3:
                    state0_batch_with_action = state0_batch[:]
                    state1_batch_with_action = state1_batch[:]
                else:
                    state0_batch_with_action = [state0_batch]
                    state1_batch_with_action = [state1_batch]
                target_q_values1 = self.target_critic.predict_on_batch(
                    state1_batch_with_action).flatten()
                discounted_reward_batch = self.gamma * target_q_values1
                discounted_reward_batch *= terminal1_batch
                targets = reward_batch + discounted_reward_batch
                target_q_values0 = self.target_critic.predict_on_batch(
                    state0_batch_with_action).flatten()
                delta = targets - target_q_values0
                if len(self.actor.inputs) >= 2:
                    inputs = state0_batch[:]
                else:
                    inputs = state0_batch
                pos_dif = delta > 0
                inputs = np.asarray(inputs)[pos_dif]
                actions_target = action_batch[pos_dif]
                self.actor.train_on_batch(inputs, actions_target)

        if self.target_model_update >= 1 and self.step % self.target_model_update == 0:
            self.update_target_models_hard()

        return metrics

    def reset_states(self):
        if self.random_process is not None:
            self.random_process.reset_states()
        self.recent_action = None
        self.recent_observation = None
        if self.compiled:
            self.actor.reset_states()
            self.critic.reset_states()
            self.target_actor.reset_states()
            self.target_critic.reset_states()

    def update_target_models_hard(self):
        self.target_critic.set_weights(self.critic.get_weights())
        self.target_actor.set_weights(self.actor.get_weights())

    @property
    def metrics_names(self):
        names = self.critic.metrics_names[:]
        if self.processor is not None:
            names += self.processor.metrics_names[:]
        return names
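MACE subclasses keras-rl's Agent and marks itself compiled in __init__, so it can be driven with the standard fit/test loop. A hypothetical usage sketch (the environment name is only an example):

# Hypothetical usage sketch for the MACE agent defined above.
import gym

env = gym.make('Pendulum-v0')  # any continuous-action environment
agent = MACE(env)
agent.fit(env, nb_steps=100000, visualize=False, verbose=2)
agent.test(env, nb_episodes=5, visualize=False)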
Example #7
class DDPG(object):
    def __init__(self, n_state, log_writer, args):
        self.n_state = n_state
        self.log_writer = log_writer
        self.output = args.output

        self.action_start = args.action_start
        self.action_end = args.action_end
        self.n_action = self.action_end - self.action_start + 1

        # create actor and critic network
        net_config = {
            'n_state': self.n_state,
            'n_action': self.n_action,
            'hidden1': args.hidden1,
            'hidden2': args.hidden2
        }

        self.actor = Actor(**net_config)
        self.actor_target = Actor(**net_config)
        self.actor_optim = Adam(self.actor.parameters(), lr=args.lr_a)
        self.critic = Critic(**net_config)
        self.critic_target = Critic(**net_config)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.lr_c)

        # make sure target is with the same weight
        self.hard_update(self.actor_target, self.actor)
        self.hard_update(self.critic_target, self.critic)

        # create replay buffer
        self.memory = SequentialMemory(size=args.rmsize)

        # hyper-parameters
        self.batch_size = args.bsize
        self.discount = args.discount
        self.tau = args.tau

        # noise ???
        '''
        
        '''

        if torch.cuda.is_available():
            self.cuda()

        # moving average baseline
        self.moving_average = None
        self.moving_alpha = args.moving_alpha

    def cuda(self):
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def random_action(self):
        # print('random_int')
        return random.randint(self.action_start, self.action_end)

    def select_action(self, state):
        action_prob = to_numpy(self.actor(to_tensor(state.reshape(
            1, -1)))).squeeze(0)
        dice = stats.rv_discrete(
            values=(range(self.action_start, self.action_end + 1),
                    action_prob))
        action = dice.rvs(size=1)

        # print(action_prob)
        # print('select action: {}'.format(action[0]))
        return action[0]

    def get_exact_action(self, state_batch, kind):
        if kind == 0:
            action_prob = self.actor_target(state_batch)
        else:
            action_prob = self.actor(state_batch)

        max_val, prediction = torch.max(action_prob, 1)
        prediction = prediction.reshape(self.batch_size, -1).float()
        return prediction / self.n_action

    def update_policy(self):
        # sample batch
        # print('start update policy\n')
        # self.log_writer.flush()

        state_batch, action_batch, reward_batch, \
        next_state_batch, terminal_batch = self.memory.sample_and_split(self.batch_size)

        action_batch = (action_batch - self.action_start) / self.n_action

        # normalize the reward
        batch_mean_reward = reward_batch.mean().item()
        if self.moving_average is None:
            self.moving_average = batch_mean_reward
        else:
            self.moving_average += self.moving_alpha * (batch_mean_reward -
                                                        self.moving_average)
        reward_batch -= self.moving_average

        # update critic
        self.critic.zero_grad()

        q_batch = self.critic([state_batch, action_batch])

        with torch.no_grad():  # prepare for the target q batch
            next_q_values = self.critic_target(
                [next_state_batch,
                 self.get_exact_action(next_state_batch, 0)])
        target_q_batch = reward_batch + self.discount * terminal_batch * next_q_values

        # `criterion` is assumed to be defined at module level (e.g. nn.MSELoss()).
        value_loss = criterion(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()

        # update actor
        self.actor.zero_grad()

        policy_loss = -self.critic(
            [state_batch, self.get_exact_action(state_batch, 1)])
        policy_loss = policy_loss.mean()
        policy_loss.backward()
        self.actor_optim.step()

        # print('end update policy\n')
        # self.log_writer.flush()

        # target network update
        self.soft_update(self.actor_target, self.actor)
        self.soft_update(self.critic_target, self.critic)

    def hard_update(self, target, source):
        for target_param, source_param in zip(target.parameters(),
                                              source.parameters()):
            target_param.data.copy_(source_param.data)

    def soft_update(self, target, source):
        for target_param, source_param in zip(target.parameters(),
                                              source.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - self.tau) +
                                    source_param.data * self.tau)

    def append_replay(self, s_t, a_t, r_t, done):
        self.memory.append(s_t, a_t, r_t, done)

    def save_model(self, num):
        torch.save(self.actor.state_dict(),
                   '{}/actor-{}.pkl'.format(self.output, num))
        torch.save(self.critic.state_dict(),
                   '{}/critic-{}.pkl'.format(self.output, num))

    def load_weights(self, state_dir, num):
        self.actor.load_state_dict(
            torch.load('{}/actor-{}.pkl'.format(state_dir, num)))
        self.critic.load_state_dict(
            torch.load('{}/critic-{}.pkl'.format(state_dir, num)))
        self.actor_target.load_state_dict(
            torch.load('{}/actor-{}.pkl'.format(state_dir, num)))
        self.critic_target.load_state_dict(
            torch.load('{}/critic-{}.pkl'.format(state_dir, num)))
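This DDPG variant stores raw transitions through append_replay and trains from them with update_policy. A hypothetical interaction loop using only the methods defined above (`env`, `args`, `warmup` and `num_episodes` are placeholders, not part of the original example):

# Hypothetical training loop for the DDPG class above.
agent = DDPG(n_state=env.observation_space.shape[0], log_writer=None, args=args)

step = 0
for episode in range(num_episodes):
    state = env.reset()
    done = False
    while not done:
        if step < warmup:
            action = agent.random_action()     # random discrete action in [action_start, action_end]
        else:
            action = agent.select_action(state)
        next_state, reward, done, info = env.step(action)
        agent.append_replay(state, action, reward, done)
        if step >= warmup:
            agent.update_policy()
        state = next_state
        step += 1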
Example #8
class deepAMDP():
    def __init__(self,
                 inputDim=16,
                 alpha=1e-4,
                 gamma=0.99,
                 epsilon=0.1,
                 numberOfActions=0,
                 tau=1e-1):

        self.predictionModel = Sequential()
        self.predictionModel.add(
            Dense(16, input_dim=inputDim, activation='relu'))
        self.predictionModel.add(Dense(16, activation='relu'))
        self.predictionModel.add(Dense(numberOfActions, activation='linear'))
        self.predictionModel.compile(loss="mse", optimizer=Adam(lr=alpha))

        self.targetModel = Sequential()
        self.targetModel.add(Dense(16, input_dim=inputDim, activation='relu'))
        self.targetModel.add(Dense(16, activation='relu'))
        self.targetModel.add(Dense(numberOfActions, activation="linear"))
        self.targetModel.compile(loss="mse", optimizer=Adam(lr=alpha))

        self.memory = SequentialMemory(limit=100000, window_length=1)

        self.otherMemory = deque(maxlen=2000)

        self.numberOfActions = numberOfActions
        self.alpha = alpha
        self.gamma = gamma
        self.tau = tau
        self.epsilon = epsilon

        self.initialEpsilon = 0.1
        self.finalEpsilon = 0.01
        self.currentEpsilon = self.initialEpsilon
        self.episodesToDecay = 500

    def addExperience(self, latentState, action, reward, done):
        self.memory.append(latentState, action, reward, done)
        #print(latentState, action, reward)

    def memorize(self, state, action, reward, next_state, done):
        self.otherMemory.append((state, action, reward, next_state, done))

    def action(self, state):
        if np.random.random() < self.currentEpsilon:
            return np.random.randint(self.numberOfActions)
        state = state.reshape(1, -1)
        qValues = self.predictionModel.predict(state)
        return np.argmax(qValues[0])

    def replay(self, batchSize=8):
        #print("replay")
        #if len(self.memory) < batchSize:
        #    return

        experiences = self.memory.sample(batchSize)

        # Start by extracting the necessary parameters (we use a vectorized implementation).
        state0Batch = []
        rewardBatch = []
        actionBatch = []
        terminal1Batch = []
        state1Batch = []
        for e in experiences:
            # print(e.state0, e.state1, e.reward, e.action)
            state0Batch.append(e.state0[0])
            state1Batch.append(e.state1[0])
            rewardBatch.append(e.reward)
            actionBatch.append(e.action)
            terminal1Batch.append(0. if e.terminal1 else 1.)

        state0Batch = np.array(state0Batch)
        rewardBatch = np.array(rewardBatch)
        actionBatch = np.array(actionBatch)
        terminal1Batch = np.array(terminal1Batch)
        state1Batch = np.array(state1Batch)

        #state0Batch = normalize(state0Batch, axis=-1)
        #state1Batch = normalize(state1Batch, axis=-1)

        targetQValues = self.targetModel.predict_on_batch(state1Batch)

        #print("Target Q Values")
        #print(targetQValues)
        #print("Target Q 1")
        #print(state1Batch[0])
        qBatch = np.max(targetQValues, axis=1).flatten()
        #targets = np.zeros((batchSize, self.numberOfActions))
        #targets = np.random.rand(batchSize, self.numberOfActions)
        # Start from the current predictions for state0 so only the taken action's
        # target is overwritten below.
        targets = self.predictionModel.predict_on_batch(state0Batch)

        discountedRewardBatch = self.gamma * qBatch
        discountedRewardBatch *= terminal1Batch
        Rs = rewardBatch + discountedRewardBatch

        #print("Our Targets")
        #print(targets)
        for (target, r, action) in zip(targets, Rs, actionBatch):
            target[action] = r
        #print("Updated Targets")
        #print(targets)

        self.predictionModel.fit(state0Batch, targets, verbose=0)
        #self.updateTargetModel()

    def otherReplay(self, batch_size=8):
        minibatch = random.sample(self.otherMemory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            target = reward
            state = state.reshape(1, -1)
            next_state = next_state.reshape(1, -1)
            if not done:
                target = (reward + self.gamma *
                          np.amax(self.predictionModel.predict(next_state)[0]))
            target_f = self.predictionModel.predict(state)
            target_f[0][action] = target
            self.predictionModel.fit(state, target_f, epochs=1, verbose=0)
        #self.updateTargetModel()

    def updateTargetModel(self):
        predictionWeights = self.predictionModel.get_weights()
        targetWeights = self.targetModel.get_weights()

        for i in range(0, len(targetWeights)):
            targetWeights[i] = predictionWeights[i] * self.tau + targetWeights[
                i] * (1 - self.tau)
        self.targetModel.set_weights(targetWeights)
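A hypothetical loop showing how the class above is meant to be driven: latent states go into the keras-rl SequentialMemory via addExperience, and replay()/updateTargetModel() perform the DQN-style updates. Here `env` and `encode` are placeholders for the environment and the latent-state encoder.

# Hypothetical usage sketch for deepAMDP; env and encode() are placeholders.
agent = deepAMDP(inputDim=16, numberOfActions=env.action_space.n)

for episode in range(500):
    obs = env.reset()
    done = False
    while not done:
        latent = encode(obs)                 # placeholder: maps obs to a 16-d latent vector
        action = agent.action(latent)
        obs, reward, done, info = env.step(action)
        agent.addExperience(latent, action, reward, done)
        if agent.memory.nb_entries > 64:
            agent.replay(batchSize=8)
    agent.updateTargetModel()
    # linearly anneal epsilon toward finalEpsilon over episodesToDecay episodes
    agent.currentEpsilon = max(
        agent.finalEpsilon,
        agent.currentEpsilon
        - (agent.initialEpsilon - agent.finalEpsilon) / agent.episodesToDecay)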