Example #1
    def __init__(self, memory_size, batch_size, learn_start_time, learn_fre, lr, replay_iters, eps_T, eps_t_init,
                 gamma, update_period, board, device, model_path, r_memory_Fname, o_model_name, model_load=False):
        self.step_now = 0  # record the current step
        self.reward_num = 0
        self.reward_accumulated = 0  # delayed reward
        self.final_tem = 10  # just for now
        self.step_last_update = 0  # record the step of the last target-network update
        self.update_period = update_period  # update period for the off-policy target network
        self.learn_start_time = learn_start_time
        self.gamma = gamma
        self.batch_size = batch_size
        self.memory_size = memory_size
        self.alpha = 0.6
        self.beta = 0.4
        self.replay_beta_iters = replay_iters
        self.replay_eps = 1e-6
        self.memory_min_num = 1000  # minimum number of stored transitions before learning starts
        self.step_last_learn = 0  # record the step of the last learning update
        self.learn_fre = learn_fre  # step frequency at which to learn
        self.e_greedy = 1  # current epsilon for the epsilon-greedy policy
        self.eps_T = eps_T  # horizon parameter for the epsilon schedule (roughly 800,000 steps)
        self.eps_t_init = eps_t_init  # parameter for updating epsilon

        self.device = device
        self.model_path = model_path
        self.mode_enjoy = model_load
        if not model_load:
            self.policy_net = DQN(board[0], board[1], action_num).to(device)
            self.target_net = DQN(board[0], board[1], action_num).to(device)
            self.optimizer = optim.Adagrad(self.policy_net.parameters(), lr=lr)
            self.loss_fn = nn.functional.mse_loss  # use the MSE loss
            self.memory = PrioritizedReplayBuffer(memory_size, self.alpha)
            self.beta_schedule = LinearSchedule(self.replay_beta_iters, self.beta, 1.0)
        else:
            self.load(o_model_name)
        # self.optimizer = optim.RMSprop(self.policy_net.parameters(), lr=lr)
        self.obs_new = None
        self.obs_old = None
        self.action = None
        self.action_old = None
        self.dqn_direct_flag = False  # set once the DQN-selected action has been executed
        self.model_save_flag = False
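Example #1 leaves several helpers (DQN, PrioritizedReplayBuffer, LinearSchedule, action_num) to the surrounding project. For reference, a minimal LinearSchedule sketch in the OpenAI Baselines style, which matches the keyword usage in the later examples; note that in this signature the second positional argument is final_p, so a positional call like the one above anneals from 1.0 toward self.beta.

class LinearSchedule(object):
    """Linearly interpolate from initial_p to final_p over schedule_timesteps,
    then stay constant at final_p (OpenAI Baselines-style interface)."""

    def __init__(self, schedule_timesteps, final_p, initial_p=1.0):
        self.schedule_timesteps = schedule_timesteps
        self.final_p = final_p
        self.initial_p = initial_p

    def value(self, t):
        # Fraction of the schedule that has elapsed, clipped to [0, 1].
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)

# e.g. a prioritized-replay beta annealed from 0.4 to 1.0:
#   beta_schedule = LinearSchedule(replay_beta_iters, final_p=1.0, initial_p=0.4)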
Example #2
def parse_eps_schedule(eps_schedule_str):
    algo, extra_args = eps_schedule_str.split(':')
    if algo == 'linear':
        steps, init_p, final_p = map(float, extra_args.split(','))
        steps = int(steps)
        return LinearSchedule(steps, initial_p=init_p, final_p=final_p)
    elif algo == 'const':
        val = extra_args.split(',')[0]
        return ConstantSchedule(float(val))
    else:
        raise NotImplementedError()
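parse_eps_schedule expects strings of the form 'algo:args', e.g. 'linear:10000,1.0,0.05' or 'const:0.1' (these exact strings are only illustrative). ConstantSchedule is the trivial counterpart of the LinearSchedule sketched after Example #1; a minimal sketch with the same value(t) interface:

class ConstantSchedule(object):
    """Return the same epsilon at every step."""

    def __init__(self, value):
        self._value = value

    def value(self, t):
        return self._value

# Illustrative usage with the parser above:
#   parse_eps_schedule('linear:10000,1.0,0.05').value(5000)  # -> 0.525, halfway through the anneal
#   parse_eps_schedule('const:0.1').value(12345)             # -> 0.1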
Example #3
def init_dqn(args):
    """Intitialises and returns the necessary objects for
       Deep Q-learning:
       Q-network, target network, replay buffer and optimizer.
    """
    logging.info(
        "Initialisaling DQN with architecture {} and optimizer {}".format(
            args.dqn_archi, args.optimizer_agent))
    if args.dqn_archi == 'mlp':
        q_net = DQN(args.obs_shape, args.n_actions, args)
        q_target = DQN(args.obs_shape, args.n_actions, args)
    elif args.dqn_archi == 'cnn':
        q_net = CnnDQN(args.obs_shape, args.n_actions, args)
        q_target = CnnDQN(args.obs_shape, args.n_actions, args)
    if args.optimizer_agent == 'RMSProp':
        optimizer_agent = optim.RMSprop(q_net.parameters(),
                                        lr=args.lr_agent,
                                        weight_decay=args.lambda_agent)
    else:
        assert args.optimizer_agent == 'Adam'
        optimizer_agent = optim.Adam(q_net.parameters(),
                                     lr=args.lr_agent,
                                     weight_decay=args.lambda_agent)
    q_target.load_state_dict(
        q_net.state_dict())  # set params of q_target to be the same
    replay_buffer = ReplayBuffer(args.replay_buffer_size)

    if args.epsilon_annealing_scheme == 'linear':
        epsilon_schedule = LinearSchedule(schedule_timesteps=int(
            args.exploration_fraction * args.n_agent_steps),
                                          initial_p=args.epsilon_start,
                                          final_p=args.epsilon_stop)
    else:
        assert args.epsilon_annealing_scheme == 'exp'
        epsilon_schedule = ExpSchedule(decay_rate=args.epsilon_decay,
                                       final_p=args.epsilon_stop,
                                       initial_p=args.epsilon_start)

    return q_net, q_target, replay_buffer, optimizer_agent, epsilon_schedule
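LinearSchedule here matches the Baselines-style schedule sketched after Example #1; ExpSchedule is project-specific and not shown, so the sketch below is only an assumed implementation: per-step exponential decay from initial_p toward a floor of final_p, driven by decay_rate.

class ExpSchedule(object):
    """Assumed exponential-decay schedule: eps(t) = max(final_p, initial_p * decay_rate ** t)."""

    def __init__(self, decay_rate, final_p, initial_p=1.0):
        self.decay_rate = decay_rate
        self.final_p = final_p
        self.initial_p = initial_p

    def value(self, t):
        # Decay multiplicatively each step, never dropping below final_p.
        return max(self.final_p, self.initial_p * self.decay_rate ** t)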
Example #4
    def learn(self, total_timesteps=None, total_episodes=None, log_interval=100, ckpt_interval=100, ckpt_path=None):
        last_100rewards = np.zeros(100)
        last_100rewards[:] = np.NaN

        if total_timesteps and total_episodes:
            raise ValueError("Only one of total_timesteps or total_episodes can be specified")

        
        if ckpt_path is None:
            print('Checkpoint path not provided; no intermediate models will be saved')
    
        loop_type = 'episode' if total_episodes else 'timesteps'
        loop_var = total_timesteps if total_timesteps is not None else total_episodes

        # if self.exploration_frac is None:
        #     self.exploration = LinearSchedule(frac=self.exploration_ep,
        #                                       initial=self.exploration_initial_eps,
        #                                       final=self.exploration_final_eps)
        # else:
        #     self.exploration = LinearSchedule(frac=self.exploration_frac * loop_var,
        #                                       initial=self.exploration_initial_eps,
        #                                       final=self.exploration_final_eps)

        if self.exploration_type == 'linear':
            self.exploration = LinearSchedule(
                frac=self.exploration_frac * loop_var,
                initial=self.exploration_initial_eps,
                final=self.exploration_final_eps)
        elif self.exploration_type == 'exponential':
            self.exploration = ExponentialSchedule(
                frac=self.exploration_frac,
                initial=self.exploration_initial_eps,
                final=self.exploration_final_eps)

        train = True
        done = False
        step = 0
        ep_reward = 0
        obs = self.env.reset()

        while train:
            if loop_type == 'episode':
                update_eps = self.exploration.value(self.ep_done)
            if loop_type == 'timesteps':
                update_eps = self.exploration.value(self.elapsed_steps)

            if np.random.random_sample() > update_eps:
                action, value = self.policy.predict(obs, deterministic=True)
            else:
                action, value = self.policy.predict(obs, deterministic=False)

            next_obs, reward, done, info = self.env.step(action)

            # print(step, next_obs, self.qvalues[next_obs])
            # argmax_a = np.argmax(self.qvalues[next_obs])
            # argmax_a, _ = self.policy.predict(obs, deterministic=True)
            argmax_a = np.argmax(self.qvalues[next_obs])

            if isinstance(self.observation_space, Tuple):
                # print(obs, action)
                expected_reward = reward + self.gamma*self.qvalues[next_obs + (argmax_a,)]*(1-int(done))-self.qvalues[obs + (action,)]
                self.qvalues[obs + (action,)] += self.learning_rate * expected_reward

                if self.policy.intent:
                    intent_update = np.zeros(self.qvalues.shape)
                    intent_update[obs + (action,)] += 1
                    expected_intent = intent_update + self.gamma * self.hvalues[next_obs + (argmax_a,)] * (1-int(done)) - self.hvalues[obs + (action,)]
                    self.hvalues[obs + (action,)] = self.hvalues[obs + (action,)] + self.learning_rate * expected_intent

            if isinstance(self.observation_space, Discrete):
                expected_reward = reward + self.gamma*np.max(self.qvalues[next_obs])*(1-int(done))-self.qvalues[obs, action]
                self.qvalues[obs, action] += self.learning_rate * expected_reward

                if self.policy.intent:
                    intent_update = np.zeros(self.qvalues.shape)
                    intent_update[obs, action] += 1
                    expected_intent = intent_update + self.gamma * self.hvalues[next_obs, argmax_a] * (1-int(done)) - self.hvalues[obs, action]
                    self.hvalues[obs, action] = self.hvalues[obs, action] + self.learning_rate * expected_intent

            obs = next_obs
            step += 1
            ep_reward += reward
            self.elapsed_steps += 1

            if loop_type == 'timesteps':
                if self.elapsed_steps == total_timesteps:
                    train = False

            if done:
                # print(step)
                last_100rewards[self.ep_done%100] = ep_reward
                print("\rEpisode {}/{}, Average Reward {}".format(self.ep_done,total_episodes, np.nanmean(last_100rewards)),end="")
                self.ep_done += 1
                step = 0
                ep_reward = 0
                obs = self.env.reset()
                if loop_type == 'episode':
                    if self.ep_done >= total_episodes:
                        train = False

            if ckpt_path is not None and ckpt_interval:
                if loop_type == 'episode':
                    if self.ep_done % ckpt_interval == 0 and done:
                        ckpt_str = str(self.ep_done)
                        full_path = ckpt_path + '/' + ckpt_str
                        # super(DBNModel, self).save(full_path)
                        super(QTabularRLModel, self).save(full_path)

                if loop_type == 'timesteps':
                    if self.elapsed_steps % ckpt_interval == 0 and done:
                        ckpt_str = str(self.ep_done)
                        full_path = ckpt_path + '/' + ckpt_str
                        # super(DBNModel, self).save(full_path)
                        super(QTabularRLModel, self).save(full_path)
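The Discrete branch of learn() is a standard tabular Q-learning update. A stand-alone sketch of that single step (function and variable names here are illustrative, not attributes of the class above):

import numpy as np

def q_learning_update(qvalues, obs, action, reward, next_obs, done,
                      learning_rate=0.1, gamma=0.99):
    """One tabular step: Q(s,a) += lr * (r + gamma * max_a' Q(s',a') * (1 - done) - Q(s,a))."""
    target = reward + gamma * np.max(qvalues[next_obs]) * (1 - int(done))
    td_error = target - qvalues[obs, action]
    qvalues[obs, action] += learning_rate * td_error
    return td_error

# Example with 4 states and 2 actions:
q = np.zeros((4, 2))
q_learning_update(q, obs=0, action=1, reward=1.0, next_obs=2, done=False)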
Example #5
def main(initEnvStride, envStride, fileIn, fileOut, inputmaxtimesteps):

    np.set_printoptions(formatter={'float_kind': lambda x: "%.2f" % x})

    # Create environment and set stride parameters for this problem instance.
    # Most of the time, these two stride parameters will be equal. However,
    # one might use a smaller stride for initial placement and a larger stride
    # for action specification in order to speed things up. Unfortunately, this
    # could cause the problem to be infeasible: no grasp might work for a given
    # initial setup.
    env = envstandalone.PuckArrange()
    env.initStride = initEnvStride  # stride for initial puck placement
    env.stride = envStride  # stride for action specification

    # Standard q-learning parameters
    reuseModels = None
    max_timesteps = inputmaxtimesteps
    exploration_fraction = 0.5
    exploration_final_eps = 0.1
    gamma = .90
    num_cpu = 16

    # Used by buffering and DQN
    learning_starts = 60
    buffer_size = 1000
    batch_size = 32
    target_network_update_freq = 1
    train_freq = 1
    print_freq = 1
    lr = 0.0003

    # Set parameters related to shape of the patch and the number of patches
    descriptorShape = (env.blockSize * 3, env.blockSize * 3, 2)
    #    descriptorShapeSmall = (10,10,2)
    #    descriptorShapeSmall = (15,15,2)
    descriptorShapeSmall = (20, 20, 2)
    num_states = 2  # either holding or not
    num_patches = len(env.moveCenters)**2
    num_actions = 2 * num_patches

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)
    #    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
    #                                 initial_p=exploration_final_eps,
    #                                 final_p=exploration_final_eps)

    # Set parameters for prioritized replay. You can turn this off by
    # setting prioritized_replay below to False.
    prioritized_replay = True
    #    prioritized_replay=False
    prioritized_replay_alpha = 0.6
    prioritized_replay_beta0 = 0.4
    prioritized_replay_beta_iters = None
    prioritized_replay_eps = 1e-6
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size,
                                                alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None
    beta = 1

    # Create neural network
    q_func = models.cnn_to_mlp(convs=[(16, 3, 1)], hiddens=[32], dueling=True)

    # Build tensorflow functions
    def make_obs_ph(name):
        return U.BatchInput(env.observation_space.spaces[0].shape, name=name)

    def make_actionDeic_ph(name):
        return U.BatchInput(descriptorShapeSmall, name=name)

    def make_target_ph(name):
        return U.BatchInput([1], name=name)

    def make_weight_ph(name):
        return U.BatchInput([1], name=name)

    getMoveActionDescriptors = build_getMoveActionDescriptors(
        make_obs_ph=make_obs_ph,
        actionShape=descriptorShape,
        actionShapeSmall=descriptorShapeSmall,
        stride=env.stride)

    getqNotHolding = build_getq(make_actionDeic_ph=make_actionDeic_ph,
                                q_func=q_func,
                                num_states=num_states,
                                num_cascade=5,
                                scope="deepq",
                                qscope="q_func_notholding",
                                reuse=reuseModels)
    getqHolding = build_getq(make_actionDeic_ph=make_actionDeic_ph,
                             q_func=q_func,
                             num_states=num_states,
                             num_cascade=5,
                             scope="deepq",
                             qscope="q_func_holding",
                             reuse=reuseModels)

    targetTrainNotHolding = build_targetTrain(
        make_actionDeic_ph=make_actionDeic_ph,
        make_target_ph=make_target_ph,
        make_weight_ph=make_weight_ph,
        q_func=q_func,
        num_states=num_states,
        num_cascade=5,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        scope="deepq",
        qscope="q_func_notholding",
        grad_norm_clipping=1.,
        reuse=reuseModels)

    targetTrainHolding = build_targetTrain(
        make_actionDeic_ph=make_actionDeic_ph,
        make_target_ph=make_target_ph,
        make_weight_ph=make_weight_ph,
        q_func=q_func,
        num_states=num_states,
        num_cascade=5,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        scope="deepq",
        qscope="q_func_holding",
        grad_norm_clipping=1.,
        reuse=reuseModels)

    # Initialize tabular state-value function. There are only two states (holding, not holding), so this is very easy.
    lrState = 0.1
    V = np.zeros([2])

    # Start tensorflow session
    sess = U.make_session(num_cpu)
    sess.__enter__()

    # Initialize things
    obs = env.reset()
    episode_rewards = [0.0]
    timerStart = time.time()
    U.initialize()

    # Load neural network model if one was specified.
    if fileIn != "None":
        saver = tf.train.Saver()
        saver.restore(sess, fileIn)
        fileInV = fileIn + 'V.npy'
        V = np.load(fileInV)

    # Iterate over time steps
    for t in range(max_timesteps):

        # Get action set: <num_patches> pick actions followed by <num_patches> place actions
        moveDescriptors = getMoveActionDescriptors([obs[0]])
        moveDescriptors = moveDescriptors * 2 - 1
        actionsPickDescriptors = np.stack(
            [moveDescriptors,
             np.zeros(np.shape(moveDescriptors))], axis=3)
        actionsPlaceDescriptors = np.stack(
            [np.zeros(np.shape(moveDescriptors)), moveDescriptors], axis=3)
        actionDescriptors = np.r_[actionsPickDescriptors,
                                  actionsPlaceDescriptors]

        # Get qCurr. I split up pick and place in order to accommodate larger batches
        qCurrNotHoldingPick = getqNotHolding(actionsPickDescriptors)
        qCurrHoldingPick = getqHolding(actionsPickDescriptors)
        qCurrNotHoldingPlace = getqNotHolding(actionsPlaceDescriptors)
        qCurrHoldingPlace = getqHolding(actionsPlaceDescriptors)
        qCurr = np.concatenate([
            np.r_[qCurrNotHoldingPick, qCurrNotHoldingPlace],
            np.r_[qCurrHoldingPick, qCurrHoldingPlace]
        ],
                               axis=1)

        # Update tabular state-value function using V(s) = max_a Q(s,a)
        thisStateValues = np.max(qCurr[:, obs[1]])
        V[obs[1]] = (1 - lrState) * V[obs[1]] + lrState * thisStateValues

        # Select e-greedy action to execute
        qCurrNoise = qCurr + np.random.random(np.shape(
            qCurr)) * 0.01  # add small amount of noise to break ties randomly
        action = np.argmax(qCurrNoise[:, obs[1]])
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(num_actions)

        # Execute action
        new_obs, rew, done, _ = env.step(action)
        replay_buffer.add(cp.copy(obs[1]),
                          np.copy(actionDescriptors[action, :]), cp.copy(rew),
                          cp.copy(new_obs[1]), cp.copy(float(done)))

        if t > learning_starts and t % train_freq == 0:

            # Get batch
            if prioritized_replay:
                beta = beta_schedule.value(t)
                states_t, actionPatches, rewards, states_tp1, dones, weights, batch_idxes = replay_buffer.sample(
                    batch_size, beta)
            else:
                states_t, actionPatches, rewards, states_tp1, dones = replay_buffer.sample(
                    batch_size)
                weights, batch_idxes = np.ones_like(rewards), None

            # Calculate target
            targets = rewards + (1 - dones) * gamma * V[states_tp1]

            # Get current q-values and calculate td error and q-value targets
            qCurrTargetNotHolding = getqNotHolding(actionPatches)
            qCurrTargetHolding = getqHolding(actionPatches)
            qCurrTarget = np.concatenate(
                [qCurrTargetNotHolding, qCurrTargetHolding], axis=1)
            td_error = qCurrTarget[range(batch_size), states_t] - targets
            qCurrTarget[range(batch_size), states_t] = targets

            # Train
            targetTrainNotHolding(
                actionPatches, np.reshape(qCurrTarget[:, 0], [batch_size, 1]),
                np.reshape(weights, [batch_size, 1]))
            targetTrainHolding(actionPatches,
                               np.reshape(qCurrTarget[:, 1], [batch_size, 1]),
                               np.reshape(weights, [batch_size, 1]))

            # Update replay priorities using td_error
            if prioritized_replay:
                new_priorities = np.abs(td_error) + prioritized_replay_eps
                replay_buffer.update_priorities(batch_idxes, new_priorities)

        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(
                episode_rewards) % print_freq == 0:
            timerFinal = time.time()
            #            print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", time elapsed: " + str(timerFinal - timerStart) + ", tderror: " + str(mean_100ep_tderror))
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " +
                  str(int(100 * exploration.value(t))) + ", time elapsed: " +
                  str(timerFinal - timerStart))
            #            print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))))
            #            print("time to do training: " + str(timerFinal - timerStart))
            timerStart = timerFinal

        obs = np.copy(new_obs)

    # save what we learned
    if fileOut != "None":
        saver = tf.train.Saver()
        saver.save(sess, fileOut)
        fileOutV = fileOut + 'V'
        print("fileOutV: " + fileOutV)
        np.save(fileOutV, V)

    # display value function
    obs = env.reset()
    moveDescriptors = getMoveActionDescriptors([obs[0]])
    moveDescriptors = moveDescriptors * 2 - 1
    gridSize = np.int32(np.sqrt(np.shape(moveDescriptors)[0]))

    actionsPickDescriptors = np.stack(
        [moveDescriptors, np.zeros(np.shape(moveDescriptors))], axis=3)
    actionsPlaceDescriptors = np.stack(
        [np.zeros(np.shape(moveDescriptors)), moveDescriptors], axis=3)

    print(str(obs[0][:, :, 0]))

    qPickNotHolding = getqNotHolding(actionsPickDescriptors)
    qPickHolding = getqHolding(actionsPickDescriptors)
    qPick = np.concatenate([qPickNotHolding, qPickHolding], axis=1)
    print("Value function for pick action in hold-nothing state:")
    print(str(np.reshape(qPick[:, 0], [gridSize, gridSize])))

    qPlaceNotHolding = getqNotHolding(actionsPlaceDescriptors)
    qPlaceHolding = getqHolding(actionsPlaceDescriptors)
    qPlace = np.concatenate([qPlaceNotHolding, qPlaceHolding], axis=1)
    print("Value function for place action in hold-1 state:")
    print(str(np.reshape(qPlace[:, 1], [gridSize, gridSize])))

    plt.subplot(1, 3, 1)
    plt.imshow(np.tile(env.state[0], [1, 1, 3]), vmin=5, vmax=12)
    plt.subplot(1, 3, 2)
    plt.imshow(np.reshape(qPick[:, 0], [gridSize, gridSize]), vmin=5, vmax=12)
    plt.subplot(1, 3, 3)
    plt.imshow(np.reshape(qPlace[:, 1], [gridSize, gridSize]), vmin=5, vmax=12)
    plt.show()
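Rather than bootstrapping from a target network, the training step above uses a two-entry tabular state-value function: V(s) is nudged toward max_a Q(s, a), and the one-step targets are r + gamma * V(s') on non-terminal transitions. A small numpy sketch of those two updates (array names are illustrative):

import numpy as np

gamma = 0.90
lrState = 0.1
V = np.zeros(2)  # V[0]: not holding, V[1]: holding

def update_state_value(V, state, q_max, lrState=0.1):
    # Exponential moving average of max_a Q(s, a) for the visited state.
    V[state] = (1 - lrState) * V[state] + lrState * q_max
    return V

# One-step targets for a sampled batch:
rewards = np.array([0.0, 1.0, 0.0])
dones = np.array([0.0, 0.0, 1.0])
states_tp1 = np.array([0, 1, 0])
targets = rewards + (1 - dones) * gamma * V[states_tp1]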
Example #6
def main():
    
    np.set_printoptions(formatter={'float_kind':lambda x: "%.2f" % x})
    
    # Define environment
    env = envstandalone.BlockArrange()

    # Dictionary-based value function
    q_func_tabular = {}

    # each row of vectorKey must be a boolean vector short enough to pack into 64 bits
    def getTabularKeys(vectorKey):
        obsBits = np.packbits(vectorKey,1)
        obsKeys = 0
        for i in range(np.shape(obsBits)[1]):
            # IMPORTANT: the number of bits in the type cast below (UINT64) must be at least as big
            # as the bits required to encode obsBits. If it is too small, we get hash collisions...
            obsKeys = obsKeys + (256**i) * np.uint64(obsBits[:,i])
        return obsKeys
    
    def getTabular(vectorKey):
        keys = getTabularKeys(vectorKey)
#        return np.array([q_func[x] if x in q_func else 0*np.ones(num_states) for x in keys])
        return np.array([q_func_tabular[x] if x in q_func_tabular else 10*np.ones(num_states) for x in keys])
    
    def trainTabular(vectorKey,qCurrTargets,weights):
        keys = getTabularKeys(vectorKey)
        alpha=0.2
        for i in range(len(keys)):
            if keys[i] in q_func_tabular:
#                q_func[keys[i]] = (1-alpha)*q_func[keys[i]] + alpha*qCurrTargets[i]
                q_func_tabular[keys[i]] = q_func_tabular[keys[i]] + alpha*weights[i,:]*(qCurrTargets[i] - q_func_tabular[keys[i]]) # (1-alpha)*q_func[keys[i]] + alpha*qCurrTargets[i]
            else:
                q_func_tabular[keys[i]] = qCurrTargets[i]


    # Standard DQN parameters
#    max_timesteps=20000
    max_timesteps=30000
#    max_timesteps=2000
    learning_starts=1000
#    learning_starts=10
#    buffer_size=50000
    buffer_size=10000
#    buffer_size=1000
#    buffer_size=320
#    buffer_size=32
#    buffer_size=8
#    buffer_size=1
#    exploration_fraction=0.2
    exploration_fraction=0.3
#    exploration_final_eps=0.02
    exploration_final_eps=0.1
    print_freq=1
#    gamma=.98
    gamma=.9
    target_network_update_freq=1
    batch_size=32
#    batch_size=1
    train_freq=1
#    train_freq=2
    num_cpu = 16
#    lr=0.001
    lr=0.0003
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    prioritized_replay=True
#    prioritized_replay=False
#    prioritized_replay_alpha=1.0
    prioritized_replay_alpha=0.6
    prioritized_replay_beta0=0.4
    prioritized_replay_beta_iters=None
#    prioritized_replay_beta_iters=20000
    prioritized_replay_eps=1e-6
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None
    beta = 1
    
    # Deictic state/action parameters
    deicticShape = (3,3,2) # IMPORTANT: first two elts of deicticShape must be odd
    deicticActionShape = (3,3,2)
    num_cascade = 5
#    num_states = env.num_blocks + 1 # one more state than blocks to account for not holding anything
    num_states = 2 # either holding or not
    num_patches = env.maxSide**2
    num_actions = 2*num_patches
    num_actions_discrete = 2
#    valueFunctionType = "TABULAR"
    valueFunctionType = "DQN"
#    actionSelectionStrategy = "UNIFORM_RANDOM" # actions are selected randomly from collection of all actions
    actionSelectionStrategy = "RANDOM_UNIQUE" # each unique action descriptor has equal chance of being selected
    
    # ******* Build tensorflow functions ********

    q_func = models.cnn_to_mlp(
#    q_func = models.cnn_to_mlp_2pathways(
        convs=[(16,3,1), (32,3,1)],
        hiddens=[48],
#        convs=[(32,3,1)],
#        hiddens=[32],
#        convs=[(48,3,1)],
#        hiddens=[48],
        dueling=True
    )

    def make_obs_ph(name):
        return U.BatchInput(env.observation_space.spaces[0].shape, name=name)

    def make_actionDeic_ph(name):
        return U.BatchInput(deicticActionShape, name=name)

    def make_target_ph(name):
#        return U.BatchInput([num_actions], name=name)
#        return U.BatchInput([num_cascade,num_states], name=name)
        return U.BatchInput([num_states], name=name)

    def make_weight_ph(name):
        return U.BatchInput([num_states], name=name)

    getMoveActionDescriptors = build_getMoveActionDescriptors(make_obs_ph=make_obs_ph,deicticShape=deicticShape)

    if valueFunctionType == 'DQN':
        getq = build_getq(
                make_actionDeic_ph=make_actionDeic_ph,
                q_func=q_func,
                num_states=num_states,
                num_cascade=num_cascade,
                scope="deepq",
                qscope="q_func"
                )
    
        targetTrain = build_targetTrain(
            make_actionDeic_ph=make_actionDeic_ph,
            make_target_ph=make_target_ph,
            make_weight_ph=make_weight_ph,
            q_func=q_func,
            num_states=num_states,
            num_cascade=num_cascade,
            optimizer=tf.train.AdamOptimizer(learning_rate=lr),
    #        optimizer=tf.train.GradientDescentOptimizer(learning_rate=lr),
            scope="deepq", 
            qscope="q_func",
            grad_norm_clipping=1.
    #        grad_norm_clipping=0.1
        )

    # Start tensorflow session
    sess = U.make_session(num_cpu)
    sess.__enter__()

    episode_rewards = [0.0]
    timerStart = time.time()
    U.initialize()
    obs = env.reset()
    for t in range(max_timesteps):

        # Get state: in range(0,env.num_blocks)
        stateDeictic = np.int32(obs[1]>0) # holding

        # Get action set: <num_patches> pick actions followed by <num_patches> place actions
        moveDescriptorsRaw = getMoveActionDescriptors([obs[0]])
        moveDescriptors = np.int32(moveDescriptorsRaw>0)
        moveDescriptors = moveDescriptors*2-1

        actionsPickDescriptors = np.stack([moveDescriptors, np.zeros(np.shape(moveDescriptors))],axis=3)
        actionsPlaceDescriptors = np.stack([np.zeros(np.shape(moveDescriptors)),moveDescriptors],axis=3)
        actionDescriptors = np.r_[actionsPickDescriptors,actionsPlaceDescriptors]

        if valueFunctionType == "TABULAR":
            actionDescriptorsFlat = np.reshape(actionDescriptors,[-1,deicticActionShape[0]*deicticActionShape[1]*deicticActionShape[2]]) == 1
            qCurr = getTabular(actionDescriptorsFlat)
        else:
            qCurr = getq(actionDescriptors)
        qCurrNoise = qCurr + np.random.random(np.shape(qCurr))*0.01 # add small amount of noise to break ties randomly

        # select action at random
        if actionSelectionStrategy == "UNIFORM_RANDOM":
            action = np.argmax(qCurrNoise[:,stateDeictic])
            if np.random.rand() < exploration.value(t):
                action = np.random.randint(num_actions)
        elif actionSelectionStrategy == "RANDOM_UNIQUE":
            _,idx,inv = np.unique(actionDescriptors,axis=0,return_index=True,return_inverse=True)
            actionIdx = np.argmax(qCurrNoise[idx,stateDeictic])
            if np.random.rand() < exploration.value(t):
                actionIdx = np.random.randint(len(idx))
            actionsSelected = np.nonzero(inv==actionIdx)[0]
            action = actionsSelected[np.random.randint(len(actionsSelected))]
        else:
            print("Error...")

        # display state at the end
        if t > max_timesteps-200:
            print(str(obs[0][:,:,0]))
            print(str(obs[1]))
            print("action: " + str(action))

        # take action
        new_obs, rew, done, _ = env.step(action)
        
        # display state at the end
        if (t > max_timesteps-200) and done:
            print("done *********************** done")
            
        replay_buffer.add(stateDeictic, actionDescriptors[action,:], rew, new_obs, float(done))

        if t > learning_starts and t % train_freq == 0:

            # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
            if prioritized_replay:
                beta=beta_schedule.value(t)
                states_t, actions, rewards, images_tp1, states_tp1, dones, weights, batch_idxes = replay_buffer.sample(batch_size, beta)
            else:
                states_t, actions, rewards, images_tp1, states_tp1, dones = replay_buffer.sample(batch_size)
                weights, batch_idxes = np.ones_like(rewards), None

            states_tp1 = np.int32(states_tp1>0)
            
            moveDescriptorsNext1 = getMoveActionDescriptors(images_tp1)
            moveDescriptorsNext1 = np.int32(moveDescriptorsNext1>0)
            moveDescriptorsNext1 = moveDescriptorsNext1*2-1

            actionsPickDescriptorsNext1 = np.stack([moveDescriptorsNext1, np.zeros(np.shape(moveDescriptorsNext1))],axis=3)
            actionsPlaceDescriptorsNext1 = np.stack([np.zeros(np.shape(moveDescriptorsNext1)), moveDescriptorsNext1],axis=3)
            actionDescriptorsNext1 = np.stack([actionsPickDescriptorsNext1, actionsPlaceDescriptorsNext1], axis=0)
            actionDescriptorsNext1 = np.reshape(actionDescriptorsNext1,[batch_size*num_patches*num_actions_discrete,deicticActionShape[0],deicticActionShape[1],deicticActionShape[2]])
            
            if valueFunctionType == "TABULAR":
                actionDescriptorsNextFlat1 = np.reshape(actionDescriptorsNext1,[batch_size*num_patches*num_actions_discrete,-1]) == 1
                qNextFlat1 = getTabular(actionDescriptorsNextFlat1)
            else:
                qNextFlat1 = getq(actionDescriptorsNext1)
            
            qNext1 = np.reshape(qNextFlat1,[batch_size,num_patches,num_actions_discrete,num_states])
            qNextmax1 = np.max(np.max(qNext1[range(batch_size),:,:,states_tp1],2),1)
            targets1 = rewards + (1-dones) * gamma * qNextmax1

            if valueFunctionType == "TABULAR":
                actionsFlat = np.reshape(actions,[batch_size,-1]) == 1
                qCurrTarget1 = getTabular(actionsFlat)
            else:
                qCurrTarget1 = getq(actions)

            td_errors = qCurrTarget1[range(batch_size),states_t] - targets1
            qCurrTarget1[range(batch_size),states_t] = targets1

            if valueFunctionType == "TABULAR":
                trainTabular(actionsFlat, qCurrTarget1, np.transpose(np.tile(weights,[num_states,1]))) # (TABULAR)
            else:
                targetTrain(actions, qCurrTarget1, np.transpose(np.tile(weights,[num_states,1]))) # (DQN)

            if prioritized_replay:
                new_priorities = np.abs(td_errors) + prioritized_replay_eps
                replay_buffer.update_priorities(batch_idxes, new_priorities)


        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
            timerFinal = time.time()
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", beta: " + str(beta) + ", time elapsed: " + str(timerFinal - timerStart))
            timerStart = timerFinal
        
        obs = new_obs
        
    # display value function
    obs = env.reset()
    moveDescriptorsRaw = getMoveActionDescriptors([obs[0]])
    moveDescriptors = np.int32(moveDescriptorsRaw>0)
    moveDescriptors = moveDescriptors*2-1

    actionsPickDescriptors = np.stack([moveDescriptors, np.zeros(np.shape(moveDescriptors))],axis=3)
    actionsPlaceDescriptors = np.stack([np.zeros(np.shape(moveDescriptors)), moveDescriptors],axis=3)
    
    print(str(obs[0][:,:,0]))
    
    qPick = getq(actionsPickDescriptors)
#    qPick = getTabular(np.reshape(actionsPickDescriptors,[num_patches,-1])==1)
    print("Value function for pick action in hold-nothing state:")
    print(str(np.reshape(qPick[:,0],[8,8])))
    print("Value function for pick action in hold-1 state:")
    print(str(np.reshape(qPick[:,1],[8,8])))

    qPlace = getq(actionsPlaceDescriptors)
#    qPlace = getTabular(np.reshape(actionsPlaceDescriptors,[num_patches,-1])==1)
    print("Value function for place action in hold-nothing state:")
    print(str(np.reshape(qPlace[:,0],[8,8])))
    print("Value function for place action in hold-1 state:")
    print(str(np.reshape(qPlace[:,1],[8,8])))
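getTabularKeys above turns each binary action descriptor into a single dictionary key by packing its bits into bytes (np.packbits) and reading those bytes as base-256 digits, which is why the comment warns that the descriptor must fit in 64 bits. A small self-contained demonstration of the same encoding:

import numpy as np

def get_tabular_keys(vector_key):
    """vector_key: (batch, n_bits) boolean array with n_bits <= 64."""
    obs_bits = np.packbits(vector_key, axis=1)          # (batch, ceil(n_bits / 8)) uint8
    keys = np.zeros(vector_key.shape[0], dtype=np.uint64)
    for i in range(obs_bits.shape[1]):
        # Treat byte i as the i-th base-256 digit of the key.
        keys = keys + np.uint64(256 ** i) * obs_bits[:, i].astype(np.uint64)
    return keys

descriptors = np.array([[1, 0, 1, 0, 0, 0, 0, 0],       # packs (MSB first) to 0b10100000 = 160
                        [0, 0, 0, 0, 0, 0, 0, 1]], dtype=bool)
print(get_tabular_keys(descriptors))                    # -> [160   1]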
Example #7
def main():

    env = envstandalone.BallCatch()

    max_timesteps = 20000
    learning_starts = 1000
    buffer_size = 50000
    #    buffer_size=1000
    exploration_fraction = 0.2
    exploration_final_eps = 0.02
    print_freq = 10
    gamma = .98
    target_network_update_freq = 500
    learning_alpha = 0.2

    batch_size = 32
    train_freq = 4

    deicticShape = (3, 3, 4)
    num_deictic_patches = 36

    num_actions = 3
    episode_rewards = [0.0]
    num_cpu = 16
    num_cascade = 5

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # Extract deictic patches for an input obs. Each deictic patch has a low level
    # and a foveated view.
    # input: n x n x 1
    # output: dn x dn x 4
    def getDeicticObs(obs):
        windowLen = deicticShape[0]
        obsShape = np.shape(obs)
        obsPadded = np.zeros(
            (obsShape[0] + 2 * windowLen, obsShape[1] + 2 * windowLen))
        obsPadded[windowLen:windowLen + obsShape[0],
                  windowLen:windowLen + obsShape[1]] = obs[:, :, 0]
        deicticObsThis = np.zeros(
            (windowLen, windowLen, 4)
        )  # channel0: agent zoom-in; channel1: ball zoom-in; channel2: agent zoom-out; channel3: ball zoom-out
        deicticObs = []
        for i in range(obsShape[0] - windowLen + 1):
            for j in range(obsShape[1] - windowLen + 1):
                deicticObsThis[:, :, 0] = obs[i:i + windowLen, j:j + windowLen,
                                              0] == 1  # agent zoomin
                deicticObsThis[:, :, 1] = obs[i:i + windowLen, j:j + windowLen,
                                              0] == 2  # ball zoomin
                patch = obsPadded[i:i + 3 * windowLen, j:j + 3 * windowLen]
                for k in range(1, 3):
                    # THE VERSION BELOW USES A FIXED VIEW
                    #                    deicticObsThis[:,:,k+1] = [[(k in obs[0:3,0:3,0]), (k in obs[0:3,3:5]), (k in obs[0:3,5:8,0])],
                    #                                 [(k in obs[3:5,0:3,0]), (k in obs[3:5,3:5,0]), (k in obs[3:5,5:8,0])],
                    #                                 [(k in obs[5:8,0:3,0]), (k in obs[5:8,3:5,0]), (k in obs[5:8,5:8,0])]]
                    # THE VERSION BELOW USES A WIDE VIEW W/ 2 UNITS IN EACH CELL
                    #                    deicticObsThis[:,:,k+1] = [[(k in patch[1:3,1:3]), (k in patch[1:3,3:5]), (k in patch[1:3,5:7])],
                    #                                 [(k in patch[3:5,1:3]), (k in patch[3:5,3:5]), (k in patch[3:5,5:7])],
                    #                                 [(k in patch[5:7,1:3]), (k in patch[5:7,3:5]), (k in patch[5:7,5:7])]]
                    # THE VERSION BELOW USES A WIDE VIEW W/ 3 UNITS IN EACH CELL
                    deicticObsThis[:, :, k + 1] = [[(k in patch[0:3, 0:3]),
                                                    (k in patch[0:3, 3:6]),
                                                    (k in patch[0:3, 6:9])],
                                                   [(k in patch[3:6, 0:3]),
                                                    (k in patch[3:6, 3:6]),
                                                    (k in patch[3:6, 6:9])],
                                                   [(k in patch[6:9, 0:3]),
                                                    (k in patch[6:9, 3:6]),
                                                    (k in patch[6:9, 6:9])]]
                deicticObs.append(
                    deicticObsThis.copy()
                )  # CAREFUL WITH APPENDING REFERENCES VS APPENDING COPIES!!! THIS WAS A BUG BEFORE I CORRECTED IT...

        return np.array(deicticObs)

    # input: num_patches x 3 x 3 x 4 array of deictic observations
    # output: 4 x num_patches matrix of numeric state encodings
    def convertState(observations):

        # Reshape to batch x flatimage x channel.
        # Channel1 = zoomin agent, channel2 = zoomin ball
        # Channel3 = zoomout agent, channel4 = zoomout ball
        obs = np.zeros((36, 9, 4))
        for i in range(4):
            obs[:, :, i] = np.reshape(observations[:, :, :, i], [36, 9])

        # state_numeric: 4 x batch.
        # row0: pos of agent in zoomin, row1: pos of ball in zoomin
        # row2: pos of agent in zoomout, row3: pos of ball in zoomout
        shape = np.shape(obs)
        state_numeric = 9 * np.ones(
            (4, shape[0])
        )  # 9 indicates agent/ball does not appear at this zoom in this glance
        pos = np.nonzero(obs == 1)
        for i in range(4):
            idx = np.nonzero(pos[2] == i)[0]
            state_numeric[i, pos[0][idx]] = pos[1][idx]
#            state_numeric[i,pos[0][pos[2] == i]] = pos[1][pos[2] == i]

        return np.int32(state_numeric)

    def convertStateBatch(observations):
        shape = np.shape(observations)
        state_numeric_batch = []
        for batch in range(shape[0]):
            state_numeric_batch.append(convertState(observations[batch]))
        return (np.array(state_numeric_batch))

    # Same as getDeicticObs, but it operates on a batch rather than a single obs
    # input: obs -> batches x glances x 3 x 3 x 4
    def getDeicticObsBatch(obs):
        obsShape = np.shape(obs)
        deicticObsBatch = []
        for batch in range(obsShape[0]):
            deicticObsBatch.append(getDeicticObs(obs[batch]))
        return (np.array(deicticObsBatch))

    # conv model parameters: (num_outputs, kernel_size, stride)
    model = models.cnn_to_mlp(
        #        convs=[(16,3,1)],
        convs=[(16, 2, 1)],
        #        convs=[(32,3,1)],
        hiddens=[16],
        #        hiddens=[64],
        #        dueling=True
        dueling=False)

    q_func = model
    lr = 1e-3

    def make_obs_ph(name):
        return U.BatchInput(deicticShape, name=name)

    def make_target_ph(name):
        return U.BatchInput([num_cascade, num_actions], name=name)

    sess = U.make_session(num_cpu)
    sess.__enter__()

    getq, targetTrain = build_graph.build_train_cascaded(
        make_obs_ph=make_obs_ph,
        make_target_ph=make_target_ph,
        q_func=q_func,
        num_cascade=num_cascade,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        grad_norm_clipping=10,
        double_q=False)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    #    update_target()

    dimSize = deicticShape[0] * deicticShape[1] + 1
    tabularQ = 1 * np.ones(
        (dimSize, dimSize, dimSize, dimSize, num_cascade, num_actions))

    replay_buffer = ReplayBuffer(buffer_size)
    obs = env.reset()

    timerStart = time.time()
    for t in range(max_timesteps):

        # get current q-values
        obsDeictic = getDeicticObs(obs)

        #        # Get current q-values: tabular version
        #        stateCurr = convertState(obsDeictic)
        #        qCurr = tabularQ[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],-1,:]

        # Get current q-values: neural network version
        qCurr = getq(np.array(obsDeictic))[:, -1, :]

        # select action
        qCurrNoise = qCurr + np.random.random(
            np.shape(qCurr)) * 0.01  # add small amount of noise to break ties randomly
        action = np.argmax(np.max(qCurrNoise, 0))
        selPatch = np.argmax(np.max(qCurrNoise, 1))
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(env.action_space.n)

        # take action
        new_obs, rew, done, _ = env.step(action)
        replay_buffer.add(obs, action, rew, new_obs, float(done))

        #        # debug
        #        if t > 5000:
        #            print("obs:\n" + str(np.squeeze(obs)))
        #            print("qCurr:\n" + str(qCurr))
        #            print("action: " + str(action) + ", patch: " + str(selPatch))
        #            print("close:\n" + str(obsDeictic[selPatch,:,:,0] + obsDeictic[selPatch,:,:,1]))
        #            print("far:\n" + str(obsDeictic[selPatch,:,:,2] + obsDeictic[selPatch,:,:,3]))
        #            action

        # sample from replay buffer and train
        if t > learning_starts and t % train_freq == 0:

            obs_resize_to_network = [
                batch_size * num_deictic_patches, deicticShape[0],
                deicticShape[1], deicticShape[2]
            ]
            q_resize_from_network = [
                batch_size, num_deictic_patches, num_cascade, num_actions
            ]
            obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
                batch_size)
            obses_t_deic = getDeicticObsBatch(obses_t)
            obses_tp1_deic = getDeicticObsBatch(obses_tp1)

            #            # Get curr, next values: tabular version
            #            stateNext = convertStateBatch(obses_tp1_deic)
            #            qNext = tabularQ[stateNext[:,0,:], stateNext[:,1,:], stateNext[:,2,:], stateNext[:,3,:],-1,:]
            #            stateCurr = convertStateBatch(obses_t_deic)
            #            qCurr = tabularQ[stateCurr[:,0,:], stateCurr[:,1,:], stateCurr[:,2,:], stateCurr[:,3,:],:,:]

            # Get curr, next values: neural network version
            qNext = np.reshape(
                getq(np.reshape(obses_tp1_deic, obs_resize_to_network)),
                q_resize_from_network)[:, :, -1, :]
            qCurr = np.reshape(
                getq(np.reshape(obses_t_deic, obs_resize_to_network)),
                q_resize_from_network)

            # Get "raw" targets (no masking for cascade levels)
            qNextmax = np.max(np.max(qNext, 2), 1)
            targetsRaw = rewards + (1 - dones) * gamma * qNextmax
            targetsTiled = np.tile(np.reshape(targetsRaw, [batch_size, 1, 1]),
                                   [1, num_deictic_patches, num_cascade])

            # Get qCurrActionSelect
            actionsTiled = np.tile(np.reshape(actions, [batch_size, 1, 1]),
                                   [1, num_deictic_patches, num_cascade])
            qCurrActionSelect = np.zeros(
                (batch_size, num_deictic_patches, num_cascade))
            for i in range(num_actions):
                qCurrActionSelect += (actionsTiled == i) * qCurr[:, :, :, i]

            # Get targets masked for cascade level
            targetMask = targetsTiled < qCurrActionSelect
            targets = np.zeros((batch_size, num_deictic_patches, num_cascade))
            targets[:, :, 0] = targetsTiled[:, :, 0]
            targets[:, :, 1] = targetMask[:, :, 0] * targetsTiled[:, :, 0] + (
                1 - targetMask[:, :, 0]) * qCurrActionSelect[:, :, 1]
            targets[:, :, 2] = targetMask[:, :, 1] * targetsTiled[:, :, 0] + (
                1 - targetMask[:, :, 1]) * qCurrActionSelect[:, :, 2]
            targets[:, :, 3] = targetMask[:, :, 2] * targetsTiled[:, :, 0] + (
                1 - targetMask[:, :, 2]) * qCurrActionSelect[:, :, 3]
            targets[:, :, 4] = targetMask[:, :, 3] * targetsTiled[:, :, 0] + (
                1 - targetMask[:, :, 3]) * qCurrActionSelect[:, :, 4]

            qCurrTargets = np.zeros(np.shape(qCurr))
            for i in range(num_actions):
                myActions = actionsTiled == i
                qCurrTargets[:, :, :, i] = myActions * targets + (
                    1 - myActions) * qCurr[:, :, :, i]


#            # Update values: tabular version
#            tabularQ[stateCurr[:,0,:], stateCurr[:,1,:], stateCurr[:,2,:], stateCurr[:,3,:],:,actionsTiled[:,:,0]] = \
#                (1 - learning_alpha) * tabularQ[stateCurr[:,0,:], stateCurr[:,1,:], stateCurr[:,2,:], stateCurr[:,3,:],:,actionsTiled[:,:,0]] \
#                + learning_alpha * targets

# Update values: neural network version
            targets_resize_to_network = [
                batch_size * num_deictic_patches, num_cascade, num_actions
            ]
            td_error_out, obses_out, targets_out = targetTrain(
                np.reshape(obses_t_deic, obs_resize_to_network),
                np.reshape(qCurrTargets, targets_resize_to_network))

            td_error_pre = qCurrActionSelect - targets
            #            print("td error pre-update: " + str(np.linalg.norm(td_error_pre)))

            #            # tabular version
            #            qCurr = tabularQ[stateCurr[:,0,:], stateCurr[:,1,:], stateCurr[:,2,:], stateCurr[:,3,:],:,:]

            # neural network version
            qCurr = np.reshape(
                getq(np.reshape(obses_t_deic, obs_resize_to_network)),
                q_resize_from_network)

            qCurrActionSelect_post = np.zeros(
                (batch_size, num_deictic_patches, num_cascade))
            for i in range(num_actions):
                qCurrActionSelect_post += (actionsTiled == i) * qCurr[:, :, :,
                                                                      i]

            td_error_post = qCurrActionSelect_post - targets
            #            print("td error post-update: " + str(np.linalg.norm(td_error_post)))

            if -1 in rewards:
                dones

        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(
                episode_rewards) % print_freq == 0:
            #            print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", max q at curr state: " + str(np.max(qCurr)))
            timerFinal = time.time()
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " +
                  str(int(100 * exploration.value(t))) + ", time elapsed: " +
                  str(timerFinal - timerStart))
            timerStart = timerFinal

        obs = new_obs
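The target construction above applies the same masking rule level by level with explicit indexing: cascade level 0 always regresses on the raw one-step target, and level k adopts that target only where it is smaller than level k-1's current Q estimate. The same rule written as a loop, as a small numpy sketch (names are illustrative):

import numpy as np

def cascade_targets(targets_tiled, q_selected, num_cascade=5):
    """targets_tiled, q_selected: (batch, num_patches, num_cascade) arrays.
    Level 0 takes the raw target; level k keeps its own Q value unless the
    raw target is smaller than level k-1's estimate."""
    targets = np.zeros_like(q_selected)
    targets[:, :, 0] = targets_tiled[:, :, 0]
    for k in range(1, num_cascade):
        mask = targets_tiled[:, :, k - 1] < q_selected[:, :, k - 1]
        targets[:, :, k] = np.where(mask, targets_tiled[:, :, 0], q_selected[:, :, k])
    return targets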
Example #8
def main(initEnvStride, envStride, fileIn, fileOut, inputmaxtimesteps,
         vispolicy):

    np.set_printoptions(formatter={'float_kind': lambda x: "%.2f" % x})

    # Create environment and set stride parameters for this problem instance.
    # Most of the time, these two stride parameters will be equal. However,
    # one might use a smaller stride for initial placement and a larger stride
    # for action specification in order to speed things up. Unfortunately, this
    # could cause the problem to be infeasible: no grasp might work for a given
    # initial setup.
    env = envstandalone.PuckArrange()
    env.initStride = initEnvStride  # stride for initial puck placement
    env.stride = envStride  # stride for action specification

    # Standard q-learning parameters
    reuseModels = None
    max_timesteps = inputmaxtimesteps
    exploration_fraction = 0.5
    exploration_final_eps = 0.1
    gamma = .90
    num_cpu = 16

    # Used by buffering and DQN
    learning_starts = 60
    buffer_size = 1000
    #    batch_size=32
    batch_size = 10
    target_network_update_freq = 1
    train_freq = 1
    print_freq = 1
    lr = 0.0003

    #    useHierarchy = False
    useHierarchy = True

    # Set parameters related to shape of the patch and the number of patches
    descriptorShape = (env.blockSize * 3, env.blockSize * 3, 2)
    #    descriptorShapeSmall = (10,10,2)
    #    descriptorShapeSmall = (15,15,2)
    descriptorShapeSmall = (20, 20, 2)
    num_states = 2  # either holding or not
    num_patches = len(env.moveCenters)**2
    num_actions = 2 * num_patches * env.num_orientations

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # Set parameters for prioritized replay. You can turn this off by
    # setting prioritized_replay below to False.
    #    prioritized_replay=True
    prioritized_replay = False
    prioritized_replay_alpha = 0.6
    prioritized_replay_beta0 = 0.4
    prioritized_replay_beta_iters = None
    prioritized_replay_eps = 1e-6
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size,
                                                alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None
    beta = 1

    # Create neural network
    q_func = models.cnn_to_mlp(convs=[(16, 3, 1)], hiddens=[32], dueling=True)

    # Build tensorflow functions
    def make_obs_ph(name):
        return U.BatchInput(env.observation_space.spaces[0].shape, name=name)

    def make_actionDeic_ph(name):
        return U.BatchInput(descriptorShapeSmall, name=name)

    def make_target_ph(name):
        return U.BatchInput([1], name=name)

    def make_weight_ph(name):
        return U.BatchInput([1], name=name)

    getMoveActionDescriptorsNoRot = build_getMoveActionDescriptors(
        make_obs_ph=make_obs_ph,
        actionShape=descriptorShape,
        actionShapeSmall=descriptorShapeSmall,
        stride=env.stride)
    getMoveActionDescriptorsRot = build_getMoveActionDescriptorsRot(
        make_obs_ph=make_obs_ph,
        actionShape=descriptorShape,
        actionShapeSmall=descriptorShapeSmall,
        stride=env.stride)

    getqNotHoldingRot = build_getq(make_actionDeic_ph=make_actionDeic_ph,
                                   q_func=q_func,
                                   num_states=num_states,
                                   num_cascade=5,
                                   scope="deepq",
                                   qscope="q_func_notholding_rot",
                                   reuse=reuseModels)
    getqHoldingRot = build_getq(make_actionDeic_ph=make_actionDeic_ph,
                                q_func=q_func,
                                num_states=num_states,
                                num_cascade=5,
                                scope="deepq",
                                qscope="q_func_holding_rot",
                                reuse=reuseModels)

    targetTrainNotHoldingRot = build_targetTrain(
        make_actionDeic_ph=make_actionDeic_ph,
        make_target_ph=make_target_ph,
        make_weight_ph=make_weight_ph,
        q_func=q_func,
        num_states=num_states,
        num_cascade=5,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        scope="deepq",
        qscope="q_func_notholding_rot",
        grad_norm_clipping=1.,
        reuse=reuseModels)

    targetTrainHoldingRot = build_targetTrain(
        make_actionDeic_ph=make_actionDeic_ph,
        make_target_ph=make_target_ph,
        make_weight_ph=make_weight_ph,
        q_func=q_func,
        num_states=num_states,
        num_cascade=5,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        scope="deepq",
        qscope="q_func_holding_rot",
        grad_norm_clipping=1.,
        reuse=reuseModels)

    getqNotHoldingNoRot = build_getq(make_actionDeic_ph=make_actionDeic_ph,
                                     q_func=q_func,
                                     num_states=num_states,
                                     num_cascade=5,
                                     scope="deepq",
                                     qscope="q_func_notholding_norot",
                                     reuse=reuseModels)
    getqHoldingNoRot = build_getq(make_actionDeic_ph=make_actionDeic_ph,
                                  q_func=q_func,
                                  num_states=num_states,
                                  num_cascade=5,
                                  scope="deepq",
                                  qscope="q_func_holding_norot",
                                  reuse=reuseModels)

    targetTrainNotHoldingNoRot = build_targetTrain(
        make_actionDeic_ph=make_actionDeic_ph,
        make_target_ph=make_target_ph,
        make_weight_ph=make_weight_ph,
        q_func=q_func,
        num_states=num_states,
        num_cascade=5,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        scope="deepq",
        qscope="q_func_notholding_norot",
        grad_norm_clipping=1.,
        reuse=reuseModels)

    targetTrainHoldingNoRot = build_targetTrain(
        make_actionDeic_ph=make_actionDeic_ph,
        make_target_ph=make_target_ph,
        make_weight_ph=make_weight_ph,
        q_func=q_func,
        num_states=num_states,
        num_cascade=5,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        scope="deepq",
        qscope="q_func_holding_norot",
        grad_norm_clipping=1.,
        reuse=reuseModels)

    # Initialize tabular state-value function. There are only two states (holding, not holding), so this is very easy.
    lrState = 0.1
    V = np.zeros([2])

    # Start tensorflow session
    sess = U.make_session(num_cpu)
    sess.__enter__()

    # Initialize things
    obs = env.reset()
    episode_rewards = [0.0]
    timerStart = time.time()
    U.initialize()

    # Load neural network model if one was specified.
    if fileIn != "None":
        saver = tf.train.Saver()
        saver.restore(sess, fileIn)
        fileInV = fileIn + 'V.npy'
        V = np.load(fileInV)

    # Iterate over time steps
    for t in range(max_timesteps):

        # Use hierarchy to get candidate actions
        if useHierarchy:

            # Get NoRot descriptors
            moveDescriptorsNoRot = getMoveActionDescriptorsNoRot([obs[0]])
            moveDescriptorsNoRot = moveDescriptorsNoRot * 2 - 1
            actionsPickDescriptorsNoRot = np.stack([
                moveDescriptorsNoRot,
                np.zeros(np.shape(moveDescriptorsNoRot))
            ],
                                                   axis=3)
            actionsPlaceDescriptorsNoRot = np.stack([
                np.zeros(np.shape(moveDescriptorsNoRot)), moveDescriptorsNoRot
            ],
                                                    axis=3)
            actionDescriptorsNoRot = np.r_[actionsPickDescriptorsNoRot,
                                           actionsPlaceDescriptorsNoRot]

            # Get NoRot values
            if obs[1] == 0:
                qCurrPick = getqNotHoldingNoRot(actionsPickDescriptorsNoRot)
                qCurrPlace = getqNotHoldingNoRot(actionsPlaceDescriptorsNoRot)
            elif obs[1] == 1:
                qCurrPick = getqHoldingNoRot(actionsPickDescriptorsNoRot)
                qCurrPlace = getqHoldingNoRot(actionsPlaceDescriptorsNoRot)
            else:
                print("error: state out of bounds")
            qCurrNoRot = np.squeeze(np.r_[qCurrPick, qCurrPlace])

            # Get Rot actions corresponding to top k% NoRot actions
            k = 0.2  # top k% of NoRot actions
            valsNoRot = qCurrNoRot
            topKactionsNoRot = np.argsort(
                valsNoRot)[-np.int32(np.shape(valsNoRot)[0] * k):]
            topKpositionsNoRot = topKactionsNoRot % env.num_moves
            topKpickplaceNoRot = topKactionsNoRot // env.num_moves
            actionsCandidates = []
            for ii in range(2):
                eltsPos = topKpositionsNoRot[topKpickplaceNoRot == ii]
                for jj in range(env.num_orientations):
                    actionsCandidates = np.r_[
                        actionsCandidates, eltsPos + jj * env.num_moves + ii *
                        (env.num_moves * env.num_orientations)]
            actionsCandidates = np.int32(actionsCandidates)
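            # actionsCandidates now holds every orientation of each top-k position, for both pick and place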

        # No hierarchy
        else:
            actionsCandidates = range(2 * env.num_moves * env.num_orientations)

        # Get Rot descriptors
        moveDescriptorsRot = getMoveActionDescriptorsRot([obs[0]])
        moveDescriptorsRot = moveDescriptorsRot * 2 - 1
        actionsPickDescriptorsRot = np.stack(
            [moveDescriptorsRot,
             np.zeros(np.shape(moveDescriptorsRot))],
            axis=3)
        actionsPlaceDescriptorsRot = np.stack(
            [np.zeros(np.shape(moveDescriptorsRot)), moveDescriptorsRot],
            axis=3)
        actionDescriptorsRot = np.r_[actionsPickDescriptorsRot,
                                     actionsPlaceDescriptorsRot]

        # Get qCurr using actionCandidates
        actionDescriptorsRotReduced = actionDescriptorsRot[actionsCandidates]
        if obs[1] == 0:
            qCurrReduced = np.squeeze(
                getqNotHoldingRot(actionDescriptorsRotReduced))
        elif obs[1] == 1:
            qCurrReduced = np.squeeze(
                getqHoldingRot(actionDescriptorsRotReduced))
        else:
            print("error: state out of bounds")
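        # non-candidate actions keep a large negative value, so the softmax below effectively never selects them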
        qCurr = -100 * np.ones(np.shape(actionDescriptorsRot)[0])
        qCurr[actionsCandidates] = np.copy(qCurrReduced)

        #        # Get qCurr. I split up pick and place in order to accommodate larger batches
        #        if obs[1] == 0:
        #            qCurrPick = getqNotHoldingRot(actionsPickDescriptorsRot)
        #            qCurrPlace = getqNotHoldingRot(actionsPlaceDescriptorsRot)
        #        elif obs[1] == 1:
        #            qCurrPick = getqHoldingRot(actionsPickDescriptorsRot)
        #            qCurrPlace = getqHoldingRot(actionsPlaceDescriptorsRot)
        #        else:
        #            print("error: state out of bounds")
        #        qCurr = np.squeeze(np.r_[qCurrPick,qCurrPlace])

        # Update tabular state-value function using V(s) = max_a Q(s,a)
        thisStateValues = np.max(qCurr)
        V[obs[1]] = (1 - lrState) * V[obs[1]] + lrState * thisStateValues

        #        # Select e-greedy action to execute
        #        qCurrNoise = qCurr + np.random.random(np.shape(qCurr))*0.01 # add small amount of noise to break ties randomly
        #        action = np.argmax(qCurrNoise)
        #        if (np.random.rand() < exploration.value(t)) and not vispolicy:
        #            action = np.random.randint(num_actions)

        # e-greedy + softmax
        #        qCurrExp = np.exp(qCurr/0.3)
        qCurrExp = np.exp(qCurr / 0.2)
        #        qCurrExp = np.exp(qCurr/0.1)
        probs = qCurrExp / np.sum(qCurrExp)
        action = np.random.choice(range(np.size(probs)), p=probs)
        if (np.random.rand() < exploration.value(t)) and not vispolicy:
            action = np.random.randint(num_actions)

        position = action % env.num_moves
        pickplace = action // (env.num_moves * env.num_orientations)
        #        orientation = action / env.num_moves
        orientation = (action - pickplace * env.num_moves *
                       env.num_orientations) // env.num_moves
        actionNoRot = position + pickplace * env.num_moves
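        # actionNoRot indexes the same pick/place position in the coarse (no-rotation) action space; it is used when storing the NoRot descriptor in the replay buffer below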

        if vispolicy:
            print("action: " + str(action))
            print("position: " + str(position))
            print("pickplace: " + str(pickplace))
            print("orientation: " + str(orientation))
            vposition = env.moveCenters[position // len(env.moveCenters)]
            hposition = env.moveCenters[position % len(env.moveCenters)]
            plt.subplot(1, 2, 1)
            im = env.state[0][:, :, 0]
            im[vposition, hposition] = 0.5
            plt.imshow(env.state[0][:, :, 0])
#            plt.show()

        # Execute action
        new_obs, rew, done, _ = env.step(action)

        if useHierarchy:
            # store both NoRot and Rot descriptors
            replay_buffer.add(cp.copy(obs[1]),
                              np.copy(actionDescriptorsNoRot[actionNoRot, :]),
                              np.copy(actionDescriptorsRot[action, :]),
                              cp.copy(rew), cp.copy(new_obs[1]),
                              cp.copy(float(done)))
        else:
            # store only Rot descriptor
            replay_buffer.add(cp.copy(obs[1]),
                              np.copy(actionDescriptorsRot[action, :]),
                              np.copy(actionDescriptorsRot[action, :]),
                              cp.copy(rew), cp.copy(new_obs[1]),
                              cp.copy(float(done)))

        if vispolicy:
            print("rew: " + str(rew))
            print("done: " + str(done))
            plt.subplot(1, 2, 2)
            plt.imshow(env.state[0][:, :, 0])
            plt.show()

        if t > learning_starts and t % train_freq == 0:

            # Get batch
            if prioritized_replay:
                beta = beta_schedule.value(t)
                states_t, actionPatchesNoRot, actionPatchesRot, rewards, states_tp1, dones, weights, batch_idxes = replay_buffer.sample(
                    batch_size, beta)
            else:
                states_t, actionPatchesNoRot, actionPatchesRot, rewards, states_tp1, dones = replay_buffer.sample(
                    batch_size)
                weights, batch_idxes = np.ones_like(rewards), None

            # Calculate target
            targets = rewards + (1 - dones) * gamma * V[states_tp1]

            # Get current q-values and calculate td error and q-value targets
            qCurrTargetNotHolding = getqNotHoldingRot(actionPatchesRot)
            qCurrTargetHolding = getqHoldingRot(actionPatchesRot)
            qCurrTarget = np.concatenate(
                [qCurrTargetNotHolding, qCurrTargetHolding], axis=1)
            td_error = qCurrTarget[range(batch_size), states_t] - targets
            qCurrTarget[range(batch_size), states_t] = targets

            # Train
            targetTrainNotHoldingRot(
                actionPatchesRot, np.reshape(qCurrTarget[:, 0],
                                             [batch_size, 1]),
                np.reshape(weights, [batch_size, 1]))
            targetTrainHoldingRot(
                actionPatchesRot, np.reshape(qCurrTarget[:, 1],
                                             [batch_size, 1]),
                np.reshape(weights, [batch_size, 1]))

            # Only train NoRot if we're doing the hierarchy
            if useHierarchy:

                #                qCurrTargetNotHoldingNoRot = getqNotHoldingNoRot(actionPatchesNoRot)
                #                qCurrTargetHoldingNoRot = getqHoldingNoRot(actionPatchesNoRot)
                #                qCurrTargetNoRot = np.concatenate([qCurrTargetNotHoldingNoRot,qCurrTargetHoldingNoRot],axis=1)
                #                idx = np.nonzero(np.int32(qCurrTargetNoRot[range(batch_size),states_t] > targets))
                #                targets[idx] = qCurrTargetNoRot[idx,states_t[idx]]

                targetTrainNotHoldingNoRot(
                    actionPatchesNoRot,
                    np.reshape(qCurrTarget[:, 0], [batch_size, 1]),
                    np.reshape(weights, [batch_size, 1]))
                targetTrainHoldingNoRot(
                    actionPatchesNoRot,
                    np.reshape(qCurrTarget[:, 1], [batch_size, 1]),
                    np.reshape(weights, [batch_size, 1]))

            # Update replay priorities using td_error
            if prioritized_replay:
                new_priorities = np.abs(td_error) + prioritized_replay_eps
                replay_buffer.update_priorities(batch_idxes, new_priorities)

        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-51:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(
                episode_rewards) % print_freq == 0:
            timerFinal = time.time()
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " +
                  str(int(100 * exploration.value(t))) + ", time elapsed: " +
                  str(timerFinal - timerStart))
            #            print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % exploration factor: " + str(int(100*explorationGaussianFactor.value(t))) + ", time elapsed: " + str(timerFinal - timerStart))

            timerStart = timerFinal

        obs = np.copy(new_obs)

    # save what we learned
    if fileOut != "None":
        saver = tf.train.Saver()
        saver.save(sess, fileOut)
        fileOutV = fileOut + 'V'
        print("fileOutV: " + fileOutV)
        np.save(fileOutV, V)

    # display value function
    obs = env.reset()

    moveDescriptorsNoRot = getMoveActionDescriptorsNoRot([obs[0]])
    moveDescriptorsNoRot = moveDescriptorsNoRot * 2 - 1
    actionsPickDescriptors = np.stack(
        [moveDescriptorsNoRot,
         np.zeros(np.shape(moveDescriptorsNoRot))],
        axis=3)
    actionsPlaceDescriptors = np.stack(
        [np.zeros(np.shape(moveDescriptorsNoRot)), moveDescriptorsNoRot],
        axis=3)
    qPickNotHoldingNoRot = getqNotHoldingNoRot(actionsPickDescriptors)
    qPickHoldingNoRot = getqHoldingNoRot(actionsPickDescriptors)
    qPickNoRot = np.concatenate([qPickNotHoldingNoRot, qPickHoldingNoRot],
                                axis=1)
    qPlaceNotHoldingNoRot = getqNotHoldingNoRot(actionsPlaceDescriptors)
    qPlaceHoldingNoRot = getqHoldingNoRot(actionsPlaceDescriptors)
    qPlaceNoRot = np.concatenate([qPlaceNotHoldingNoRot, qPlaceHoldingNoRot],
                                 axis=1)

    moveDescriptors = getMoveActionDescriptorsRot([obs[0]])
    moveDescriptors = moveDescriptors * 2 - 1
    actionsPickDescriptors = np.stack(
        [moveDescriptors, np.zeros(np.shape(moveDescriptors))], axis=3)
    actionsPlaceDescriptors = np.stack(
        [np.zeros(np.shape(moveDescriptors)), moveDescriptors], axis=3)
    qPickNotHolding = getqNotHoldingRot(actionsPickDescriptors)
    qPickHolding = getqHoldingRot(actionsPickDescriptors)
    qPick = np.concatenate([qPickNotHolding, qPickHolding], axis=1)
    qPlaceNotHolding = getqNotHoldingRot(actionsPlaceDescriptors)
    qPlaceHolding = getqHoldingRot(actionsPlaceDescriptors)
    qPlace = np.concatenate([qPlaceNotHolding, qPlaceHolding], axis=1)

    gridSize = len(env.moveCenters)
    print("Value function for pick action in hold-0 state:")
    print(str(np.reshape(qPickNoRot[:gridSize**2, 0], [gridSize, gridSize])))
    print("Value function for pick action for rot0 in hold-0 state:")
    print(str(np.reshape(qPick[:gridSize**2, 0], [gridSize, gridSize])))
    print("Value function for pick action for rot1 in hold-0 state:")
    print(
        str(
            np.reshape(qPick[gridSize**2:2 * gridSize**2, 0],
                       [gridSize, gridSize])))
    print("Value function for pick action for rot2 in hold-0 state:")
    print(
        str(
            np.reshape(qPick[2 * gridSize**2:3 * gridSize**2, 0],
                       [gridSize, gridSize])))
    print("Value function for pick action for rot3 in hold-0 state:")
    print(
        str(
            np.reshape(qPick[3 * gridSize**2:4 * gridSize**2, 0],
                       [gridSize, gridSize])))

    print("Value function for place action in hold-1 state:")
    print(str(np.reshape(qPlaceNoRot[:gridSize**2, 1], [gridSize, gridSize])))
    print("Value function for place action for rot0 in hold-1 state:")
    print(str(np.reshape(qPlace[:gridSize**2, 1], [gridSize, gridSize])))
    print("Value function for place action for rot1 in hold-1 state:")
    print(
        str(
            np.reshape(qPlace[gridSize**2:2 * gridSize**2, 1],
                       [gridSize, gridSize])))
    print("Value function for place action for rot2 in hold-1 state:")
    print(
        str(
            np.reshape(qPlace[2 * gridSize**2:3 * gridSize**2, 1],
                       [gridSize, gridSize])))
    print("Value function for place action for rot3 in hold-1 state:")
    print(
        str(
            np.reshape(qPlace[3 * gridSize**2:4 * gridSize**2, 1],
                       [gridSize, gridSize])))

    plt.subplot(2, 10, 1)
    plt.imshow(np.tile(env.state[0], [1, 1, 3]), interpolation=None)
    plt.subplot(2, 10, 2)
    plt.imshow(np.reshape(qPick[:gridSize**2, 0], [gridSize, gridSize]),
               vmin=5,
               vmax=12)
    plt.subplot(2, 10, 3)
    plt.imshow(np.reshape(qPick[gridSize**2:2 * gridSize**2, 0],
                          [gridSize, gridSize]),
               vmin=5,
               vmax=12)
    plt.subplot(2, 10, 4)
    plt.imshow(np.reshape(qPick[2 * gridSize**2:3 * gridSize**2, 0],
                          [gridSize, gridSize]),
               vmin=5,
               vmax=12)
    plt.subplot(2, 10, 5)
    plt.imshow(np.reshape(qPick[3 * gridSize**2:4 * gridSize**2, 0],
                          [gridSize, gridSize]),
               vmin=5,
               vmax=12)
    plt.subplot(2, 10, 6)
    plt.imshow(np.reshape(qPick[4 * gridSize**2:5 * gridSize**2, 0],
                          [gridSize, gridSize]),
               vmin=5,
               vmax=12)
    plt.subplot(2, 10, 7)
    plt.imshow(np.reshape(qPick[5 * gridSize**2:6 * gridSize**2, 0],
                          [gridSize, gridSize]),
               vmin=5,
               vmax=12)
    plt.subplot(2, 10, 8)
    plt.imshow(np.reshape(qPick[6 * gridSize**2:7 * gridSize**2, 0],
                          [gridSize, gridSize]),
               vmin=5,
               vmax=12)
    plt.subplot(2, 10, 9)
    plt.imshow(np.reshape(qPick[7 * gridSize**2:8 * gridSize**2, 0],
                          [gridSize, gridSize]),
               vmin=5,
               vmax=12)
    plt.subplot(2, 10, 10)
    plt.imshow(np.reshape(qPickNoRot[:gridSize**2, 0], [gridSize, gridSize]),
               vmin=5,
               vmax=12)

    plt.subplot(2, 10, 12)
    plt.imshow(np.reshape(qPlace[:gridSize**2, 1], [gridSize, gridSize]),
               vmin=5,
               vmax=12)
    plt.subplot(2, 10, 13)
    plt.imshow(np.reshape(qPlace[gridSize**2:2 * gridSize**2, 1],
                          [gridSize, gridSize]),
               vmin=5,
               vmax=12)
    plt.subplot(2, 10, 14)
    plt.imshow(np.reshape(qPlace[2 * gridSize**2:3 * gridSize**2, 1],
                          [gridSize, gridSize]),
               vmin=5,
               vmax=12)
    plt.subplot(2, 10, 15)
    plt.imshow(np.reshape(qPlace[3 * gridSize**2:4 * gridSize**2, 1],
                          [gridSize, gridSize]),
               vmin=5,
               vmax=12)
    plt.subplot(2, 10, 16)
    plt.imshow(np.reshape(qPlace[4 * gridSize**2:5 * gridSize**2, 1],
                          [gridSize, gridSize]),
               vmin=5,
               vmax=12)
    plt.subplot(2, 10, 17)
    plt.imshow(np.reshape(qPlace[5 * gridSize**2:6 * gridSize**2, 1],
                          [gridSize, gridSize]),
               vmin=5,
               vmax=12)
    plt.subplot(2, 10, 18)
    plt.imshow(np.reshape(qPlace[6 * gridSize**2:7 * gridSize**2, 1],
                          [gridSize, gridSize]),
               vmin=5,
               vmax=12)
    plt.subplot(2, 10, 19)
    plt.imshow(np.reshape(qPlace[7 * gridSize**2:8 * gridSize**2, 1],
                          [gridSize, gridSize]),
               vmin=5,
               vmax=12)
    plt.subplot(2, 10, 20)
    plt.imshow(np.reshape(qPlaceNoRot[:gridSize**2, 1], [gridSize, gridSize]),
               vmin=5,
               vmax=12)
    plt.show()
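
The hierarchy in the example above prunes the rotation-augmented action set by scoring the coarse (no-rotation) actions first and expanding only the best ones into the full action space. The minimal sketch below reproduces just that indexing logic on its own; num_moves, num_orientations, and the random scores are illustrative assumptions, not values taken from the example.

import numpy as np

num_moves = 16          # number of grid positions (hypothetical)
num_orientations = 4    # number of discrete rotations (hypothetical)
k = 0.2                 # keep the top 20% of coarse actions

qNoRot = np.random.rand(2 * num_moves)             # coarse values: pick block, then place block
topK = np.argsort(qNoRot)[-int(len(qNoRot) * k):]  # indices of the top-k coarse actions
positions = topK % num_moves
pickplace = topK // num_moves                      # 0 = pick, 1 = place

candidates = []
for pos, pp in zip(positions, pickplace):
    for orient in range(num_orientations):
        # full-space index: position + orientation block + pick/place block
        candidates.append(pos + orient * num_moves + pp * num_moves * num_orientations)
candidates = np.int32(candidates)
print(len(candidates), "candidate actions out of", 2 * num_moves * num_orientations)
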
Exemplo n.º 9
0
def main():

    env = envstandalone.MultiGhostEvade()
#    env = envstandalone.GhostEvade()
#    env = envstandalone.BallCatch()
    
#    max_timesteps=40000
    max_timesteps=80000
    learning_starts=1000
    buffer_size=50000
#    exploration_fraction=0.2
    exploration_fraction=0.4
    exploration_final_eps=0.02
    print_freq=10
    gamma=.98
#    target_network_update_freq=500
#    target_network_update_freq=100
#    target_network_update_freq=10
    target_network_update_freq=1
    learning_alpha = 0.2
    
    batch_size=32
#    batch_size=64
#    batch_size=1024
    train_freq=1

#    obsShape = (8,8,1)
    obsShape = env.observation_space.shape
#    deicticShape = (3,3,2)
#    deicticShape = (3,3,4)
#    deicticShape = (4,4,2)
#    deicticShape = (4,4,4)
    deicticShape = (5,5,2)
#    deicticShape = (6,6,2)
#    deicticShape = (8,8,2)
#    num_deictic_patches = 36
#    num_deictic_patches = 25
    num_deictic_patches = 16
#    num_deictic_patches = 9
#    num_deictic_patches = 1

    num_cascade = 5
    num_actions = env.action_space.n

    episode_rewards = [0.0]
    num_cpu=16
    
    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # Dictionary-based value function
    q_func = {}
    
    def make_obs_ph(name):
        return U.BatchInput(obsShape, name=name)

    def getTabularKeys(obsDeictic):
        obsDeicticTiled = np.reshape(obsDeictic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]])
        obsBits = np.packbits(obsDeicticTiled,1)
        obsKeys = 0
        for i in range(np.shape(obsBits)[1]):
            # IMPORTANT: the type cast below (UINT64) must be large enough to support the size of obsBits
            # if it is too small, we get hash collisions...
            obsKeys = obsKeys + (256**i) * np.uint64(obsBits[:,i])
        return obsKeys
    
    def getTabular(obsDeictic):
        keys = getTabularKeys(obsDeictic)
        return np.array([q_func[x] if x in q_func else 1000*np.ones([num_cascade,num_actions]) for x in keys])
    
    def trainTabular(obsDeictic,qCurrTargets):
        keys = getTabularKeys(obsDeictic)
        alpha=0.5
        for i in range(len(keys)):
            if keys[i] in q_func:
                q_func[keys[i]] = (1-alpha)*q_func[keys[i]] + alpha*qCurrTargets[i]
            else:
                q_func[keys[i]] = qCurrTargets[i]


    sess = U.make_session(num_cpu)
    sess.__enter__()

    getDeic = build_getDeic_Foc(make_obs_ph=make_obs_ph,deicticShape=deicticShape)
    
    # Initialize the parameters and copy them to the target network.
    U.initialize()
    
    obs = env.reset()

    timerStart = time.time()
    for t in range(max_timesteps):

        # Get current obervations
        obsDeictic = getDeic([obs])
        qCurr = getTabular(obsDeictic)

        # select action
        qCurrNoise = qCurr + np.random.random(np.shape(qCurr))*0.01 # add small amount of noise to break ties randomly
        action = np.argmax(np.max(qCurrNoise[:,-1,:],0)) # USE CASCADE
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(env.action_space.n)

        # take action
        new_obs, rew, done, _ = env.step(action)

        # Get next obervations
        obsNextDeictic = getDeic([new_obs])
        qNext = getTabular(obsNextDeictic)

        # Calculate TD target
        qNextmax = np.max(qNext[:,-1,:],1) # USE CASCADE
        targets = rew + (1-done) * gamma * qNextmax

        # Update dictionary value function
        qCurrTargets = np.copy(qCurr)

        # Copy into cascade with pruning.
        qCurrTargets[:,0,action] = targets
        for i in range(num_cascade-1):
            mask = targets < qCurr[:,i,action]
            qCurrTargets[:,i+1,action] = \
                mask*targets + \
                (1-mask)*qCurr[:,i+1,action]
        
#        qCurrTargets[:,action] = np.minimum(targets,qCurrTargets[:,action])
        
        
        trainTabular(obsDeictic,qCurrTargets)

        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
            timerFinal = time.time()
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", time elapsed: " + str(timerFinal - timerStart))
            timerStart = timerFinal
        
        obs = new_obs
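
The example above (Exemplo n.º 9) keeps a cascade of Q estimates per action and only lets a new TD target propagate to a deeper level when it is smaller than the value stored one level up, which suppresses optimistic over-estimates. A small standalone sketch of that update follows; the shapes and target values are made up for illustration.

import numpy as np

num_patches, num_cascade, num_actions = 3, 5, 4
qCurr = 10.0 * np.ones((num_patches, num_cascade, num_actions))
action = 2
targets = np.array([8.0, 12.0, 9.5])    # one TD target per patch (hypothetical)

qTargets = np.copy(qCurr)
qTargets[:, 0, action] = targets                      # level 0 always takes the new target
for i in range(num_cascade - 1):
    mask = targets < qCurr[:, i, action]              # pass the target down only if it decreased
    qTargets[:, i + 1, action] = mask * targets + (1 - mask) * qCurr[:, i + 1, action]
print(qTargets[:, :, action])
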
Exemplo n.º 10
0
    def learn(self,
              total_timesteps=None,
              total_episodes=None,
              log_interval=100,
              ckpt_interval=100,
              ckpt_path=None):
        def _sample_episode():
            sample = []
            obs = self.env.reset()
            done = False

            while not done:
                update_eps = self.exploration.value(self.ep_done)

                if np.random.random_sample() > update_eps:
                    action, value = self.policy.predict(obs,
                                                        deterministic=True)
                else:
                    action, value = self.policy.predict(obs,
                                                        deterministic=False)

                new_obs, reward, done, info = self.env.step(action)

                sample.append((obs, action, reward))
                obs = new_obs

            return sample

        last_100rewards = np.zeros(100)
        last_100rewards[:] = np.NaN
        episode_rewards = []
        episode_successes = []
        loop_var = total_timesteps if total_timesteps is not None else total_episodes
        loop_type = 'episode' if total_episodes else 'timesteps'

        if total_timesteps is not None:
            raise ValueError(
                'Only total_episodes can be specified for this class')

        # if self.exploration_frac is None:
        #     self.exploration = LinearSchedule(frac=self.exploration_ep,
        #                                       initial=self.exploration_initial_eps,
        #                                       final=self.exploration_final_eps)
        # else:
        #     self.exploration = LinearSchedule(frac=self.exploration_frac * loop_var,
        #                                       initial=self.exploration_initial_eps,
        #                                       final=self.exploration_final_eps)
        if self.exploration_type == 'linear':
            self.exploration = LinearSchedule(
                frac=self.exploration_frac * loop_var,
                initial=self.exploration_initial_eps,
                final=self.exploration_final_eps)
        elif self.exploration_type == 'exponential':
            self.exploration = ExponentialSchedule(
                frac=self.exploration_frac,
                initial=self.exploration_initial_eps,
                final=self.exploration_final_eps)

        train = True

        while train:
            sample = _sample_episode()
            obses, actions, rewards = zip(*sample)
            for idx in range(len(sample)):
                self.elapsed_steps += 1
                discounts = np.array(
                    [self.gamma**i for i in range(len(obses) + 1)])
                expected_reward = sum(
                    rewards[idx:] *
                    discounts[:-(1 + idx)]) - self.qvalues[obses[idx] +
                                                           (actions[idx], )]
                self.qvalues[obses[idx] + (
                    actions[idx], )] += self.learning_rate * expected_reward
                # print(np.where(self.qvalues!=0))

                if self.policy.intent:
                    h_update = np.zeros(self.qvalues.shape)
                    intent_update = np.zeros(len(BLACKJACK_OUTCOMES))
                    for iidx, (obs, action, reward) in enumerate(
                            zip(obses[idx:], actions[idx:], rewards[idx:])):
                        h_update[obs + (
                            action, )] += self.learning_rate * discounts[iidx]
                        outcome = BLACKJACK_OUTCOMES[int(action), int(reward)]
                        intent_update[
                            outcome] += self.learning_rate * discounts[iidx]
                    mc_h = self.hvalues[obses[idx] + (actions[idx], )] * (
                        1 - self.learning_rate)
                    mc_h += h_update
                    # print(obses[idx], actions[idx])
                    mc_intent = self.intention[obses[idx] +
                                               (actions[idx], )] * (
                                                   1 - self.learning_rate)
                    mc_intent += intent_update
                    self.hvalues[obses[idx] + (actions[idx], )] = mc_h
                    self.intention[obses[idx] + (actions[idx], )] = mc_intent

            self.ep_done += 1
            last_100rewards[self.ep_done % 100] = np.sum(rewards)
            print("\rEpisode {}/{}, mean reward (last 100): {:.2f}".format(
                self.ep_done, total_episodes, np.nanmean(last_100rewards)),
                  end="")
            # print(len(sample))

            if self.ep_done >= total_episodes:
                train = False

            if ckpt_path is not None and ckpt_interval:
                if loop_type == 'episode':
                    if self.ep_done % ckpt_interval == 0:
                        ckpt_str = str(self.ep_done)
                        full_path = ckpt_path + '/' + ckpt_str
                        super(BlackjackMCTabularRLModel, self).save(full_path)

                if loop_type == 'timesteps':
                    if self.elapsed_steps % ckpt_interval == 0:
                        ckpt_str = str(self.ep_done)
                        full_path = ckpt_path + '/' + ckpt_str
                        super(BlackjackMCTabularRLModel, self).save(full_path)
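
The learn() method above performs a tabular every-visit Monte-Carlo update: after each episode finishes, it forms the discounted return from every step and nudges Q(s, a) toward it by a fixed learning rate. A self-contained sketch of that core update is below; the three-step episode and the observation tuples are made up for illustration.

import numpy as np

gamma, lr = 0.99, 0.1
qvalues = {}
# one finished episode as (obs, action, reward) triples (hypothetical data)
episode = [((12, 4, False), 1, 0.0),
           ((15, 4, False), 1, 0.0),
           ((19, 4, False), 0, 1.0)]

obses, actions, rewards = zip(*episode)
for idx in range(len(episode)):
    discounts = np.array([gamma ** i for i in range(len(rewards) - idx)])
    G = float(np.sum(np.array(rewards[idx:]) * discounts))   # discounted return from step idx
    key = (obses[idx], actions[idx])
    qvalues[key] = qvalues.get(key, 0.0) + lr * (G - qvalues.get(key, 0.0))
print(qvalues)
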
Exemplo n.º 11
0
def main():

    np.set_printoptions(formatter={'float_kind': lambda x: "%.2f" % x})

    env = gym.make("FrozenLake-v0")
    #    env = gym.make("FrozenLake8x8-v0")

    # Dictionary-based value function
    q_func_tabular = {}
    defaultQValue = np.ones(env.action_space.n)

    # Given an integer, return the corresponding boolean array
    def getBoolBits(state):
        return np.unpackbits(np.uint8(state), axis=1) == 1

    # cols of vectorKey must be boolean less than 64 bits long
    def getTabularKeys(vectorKey):
        obsBits = np.packbits(vectorKey, 1)
        obsKeys = 0
        for i in range(np.shape(obsBits)[1]):
            # IMPORTANT: the number of bits in the type cast below (UINT64) must be at least as big
            # as the bits required to encode obsBits. If it is too small, we get hash collisions...
            obsKeys = obsKeys + (256**i) * np.uint64(obsBits[:, i])
        return obsKeys

    def getTabular(vectorKey):
        keys = getTabularKeys(vectorKey)
        return np.array([
            q_func_tabular[x] if x in q_func_tabular else defaultQValue
            for x in keys
        ])

#    def trainTabular(vectorKey,qCurrTargets,weights):

    def trainTabular(vectorKey, qCurrTargets):
        keys = getTabularKeys(vectorKey)
        alpha = 0.1
        for i in range(len(keys)):
            if keys[i] in q_func_tabular:
                q_func_tabular[keys[i]] = (1 - alpha) * q_func_tabular[
                    keys[i]] + alpha * qCurrTargets[i]
                # q_func_tabular[keys[i]] = q_func_tabular[keys[i]] + alpha*weights[i,:]*(qCurrTargets[i] - q_func_tabular[keys[i]]) # (1-alpha)*q_func[keys[i]] + alpha*qCurrTargets[i]
            else:
                q_func_tabular[keys[i]] = qCurrTargets[i]

    max_timesteps = 200000
    exploration_fraction = 0.3
    exploration_final_eps = 0.02
    print_freq = 1
    gamma = .98
    num_cpu = 16

    # Used by buffering and DQN
    learning_starts = 10
    buffer_size = 100
    batch_size = 10
    target_network_update_freq = 1
    train_freq = 1
    print_freq = 1
    lr = 0.0003

    valueFunctionType = "TABULAR"
    #    valueFunctionType = "DQN"

    episode_rewards = [0.0]

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # Set up replay buffer
    prioritized_replay = True
    #    prioritized_replay=False
    prioritized_replay_alpha = 0.6
    prioritized_replay_beta0 = 0.4
    prioritized_replay_beta_iters = None
    prioritized_replay_eps = 1e-6
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size,
                                                alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None

    def make_obs_ph(name):
        return U.BatchInput(env.observation_space.spaces[0].shape, name=name)

    sess = U.make_session(num_cpu)
    sess.__enter__()

    state = env.reset()

    episode_rewards = [0.0]
    timerStart = time.time()
    for t in range(max_timesteps):

        #        np.unpackbits(np.uint8(np.reshape(states_tp1,[batch_size,1])),axis=1)
        qCurr = getTabular(getBoolBits([[state]]))

        qCurrNoise = qCurr + np.random.random(np.shape(
            qCurr)) * 0.01  # add small amount of noise to break ties randomly

        # select action at random
        action = np.argmax(qCurrNoise)
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(env.action_space.n)

        # take action
        nextState, rew, done, _ = env.step(action)

        replay_buffer.add(state, action, rew, nextState, float(done))

        if t > learning_starts and t % train_freq == 0:

            # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
            if prioritized_replay:
                beta = beta_schedule.value(t)
                states_t, actions, rewards, states_tp1, dones, weights, batch_idxes = replay_buffer.sample(
                    batch_size, beta)
            else:
                states_t, actions, rewards, states_tp1, dones = replay_buffer.sample(
                    batch_size)
                weights, batch_idxes = np.ones_like(rewards), None

            qNext = getTabular(
                getBoolBits(np.reshape(states_tp1, [batch_size, 1])))

            qNextmax = np.max(qNext, axis=1)
            targets = rewards + (1 - dones) * gamma * qNextmax

            qCurrTarget = getTabular(
                getBoolBits(np.reshape(states_t, [batch_size, 1])))

            td_error = qCurrTarget[range(batch_size), actions] - targets
            qCurrTarget[range(batch_size), actions] = targets

            trainTabular(getBoolBits(np.reshape(states_t, [batch_size, 1])),
                         qCurrTarget)

            if prioritized_replay:
                new_priorities = np.abs(td_error) + prioritized_replay_eps
                replay_buffer.update_priorities(batch_idxes, new_priorities)

        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            nextState = env.reset()
            episode_rewards.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(
                episode_rewards) % print_freq == 0:
            timerFinal = time.time()
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " +
                  str(int(100 * exploration.value(t))) + ", time elapsed: " +
                  str(timerFinal - timerStart))
            timerStart = timerFinal

        state = np.copy(nextState)
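
The tabular value function in the example above is keyed by hashing each boolean state vector into a single unsigned integer: the bits are packed into bytes and the bytes are folded into one uint64. A standalone sketch of that key construction, mirroring getTabularKeys, is shown below with two made-up states; as the comments in the example warn, rows wider than 8 packed bytes would overflow 64 bits and could collide.

import numpy as np

states = np.array([[3], [7]])                              # two integer states (hypothetical)
boolBits = np.unpackbits(np.uint8(states), axis=1) == 1    # one boolean row per state
obsBits = np.packbits(boolBits, 1)                         # pack each row back into bytes
obsKeys = 0
for i in range(np.shape(obsBits)[1]):
    # fold the bytes of each row into a single integer key
    obsKeys = obsKeys + (256 ** i) * np.uint64(obsBits[:, i])
print(obsKeys)    # one hashable uint64 key per state, usable as a dictionary key
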
Exemplo n.º 12
0
def main():

    np.set_printoptions(formatter={'float_kind': lambda x: "%.2f" % x})

    # Dictionary-based value function
    q_func_tabular = {}

    # cols of vectorKey must be boolean less than 64 bits long
    def getTabularKeys(vectorKey):
        obsBits = np.packbits(vectorKey, 1)
        obsKeys = 0
        for i in range(np.shape(obsBits)[1]):
            # IMPORTANT: the number of bits in the type cast below (UINT64) must be at least as big
            # as the bits required to encode obsBits. If it is too small, we get hash collisions...
            obsKeys = obsKeys + (256**i) * np.uint64(obsBits[:, i])
        return obsKeys

    def getTabular(vectorKey):
        keys = getTabularKeys(vectorKey)
        return np.array([
            q_func_tabular[x] if x in q_func_tabular else 10 *
            np.ones(num_states) for x in keys
        ])

#    def trainTabular(vectorKey,qCurrTargets,weights):

    def trainTabular(vectorKey, qCurrTargets, weights):
        keys = getTabularKeys(vectorKey)
        alpha = 0.2
        for i in range(len(keys)):
            if keys[i] in q_func_tabular:
                #                q_func_tabular[keys[i]] = (1-alpha)*q_func_tabular[keys[i]] + alpha*qCurrTargets[i]
                q_func_tabular[
                    keys[i]] = q_func_tabular[keys[i]] + alpha * weights[i] * (
                        qCurrTargets[i] - q_func_tabular[keys[i]]
                    )  # (1-alpha)*q_func[keys[i]] + alpha*qCurrTargets[i]
            else:
                q_func_tabular[keys[i]] = qCurrTargets[i]

    # Return a list of actions in adjacent patches to <action>
    def getAdjacentActions(action):
        side = len(env.moveCenters)
        mat = np.reshape(range(side**2), [side, side])
        move = action
        if action >= side**2:
            move = action - side**2
        coords = np.squeeze(np.nonzero(mat == move))
        adjacent = []

        # this cell
        adjacent.append(coords)

        # 8-neighborhood
        adjacent.append(coords - [0, 1])
        adjacent.append(coords + [0, 1])
        adjacent.append(coords - [1, 0])
        adjacent.append(coords + [1, 0])
        adjacent.append(coords + [-1, -1])
        adjacent.append(coords + [1, -1])
        adjacent.append(coords + [-1, 1])
        adjacent.append(coords + [1, 1])

        # 16-neighborhood
        adjacent.append(coords + [-2, 2])
        adjacent.append(coords + [-1, 2])
        adjacent.append(coords + [0, 2])
        adjacent.append(coords + [1, 2])
        adjacent.append(coords + [2, 2])
        adjacent.append(coords + [2, 1])
        adjacent.append(coords + [2, 0])
        adjacent.append(coords + [2, -1])
        adjacent.append(coords + [2, -2])
        adjacent.append(coords + [1, -2])
        adjacent.append(coords + [0, -2])
        adjacent.append(coords + [-1, -2])
        adjacent.append(coords + [-2, -2])
        adjacent.append(coords + [-2, -1])
        adjacent.append(coords + [-2, 0])
        adjacent.append(coords + [-2, 1])

        adjacentValid = [x for x in adjacent if all(x < side) and all(x >= 0)]
        if action >= side**2:
            return [side**2 + x[0] * side + x[1] for x in adjacentValid]
        else:
            return [x[0] * side + x[1] for x in adjacentValid]

    env = envstandalone.NumbersArrange()

    # Standard q-learning parameters
    max_timesteps = 2000
    exploration_fraction = 0.3
    exploration_final_eps = 0.1
    gamma = .90
    num_cpu = 16

    # Used by buffering and DQN
    learning_starts = 10
    buffer_size = 1000
    batch_size = 10
    target_network_update_freq = 1
    train_freq = 1
    print_freq = 1
    lr = 0.0003

    # first two elts of deicticShape must be odd
    descriptorShape = (env.blockSize * 3, env.blockSize * 3, 2)
    #    descriptorShapeSmall = (10,10,2)
    #    descriptorShapeSmall = (14,14,2)
    descriptorShapeSmall = (20, 20, 2)
    num_states = 2  # either holding or not
    num_patches = len(env.moveCenters)**2
    num_actions = 2 * num_patches
    num_actions_discrete = 2
    num_patches_side = len(env.moveCenters)
    #    valueFunctionType = "TABULAR"
    valueFunctionType = "DQN"
    #    actionSelectionStrategy = "UNIFORM_RANDOM" # actions are selected randomly from collection of all actions
    actionSelectionStrategy = "RANDOM_UNIQUE"  # each unique action descriptor has equal chance of being selected

    episode_rewards = [0.0]

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    #    prioritized_replay=True
    prioritized_replay = False
    #    prioritized_replay_alpha=1.0
    prioritized_replay_alpha = 0.6
    prioritized_replay_beta0 = 0.4
    prioritized_replay_beta_iters = None
    #    prioritized_replay_beta_iters=20000
    prioritized_replay_eps = 1e-6
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size,
                                                alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None
    beta = 1
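    # default importance-sampling exponent; overwritten from beta_schedule before every prioritized-replay sample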

    q_func = models.cnn_to_mlp(
        #    q_func = models.cnn_to_mlp_2pathways(
        #        convs=[(16,3,1), (32,3,1)],
        #        hiddens=[48],
        convs=[(16, 3, 1)],
        hiddens=[32],
        #        convs=[(32,3,1)],
        #        hiddens=[48],
        #        convs=[(48,3,1)],
        #        hiddens=[48],
        dueling=True)

    def make_obs_ph(name):
        return U.BatchInput(env.observation_space.spaces[0].shape, name=name)

    def make_actionDeic_ph(name):
        return U.BatchInput(descriptorShapeSmall, name=name)

    def make_target_ph(name):
        return U.BatchInput([1], name=name)

    def make_weight_ph(name):
        return U.BatchInput([1], name=name)

    getMoveActionDescriptors = build_getMoveActionDescriptors(
        make_obs_ph=make_obs_ph,
        actionShape=descriptorShape,
        actionShapeSmall=descriptorShapeSmall,
        stride=env.stride)

    if valueFunctionType == 'DQN':
        getqNotHolding = build_getq(make_actionDeic_ph=make_actionDeic_ph,
                                    q_func=q_func,
                                    num_states=num_states,
                                    num_cascade=5,
                                    scope="deepq",
                                    qscope="q_func_notholding")
        getqHolding = build_getq(make_actionDeic_ph=make_actionDeic_ph,
                                 q_func=q_func,
                                 num_states=num_states,
                                 num_cascade=5,
                                 scope="deepq",
                                 qscope="q_func_holding")
        targetTrainNotHolding = build_targetTrain(
            make_actionDeic_ph=make_actionDeic_ph,
            make_target_ph=make_target_ph,
            make_weight_ph=make_weight_ph,
            q_func=q_func,
            num_states=num_states,
            num_cascade=5,
            optimizer=tf.train.AdamOptimizer(learning_rate=lr),
            scope="deepq",
            qscope="q_func_notholding",
            grad_norm_clipping=1.)
        targetTrainHolding = build_targetTrain(
            make_actionDeic_ph=make_actionDeic_ph,
            make_target_ph=make_target_ph,
            make_weight_ph=make_weight_ph,
            q_func=q_func,
            num_states=num_states,
            num_cascade=5,
            optimizer=tf.train.AdamOptimizer(learning_rate=lr),
            scope="deepq",
            qscope="q_func_holding",
            grad_norm_clipping=1.)

        getqNotHoldingCoarse = build_getq(
            make_actionDeic_ph=make_actionDeic_ph,
            q_func=q_func,
            num_states=num_states,
            num_cascade=5,
            scope="deepq",
            qscope="q_func_notholding_coarse")
        getqHoldingCoarse = build_getq(make_actionDeic_ph=make_actionDeic_ph,
                                       q_func=q_func,
                                       num_states=num_states,
                                       num_cascade=5,
                                       scope="deepq",
                                       qscope="q_func_holding_coarse")
        targetTrainNotHoldingCoarse = build_targetTrain(
            make_actionDeic_ph=make_actionDeic_ph,
            make_target_ph=make_target_ph,
            make_weight_ph=make_weight_ph,
            q_func=q_func,
            num_states=num_states,
            num_cascade=5,
            #            optimizer=tf.train.AdamOptimizer(learning_rate=lr*20),
            optimizer=tf.train.GradientDescentOptimizer(learning_rate=lr),
            scope="deepq",
            qscope="q_func_notholding_coarse",
            grad_norm_clipping=None)
        targetTrainHoldingCoarse = build_targetTrain(
            make_actionDeic_ph=make_actionDeic_ph,
            make_target_ph=make_target_ph,
            make_weight_ph=make_weight_ph,
            q_func=q_func,
            num_states=num_states,
            num_cascade=5,
            #            optimizer=tf.train.AdamOptimizer(learning_rate=lr*20),
            optimizer=tf.train.GradientDescentOptimizer(learning_rate=lr),
            scope="deepq",
            qscope="q_func_holding_coarse",
            grad_norm_clipping=None)

    sess = U.make_session(num_cpu)
    sess.__enter__()

    obs = env.reset()

    episode_rewards = [0.0]
    newEpisode = 0
    td_errors = [0.0]
    timerStart = time.time()
    U.initialize()
    for t in range(max_timesteps):

        # Get action set: <num_patches> pick actions followed by <num_patches> place actions
        moveDescriptors = getMoveActionDescriptors([obs[0]])
        moveDescriptors = moveDescriptors * 2 - 1
        actionsPickDescriptors = np.stack(
            [moveDescriptors,
             np.zeros(np.shape(moveDescriptors))], axis=3)
        actionsPlaceDescriptors = np.stack(
            [np.zeros(np.shape(moveDescriptors)), moveDescriptors], axis=3)
        actionDescriptors = np.r_[actionsPickDescriptors,
                                  actionsPlaceDescriptors]

        qCurrNotHolding = getqNotHolding(actionDescriptors)
        qCurrHolding = getqHolding(actionDescriptors)
        qCurr = np.concatenate([qCurrNotHolding, qCurrHolding], axis=1)

        # select action at random
        qCurrNoise = qCurr + np.random.random(np.shape(
            qCurr)) * 0.01  # add small amount of noise to break ties randomly
        if actionSelectionStrategy == "UNIFORM_RANDOM":
            action = np.argmax(qCurrNoise[:, obs[1]])
            if np.random.rand() < exploration.value(t):
                action = np.random.randint(num_actions)
        elif actionSelectionStrategy == "RANDOM_UNIQUE":
            _, idx, inv = np.unique(actionDescriptors,
                                    axis=0,
                                    return_index=True,
                                    return_inverse=True)
            actionIdx = np.argmax(qCurrNoise[idx, obs[1]])
            if np.random.rand() < exploration.value(t):
                actionIdx = np.random.randint(len(idx))
            actionsSelected = np.nonzero(inv == actionIdx)[0]
            action = actionsSelected[np.random.randint(len(actionsSelected))]
        else:
            print("Error...")

        adjacentActions = getAdjacentActions(action)

        # take action
        new_obs, rew, done, _ = env.step(action)

        #        replay_buffer.add(obs[1], actionDescriptors[action,:], rew, np.copy(new_obs), float(done))
        replay_buffer.add(obs[1], actionDescriptors[action, :],
                          actionDescriptors[adjacentActions, :], np.copy(rew),
                          np.copy(new_obs), np.copy(float(done)))

        if t > learning_starts and t % train_freq == 0:

            # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
            if prioritized_replay:
                beta = beta_schedule.value(t)
                states_t, actionPatches, actionPatchesAdjacent, rewards, images_tp1, states_tp1, dones, weights, batch_idxes = replay_buffer.sample(
                    batch_size, beta)
            else:
                states_t, actionPatches, actionPatchesAdjacent, rewards, images_tp1, states_tp1, dones = replay_buffer.sample(
                    batch_size)
                weights, batch_idxes = np.ones_like(rewards), None

            moveDescriptorsNext = getMoveActionDescriptors(images_tp1)
            moveDescriptorsNext = moveDescriptorsNext * 2 - 1
            actionsPickDescriptorsNext = np.stack(
                [moveDescriptorsNext,
                 np.zeros(np.shape(moveDescriptorsNext))],
                axis=3)
            actionsPlaceDescriptorsNext = np.stack(
                [np.zeros(np.shape(moveDescriptorsNext)), moveDescriptorsNext],
                axis=3)
            actionDescriptorsNext = np.stack(
                [actionsPickDescriptorsNext, actionsPlaceDescriptorsNext],
                axis=1
            )  # I sometimes get this axis parameter wrong... pay attention!

            # flat estimate of qNextmax
            actionDescriptorsNext = np.reshape(actionDescriptorsNext, [
                batch_size * num_patches * num_actions_discrete,
                descriptorShapeSmall[0], descriptorShapeSmall[1],
                descriptorShapeSmall[2]
            ])
            qNextNotHolding = getqNotHolding(actionDescriptorsNext)
            qNextHolding = getqHolding(actionDescriptorsNext)
            qNextFlat = np.concatenate([qNextNotHolding, qNextHolding], axis=1)
            qNext = np.reshape(
                qNextFlat,
                [batch_size, num_patches, num_actions_discrete, num_states])
            qNextmax = np.max(
                np.max(qNext[range(batch_size), :, :, states_tp1], 2), 1)
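            # qNextmax: per-sample best next-state value over all patches and pick/place choices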

            #            # coarse/fine estimate of qNextmax
            #            actionDescriptorsNext = np.reshape(actionDescriptorsNext,[batch_size,num_patches_side,num_patches_side,num_actions_discrete,descriptorShapeSmall[0],descriptorShapeSmall[1],descriptorShapeSmall[2]])
            #            aa = actionDescriptorsNext[:,range(0,num_patches_side,2),:,:,:,:,:]
            #            bb = aa[:,:,range(0,num_patches_side,2),:,:,:,:]
            #            cc = np.reshape(bb,[-1,descriptorShapeSmall[0],descriptorShapeSmall[1],descriptorShapeSmall[2]])
            #            qNextNotHolding = getqNotHoldingCoarse(cc)
            #            qNextHolding = getqHoldingCoarse(cc)
            #            qNextFlat = np.concatenate([qNextNotHolding,qNextHolding],axis=1)
            #            qNext = np.reshape(qNextFlat,[batch_size,-1,num_actions_discrete,num_states])
            #            qNextmax = np.max(np.max(qNext[range(batch_size),:,:,states_tp1],2),1)

            targets = rewards + (1 - dones) * gamma * qNextmax

            # train action Patches
            qCurrTargetNotHolding = getqNotHolding(actionPatches)
            qCurrTargetHolding = getqHolding(actionPatches)
            qCurrTarget = np.concatenate(
                [qCurrTargetNotHolding, qCurrTargetHolding], axis=1)
            td_error = qCurrTarget[range(batch_size), states_t] - targets
            qCurrTarget[range(batch_size), states_t] = targets
            targetTrainNotHolding(
                actionPatches, np.reshape(qCurrTarget[:, 0], [batch_size, 1]),
                np.reshape(weights, [batch_size, 1]))
            targetTrainHolding(actionPatches,
                               np.reshape(qCurrTarget[:, 1], [batch_size, 1]),
                               np.reshape(weights, [batch_size, 1]))

            if prioritized_replay:
                new_priorities = np.abs(td_error) + prioritized_replay_eps
                replay_buffer.update_priorities(batch_idxes, new_priorities)

            td_errors[-1] += td_error

        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        newEpisode = 0
        if done:
            newEpisode = 1
            new_obs = env.reset()
            episode_rewards.append(0.0)
            td_errors.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-51:-1]), 1)
        mean_100ep_tderror = round(np.mean(td_errors[-51:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(
                episode_rewards) % print_freq == 0:
            timerFinal = time.time()
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " +
                  str(int(100 * exploration.value(t))) + ", time elapsed: " +
                  str(timerFinal - timerStart) + ", tderror: " +
                  str(mean_100ep_tderror))
            timerStart = timerFinal

        obs = np.copy(new_obs)

        # Train coarse grid
        if newEpisode:
            moveDescriptors = getMoveActionDescriptors([obs[0]])
            moveDescriptors = moveDescriptors * 2 - 1
            actionsPickDescriptors = np.stack(
                [moveDescriptors,
                 np.zeros(np.shape(moveDescriptors))], axis=3)
            actionsPlaceDescriptors = np.stack(
                [np.zeros(np.shape(moveDescriptors)), moveDescriptors], axis=3)
            actionDescriptors = np.r_[actionsPickDescriptors,
                                      actionsPlaceDescriptors]
            #            actionDescriptors, inverseIdx = np.unique(actionDescriptors,axis=0,return_inverse=True) # reduce to just unique descriptors
            qCurrNotHolding = getqNotHolding(actionDescriptors)
            qCurrHolding = getqHolding(actionDescriptors)
            qTargetNotHolding = np.zeros(np.shape(qCurrNotHolding))
            qTargetHolding = np.zeros(np.shape(qCurrHolding))
            for jj in range(num_actions):
                adj = getAdjacentActions(jj)
                qTargetNotHolding[jj] = np.max(qCurrNotHolding[adj])
                qTargetHolding[jj] = np.max(qCurrHolding[adj])
            for iter in range(10):
                targetTrainNotHoldingCoarse(
                    actionDescriptors, np.reshape(qTargetNotHolding, [-1, 1]),
                    np.ones([num_actions, 1]))
                targetTrainHoldingCoarse(actionDescriptors,
                                         np.reshape(qTargetHolding, [-1, 1]),
                                         np.ones([num_actions, 1]))


#    # Train coarse grid
#    for iter in range(500):
#        print(str(iter))
#        obs = env.reset()
#        moveDescriptors = getMoveActionDescriptors([obs[0]])
#        moveDescriptors = moveDescriptors*2-1
#        actionsPickDescriptors = np.stack([moveDescriptors, np.zeros(np.shape(moveDescriptors))],axis=3)
#        actionsPlaceDescriptors = np.stack([np.zeros(np.shape(moveDescriptors)), moveDescriptors],axis=3)
#        actionDescriptors = np.r_[actionsPickDescriptors,actionsPlaceDescriptors]
#        qCurrNotHolding = getqNotHolding(actionDescriptors)
#        qCurrHolding = getqHolding(actionDescriptors)
#        qTargetNotHolding = np.zeros(np.shape(qCurrNotHolding))
#        qTargetHolding = np.zeros(np.shape(qCurrHolding))
#        for jj in range(num_actions):
#            adj = getAdjacentActions(jj)
#            qTargetNotHolding[jj] = np.max(qCurrNotHolding[adj])
#            qTargetHolding[jj] = np.max(qCurrHolding[adj])
#        targetTrainNotHoldingCoarse(actionDescriptors, np.reshape(qTargetNotHolding,[-1,1]), np.ones([num_actions,1]))
#        targetTrainHoldingCoarse(actionDescriptors, np.reshape(qTargetHolding,[-1,1]), np.ones([num_actions,1]))

    # display value function
    obs = env.reset()
    moveDescriptors = getMoveActionDescriptors([obs[0]])
    moveDescriptors = moveDescriptors * 2 - 1
    gridSize = np.int32(np.sqrt(np.shape(moveDescriptors)[0]))

    actionsPickDescriptors = np.stack(
        [moveDescriptors, np.zeros(np.shape(moveDescriptors))], axis=3)
    actionsPlaceDescriptors = np.stack(
        [np.zeros(np.shape(moveDescriptors)), moveDescriptors], axis=3)

    print(str(obs[0][:, :, 0]))

    qPickNotHolding = getqNotHolding(actionsPickDescriptors)
    qPickHolding = getqHolding(actionsPickDescriptors)
    qPick = np.concatenate([qPickNotHolding, qPickHolding], axis=1)
    print("Value function for pick action in hold-nothing state:")
    print(str(np.reshape(qPick[:, 0], [gridSize, gridSize])))
    print("Value function for pick action in hold-1 state:")
    print(str(np.reshape(qPick[:, 1], [gridSize, gridSize])))

    qPlaceNotHolding = getqNotHolding(actionsPlaceDescriptors)
    qPlaceHolding = getqHolding(actionsPlaceDescriptors)
    qPlace = np.concatenate([qPlaceNotHolding, qPlaceHolding], axis=1)
    print("Value function for place action in hold-nothing state:")
    print(str(np.reshape(qPlace[:, 0], [gridSize, gridSize])))
    print("Value function for place action in hold-1 state:")
    print(str(np.reshape(qPlace[:, 1], [gridSize, gridSize])))

    qPickNotHolding = getqNotHoldingCoarse(actionsPickDescriptors)
    qPickHolding = getqHoldingCoarse(actionsPickDescriptors)
    qPickCoarse = np.concatenate([qPickNotHolding, qPickHolding], axis=1)
    print("Value function for pick action in hold-nothing state:")
    print(str(np.reshape(qPickCoarse[:, 0], [gridSize, gridSize])))
    print("Value function for pick action in hold-1 state:")
    print(str(np.reshape(qPickCoarse[:, 1], [gridSize, gridSize])))

    qPlaceNotHolding = getqNotHoldingCoarse(actionsPlaceDescriptors)
    qPlaceHolding = getqHoldingCoarse(actionsPlaceDescriptors)
    qPlaceCoarse = np.concatenate([qPlaceNotHolding, qPlaceHolding], axis=1)
    print("Value function for place action in hold-nothing state:")
    print(str(np.reshape(qPlaceCoarse[:, 0], [gridSize, gridSize])))
    print("Value function for place action in hold-1 state:")
    print(str(np.reshape(qPlaceCoarse[:, 1], [gridSize, gridSize])))

    plt.subplot(2, 3, 1)
    plt.imshow(np.tile(env.state[0], [1, 1, 3]))
    plt.subplot(2, 3, 2)
    plt.imshow(np.reshape(qPick[:, 0], [gridSize, gridSize]))
    plt.subplot(2, 3, 3)
    plt.imshow(np.reshape(qPlace[:, 1], [gridSize, gridSize]))
    plt.subplot(2, 3, 5)
    plt.imshow(np.reshape(qPickCoarse[:, 0], [gridSize, gridSize]))
    plt.subplot(2, 3, 6)
    plt.imshow(np.reshape(qPlaceCoarse[:, 1], [gridSize, gridSize]))

    plt.show()
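# --- Illustrative sketch (not from the original example) ---
# The block above reshapes per-patch Q-values into a gridSize x gridSize image
# before printing and plotting them. A minimal, self-contained version of that
# idea using only numpy/matplotlib; show_value_map and its arguments are
# hypothetical names, with random data standing in for the real Q outputs.
import numpy as np
import matplotlib.pyplot as plt


def show_value_map(q_flat, title="value map"):
    """Reshape a flat vector of per-cell Q-values into a square heatmap."""
    grid = int(np.sqrt(q_flat.size))  # assumes one value per cell of a square grid
    assert grid * grid == q_flat.size, "q_flat must cover a square grid"
    plt.imshow(np.reshape(q_flat, [grid, grid]))
    plt.title(title)
    plt.colorbar()
    plt.show()

# Example usage (random data in place of getqNotHolding(...) output):
# show_value_map(np.random.rand(64), title="pick, hold-nothing")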
Exemplo n.º 13
0
def main(initEnvStride, envStride, fileIn, fileOut, inputmaxtimesteps,
         vispolicy, objType, numOrientations, useRotHierarchy,
         useHandCodeHierarchy):

    np.set_printoptions(formatter={'float_kind': lambda x: "%.2f" % x})

    # Create environment and set two stride parameters (stride-x and stride-y)
    # for this problem instance. Most of the time, the two stride parameters will be equal.
    env = envstandalone.PuckArrange()
    env.initStride = initEnvStride  # stride for initial puck placement
    env.stride = envStride  # stride for action specification
    env.blockType = objType
    env.num_orientations = numOrientations
    env.reset()

    # Standard q-learning parameters
    reuseModels = None
    max_timesteps = inputmaxtimesteps
    exploration_fraction = 0.75
    exploration_final_eps = 0.1
    gamma = .90
    num_cpu = 16

    # Used by buffering and DQN
    learning_starts = 60
    buffer_size = 10000
    batch_size = 10
    target_network_update_freq = 1
    train_freq = 1
    print_freq = 1

    # SGD learning rate
    lr = 0.0003

    # Set parameters related to shape of the patch and the number of patches
    descriptorShape = (
        env.blockSize * 3, env.blockSize * 3, 2
    )  # size of patch descriptor relative to number of "blocks" on board (each block is a 28x28 region)
    descriptorShapeSmall = (
        25, 25, 2
    )  # size to which each patch is resized. Code runs faster w/ smaller sizes, but could miss detail needed to solve the problem.
    num_discrete_states = 2  # number of discrete states: either holding or not
    num_patches = len(
        env.moveCenters
    )**2  # env.moveCenters is num of patches along one side of image
    num_actions = num_discrete_states * num_patches * env.num_orientations  # total actions = num discrete states X num non-rotated descriptor patches X num of orientations per patch location

    # e-greedy exploration schedule. I find that starting at e=50% helps curriculum learning "remember" what was learned in the prior run.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        max_timesteps),
                                 initial_p=0.5,
                                 final_p=exploration_final_eps)

    # Set parameters for prioritized replay. You can turn this off by setting
    # prioritized_replay below to False.
    prioritized_replay = True
    #    prioritized_replay=False
    prioritized_replay_alpha = 0.6
    prioritized_replay_beta0 = 0.4
    prioritized_replay_beta_iters = None
    prioritized_replay_eps = 1e-6
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size,
                                                alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None
    beta = 1
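    # Note (added): with prioritized replay, the importance-sampling exponent
    # beta is annealed from beta0 = 0.4 up to 1.0 over training, so the bias
    # from non-uniform sampling is fully corrected by the end. The beta = 1
    # assignment above is only a placeholder; it is overwritten by
    # beta_schedule.value(t) in the training loop when prioritized replay is on.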

    # Create neural network
    q_func = models.cnn_to_mlp(convs=[(16, 3, 1), (32, 3, 1)],
                               hiddens=[64],
                               dueling=True)

    # Build tensorflow functions
    def make_obs_ph(name):
        return U.BatchInput(env.observation_space.spaces[0].shape, name=name)

    def make_actionDeic_ph(name):
        return U.BatchInput(descriptorShapeSmall, name=name)

    def make_target_ph(name):
        return U.BatchInput([1], name=name)

    def make_weight_ph(name):
        return U.BatchInput([1], name=name)

    getMoveActionDescriptorsNoRot = build_getMoveActionDescriptors(
        make_obs_ph=make_obs_ph,
        actionShape=descriptorShape,
        actionShapeSmall=descriptorShapeSmall,
        stride=env.stride)
    getMoveActionDescriptorsRot = build_getMoveActionDescriptorsRot(
        make_obs_ph=make_obs_ph,
        actionShape=descriptorShape,
        actionShapeSmall=descriptorShapeSmall,
        stride=env.stride,
        numOrientations=numOrientations)

    getqNotHoldingRot = build_getq(make_actionDeic_ph=make_actionDeic_ph,
                                   q_func=q_func,
                                   num_discrete_states=num_discrete_states,
                                   num_cascade=5,
                                   scope="deepq",
                                   qscope="q_func_notholding_rot",
                                   reuse=reuseModels)
    getqHoldingRot = build_getq(make_actionDeic_ph=make_actionDeic_ph,
                                q_func=q_func,
                                num_discrete_states=num_discrete_states,
                                num_cascade=5,
                                scope="deepq",
                                qscope="q_func_holding_rot",
                                reuse=reuseModels)

    targetTrainNotHoldingRot = build_targetTrain(
        make_actionDeic_ph=make_actionDeic_ph,
        make_target_ph=make_target_ph,
        make_weight_ph=make_weight_ph,
        q_func=q_func,
        num_discrete_states=num_discrete_states,
        num_cascade=5,
        optimizer=tf.train.AdamOptimizer(
            learning_rate=lr / 2.),  # rotation learns slower than norot
        #        optimizer=tf.train.GradientDescentOptimizer(learning_rate=lr/2.), # rotation learns slower than norot
        scope="deepq",
        qscope="q_func_notholding_rot",
        #        grad_norm_clipping=1.,
        reuse=reuseModels)

    targetTrainHoldingRot = build_targetTrain(
        make_actionDeic_ph=make_actionDeic_ph,
        make_target_ph=make_target_ph,
        make_weight_ph=make_weight_ph,
        q_func=q_func,
        num_discrete_states=num_discrete_states,
        num_cascade=5,
        optimizer=tf.train.AdamOptimizer(
            learning_rate=lr / 2.),  # rotation learns slower than norot
        #        optimizer=tf.train.GradientDescentOptimizer(learning_rate=lr/2.), # rotation learns slower than norot
        scope="deepq",
        qscope="q_func_holding_rot",
        #        grad_norm_clipping=1.,
        reuse=reuseModels)

    getqNotHoldingNoRot = build_getq(make_actionDeic_ph=make_actionDeic_ph,
                                     q_func=q_func,
                                     num_discrete_states=num_discrete_states,
                                     num_cascade=5,
                                     scope="deepq",
                                     qscope="q_func_notholding_norot",
                                     reuse=reuseModels)
    getqHoldingNoRot = build_getq(make_actionDeic_ph=make_actionDeic_ph,
                                  q_func=q_func,
                                  num_discrete_states=num_discrete_states,
                                  num_cascade=5,
                                  scope="deepq",
                                  qscope="q_func_holding_norot",
                                  reuse=reuseModels)

    targetTrainNotHoldingNoRot = build_targetTrain(
        make_actionDeic_ph=make_actionDeic_ph,
        make_target_ph=make_target_ph,
        make_weight_ph=make_weight_ph,
        q_func=q_func,
        num_discrete_states=num_discrete_states,
        num_cascade=5,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        #        optimizer=tf.train.GradientDescentOptimizer(learning_rate=lr),
        scope="deepq",
        qscope="q_func_notholding_norot",
        #        grad_norm_clipping=1.,
        reuse=reuseModels)

    targetTrainHoldingNoRot = build_targetTrain(
        make_actionDeic_ph=make_actionDeic_ph,
        make_target_ph=make_target_ph,
        make_weight_ph=make_weight_ph,
        q_func=q_func,
        num_discrete_states=num_discrete_states,
        num_cascade=5,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        #        optimizer=tf.train.GradientDescentOptimizer(learning_rate=lr),
        scope="deepq",
        qscope="q_func_holding_norot",
        #        grad_norm_clipping=1.,
        reuse=reuseModels)

    # Initialize tabular state-value function. There are only two states (holding, not holding), so this is very easy.
    lrState = 0.1
    V = np.zeros([
        2,
    ])

    # Start tensorflow session
    sess = U.make_session(num_cpu)
    sess.__enter__()

    # Initialize things
    obs = env.reset()
    episode_rewards = [0.0]
    timerStart = time.time()
    U.initialize()

    # Load neural network model if one was specified.
    if fileIn != "None":
        saver = tf.train.Saver()
        saver.restore(sess, fileIn)
        fileInV = fileIn + 'V.npy'
        V = np.load(fileInV)

    # Iterate over time steps
    for t in range(max_timesteps):

        # Get NoRot descriptors. Each x-y position gets one descriptor patch in
        # a single orientation. Encode pick/place using a stack of two image channels.
        # Pick actions are denoted by the patch in channel 0 and zeros in channel 1.
        # Place actions have zeros in channel 0 and the patch in channel 1.
        # Each elt of actionDescriptorsNoRot is a pick/place action to a specific
        # position with orientation left unspecified.
        moveDescriptorsNoRot = getMoveActionDescriptorsNoRot([obs[0]])
        moveDescriptorsNoRot = moveDescriptorsNoRot * 2 - 1
        actionsPickDescriptorsNoRot = np.stack(
            [moveDescriptorsNoRot,
             np.zeros(np.shape(moveDescriptorsNoRot))],
            axis=3)
        actionsPlaceDescriptorsNoRot = np.stack(
            [np.zeros(np.shape(moveDescriptorsNoRot)), moveDescriptorsNoRot],
            axis=3)
        actionDescriptorsNoRot = np.r_[actionsPickDescriptorsNoRot,
                                       actionsPlaceDescriptorsNoRot]

        # If useHandCodeHierarchy == 1, we exclude patches that are completely zero
        if useHandCodeHierarchy == 1:
            nonZeroMoves = np.sum(np.sum(moveDescriptorsNoRot > 0, -1), -1) > 0
            movesCandidates = np.nonzero(nonZeroMoves)[0]
            actionsCandidates = []
            for jj in range(0, num_discrete_states):
                for ii in range(0, env.num_orientations):
                    actionsCandidates = np.r_[actionsCandidates,
                                              movesCandidates +
                                              ii * env.num_moves +
                                              jj * env.num_orientations *
                                              env.num_moves]
            actionsCandidatesHandCodeHierarchy = np.int32(actionsCandidates)
            movesCandidatesHandCodeHierarchy = np.int32(movesCandidates)

        else:
            actionsCandidatesHandCodeHierarchy = range(
                num_discrete_states * env.num_moves * env.num_orientations)
            movesCandidatesHandCodeHierarchy = range(env.num_moves)

        # If useRotHierarchy == 1, we evaluate the Q function using a two-level hierarchy.
        # The first level (getq<Not>HoldingNoRot) is position but no rotation.
        # The second level (getq<Not>HoldingRot) is both position and orientation.
        # Specifically, we evaluate getq<Not>HoldingRot only for the top 20% of positions
        # found using getq<Not>HoldingNoRot.
        if useRotHierarchy == 1:

            # Get NoRot values
            if obs[1] == 0:
                qCurrPick = getqNotHoldingNoRot(actionsPickDescriptorsNoRot[
                    movesCandidatesHandCodeHierarchy])
                qCurrPlace = getqNotHoldingNoRot(actionsPlaceDescriptorsNoRot[
                    movesCandidatesHandCodeHierarchy])
            elif obs[1] == 1:
                qCurrPick = getqHoldingNoRot(actionsPickDescriptorsNoRot[
                    movesCandidatesHandCodeHierarchy])
                qCurrPlace = getqHoldingNoRot(actionsPlaceDescriptorsNoRot[
                    movesCandidatesHandCodeHierarchy])
            else:
                print("error: state out of bounds")
            qCurrNoRot = np.squeeze(np.r_[qCurrPick, qCurrPlace])
            qCurrNoRotIdx = np.r_[movesCandidatesHandCodeHierarchy,
                                  env.num_moves +
                                  movesCandidatesHandCodeHierarchy]

            # Get Rot actions corresponding to top k% NoRot actions
            k = 0.2  # top k% of NoRot actions
            #            k=0.1 # DEBUG: TRYING TO VISUALIZE AND RAN OUT OF MEM ON LAPTOP...
            valsNoRot = qCurrNoRot
            topKactionsNoRot = np.argsort(
                valsNoRot)[-np.int32(np.shape(valsNoRot)[0] * k):]
            topKpositionsNoRot = qCurrNoRotIdx[topKactionsNoRot] % env.num_moves
            topKpickplaceNoRot = qCurrNoRotIdx[topKactionsNoRot] // env.num_moves  # integer division: 0 = pick, 1 = place
            actionsCandidates = []
            for ii in range(2):
                eltsPos = topKpositionsNoRot[topKpickplaceNoRot == ii]
                for jj in range(env.num_orientations):
                    actionsCandidates = np.r_[
                        actionsCandidates, eltsPos + jj * env.num_moves + ii *
                        (env.num_moves * env.num_orientations)]
            actionsCandidatesRotHierarchy = np.int32(actionsCandidates)

        # No rot hierarchy
        else:
            actionsCandidatesRotHierarchy = range(
                num_discrete_states * env.num_moves * env.num_orientations)

        # Intersect two types of hierarchy and get final list of actions to consider
        actionsCandidates = np.intersect1d(actionsCandidatesRotHierarchy,
                                           actionsCandidatesHandCodeHierarchy)

        # Get all patch descriptors (position + rotation)
        moveDescriptorsRot = getMoveActionDescriptorsRot([obs[0]])
        moveDescriptorsRot = moveDescriptorsRot * 2 - 1
        actionsPickDescriptorsRot = np.stack(
            [moveDescriptorsRot,
             np.zeros(np.shape(moveDescriptorsRot))],
            axis=3)
        actionsPlaceDescriptorsRot = np.stack(
            [np.zeros(np.shape(moveDescriptorsRot)), moveDescriptorsRot],
            axis=3)
        actionDescriptorsRot = np.r_[actionsPickDescriptorsRot,
                                     actionsPlaceDescriptorsRot]

        # Get qCurr for selected actions, i.e. actions contained in actionCandidates
        actionDescriptorsRotReduced = actionDescriptorsRot[actionsCandidates]
        if obs[1] == 0:
            qCurrReduced = np.squeeze(
                getqNotHoldingRot(actionDescriptorsRotReduced))
        elif obs[1] == 1:
            qCurrReduced = np.squeeze(
                getqHoldingRot(actionDescriptorsRotReduced))
        else:
            print("error: state out of bounds")
        qCurr = -100 * np.ones(np.shape(actionDescriptorsRot)[0])
        qCurr[actionsCandidates] = np.copy(qCurrReduced)

        # Update tabular state-value function using V(s) = max_a Q(s,a)
        thisStateValues = np.max(qCurr)
        V[obs[1]] = (1 - lrState) * V[obs[1]] + lrState * thisStateValues

        #        # Select e-greedy action to execute
        #        qCurrNoise = qCurr + np.random.random(np.shape(qCurr))*0.01 # add small amount of noise to break ties randomly
        #        action = np.argmax(qCurrNoise)
        #        if (np.random.rand() < exploration.value(t)) and not vispolicy:
        #            action = np.random.randint(num_actions)

        # e-greedy + softmax action selection
        qCurrExp = np.exp(qCurr / 0.1)
        probs = qCurrExp / np.sum(qCurrExp)
        action = np.random.choice(range(np.size(probs)), p=probs)
        if (np.random.rand() < exploration.value(t)) and not vispolicy:
            action = np.random.randint(num_actions)
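        # Note (added): the lines above implement softmax (Boltzmann)
        # exploration with temperature 0.1, i.e. p(a) is proportional to
        # exp(Q(a)/0.1), layered under epsilon-greedy: with probability
        # exploration.value(t) the sampled action is replaced by a uniformly
        # random one. Non-candidate actions were set to Q = -100 above, so the
        # softmax assigns them essentially zero probability.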

        # Factor action into position, orientation, and pick-or-place using
        # integer arithmetic:
        # action = pickplace * num_moves * num_orientations + orientation * num_moves + position
        position = action % env.num_moves
        pickplace = action // (env.num_moves * env.num_orientations)
        orientation = (action - pickplace * env.num_moves *
                       env.num_orientations) // env.num_moves
        actionNoRot = position + pickplace * env.num_moves

        if vispolicy:
            print("action: " + str(action))
            print("position: " + str(position))
            print("pickplace: " + str(pickplace))
            print("orientation: " + str(orientation))
            plt.subplot(1, 2, 1)
            plt.imshow(env.state[0][:, :, 0])
            sp.misc.imsave('temp1.png', env.state[0][:, :, 0])

        # Execute action
        new_obs, rew, done, _ = env.step(action)

        # Add to buffer
        replay_buffer.add(cp.copy(obs[1]),
                          np.copy(actionDescriptorsNoRot[actionNoRot, :]),
                          np.copy(actionDescriptorsRot[action, :]),
                          cp.copy(rew), cp.copy(new_obs[1]),
                          cp.copy(float(done)))

        # If vispolicy==True, then visualize policy
        if vispolicy:
            print("rew: " + str(rew))
            print("done: " + str(done))
            plt.subplot(1, 2, 2)
            plt.imshow(env.state[0][:, :, 0])
            plt.show()
            sp.misc.imsave('temp2.png', env.state[0][:, :, 0])

        if t > learning_starts and t % train_freq == 0:

            # Get batch
            if prioritized_replay:
                beta = beta_schedule.value(t)
                states_t, actionPatchesNoRot, actionPatchesRot, rewards, states_tp1, dones, weights, batch_idxes = replay_buffer.sample(
                    batch_size, beta)
            else:
                states_t, actionPatchesNoRot, actionPatchesRot, rewards, states_tp1, dones = replay_buffer.sample(
                    batch_size)
                weights, batch_idxes = np.ones_like(rewards), None

            # Calculate target
            targets = rewards + (1 - dones) * gamma * V[states_tp1]
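            # Note (added): the bootstrap term uses the small tabular estimate
            # V[s'] over the two discrete states (holding / not holding)
            # instead of re-evaluating max_a Q(s', a) over every patch, which
            # keeps each training step cheap.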

            # Get current q-values and calculate td error and q-value targets
            qCurrTargetNotHolding = getqNotHoldingRot(actionPatchesRot)
            qCurrTargetHolding = getqHoldingRot(actionPatchesRot)
            qCurrTarget = np.concatenate(
                [qCurrTargetNotHolding, qCurrTargetHolding], axis=1)
            td_error = qCurrTarget[range(batch_size), states_t] - targets
            qCurrTarget[range(batch_size), states_t] = targets

            # Train
            targetTrainNotHoldingRot(
                actionPatchesRot, np.reshape(qCurrTarget[:, 0],
                                             [batch_size, 1]),
                np.reshape(weights, [batch_size, 1]))
            targetTrainHoldingRot(
                actionPatchesRot, np.reshape(qCurrTarget[:, 1],
                                             [batch_size, 1]),
                np.reshape(weights, [batch_size, 1]))

            targetTrainNotHoldingNoRot(
                actionPatchesNoRot,
                np.reshape(qCurrTarget[:, 0], [batch_size, 1]),
                np.reshape(weights, [batch_size, 1]))
            targetTrainHoldingNoRot(
                actionPatchesNoRot,
                np.reshape(qCurrTarget[:, 1], [batch_size, 1]),
                np.reshape(weights, [batch_size, 1]))

            # Update replay priorities using td_error
            if prioritized_replay:
                new_priorities = np.abs(td_error) + prioritized_replay_eps
                replay_buffer.update_priorities(batch_idxes, new_priorities)

        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(
                episode_rewards) % print_freq == 0:
            timerFinal = time.time()
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " +
                  str(int(100 * exploration.value(t))) + ", time elapsed: " +
                  str(timerFinal - timerStart))
            timerStart = timerFinal

        obs = np.copy(new_obs)

    # save learning curve
    filename = 'PA18_deictic_rewards.dat'
    np.savetxt(filename, episode_rewards)

    # save what we learned
    if fileOut != "None":
        saver = tf.train.Saver()
        saver.save(sess, fileOut)
        fileOutV = fileOut + 'V'
        print("fileOutV: " + fileOutV)
        np.save(fileOutV, V)

    # Display value function from this run
    obs = env.reset()

    moveDescriptorsNoRot = getMoveActionDescriptorsNoRot([obs[0]])
    moveDescriptorsNoRot = moveDescriptorsNoRot * 2 - 1
    actionsPickDescriptors = np.stack(
        [moveDescriptorsNoRot,
         np.zeros(np.shape(moveDescriptorsNoRot))],
        axis=3)
    actionsPlaceDescriptors = np.stack(
        [np.zeros(np.shape(moveDescriptorsNoRot)), moveDescriptorsNoRot],
        axis=3)
    qPickNotHoldingNoRot = getqNotHoldingNoRot(actionsPickDescriptors)
    qPickHoldingNoRot = getqHoldingNoRot(actionsPickDescriptors)
    qPickNoRot = np.concatenate([qPickNotHoldingNoRot, qPickHoldingNoRot],
                                axis=1)
    qPlaceNotHoldingNoRot = getqNotHoldingNoRot(actionsPlaceDescriptors)
    qPlaceHoldingNoRot = getqHoldingNoRot(actionsPlaceDescriptors)
    qPlaceNoRot = np.concatenate([qPlaceNotHoldingNoRot, qPlaceHoldingNoRot],
                                 axis=1)

    moveDescriptors = getMoveActionDescriptorsRot([obs[0]])
    moveDescriptors = moveDescriptors * 2 - 1
    actionsPickDescriptors = np.stack(
        [moveDescriptors, np.zeros(np.shape(moveDescriptors))], axis=3)
    actionsPlaceDescriptors = np.stack(
        [np.zeros(np.shape(moveDescriptors)), moveDescriptors], axis=3)
    qPickNotHolding = getqNotHoldingRot(actionsPickDescriptors)
    qPickHolding = getqHoldingRot(actionsPickDescriptors)
    qPick = np.concatenate([qPickNotHolding, qPickHolding], axis=1)
    qPlaceNotHolding = getqNotHoldingRot(actionsPlaceDescriptors)
    qPlaceHolding = getqHoldingRot(actionsPlaceDescriptors)
    qPlace = np.concatenate([qPlaceNotHolding, qPlaceHolding], axis=1)

    gridSize = len(env.moveCenters)
    print("Value function for pick action in hold-0 state:")
    print(str(np.reshape(qPickNoRot[:gridSize**2, 0], [gridSize, gridSize])))
    for ii in range(env.num_orientations):
        print("Value function for pick action for rot" + str(ii) +
              " in hold-0 state:")
        print(
            str(
                np.reshape(
                    qPick[ii * (gridSize**2):(ii + 1) * (gridSize**2), 0],
                    [gridSize, gridSize])))

    print("Value function for place action in hold-1 state:")
    print(str(np.reshape(qPlaceNoRot[:gridSize**2, 1], [gridSize, gridSize])))
    for ii in range(env.num_orientations):
        print("Value function for place action for rot" + str(ii) +
              " in hold-1 state:")
        print(
            str(
                np.reshape(
                    qPlace[ii * (gridSize**2):(ii + 1) * (gridSize**2), 1],
                    [gridSize, gridSize])))
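# --- Illustrative sketch (not from the original example) ---
# The useRotHierarchy branch above scores every position with the cheap
# position-only ("NoRot") network, keeps the top k = 20% of positions, and only
# evaluates the more expensive position+orientation ("Rot") network on those.
# A minimal, self-contained sketch of that pruning step using plain numpy;
# top_k_candidates and its arguments are hypothetical names.
import numpy as np


def top_k_candidates(q_coarse, num_orientations, k=0.2):
    """Expand the top k fraction of coarse positions to fine-level indices.

    q_coarse has one entry per position; the fine level has num_orientations
    actions per position, laid out with position varying fastest.
    """
    num_positions = q_coarse.size
    keep = max(1, int(np.ceil(k * num_positions)))
    best_positions = np.argsort(q_coarse)[-keep:]  # indices of the best positions
    return np.concatenate(
        [best_positions + r * num_positions for r in range(num_orientations)])

# Example usage (random scores in place of getqNotHoldingNoRot(...) output):
# candidates = top_k_candidates(np.random.rand(100), num_orientations=4)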
Exemplo n.º 14
0
def main():

    #    env = envstandalone.BallCatch()
    env = envstandalone.TestRob3Env()

    max_timesteps = 40000
    learning_starts = 1000
    buffer_size = 50000
    #    buffer_size=1
    exploration_fraction = 0.2
    exploration_final_eps = 0.02
    print_freq = 10
    gamma = .98
    target_network_update_freq = 500
    learning_alpha = 0.2

    batch_size = 32
    #    batch_size=1
    train_freq = 1

    obsShape = (8, 8, 1)
    deicticShape = (3, 3, 1)
    num_deictic_patches = 36

    num_actions = 4
    episode_rewards = [0.0]
    num_cpu = 16

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # same as getDeictic except this one just calculates for the observation
    # input: n x n x channels
    # output: dn x dn x channels
    def getDeicticObs(obs):
        windowLen = deicticShape[0]
        deicticObs = []
        for i in range(np.shape(obs)[0] - windowLen + 1):
            for j in range(np.shape(obs)[1] - windowLen + 1):
                deicticObs.append(obs[i:i + windowLen, j:j + windowLen, :])
        return np.array(deicticObs)

    # Same as getDeicticObs, but it operates on a batch rather than a single obs
    # input: obs -> batches x glances x 3 x 3 x 4
    def getDeicticObsBatch(obs):
        obsShape = np.shape(obs)
        deicticObsBatch = []
        for batch in range(obsShape[0]):
            deicticObsBatch.append(getDeicticObs(obs[batch]))
        return (np.array(deicticObsBatch))

    # input: batch x nxnx1 tensor of observations
    def convertState(observations):
        shape = np.shape(observations)
        observations_small = np.squeeze(observations)
        agent_pos = np.nonzero(observations_small == 10)
        ghost_pos = np.nonzero(observations_small == 20)
        state_numeric = 3 * np.ones((4, shape[0]))
        state_numeric[0, agent_pos[0]] = agent_pos[1]
        state_numeric[1, agent_pos[0]] = agent_pos[2]
        state_numeric[2, ghost_pos[0]] = ghost_pos[1]
        state_numeric[3, ghost_pos[0]] = ghost_pos[2]
        return np.int32(state_numeric)

    def convertStateBatch(observations):
        shape = np.shape(observations)
        state_numeric_batch = []
        for batch in range(shape[0]):
            state_numeric_batch.append(convertState(observations[batch]))
        return (np.array(state_numeric_batch))

    # conv model parameters: (num_outputs, kernel_size, stride)
    model = models.cnn_to_mlp(
        convs=[(16, 3, 1)],
        #        convs=[(16,2,1)],
        #        convs=[(32,3,1)],
        hiddens=[16],
        #        hiddens=[64],
        #        dueling=True
        dueling=False)

    q_func = model
    #    lr=1e-3
    lr = 0.001

    def make_obs_ph(name):
        return U.BatchInput(deicticShape, name=name)
#        return U.BatchInput(obsShape, name=name)

    def make_target_ph(name):
        return U.BatchInput([num_actions], name=name)

    sess = U.make_session(num_cpu)
    sess.__enter__()

    getq, targetTrain = build_graph.build_train_nodouble(
        make_obs_ph=make_obs_ph,
        make_target_ph=make_target_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        grad_norm_clipping=10,
        double_q=False)

    # Initialize the parameters and copy them to the target network.
    U.initialize()

    replay_buffer = ReplayBuffer(buffer_size)
    obs = env.reset()

    #    tabularQ = 100*np.ones([deicticShape[0]+1,deicticShape[1]+1,deicticShape[0]+1,deicticShape[1]+1, num_actions])
    tabularQ = 0 * np.ones([
        deicticShape[0] + 1, deicticShape[1] + 1, deicticShape[0] + 1,
        deicticShape[1] + 1, num_actions
    ])

    timerStart = time.time()
    for t in range(max_timesteps):

        obsDeictic = getDeicticObs(obs)

        # get q: neural network
        qCurr = getq(np.array(obsDeictic))

        #        # get q: tabular
        #        stateCurr = convertState(obsDeictic)
        #        qCurr = tabularQ[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],:]

        # select action
        qCurrNoise = qCurr + np.random.random(np.shape(
            qCurr)) * 0.01  # add small amount of noise to break ties randomly
        action = np.argmax(np.max(qCurrNoise, 0))
        selPatch = np.argmax(np.max(qCurrNoise, 1))
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(env.action_space.n)

        # take action
        new_obs, rew, done, _ = env.step(action)
        replay_buffer.add(obs, action, rew, new_obs, float(done))

        # sample from replay buffer and train
        if t > learning_starts and t % train_freq == 0:
            #        if t > max_timesteps:

            # Sample from replay buffer
            obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
                batch_size)

            # Put observations in deictic form
            obses_t_deic = getDeicticObsBatch(obses_t)
            obses_tp1_deic = getDeicticObsBatch(obses_tp1)

            # Reshape everything to (1152,) form
            obs_resize_to_network = [
                batch_size * num_deictic_patches, deicticShape[0],
                deicticShape[1], deicticShape[2]
            ]
            obses_t_deic = np.reshape(obses_t_deic, obs_resize_to_network)
            obses_tp1_deic = np.reshape(obses_tp1_deic, obs_resize_to_network)
            donesTiled = np.repeat(dones, num_deictic_patches)
            rewardsTiled = np.repeat(rewards, num_deictic_patches)
            actionsTiled = np.repeat(actions, num_deictic_patches)

            # Get curr, next values: neural network version
            qNext = getq(obses_tp1_deic)
            qCurr = getq(obses_t_deic)

            #            # Get curr, next values: tabular version
            #            q_resize_from_network = [batch_size*num_deictic_patches,num_actions]
            #            stateNext = convertStateBatch(obses_tp1_deic)
            #            qNext = tabularQ[stateNext[:,0,:], stateNext[:,1,:], stateNext[:,2,:], stateNext[:,3,:],:]
            #            qNext = np.reshape(qNext,q_resize_from_network)
            #            stateCurr = convertStateBatch(obses_t_deic)
            #            qCurr = tabularQ[stateCurr[:,0,:], stateCurr[:,1,:], stateCurr[:,2,:], stateCurr[:,3,:],:]
            #            qCurr = np.reshape(qCurr,q_resize_from_network)

            # Get "raw" targets (no masking for cascade levels)
            qNextmax = np.max(qNext, 1)
            targets = rewardsTiled + (1 - donesTiled) * gamma * qNextmax

            # Update values: neural network version
            qCurrTargets = np.copy(qCurr)
            qCurrTargets[range(batch_size * num_deictic_patches),
                         actionsTiled] = targets

            td_error_out, obses_deic_out, targets_out = targetTrain(
                obses_t_deic, qCurrTargets)


#            # Update values: tabular version
#            stateCurrTiled = np.reshape(np.rollaxis(stateCurr,1),[num_actions,batch_size*num_deictic_patches])
#            tabularQ[stateCurrTiled[0,:], stateCurrTiled[1,:], stateCurrTiled[2,:], stateCurrTiled[3,:],actionsTiled] = \
#                (1 - learning_alpha) * tabularQ[stateCurrTiled[0,:], stateCurrTiled[1,:], stateCurrTiled[2,:], stateCurrTiled[3,:],actionsTiled] \
#                + learning_alpha * targets

# bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(
                episode_rewards) % print_freq == 0:
            timerFinal = time.time()
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " +
                  str(int(100 * exploration.value(t))) + ", time elapsed: " +
                  str(timerFinal - timerStart))
            timerStart = timerFinal

        obs = new_obs
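# --- Illustrative sketch (not from the original example) ---
# getDeicticObs above slides a windowLen x windowLen window over the
# observation and stacks every patch, and the network is then applied to the
# whole batch of patches. A compact, self-contained version of that extraction
# using only numpy; extract_patches is a hypothetical name.
import numpy as np


def extract_patches(obs, window):
    """Return every window x window x C patch of an H x W x C observation."""
    h, w, _ = obs.shape
    return np.array([obs[i:i + window, j:j + window, :]
                     for i in range(h - window + 1)
                     for j in range(w - window + 1)])

# Example usage: an 8x8x1 observation with 3x3 windows yields 36 patches,
# matching num_deictic_patches in the example above.
# patches = extract_patches(np.zeros((8, 8, 1)), 3)  # shape (36, 3, 3, 1)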
Exemplo n.º 15
0
class Agent:

    def __init__(self, net, actionSet, goalSet, defaultNSample, defaultRandomPlaySteps, controllerMemCap, explorationSteps, trainFreq, hard_update,
                 controllerEpsilon=defaultControllerEpsilon):
        self.actionSet = actionSet
        self.controllerEpsilon = controllerEpsilon
        self.goalSet = goalSet
        self.nSamples = defaultNSample 
        self.gamma = defaultGamma
        self.net = net
        self.memory = PrioritizedReplayBuffer(controllerMemCap, alpha=prioritized_replay_alpha)
        self.enable_double_dqn = True
        self.exploration = LinearSchedule(schedule_timesteps = explorationSteps, initial_p = 1.0, final_p = 0.02)
        self.defaultRandomPlaySteps = defaultRandomPlaySteps
        self.trainFreq = trainFreq
        self.randomPlay = True
        self.learning_done = False
        self.hard_update = hard_update

    def selectMove(self, state):
        if not self.learning_done:
            if self.controllerEpsilon < random.random():
                return np.argmax(self.net.controllerNet.predict([np.reshape(state, (1, 84, 84, 4))], verbose=0))
                #return np.argmax(self.net.controllerNet.predict([np.reshape(state, (1, 84, 84, 4)), dummyYtrue, dummyMask], verbose=0)[1])
            return random.choice(self.actionSet)
        else:
            return np.argmax(self.simple_net.predict([np.reshape(state, (1, 84, 84, 4))], verbose=0))

    def setControllerEpsilon(self, epsilonArr):
        self.controllerEpsilon = epsilonArr

    def criticize(self, reachGoal, action, die, distanceReward, useSparseReward):
        reward = 0.0
        if reachGoal:
            reward += 1.0
            #reward += 50.0
        if die:
            reward -= 1.0
        if not useSparseReward:
            reward += distanceReward
        reward = np.minimum(reward, maxReward)
        reward = np.maximum(reward, minReward)
        return reward

    def store(self, experience):
        self.memory.add(experience.state, experience.action, experience.reward, experience.next_state, experience.done)
        #self.memory.add(np.abs(experience.reward), experience)

    def compile(self):
        def huber_loss(y_true, y_pred, clip_value):
            assert clip_value > 0.

            x = y_true - y_pred
            if np.isinf(clip_value):
                return .5 * K.square(x)

            condition = K.abs(x) < clip_value
            squared_loss = .5 * K.square(x)
            linear_loss = clip_value * (K.abs(x) - .5 * clip_value)
            if K.backend() == 'tensorflow':
                import tensorflow as tf
                if hasattr(tf, 'select'):
                    return tf.select(condition, squared_loss, linear_loss)  # condition, true, false
                else:
                    return tf.where(condition, squared_loss, linear_loss)  # condition, true, false
            elif K.backend() == 'theano':
                from theano import tensor as T
                return T.switch(condition, squared_loss, linear_loss)
            else:
                raise RuntimeError('Unknown backend "{}".'.format(K.backend()))

            
        def clipped_masked_error(args):
            y_true, y_pred, mask = args
            loss = huber_loss(y_true, y_pred, 1)
            loss *= mask  # apply element-wise mask
            return K.sum(loss, axis=-1)
        # Create trainable model. The problem is that we need to mask the output since we only
        # ever want to update the Q values for a certain action. The way we achieve this is by
        # using a custom Lambda layer that computes the loss. This gives us the necessary flexibility
        # to mask out certain parameters by passing in multiple inputs to the Lambda layer.
        y_pred = self.net.controllerNet.output
        y_true = Input(name='y_true', shape=(nb_Action,))
        mask = Input(name='mask', shape=(nb_Action,))
        loss_out = Lambda(clipped_masked_error, output_shape=(1,), name='loss')([y_pred, y_true, mask])
        ins = [self.net.controllerNet.input] if type(self.net.controllerNet.input) is not list else self.net.controllerNet.input
        trainable_model = Model(inputs=ins + [y_true, mask], outputs=[loss_out, y_pred])
        assert len(trainable_model.output_names) == 2
        #combined_metrics = {trainable_model.output_names[1]: metrics}
        losses = [
            lambda y_true, y_pred: y_pred,  # loss is computed in Lambda layer
            lambda y_true, y_pred: K.zeros_like(y_pred),  # we only include this for the metrics
        ]
        rmsProp = optimizers.RMSprop(lr=LEARNING_RATE, rho=0.95, epsilon=1e-08, decay=0.0)
        trainable_model.compile(optimizer=rmsProp, loss=losses)
        self.trainable_model = trainable_model
        self.compiled = True

    def _update(self, stepCount):
        batches = self.memory.sample(self.nSamples, beta=beta_schedule.value(stepCount))
        (stateVector, actionVector, rewardVector, nextStateVector, doneVector, importanceVector, idxVector) = batches
        
        stateVector = np.asarray(stateVector)
        nextStateVector = np.asarray(nextStateVector)
        
        q_values = self.net.controllerNet.predict(stateVector)
        assert q_values.shape == (self.nSamples, nb_Action)
        if self.enable_double_dqn:
            # Double DQN: the online network selects the argmax action for the
            # next state and the target network evaluates it.
            q_values_tp1 = self.net.controllerNet.predict(nextStateVector)
            actions = np.argmax(q_values_tp1, axis=1)
            assert actions.shape == (self.nSamples,)

            target_q_values = self.net.targetControllerNet.predict(nextStateVector)
            assert target_q_values.shape == (self.nSamples, nb_Action)
            q_batch = target_q_values[range(self.nSamples), actions]
            assert q_batch.shape == (self.nSamples,)
        else:
            target_q_values = self.net.targetControllerNet.predict(nextStateVector)
            q_batch = np.max(target_q_values, axis=1)
            assert q_batch.shape == (self.nSamples,)

        targets = np.zeros((self.nSamples, nb_Action))
        dummy_targets = np.zeros((self.nSamples,))
        masks = np.zeros((self.nSamples, nb_Action))

        # Compute r_t + gamma * max_a Q(s_t+1, a) and update the target targets accordingly,
        # but only for the affected output units (as given by action_batch).
        discounted_reward_batch = self.gamma * q_batch
        # Set discounted reward to zero for all states that were terminal.
        terminalBatch = np.array([1-float(done) for done in doneVector])
        assert terminalBatch.shape == (self.nSamples,)
        discounted_reward_batch *= terminalBatch
        reward_batch = np.array(rewardVector)
        action_batch = np.array(actionVector)
        assert discounted_reward_batch.shape == reward_batch.shape
        Rs = reward_batch + discounted_reward_batch
        for idx, (target, mask, R, action) in enumerate(zip(targets, masks, Rs, action_batch)):
            target[action] = R  # update action with estimated accumulated reward
            dummy_targets[idx] = R
            mask[action] = 1.  # enable loss for this specific action
        td_errors = targets[range(self.nSamples), action_batch] - q_values[range(self.nSamples), action_batch]
        
        new_priorities = np.abs(td_errors) + prioritized_replay_eps
        self.memory.update_priorities(idxVector, new_priorities)
        
        targets = np.array(targets).astype('float32')
        masks = np.array(masks).astype('float32')

        
        # Finally, perform a single update on the entire batch. We use a dummy target since
        # the actual loss is computed in a Lambda layer that needs more complex input. However,
        # it is still useful to know the actual target to compute metrics properly.
        ins = [stateVector] if type(self.net.controllerNet.input) is not list else stateVector
        if stepCount >= self.defaultRandomPlaySteps:
            loss = self.trainable_model.train_on_batch(ins + [targets, masks], [dummy_targets, targets], sample_weight = [np.array(importanceVector), np.ones(self.nSamples)])
        else:
            loss = [0.0,0.0,0.0]
        
        if stepCount > self.defaultRandomPlaySteps and stepCount % self.hard_update == 0:
            self.net.targetControllerNet.set_weights(self.net.controllerNet.get_weights())
        return loss[1], np.mean(q_values), np.mean(np.abs(td_errors))
        

    def update(self, stepCount):
        loss = self._update(stepCount)
        return loss

    def annealControllerEpsilon(self, stepCount, option_learned):
        if not self.randomPlay:
            if option_learned:
                self.controllerEpsilon = 0.0
            else:
                if stepCount > self.defaultRandomPlaySteps:
                    self.controllerEpsilon = self.exploration.value(stepCount - self.defaultRandomPlaySteps)
                    #self.controllerEpsilon[goal] = exploration.value(stepCount - defaultRandomPlaySteps)
    def clear_memory(self, goal):
        self.learning_done = True ## Set the done learning flag
        del self.trainable_model
        del self.memory

        gpu = self.net.gpu

        del self.net

        gc.collect()

        rmsProp = optimizers.RMSprop(lr=LEARNING_RATE, rho=0.95, epsilon=1e-08, decay=0.0)

        with tf.device('/gpu:'+str(gpu)):
            self.simple_net = Sequential()
            self.simple_net.add(Conv2D(32, (8,8), strides = 4, activation = 'relu', padding = 'valid', input_shape = (84,84,4)))
            self.simple_net.add(Conv2D(64, (4,4), strides = 2, activation = 'relu', padding = 'valid'))
            self.simple_net.add(Conv2D(64, (3,3), strides = 1, activation = 'relu', padding = 'valid'))
            self.simple_net.add(Flatten())
            self.simple_net.add(Dense(HIDDEN_NODES, activation = 'relu', kernel_initializer = initializers.random_normal(stddev=0.01, seed = SEED)))
            self.simple_net.add(Dense(nb_Action, activation = 'linear', kernel_initializer = initializers.random_normal(stddev=0.01, seed = SEED)))
            self.simple_net.compile(loss = 'mse', optimizer = rmsProp)
            self.simple_net.load_weights(recordFolder+'/policy_subgoal_' + str(goal) + '.h5')
            self.simple_net.reset_states()
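# --- Illustrative sketch (not from the original example) ---
# Agent._update above builds its targets with the double-DQN rule: the online
# network picks the argmax action for the next state, the target network
# evaluates it, and terminal transitions drop the bootstrap term. A minimal
# numpy version of just that target computation; double_dqn_targets and its
# arguments are hypothetical names, with arrays standing in for the two
# networks' predictions.
import numpy as np


def double_dqn_targets(q_online_tp1, q_target_tp1, rewards, dones, gamma=0.99):
    """r + gamma * Q_target(s', argmax_a Q_online(s', a)), zeroed at terminals."""
    best_actions = np.argmax(q_online_tp1, axis=1)
    bootstrap = q_target_tp1[np.arange(len(best_actions)), best_actions]
    return rewards + gamma * (1.0 - dones) * bootstrap

# Example usage with a batch of 4 transitions and 3 actions:
# y = double_dqn_targets(np.random.rand(4, 3), np.random.rand(4, 3),
#                        rewards=np.zeros(4), dones=np.zeros(4))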
Exemplo n.º 16
0
def main():

    env = envstandalone.MultiGhostEvade()
#    env = envstandalone.GhostEvade()
#    env = envstandalone.BallCatch()
    
    max_timesteps=40000
#    max_timesteps=80000
    learning_starts=1000
#    buffer_size=50000
    buffer_size=1000
#    exploration_fraction=0.2
    exploration_fraction=0.4
    exploration_final_eps=0.02
    print_freq=10
    gamma=.98
#    target_network_update_freq=500
#    target_network_update_freq=100
#    target_network_update_freq=10
    target_network_update_freq=1
    learning_alpha = 0.2
    
#    batch_size=32
#    batch_size=64
    batch_size=512
#    batch_size=1024
    train_freq=1

    obsShape = (8,8,1)
#    deicticShape = (3,3,2)
#    deicticShape = (3,3,4)
#    deicticShape = (4,4,2)
#    deicticShape = (4,4,4)
    deicticShape = (5,5,2)
#    deicticShape = (6,6,2)
#    deicticShape = (8,8,2)
#    num_deictic_patches = 36
#    num_deictic_patches = 25
    num_deictic_patches = 16
#    num_deictic_patches = 9
#    num_deictic_patches = 1

#    num_actions = 4
#    num_actions = 3
    num_actions = env.action_space.n

    episode_rewards = [0.0]
    num_cpu=16
    num_cascade = 5
    
    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)


#    # CNN version
#    # conv model parameters: (num_outputs, kernel_size, stride)
#    model = models.cnn_to_mlp(
###    model = models.cnn_to_mlp_2pathways(
###        convs=[(16,3,1)],
#        convs=[(32,3,1)],
###        convs=[(32,4,1)],
###        convs=[(16,4,1)],
##        hiddens=[16],
#        hiddens=[32],
#        dueling=True
#    )
    
    # MLP version
#    model = models.mlp([8, 16])
#    model = models.mlp([16, 16])
#    model = models.mlp([16, 32])
#    model = models.mlp([16, 16])
#    model = models.mlp([32, 32])
#    model = models.mlp([32])
    model = models.mlp([])

    q_func=model
#    lr=0.01
    lr=0.001
#    lr=0.0005
    
    def make_obs_ph(name):
        return U.BatchInput(obsShape, name=name)
    
    def make_obsDeic_ph(name):

#        # CNN version
#        return U.BatchInput(deicticShape, name=name)
        
        # MLP version
        return U.BatchInput([deicticShape[0]*deicticShape[1]*deicticShape[2]], name=name)

    def make_target_ph(name):
#        return U.BatchInput([num_actions], name=name)
        return U.BatchInput([num_cascade,num_actions], name=name)

    sess = U.make_session(num_cpu)
    sess.__enter__()

    getq = build_getq(
            make_obsDeic_ph=make_obsDeic_ph,
            q_func=q_func,
            num_actions=num_actions,
            num_cascade=num_cascade,
            scope="deepq",
            qscope="q_func"
            )
    
    getqTarget = build_getq(
            make_obsDeic_ph=make_obsDeic_ph,
            q_func=q_func,
            num_actions=num_actions,
            num_cascade=num_cascade,
            scope="deepq",
            qscope="q_func_target"
            )

    update_target = build_update_target(scope="deepq", 
                                        qscope="q_func",
                                        qscopeTarget="q_func_target")
                      
    targetTrain = build_targetTrain(
        make_obsDeic_ph=make_obsDeic_ph,
        make_target_ph=make_target_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        num_cascade=num_cascade,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
#        optimizer=tf.train.GradientDescentOptimizer(learning_rate=lr),
        scope="deepq", 
        qscope="q_func",
        grad_norm_clipping=1.
#        grad_norm_clipping=0.1
    )
    
    getDeic = build_getDeic_Foc(make_obs_ph=make_obs_ph,deicticShape=deicticShape)
#    getDeic = build_getDeic_FocCoarse(make_obs_ph=make_obs_ph,deicticShape=deicticShape)
    
    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()
    
    replay_buffer = ReplayBuffer(buffer_size)
    obs = env.reset()

    timerStart = time.time()
    for t in range(max_timesteps):

        obsDeictic = getDeic([obs])

##       CNN version
#        qCurr = getq(np.array(obsDeictic))
        
        # MLP version
        qCurr = getq(np.reshape(obsDeictic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]]))


        # select action
        qCurrNoise = qCurr + np.random.random(np.shape(qCurr))*0.01 # add small amount of noise to break ties randomly
        action = np.argmax(np.max(qCurrNoise[:,-1,:],0)) # USE CASCADE
#        action = np.argmax(np.max(qCurrNoise[:,0,:],0)) # DO NOT USE CASCADE
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(env.action_space.n)

        # take action
        new_obs, rew, done, _ = env.step(action)
        replay_buffer.add(obs, action, rew, new_obs, float(done))

        # MONTE CARLO VERSION
        # update rewards to actual monte carlo experiences
        if done:
            replay_buffer.update_montecarlo(gamma)
            
        # sample from replay buffer and train
        if t > learning_starts and t % train_freq == 0:

            # Sample from replay buffer
            obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)

            # Put observations in deictic form
            obses_t_deic = getDeic(obses_t)
            obses_tp1_deic = getDeic(obses_tp1)
#            obses_t_deic = getDeic(obses_t)[:,:,:,0:2]
#            obses_tp1_deic = getDeic(obses_tp1)[:,:,:,0:2]
            
            # Reshape everything to (1152,) form
            donesTiled = np.repeat(dones,num_deictic_patches)
            rewardsTiled = np.repeat(rewards,num_deictic_patches)
            actionsTiled = np.repeat(actions,num_deictic_patches)
            
#            # Get curr, next values: CNN version: NO ROTATION-AUGMENTATION 
#            qNextTarget = getqTarget(obses_tp1_deic)
#            qNext = getq(obses_tp1_deic)
#            qCurr = getq(obses_t_deic)
            
            # Get curr, next values: MLP version
            qNext = getq(np.reshape(obses_tp1_deic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]]))
            qCurr = getq(np.reshape(obses_t_deic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]]))

#            # ROTATION-AUGMENTATION: AUGMENT EXPERIENCES WITH FOUR ROTATIONS
#            obses_t_deicRot1 = np.rot90(obses_t_deic,k=3,axes=(1,2))
#            obses_t_deicRot2 = np.rot90(obses_t_deic,k=2,axes=(1,2))
#            obses_t_deicRot3 = np.rot90(obses_t_deic,k=1,axes=(1,2))
#            obses_t_deic = np.r_[obses_t_deic, obses_t_deicRot1, obses_t_deicRot2, obses_t_deicRot3]
#            obses_tp1_deicRot1 = np.rot90(obses_tp1_deic,k=3,axes=(1,2))
#            obses_tp1_deicRot2 = np.rot90(obses_tp1_deic,k=2,axes=(1,2))
#            obses_tp1_deicRot3 = np.rot90(obses_tp1_deic,k=1,axes=(1,2))
#            obses_tp1_deic = np.r_[obses_tp1_deic, obses_tp1_deicRot1, obses_tp1_deicRot2, obses_tp1_deicRot3]
#            qCurr = getq(np.array(obses_t_deic))
#            qNext = getq(np.array(obses_tp1_deic))
#            actionsTiled = np.r_[actionsTiled, actionsTiled+1, actionsTiled+2, actionsTiled+3]
#            actionsTiled = actionsTiled - 4 * (actionsTiled>3)
#            rewardsTiled = np.r_[rewardsTiled,rewardsTiled,rewardsTiled,rewardsTiled]
#            donesTiled = np.r_[donesTiled,donesTiled,donesTiled,donesTiled]            
            
            # This version pairs a glimpse with the same glimpse on the next time step
            qNextmax = np.max(qNext[:,-1,:],1) # standard
#            actionsNext = np.argmax(qNextTarget[:,-1,:],1) # double-q
#            qNextmax = qNext[range(num_deictic_patches*batch_size),-1,actionsNext]
            
#            # This version takes the max over all glimpses
#            qNextTiled = np.reshape(qNext[:,-1,:],[batch_size,num_deictic_patches,num_actions])
#            qNextmax = np.repeat(np.max(np.max(qNextTiled,2),1),num_deictic_patches)

            # BELLMAN VERSION
            targets = rewardsTiled + (1-donesTiled) * gamma * qNextmax

            # MONTE CARLO VERSION
            targets = rewardsTiled

#            # Take min over targets in same group
#            obses_t_deic_reshape = np.reshape(obses_t_deic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]])
#            unique_deic, uniqueIdx, uniqueCounts= np.unique(obses_t_deic_reshape,return_inverse=True,return_counts=True,axis=0)
#            for i in range(np.shape(uniqueCounts)[0]):
#                targets[uniqueIdx==i] = np.min(targets[uniqueIdx==i])
            
            
            qCurrTargets = np.copy(qCurr)
            
            # Copy into cascade with pruning.
            expLen = np.shape(qCurr)[0]
            qCurrTargets[range(expLen),0,actionsTiled] = targets
            for i in range(num_cascade-1):
                mask = targets < qCurrTargets[range(expLen),i,actionsTiled]
                qCurrTargets[range(expLen),i+1,actionsTiled] = \
                    mask*targets + \
                    (1-mask)*qCurrTargets[range(expLen),i+1,actionsTiled]
            
#            # CNN version
#            td_error_out, obses_deic_out, targets_out = targetTrain(
#                    obses_t_deic,
#                    qCurrTargets
#                    )
            
            # MLP version
            td_error_out, obses_deic_out, targets_out = targetTrain(
                    np.reshape(obses_t_deic,[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]]),
                    qCurrTargets
                    )
                
        # Update target network periodically.
        if t > learning_starts and t % target_network_update_freq == 0:
            update_target()

        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
            timerFinal = time.time()
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", time elapsed: " + str(timerFinal - timerStart))
            timerStart = timerFinal
        
        obs = new_obs
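# --- Illustrative sketch (not from the original example) ---
# The "copy into cascade with pruning" block above writes each new target into
# cascade level 0 unconditionally and into level i+1 only where the target is
# smaller than the current level-i estimate, so deeper levels track
# progressively lower, more conservative values. A minimal, self-contained
# sketch of that idea for a single action column; cascade_update is a
# hypothetical name.
import numpy as np


def cascade_update(q_levels, targets):
    """q_levels: (batch, num_levels) current estimates; targets: (batch,)."""
    out = q_levels.copy()
    out[:, 0] = targets
    for i in range(q_levels.shape[1] - 1):
        accept = targets < q_levels[:, i]  # deeper levels only accept lower targets
        out[accept, i + 1] = targets[accept]
    return out

# Example usage with a batch of 5 transitions and a 3-level cascade:
# q = cascade_update(np.random.rand(5, 3), np.random.rand(5))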
Exemplo n.º 17
0
def main():

    env = envstandalone.BallCatch()

    max_timesteps = 40000
    learning_starts = 1000
    buffer_size = 50000
    #    buffer_size=1
    exploration_fraction = 0.2
    exploration_final_eps = 0.02
    print_freq = 10
    gamma = .98
    target_network_update_freq = 500
    learning_alpha = 0.2

    batch_size = 32
    train_freq = 1

    obsShape = (8, 8, 1)
    #    deicticShape = (3,3,1)
    #    deicticShape = (3,3,2)
    #    deicticShape = (4,4,1)
    #    deicticShape = (4,4,2)
    deicticShape = (4, 4, 3)
    #    deicticShape = (3,3,4)
    num_deictic_patches = 25

    #    num_actions = 4
    num_actions = 3
    episode_rewards = [0.0]
    num_cpu = 16
    num_cascade = 5

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # Extract deictic patches for an input obs. Each deictic patch has a low level
    # and a foveated view.
    # input: n x n x 1
    # output: dn x dn x 4
    def getDeicticObs(obs):
        windowLen = deicticShape[0]
        obsShape = np.shape(obs)
        obsPadded = np.zeros(
            (obsShape[0] + 2 * windowLen, obsShape[1] + 2 * windowLen))
        obsPadded[windowLen:windowLen + obsShape[0],
                  windowLen:windowLen + obsShape[1]] = obs[:, :, 0]
        deicticObsThis = np.zeros(
            (windowLen, windowLen, 4)
        )  # channels 0-1: agent/ball zoom-in view; channels 2-3: agent/ball zoom-out view
        deicticObs = []
        for i in range(obsShape[0] - windowLen + 1):
            for j in range(obsShape[1] - windowLen + 1):
                deicticObsThis[:, :, 0] = obs[i:i + windowLen, j:j + windowLen,
                                              0] == 1  # agent zoomin
                deicticObsThis[:, :, 1] = obs[i:i + windowLen, j:j + windowLen,
                                              0] == 2  # ball zoomin
                patch = obsPadded[i:i + 3 * windowLen, j:j + 3 * windowLen]
                for k in range(1, 3):
                    # THE VERSION BELOW USES A FIXED VIEW
                    #                    deicticObsThis[:,:,k+1] = [[(k in obs[0:3,0:3,0]), (k in obs[0:3,3:5]), (k in obs[0:3,5:8,0])],
                    #                                 [(k in obs[3:5,0:3,0]), (k in obs[3:5,3:5,0]), (k in obs[3:5,5:8,0])],
                    #                                 [(k in obs[5:8,0:3,0]), (k in obs[5:8,3:5,0]), (k in obs[5:8,5:8,0])]]
                    # THE VERSION BELOW USES A WIDE VIEW W/ 2 UNITS IN EACH CELL
                    #                    deicticObsThis[:,:,k+1] = [[(k in patch[1:3,1:3]), (k in patch[1:3,3:5]), (k in patch[1:3,5:7])],
                    #                                 [(k in patch[3:5,1:3]), (k in patch[3:5,3:5]), (k in patch[3:5,5:7])],
                    #                                 [(k in patch[5:7,1:3]), (k in patch[5:7,3:5]), (k in patch[5:7,5:7])]]
                    # THE VERSION BELOW USES A WIDE VIEW W/ 3 UNITS IN EACH CELL
                    deicticObsThis[:, :, k + 1] = [[(k in patch[0:3, 0:3]),
                                                    (k in patch[0:3, 3:6]),
                                                    (k in patch[0:3, 6:9])],
                                                   [(k in patch[3:6, 0:3]),
                                                    (k in patch[3:6, 3:6]),
                                                    (k in patch[3:6, 6:9])],
                                                   [(k in patch[6:9, 0:3]),
                                                    (k in patch[6:9, 3:6]),
                                                    (k in patch[6:9, 6:9])]]
                deicticObs.append(
                    deicticObsThis.copy()
                )  # CAREFUL WITH APPENDING REFERENCES VS APPENDING COPIES!!! THIS WAS A BUG BEFORE I CORRECTED IT...

        return np.array(deicticObs)
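
    # Shape sketch (derived from the loops above): with the 8x8x1 BallCatch obs and
    # windowLen = 4 there are (8 - 4 + 1)**2 = 25 patches, so getDeicticObs(obs) has
    # shape (25, 4, 4, 4): two zoom-in channels plus two foveated zoom-out channels.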

    # Same as getDeicticObs, but it operates on a batch rather than a single obs
    # output: (batch * num_patches) x windowLen x windowLen x 4
    def getDeicticObsBatch(obs):
        obsShape = np.shape(obs)
        deicticObsBatch = []
        for batch in range(obsShape[0]):
            deicticObsBatch.append(getDeicticObs(obs[batch]))
        shape = np.shape(deicticObsBatch)
        return (np.reshape(
            np.array(deicticObsBatch),
            [shape[0] * shape[1], shape[2], shape[3], shape[4]]))

    # CNN version
    # conv model parameters: (num_outputs, kernel_size, stride)
#    model = models.cnn_to_mlp(
#        convs=[(16,4,1)],
#        hiddens=[16],
#        dueling=True
#    )

# MLP version
    model = models.mlp([16, 32])

    q_func = model
    lr = 0.001

    def make_obs_ph(name):
        return U.BatchInput(obsShape, name=name)

    def make_obsDeic_ph(name):

        # CNN version
        #        return U.BatchInput(deicticShape, name=name)

        # MLP version
        return U.BatchInput(
            [deicticShape[0] * deicticShape[1] * deicticShape[2]], name=name)

    def make_target_ph(name):
        #        return U.BatchInput([num_actions], name=name)
        return U.BatchInput([num_cascade, num_actions], name=name)

    sess = U.make_session(num_cpu)
    sess.__enter__()

    getq = build_getq(make_obsDeic_ph=make_obsDeic_ph,
                      q_func=q_func,
                      num_actions=num_actions,
                      num_cascade=num_cascade)

    targetTrain = build_targetTrain(
        make_obsDeic_ph=make_obsDeic_ph,
        make_target_ph=make_target_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        num_cascade=num_cascade,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr))

    getDeic = build_getDeic(make_obs_ph=make_obs_ph, deicticShape=deicticShape)

    # Initialize the parameters and copy them to the target network.
    U.initialize()

    replay_buffer = ReplayBuffer(buffer_size)
    obs = env.reset()

    timerStart = time.time()
    for t in range(max_timesteps):

        #        obsDeictic = getDeicticObs(obs)
        obsDeictic = getDeic([obs])
        #        obsDeictic, patchesTiledStacked2 = getDeic([obs])

        #        # CNN version
        #        qCurr = getq(np.array(obsDeictic))

        # MLP version
        qCurr = getq(
            np.reshape(
                obsDeictic,
                [-1, deicticShape[0] * deicticShape[1] * deicticShape[2]]))

        # select action
        qCurrNoise = qCurr + np.random.random(np.shape(
            qCurr)) * 0.01  # add small amount of noise to break ties randomly
        action = np.argmax(np.max(qCurrNoise[:, -1, :], 0))
        selPatch = np.argmax(np.max(qCurrNoise[:, -1, :], 1))
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(env.action_space.n)

        # take action
        new_obs, rew, done, _ = env.step(action)
        replay_buffer.add(obs, action, rew, new_obs, float(done))

        # sample from replay buffer and train
        if t > learning_starts and t % train_freq == 0:

            # Sample from replay buffer
            obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
                batch_size)

            # Put observations in deictic form
            obses_t_deic = getDeic(obses_t)
            obses_tp1_deic = getDeic(obses_tp1)
            #            obses_t_deic = getDeicticObsBatch(obses_t)
            #            obses_tp1_deic = getDeicticObsBatch(obses_tp1)

            # Tile per-transition quantities so there is one entry per (transition, patch)
            # pair, i.e. arrays of shape (batch_size * num_deictic_patches,)
            donesTiled = np.repeat(dones, num_deictic_patches)
            rewardsTiled = np.repeat(rewards, num_deictic_patches)
            actionsTiled = np.repeat(actions, num_deictic_patches)

            #            # Get curr, next values: CNN version
            #            qNext = getq(obses_tp1_deic)
            #            qCurr = getq(obses_t_deic)

            # Get curr, next values: MLP version
            qNext = getq(
                np.reshape(
                    obses_tp1_deic,
                    [-1, deicticShape[0] * deicticShape[1] * deicticShape[2]]))
            qCurr = getq(
                np.reshape(
                    obses_t_deic,
                    [-1, deicticShape[0] * deicticShape[1] * deicticShape[2]]))

            # This version pairs a glimpse with the same glimpse on the next time step
            qNextmax = np.max(qNext[:, -1, :], 1)

            #            # This version takes the max over all glimpses
            #            qNextTiled = np.reshape(qNext[:,-1,:],[batch_size,num_deictic_patches,num_actions])
            #            qNextmax = np.repeat(np.max(np.max(qNextTiled,2),1),num_deictic_patches)

            # Compute Bellman estimate
            targets = rewardsTiled + (1 - donesTiled) * gamma * qNextmax

            #            targetsTiled = np.tile(np.reshape(targets,[-1,1]),[1,num_cascade])

            qCurrTargets = np.copy(qCurr)

            #            # Copy into cascade without pruning
            #            for i in range(num_cascade):
            #                qCurrTargets[range(batch_size*num_deictic_patches),i,actionsTiled] = targets

            # Copy into cascade with pruning.
            qCurrTargets[range(batch_size * num_deictic_patches), 0,
                         actionsTiled] = targets
            for i in range(num_cascade - 1):
                mask = targets < qCurrTargets[range(batch_size *
                                                    num_deictic_patches), i,
                                              actionsTiled]
                qCurrTargets[range(batch_size*num_deictic_patches),i+1,actionsTiled] = \
                    mask*targets + \
                    (1-mask)*qCurrTargets[range(batch_size*num_deictic_patches),i+1,actionsTiled]
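            # Reading of the pruning step above: head 0 always receives the fresh Bellman
            # target, and head i+1 only accepts it when it is smaller than head i's current
            # estimate; successive heads therefore hold progressively more pessimistic
            # estimates of Q(s, a) for the chosen action.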


#            # CNN version
#            td_error_out, obses_deic_out, targets_out = targetTrain(
#                    obses_t_deic,
#                    qCurrTargets
#                    )

# MLP version
            td_error_out, obses_deic_out, targets_out = targetTrain(
                np.reshape(
                    obses_t_deic,
                    [-1, deicticShape[0] * deicticShape[1] * deicticShape[2]]),
                qCurrTargets)

        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(
                episode_rewards) % print_freq == 0:
            timerFinal = time.time()
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " +
                  str(int(100 * exploration.value(t))) + ", time elapsed: " +
                  str(timerFinal - timerStart))
            timerStart = timerFinal

        obs = new_obs
def dist_learn(env,
               q_dist_func,
               num_atoms=51,
               V_max=10,
               lr=25e-5,
               max_timesteps=100000,
               buffer_size=50000,
               exploration_fraction=0.01,
               exploration_final_eps=0.008,
               train_freq=1,
               batch_size=32,
               print_freq=1,
               checkpoint_freq=2000,
               learning_starts=1000,
               gamma=1.0,
               target_network_update_freq=500,
               prioritized_replay=False,
               prioritized_replay_alpha=0.6,
               prioritized_replay_beta0=0.4,
               prioritized_replay_beta_iters=None,
               prioritized_replay_eps=1e-6,
               num_cpu=1,
               callback=None):
    """Train a deepq model.

    Parameters
    -------
    env: gym.Env
        environment to train on
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    lr: float
        learning rate for adam optimizer
    max_timesteps: int
        number of env steps to optimizer for
    buffer_size: int
        size of the replay buffer
    exploration_fraction: float
        fraction of entire training period over which the exploration rate is annealed
    exploration_final_eps: float
        final value of random action probability
    train_freq: int
        update the model every `train_freq` steps.
        set to None to disable printing
    batch_size: int
        size of a batched sampled from replay buffer for training
    print_freq: int
        how often to print out training progress
        set to None to disable printing
    checkpoint_freq: int
        how often to save the model. This is so that the best version is restored
        at the end of the training. If you do not wish to restore the best version at
        the end of the training set this variable to None.
    learning_starts: int
        how many steps of the model to collect transitions for before learning starts
    gamma: float
        discount factor
    target_network_update_freq: int
        update the target network every `target_network_update_freq` steps.
    prioritized_replay: True
        if True prioritized replay buffer will be used.
    prioritized_replay_alpha: float
        alpha parameter for prioritized replay buffer
    prioritized_replay_beta0: float
        initial value of beta for prioritized replay buffer
    prioritized_replay_beta_iters: int
        number of iterations over which beta will be annealed from initial value
        to 1.0. If set to None equals to max_timesteps.
    prioritized_replay_eps: float
        epsilon to add to the TD errors when updating priorities.
    num_cpu: int
        number of cpus to use for training
    callback: (locals, globals) -> None
        function called at every steps with state of the algorithm.
        If callback returns true training stops.

    Returns
    -------
    act: ActWrapper
        Wrapper over act function. Adds ability to save it and load it.
        See header of baselines/deepq/categorical.py for details on the act function.
    """
    # Create all the functions necessary to train the model

    sess = U.single_threaded_session()
    sess.__enter__()

    def make_obs_ph(name):
        print(name)
        return U.BatchInput(env.observation_space.shape, name=name)

    act, train, update_target, debug = build_dist_train(
        make_obs_ph=make_obs_ph,
        dist_func=q_dist_func,
        num_actions=env.action_space.n,
        num_atoms=num_atoms,
        V_max=V_max,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=10)

    # act, train, update_target, debug = build_train(
    #     make_obs_ph=make_obs_ph,
    #     q_func=q_func,
    #     num_actions=env.action_space.n,
    #     optimizer=tf.train.AdamOptimizer(learning_rate=lr),
    #     gamma=gamma,
    #     grad_norm_clipping=10
    # )
    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_dist_func': q_dist_func,
        'num_actions': env.action_space.n,
    }
    # Create the replay buffer
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size,
                                                alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None
    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()

    episode_rewards = [0.0]
    saved_mean_reward = None
    obs = env.reset()
    with tempfile.TemporaryDirectory() as td:
        model_saved = False
        model_file = os.path.join(td, "model")
        print(model_file)
        # mkdir_p(os.path.dirname(model_file))
        for t in range(max_timesteps):
            if callback is not None:
                if callback(locals(), globals()):
                    break
            # Take action and update exploration to the newest value
            action = act(np.array(obs)[None],
                         update_eps=exploration.value(t))[0]
            new_obs, rew, done, _ = env.step(action)
            # Store transition in the replay buffer.
            replay_buffer.add(obs, action, rew, new_obs, float(done))
            obs = new_obs

            episode_rewards[-1] += rew
            if done:
                obs = env.reset()
                episode_rewards.append(0.0)

            if t > learning_starts and t % train_freq == 0:
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                if prioritized_replay:
                    experience = replay_buffer.sample(
                        batch_size, beta=beta_schedule.value(t))
                    (obses_t, actions, rewards, obses_tp1, dones, weights,
                     batch_idxes) = experience
                else:
                    # print "CCCC"
                    obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
                        batch_size)
                    weights, batch_idxes = np.ones_like(rewards), None
                # print "Come1"
                # print np.shape(obses_t), np.shape(actions), np.shape(rewards), np.shape(obses_tp1), np.shape(dones)
                td_errors = train(obses_t, actions, rewards, obses_tp1, dones,
                                  weights)
                # print "Loss : {}".format(td_errors)
                if prioritized_replay:
                    new_priorities = np.abs(td_errors) + prioritized_replay_eps
                    replay_buffer.update_priorities(batch_idxes,
                                                    new_priorities)

            if t > learning_starts and t % target_network_update_freq == 0:
                # Update target network periodically.
                update_target()

            mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
            num_episodes = len(episode_rewards)
            if done and print_freq is not None and len(
                    episode_rewards) % print_freq == 0:
                print "steps : {}".format(t)
                print "episodes : {}".format(num_episodes)
                print "mean 100 episode reward: {}".format(mean_100ep_reward)
                # print "mean 100 episode reward".format(mean_100ep_reward)
                # logger.record_tabular("episodes", num_episodes)
                # logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
                # logger.record_tabular("% time spent exploring", int(100 * exploration.value(t)))
                # logger.dump_tabular()
                # logger.record_tabular("steps", t)
                # logger.record_tabular("episodes", num_episodes)
                # logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
                # logger.record_tabular("% time spent exploring", int(100 * exploration.value(t)))
                # logger.dump_tabular()

            if (checkpoint_freq is not None and t > learning_starts
                    and t % checkpoint_freq == 0):
                print "=========================="
                print "Error: {}".format(td_errors)
                if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                    if print_freq is not None:
                        print "Saving model due to mean reward increase: {} -> {}".format(
                            saved_mean_reward, mean_100ep_reward)
                        # logger.log("Saving model due to mean reward increase: {} -> {}".format(
                        #            saved_mean_reward, mean_100ep_reward))
                    U.save_state(model_file)
                    model_saved = True
                    saved_mean_reward = mean_100ep_reward
        if model_saved:
            if print_freq is not None:
                print "Restored model with mean reward: {}".format(
                    saved_mean_reward)
                # logger.log("Restored model with mean reward: {}".format(saved_mean_reward))
            U.load_state(model_file)

    return ActWrapper(act, act_params)
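
A minimal usage sketch for dist_learn (my addition, not part of the original example). It assumes the same `models` helpers and gym-style environments used elsewhere in these examples; the exact per-action distribution shape that `q_dist_func` must produce is dictated by `build_dist_train`, which is not shown here, so the model below is only a placeholder.

import gym

def dist_learn_example():
    # Hypothetical driver; keyword values mirror the defaults documented above.
    env = gym.make("CartPole-v0")
    q_dist_func = models.mlp([64])  # placeholder: must emit a distribution over num_atoms atoms per action
    act = dist_learn(env,
                     q_dist_func,
                     num_atoms=51,
                     V_max=10,
                     lr=25e-5,
                     max_timesteps=100000,
                     print_freq=10)
    return act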
def main():

    # ******* Deictic parameters ********

    # deicticShape is the shape of the patch that is used. For example, a 3,3,2 patch
    # is a 2-channel 3x3 patch. num_deictic_patches must be set to the number of deicticShape
    # patches in an entire image.
    # For example, there are 36 3x3 patches that are contained in an 8x8 observation space
    # (assuming no zero padding). You must set this number to correspond to deicticShape.

    #    deicticShape = (3,3,2)
    #    deicticShape = (3,3,4)
    deicticShape = (4, 4, 2)
    #    deicticShape = (4,4,4)

    #    num_deictic_patches = 36
    num_deictic_patches = 25
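
    # Sanity check implied by the comment above: with an n x n observation and a d x d
    # patch (no padding), num_deictic_patches = (n - d + 1)**2; for the 8x8 observation
    # mentioned above, (8 - 3 + 1)**2 = 36 and (8 - 4 + 1)**2 = 25, matching the value set here.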

    # Desired network type. So far, I've done better w/ CNN
    WHICH_Q = "CNN"
    #    WHICH_Q = "MLP"

    # Method used to evaluate value of next state. So far, I've found that PAIRED_NEXT works
    # much better than MAX_NEXT. MAX_NEXT only works if you also set MIN_OVER_BATCH to True.
    # OW, it doesn't converge.
    # PAIRED_NEXT -> use value of corresponding patch on the next step
    # MAX_NEXT -> use max value over all next-step patches
    NEXT_PATCH = "PAIRED_NEXT"
    #    NEXT_PATCH = "MAX_NEXT"

    # If MIN_OVER_BATCH is true, then we find the min value over all targets that have
    # the same corresponding patch. In principle, this should always help. The larger
    # the batch size, the more it should help. However, in practice, I find that
    # it seems to cap the maximum achievable performance. On the other hand, it can
    # help convergence when using NEXT_PATCH = "MAX_NEXT".
    #    MIN_OVER_BATCH = True
    MIN_OVER_BATCH = False

    # If MIN_OR_AVG_Q is "MIN", then we use the minimum Q value as calculated via the cascade.
    # OW (if "AVG"), we use the standard expected value Q value. "MIN" should work. "AVG" is
    # equivalent to the standard DQN backup applied to the patches.
    # best here.
    MIN_OR_AVG_Q = "MIN"
    #    MIN_OR_AVG_Q = "AVG"

    # If true, ROTATION_AUGMENTATION augments the agent's experience with
    # rotated versions of the patches. I typically turn this off.
    #    ROTATION_AUGMENTATION = True
    ROTATION_AUGMENTATION = False

    # ******* Load the environment ********

    env = envstandalone.StandaloneEnv()
    obsShape = env.observation_space.shape
    num_actions = env.action_space.n

    # ******* Standard DQN parameters ********

    max_timesteps = 40000
    learning_starts = 1000
    buffer_size = 50000
    exploration_fraction = 0.4
    exploration_final_eps = 0.02
    print_freq = 10
    gamma = .98
    target_network_update_freq = 1
    lr = 0.001
    batch_size = 32
    train_freq = 1
    num_cascade = 5  # number of Q-functions in the cascade used to estimate a minimum value for each s,a pair
    num_cpu = 16
    replay_buffer = ReplayBuffer(buffer_size)
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    if MIN_OR_AVG_Q == "MIN":
        minoravg = -1
    elif MIN_OR_AVG_Q == "AVG":
        minoravg = 0
    else:
        print("error")

    # ******* Create neural network model ********

    if WHICH_Q == "CNN":
        # conv model parameters: (num_outputs, kernel_size, stride)
        model = models.cnn_to_mlp(convs=[(32, 3, 1)],
                                  hiddens=[32],
                                  dueling=True)
        networkShapeOfObservation = [
            -1, deicticShape[0], deicticShape[1], deicticShape[2]
        ]
    elif WHICH_Q == "MLP":
        # MLP version
        #        model = models.mlp([8, 16])
        model = models.mlp([16, 32])
        #        model = models.mlp([32])
        #        model = models.mlp([])
        networkShapeOfObservation = [
            -1, deicticShape[0] * deicticShape[1] * deicticShape[2]
        ]
    else:
        print("WHICH_Q error: must select valid q-function")
    q_func = model

    # ******* Build tensorflow functions ********

    def make_obs_ph(name):
        return U.BatchInput(obsShape, name=name)

    def make_obsDeic_ph(name):

        if WHICH_Q == "CNN":
            return U.BatchInput(deicticShape, name=name)
        elif WHICH_Q == "MLP":
            return U.BatchInput(
                [deicticShape[0] * deicticShape[1] * deicticShape[2]],
                name=name)
        else:
            print("WHICH_Q error: must select valid q-function")

    def make_target_ph(name):
        #        return U.BatchInput([num_actions], name=name)
        return U.BatchInput([num_cascade, num_actions], name=name)

    getq = build_getq(make_obsDeic_ph=make_obsDeic_ph,
                      q_func=q_func,
                      num_actions=num_actions,
                      num_cascade=num_cascade,
                      scope="deepq",
                      qscope="q_func")

    targetTrain = build_targetTrain(
        make_obsDeic_ph=make_obsDeic_ph,
        make_target_ph=make_target_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        num_cascade=num_cascade,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        #        optimizer=tf.train.GradientDescentOptimizer(learning_rate=lr),
        scope="deepq",
        qscope="q_func",
        grad_norm_clipping=1.)

    getDeic = build_getDeic_Foc(make_obs_ph=make_obs_ph,
                                deicticShape=deicticShape)
    #    getDeic = build_getDeic_FocCoarse(make_obs_ph=make_obs_ph,deicticShape=deicticShape)

    sess = U.make_session(num_cpu)
    sess.__enter__()

    obs = env.reset()
    U.initialize()
    episode_rewards = [0.0]
    timerStart = time.time()
    for t in range(max_timesteps):

        # get q-values for current deictic patches
        obsDeictic = getDeic([obs])
        qCurr = getq(np.reshape(obsDeictic, networkShapeOfObservation))

        # select action
        qCurrNoise = qCurr + np.random.random(np.shape(
            qCurr)) * 0.01  # add small amount of noise to break ties randomly
        action = np.argmax(np.max(qCurrNoise[:, minoravg, :],
                                  0))  # USE CASCADE
        #        action = np.argmax(np.max(qCurrNoise[:,0,:],0)) # DO NOT USE CASCADE
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(env.action_space.n)

        # take action
        new_obs, rew, done, _ = env.step(action)
        replay_buffer.add(obs, action, rew, new_obs, float(done))

        # sample from replay buffer and train
        if t > learning_starts and t % train_freq == 0:

            # Sample from replay buffer
            obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
                batch_size)

            # Put observations in deictic form
            obses_t_deic = getDeic(obses_t)
            obses_tp1_deic = getDeic(obses_tp1)

            # Reshape such that patches and batches are interleaved in the same column
            donesTiled = np.repeat(dones, num_deictic_patches)
            rewardsTiled = np.repeat(rewards, num_deictic_patches)
            actionsTiled = np.repeat(actions, num_deictic_patches)

            #            # Get curr, next values: NO ROTATION-AUGMENTATION
            qNext = getq(np.reshape(obses_tp1_deic, networkShapeOfObservation))
            qCurr = getq(np.reshape(obses_t_deic, networkShapeOfObservation))

            #            # ROTATION-AUGMENTATION: AUGMENT EXPERIENCES WITH FOUR ROTATIONS
            if ROTATION_AUGMENTATION:
                obses_t_deicRot1 = np.rot90(obses_t_deic, k=3, axes=(1, 2))
                obses_t_deicRot2 = np.rot90(obses_t_deic, k=2, axes=(1, 2))
                obses_t_deicRot3 = np.rot90(obses_t_deic, k=1, axes=(1, 2))
                obses_t_deic = np.r_[obses_t_deic, obses_t_deicRot1,
                                     obses_t_deicRot2, obses_t_deicRot3]
                obses_tp1_deicRot1 = np.rot90(obses_tp1_deic, k=3, axes=(1, 2))
                obses_tp1_deicRot2 = np.rot90(obses_tp1_deic, k=2, axes=(1, 2))
                obses_tp1_deicRot3 = np.rot90(obses_tp1_deic, k=1, axes=(1, 2))
                obses_tp1_deic = np.r_[obses_tp1_deic, obses_tp1_deicRot1,
                                       obses_tp1_deicRot2, obses_tp1_deicRot3]
                qCurr = getq(np.array(obses_t_deic))
                qNext = getq(np.array(obses_tp1_deic))
                actionsTiled = np.r_[actionsTiled, actionsTiled + 1,
                                     actionsTiled + 2, actionsTiled + 3]
                actionsTiled = actionsTiled - 4 * (actionsTiled > 3)
                rewardsTiled = np.r_[rewardsTiled, rewardsTiled, rewardsTiled,
                                     rewardsTiled]
                donesTiled = np.r_[donesTiled, donesTiled, donesTiled,
                                   donesTiled]
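                # Note on the relabelling above: it assumes actions 0-3 rotate cyclically
                # with the patch (an assumption about this particular environment), so each
                # rotated copy is paired with the correspondingly shifted action modulo 4.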

            # Get value of next state
            if NEXT_PATCH == "PAIRED_NEXT":
                qNextmax = np.max(qNext[:, minoravg, :], 1)  # standard
            elif NEXT_PATCH == "MAX_NEXT":
                qNextTiled = np.reshape(qNext[:, minoravg, :],
                                        [-1, num_deictic_patches, num_actions])
                qNextmax = np.repeat(np.max(np.max(qNextTiled, 2), 1),
                                     num_deictic_patches)
            else:
                print("error")

            # Compute Bellman estimate
            targets = rewardsTiled + (1 - donesTiled) * gamma * qNextmax

            # Take min over targets in same group
            if MIN_OVER_BATCH:
                obses_t_deic_reshape = np.reshape(
                    obses_t_deic,
                    [-1, deicticShape[0] * deicticShape[1] * deicticShape[2]])
                unique_deic, uniqueIdx, uniqueCounts = np.unique(
                    obses_t_deic_reshape,
                    return_inverse=True,
                    return_counts=True,
                    axis=0)
                for i in range(np.shape(uniqueCounts)[0]):
                    targets[uniqueIdx == i] = np.min(targets[uniqueIdx == i])

            # Copy into cascade with pruning.
            qCurrTargets = np.copy(qCurr)
            expLen = np.shape(qCurr)[0]
            qCurrTargets[range(expLen), 0, actionsTiled] = targets
            for i in range(num_cascade - 1):
                mask = targets < qCurrTargets[range(expLen), i, actionsTiled]
                qCurrTargets[range(expLen),i+1,actionsTiled] = \
                    mask*targets + \
                    (1-mask)*qCurrTargets[range(expLen),i+1,actionsTiled]

            td_error_out, obses_deic_out, targets_out = targetTrain(
                np.reshape(obses_t_deic, networkShapeOfObservation),
                qCurrTargets)

        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(
                episode_rewards) % print_freq == 0:
            timerFinal = time.time()
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " +
                  str(int(100 * exploration.value(t))) + ", time elapsed: " +
                  str(timerFinal - timerStart))
            timerStart = timerFinal

        obs = new_obs
Exemplo n.º 20
def main():


    env = envstandalone.BallCatch()
    
    max_timesteps=20000
    buffer_size=50000
    exploration_fraction=0.2
    exploration_final_eps=0.02
    print_freq=10
    learning_starts=1000
    gamma=.98
    target_network_update_freq=500
    learning_alpha = 0.2
    
    batch_size=32
    train_freq=2

    deicticShape = (3,3,1)
    num_deictic_patches=36

    num_actions = 3
    episode_rewards = [0.0]

#    replay_buffer = ReplayBuffer(buffer_size)
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # Extract deictic patches for an input obs. Each deictic patch has a low level
    # and a foveated view.
    # input: n x n x 1
    # output: dn x dn x 4
    def getDeicticObs(obs):
        windowLen = deicticShape[0]
        obsShape = np.shape(obs)
        obsPadded = np.zeros((obsShape[0]+2*windowLen,obsShape[1]+2*windowLen))
        obsPadded[windowLen:windowLen+obsShape[0],windowLen:windowLen+obsShape[1]] = obs[:,:,0]
        deicticObsThis = np.zeros((windowLen,windowLen,4)) # channels: 0 = agent zoomin, 1 = ball zoomin, 2 = agent zoomout, 3 = ball zoomout
        deicticObs = []
        for i in range(obsShape[0] - windowLen + 1):
            for j in range(obsShape[1] - windowLen + 1):
                deicticObsThis[:,:,0] = obs[i:i+windowLen,j:j+windowLen,0] == 1 # agent zoomin
                deicticObsThis[:,:,1] = obs[i:i+windowLen,j:j+windowLen,0] == 2 # ball zoomin
                patch = obsPadded[i:i+3*windowLen,j:j+3*windowLen]
                for k in range(1,3):
# THE VERSION BELOW USES A FIXED VIEW
#                    deicticObsThis[:,:,k+1] = [[(k in obs[0:3,0:3,0]), (k in obs[0:3,3:5]), (k in obs[0:3,5:8,0])], 
#                                 [(k in obs[3:5,0:3,0]), (k in obs[3:5,3:5,0]), (k in obs[3:5,5:8,0])],
#                                 [(k in obs[5:8,0:3,0]), (k in obs[5:8,3:5,0]), (k in obs[5:8,5:8,0])]]
# THE VERSION BELOW USES A WIDE VIEW W/ 2 UNITS IN EACH CELL
                    deicticObsThis[:,:,k+1] = [[(k in patch[1:3,1:3]), (k in patch[1:3,3:5]), (k in patch[1:3,5:7])], 
                                 [(k in patch[3:5,1:3]), (k in patch[3:5,3:5]), (k in patch[3:5,5:7])], 
                                 [(k in patch[5:7,1:3]), (k in patch[5:7,3:5]), (k in patch[5:7,5:7])]]
# THE VERSION BELOW USES A WIDE VIEW W/ 3 UNITS IN EACH CELL
#                    deicticObsThis[:,:,k+1] = [[(k in patch[0:3,0:3]), (k in patch[0:3,3:6]), (k in patch[0:3,6:9])], 
#                                 [(k in patch[3:6,0:3]), (k in patch[3:6,3:6]), (k in patch[3:6,6:9])], 
#                                 [(k in patch[6:9,0:3]), (k in patch[6:9,3:6]), (k in patch[6:9,6:9])]]
                deicticObs.append(deicticObsThis.copy()) # CAREFUL WITH APPENDING REFERENCES VS APPENDING COPIES!!! THIS WAS A BUG BEFORE I CORRECTED IT...

        return np.array(deicticObs)


    # Same as getDeicticObs, but it operates on a batch rather than a single obs
    # output: batch x num_patches x 3 x 3 x 4
    def getDeicticObsBatch(obs):
        obsShape = np.shape(obs)
        deicticObsBatch = []
        for batch in range(obsShape[0]):
            deicticObsBatch.append(getDeicticObs(obs[batch]))
        return(np.array(deicticObsBatch))


    # input: num_patches x 3 x 3 x 4 tensor of deictic observations (36 patches here)
    # output: num_patches x 4 int matrix; row i gives the flattened 0-8 position of
    #         [agent zoomin, ball zoomin, agent zoomout, ball zoomout] in patch i (9 = not present)
    def convertState(observations):
        
        # Reshape to batch x flatimage x channel.
        # Channel1 = zoomin agent, channel2 = zoomin ball
        # Channel3 = zoomout agent, channel4 = zoomout ball
        obs = np.zeros((36,9,4))
        for i in range(4):
            obs[:,:,i] = np.reshape(observations[:,:,:,i],[36,9])

        # state_numeric: 4 x batch.
        # row0: pos of agent in zoomin, row1: pos of ball in zoomin
        # row2: pos of agent in zoomout, row3: pos of ball in zoomout
        shape = np.shape(obs)
#        state_numeric = 9*np.ones((4,shape[0])) # 9 indicates agent/ball does not appear at this zoom in this glance
        state_numeric = 9*np.ones((shape[0],4)) # 9 indicates agent/ball does not appear at this zoom in this glance
        pos = np.nonzero(obs == 1)
        for i in range(4):
            idx = np.nonzero(pos[2]==i)[0]
#            state_numeric[i,pos[0][idx]] = pos[1][idx]
            state_numeric[pos[0][idx],i] = pos[1][idx]
        
        return np.int32(state_numeric)


    def convertStateBatch(observations):
        shape = np.shape(observations)
        state_numeric_batch = []
        for batch in range(shape[0]):
            state_numeric_batch.append(convertState(observations[batch]))
        return(np.array(state_numeric_batch))


    dimSize = deicticShape[0]*deicticShape[1] + 1
    tabularQ = 100*np.ones([dimSize, dimSize, dimSize, dimSize, num_actions])
#    tabularQ1 = 100*np.ones([dimSize, dimSize, dimSize, dimSize, num_actions])
#    tabularQ2 = 100*np.ones([dimSize, dimSize, dimSize, dimSize, num_actions])
#    tabularQ3 = 100*np.ones([dimSize, dimSize, dimSize, dimSize, num_actions])
#    tabularQ4 = 100*np.ones([dimSize, dimSize, dimSize, dimSize, num_actions])
#    tabularQ5 = 100*np.ones([dimSize, dimSize, dimSize, dimSize, num_actions])
    
    obs = env.reset()
#    OHEnc = np.identity(max_num_groups)


    for t in range(max_timesteps):

        # get current q-values
        obsDeictic = getDeicticObs(obs)
        stateCurr = convertState(obsDeictic)

#        # do a couple of spot checks to verify that obsDeictic is correct
#        num2check = 17
#        print(str(obsDeictic[num2check,:,:,0] + obsDeictic[num2check,:,:,1]))
#        print(str(obsDeictic[num2check,:,:,2] + obsDeictic[num2check,:,:,3]))

#        qCurr = tabularQ5[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],:]
        qCurr = tabularQ[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],:]
        
        # select action
        qCurrNoise = qCurr + np.random.random()*0.01 # add small amount of noise to break ties randomly
        action = np.argmax(np.max(qCurrNoise,0))
        selPatch = np.argmax(np.max(qCurrNoise,1))
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(env.action_space.n)


#        env.render()
#        print("action: " + str(action))

        # take action
        new_obs, rew, done, _ = env.step(action)

#        replay_buffer.add(obs, action, rew, new_obs, float(done))
        
#        if done == 1:
#            print("action: " + str(action) + ", patch: " + str(selPatch) + ", reward: " + str(rew))
#            action
            
        if t > max_timesteps * 1.05:
            print("obs:\n" + str(np.squeeze(obs)))
            print("qCurr:\n" + str(qCurr))
            print("action: " + str(action) + ", patch: " + str(selPatch))
            print("close:\n" + str(obsDeictic[selPatch,:,:,0] + obsDeictic[selPatch,:,:,1]))
            print("far:\n" + str(obsDeictic[selPatch,:,:,2] + obsDeictic[selPatch,:,:,3]))
            action
        
#        if t > learning_starts and t % train_freq == 0:
#            obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)
            
#        obses_t = np.reshape(obs,[1,8,8,1])
#        obses_tp1 = np.reshape(new_obs,[1,8,8,1])
#        stateCurr = convertStateBatch(getDeicticObsBatch(obses_t))
#        stateNext = convertStateBatch(getDeicticObsBatch(obses_tp1))
#        qNext = tabularQ[stateNext[:,:,0], stateNext[:,:,1], stateNext[:,:,2], stateNext[:,:,3],:]
#        qNextmax = np.max(np.max(qNext,2),1)
#        targets = rew + (1-done) * gamma * qNextmax
#        batch_size = 1
#        targets = np.tile(np.reshape(targets,[batch_size,1]),[1,num_deictic_patches])
#        tabularQ[stateCurr[:,:,0], stateCurr[:,:,1], stateCurr[:,:,2], stateCurr[:,:,3],action] = np.minimum(targets, tabularQ[stateCurr[:,:,0], stateCurr[:,:,1], stateCurr[:,:,2], stateCurr[:,:,3],action])
            
        stateNext = convertState(getDeicticObs(new_obs))
        qNext = tabularQ[stateNext[0], stateNext[1], stateNext[2], stateNext[3],:]
        qNextmax = np.max(qNext)
        targets = rew + (1-done) * gamma * qNextmax
        tabularQ[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action] = np.minimum(targets, tabularQ[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action])
        
            
#        # get next q-values
#        stateNext = convertState(getDeicticObs(new_obs))
#        qNext5 = tabularQ5[stateNext[0], stateNext[1], stateNext[2], stateNext[3],:]
#        qNext = tabularQ[stateNext[0], stateNext[1], stateNext[2], stateNext[3],:]
        
#        # perform learning update
#        qNextmax = np.max(qNext)
#        targets = rew + (1-done) * gamma * qNextmax
        
#        max_negative_td_error = np.max(np.abs(targets - tabularQ[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action]) * np.int32(targets < tabularQ[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action]))
#        if max_negative_td_error > 5:
#            max_negative_td_error
#        print("max_td_error: " + str(max_negative_td_error))
#        print("curr tabularQ:\n" + str(tabularQ[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action]))
#        print("targets:\n" + str(targets))        
#        tabularQ[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action] = np.minimum(targets, tabularQ[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action])
        
#        target2_mask = targets < tabularQ1[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action]
#        target3_mask = targets < tabularQ2[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action]
#        target4_mask = targets < tabularQ3[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action]
#        target5_mask = targets < tabularQ4[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action]
#        targets1 = targets
#        targets2 = target2_mask * targets + (1 - target2_mask) * tabularQ2[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action]
#        targets3 = target3_mask * targets + (1 - target3_mask) * tabularQ3[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action]
#        targets4 = target4_mask * targets + (1 - target4_mask) * tabularQ4[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action]
#        targets5 = target5_mask * targets + (1 - target5_mask) * tabularQ5[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action]
#        
#        tabularQ1[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action] = \
#            (1 - learning_alpha) * tabularQ1[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action] \
#            + learning_alpha * targets1
#        tabularQ2[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action] = \
#            (1 - learning_alpha) * tabularQ2[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action] \
#            + learning_alpha * targets2
#        tabularQ3[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action] = \
#            (1 - learning_alpha) * tabularQ3[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action] \
#            + learning_alpha * targets3
#        tabularQ4[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action] = \
#            (1 - learning_alpha) * tabularQ4[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action] \
#            + learning_alpha * targets4
#        tabularQ5[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action] = \
#            (1 - learning_alpha) * tabularQ5[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],action] \
#            + learning_alpha * targets5


        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
#            print("************************* Episode done! **************************")
            new_obs = env.reset()
            episode_rewards.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
#            print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", max q at curr state: " + str(np.max(qCurr)))
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))))
        
        obs = new_obs
        
        # stop at the end of training
        if t > max_timesteps * 1.1:
#            np.set_printoptions(precision=1)
#            np.set_printoptions(formatter={'float': lambda x: "{0:0.3f}".format(x)})
            np.set_printoptions(formatter={'float_kind':lambda x: "%.1f" % x})
            
#            qCurr1 = tabularQ1[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],:]
#            qCurr2 = tabularQ2[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],:]
#            qCurr3 = tabularQ3[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],:]
#            qCurr4 = tabularQ4[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],:]
#            qCurr5 = tabularQ5[stateCurr[0], stateCurr[1], stateCurr[2], stateCurr[3],:]
#            todisplay = np.c_[np.max(qCurr1,1), np.max(qCurr2,1), np.max(qCurr3,1), np.max(qCurr4,1), np.max(qCurr5,1), obsDeicticReshape]
#            todisplay = np.c_[qCurr5,np.transpose(stateCurr)]

            print("obs:\n" + str(np.squeeze(obs)))

#            todisplay = np.c_[np.max(qCurr5,1),np.transpose(stateCurr)]
#            print("q-values:\n" + str(todisplay))
#
#            print("close:\n" + str(obsDeictic[selPatch,:,:,0] + obsDeictic[selPatch,:,:,1]))
#            print("far:\n" + str(obsDeictic[selPatch,:,:,2] + obsDeictic[selPatch,:,:,3]))
#            print("action: " + str(action) + ", patch: " + str(selPatch))
            action

#                print("obs:\n" + str(np.squeeze(obs)))
#                print("patch:\n" + str(np.reshape(obsDeictic[selPatch],(3,3))))
#                print("action: " + str(action) + ", patch: " + str(selPatch))
#                t
            
            

    t
Exemplo n.º 21
def main():

#    env = gym.make("CartPoleRob-v0")
#    env = gym.make("CartPole-v0")
#    env = gym.make("CartPole-v1")
#    env = gym.make("Acrobot-v1")
#    env = gym.make("MountainCarRob-v0")
#    env = gym.make("FrozenLake-v0")
#    env = gym.make("FrozenLake8x8-v0")
#    env = gym.make("FrozenLake8x8rob-v0")
#    env = gym.make("FrozenLake16x16rob-v0")
    env = gym.make("TestRob3-v0")
    
    
    
    # same as getDeictic below, except this one operates on a single observation only
    # input: n x n x channels
    # output: num_patches x dn x dn x channels
    def getDeicticObs(obses_t, windowLen):
        deicticObses_t = []
        for i in range(np.shape(obses_t)[0] - windowLen):
            for j in range(np.shape(obses_t)[1] - windowLen):
                deicticObses_t.append(obses_t[i:i+windowLen,j:j+windowLen,:])
        return np.array(deicticObses_t)

    # get set of deictic alternatives
    # input: batch x n x n x channels
    # output: (batch x deictic) x dn x dn x channels
    def getDeictic(obses_t, actions, obses_tp1, weights, windowLen):
        deicticObses_t = []
        deicticActions = []
        deicticObses_tp1 = []
        deicticWeights = []
        for i in range(np.shape(obses_t)[0]):
            for j in range(np.shape(obses_t)[1] - windowLen):
                for k in range(np.shape(obses_t)[2] - windowLen):
                    deicticObses_t.append(obses_t[i,j:j+windowLen,k:k+windowLen,:])
                    deicticActions.append(actions[i])
                    deicticObses_tp1.append(obses_tp1[i,j:j+windowLen,k:k+windowLen,:])
                    deicticWeights.append(weights[i])
        return np.array(deicticObses_t), np.array(deicticActions), np.array(deicticObses_tp1), np.array(deicticWeights)

    # conv model parameters: (num_outputs, kernel_size, stride)
    model = models.cnn_to_mlp(
#        convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)], # used in pong
#        hiddens=[256],  # used in pong
#        convs=[(8,4,1)], # used for non-deictic TestRob3-v0
        convs=[(4,3,1)], # used for deictic TestRob3-v0
        hiddens=[16],
        dueling=True
    )

    # parameters
    q_func=model
    lr=1e-3
#    max_timesteps=100000
#    max_timesteps=50000
    max_timesteps=20000
    buffer_size=50000
    exploration_fraction=0.1
#    exploration_fraction=0.3
    exploration_final_eps=0.02
#    exploration_final_eps=0.1
    train_freq=1
    batch_size=32
    print_freq=10
    checkpoint_freq=10000
    learning_starts=1000
    gamma=1.
    target_network_update_freq=500
    prioritized_replay=False
#    prioritized_replay=True
    prioritized_replay_alpha=0.6
    prioritized_replay_beta0=0.4
    prioritized_replay_beta_iters=None
    prioritized_replay_eps=1e-6
    num_cpu=16
    
    deicticShape = (3,3,1)
    def make_obs_ph(name):
#        return U.BatchInput(env.observation_space.shape, name=name)
        return U.BatchInput(deicticShape, name=name)

    matchShape = (batch_size*25,)
    def make_match_ph(name):
        return U.BatchInput(matchShape, name=name)

    
    sess = U.make_session(num_cpu)
    sess.__enter__()

#    act, train, update_target, debug = build_graph.build_train(
#    getq, train, trainWOUpdate, update_target, debug = build_graph.build_train_deictic(
#    getq, train, trainWOUpdate, debug = build_graph.build_train_deictic(
#    getq, train, trainWOUpdate, update_target, debug = build_graph.build_train_deictic(
    getq, train, trainWOUpdate, update_target, debug = build_graph.build_train_deictic_min(
        make_obs_ph=make_obs_ph,
        make_match_ph=make_match_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=10
    )

    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': env.action_space.n,
    }

    # Create the replay buffer
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()


    episode_rewards = [0.0]
    saved_mean_reward = None
    obs = env.reset()
    
#    with tempfile.TemporaryDirectory() as td:
    model_saved = False
#        model_file = os.path.join(td, "model")
    for t in range(max_timesteps):
        
        # get action to take
#        action = act(np.array(obs)[None], update_eps=exploration.value(t))[0]
#        qvalues = getq(np.array(obs)[None])
#        action = np.argmax(qvalues)
#        if np.random.rand() < exploration.value(t):
#            action = np.random.randint(env.action_space.n)
        
        deicticObs = getDeicticObs(obs,3)
        qvalues = getq(np.array(deicticObs))
        action = np.argmax(np.max(qvalues,0))
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(env.action_space.n)
        
#        # temporarily take uniformly random actions all the time
#        action = np.random.randint(env.action_space.n)
        
        new_obs, rew, done, _ = env.step(action)
        
        # Store transition in the replay buffer.
        replay_buffer.add(obs, action, rew, new_obs, float(done))
        obs = new_obs
        
        episode_rewards[-1] += rew
        if done:
            obs = env.reset()
            episode_rewards.append(0.0)

        if t > learning_starts and t % train_freq == 0:
            
            # Get batch
            if prioritized_replay:
                experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(t))
                (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience
            else:
                obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)
                weights, batch_idxes = np.ones_like(rewards), None
            
            # Convert batch to deictic format
            obses_t_deic, actions_deic, obses_tp1_deic, weights_deic = getDeictic(obses_t, actions, obses_tp1, weights, 3)
            
            obses_t_deic_fingerprints = [np.reshape(obses_t_deic[i],[9]) for i in range(np.shape(obses_t_deic)[0])]
            _, _, fingerprintMatch = np.unique(obses_t_deic_fingerprints,axis=0,return_index=True,return_inverse=True)
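            # Each 3x3 deictic patch is flattened into a 9-element "fingerprint" and
            # np.unique maps every sample to the index of its unique patch; fingerprintMatch
            # is then fed to the deictic-min training op, presumably so identical patches
            # share a common target.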
#            matchTemplates = [fingerprintMatch == i for i in range(np.max(fingerprintMatch)+1)]
            
#            td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights)
#            td_errors = train(obses_t_deic, actions_deic, rewards, obses_tp1_deic, dones, weights_deic)
#            debug1, debug2, debug3 = trainWOUpdate(obses_t_deic, actions_deic, rewards, obses_tp1_deic, dones, weights_deic)
#            debug1, debug2, debug3, debug4 = trainWOUpdate(obses_t_deic, actions_deic, rewards, obses_tp1_deic, fingerprintMatch, dones, weights_deic)
            td_errors = train(obses_t_deic, actions_deic, rewards, obses_tp1_deic, fingerprintMatch, dones, weights_deic)

            if prioritized_replay:
                new_priorities = np.abs(td_errors) + prioritized_replay_eps
                replay_buffer.update_priorities(batch_idxes, new_priorities)

        if t > learning_starts and t % target_network_update_freq == 0:
            
            # Update target network periodically.
            update_target()

        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        
        if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))))
            

    num2avg = 20
    rListAvg = np.convolve(episode_rewards,np.ones(num2avg))/num2avg
    plt.plot(rListAvg)
#    plt.plot(episode_rewards)
    plt.show()

    sess
Exemplo n.º 22
class Agent:
    def __init__(self, dimO, dimA):
        dimA, dimO = dimA[0], dimO[0]
        self.dimA = dimA
        self.dimO = dimO

        tau = FLAGS.tau
        discount = FLAGS.discount
        l2norm = FLAGS.l2norm
        learning_rate = FLAGS.rate
        outheta = FLAGS.outheta
        ousigma = FLAGS.ousigma

        if FLAGS.icnn_opt == 'adam':
            self.opt = self.adam
        elif FLAGS.icnn_opt == 'bundle_entropy':
            self.opt = self.bundle_entropy
        else:
            raise RuntimeError("Unrecognized ICNN optimizer: "+FLAGS.icnn_opt)

        if FLAGS.use_per:
            self.rm = PrioritizedReplayBuffer(FLAGS.rmsize, alpha=FLAGS.alpha)
            self.beta_schedule = LinearSchedule(FLAGS.beta_iters,
                                                initial_p=FLAGS.beta0,
                                                final_p=1.0)
        else:
            self.rm = ReplayMemory(FLAGS.rmsize, dimO, dimA)


        self.sess = tf.Session(config=tf.ConfigProto(
            inter_op_parallelism_threads=FLAGS.thread,
            log_device_placement=False,
            allow_soft_placement=True,
            gpu_options=tf.GPUOptions(allow_growth=True)))

        self.noise = np.zeros(self.dimA)

        obs = tf.placeholder(tf.float32, [None, dimO], "obs")
        act = tf.placeholder(tf.float32, [None, dimA], "act")
        rew = tf.placeholder(tf.float32, [None], "rew")
        per_weight = tf.placeholder(tf.float32, [None], "per_weight")

        with tf.variable_scope('q'):
            negQ = self.negQ(obs, act)
        negQ_entr = negQ - entropy(act)
        q = -negQ
        q_entr = -negQ_entr
        act_grad, = tf.gradients(negQ, act)
        act_grad_entr, = tf.gradients(negQ_entr, act)

        obs_target = tf.placeholder(tf.float32, [None, dimO], "obs_target")
        act_target = tf.placeholder(tf.float32, [None, dimA], "act_target")
        term_target = tf.placeholder(tf.bool, [None], "term_target")
        with tf.variable_scope('q_target'):
            # double Q
            negQ_target = self.negQ(obs_target, act_target)
        negQ_entr_target = negQ_target - entropy(act_target)
        act_target_grad, = tf.gradients(negQ_target, act_target)
        act_entr_target_grad, = tf.gradients(negQ_entr_target, act_target)
        q_target = -negQ_target
        q_target_entr = -negQ_entr_target

        if FLAGS.icnn_opt == 'adam':
            y = tf.where(term_target, rew, rew + discount * q_target_entr)
            y = tf.maximum(q_entr - 1., y)
            y = tf.minimum(q_entr + 1., y)
            y = tf.stop_gradient(y)
            td_error = q_entr - y
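            # The target y is clamped to [q_entr - 1, q_entr + 1] before stop_gradient,
            # which (as I read it) bounds each TD error to [-1, 1].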
        elif FLAGS.icnn_opt == 'bundle_entropy':
            raise RuntimeError("Needs checking.")
            q_target = tf.where(term2, rew, rew + discount * q2_entropy)
            q_target = tf.maximum(q_entropy - 1., q_target)
            q_target = tf.minimum(q_entropy + 1., q_target)
            q_target = tf.stop_gradient(q_target)
            td_error = q_entropy - q_target

        if FLAGS.use_per:
            ms_td_error = tf.reduce_sum(tf.multiply(tf.square(td_error), per_weight), 0)
        else:
            ms_td_error = tf.reduce_mean(tf.square(td_error), 0)

        regLosses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES, scope='q/')
        loss_q = ms_td_error + l2norm*tf.reduce_sum(regLosses)

        self.theta_ = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='q/')
        self.theta_cvx_ = [v for v in self.theta_
                           if 'proj' in v.name and 'W:' in v.name]
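        # Convexity constraint: the z->z ('proj') weights must stay nonnegative, so they
        # are initialized to their absolute value (makeCvx) and projected back onto the
        # nonnegative orthant after every training step (proj).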
        self.makeCvx = [v.assign(tf.abs(v)) for v in self.theta_cvx_]
        self.proj = [v.assign(tf.maximum(v, 0)) for v in self.theta_cvx_]
        # self.proj = [v.assign(tf.abs(v)) for v in self.theta_cvx_]

        self.theta_target_ = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                               scope='q_target/')
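        # Soft (Polyak) target update: theta_target <- theta_target - tau * (theta_target - theta).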
        update_target = [theta_target_i.assign_sub(tau*(theta_target_i-theta_i))
                    for theta_i, theta_target_i in zip(self.theta_, self.theta_target_)]

        optim_q = tf.train.AdamOptimizer(learning_rate=learning_rate)
        grads_and_vars_q = optim_q.compute_gradients(loss_q)
        optimize_q = optim_q.apply_gradients(grads_and_vars_q)

        summary_path = os.path.join(model_path, 'board', FLAGS.exp_id)
        summary_writer = tf.summary.FileWriter(summary_path, self.sess.graph)


        if FLAGS.summary:
            if FLAGS.icnn_opt == 'adam':
                tf.summary.scalar('Q', tf.reduce_mean(q))
            elif FLAGS.icnn_opt == 'bundle_entropy':
                tf.summary.scalar('Q', tf.reduce_mean(q_entr))

            tf.summary.scalar('Q_target', tf.reduce_mean(q_target))
            tf.summary.scalar('loss', ms_td_error)
            tf.summary.scalar('reward', tf.reduce_mean(rew))
        merged = tf.summary.merge_all()


        # tf functions
        with self.sess.as_default():
            self._train = Fun([obs, act, rew, obs_target, act_target, term_target, per_weight],
                              [optimize_q, update_target, loss_q, td_error, q, q_target],
                              merged, summary_writer)
            self._fg = Fun([obs, act], [negQ, act_grad])
            self._fg_target = Fun([obs_target, act_target], [negQ_target, act_target_grad])
            self._fg_entr = Fun([obs, act], [negQ_entr, act_grad_entr])
            self._fg_entr_target = Fun([obs_target, act_target],
                                       [negQ_entr_target, act_entr_target_grad])

        # initialize tf variables
        self.saver = tf.train.Saver(max_to_keep=1)
        ckpt = tf.train.latest_checkpoint(model_path + "/tf")
        if  not FLAGS.force and ckpt:
            self.saver.restore(self.sess, ckpt)
        else:
            self.sess.run(tf.global_variables_initializer())
            self.sess.run(self.makeCvx)
            self.sess.run([theta_target_i.assign(theta_i)
                    for theta_i, theta_target_i in zip(self.theta_, self.theta_target_)])

        self.sess.graph.finalize()

        self.t = 0  # global training time (number of observations)

    def bundle_entropy(self, func, obs):
        act = np.ones((obs.shape[0], self.dimA)) * 0.5
        def fg(x):
            value, grad = func(obs, 2 * x - 1)
            grad *= 2
            return value, grad

        act = bundle_entropy.solveBatch(fg, act)[0]
        act = 2 * act - 1

        return act

    def adam(self, func, obs, plot=False):
        # if npr.random() < 1./20:
        #     plot = True
        b1 = 0.9
        b2 = 0.999
        lam = 0.5
        eps = 1e-8
        alpha = 0.01
        nBatch = obs.shape[0]
        act = np.zeros((nBatch, self.dimA))
        m = np.zeros_like(act)
        v = np.zeros_like(act)

        b1t, b2t = 1., 1.
        act_best, a_diff, f_best = [None]*3
        hist = {'act': [], 'f': [], 'g': []}
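        # Run Adam on the action variables to minimize negQ (i.e., maximize Q) inside the
        # [-1, 1] box, stopping early once the incumbent best action stops moving.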
        for i in range(1000):
            f, g = func(obs, act)
            if plot:
                hist['act'].append(act.copy())
                hist['f'].append(f)
                hist['g'].append(g)

            if i == 0:
                act_best = act.copy()
                f_best = f.copy()
            else:
                prev_act_best = act_best.copy()
                I = (f < f_best)
                act_best[I] = act[I]
                f_best[I] = f[I]
                a_diff_i = np.mean(np.linalg.norm(act_best - prev_act_best, axis=1))
                a_diff = a_diff_i if a_diff is None \
                         else lam*a_diff + (1.-lam)*a_diff_i
                # print(a_diff_i, a_diff, np.sum(f))
                if a_diff < 1e-3 and i > 5:
                    #print('  + Adam took {} iterations'.format(i))
                    if plot:
                        self.adam_plot(func, obs, hist)
                    return act_best

            m = b1 * m + (1. - b1) * g
            v = b2 * v + (1. - b2) * (g * g)
            b1t *= b1
            b2t *= b2
            mhat = m/(1.-b1t)
            vhat = v/(1.-b2t)

            act -= alpha * mhat / (np.sqrt(vhat) + eps)  # use the bias-corrected second moment (vhat was computed but unused)
            # act = np.clip(act, -1, 1)
            act = np.clip(act, -1.+1e-8, 1.-1e-8)

        #print('  + Warning: Adam did not converge.')
        if plot:
            self.adam_plot(func, obs, hist)
        return act_best

    def adam_plot(self, func, obs, hist):
        hist['act'] = np.array(hist['act']).T
        hist['f'] = np.array(hist['f']).T
        hist['g'] = np.array(hist['g']).T
        if self.dimA == 1:
            xs = np.linspace(-1.+1e-8, 1.-1e-8, 100)
            ys = [func(obs[[0],:], [[xi]])[0] for xi in xs]
            fig = plt.figure()
            plt.plot(xs, ys, alpha=0.5, linestyle="--")
            plt.plot(hist['act'][0,0,:], hist['f'][0,:], label="Adam's trace")
            plt.legend()

            os.makedirs(os.path.join(model_path, "adam"), exist_ok=True)
            t = time.time()
            fname = os.path.join(model_path, "adam", 'adam_plot_{}.png'.format(t))
            plt.savefig(fname)
            plt.close(fig)
        elif self.dimA == 2:
            assert(False)
        else:
            xs = npr.uniform(-1., 1., (5000, self.dimA))
            ys = np.array([func(obs[[0],:], [xi])[0] for xi in xs])
            epi = np.hstack((xs, ys))
            pca = PCA(n_components=2).fit(epi)
            W = pca.components_[:,:-1]
            xs_proj = xs.dot(W.T)
            fig = plt.figure()

            X = Y = np.linspace(xs_proj.min(), xs_proj.max(), 100)
            Z = griddata(xs_proj[:,0], xs_proj[:,1], ys.ravel(),
                         X, Y, interp='linear')

            plt.contourf(X, Y, Z, 15)
            plt.colorbar()

            adam_x = hist['act'][:,0,:].T
            adam_x = adam_x.dot(W.T)
            plt.plot(adam_x[:,0], adam_x[:,1], label='Adam', color='k')
            plt.legend()

            os.makedirs(os.path.join(model_path, "adam"), exist_ok=True)
            t = time.time()
            fname = os.path.join(model_path, "adam", 'adam_plot_{}.png'.format(t))
            plt.savefig(fname)
            plt.close(fig)

    def reset(self, obs):
        self.noise = np.zeros(self.dimA)
        self.observation = obs  # initial observation

    def act(self, test=False):
        with self.sess.as_default():
            #print('--- Selecting action, test={}'.format(test))
            obs = np.expand_dims(self.observation, axis=0)

            if FLAGS.icnn_opt == 'adam':
                f = self._fg_entr
                # f = self._fg
            elif FLAGS.icnn_opt == 'bundle_entropy':
                f = self._fg
            else:
                raise RuntimeError("Unrecognized ICNN optimizer: "+FLAGS.icnn_opt)

            tflearn.is_training(False)
            action = self.opt(f, obs)
            tflearn.is_training(not test)
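
            # Exploration: during training, add Ornstein-Uhlenbeck noise
            # (theta = FLAGS.outheta, sigma = FLAGS.ousigma) and clip back into [-1, 1].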

            if not test:
                self.noise -= FLAGS.outheta*self.noise - \
                              FLAGS.ousigma*npr.randn(self.dimA)
                action += self.noise
            action = np.clip(action, -1, 1)

            self.action = np.atleast_1d(np.squeeze(action, axis=0))
            return self.action

    def observe(self, rew, term, obs2, test=False):
        obs1 = self.observation
        self.observation = obs2

        # train
        if not test:
            self.t = self.t + 1

            if FLAGS.use_per:
                self.rm.add(obs1, self.action, rew, obs2, float(term))
            else:
                self.rm.enqueue(obs1, term, self.action, rew)

            if self.t > FLAGS.warmup:
                for i in range(FLAGS.iter):
                    loss = self.train()

    def train(self):
        with self.sess.as_default():
            if FLAGS.use_per:
                experience = self.rm.sample(FLAGS.bsize, beta=self.beta_schedule.value(self.t))
                (obs, act, rew, ob2, term2, weights, batch_idxes) = experience
            else:
                obs, act, rew, ob2, term2, info = self.rm.minibatch(size=FLAGS.bsize)
                weights = np.ones_like(rew)  # uniform importance weights when PER is disabled



            #if np.random.uniform() > 0.7 and np.sum(rew > 0.0) >0 :
            #    print("good reward samples", 100*np.sum(rew > 0.0) / FLAGS.bsize)
            if FLAGS.icnn_opt == 'adam':
                # f = self._opt_train_entr
                f = self._fg_entr_target
                # f = self._fg_target
            elif FLAGS.icnn_opt == 'bundle_entropy':
                f = self._fg_target
            else:
                raise RuntimeError("Unrecognized ICNN optimizer: "+FLAGS.icnn_opt)
            #print('--- Optimizing for training')
            tflearn.is_training(False)
            act2 = self.opt(f, ob2, plot=FLAGS.adam_plot)
            tflearn.is_training(True)

            _, _, loss, td_error, _, _ = self._train(obs, act, rew, ob2, act2,
                                                     term2, weights,
                                                     log=FLAGS.summary,
                                                     global_step=self.t)


            if FLAGS.use_per:
                new_priorities = np.abs(td_error) + FLAGS.eps
                self.rm.update_priorities(batch_idxes, new_priorities)

            self.sess.run(self.proj)
            return loss

    def negQ(self, x, y, reuse=False):
        szs = [FLAGS.l1size, FLAGS.l2size]
        assert(len(szs) >= 1)
        fc = tflearn.fully_connected
        bn = tflearn.batch_normalization
        lrelu = tflearn.activations.leaky_relu

        if reuse:
            tf.get_variable_scope().reuse_variables()

        nLayers = len(szs)
        us = []
        zs = []
        z_zs = []
        z_ys = []
        z_us = []

        reg = 'L2'
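
        # Partially input-convex network (PICNN): the first loop builds the state path
        # u_i; the second builds the z path, whose dependence on the action y stays convex
        # because the z->z ('proj') weights are kept nonnegative (see makeCvx / proj).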

        prevU = x
        for i in range(nLayers):
            with tf.variable_scope('u'+str(i)) as s:
                u = fc(prevU, szs[i], reuse=reuse, scope=s, regularizer=reg)
                if i < nLayers-1:
                    u = tf.nn.relu(u)
                    if FLAGS.icnn_bn:
                        u = bn(u, reuse=reuse, scope=s, name='bn')
            variable_summaries(u, suffix='u{}'.format(i))
            us.append(u)
            prevU = u

        prevU, prevZ = x, y
        for i in range(nLayers+1):
            sz = szs[i] if i < nLayers else 1
            z_add = []
            if i > 0:
                with tf.variable_scope('z{}_zu_u'.format(i)) as s:
                    zu_u = fc(prevU, szs[i-1], reuse=reuse, scope=s,
                              activation='relu', bias=True,
                              regularizer=reg, bias_init=tf.constant_initializer(1.))
                    variable_summaries(zu_u, suffix='zu_u{}'.format(i))
                with tf.variable_scope('z{}_zu_proj'.format(i)) as s:
                    z_zu = fc(tf.multiply(prevZ, zu_u), sz, reuse=reuse, scope=s,
                              bias=False, regularizer=reg)
                    variable_summaries(z_zu, suffix='z_zu{}'.format(i))
                z_zs.append(z_zu)
                z_add.append(z_zu)

            with tf.variable_scope('z{}_yu_u'.format(i)) as s:
                yu_u = fc(prevU, self.dimA, reuse=reuse, scope=s, bias=True,
                          regularizer=reg, bias_init=tf.constant_initializer(1.))
                variable_summaries(yu_u, suffix='yu_u{}'.format(i))
            with tf.variable_scope('z{}_yu'.format(i)) as s:
                z_yu = fc(tf.multiply(y, yu_u), sz, reuse=reuse, scope=s, bias=False,
                          regularizer=reg)
                z_ys.append(z_yu)
                variable_summaries(z_yu, suffix='z_yu{}'.format(i))
            z_add.append(z_yu)

            with tf.variable_scope('z{}_u'.format(i)) as s:
                z_u = fc(prevU, sz, reuse=reuse, scope=s,
                         bias=True, regularizer=reg,
                         bias_init=tf.constant_initializer(0.))
                variable_summaries(z_u, suffix='z_u{}'.format(i))
            z_us.append(z_u)
            z_add.append(z_u)

            z = tf.add_n(z_add)
            variable_summaries(z, suffix='z{}_preact'.format(i))
            if i < nLayers:
                # z = tf.nn.relu(z)
                z = lrelu(z, alpha=FLAGS.lrelu)
                variable_summaries(z, suffix='z{}_act'.format(i))

            zs.append(z)
            prevU = us[i] if i < nLayers else None
            prevZ = z

        z = tf.reshape(z, [-1], name='energies')
        return z


    def __del__(self):
        self.sess.close()
Exemplo n.º 23
0
def main():

    env = envstandalone.TestRob3Env()

    max_timesteps = 40000
    learning_starts = 1000
    buffer_size = 50000
    #    buffer_size=1
    exploration_fraction = 0.2
    exploration_final_eps = 0.02
    print_freq = 10
    gamma = .98
    target_network_update_freq = 500
    learning_alpha = 0.2

    batch_size = 32
    train_freq = 1

    obsShape = (8, 8, 1)
    #    deicticShape = (3,3,1)
    deicticShape = (3, 3, 2)
    num_deictic_patches = 36

    num_actions = 4
    episode_rewards = [0.0]
    num_cpu = 16
    num_cascade = 5

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # same as getDeictic except this one just calculates for the observation
    # input: n x n x channels
    # output: dn x dn x channels
    def getDeicticObs(obs):
        windowLen = deicticShape[0]
        deicticObs = []
        for i in range(np.shape(obs)[0] - windowLen + 1):
            for j in range(np.shape(obs)[1] - windowLen + 1):

                #                # one-channel output
                #                deicticObsThis = obs[i:i+windowLen,j:j+windowLen,:]

                # two channel output
                deicticObsThis = np.zeros(deicticShape)
                deicticObsThis[:, :, 0] = obs[i:i + windowLen, j:j + windowLen,
                                              0] == 10
                deicticObsThis[:, :, 1] = obs[i:i + windowLen, j:j + windowLen,
                                              0] == 20

                deicticObs.append(deicticObsThis)

        return np.array(deicticObs)

    # Same as getDeicticObs, but it operates on a batch rather than a single obs
    # input: obs -> batches x glances x 3 x 3 x 4
    def getDeicticObsBatch(obs):
        obsShape = np.shape(obs)
        deicticObsBatch = []
        for batch in range(obsShape[0]):
            deicticObsBatch.append(getDeicticObs(obs[batch]))
        shape = np.shape(deicticObsBatch)
        return (np.reshape(
            np.array(deicticObsBatch),
            [shape[0] * shape[1], shape[2], shape[3], shape[4]]))

    # CNN version
    # conv model parameters: (num_outputs, kernel_size, stride)
    model = models.cnn_to_mlp(convs=[(16, 3, 1)], hiddens=[16], dueling=True)

    #    # MLP version
    #    model = models.mlp([16, 32])

    q_func = model
    lr = 0.001

    def make_obs_ph(name):
        return U.BatchInput(obsShape, name=name)

    def make_obsDeic_ph(name):

        # CNN version
        return U.BatchInput(deicticShape, name=name)

#        # MLP version
#        return U.BatchInput([9], name=name)

    def make_target_ph(name):
        #        return U.BatchInput([num_actions], name=name)
        return U.BatchInput([num_cascade, num_actions], name=name)

    sess = U.make_session(num_cpu)
    sess.__enter__()

    getq = build_getq(make_obsDeic_ph=make_obsDeic_ph,
                      q_func=q_func,
                      num_actions=num_actions,
                      num_cascade=num_cascade)

    targetTrain = build_targetTrain(
        make_obsDeic_ph=make_obsDeic_ph,
        make_target_ph=make_target_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        num_cascade=num_cascade,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr))

    getDeic = build_getDeic(make_obs_ph=make_obs_ph, deicticShape=deicticShape)

    # Initialize the parameters and copy them to the target network.
    U.initialize()

    replay_buffer = ReplayBuffer(buffer_size)
    obs = env.reset()

    timerStart = time.time()
    for t in range(max_timesteps):

        #        obsDeictic = getDeicticObs(obs)
        obsDeictic = getDeic([obs])

        # CNN version
        qCurr = getq(np.array(obsDeictic))

        #        # MLP version
        #        qCurr = getq(np.reshape(obsDeictic,[-1,9]))

        # select action
        qCurrNoise = qCurr + np.random.random(np.shape(
            qCurr)) * 0.01  # add small amount of noise to break ties randomly
        action = np.argmax(np.max(qCurrNoise[:, -1, :], 0))
        selPatch = np.argmax(np.max(qCurrNoise[:, -1, :], 1))
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(env.action_space.n)

        # take action
        new_obs, rew, done, _ = env.step(action)
        replay_buffer.add(obs, action, rew, new_obs, float(done))

        # sample from replay buffer and train
        if t > learning_starts and t % train_freq == 0:

            # Sample from replay buffer
            obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
                batch_size)

            # Put observations in deictic form
            obses_t_deic = getDeic(obses_t)
            obses_tp1_deic = getDeic(obses_tp1)

            # Reshape everything to (1152,) form
            donesTiled = np.repeat(dones, num_deictic_patches)
            rewardsTiled = np.repeat(rewards, num_deictic_patches)
            actionsTiled = np.repeat(actions, num_deictic_patches)

            # Get curr, next values: CNN version
            qNext = getq(obses_tp1_deic)
            qCurr = getq(obses_t_deic)

            #            # Get curr, next values: MLP version
            #            qNext = getq(np.reshape(obses_tp1_deic,[-1,9]))
            #            qCurr = getq(np.reshape(obses_t_deic,[-1,9]))

            # This version pairs a glimpse with the same glimpse on the next time step
            qNextmax = np.max(qNext[:, -1, :], 1)

            #            # This version takes the max over all glimpses
            #            qNextTiled = np.reshape(qNext[:,-1,:],[batch_size,num_deictic_patches,num_actions])
            #            qNextmax = np.repeat(np.max(np.max(qNextTiled,2),1),num_deictic_patches)

            # Compute Bellman estimate
            targets = rewardsTiled + (1 - donesTiled) * gamma * qNextmax

            #            targetsTiled = np.tile(np.reshape(targets,[-1,1]),[1,num_cascade])

            qCurrTargets = np.copy(qCurr)

            #            # Copy into cascade without pruning
            #            for i in range(num_cascade):
            #                qCurrTargets[range(batch_size*num_deictic_patches),i,actionsTiled] = targets

            # Copy into cascade with pruning.
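            # Level 0 always receives the Bellman target; each deeper level i+1 only
            # accepts the target where it is smaller than the current level-i estimate,
            # giving a monotonically decreasing cascade of value estimates.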
            qCurrTargets[range(batch_size * num_deictic_patches), 0,
                         actionsTiled] = targets
            for i in range(num_cascade - 1):
                mask = targets < qCurrTargets[range(batch_size *
                                                    num_deictic_patches), i,
                                              actionsTiled]
                qCurrTargets[range(batch_size*num_deictic_patches),i+1,actionsTiled] = \
                    mask*targets + \
                    (1-mask)*qCurrTargets[range(batch_size*num_deictic_patches),i+1,actionsTiled]

            # CNN version
            td_error_out, obses_deic_out, targets_out = targetTrain(
                obses_t_deic, qCurrTargets)

            qCurrTargets  # no-op; leftover expression from debugging


#            # MLP version
#            td_error_out, obses_deic_out, targets_out = targetTrain(
#                    np.reshape(obses_t_deic,[-1,9]),
#                    qCurrTargets
#                    )

        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(
                episode_rewards) % print_freq == 0:
            timerFinal = time.time()
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " +
                  str(int(100 * exploration.value(t))) + ", time elapsed: " +
                  str(timerFinal - timerStart))
            timerStart = timerFinal

        obs = new_obs
Exemplo n.º 24
0
    def __init__(self, dimO, dimA):
        dimA, dimO = dimA[0], dimO[0]
        self.dimA = dimA
        self.dimO = dimO

        tau = FLAGS.tau
        discount = FLAGS.discount
        l2norm = FLAGS.l2norm
        learning_rate = FLAGS.rate
        outheta = FLAGS.outheta
        ousigma = FLAGS.ousigma

        if FLAGS.icnn_opt == 'adam':
            self.opt = self.adam
        elif FLAGS.icnn_opt == 'bundle_entropy':
            self.opt = self.bundle_entropy
        else:
            raise RuntimeError("Unrecognized ICNN optimizer: "+FLAGS.icnn_opt)

        if FLAGS.use_per:
            self.rm = PrioritizedReplayBuffer(FLAGS.rmsize, alpha=FLAGS.alpha)
            self.beta_schedule = LinearSchedule(FLAGS.beta_iters,
                                                initial_p=FLAGS.beta0,
                                                final_p=1.0)
        else:
            self.rm = ReplayMemory(FLAGS.rmsize, dimO, dimA)


        self.sess = tf.Session(config=tf.ConfigProto(
            inter_op_parallelism_threads=FLAGS.thread,
            log_device_placement=False,
            allow_soft_placement=True,
            gpu_options=tf.GPUOptions(allow_growth=True)))

        self.noise = np.zeros(self.dimA)

        obs = tf.placeholder(tf.float32, [None, dimO], "obs")
        act = tf.placeholder(tf.float32, [None, dimA], "act")
        rew = tf.placeholder(tf.float32, [None], "rew")
        per_weight = tf.placeholder(tf.float32, [None], "per_weight")

        with tf.variable_scope('q'):
            negQ = self.negQ(obs, act)
        negQ_entr = negQ - entropy(act)
        q = -negQ
        q_entr = -negQ_entr
        act_grad, = tf.gradients(negQ, act)
        act_grad_entr, = tf.gradients(negQ_entr, act)

        obs_target = tf.placeholder(tf.float32, [None, dimO], "obs_target")
        act_target = tf.placeholder(tf.float32, [None, dimA], "act_target")
        term_target = tf.placeholder(tf.bool, [None], "term_target")
        with tf.variable_scope('q_target'):
            # double Q
            negQ_target = self.negQ(obs_target, act_target)
        negQ_entr_target = negQ_target - entropy(act_target)
        act_target_grad, = tf.gradients(negQ_target, act_target)
        act_entr_target_grad, = tf.gradients(negQ_entr_target, act_target)
        q_target = -negQ_target
        q_target_entr = -negQ_entr_target

        if FLAGS.icnn_opt == 'adam':
            y = tf.where(term_target, rew, rew + discount * q_target_entr)
            y = tf.maximum(q_entr - 1., y)
            y = tf.minimum(q_entr + 1., y)
            y = tf.stop_gradient(y)
            td_error = q_entr - y
        elif FLAGS.icnn_opt == 'bundle_entropy':
            raise RuntimeError("Needs checking.")  # NOTE: the bundle_entropy branch below is unreachable
            q_target = tf.where(term2, rew, rew + discount * q2_entropy)
            q_target = tf.maximum(q_entropy - 1., q_target)
            q_target = tf.minimum(q_entropy + 1., q_target)
            q_target = tf.stop_gradient(q_target)
            td_error = q_entropy - q_target

        if FLAGS.use_per:
            ms_td_error = tf.reduce_sum(tf.multiply(tf.square(td_error), per_weight), 0)
        else:
            ms_td_error = tf.reduce_mean(tf.square(td_error), 0)

        regLosses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES, scope='q/')
        loss_q = ms_td_error + l2norm*tf.reduce_sum(regLosses)

        self.theta_ = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='q/')
        self.theta_cvx_ = [v for v in self.theta_
                           if 'proj' in v.name and 'W:' in v.name]
        self.makeCvx = [v.assign(tf.abs(v)) for v in self.theta_cvx_]
        self.proj = [v.assign(tf.maximum(v, 0)) for v in self.theta_cvx_]
        # self.proj = [v.assign(tf.abs(v)) for v in self.theta_cvx_]

        self.theta_target_ = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                               scope='q_target/')
        update_target = [theta_target_i.assign_sub(tau*(theta_target_i-theta_i))
                    for theta_i, theta_target_i in zip(self.theta_, self.theta_target_)]

        optim_q = tf.train.AdamOptimizer(learning_rate=learning_rate)
        grads_and_vars_q = optim_q.compute_gradients(loss_q)
        optimize_q = optim_q.apply_gradients(grads_and_vars_q)

        summary_path = os.path.join(model_path, 'board', FLAGS.exp_id)
        summary_writer = tf.summary.FileWriter(summary_path, self.sess.graph)


        if FLAGS.summary:
            if FLAGS.icnn_opt == 'adam':
                tf.summary.scalar('Q', tf.reduce_mean(q))
            elif FLAGS.icnn_opt == 'bundle_entropy':
                tf.summary.scalar('Q', tf.reduce_mean(q_entr))

            tf.summary.scalar('Q_target', tf.reduce_mean(q_target))
            tf.summary.scalar('loss', ms_td_error)
            tf.summary.scalar('reward', tf.reduce_mean(rew))
        merged = tf.summary.merge_all()


        # tf functions
        with self.sess.as_default():
            self._train = Fun([obs, act, rew, obs_target, act_target, term_target, per_weight],
                              [optimize_q, update_target, loss_q, td_error, q, q_target],
                              merged, summary_writer)
            self._fg = Fun([obs, act], [negQ, act_grad])
            self._fg_target = Fun([obs_target, act_target], [negQ_target, act_target_grad])
            self._fg_entr = Fun([obs, act], [negQ_entr, act_grad_entr])
            self._fg_entr_target = Fun([obs_target, act_target],
                                       [negQ_entr_target, act_entr_target_grad])

        # initialize tf variables
        self.saver = tf.train.Saver(max_to_keep=1)
        ckpt = tf.train.latest_checkpoint(model_path + "/tf")
        if  not FLAGS.force and ckpt:
            self.saver.restore(self.sess, ckpt)
        else:
            self.sess.run(tf.global_variables_initializer())
            self.sess.run(self.makeCvx)
            self.sess.run([theta_target_i.assign(theta_i)
                    for theta_i, theta_target_i in zip(self.theta_, self.theta_target_)])

        self.sess.graph.finalize()

        self.t = 0  # global training time (number of observations)
Exemplo n.º 25
0
def main():

#    env = gym.make("CartPoleRob-v0")
#    env = gym.make("CartPole-v0")
#    env = gym.make("CartPole-v1")
#    env = gym.make("Acrobot-v1")
#    env = gym.make("MountainCarRob-v0")
#    env = gym.make("FrozenLake-v0")
#    env = gym.make("FrozenLake8x8-v0")
    env = gym.make("FrozenLake8x8nohole-v0")
    
#    robShape = (2,)
#    robShape = (3,)
#    robShape = (200,)
#    robShape = (16,)
    robShape = (64,)
    def make_obs_ph(name):
#        return U.BatchInput(env.observation_space.shape, name=name)
        return U.BatchInput(robShape, name=name)

#    # these params are specific to mountaincar
#    def getOneHotObs(obs):
#        obsFraction = (obs[0] + 1.2) / 1.8
#        idx1 = np.int32(np.trunc(obsFraction*100))
#        obsFraction = (obs[1] + 0.07) / 0.14
#        idx2 = np.int32(np.trunc(obsFraction*100))
#        ident = np.identity(100)
#        return np.r_[ident[idx1,:],ident[idx2,:]]

    # these params are specific to frozenlake
    def getOneHotObs(obs):
#        ident = np.identity(16)
        ident = np.identity(64)
        return ident[obs,:]

    model = models.mlp([32])
#    model = models.mlp([64])
#    model = models.mlp([64], layer_norm=True)
#    model = models.mlp([16, 16])

    # parameters
    q_func=model
    lr=1e-3
#    max_timesteps=100000
    max_timesteps=50000
#    max_timesteps=10000
    buffer_size=50000
    exploration_fraction=0.1
#    exploration_fraction=0.3
    exploration_final_eps=0.02
#    exploration_final_eps=0.1
    train_freq=1
    batch_size=32
    print_freq=10
    checkpoint_freq=10000
    learning_starts=1000
    gamma=1.0
    target_network_update_freq=500
#    prioritized_replay=False
    prioritized_replay=True
    prioritized_replay_alpha=0.6
    prioritized_replay_beta0=0.4
    prioritized_replay_beta_iters=None
    prioritized_replay_eps=1e-6
    num_cpu=16

#    # try mountaincar w/ different input dimensions
#    inputDims = [50,2]
    
    sess = U.make_session(num_cpu)
    sess.__enter__()

    act, train, update_target, debug = build_graph.build_train(
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=10
    )

    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': env.action_space.n,
    }

    # Create the replay buffer
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()


    episode_rewards = [0.0]
    saved_mean_reward = None
    obs = env.reset()
    obs = getOneHotObs(obs)
    
#    with tempfile.TemporaryDirectory() as td:
    model_saved = False
#        model_file = os.path.join(td, "model")
    for t in range(max_timesteps):

        # Take action and update exploration to the newest value
        action = act(np.array(obs)[None], update_eps=exploration.value(t))[0]
        new_obs, rew, done, _ = env.step(action)
        new_obs = getOneHotObs(new_obs)
        
        # Store transition in the replay buffer.
        replay_buffer.add(obs, action, rew, new_obs, float(done))
        obs = new_obs
        
        episode_rewards[-1] += rew
        if done:
            obs = env.reset()
            obs = getOneHotObs(obs)
            episode_rewards.append(0.0)

        if t > learning_starts and t % train_freq == 0:
            
            # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
            if prioritized_replay:
                experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(t))
                (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience
            else:
                obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)
                weights, batch_idxes = np.ones_like(rewards), None
            
            td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights)
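
            # PER: priorities are set to |TD error| + eps so that every transition keeps
            # a nonzero probability of being resampled.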

            if prioritized_replay:
                new_priorities = np.abs(td_errors) + prioritized_replay_eps
                replay_buffer.update_priorities(batch_idxes, new_priorities)

        if t > learning_starts and t % target_network_update_freq == 0:
            
            # Update target network periodically.
            update_target()

        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        
        if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
#        if done:
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))))
#            if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
#                logger.record_tabular("steps", t)
#                logger.record_tabular("episodes", num_episodes)
#                logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
#                logger.record_tabular("% time spent exploring", int(100 * exploration.value(t)))
#                logger.dump_tabular()
#        sess
            

    num2avg = 20
    rListAvg = np.convolve(episode_rewards,np.ones(num2avg))/num2avg
    plt.plot(rListAvg)
#    plt.plot(episode_rewards)
    plt.show()

    sess  # no-op; leftover session reference
Exemplo n.º 26
0
def main(max_timesteps):

    np.set_printoptions(formatter={'float_kind': lambda x: "%.2f" % x})

    env = envstandalone.BlockArrange()

    # Standard q-learning parameters
    #    max_timesteps=30000
    #    exploration_fraction=0.3
    exploration_fraction = 1
    exploration_final_eps = 0.1
    gamma = .90
    num_cpu = 16

    # Used by buffering and DQN
    learning_starts = 10
    buffer_size = 10000
    batch_size = 10
    target_network_update_freq = 1
    train_freq = 1
    print_freq = 1
    lr = 0.0003

    # first two elts of deicticShape must be odd
    num_patches = env.maxSide**2
    num_actions = 2 * num_patches
    #    valueFunctionType = "TABULAR"
    valueFunctionType = "DQN"

    fullImageSize = (env.maxSide, env.maxSide, 1)

    episode_rewards = [0.0]

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    prioritized_replay = False
    prioritized_replay_alpha = 0.6
    prioritized_replay_beta0 = 0.4
    prioritized_replay_beta_iters = None
    prioritized_replay_eps = 1e-6
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size,
                                                alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None
    beta = 1

    q_func = models.cnn_to_mlp(convs=[(16, 3, 1), (32, 3, 1)],
                               hiddens=[48],
                               dueling=True)

    def make_fullImage_ph(name):
        return U.BatchInput(fullImageSize, name=name)

    def make_target_fullstate_ph(name):
        return U.BatchInput([num_actions], name=name)

    def make_weight_fullstate_ph(name):
        return U.BatchInput([num_actions], name=name)

    if valueFunctionType == 'DQN':
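        # Two full-state Q networks are built, one per discrete "holding" state; the
        # observation's holding bit selects which network is queried and trained.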

        getqFullStateNotHolding = build_getq_fullstate(
            make_fullImage_ph=make_fullImage_ph,
            q_func=q_func,
            num_actions=num_actions,
            num_cascade=1,
            scope="deepq",
            qscope="q_func_fullstate_notholding",
            reuse=None)
        getqFullStateHolding = build_getq_fullstate(
            make_fullImage_ph=make_fullImage_ph,
            q_func=q_func,
            num_actions=num_actions,
            num_cascade=1,
            scope="deepq",
            qscope="q_func_fullstate_holding",
            reuse=None)

        targetTrainFullStateNotHolding = build_targetTrain_fullstate(
            make_fullImage_ph=make_fullImage_ph,
            make_target_ph=make_target_fullstate_ph,
            make_weight_ph=make_weight_fullstate_ph,
            q_func=q_func,
            num_actions=num_actions,
            num_cascade=5,
            optimizer=tf.train.AdamOptimizer(learning_rate=lr),
            scope="deepq",
            qscope="q_func_fullstate_notholding",
            grad_norm_clipping=None,
            reuse=None)
        targetTrainFullStateHolding = build_targetTrain_fullstate(
            make_fullImage_ph=make_fullImage_ph,
            make_target_ph=make_target_fullstate_ph,
            make_weight_ph=make_weight_fullstate_ph,
            q_func=q_func,
            num_actions=num_actions,
            num_cascade=5,
            optimizer=tf.train.AdamOptimizer(learning_rate=lr),
            scope="deepq",
            qscope="q_func_fullstate_holding",
            grad_norm_clipping=None,
            reuse=None)

    sess = U.make_session(num_cpu)
    sess.__enter__()

    obs = env.reset()
    episode_rewards = [0.0]
    timerStart = time.time()
    U.initialize()

    for t in range(max_timesteps):

        # Get qCurr values
        if obs[1]:
            qCurr = getqFullStateHolding([obs[0]])
        else:
            qCurr = getqFullStateNotHolding([obs[0]])

        # select action at random
        qCurrNoise = qCurr + np.random.random(np.shape(
            qCurr)) * 0.01  # add small amount of noise to break ties randomly
        action = np.argmax(qCurrNoise)
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(num_actions)

        # take action
        new_obs, rew, done, _ = env.step(action)

        # stateImage_t, stateDiscrete_t, actionDiscrete_t, reward, stateImage_tp1, stateDiscrete_tp1, done
        replay_buffer.add(np.copy(obs[0]), np.copy(obs[1]), np.copy(action),
                          np.copy(rew), np.copy(new_obs[0]),
                          np.copy(new_obs[1]), np.copy(float(done)))

        if t > learning_starts and t % train_freq == 0:

            states_images_t, states_discrete_t, actions, rewards, states_images_tp1, states_discrete_tp1, dones = replay_buffer.sample(
                batch_size)
            weights, batch_idxes = np.ones_like(rewards), None

            qNextNotHolding = getqFullStateNotHolding(states_images_tp1)
            qNextHolding = getqFullStateHolding(states_images_tp1)
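
            # Bootstrap using the Q head that matches the next discrete state
            # (holding vs. not holding), then take the max over actions for the target.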

            qNext = np.stack([qNextNotHolding, qNextHolding], axis=2)
            qNextmax = np.max(qNext[range(batch_size), :, states_discrete_tp1],
                              axis=1)
            targets = rewards + (1 - dones) * gamma * qNextmax

            qCurrNotHoldingBatch = getqFullStateNotHolding(states_images_t)
            qCurrHoldingBatch = getqFullStateHolding(states_images_t)

            qCurrTargetBatch = np.stack(
                [qCurrNotHoldingBatch, qCurrHoldingBatch], axis=2)
            qCurrTargetBatch[range(batch_size), actions,
                             states_discrete_t] = targets

            targetTrainFullStateNotHolding(
                states_images_t, qCurrTargetBatch[:, :, 0],
                np.tile(np.reshape(weights, [batch_size, 1]),
                        [1, num_actions]))
            targetTrainFullStateHolding(
                states_images_t, qCurrTargetBatch[:, :, 1],
                np.tile(np.reshape(weights, [batch_size, 1]),
                        [1, num_actions]))

        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-51:-1]), 1)
        #        mean_100ep_tderror = round(np.mean(td_errors[-51:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(
                episode_rewards) % print_freq == 0:
            timerFinal = time.time()
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " +
                  str(int(100 * exploration.value(t))) + ", time elapsed: " +
                  str(timerFinal - timerStart))
            timerStart = timerFinal

        obs = copy.deepcopy(
            new_obs)  # without this deepcopy, RL totally fails...

    # save learning curve
    filename = 'BAR2_rewards_' + str(num_patches) + "_" + str(
        max_timesteps) + '.dat'
    np.savetxt(filename, episode_rewards)
Exemplo n.º 27
0
    def learn(self, total_timesteps=None, total_episodes=None, log_interval=100, ckpt_interval=100, ckpt_path=None):

        def _sample_episode():
            sample = []
            obs = self.env.reset()
            done = False

            while not done:
                update_eps = self.exploration.value(self.ep_done)
                
                if np.random.random_sample() > update_eps:
                    action, value = self.policy.predict(obs, deterministic=True)
                else:
                    action, value = self.policy.predict(obs, deterministic=False)

                new_obs, reward, done, info = self.env.step(action)

                sample.append((obs, action, reward))
                obs = new_obs

            return sample

        episode_rewards = []
        episode_successes = []
        loop_var = total_timesteps if total_timesteps is not None else total_episodes

        if total_timesteps is not None:
            raise ValueError('Only total_episodes can be specified for this class')

        # if self.exploration_frac is None:
        #     self.exploration = LinearSchedule(frac=self.exploration_ep,
        #                                       initial=self.exploration_initial_eps,
        #                                       final=self.exploration_final_eps)
        # else:
        #     self.exploration = LinearSchedule(frac=self.exploration_frac * loop_var,
        #                                       initial=self.exploration_initial_eps,
        #                                       final=self.exploration_final_eps)

        if self.exploration_type == 'linear':
            self.exploration = LinearSchedule(
                frac=self.exploration_frac * loop_var,
                initial=self.exploration_initial_eps,
                final=self.exploration_final_eps)
        elif self.exploration_type == 'exponential':
            self.exploration = ExponentialSchedule(
                frac=self.exploration_frac,
                initial=self.exploration_initial_eps,
                final=self.exploration_final_eps)

        train = True
        ep_reward = 0

        while train:
            sample = _sample_episode()
            obses, actions, rewards = zip(*sample)
            self.ep_reward = np.sum(rewards)
            for idx in range(len(sample)):
                self.elapsed_steps += 1
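                # Every-visit Monte Carlo update: move Q(s_t, a_t) toward the discounted
                # return observed from step t to the end of the sampled episode.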
                discounts = np.array([self.gamma**i for i in range(len(obses)+1)])
                expected_reward = sum(rewards[idx:]*discounts[:-(1+idx)]) - self.qvalues[obses[idx], actions[idx]]
                self.qvalues[obses[idx], actions[idx]] += self.learning_rate * expected_reward
                # print(np.where(self.qvalues!=0))

                if self.policy.intent:
                    intent_update = np.zeros(self.qvalues.shape)
                    for obs, action in zip(obses[idx:], actions[idx:]):
                        intent_update[obs, action] += self.learning_rate
                    tmp = self.hvalues[obses[idx], actions[idx]] * (1-self.learning_rate)
                    tmp += intent_update
                    self.hvalues[obses[idx], actions[idx]] = tmp
                

            self.ep_done += 1
            # NOTE: last_100rewards is assumed to be a length-100 rolling buffer defined
            # elsewhere in the class; store the return of the episode just sampled.
            last_100rewards[self.ep_done % 100] = self.ep_reward
            print("\rEpisode {}/{}, Average Reward {}".format(self.ep_done, total_episodes, np.nanmean(last_100rewards)), end="")
            # print(len(sample))
            ep_reward = 0

            if self.ep_done >= total_episodes:
                train = False
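
            # NOTE: loop_type and done below are assumed to be flags/attributes defined
            # elsewhere in the class; only the episode-based branch applies here.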

            if ckpt_path is not None and ckpt_interval:
                if loop_type == 'episode':
                    if self.ep_done % ckpt_interval == 0 and done:
                        ckpt_str = str(self.ep_done)
                        full_path = ckpt_path + '/' + ckpt_str
                        # super(DBNModel, self).save(full_path)
                        super(MCTabularRLModel, self).save(full_path)
                if loop_type == 'timesteps':
                    if self.elapsed_steps % ckpt_interval == 0 and done:
                        ckpt_str = str(self.ep_done)
                        full_path = ckpt_path + '/' + ckpt_str
                        # super(DBNModel, self).save(full_path)
                        super(MCTabularRLModel, self).save(full_path)
Exemplo n.º 28
0
def main():

    np.set_printoptions(formatter={'float_kind': lambda x: "%.2f" % x})

    # Dictionary-based value function
    q_func_tabular = {}

    # cols of vectorKey must be boolean less than 64 bits long
    def getTabularKeys(vectorKey):
        obsBits = np.packbits(vectorKey, 1)
        obsKeys = 0
        for i in range(np.shape(obsBits)[1]):
            # IMPORTANT: the number of bits in the type cast below (UINT64) must be at least as big
            # as the bits required to encode obsBits. If it is too small, we get hash collisions...
            obsKeys = obsKeys + (256**i) * np.uint64(obsBits[:, i])
        return obsKeys
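
    # Unvisited keys default to a value of 10 in getTabular below, an optimistic
    # initialization that biases exploration toward untried action patches.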

    def getTabular(vectorKey):
        keys = getTabularKeys(vectorKey)
        return np.array([
            q_func_tabular[x] if x in q_func_tabular else 10 *
            np.ones(num_states) for x in keys
        ])

#    def trainTabular(vectorKey,qCurrTargets,weights):

    def trainTabular(vectorKey, qCurrTargets):
        keys = getTabularKeys(vectorKey)
        alpha = 0.2
        for i in range(len(keys)):
            if keys[i] in q_func_tabular:
                q_func_tabular[keys[i]] = (1 - alpha) * q_func_tabular[
                    keys[i]] + alpha * qCurrTargets[i]
#                q_func_tabular[keys[i]] = q_func_tabular[keys[i]] + alpha*weights[i,:]*(qCurrTargets[i] - q_func_tabular[keys[i]]) # (1-alpha)*q_func[keys[i]] + alpha*qCurrTargets[i]
            else:
                q_func_tabular[keys[i]] = qCurrTargets[i]

    env = envstandalone.BlockArrange()

    max_timesteps = 4000
    exploration_fraction = 0.3
    exploration_final_eps = 0.1
    print_freq = 1
    gamma = .90
    num_cpu = 16

    # first two elts of deicticShape must be odd
    actionShape = (3, 3, 2)
    num_states = 2  # either holding or not
    num_patches = env.maxSide**2
    num_actions = 2 * num_patches
    num_actions_discrete = 2
    valueFunctionType = "TABULAR"
    #    valueFunctionType = "DQN"
    #    actionSelectionStrategy = "UNIFORM_RANDOM" # actions are selected randomly from collection of all actions
    actionSelectionStrategy = "RANDOM_UNIQUE"  # each unique action descriptor has equal chance of being selected

    episode_rewards = [0.0]

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    def make_obs_ph(name):
        return U.BatchInput(env.observation_space.spaces[0].shape, name=name)

    getMoveActionDescriptors = build_getMoveActionDescriptors(
        make_obs_ph=make_obs_ph, deicticShape=actionShape)

    sess = U.make_session(num_cpu)
    sess.__enter__()

    obs = env.reset()

    episode_rewards = [0.0]
    timerStart = time.time()
    for t in range(max_timesteps):

        # Get action set: <num_patches> pick actions followed by <num_patches> place actions
        moveDescriptors = getMoveActionDescriptors([obs[0]])
        moveDescriptors = moveDescriptors * 2 - 1
        actionsPickDescriptors = np.stack(
            [moveDescriptors,
             np.zeros(np.shape(moveDescriptors))], axis=3)
        actionsPlaceDescriptors = np.stack(
            [np.zeros(np.shape(moveDescriptors)), moveDescriptors], axis=3)
        actionDescriptors = np.r_[actionsPickDescriptors,
                                  actionsPlaceDescriptors]

        # Get qCurr values
        actionDescriptorsFlat = np.reshape(
            actionDescriptors,
            [-1, actionShape[0] * actionShape[1] * actionShape[2]]) == 1
        qCurr = getTabular(actionDescriptorsFlat)

        qCurrNoise = qCurr + np.random.random(np.shape(
            qCurr)) * 0.01  # add small amount of noise to break ties randomly

        # select action at random
        if actionSelectionStrategy == "UNIFORM_RANDOM":
            action = np.argmax(qCurrNoise[:, obs[1]])
            if np.random.rand() < exploration.value(t):
                action = np.random.randint(num_actions)
        elif actionSelectionStrategy == "RANDOM_UNIQUE":
            _, idx, inv = np.unique(actionDescriptors,
                                    axis=0,
                                    return_index=True,
                                    return_inverse=True)
            actionIdx = np.argmax(qCurrNoise[idx, obs[1]])
            if np.random.rand() < exploration.value(t):
                actionIdx = np.random.randint(len(idx))
            actionsSelected = np.nonzero(inv == actionIdx)[0]
            action = actionsSelected[np.random.randint(len(actionsSelected))]
        else:
            print("Error...")


#        env.render()

# take action
        new_obs, rew, done, _ = env.step(action)

        #        print("action: " + str(action) + ", reward: " + str(rew) + ", done: " + str(done))
        #        print("action patch:\n" + str(actionDescriptors[action,:]))
        #        if done:
        #            print("*** done ***")

        # Get action set: <num_patches> pick actions followed by <num_patches> place actions
        moveDescriptorsNext = getMoveActionDescriptors([new_obs[0]])
        moveDescriptorsNext = moveDescriptorsNext * 2 - 1

        actionsPickDescriptorsNext = np.stack(
            [moveDescriptorsNext,
             np.zeros(np.shape(moveDescriptorsNext))],
            axis=3)
        actionsPlaceDescriptorsNext = np.stack(
            [np.zeros(np.shape(moveDescriptorsNext)), moveDescriptorsNext],
            axis=3)
        actionDescriptorsNext = np.stack(
            [actionsPickDescriptorsNext, actionsPlaceDescriptorsNext], axis=0)
        actionDescriptorsNext = np.reshape(actionDescriptorsNext, [
            num_patches * num_actions_discrete, actionShape[0], actionShape[1],
            actionShape[2]
        ])

        actionDescriptorsNextFlat = np.reshape(
            actionDescriptorsNext,
            [num_patches * num_actions_discrete, -1]) == 1
        qNext = getTabular(actionDescriptorsNextFlat)

        # Calculate TD target
        qNextmax = np.max(qNext[:, new_obs[1]])
        target = rew + (1 - done) * gamma * qNextmax

        # Update value function
        qCurrTarget = qCurr[action, :]
        qCurrTarget[obs[1]] = target  # target avg value
        trainTabular([actionDescriptorsFlat[action, :]], [qCurrTarget])

        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(
                episode_rewards) % print_freq == 0:
            timerFinal = time.time()
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " +
                  str(int(100 * exploration.value(t))) + ", time elapsed: " +
                  str(timerFinal - timerStart))
            timerStart = timerFinal

        obs = np.copy(new_obs)

    # display value function
    obs = env.reset()
    moveDescriptors = getMoveActionDescriptors([obs[0]])
    moveDescriptors = moveDescriptors * 2 - 1

    actionsPickDescriptors = np.stack(
        [moveDescriptors, np.zeros(np.shape(moveDescriptors))], axis=3)
    actionsPlaceDescriptors = np.stack(
        [np.zeros(np.shape(moveDescriptors)), moveDescriptors], axis=3)

    print(str(obs[0][:, :, 0]))

    #    qPickNotHolding = getqNotHolding(actionsPickDescriptors)
    #    qPickHolding = getqHolding(actionsPickDescriptors)
    #    qPick = np.concatenate([qPickNotHolding,qPickHolding],axis=1)

    qPick = getTabular(
        np.reshape(actionsPickDescriptors, [num_patches, -1]) == 1)
    print("Value function for pick action in hold-nothing state:")
    print(str(np.reshape(qPick[:, 0], [8, 8])))
    print("Value function for pick action in hold-1 state:")
    print(str(np.reshape(qPick[:, 1], [8, 8])))

    #    qPlaceNotHolding = getqNotHolding(actionsPlaceDescriptors)
    #    qPlaceHolding = getqHolding(actionsPlaceDescriptors)
    #    qPlace = np.concatenate([qPlaceNotHolding,qPlaceHolding],axis=1)

    qPlace = getTabular(
        np.reshape(actionsPlaceDescriptors, [num_patches, -1]) == 1)
    print("Value function for place action in hold-nothing state:")
    print(str(np.reshape(qPlace[:, 0], [8, 8])))
    print("Value function for place action in hold-1 state:")
    print(str(np.reshape(qPlace[:, 1], [8, 8])))
Exemplo n.º 29
0
def main():
    
    np.set_printoptions(formatter={'float_kind':lambda x: "%.2f" % x})

    env = envstandalone.BlockArrange()

    # Standard q-learning parameters
    max_timesteps=50000
    exploration_fraction=0.3
    exploration_final_eps=0.1
    gamma=.90
    num_cpu = 16

    # Used by buffering and DQN
    learning_starts=10
    buffer_size=1
    batch_size=1
    target_network_update_freq=1
    train_freq=1
    print_freq=1
    lr=0.0003

    # first two elts of deicticShape must be odd
#    actionShape = (3,3,2)
    patchShape = (3,3,1)
    lookstackShape = (3,3,2)
    lookShape = (3,3,3)
    ppShape = (3,3,2)
#    num_states = 2 # either holding or not
    num_patches = env.maxSide**2
    num_actions_discrete = 2
    num_actions = num_patches + num_actions_discrete
    valueFunctionType = "DQN"
    actionSelectionStrategy = "UNIFORM_RANDOM" # actions are selected randomly from collection of all actions
#    actionSelectionStrategy = "RANDOM_UNIQUE" # each unique action descriptor has equal chance of being selected

    episode_rewards = [0.0]
    
    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

#    prioritized_replay=True
    prioritized_replay=False
#    prioritized_replay_alpha=1.0
    prioritized_replay_alpha=0.6
    prioritized_replay_beta0=0.4
    prioritized_replay_beta_iters=None
#    prioritized_replay_beta_iters=20000
    prioritized_replay_eps=1e-6
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None
    beta = 1

    q_func = models.cnn_to_mlp(
#    q_func = models.cnn_to_mlp_2pathways(
#        convs=[(16,3,1), (32,3,1)],
#        hiddens=[48],
        convs=[(32,3,1)],
        hiddens=[48],
#        convs=[(48,3,1)],
#        hiddens=[48],
        dueling=True
    )

    def displayLookStack(lookStack):
        np.set_printoptions(formatter={'float_kind':lambda x: "%.2f" % x})
        lookStack1 = str(lookStack[:,:,0])
        lookStack1 = np.core.defchararray.replace(lookStack1,".00","")
        lookStack1 = np.core.defchararray.replace(lookStack1,".","")
        lookStack1 = np.core.defchararray.replace(lookStack1,"0",".")
        lookStack2 = str(lookStack[:,:,1])
        lookStack2 = np.core.defchararray.replace(lookStack2,".00","")
        lookStack2 = np.core.defchararray.replace(lookStack2,".","")
        lookStack2 = np.core.defchararray.replace(lookStack2,"0",".")
        print("lookStack:")
        print(lookStack1)
        print(lookStack2)

    def make_obs_ph(name):
        return U.BatchInput(env.observation_space.spaces[0].shape, name=name)

    def make_lookDeic_ph(name):
        return U.BatchInput(lookShape, name=name)

    def make_ppDeic_ph(name):
        return U.BatchInput(ppShape, name=name)

    def make_target_ph(name):
        return U.BatchInput([1], name=name)

    def make_weight_ph(name):
        return U.BatchInput([1], name=name)

    getMoveActionDescriptors = build_getMoveActionDescriptors(make_obs_ph=make_obs_ph,deicticShape=lookShape)
    
    getqLookNotHolding = build_getq(
            make_deic_ph=make_lookDeic_ph,
            q_func=q_func,
            scope="deepq",
            qscope="q_func_LookNotHolding"
            )
    getqLookHolding = build_getq(
            make_deic_ph=make_lookDeic_ph,
            q_func=q_func,
            scope="deepq",
            qscope="q_func_LookHolding"
            )
    getqPPNotHolding = build_getq(
            make_deic_ph=make_ppDeic_ph,
            q_func=q_func,
            scope="deepq",
            qscope="q_func_PPNotHolding"
            )
    getqPPHolding = build_getq(
            make_deic_ph=make_ppDeic_ph,
            q_func=q_func,
            scope="deepq",
            qscope="q_func_PPHolding"
            )
    
    targetTrainLookNotHolding = build_targetTrain(
        make_deic_ph=make_lookDeic_ph,
        make_target_ph=make_target_ph,
        make_weight_ph=make_weight_ph,
        q_func=q_func,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        scope="deepq", 
        qscope="q_func_LookNotHolding",
        grad_norm_clipping=1.
    )
    targetTrainLookHolding = build_targetTrain(
        make_deic_ph=make_lookDeic_ph,
        make_target_ph=make_target_ph,
        make_weight_ph=make_weight_ph,
        q_func=q_func,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        scope="deepq", 
        qscope="q_func_LookHolding",
        grad_norm_clipping=1.
    )
    targetTrainPPNotHolding = build_targetTrain(
        make_deic_ph=make_ppDeic_ph,
        make_target_ph=make_target_ph,
        make_weight_ph=make_weight_ph,
        q_func=q_func,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        scope="deepq", 
        qscope="q_func_PPNotHolding",
        grad_norm_clipping=1.
    )
    targetTrainPPHolding = build_targetTrain(
        make_deic_ph=make_ppDeic_ph,
        make_target_ph=make_target_ph,
        make_weight_ph=make_weight_ph,
        q_func=q_func,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        scope="deepq", 
        qscope="q_func_PPHolding",
        grad_norm_clipping=1.
    )
        
    sess = U.make_session(num_cpu)
    sess.__enter__()

    obs = env.reset()
    lookStack = np.zeros(lookstackShape)
    lookStackNext = np.zeros(lookstackShape)
    
    episode_rewards = [0.0]
    td_errors = [0.0]
    timerStart = time.time()
    U.initialize()
    for t in range(max_timesteps):
        
        # Get action set: <num_patches> pick actions followed by <num_patches> place actions
        moveDescriptors = getMoveActionDescriptors([obs[0]])
        moveDescriptors = moveDescriptors*2-1
        moveDescriptors = np.reshape(moveDescriptors,[num_patches,patchShape[0],patchShape[1],patchShape[2]])
        looksStackTiled = np.tile(lookStack,[num_patches,1,1,1])
        lookDescriptors = np.concatenate([moveDescriptors,looksStackTiled],axis=3)
        
        if obs[1] == 0: # not holding
            qCurrLook = getqLookNotHolding(lookDescriptors)
            qCurrPP = np.r_[getqPPNotHolding([lookStack]),[[0]]]
        else: # holding
            qCurrLook = getqLookHolding(lookDescriptors)
            qCurrPP = np.r_[[[0]],getqPPHolding([lookStack])]
        qCurr = np.concatenate([qCurrLook,qCurrPP],axis=0)
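        # qCurr now holds the num_patches look-action values followed by the two
        # discrete pick/place values; the pick/place entry that does not apply in
        # the current holding state was zero-padded above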

        # select action at random
        qCurrNoise = qCurr + np.random.random(np.shape(qCurr))*0.01 # add small amount of noise to break ties randomly
        if actionSelectionStrategy == "UNIFORM_RANDOM":
            action = np.argmax(qCurrNoise)
            if np.random.rand() < exploration.value(t):
                actionClass = np.random.randint(3)
                if actionClass == 0:
                    action = np.random.randint(num_patches)
                else:
                    action = np.random.randint(num_patches,num_patches+2)
#                action = np.random.randint(num_actions)
        elif actionSelectionStrategy == "RANDOM_UNIQUE":
            _,idx,inv = np.unique(lookDescriptors,axis=0,return_index=True,return_inverse=True)
            idx = np.r_[idx,num_patches,num_patches+1]
            actionIdx = np.argmax(qCurrNoise[idx])
            if np.random.rand() < exploration.value(t):
                actionIdx = np.random.randint(len(idx))
            if actionIdx < len(idx)-2:
                actionsSelected = np.nonzero(inv==actionIdx)[0]
                action = actionsSelected[np.random.randint(len(actionsSelected))]
            else:
                action = idx[actionIdx]
        else:
            print("Error...")


        # take action
        new_obs, rew, done, _ = env.step(action)
        
        # If look action, then update look stack
        if action < num_patches:
            lookStackNext[:,:,1] = np.copy(lookStack[:,:,0])
            lookStackNext[:,:,0] = np.copy(moveDescriptors[action][:,:,0])
            lookAction = moveDescriptors[action]
            discreteAction = 0
        else:
            lookAction = np.zeros(patchShape)
            discreteAction = action - num_patches
        
        print("action: " + str(action))
        env.render()
        print("Reward: " + str(rew) + ", done: " + str(done))
        displayLookStack(lookStackNext)
        
        # discrete state, look state, discrete action, look action, reward, discrete next state, look next state, done
        replay_buffer.add(obs[1], lookStack, discreteAction, lookAction, rew, new_obs[1], lookStackNext, new_obs[0], float(done))
        
        lookStack = np.copy(lookStackNext)
        
        if t > learning_starts and t % train_freq == 0:

            # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
            if prioritized_replay:
                beta=beta_schedule.value(t)
                states_t, actionPatches, rewards, images_tp1, states_tp1, dones, weights, batch_idxes = replay_buffer.sample(batch_size, beta)
            else:
                statesHolding_t, statesLookStack_t, actionsDiscrete, lookActions, rewards, statesHolding_tp1, statesLookStack_tp1, observations_tp1, dones = replay_buffer.sample(batch_size)
                weights, batch_idxes = np.ones_like(rewards), None

            moveDescriptorsNext = getMoveActionDescriptors(observations_tp1)
            moveDescriptorsNext = moveDescriptorsNext*2-1
            moveDescriptorsNext = np.reshape(moveDescriptorsNext,[-1,patchShape[0],patchShape[1],patchShape[2]])
            looksStackNextTiled = np.repeat(statesLookStack_tp1,num_patches,axis=0)
            lookDescriptorsNext = np.concatenate([moveDescriptorsNext,looksStackNextTiled],axis=3)

            # calculate qNext
            qNextLookNotHolding = np.max(np.reshape(getqLookNotHolding(lookDescriptorsNext),[batch_size,num_patches,1]),axis=1)
            qNextLookHolding = np.max(np.reshape(getqLookHolding(lookDescriptorsNext),[batch_size,num_patches,1]),axis=1)
            qNextPPNotHolding = getqPPNotHolding(statesLookStack_tp1)
            qNextPPHolding = getqPPHolding(statesLookStack_tp1)
            qNextNotHolding = np.max(np.c_[qNextLookNotHolding,qNextPPNotHolding],axis=1)
            qNextHolding = np.max(np.c_[qNextLookHolding,qNextPPHolding],axis=1)
            qNext = np.stack([qNextNotHolding,qNextHolding],axis=1)
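            # qNext[:,0] is the best next-state value when not holding, qNext[:,1] when
            # holding; the target below bootstraps from the column matching the actual
            # next holding state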

            targets = rewards + (1-dones) * gamma * qNext[range(batch_size),statesHolding_tp1]
            
            # Calculate qCurrTarget
            lookDescriptors = np.concatenate([lookActions,statesLookStack_t],axis=3)
            qCurrLookNotHoldingT = getqLookNotHolding(lookDescriptors)
            qCurrLookHoldingT = getqLookHolding(lookDescriptors)
            qCurrPPNotHoldingT = getqPPNotHolding(statesLookStack_t)
            qCurrPPHoldingT = getqPPHolding(statesLookStack_t)
            qCurrT = np.c_[qCurrLookNotHoldingT,qCurrPPNotHoldingT,qCurrLookHoldingT,qCurrPPHoldingT]
            
            td_error = qCurrT[range(batch_size),np.int32(actionsDiscrete > 0) + (2*statesHolding_t)] - targets
            qCurrT[range(batch_size),np.int32(actionsDiscrete > 0) + (2*statesHolding_t)] = targets

            targetTrainLookNotHolding(lookDescriptors,  np.reshape(qCurrT[:,0],[batch_size,1]), np.reshape(weights,[batch_size,1]))
            targetTrainPPNotHolding(statesLookStack_t, np.reshape(qCurrT[:,1],[batch_size,1]), np.reshape(weights,[batch_size,1]))
            targetTrainLookHolding(lookDescriptors, np.reshape(qCurrT[:,2],[batch_size,1]), np.reshape(weights,[batch_size,1]))
            targetTrainPPHolding(statesLookStack_t, np.reshape(qCurrT[:,3],[batch_size,1]), np.reshape(weights,[batch_size,1]))

            if prioritized_replay:
                new_priorities = np.abs(td_error) + prioritized_replay_eps
                replay_buffer.update_priorities(batch_idxes, new_priorities)

            td_errors[-1] += td_error


        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)
            td_errors.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-51:-1]), 1)
        mean_100ep_tderror = round(np.mean(td_errors[-51:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
            timerFinal = time.time()
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", time elapsed: " + str(timerFinal - timerStart) + ", tderror: " + str(mean_100ep_tderror))
            timerStart = timerFinal
        
        obs = np.copy(new_obs)


    # display value function
    obs = env.reset()
    moveDescriptors = getMoveActionDescriptors([obs[0]])
    moveDescriptors = moveDescriptors*2-1

    actionsPickDescriptors = np.stack([moveDescriptors, np.zeros(np.shape(moveDescriptors))],axis=3)
    actionsPlaceDescriptors = np.stack([np.zeros(np.shape(moveDescriptors)), moveDescriptors],axis=3)
    
    print(str(obs[0][:,:,0]))
    
    if valueFunctionType == "TABULAR":
        qPick = getTabular(np.reshape(actionsPickDescriptors,[num_patches,-1])==1)
    else:
        # evaluate the pick/place heads for a look stack containing just this patch
        qPickNotHolding = getqPPNotHolding(actionsPickDescriptors)
        qPickHolding = getqPPHolding(actionsPickDescriptors)
        qPick = np.concatenate([qPickNotHolding,qPickHolding],axis=1)
    print("Value function for pick action in hold-nothing state:")
    print(str(np.reshape(qPick[:,0],[8,8])))
    print("Value function for pick action in hold-1 state:")
    print(str(np.reshape(qPick[:,1],[8,8])))

    if valueFunctionType == "TABULAR":
        qPlace = getTabular(np.reshape(actionsPlaceDescriptors,[num_patches,-1])==1)
    else:
        qPlaceNotHolding = getqPPNotHolding(actionsPlaceDescriptors)
        qPlaceHolding = getqPPHolding(actionsPlaceDescriptors)
        qPlace = np.concatenate([qPlaceNotHolding,qPlaceHolding],axis=1)
    print("Value function for place action in hold-nothing state:")
    print(str(np.reshape(qPlace[:,0],[8,8])))
    print("Value function for place action in hold-1 state:")
    print(str(np.reshape(qPlace[:,1],[8,8])))
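
The training step in this example maintains four Q-heads (look/pick-place × not-holding/holding) and writes a one-step TD target into whichever head produced the acted-upon value. A minimal numpy sketch of that indexing, with toy shapes and illustrative names rather than the TensorFlow graph built above:

import numpy as np

gamma, batch_size = 0.90, 4
rng = np.random.default_rng(0)

# stand-in per-head Q-values for the batch; columns are
# [look/not-holding, pick-place/not-holding, look/holding, pick-place/holding]
qCurrT = rng.random((batch_size, 4))
qNext = rng.random((batch_size, 2))                  # best next-state value per holding state
rewards = rng.random(batch_size)
dones = np.zeros(batch_size)
statesHolding_t = rng.integers(0, 2, batch_size)     # holding flag at time t
statesHolding_tp1 = rng.integers(0, 2, batch_size)   # holding flag at time t+1
secondDiscrete = rng.integers(0, 2, batch_size)      # stand-in for (actionsDiscrete > 0)

# one-step TD target, bootstrapping from the value of the actual next holding state
targets = rewards + (1 - dones) * gamma * qNext[range(batch_size), statesHolding_tp1]

# overwrite only the column that produced the acted-upon value; the difference is the TD error
cols = secondDiscrete + 2 * statesHolding_t
td_error = qCurrT[range(batch_size), cols] - targets
qCurrT[range(batch_size), cols] = targets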
Exemplo n.º 30
0
def main():
	
	parser = argparse.ArgumentParser()
	parser.add_argument("--policy_name", default="TD3")							# Policy name
	parser.add_argument("--env_name", default="Pendulum-v0")					# OpenAI gym environment name
	parser.add_argument("--replay_buffer", default="prioritized")				# Replay Buffer type
	parser.add_argument("--replay_buffer_size", default=5e4, type=int)			# Replay Buffer capacity
	parser.add_argument("--replay_buffer_alpha", default=0.6, type=float)		# Replay Buffer prioritization weight
	parser.add_argument("--seed", default=0, type=int)							# Sets Gym, PyTorch and Numpy seeds
	parser.add_argument("--start_timesteps", default=1e4, type=int)				# How many time steps purely random policy is run for
	parser.add_argument("--eval_freq", default=1e3, type=float)					# How often (time steps) we evaluate
	parser.add_argument("--max_timesteps", default=5e4, type=float)				# Max time steps to run environment for
	parser.add_argument("--save_models", default="True", type=bool)				# Whether or not models are saved
	parser.add_argument("--expl_noise", default=0.1, type=float)				# Std of Gaussian exploration noise
	parser.add_argument("--batch_size", default=100, type=int)					# Batch size for both actor and critic
	parser.add_argument("--discount", default=0.99, type=float)					# Discount factor
	parser.add_argument("--tau", default=0.005, type=float)						# Target network update rate
	parser.add_argument("--policy_noise", default=0.2, type=float)				# Noise added to target policy during critic update
	parser.add_argument("--noise_clip", default=0.5, type=float)				# Range to clip target policy noise
	parser.add_argument("--policy_freq", default=2, type=int)					# Frequency of delayed policy updates
	parser.add_argument("--lr_actor", default=0.001, type=float)				# Learning rate of actor
	parser.add_argument("--lr_critic", default=0.001, type=float)				# Learning rate of critic
	parser.add_argument("--prioritized_replay_eps", default=1e-3, type=float)	# Replay Buffer epsilon (PRE)
	parser.add_argument("--prioritized_replay_beta0", default=0.4, type=float)	# Replay Buffer initial beta (PRE)
	args = parser.parse_args()

	# Training kwargs
	kwargs = {  "policy_name": args.policy_name,
				"env_name": args.env_name,
				"replay_buffer": args.replay_buffer,
				"replay_buffer_size": args.replay_buffer_size,
				"replay_buffer_alpha": args.replay_buffer_alpha,
				"seed": args.seed,
				"start_timesteps": args.start_timesteps,
				"eval_freq": args.eval_freq,
				"max_timesteps": args.max_timesteps,
				"save_models": args.save_models,
				"expl_noise": args.expl_noise,
				"batch_size": args.batch_size,
				"discount": args.discount,
				"tau": args.tau,
				"policy_noise": args.policy_noise,
				"noise_clip": args.noise_clip,
				"policy_freq": args.policy_freq,
				"lr_actor": args.lr_actor,
				"prioritized_replay_eps": args.prioritized_replay_eps,
				"prioritized_replay_beta0": args.prioritized_replay_beta0
         }

	# cls
	os.system('cls' if os.name == 'nt' else 'clear')

	if not os.path.exists("./results"):
    		os.makedirs("./results")
	if args.save_models and not os.path.exists("./pytorch_models"):
		os.makedirs("./pytorch_models")

	# Time stamp for repeated test names
	ts = time.time()
	ts = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d_%H-%M-%S')

	test_name = "%s_%s_%s_%s" % (args.policy_name, args.env_name, str(args.seed), ts)
	plot_name = "%s_%s_%s_%s_plot.png" % (args.policy_name, args.env_name, str(args.seed), ts)
	kwargs_name = "%s_%s_%s_%s_kwargs.csv" % (args.policy_name, args.env_name, str(args.seed), ts)
	scores_name = "%s_%s_%s_%s_scores.csv" % (args.policy_name, args.env_name, str(args.seed), ts)

	print("---------------------------------------")
	print("Settings: %s" % (test_name))
	utils.save_kwargs(kwargs, "./results/%s" % (kwargs_name))
	print("---------------------------------------")

	# Environment and Agent instantiation

	env = gym.make(args.env_name)

	# Set seeds
	env.seed(args.seed)
	torch.manual_seed(args.seed)
	np.random.seed(args.seed)
	
	state_dim = env.observation_space.shape[0]
	action_dim = env.action_space.shape[0] 
	max_action = float(env.action_space.high[0])

	# Instantiate Replay Buffer	
	if args.replay_buffer == "vanilla": 
		replay_buffer = rb.ReplayBuffer(size = args.replay_buffer_size)
		PER = False
	elif args.replay_buffer == "prioritized": 
		replay_buffer = rb.PrioritizedReplayBuffer(size = int(np.round(np.sqrt(args.replay_buffer_size))), 
												   alpha = args.replay_buffer_alpha)
		PER = True
		prioritized_replay_beta_iters = args.max_timesteps
		prioritized_replay_beta0 = args.prioritized_replay_beta0
		beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p = prioritized_replay_beta0,
                                       final_p = 1.0)

	# Instantiate policy
	if args.policy_name == "TD3": policy = TD3.TD3(state_dim, action_dim, max_action, args.lr_actor, args.lr_critic, PER, args.prioritized_replay_eps)
	elif args.policy_name == "DDPG": policy = DDPG.DDPG(state_dim, action_dim, max_action, args.lr_actor, args.lr_critic, PER, args.prioritized_replay_eps)

	# Evaluate untrained policy
	evaluations = [evaluate_policy(env, policy)] 

	# Training loop #######################################

	total_timesteps = 0
	timesteps_since_eval = 0
	episode_num = 0
	episode_rewards = []
	done = True 

	while total_timesteps < args.max_timesteps:
		
		if done: 

			if total_timesteps != 0: 
				print('Total T: {} Episode Num: {} Episode T: {} Reward: {}'.format(total_timesteps, episode_num, episode_timesteps, episode_reward))
				episode_rewards.append(episode_reward)
				
				# PER Beta scheduled update 
				if PER: beta = beta_schedule.value(total_timesteps)
				else: beta = 0.
				# Policy update step
				if args.policy_name == "TD3":
					policy.train(replay_buffer, episode_timesteps, args.batch_size, args.discount, args.tau, args.policy_noise, args.noise_clip, args.policy_freq, beta)
				else: 
					policy.train(replay_buffer, episode_timesteps, args.batch_size, args.discount, args.tau, beta)
			
			# Evaluate episode
			if timesteps_since_eval >= args.eval_freq:
				timesteps_since_eval %= args.eval_freq
				evaluations.append(evaluate_policy(env, policy))
				
				# save evaluation
				#if args.save_models: policy.save(test_name, directory="./pytorch_models")
				#np.save("./results/%s" % (test_name), evaluations) 
			
			# Reset environment
			obs = env.reset()
			done = False
			episode_reward = 0
			episode_timesteps = 0
			episode_num += 1 
		
		# Select action randomly or according to policy
		if total_timesteps < args.start_timesteps:
			action = env.action_space.sample()
		else:
			action = policy.select_action(np.array(obs))
			if args.expl_noise != 0: 
				action = (action + np.random.normal(0, args.expl_noise, size=env.action_space.shape[0])).clip(env.action_space.low, env.action_space.high)

		# Perform action
		new_obs, reward, done, _ = env.step(action) 
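		# Treat time-limit terminations as non-terminal so the critic keeps bootstrapping past the horizon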
		done_bool = 0 if episode_timesteps + 1 == env._max_episode_steps else float(done)
		episode_reward += reward

		# Push experience into replay buffer
		experience = (obs, action, reward, new_obs, done_bool)
		replay_buffer.add(experience)

		obs = new_obs

		episode_timesteps += 1
		total_timesteps += 1
		timesteps_since_eval += 1
		
	# Final evaluation 
	evaluations.append(evaluate_policy(env, policy))
	
	# Save results
	if args.save_models: policy.save("%s" % (test_name), directory="./pytorch_models")
	#np.save("./results/%s" % (evaluations_file), evaluations)  
	#np.save("./results/%s" % ('rewards.txt'), episode_rewards) 
	utils.save_scores(episode_rewards, "./results/%s" % (scores_name))
	utils.plot(episode_rewards, "./results/%s" % (plot_name), 1)
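
When --replay_buffer prioritized is selected, beta_schedule.value(total_timesteps) anneals the importance-sampling exponent β from beta0 toward 1.0. A schematic numpy sketch of the prioritized-replay weighting this relies on (Schaul et al., 2016); the function name and signature below are illustrative, not this repo's rb.PrioritizedReplayBuffer API:

import numpy as np

def importance_weights(priorities, sampled_idx, alpha, beta):
    """Illustrative helper, not the repo's buffer API.

    priorities  : per-transition priorities, typically |td_error| + eps
    sampled_idx : indices drawn with probability P(i) proportional to priorities[i]**alpha
    """
    probs = priorities ** alpha
    probs = probs / probs.sum()
    n = len(priorities)
    weights = (n * probs[sampled_idx]) ** (-beta)
    return weights / weights.max()   # normalise so the largest weight is 1

# beta is annealed from beta0 (0.4 here) toward 1.0 over training, e.g.:
#   beta_schedule = LinearSchedule(max_timesteps, initial_p=0.4, final_p=1.0)
#   beta = beta_schedule.value(total_timesteps)
# and after each update the sampled priorities are refreshed from the TD errors:
#   new_priorities = np.abs(td_error) + prioritized_replay_eps
#   replay_buffer.update_priorities(batch_idxes, new_priorities)

priorities = np.array([0.5, 0.1, 2.0, 0.8])
w = importance_weights(priorities, sampled_idx=np.array([2, 0]), alpha=0.6, beta=0.4)
print(w)   # the higher-priority transition gets the smaller weight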