Example #1
def train(config):
    """

    """
    memory = ReplayBuffer((8, ), (1, ), config["expert_buffer_size"],
                          config["device"])
    memory.load_memory(config["buffer_path"])
    agent = Agent(8, 1, 4, config)

    for i_episode in range(config['episodes']):
        text = "Inverse Episode {}  \ {} \r".format(i_episode,
                                                    config["episodes"])
        print(text, end='')
        agent.learn(memory)
        break
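
A hypothetical minimal config for this entry point, inferred from the keys the snippet reads (names and values below are placeholders, not taken from the original project):

config = {
    "expert_buffer_size": 100000,    # capacity of the expert replay buffer
    "buffer_path": "expert_buffer",  # path passed to ReplayBuffer.load_memory
    "device": "cpu",                 # device string forwarded to ReplayBuffer
    "episodes": 1,                   # number of inverse-learning episodes
    # ...plus whatever keys Agent(8, 1, 4, config) reads internally
}
# train(config)  # entry point call once the Agent's own config keys are filled in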
Example #2
 def __init__(self, state_size, action_size, config):
     self.action_size = action_size
     self.state_size = state_size
     self.Q = np.zeros([state_size, action_size])
     self.Q_inverse = np.zeros([state_size, action_size])
     self.debug_Q = np.zeros([state_size, action_size])
     self.Q_shift = np.zeros([state_size, action_size])
     self.r = np.zeros([state_size, action_size])  
     self.counter = np.zeros([state_size, action_size])
     self.gamma = config["gamma"]
     self.epsilon = 1
     self.lr = config["lr"]
     self.lr_iql_q = config["lr_iql_q"]
     self.lr_iql_r = config["lr_iql_r"]
     self.min_epsilon = config["min_epsilon"]
     self.max_epsilon = 1
     self.episode = 15000
     self.decay = config["decay"]
     self.total_reward = 0
     self.eval_frq = 50
     self.render_env = False
     self.env = gym.make(config["env_name"])
     self.memory = ReplayBuffer((1,),(1,),config["buffer_size"], config["device"])
     self.gamma_iql = 0.99
     self.lr_sh = config["lr_q_sh"]
     self.ratio = 1. / action_size
     self.eval_q_inverse = 50000
     self.episodes_qinverse = int(5e6)
     self.update_freq = config['freq_q']
     self.steps = 0
     pathname = "lr_inv_q {} lr_inv_r {} freq {}".format(self.lr_iql_q, self.lr_iql_r, self.update_freq)
     tensorboard_name = str(config["locexp"]) + '/runs/' + pathname 
     self.writer = SummaryWriter(tensorboard_name)
     tensorboard_name = str(config["locexp"]) + '/runs/' + "inverse" 
     self.writer_inverse = SummaryWriter(tensorboard_name)
     tensorboard_name = str(config["locexp"]) + '/runs/' + "expert" 
     self.writer_expert = SummaryWriter(tensorboard_name)
     self.last_100_reward_errors = deque(maxlen=100) 
     self.average_same_action = deque(maxlen=100) 
     self.expert_buffer_size = config["expert_buffer_size"]
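
The epsilon fields set up above (epsilon, min_epsilon, max_epsilon, decay) drive the exponential exploration schedule used later in train() (see Example #6): epsilon = min_epsilon + (max_epsilon - min_epsilon) * exp(-decay * i_episode). A standalone sketch with assumed config values:

import numpy as np

min_epsilon, max_epsilon, decay = 0.01, 1.0, 0.001   # assumed values, not from the original config
for i_episode in (1, 1000, 5000, 15000):
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay * i_episode)
    print(i_episode, round(epsilon, 3))   # decays from ~1.0 toward min_epsilon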
Example #3
def main():
    
    np.set_printoptions(formatter={'float_kind':lambda x: "%.2f" % x})
    
    # Define environment
    env = envstandalone.BlockArrange()

    # Dictionary-based value function
    q_func_tabular = {}

    # cols of vectorKey must be boolean less than 64 bits long
    def getTabularKeys(vectorKey):
        obsBits = np.packbits(vectorKey,1)
        obsKeys = 0
        for i in range(np.shape(obsBits)[1]):
            # IMPORTANT: the number of bits in the type cast below (UINT64) must be at least as big
            # as the bits required to encode obsBits. If it is too small, we get hash collisions...
            obsKeys = obsKeys + (256**i) * np.uint64(obsBits[:,i])
        return obsKeys
    
    def getTabular(vectorKey):
        keys = getTabularKeys(vectorKey)
#        return np.array([q_func[x] if x in q_func else 0*np.ones(num_states) for x in keys])
        return np.array([q_func_tabular[x] if x in q_func_tabular else 10*np.ones(num_states) for x in keys])
    
    def trainTabular(vectorKey,qCurrTargets,weights):
        keys = getTabularKeys(vectorKey)
        alpha=0.2
        for i in range(len(keys)):
            if keys[i] in q_func_tabular:
#                q_func[keys[i]] = (1-alpha)*q_func[keys[i]] + alpha*qCurrTargets[i]
                q_func_tabular[keys[i]] = q_func_tabular[keys[i]] + alpha*weights[i,:]*(qCurrTargets[i] - q_func_tabular[keys[i]]) # (1-alpha)*q_func[keys[i]] + alpha*qCurrTargets[i]
            else:
                q_func_tabular[keys[i]] = qCurrTargets[i]
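
    # Note on getTabularKeys above (hypothetical shapes): a boolean row of, say,
    # 18 features packs into ceil(18/8) = 3 bytes via np.packbits(vectorKey, 1);
    # the loop there then combines the byte columns into a single uint64 key,
    #   key = bytes[:, 0] + 256 * bytes[:, 1] + 256**2 * bytes[:, 2]
    # so at most 8 byte columns (64 bits) fit before the uint64 overflows,
    # which is what the hash-collision warning above refers to.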


    # Standard DQN parameters
#    max_timesteps=20000
    max_timesteps=30000
#    max_timesteps=2000
    learning_starts=1000
#    learning_starts=10
#    buffer_size=50000
    buffer_size=10000
#    buffer_size=1000
#    buffer_size=320
#    buffer_size=32
#    buffer_size=8
#    buffer_size=1
#    exploration_fraction=0.2
    exploration_fraction=0.3
#    exploration_final_eps=0.02
    exploration_final_eps=0.1
    print_freq=1
#    gamma=.98
    gamma=.9
    target_network_update_freq=1
    batch_size=32
#    batch_size=1
    train_freq=1
#    train_freq=2
    num_cpu = 16
#    lr=0.001
    lr=0.0003
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    prioritized_replay=True
#    prioritized_replay=False
#    prioritized_replay_alpha=1.0
    prioritized_replay_alpha=0.6
    prioritized_replay_beta0=0.4
    prioritized_replay_beta_iters=None
#    prioritized_replay_beta_iters=20000
    prioritized_replay_eps=1e-6
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None
    beta = 1
    
    # Deictic state/action parameters
    deicticShape = (3,3,2) # IMPORTANT: first two elts of deicticShape must be odd
    deicticActionShape = (3,3,2)
    num_cascade = 5
#    num_states = env.num_blocks + 1 # one more state than blocks to account for not holding anything
    num_states = 2 # either holding or not
    num_patches = env.maxSide**2
    num_actions = 2*num_patches
    num_actions_discrete = 2
#    valueFunctionType = "TABULAR"
    valueFunctionType = "DQN"
#    actionSelectionStrategy = "UNIFORM_RANDOM" # actions are selected randomly from collection of all actions
    actionSelectionStrategy = "RANDOM_UNIQUE" # each unique action descriptor has equal chance of being selected
    
    # ******* Build tensorflow functions ********

    q_func = models.cnn_to_mlp(
#    q_func = models.cnn_to_mlp_2pathways(
        convs=[(16,3,1), (32,3,1)],
        hiddens=[48],
#        convs=[(32,3,1)],
#        hiddens=[32],
#        convs=[(48,3,1)],
#        hiddens=[48],
        dueling=True
    )
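    # In this baselines-style cnn_to_mlp helper, each convs tuple is assumed to be
    # (num_filters, kernel_size, stride), hiddens lists the fully connected layer
    # widths, and dueling=True adds a dueling value/advantage head.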

    def make_obs_ph(name):
        return U.BatchInput(env.observation_space.spaces[0].shape, name=name)

    def make_actionDeic_ph(name):
        return U.BatchInput(deicticActionShape, name=name)

    def make_target_ph(name):
#        return U.BatchInput([num_actions], name=name)
#        return U.BatchInput([num_cascade,num_states], name=name)
        return U.BatchInput([num_states], name=name)

    def make_weight_ph(name):
        return U.BatchInput([num_states], name=name)

    getMoveActionDescriptors = build_getMoveActionDescriptors(make_obs_ph=make_obs_ph,deicticShape=deicticShape)

    if valueFunctionType == 'DQN':
        getq = build_getq(
                make_actionDeic_ph=make_actionDeic_ph,
                q_func=q_func,
                num_states=num_states,
                num_cascade=num_cascade,
                scope="deepq",
                qscope="q_func"
                )
    
        targetTrain = build_targetTrain(
            make_actionDeic_ph=make_actionDeic_ph,
            make_target_ph=make_target_ph,
            make_weight_ph=make_weight_ph,
            q_func=q_func,
            num_states=num_states,
            num_cascade=num_cascade,
            optimizer=tf.train.AdamOptimizer(learning_rate=lr),
    #        optimizer=tf.train.GradientDescentOptimizer(learning_rate=lr),
            scope="deepq", 
            qscope="q_func",
            grad_norm_clipping=1.
    #        grad_norm_clipping=0.1
        )

    # Start tensorflow session
    sess = U.make_session(num_cpu)
    sess.__enter__()

    episode_rewards = [0.0]
    timerStart = time.time()
    U.initialize()
    obs = env.reset()
    for t in range(max_timesteps):

        # Get state: 1 if holding a block, 0 otherwise
        stateDeictic = np.int32(obs[1]>0) # holding

        # Get action set: <num_patches> pick actions followed by <num_patches> place actions
        moveDescriptorsRaw = getMoveActionDescriptors([obs[0]])
        moveDescriptors = np.int32(moveDescriptorsRaw>0)
        moveDescriptors = moveDescriptors*2-1

        actionsPickDescriptors = np.stack([moveDescriptors, np.zeros(np.shape(moveDescriptors))],axis=3)
        actionsPlaceDescriptors = np.stack([np.zeros(np.shape(moveDescriptors)),moveDescriptors],axis=3)
        actionDescriptors = np.r_[actionsPickDescriptors,actionsPlaceDescriptors]

        if valueFunctionType == "TABULAR":
            actionDescriptorsFlat = np.reshape(actionDescriptors,[-1,deicticActionShape[0]*deicticActionShape[1]*deicticActionShape[2]]) == 1
            qCurr = getTabular(actionDescriptorsFlat)
        else:
            qCurr = getq(actionDescriptors)
        qCurrNoise = qCurr + np.random.random(np.shape(qCurr))*0.01 # add small amount of noise to break ties randomly

        # select action at random
        if actionSelectionStrategy == "UNIFORM_RANDOM":
            action = np.argmax(qCurrNoise[:,stateDeictic])
            if np.random.rand() < exploration.value(t):
                action = np.random.randint(num_actions)
        elif actionSelectionStrategy == "RANDOM_UNIQUE":
            _,idx,inv = np.unique(actionDescriptors,axis=0,return_index=True,return_inverse=True)
            actionIdx = np.argmax(qCurrNoise[idx,stateDeictic])
            if np.random.rand() < exploration.value(t):
                actionIdx = np.random.randint(len(idx))
            actionsSelected = np.nonzero(inv==actionIdx)[0]
            action = actionsSelected[np.random.randint(len(actionsSelected))]
        else:
            print("Error...")

        # display state at the end
        if t > max_timesteps-200:
            print(str(obs[0][:,:,0]))
            print(str(obs[1]))
            print("action: " + str(action))

        # take action
        new_obs, rew, done, _ = env.step(action)
        
        # display state at the end
        if (t > max_timesteps-200) and done:
            print("done *********************** done")
            
        replay_buffer.add(stateDeictic, actionDescriptors[action,:], rew, new_obs, float(done))

        if t > learning_starts and t % train_freq == 0:

            # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
            if prioritized_replay:
                beta=beta_schedule.value(t)
                states_t, actions, rewards, images_tp1, states_tp1, dones, weights, batch_idxes = replay_buffer.sample(batch_size, beta)
            else:
                states_t, actions, rewards, images_tp1, states_tp1, dones = replay_buffer.sample(batch_size)
                weights, batch_idxes = np.ones_like(rewards), None

            states_tp1 = np.int32(states_tp1>0)
            
            moveDescriptorsNext1 = getMoveActionDescriptors(images_tp1)
            moveDescriptorsNext1 = np.int32(moveDescriptorsNext1>0)
            moveDescriptorsNext1 = moveDescriptorsNext1*2-1

            actionsPickDescriptorsNext1 = np.stack([moveDescriptorsNext1, np.zeros(np.shape(moveDescriptorsNext1))],axis=3)
            actionsPlaceDescriptorsNext1 = np.stack([np.zeros(np.shape(moveDescriptorsNext1)), moveDescriptorsNext1],axis=3)
            actionDescriptorsNext1 = np.stack([actionsPickDescriptorsNext1, actionsPlaceDescriptorsNext1], axis=0)
            actionDescriptorsNext1 = np.reshape(actionDescriptorsNext1,[batch_size*num_patches*num_actions_discrete,deicticActionShape[0],deicticActionShape[1],deicticActionShape[2]])
            
            if valueFunctionType == "TABULAR":
                actionDescriptorsNextFlat1 = np.reshape(actionDescriptorsNext1,[batch_size*num_patches*num_actions_discrete,-1]) == 1
                qNextFlat1 = getTabular(actionDescriptorsNextFlat1)
            else:
                qNextFlat1 = getq(actionDescriptorsNext1)
            
            qNext1 = np.reshape(qNextFlat1,[batch_size,num_patches,num_actions_discrete,num_states])
            qNextmax1 = np.max(np.max(qNext1[range(batch_size),:,:,states_tp1],2),1)
            targets1 = rewards + (1-dones) * gamma * qNextmax1

            if valueFunctionType == "TABULAR":
                actionsFlat = np.reshape(actions,[batch_size,-1]) == 1
                qCurrTarget1 = getTabular(actionsFlat)
            else:
                qCurrTarget1 = getq(actions)

            td_errors = qCurrTarget1[range(batch_size),states_t] - targets1
            qCurrTarget1[range(batch_size),states_t] = targets1

            if valueFunctionType == "TABULAR":
                trainTabular(actionsFlat, qCurrTarget1, np.transpose(np.tile(weights,[num_states,1]))) # (TABULAR)
            else:
                targetTrain(actions, qCurrTarget1, np.transpose(np.tile(weights,[num_states,1]))) # (DQN)

            if prioritized_replay:
                new_priorities = np.abs(td_errors) + prioritized_replay_eps
                replay_buffer.update_priorities(batch_idxes, new_priorities)


        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
            timerFinal = time.time()
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", beta: " + str(beta) + ", time elapsed: " + str(timerFinal - timerStart))
            timerStart = timerFinal
        
        obs = new_obs
        
    # display value function
    obs = env.reset()
    moveDescriptorsRaw = getMoveActionDescriptors([obs[0]])
    moveDescriptors = np.int32(moveDescriptorsRaw>0)
    moveDescriptors = moveDescriptors*2-1

    actionsPickDescriptors = np.stack([moveDescriptors, np.zeros(np.shape(moveDescriptors))],axis=3)
    actionsPlaceDescriptors = np.stack([np.zeros(np.shape(moveDescriptors)), moveDescriptors],axis=3)
    
    print(str(obs[0][:,:,0]))
    
    qPick = getq(actionsPickDescriptors)
#    qPick = getTabular(np.reshape(actionsPickDescriptors,[num_patches,-1])==1)
    print("Value function for pick action in hold-nothing state:")
    print(str(np.reshape(qPick[:,0],[8,8])))
    print("Value function for pick action in hold-1 state:")
    print(str(np.reshape(qPick[:,1],[8,8])))

    qPlace = getq(actionsPlaceDescriptors)
#    qPlace = getTabular(np.reshape(actionsPlaceDescriptors,[num_patches,-1])==1)
    print("Value function for place action in hold-nothing state:")
    print(str(np.reshape(qPlace[:,0],[8,8])))
    print("Value function for place action in hold-1 state:")
    print(str(np.reshape(qPlace[:,1],[8,8])))
Example #4
def main():

    # Define environment
    env = envstandalone.BlockArrange()

    # Dictionary-based value function
    q_func = {}

    # cols of vectorKey must be boolean less than 64 bits long
    def getTabularKeys(vectorKey):
        obsBits = np.packbits(vectorKey,1)
        obsKeys = 0
        for i in range(np.shape(obsBits)[1]):
            # IMPORTANT: the number of bits in the type cast below (UINT64) must be at least as big
            # as the bits required to encode obsBits. If it is too small, we get hash collisions...
            obsKeys = obsKeys + (256**i) * np.uint64(obsBits[:,i])
        return obsKeys
    
    def getTabular(vectorKey):
        keys = getTabularKeys(vectorKey)
        return np.array([q_func[x] if x in q_func else 0*np.ones(num_states) for x in keys])
    
    def trainTabular(vectorKey,qCurrTargets):
        keys = getTabularKeys(vectorKey)
        alpha=1.0
        for i in range(len(keys)):
            if keys[i] in q_func:
                q_func[keys[i]] = (1-alpha)*q_func[keys[i]] + alpha*qCurrTargets[i]
            else:
                q_func[keys[i]] = qCurrTargets[i]


    # Standard DQN parameters
    max_timesteps=40000
    learning_starts=1000
#    learning_starts=10
#    buffer_size=50000
#    buffer_size=10000
    buffer_size=1000
#    buffer_size=100
#    buffer_size=2
    exploration_fraction=0.2
    exploration_final_eps=0.02
    print_freq=1
    gamma=.98
    target_network_update_freq=1
    batch_size=32
#    batch_size=8
    train_freq=1
    num_cpu = 16
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)
    replay_buffer = ReplayBuffer(buffer_size)

    # Deictic state/action parameters
    deicticShape = (3,3,2) # IMPORTANT: first two elts of deicticShape must be odd
    num_cascade = 5
    num_states = env.num_blocks + 1 # one more state than blocks to account for not holding anything
    num_patches = env.maxSide**2
    num_actions = 2*num_patches
    
    def make_obs_ph(name):
        return U.BatchInput(env.observation_space.spaces[0].shape, name=name)
    
    getMoveActionDescriptors = build_getMoveActionDescriptors(make_obs_ph=make_obs_ph,deicticShape=deicticShape)

    # Start tensorflow session
    sess = U.make_session(num_cpu)
    sess.__enter__()

    episode_rewards = [0.0]
    timerStart = time.time()
    obs = env.reset()
    for t in range(max_timesteps):
        
        # Get state: in range(0,env.num_blocks)
        stateDeictic = obs[1] # obj in hand

        # Get action set: <num_patches> pick actions followed by <num_patches> place actions
        moveDescriptors = np.reshape(getMoveActionDescriptors([obs[0]]),[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]])
        actionDescriptors = np.r_[np.c_[np.zeros([num_patches,1])==1,moveDescriptors],np.c_[np.ones([num_patches,1])==1,moveDescriptors]]

        # Get q-values
        qCurr = getTabular(actionDescriptors)
        
        # select action
        qCurrNoise = qCurr + np.random.random(np.shape(qCurr))*0.01 # add small amount of noise to break ties randomly
        action = np.argmax(qCurrNoise[:,stateDeictic])
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(num_actions)

        # take action
        new_obs, rew, done, _ = env.step(action)
        replay_buffer.add(stateDeictic, actionDescriptors[action,:], rew, new_obs, float(done))

        # sample from replay buffer and train
        if t > learning_starts and t % train_freq == 0:

            states_t, actions, rewards, images_tp1, states_tp1, dones = replay_buffer.sample(batch_size)
            
            moveDescriptorsNext1Tiled = np.reshape(getMoveActionDescriptors(images_tp1),[batch_size,num_patches,deicticShape[0]*deicticShape[1]*deicticShape[2]])
            actionDescriptorsNext1Tiled = np.stack(
                    [np.c_[np.zeros([batch_size,num_patches,1])==1,moveDescriptorsNext1Tiled],
                    np.c_[np.ones([batch_size,num_patches,1])==1,moveDescriptorsNext1Tiled]]
                    ,axis=1)
            actionDescriptorsNext = np.reshape(actionDescriptorsNext1Tiled,[batch_size*2*num_patches,-1])
            
            qNext1 = getTabular(actionDescriptorsNext)
            
            states_tp1Full = np.repeat(states_tp1,2*num_patches)
            
            
            qNextTiled = np.reshape(qNext1[range(2*batch_size*num_patches),states_tp1Full],[batch_size,2,num_patches,-1])
            qNextmax = np.max(np.max(np.max(qNextTiled,3),2),1)
            
            targets = rewards + (1-dones) * gamma * qNextmax

            qCurrTarget = getTabular(actions)
            qCurrTarget[range(batch_size),states_tp1] = np.minimum(qCurrTarget[range(batch_size),states_tp1], targets)
            trainTabular(actions,qCurrTarget)


            # ********************************************
            
#            # Sample from replay buffer
#            states_t, actions, rewards, images_tp1, states_tp1, dones = replay_buffer.sample(batch_size)
#
#            # Get action set: <num_patches> pick actions followed by <num_patches> place actions
#            moveDescriptorsNext = np.reshape(getMoveActionDescriptors(images_tp1),[batch_size,num_patches,deicticShape[0]*deicticShape[1]*deicticShape[2]])
#            actionDescriptorsNext = np.stack([np.c_[np.zeros([batch_size,num_patches,1])==1,moveDescriptorsNext], 
#                            np.c_[np.ones([batch_size,num_patches,1])==1,moveDescriptorsNext]],
#                            axis=1)
#            actionDescriptorsNext = np.reshape(actionDescriptorsNext,[batch_size*2*num_patches,-1])
#            
#            # Get targets
#            qNext = getTabular(actionDescriptorsNext)
#            np.repeat(states_tp1,2*num_patches)
#            qNextAtState = qNext[range(batch_size*2*num_patches),np.repeat(states_tp1,2*num_patches)]
#            qNextTiled = np.reshape(qNextAtState,[batch_size,2*num_patches])
#            qNextmax = np.max(qNextTiled,1)
#            targets = rewards + (1-dones) * gamma * qNextmax
#            
#            qCurrTarget = getTabular(actions)
#            qCurrTarget[range(batch_size),states_t] = targets
#            trainTabular(actions,qCurrTarget)
            
            # ********************************************

#        # Get state: in range(0,env.num_blocks)
#        stateDeicticNext = new_obs[1] # holding
#
#        # Get action set: <num_patches> pick actions followed by <num_patches> place actions
#        moveDescriptorsNext = np.reshape(getMoveActionDescriptors([new_obs[0]]),[-1,deicticShape[0]*deicticShape[1]*deicticShape[2]])
#        actionDescriptorsNext = np.r_[np.c_[np.zeros([num_patches,1])==1,moveDescriptorsNext],np.c_[np.ones([num_patches,1])==1,moveDescriptorsNext]]
#
#        # Calculate TD target
#        qNext = getTabular(actionDescriptorsNext)
#        qNextmax = np.max(qNext[:,stateDeicticNext])
#        target = rew + (1-done) * gamma * qNextmax
#
#        # Update dictionary value function
#        qCurrTarget = qCurr[action,:]
#        qCurrTarget[stateDeictic] = np.minimum(qCurrTarget[stateDeictic], target)
#        trainTabular([actionDescriptors[action,:]],[qCurrTarget])



        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
            timerFinal = time.time()
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", time elapsed: " + str(timerFinal - timerStart))
            timerStart = timerFinal
        
        obs = new_obs
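
As a side note, here is a minimal standalone sketch (with made-up sizes) of the boolean pick/place descriptor layout that the np.c_/np.r_ expression above builds:

import numpy as np

num_patches = 4
moveDescriptors = np.random.rand(num_patches, 18) > 0.5           # stand-in for flattened 3x3x2 patches
pick  = np.c_[np.zeros([num_patches, 1]) == 1, moveDescriptors]   # leading flag False marks a pick action
place = np.c_[np.ones([num_patches, 1]) == 1, moveDescriptors]    # leading flag True marks a place action
actionDescriptors = np.r_[pick, place]                            # (2*num_patches, 19) boolean rows
print(actionDescriptors.shape)                                    # (8, 19)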
Example #5
def main():

    # Define environment
    env = envstandalone.BlockArrange()

    # Dictionary-based value function
    q_func_dict = {}

    # cols of vectorKey must be boolean less than 64 bits long
    def getTabularKeys(vectorKey):
        obsBits = np.packbits(vectorKey, 1)
        obsKeys = 0
        for i in range(np.shape(obsBits)[1]):
            # IMPORTANT: the number of bits in the type cast below (UINT64) must be at least as big
            # as the bits required to encode obsBits. If it is too small, we get hash collisions...
            obsKeys = obsKeys + (256**i) * np.uint64(obsBits[:, i])
        return obsKeys

    def getTabular(vectorKey):
        keys = getTabularKeys(vectorKey)
        #        return np.array([q_func[x] if x in q_func else 0*np.ones(num_states) for x in keys])
        return np.array([
            q_func_dict[x] if x in q_func_dict else 0 *
            np.ones([num_cascade, num_states]) for x in keys
        ])

    def trainTabular(vectorKey, qCurrTargets):
        keys = getTabularKeys(vectorKey)
        alpha = 0.3
        for i in range(len(keys)):
            if keys[i] in q_func_dict:
                q_func_dict[keys[i]] = (
                    1 - alpha) * q_func_dict[keys[i]] + alpha * qCurrTargets[i]
            else:
                q_func_dict[keys[i]] = qCurrTargets[i]

    # Standard DQN parameters
    max_timesteps = 40000
    #    max_timesteps=80000
    #    max_timesteps=160000
    learning_starts = 1000
    #    buffer_size=50000
    buffer_size = 10000
    #    buffer_size=1000
    #    buffer_size=100
    #    buffer_size=2
    #    exploration_fraction=0.4
    exploration_fraction = 0.2
    exploration_final_eps = 0.02
    print_freq = 1
    #    gamma=.98
    gamma = .96
    target_network_update_freq = 1
    #    batch_size=32
    batch_size = 64
    #    batch_size=128
    #    batch_size=256
    #    batch_size=8
    #    train_freq=1
    train_freq = 2
    #    train_freq=4
    #    train_freq=8
    #    train_freq=16
    num_train_iter = 1
    num_cpu = 16
    lr = 0.001
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)
    replay_buffer = ReplayBuffer(buffer_size)

    # Deictic state/action parameters
    deicticShape = (3, 3, 2)  # IMPORTANT: first two elts of deicticShape must be odd
    deicticActionShape = (3, 3, 4)  # IMPORTANT: first two elts of deicticShape must be odd
    num_cascade = 5
    num_states = env.num_blocks + 1  # one more state than blocks to account for not holding anything
    num_patches = env.maxSide**2
    num_actions = 2 * num_patches

    # ******* Build tensorflow functions ********

    q_func = models.cnn_to_mlp(
        #    q_func = models.cnn_to_mlp_2pathways(
        convs=[(32, 3, 1)],
        #        convs=[(16,3,1)],
        hiddens=[32],
        dueling=True)

    def make_obs_ph(name):
        return U.BatchInput(env.observation_space.spaces[0].shape, name=name)

    def make_actionDeic_ph(name):
        return U.BatchInput(deicticActionShape, name=name)

    def make_target_ph(name):
        #        return U.BatchInput([num_actions], name=name)
        return U.BatchInput([num_cascade, num_states], name=name)

    getMoveActionDescriptors = build_getMoveActionDescriptors(
        make_obs_ph=make_obs_ph, deicticShape=deicticActionShape)

    getq = build_getq(make_actionDeic_ph=make_actionDeic_ph,
                      q_func=q_func,
                      num_states=num_states,
                      num_cascade=num_cascade,
                      scope="deepq",
                      qscope="q_func")

    targetTrain = build_targetTrain(
        make_actionDeic_ph=make_actionDeic_ph,
        make_target_ph=make_target_ph,
        q_func=q_func,
        num_states=num_states,
        num_cascade=num_cascade,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        #        optimizer=tf.train.GradientDescentOptimizer(learning_rate=lr),
        scope="deepq",
        qscope="q_func",
        grad_norm_clipping=1.
        #        grad_norm_clipping=0.1
    )

    # Start tensorflow session
    sess = U.make_session(num_cpu)
    sess.__enter__()

    episode_rewards = [0.0]
    timerStart = time.time()
    U.initialize()
    obs = env.reset()
    for t in range(max_timesteps):

        # Get state: in range(0,env.num_blocks)
        stateDeictic = obs[1]  # obj in hand

        # Get action set: <num_patches> pick actions followed by <num_patches> place actions

        moveDescriptors = getMoveActionDescriptors([obs[0]])
        #        actionsPickDescriptors = np.concatenate([np.zeros(np.shape(moveDescriptors)),moveDescriptors],axis=3)
        #        actionsPlaceDescriptors = np.concatenate([np.ones(np.shape(moveDescriptors)),moveDescriptors],axis=3)
        actionsPickDescriptors = np.concatenate(
            [moveDescriptors,
             np.zeros(np.shape(moveDescriptors))], axis=3)
        actionsPlaceDescriptors = np.concatenate(
            [np.zeros(np.shape(moveDescriptors)), moveDescriptors], axis=3)
        actionDescriptors = np.r_[actionsPickDescriptors,
                                  actionsPlaceDescriptors]

        #        # TABULAR version
        #        actionDescriptors = np.reshape(actionDescriptors,[-1,deicticActionShape[0]*deicticActionShape[1]*deicticActionShape[2]]) == 1
        #        qCurr = getTabular(actionDescriptors)

        # DQN version
        qCurr = getq(actionDescriptors)

        # select action
        qCurrNoise = qCurr + np.random.random(np.shape(
            qCurr)) * 0.01  # add small amount of noise to break ties randomly
        action = np.argmax(qCurrNoise[:, -1, stateDeictic])  # USE CASCADE
        #        action = np.argmax(qCurrNoise[:,0,stateDeictic]) # NO CASCADE
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(num_actions)

        # take action
        new_obs, rew, done, _ = env.step(action)
        replay_buffer.add(stateDeictic, actionDescriptors[action, :], rew,
                          new_obs, float(done))

        # sample from replay buffer and train
        if t > learning_starts and t % train_freq == 0:

            for iter in range(num_train_iter):

                states_t, actions, rewards, images_tp1, states_tp1, dones = replay_buffer.sample(
                    batch_size)

                moveDescriptorsNext = getMoveActionDescriptors(images_tp1)
                #                actionsPickDescriptorsNext = np.concatenate([np.zeros(np.shape(moveDescriptorsNext)),moveDescriptorsNext],axis=3)
                #                actionsPlaceDescriptorsNext = np.concatenate([np.ones(np.shape(moveDescriptorsNext)),moveDescriptorsNext],axis=3)
                actionsPickDescriptorsNext = np.concatenate([
                    moveDescriptorsNext,
                    np.zeros(np.shape(moveDescriptorsNext))
                ],
                                                            axis=3)
                actionsPlaceDescriptorsNext = np.concatenate([
                    np.zeros(np.shape(moveDescriptorsNext)),
                    moveDescriptorsNext
                ],
                                                             axis=3)
                actionDescriptorsNextFlat = np.stack(
                    [actionsPickDescriptorsNext, actionsPlaceDescriptorsNext],
                    axis=1)

                #            # TABULAR version
                #            actionDescriptorsNext = np.reshape(actionDescriptorsNextFlat,[batch_size*2*num_patches,-1]) == 1
                #            qNext = getTabular(actionDescriptorsNext)

                # DQN version
                actionDescriptorsNext = np.reshape(actionDescriptorsNextFlat, [
                    batch_size * 2 * num_patches, deicticActionShape[0],
                    deicticActionShape[1], deicticActionShape[2]
                ]) == 1
                qNext = getq(actionDescriptorsNext)

                states_tp1Full = np.repeat(states_tp1, 2 * num_patches)

                qNextTiled = np.reshape(
                    qNext[range(2 * batch_size * num_patches), -1,
                          states_tp1Full],
                    [batch_size, 2, num_patches, -1])  # USE CASCADE
                #            qNextTiled = np.reshape(qNext[range(2*batch_size*num_patches),0,states_tp1Full],[batch_size,2,num_patches,-1]) # NO CASCADE
                qNextmax = np.max(np.max(np.max(qNextTiled, 3), 2), 1)

                targets = rewards + (1 - dones) * gamma * qNextmax

                #            # TABULAR version
                #            qCurr = getTabular(actions)

                # DQN version
                qCurr = getq(actions)

                qCurrTarget = np.copy(qCurr)
                qCurrTarget[range(batch_size), 0, states_tp1] = targets
                for i in range(num_cascade - 1):
                    mask = targets < qCurr[range(batch_size), i, states_tp1]
                    qCurrTarget[range(batch_size),i+1,states_tp1] = \
                        mask*targets + \
                        (1-mask)*qCurrTarget[range(batch_size),i+1,states_tp1]

                # TABULAR version
                # trainTabular(actions, qCurrTarget)

                # DQN version
                targetTrain(actions, qCurrTarget)

        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(
                episode_rewards) % print_freq == 0:
            timerFinal = time.time()
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " +
                  str(int(100 * exploration.value(t))) + ", time elapsed: " +
                  str(timerFinal - timerStart))
            timerStart = timerFinal

        obs = new_obs
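
For clarity, a minimal standalone sketch (made-up numbers) of the cascade target construction used in the training loop above: level 0 always takes the TD target, and each deeper level is only pulled down when the target falls below the previous level's current value.

import numpy as np

num_cascade = 5
target = 0.7                                   # TD target for one (state, action) pair
qCurr = np.array([1.0, 0.9, 0.6, 0.5, 0.4])    # current cascade levels

qTarget = qCurr.copy()
qTarget[0] = target                            # level 0 always takes the TD target
for i in range(num_cascade - 1):
    if target < qCurr[i]:                      # deeper levels only move down
        qTarget[i + 1] = target

print(qTarget)                                 # [0.7 0.7 0.7 0.5 0.4]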
Example #6
class Agent():
    def __init__(self, state_size, action_size, config):
        self.action_size = action_size
        self.state_size = state_size
        self.Q = np.zeros([state_size, action_size])
        self.Q_inverse = np.zeros([state_size, action_size])
        self.debug_Q = np.zeros([state_size, action_size])
        self.Q_shift = np.zeros([state_size, action_size])
        self.r = np.zeros([state_size, action_size])  
        self.counter = np.zeros([state_size, action_size])
        self.gamma = config["gamma"]
        self.epsilon = 1
        self.lr = config["lr"]
        self.lr_iql_q = config["lr_iql_q"]
        self.lr_iql_r = config["lr_iql_r"]
        self.min_epsilon = config["min_epsilon"]
        self.max_epsilon = 1
        self.episode = 15000
        self.decay = config["decay"]
        self.total_reward = 0
        self.eval_frq = 50
        self.render_env = False
        self.env = gym.make(config["env_name"])
        self.memory = ReplayBuffer((1,),(1,),config["buffer_size"], config["device"])
        self.gamma_iql = 0.99
        self.lr_sh = config["lr_q_sh"]
        self.ratio = 1. / action_size
        self.eval_q_inverse = 50000
        self.episodes_qinverse = int(5e6)
        self.update_freq = config['freq_q']
        self.steps = 0
        pathname = "lr_inv_q {} lr_inv_r {} freq {}".format(self.lr_iql_q, self.lr_iql_r, self.update_freq)
        tensorboard_name = str(config["locexp"]) + '/runs/' + pathname 
        self.writer = SummaryWriter(tensorboard_name)
        tensorboard_name = str(config["locexp"]) + '/runs/' + "inverse" 
        self.writer_inverse = SummaryWriter(tensorboard_name)
        tensorboard_name = str(config["locexp"]) + '/runs/' + "expert" 
        self.writer_expert = SummaryWriter(tensorboard_name)
        self.last_100_reward_errors = deque(maxlen=100) 
        self.average_same_action = deque(maxlen=100) 
        self.expert_buffer_size = config["expert_buffer_size"]
    def act(self, state, epsilon, eval_pi=False, use_debug=False):

        if np.random.random() > epsilon or eval_pi:
            action = np.argmax(self.Q[state])
            if use_debug:
                action = np.argmax(self.debug_Q[state])
        else:
            action = self.env.action_space.sample() 
        return action
   
    def act_inverse_q(self, state):
        action = np.argmax(self.Q_inverse[state])
        return action
    
    def optimize(self, state, action, reward, next_state, debug=False):
        if debug:
            max_next_state = np.max(self.debug_Q[next_state])
            td_error =  max_next_state - self.debug_Q[state, action]
            self.debug_Q[(state,action)] = self.debug_Q[(state,action)] + self.lr * (reward + self.gamma *td_error)
            return

        max_next_state = np.max(self.Q[next_state])
        td_error =  max_next_state - self.Q[state, action]
        self.Q[(state,action)] = self.Q[(state,action)] + self.lr * (reward + self.gamma *td_error)
    
    def learn(self):
        states, actions, rewards, next_states, dones = self.memory.sample(self.batch_size)
        # update Q function for every sampled transition
        for state, action, reward, next_state, done in zip(states, actions, rewards, next_states, dones):
            max_next_state = np.max(self.Q[next_state])
            td_error = max_next_state - self.Q[state, action]  # same sign convention as optimize()
            self.Q[(state, action)] = self.Q[(state, action)] + self.lr * (reward + self.gamma * td_error)
    
    def compute_reward_loss(self, episode=10):
        """
        use the env to create the real reward and compare it to the predicted
        reward of the model
 
        """
        self.env.seed(np.random.randint(0,10))
        reward_loss = 0
        reward_list = []
        for epi in range(episode):
            state = self.env.reset()
            done = False
            while not done:
                action = np.argmax(self.trained_Q[state])
                next_state, reward, done, _ = self.env.step(action)
                predict_reward = self.r[state, action]
                reward_list.append((reward, predict_reward))
                if done: 
                    break
        reward_errors = [abs(r[0] - r[1]) for r in reward_list]
        reward_loss = sum(reward_errors) / len(reward_errors)
        self.last_100_reward_errors.append(reward_loss)
        average_loss = np.mean(self.last_100_reward_errors)
        print("average mean loss ", average_loss)
        self.writer.add_scalar('Reward_loss', reward_loss, self.steps)
        self.writer.add_scalar('Average_Reward_loss', average_loss, self.steps)
        #print(reward_loss)

    
    def invers_q(self, continue_train=False):
        self.memory.load_memory("memory") 
        self.load_q_table()
        if not continue_train:
            print("clean policy")
            self.Q = np.zeros([self.state_size, self.action_size])
        mkdir("", "inverse_policy") 
        for epi in range(1, self.episodes_qinverse + 1):
            self.steps += 1
            text = "Inverse Episode {} \r".format(epi)
            # print(text, end = '')
            if epi % self.eval_q_inverse == 0:
                self.start_reward()
                self.memory.save_memory("inverse_policy")
                self.save_q_table("inverse_Q")
                self.save_r_table()
                self.render_env = False
                self.eval_policy(use_inverse=True, episode=5)
                self.eval_policy(use_expert=True, episode=5)
                self.render_env =False
            state, action, r, next_state, _ = self.memory.sample(1)
            action = action[0][0]
            state = state[0][0]
            next_state = next_state[0][0]
            self.counter[state, action] += 1
            total_num = np.sum(self.counter[state,:])
            action_prob = self.counter[state] / total_num
            assert(np.isclose(np.sum(action_prob),1))
            # update Q shift 
            Q_shift_target = self.lr_sh * (self.gamma_iql * np.max(self.Q_inverse[next_state]))
            #print("q values", self.Q[state])
            self.Q_shift[state, action] = ((1 - self.lr_sh) * self.Q_shift[state, action]) + Q_shift_target
            # compute n a
            if action_prob[action] == 0:
                action_prob[action] =  np.finfo(float).eps
            n_a = np.log(action_prob[action]) - self.Q_shift[state, action]
            
            # update reward function
            self.update_r(state, action, n_a, action_prob)
            #self.debug_train()
            # update Q function
            self.update_q(state, action, next_state)
            # self.policy_diff(state, action)

    def update_q(self, state, action, next_state):
        q_old = (1 - self.lr_iql_q) * self.Q_inverse[state, action]
        q_new = self.lr_iql_q *(self.r[state, action] + (self.gamma_iql * np.max(self.Q_inverse[next_state])))
        #print("q old ", q_old)
        #print("q_new", q_new)
        #print("q invers ", q_old + q_new)
        self.Q_inverse[state, action] = q_old + q_new
        
    def update_r(self, state, action, n_a, action_prob):
        r_old = (1 - self.lr_iql_r) * self.r[state, action]
        part1 = n_a
        #print("part1", n_a)
        part2 = self.ratio * self.sum_over_action(state, action, action_prob)
        r_new = self.lr_iql_r * (part1 + part2)
        #print("r old ", r_old)
        #print("r_new", r_new)
        self.r[state, action] = r_old + r_new       
    
    def sum_over_action(self, state, a, action_prob):
        res = 0
        for b in range(self.action_size):
            if b == a:
                continue
            res = res + (self.r[state, b] - self.compute_n_a(state, b, action_prob))
        return res

    def compute_n_a(self, state, action, action_prob):
        if action_prob[action] == 0:
            action_prob[action] = np.finfo(float).eps
        return np.log(action_prob[action]) - self.Q_shift[state, action]
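
    # Taken together, update_q, update_r and compute_n_a implement the tabular
    # inverse-Q update driven by invers_q(): n_a = log(pi_expert(a|s)) - Q_shift(s, a)
    # for the taken action, update_r nudges r(s, a) toward
    # n_a + (1/action_size) * sum over b != a of (r(s, b) - n_b), and update_q then
    # soft-updates Q_inverse(s, a) toward r(s, a) + gamma_iql * max_a' Q_inverse(s', a').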


    def start_reward(self):
        self.env.seed(1)
        
        state = self.env.reset()
        print(state)
        ns, r, d, _ = self.env.step(0)
        np.set_printoptions(precision=2)
        print(" expert q {}".format(self.trained_Q[state])) 
        print("inverse q {}".format(self.Q_inverse[state]))
        return 

    
    def eval_policy(self, random_agent=False, use_expert=False, use_debug=False, use_inverse=False,episode=10):
        if use_expert:
            self.load_q_table()
        total_steps = 0
        total_reward = 0
        total_penetlies = 0
        for i_episode in range(1, episode + 1):
            score = 0
            steps = 0
            state = self.env.reset()
            done  = False
            penelty = 0
            while not done:
                steps += 1
                if use_expert:
                    action = np.argmax(self.trained_Q[state])
                elif random_agent:
                    action = self.env.action_space.sample() 
                elif use_debug:
                    action = np.argmax(self.debug_Q[state])
                elif use_inverse:
                    action = np.argmax(self.Q_inverse[state])
                else:
                    action = self.act(state, 0, True)
                
                next_state, reward, done, _ = self.env.step(action)
                state = next_state
                if self.render_env:
                    self.env.render()
                    time.sleep(0.1)
                score += reward
                if reward == -10:
                    penelty += 1
                if done:
                    total_steps += steps
                    total_reward += score
                    total_penetlies += penelty
                    break
        if self.render_env:
            self.env.close()
        aver_steps = total_steps / episode
        average_reward = total_reward / episode
        aver_penelties = total_penetlies / episode
        
        if use_expert:
            print("Expert average steps {} average reward {:.2f} average penalties {}".format(aver_steps, average_reward, aver_penelties))

        elif random_agent:
            print("Random Eval average steps {} average reward {:.2f} average penalties {}".format(aver_steps, average_reward, aver_penelties))

        elif use_inverse:
            print("Inverse q Eval average steps {} average reward {:.2f} average penalties {}".format(aver_steps, average_reward, aver_penelties))

        else:
            print("Eval average steps {} average reward {:.2f} average penalties {}".format(aver_steps, average_reward, aver_penelties))
            self.writer.add_scalar('Eval_Average_steps', aver_steps, self.steps)
            self.writer.add_scalar('Eval_Average_reward', average_reward, self.steps)
            self.writer.add_scalar('Eval_Average_penelties', aver_penelties, self.steps)
       
    def save_q_table(self, table="Q", filename="policy"):
        mkdir("", filename)
        if table == "Q":
            with open(filename + '/Q.npy', 'wb') as f:
                np.save(f, self.Q)
        if table =="inverse_Q":
            with open(filename + '/Inverse_Q.npy', 'wb') as f:
                np.save(f, self.Q_inverse)

    def load_q_table(self, table="Q", filename="policy"):
        if table == "Q":
            with open(filename + '/Q.npy', 'rb') as f:
                self.Q = np.load(f)
        if table == "inverse_Q":
            with open(filename + '/Inverse_Q.npy', 'rb') as f:
                self.Q_inverse = np.load(f)

        self.trained_Q = self.Q
    
    def save_r_table(self, filename="reward_function"):
        mkdir("", filename)
        with open(filename + '/r.npy', 'wb') as f:
            np.save(f, self.r)

    def load_r_table(self, filename="reward_function"):
        with open(filename + '/r.npy', 'rb') as f:
            self.r = np.load(f)


    def eval_inverse(self):
        self.load_q_table(table= "inverse_Q")
        for i_episode in range(1, 11):
            score = 0
            steps = 0
            penelties = 0
            state = self.env.reset()
            done  = False
            while not done:
                steps += 1
                print(self.Q_inverse)
                action = np.argmax(self.Q_inverse[state])
                next_state, reward, done, _ = self.env.step(action)
                score += reward
                if reward == -10:
                    penelties += 1
                state = next_state
            print("Inverse  steps {} reward  {:.2f}  penelty {} ".format(steps, score, penelties))




    def policy_diff(self, state, expert_action):

        self.trained_Q = self.Q

    def create_expert_policy(self):
        self.load_q_table()
        self.trained_Q = self.Q
        for i_episode in range(1, self.expert_buffer_size + 1):
            text = "create Buffer {} of {}\r".format(i_episode, self.expert_buffer_size)
            print(text, end=" ")
            state = self.env.reset()
            if state == 184:
                print("yes ")
            done  = False
            score = 0
            while True:
                action = self.act(state, 0, True)
                next_state, reward, done, _ = self.env.step(action)
                score += reward
                self.memory.add(state, action, reward, next_state, done, done)
                state = next_state
                if done:
                    #print("reward ", score)
                    break
        self.memory.save_memory("memory")


    def policy_diff(self, state, expert_action):
        # track how often the greedy inverse-Q action matches the expert action
        action = np.argmax(self.Q_inverse[state])
        self.average_same_action.append(float(action == expert_action))
        self.writer.add_scalar('Average_same_action', np.mean(self.average_same_action), self.steps)
        self.trained_Q = self.Q
        self.memory.save_memory("memory")
        
        
    def debug_train(self):
        """

        use the trained reward function to train the agent

        """
        state = self.env.reset()
        done  = False
        score = 0
        self.steps += 1
        epsiode_steps =  0
        while True:
            action = self.act(state, 0, True)
            next_state, _, done, _ = self.env.step(action)
            reward = self.r[state, action]
            self.optimize(state, action, reward, next_state, debug=True)

            score += reward
            epsiode_steps += 1
            if done:
                break
            state = next_state

        self.total_reward += score
        average_reward = self.total_reward / self.steps
        print("Episode {} Reward {:.2f} Average Reward {:.2f}  epi steps {}".format(self.steps, score, average_reward, epsiode_steps))


    def train(self):
      
        total_timestep = 0
        for i_episode in range(1, self.episode + 1):
            score = 0
            state = self.env.reset()
            done  = False
            steps = 0
            while not done:
                self.steps +=1
                steps += 1
                total_timestep += 1
                action = self.act(state, self.epsilon)
                next_state, reward, done, _ = self.env.step(action)
                score += reward
                self.optimize(state, action, reward, next_state)
                self.epsilon = self.min_epsilon + (self.max_epsilon - self.min_epsilon)*np.exp(-self.decay * i_episode)
                
                if done:
                    break
                state = next_state
            
            if i_episode % self.eval_frq == 0:
                self.eval_policy()
            
            self.total_reward += score
            average_reward = self.total_reward / i_episode
            print("Episode {} Reward {:.2f} Average Reward {:.2f} steps {}  epsilon {:.2f}".format(i_episode, score, average_reward, steps, self.epsilon))
            self.writer.add_scalar('Average_reward', average_reward, self.steps)
            self.writer.add_scalar('Train_reward', score, self.steps)
        self.trained_Q = self.Q
Example #7
def main():
    
    np.set_printoptions(formatter={'float_kind':lambda x: "%.2f" % x})

    # Dictionary-based value function
    q_func_tabular = {}

    # cols of vectorKey must be boolean less than 64 bits long
    def getTabularKeys(vectorKey):
        obsBits = np.packbits(vectorKey,1)
        obsKeys = 0
        for i in range(np.shape(obsBits)[1]):
            # IMPORTANT: the number of bits in the type cast below (UINT64) must be at least as big
            # as the bits required to encode obsBits. If it is too small, we get hash collisions...
            obsKeys = obsKeys + (256**i) * np.uint64(obsBits[:,i])
        return obsKeys
    
    def getTabular(vectorKey):
        keys = getTabularKeys(vectorKey)
        return np.array([q_func_tabular[x] if x in q_func_tabular else 10*np.ones(num_states) for x in keys])
    
#    def trainTabular(vectorKey,qCurrTargets,weights):
    def trainTabular(vectorKey,qCurrTargets,weights):
        keys = getTabularKeys(vectorKey)
        alpha=0.2
        for i in range(len(keys)):
            if keys[i] in q_func_tabular:
#                q_func_tabular[keys[i]] = (1-alpha)*q_func_tabular[keys[i]] + alpha*qCurrTargets[i]
                q_func_tabular[keys[i]] = q_func_tabular[keys[i]] + alpha*weights[i]*(qCurrTargets[i] - q_func_tabular[keys[i]]) # (1-alpha)*q_func[keys[i]] + alpha*qCurrTargets[i]
            else:
                q_func_tabular[keys[i]] = qCurrTargets[i]


    env = envstandalone.NumbersArrange()

    # Standard q-learning parameters
    max_timesteps=8000
    exploration_fraction=0.3
    exploration_final_eps=0.1
    gamma=.90
    num_cpu = 16

    # Used by buffering and DQN
    learning_starts=10
    buffer_size=1
    batch_size=1
    target_network_update_freq=1
    train_freq=1
    print_freq=1
    lr=0.0003

    # first two elts of deicticShape must be odd
#    actionShape = (3,3,2)
    actionShape = (env.blockSideSize*3,env.blockSideSize*3,2)
    actionShapeSmall = (10,10,2) # shrink actionShape down to this size for faster processing
    num_states = 2 # either holding or not
    num_patches = env.numBlocksWide**2
    num_actions = 2*num_patches
    num_actions_discrete = 2
#    valueFunctionType = "TABULAR"
    valueFunctionType = "DQN"
#    actionSelectionStrategy = "UNIFORM_RANDOM" # actions are selected randomly from collection of all actions
    actionSelectionStrategy = "RANDOM_UNIQUE" # each unique action descriptor has equal chance of being selected

    episode_rewards = [0.0]
    
    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

#    prioritized_replay=True
    prioritized_replay=False
#    prioritized_replay_alpha=1.0
    prioritized_replay_alpha=0.6
    prioritized_replay_beta0=0.4
    prioritized_replay_beta_iters=None
#    prioritized_replay_beta_iters=20000
    prioritized_replay_eps=1e-6
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None
    beta = 1

    q_func = models.cnn_to_mlp(
#    q_func = models.cnn_to_mlp_2pathways(
#        convs=[(16,3,1), (32,3,1)],
#        hiddens=[48],
        convs=[(16,3,1)],
        hiddens=[32],
#        convs=[(32,3,1)],
#        hiddens=[48],
#        convs=[(48,3,1)],
#        hiddens=[48],
        dueling=True
    )

    def make_obs_ph(name):
        return U.BatchInput(env.observation_space.spaces[0].shape, name=name)

    def make_actionDeic_ph(name):
#        return U.BatchInput(actionShape, name=name)
        return U.BatchInput(actionShapeSmall, name=name)

    def make_target_ph(name):
        return U.BatchInput([1], name=name)

    def make_weight_ph(name):
        return U.BatchInput([1], name=name)

    getMoveActionDescriptors = build_getMoveActionDescriptors(make_obs_ph=make_obs_ph,actionShape=actionShape,actionShapeSmall=actionShapeSmall)
    
    if valueFunctionType == 'DQN':
        getqNotHolding = build_getq(
                make_actionDeic_ph=make_actionDeic_ph,
                q_func=q_func,
                num_states=num_states,
                num_cascade=5,
                scope="deepq",
                qscope="q_func_notholding"
                )
        getqHolding = build_getq(
                make_actionDeic_ph=make_actionDeic_ph,
                q_func=q_func,
                num_states=num_states,
                num_cascade=5,
                scope="deepq",
                qscope="q_func_holding"
                )
    
        targetTrainNotHolding = build_targetTrain(
            make_actionDeic_ph=make_actionDeic_ph,
            make_target_ph=make_target_ph,
            make_weight_ph=make_weight_ph,
            q_func=q_func,
            num_states=num_states,
            num_cascade=5,
            optimizer=tf.train.AdamOptimizer(learning_rate=lr),
            scope="deepq", 
            qscope="q_func_notholding",
            grad_norm_clipping=1.
        )

        targetTrainHolding = build_targetTrain(
            make_actionDeic_ph=make_actionDeic_ph,
            make_target_ph=make_target_ph,
            make_weight_ph=make_weight_ph,
            q_func=q_func,
            num_states=num_states,
            num_cascade=5,
            optimizer=tf.train.AdamOptimizer(learning_rate=lr),
            scope="deepq", 
            qscope="q_func_holding",
            grad_norm_clipping=1.
        )
        
    sess = U.make_session(num_cpu)
    sess.__enter__()

    obs = env.reset()

    episode_rewards = [0.0]
    td_errors = [0.0]
    timerStart = time.time()
    U.initialize()
    for t in range(max_timesteps):
        
        # Get action set: <num_patches> pick actions followed by <num_patches> place actions
        moveDescriptors = getMoveActionDescriptors([obs[0]])
        moveDescriptors = moveDescriptors*2-1
        actionsPickDescriptors = np.stack([moveDescriptors, np.zeros(np.shape(moveDescriptors))],axis=3)
        actionsPlaceDescriptors = np.stack([np.zeros(np.shape(moveDescriptors)),moveDescriptors],axis=3)
        actionDescriptors = np.r_[actionsPickDescriptors,actionsPlaceDescriptors]

        # Get qCurr values
        if valueFunctionType == "TABULAR":
            actionDescriptorsFlat = np.reshape(actionDescriptors,[-1,actionShape[0]*actionShape[1]*actionShape[2]]) == 1
            qCurr = getTabular(actionDescriptorsFlat)
        else:
            qCurrNotHolding = getqNotHolding(actionDescriptors)
            qCurrHolding = getqHolding(actionDescriptors)
            qCurr = np.concatenate([qCurrNotHolding,qCurrHolding],axis=1)

        # select action: epsilon-greedy over the q-values for the current holding state
        qCurrNoise = qCurr + np.random.random(np.shape(qCurr))*0.01 # add small amount of noise to break ties randomly
        if actionSelectionStrategy == "UNIFORM_RANDOM":
            action = np.argmax(qCurrNoise[:,obs[1]])
            if np.random.rand() < exploration.value(t):
                action = np.random.randint(num_actions)
        elif actionSelectionStrategy == "RANDOM_UNIQUE":
            _,idx,inv = np.unique(actionDescriptors,axis=0,return_index=True,return_inverse=True)
            actionIdx = np.argmax(qCurrNoise[idx,obs[1]])
            if np.random.rand() < exploration.value(t):
                actionIdx = np.random.randint(len(idx))
            actionsSelected = np.nonzero(inv==actionIdx)[0]
            action = actionsSelected[np.random.randint(len(actionsSelected))]
        else:
            raise ValueError("unrecognized actionSelectionStrategy: " + actionSelectionStrategy)

        # take action
        new_obs, rew, done, _ = env.step(action)
        
        replay_buffer.add(obs[1], actionDescriptors[action,:], rew, np.copy(new_obs), float(done))

        if t > learning_starts and t % train_freq == 0:

            # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
            if prioritized_replay:
                beta=beta_schedule.value(t)
                states_t, actionPatches, rewards, images_tp1, states_tp1, dones, weights, batch_idxes = replay_buffer.sample(batch_size, beta)
            else:
                states_t, actionPatches, rewards, images_tp1, states_tp1, dones = replay_buffer.sample(batch_size)
                weights, batch_idxes = np.ones_like(rewards), None

            moveDescriptorsNext = getMoveActionDescriptors(images_tp1)
            moveDescriptorsNext = moveDescriptorsNext*2-1

            actionsPickDescriptorsNext = np.stack([moveDescriptorsNext, np.zeros(np.shape(moveDescriptorsNext))],axis=3)
            actionsPlaceDescriptorsNext = np.stack([np.zeros(np.shape(moveDescriptorsNext)), moveDescriptorsNext],axis=3)
            actionDescriptorsNext = np.stack([actionsPickDescriptorsNext, actionsPlaceDescriptorsNext], axis=1) # I sometimes get this axis parameter wrong... pay attention!
#            actionDescriptorsNext = np.reshape(actionDescriptorsNext,[batch_size*num_patches*num_actions_discrete,actionShape[0],actionShape[1],actionShape[2]])
            actionDescriptorsNext = np.reshape(actionDescriptorsNext,[batch_size*num_patches*num_actions_discrete,actionShapeSmall[0],actionShapeSmall[1],actionShapeSmall[2]])

            if valueFunctionType == "TABULAR":
                actionDescriptorsNextFlat = np.reshape(actionDescriptorsNext,[batch_size*num_patches*num_actions_discrete,-1]) == 1
                qNextFlat = getTabular(actionDescriptorsNextFlat)
            else:
                qNextNotHolding = getqNotHolding(actionDescriptorsNext)
                qNextHolding = getqHolding(actionDescriptorsNext)
                qNextFlat = np.concatenate([qNextNotHolding,qNextHolding],axis=1)

            qNext = np.reshape(qNextFlat,[batch_size,num_patches,num_actions_discrete,num_states])
            qNextmax = np.max(np.max(qNext[range(batch_size),:,:,states_tp1],2),1)
            targets = rewards + (1-dones) * gamma * qNextmax
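            # qNext holds one value per (batch item, patch, pick/place, holding
            # state); the target uses the best next value for the sampled next
            # holding state, maxed over the patch and pick/place axes.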
            
            if valueFunctionType == "TABULAR":
                actionsFlat = np.reshape(actionPatches,[batch_size,-1]) == 1
                qCurrTarget = getTabular(actionsFlat)
            else:
                qCurrTargetNotHolding = getqNotHolding(actionPatches)
                qCurrTargetHolding = getqHolding(actionPatches)
                qCurrTarget = np.concatenate([qCurrTargetNotHolding,qCurrTargetHolding],axis=1)

            td_error = qCurrTarget[range(batch_size),states_t] - targets
            qCurrTarget[range(batch_size),states_t] = targets
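            # Only the column for the sampled holding state is overwritten with
            # the Bellman target; the other column keeps the network's own
            # prediction, so its training signal below is effectively a no-op.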

            if valueFunctionType == "TABULAR":
                trainTabular(actionsFlat, qCurrTarget, np.tile(np.reshape(weights,[batch_size,1]),[1,2]))
            else:
                targetTrainNotHolding(actionPatches, np.reshape(qCurrTarget[:,0],[batch_size,1]), np.reshape(weights,[batch_size,1]))
                targetTrainHolding(actionPatches, np.reshape(qCurrTarget[:,1],[batch_size,1]), np.reshape(weights,[batch_size,1]))

            if prioritized_replay:
                new_priorities = np.abs(td_error) + prioritized_replay_eps
                replay_buffer.update_priorities(batch_idxes, new_priorities)

            td_errors[-1] += np.sum(td_error)  # accumulate a scalar so the running list stays homogeneous


        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)
            td_errors.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        mean_100ep_tderror = round(np.mean(td_errors[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
            timerFinal = time.time()
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", time elapsed: " + str(timerFinal - timerStart) + ", tderror: " + str(mean_100ep_tderror))
            timerStart = timerFinal
        
        obs = np.copy(new_obs)


    # display value function
    obs = env.reset()
    moveDescriptors = getMoveActionDescriptors([obs[0]])
    moveDescriptors = moveDescriptors*2-1

    actionsPickDescriptors = np.stack([moveDescriptors, np.zeros(np.shape(moveDescriptors))],axis=3)
    actionsPlaceDescriptors = np.stack([np.zeros(np.shape(moveDescriptors)), moveDescriptors],axis=3)
    
    print(str(obs[0][:,:,0]))
    
    if valueFunctionType == "TABULAR":
        qPick = getTabular(np.reshape(actionsPickDescriptors,[num_patches,-1])==1)
    else:
        qPickNotHolding = getqNotHolding(actionsPickDescriptors)
        qPickHolding = getqHolding(actionsPickDescriptors)
        qPick = np.concatenate([qPickNotHolding,qPickHolding],axis=1)
    print("Value function for pick action in hold-nothing state:")
    print(str(np.reshape(qPick[:,0],[8,8])))
    print("Value function for pick action in hold-1 state:")
    print(str(np.reshape(qPick[:,1],[8,8])))

    if valueFunctionType == "TABULAR":
        qPlace = getTabular(np.reshape(actionsPlaceDescriptors,[num_patches,-1])==1)
    else:
        qPlaceNotHolding = getqNotHolding(actionsPlaceDescriptors)
        qPlaceHolding = getqHolding(actionsPlaceDescriptors)
        qPlace = np.concatenate([qPlaceNotHolding,qPlaceHolding],axis=1)    
    print("Value function for place action in hold-nothing state:")
    print(str(np.reshape(qPlace[:,0],[8,8])))
    print("Value function for place action in hold-1 state:")
    print(str(np.reshape(qPlace[:,1],[8,8])))
Exemplo n.º 8
0
def main(envStride, fileIn, fileOut, inputmaxtimesteps):

    reuseModels = None
    
    np.set_printoptions(formatter={'float_kind':lambda x: "%.2f" % x})

    env = envstandalone.PuckArrange()
    env.stride = envStride # stride input to this problem
    env.reset() # need to do the reset here in order to populate parameters
    
    # Standard q-learning parameters
#    max_timesteps=2000
    max_timesteps=inputmaxtimesteps
    exploration_fraction=0.3
    exploration_final_eps=0.1
    gamma=.90
    num_cpu = 16

    # Used by buffering and DQN
    learning_starts=60
    buffer_size=1000
    batch_size=10
    target_network_update_freq=1
    train_freq=1
    print_freq=1
    lr=0.0003

    # first two elts of descriptorShape must be odd
    descriptorShape = (env.blockSize*3,env.blockSize*3,2)
#    descriptorShapeSmall = (10,10,2)
#    descriptorShapeSmall = (15,15,2)
    descriptorShapeSmall = (20,20,2)
    num_states = 2 # either holding or not
    num_patches = len(env.moveCenters)**2
    num_actions = 2*num_patches
    num_actions_discrete = 2
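    # The agent chooses one of num_patches grid cells and whether to pick or
    # place there, giving 2*num_patches actions in total.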
#    valueFunctionType = "TABULAR"
    valueFunctionType = "DQN"
#    actionSelectionStrategy = "UNIFORM_RANDOM" # actions are selected randomly from collection of all actions
    actionSelectionStrategy = "RANDOM_UNIQUE" # each unique action descriptor has equal chance of being selected

    episode_rewards = [0.0]
    
    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

#    prioritized_replay=True
    prioritized_replay=False
#    prioritized_replay_alpha=1.0
    prioritized_replay_alpha=0.6
    prioritized_replay_beta0=0.4
    prioritized_replay_beta_iters=None
#    prioritized_replay_beta_iters=20000
    prioritized_replay_eps=1e-6
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None
    beta = 1

    q_func = models.cnn_to_mlp(
#    q_func = models.cnn_to_mlp_2pathways(
#        convs=[(16,3,1), (32,3,1)],
#        hiddens=[48],
        convs=[(16,3,1)],
        hiddens=[32],
#        convs=[(32,3,1)],
#        hiddens=[48],
#        convs=[(48,3,1)],
#        hiddens=[48],
        dueling=True
    )

    def make_obs_ph(name):
        return U.BatchInput(env.observation_space.spaces[0].shape, name=name)

    def make_actionDeic_ph(name):
        return U.BatchInput(descriptorShapeSmall, name=name)

    def make_target_ph(name):
        return U.BatchInput([1], name=name)

    def make_weight_ph(name):
        return U.BatchInput([1], name=name)

    getMoveActionDescriptors = build_getMoveActionDescriptors(make_obs_ph=make_obs_ph,actionShape=descriptorShape,actionShapeSmall=descriptorShapeSmall,stride=env.stride)
    
    if valueFunctionType == 'DQN':
        getqNotHolding = build_getq(
                make_actionDeic_ph=make_actionDeic_ph,
                q_func=q_func,
                num_states=num_states,
                num_cascade=5,
                scope="deepq",
                qscope="q_func_notholding",
                reuse=reuseModels
                )
        getqHolding = build_getq(
                make_actionDeic_ph=make_actionDeic_ph,
                q_func=q_func,
                num_states=num_states,
                num_cascade=5,
                scope="deepq",
                qscope="q_func_holding",
                reuse=reuseModels
                )
    
        targetTrainNotHolding = build_targetTrain(
            make_actionDeic_ph=make_actionDeic_ph,
            make_target_ph=make_target_ph,
            make_weight_ph=make_weight_ph,
            q_func=q_func,
            num_states=num_states,
            num_cascade=5,
            optimizer=tf.train.AdamOptimizer(learning_rate=lr),
            scope="deepq", 
            qscope="q_func_notholding",
            grad_norm_clipping=1.,
            reuse=reuseModels
        )

        targetTrainHolding = build_targetTrain(
            make_actionDeic_ph=make_actionDeic_ph,
            make_target_ph=make_target_ph,
            make_weight_ph=make_weight_ph,
            q_func=q_func,
            num_states=num_states,
            num_cascade=5,
            optimizer=tf.train.AdamOptimizer(learning_rate=lr),
            scope="deepq", 
            qscope="q_func_holding",
            grad_norm_clipping=1.,
            reuse=reuseModels
        )
        
    sess = U.make_session(num_cpu)
    sess.__enter__()

    obs = env.reset()

    episode_rewards = [0.0]
    td_errors = [0.0]
    timerStart = time.time()
    U.initialize()
    
    # load prior model
    if fileIn != "None":
        saver = tf.train.Saver()
        saver.restore(sess, fileIn)

    for t in range(max_timesteps):
        
        # Get action set: <num_patches> pick actions followed by <num_patches> place actions
        moveDescriptors = getMoveActionDescriptors([obs[0]])
        moveDescriptors = moveDescriptors*2-1
        actionsPickDescriptors = np.stack([moveDescriptors, np.zeros(np.shape(moveDescriptors))],axis=3)
        actionsPlaceDescriptors = np.stack([np.zeros(np.shape(moveDescriptors)),moveDescriptors],axis=3)
        actionDescriptors = np.r_[actionsPickDescriptors,actionsPlaceDescriptors]

        qCurrNotHolding = getqNotHolding(actionDescriptors)
        qCurrHolding = getqHolding(actionDescriptors)
        qCurr = np.concatenate([qCurrNotHolding,qCurrHolding],axis=1)

        # select action: epsilon-greedy over the q-values for the current holding state
        qCurrNoise = qCurr + np.random.random(np.shape(qCurr))*0.01 # add small amount of noise to break ties randomly
        if actionSelectionStrategy == "UNIFORM_RANDOM":
            action = np.argmax(qCurrNoise[:,obs[1]])
            if np.random.rand() < exploration.value(t):
                action = np.random.randint(num_actions)
        elif actionSelectionStrategy == "RANDOM_UNIQUE":
            _,idx,inv = np.unique(actionDescriptors,axis=0,return_index=True,return_inverse=True)
            actionIdx = np.argmax(qCurrNoise[idx,obs[1]])
            if np.random.rand() < exploration.value(t):
                actionIdx = np.random.randint(len(idx))
            actionsSelected = np.nonzero(inv==actionIdx)[0]
            action = actionsSelected[np.random.randint(len(actionsSelected))]
        else:
            raise ValueError("unrecognized actionSelectionStrategy: " + actionSelectionStrategy)

        # take action
        new_obs, rew, done, _ = env.step(action)
        
        replay_buffer.add(obs[1], actionDescriptors[action,:], rew, np.copy(new_obs), float(done))

        if t > learning_starts and t % train_freq == 0:

            # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
            if prioritized_replay:
                beta=beta_schedule.value(t)
                states_t, actionPatches, rewards, images_tp1, states_tp1, dones, weights, batch_idxes = replay_buffer.sample(batch_size, beta)
            else:
                states_t, actionPatches, rewards, images_tp1, states_tp1, dones = replay_buffer.sample(batch_size)
                weights, batch_idxes = np.ones_like(rewards), None

            moveDescriptorsNext = getMoveActionDescriptors(images_tp1)
            moveDescriptorsNext = moveDescriptorsNext*2-1

            actionsPickDescriptorsNext = np.stack([moveDescriptorsNext, np.zeros(np.shape(moveDescriptorsNext))],axis=3)
            actionsPlaceDescriptorsNext = np.stack([np.zeros(np.shape(moveDescriptorsNext)), moveDescriptorsNext],axis=3)
            actionDescriptorsNext = np.stack([actionsPickDescriptorsNext, actionsPlaceDescriptorsNext], axis=1) # I sometimes get this axis parameter wrong... pay attention!
            actionDescriptorsNext = np.reshape(actionDescriptorsNext,[-1,descriptorShapeSmall[0],descriptorShapeSmall[1],descriptorShapeSmall[2]])
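            # All candidate next actions (every patch x pick/place for each
            # batch item) are flattened into one array so each network head is
            # evaluated in a single forward pass.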

            qNextNotHolding = getqNotHolding(actionDescriptorsNext)
            qNextHolding = getqHolding(actionDescriptorsNext)
            qNextFlat = np.concatenate([qNextNotHolding,qNextHolding],axis=1)

            qNext = np.reshape(qNextFlat,[batch_size,num_patches,num_actions_discrete,num_states])
            qNextmax = np.max(np.max(qNext[range(batch_size),:,:,states_tp1],2),1)
            
            targets = rewards + (1-dones) * gamma * qNextmax
            
            qCurrTargetNotHolding = getqNotHolding(actionPatches)
            qCurrTargetHolding = getqHolding(actionPatches)
            qCurrTarget = np.concatenate([qCurrTargetNotHolding,qCurrTargetHolding],axis=1)

            td_error = qCurrTarget[range(batch_size),states_t] - targets
            qCurrTarget[range(batch_size),states_t] = targets

            targetTrainNotHolding(actionPatches, np.reshape(qCurrTarget[:,0],[batch_size,1]), np.reshape(weights,[batch_size,1]))
            targetTrainHolding(actionPatches, np.reshape(qCurrTarget[:,1],[batch_size,1]), np.reshape(weights,[batch_size,1]))

            if prioritized_replay:
                new_priorities = np.abs(td_error) + prioritized_replay_eps
                replay_buffer.update_priorities(batch_idxes, new_priorities)

            td_errors[-1] += np.sum(td_error)  # accumulate a scalar so the running list stays homogeneous


        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)
            td_errors.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
#        mean_100ep_tderror = round(np.mean(td_errors[-51:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
            timerFinal = time.time()
#            print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", time elapsed: " + str(timerFinal - timerStart) + ", tderror: " + str(mean_100ep_tderror))
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 100 episode reward: " + str(mean_100ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", time elapsed: " + str(timerFinal - timerStart))
            timerStart = timerFinal
        
        obs = np.copy(new_obs)

    # save what we learned
    if fileOut != "None":
        saver = tf.train.Saver()
        saver.save(sess, fileOut)

    # display value function
    obs = env.reset()
    moveDescriptors = getMoveActionDescriptors([obs[0]])
    moveDescriptors = moveDescriptors*2-1
    gridSize = np.int32(np.sqrt(np.shape(moveDescriptors)[0]))
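    # assumes the move centers form a square grid, so the descriptor count is a perfect square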

    actionsPickDescriptors = np.stack([moveDescriptors, np.zeros(np.shape(moveDescriptors))],axis=3)
    actionsPlaceDescriptors = np.stack([np.zeros(np.shape(moveDescriptors)), moveDescriptors],axis=3)
    
    print(str(obs[0][:,:,0]))
    
    qPickNotHolding = getqNotHolding(actionsPickDescriptors)
    qPickHolding = getqHolding(actionsPickDescriptors)
    qPick = np.concatenate([qPickNotHolding,qPickHolding],axis=1)
    print("Value function for pick action in hold-nothing state:")
    print(str(np.reshape(qPick[:,0],[gridSize,gridSize])))
    print("Value function for pick action in hold-1 state:")
    print(str(np.reshape(qPick[:,1],[gridSize,gridSize])))

    qPlaceNotHolding = getqNotHolding(actionsPlaceDescriptors)
    qPlaceHolding = getqHolding(actionsPlaceDescriptors)
    qPlace = np.concatenate([qPlaceNotHolding,qPlaceHolding],axis=1)
    print("Value function for place action in hold-nothing state:")
    print(str(np.reshape(qPlace[:,0],[gridSize,gridSize])))
    print("Value function for place action in hold-1 state:")
    print(str(np.reshape(qPlace[:,1],[gridSize,gridSize])))
    
    plt.subplot(1,3,1)
    plt.imshow(np.tile(env.state[0],[1,1,3]))
    plt.subplot(1,3,2)
    plt.imshow(np.reshape(qPick[:,0],[gridSize,gridSize]))
    plt.subplot(1,3,3)
    plt.imshow(np.reshape(qPlace[:,1],[gridSize,gridSize]))
    plt.show()
Exemplo n.º 9
0
def main():

    # Define environment
    env = envstandalone.BlockArrange()

    # Dictionary-based value function
    q_func = {}

    # cols of vectorKey must be boolean less than 64 bits long
    def getTabularKeys(vectorKey):
        obsBits = np.packbits(vectorKey, 1)
        obsKeys = 0
        for i in range(np.shape(obsBits)[1]):
            # IMPORTANT: the number of bits in the type cast below (UINT64) must be at least as big
            # as the bits required to encode obsBits. If it is too small, we get hash collisions...
            obsKeys = obsKeys + (256**i) * np.uint64(obsBits[:, i])
        return obsKeys
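    # Illustrative example (not executed): a 9-bit row such as
    # [1,0,1,1,0,0,1,0,1] packs via np.packbits into the bytes [178, 128],
    # giving the key 178 + 256*128 = 32946. Rows longer than 64 bits would
    # overflow the uint64 key and distinct descriptors could collide.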

    def getTabular(vectorKey):
        keys = getTabularKeys(vectorKey)
        return np.array([
            q_func[x] if x in q_func else 0 * np.ones(num_states) for x in keys
        ])

    def trainTabular(vectorKey, qCurrTargets):
        keys = getTabularKeys(vectorKey)
        alpha = 1.0
        for i in range(len(keys)):
            if keys[i] in q_func:
                q_func[keys[i]] = (
                    1 - alpha) * q_func[keys[i]] + alpha * qCurrTargets[i]
            else:
                q_func[keys[i]] = qCurrTargets[i]
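    # With alpha = 1.0 each visit simply overwrites the stored entry with the
    # new target; a smaller alpha would blend old and new estimates.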

    # Standard DQN parameters
    max_timesteps = 40000
    #    learning_starts=1000
    learning_starts = 10
    #    buffer_size=50000
    #    buffer_size=10000
    #    buffer_size=1000
    #    buffer_size=100
    #    buffer_size=32
    buffer_size = 8
    #    buffer_size=1
    exploration_fraction = 0.2
    exploration_final_eps = 0.02
    print_freq = 1
    gamma = .98
    target_network_update_freq = 1
    #    batch_size=32
    batch_size = 1
    train_freq = 1
    #    train_freq=2
    num_cpu = 16
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)
    replay_buffer = ReplayBuffer(buffer_size)

    # Deictic state/action parameters
    deicticShape = (3, 3, 2
                    )  # IMPORTANT: first two elts of deicticShape must be odd
    deicticActionShape = (3, 3, 4)
    num_cascade = 5
    num_states = env.num_blocks + 1  # one more state than blocks to account for not holding anything
    num_patches = env.maxSide**2
    num_actions = 2 * num_patches
    num_actions_discrete = 2

    def make_obs_ph(name):
        return U.BatchInput(env.observation_space.spaces[0].shape, name=name)

    getMoveActionDescriptors = build_getMoveActionDescriptors(
        make_obs_ph=make_obs_ph, deicticShape=deicticShape)

    # Start tensorflow session
    sess = U.make_session(num_cpu)
    sess.__enter__()

    episode_rewards = [0.0]
    timerStart = time.time()
    obs = env.reset()
    for t in range(max_timesteps):

        # Get state: in range(0,env.num_blocks)
        stateDeictic = obs[1]  # holding

        # Get action set: <num_patches> pick actions followed by <num_patches> place actions
        moveDescriptors = getMoveActionDescriptors([obs[0]])
        actionsPickDescriptors = np.concatenate(
            [moveDescriptors,
             np.zeros(np.shape(moveDescriptors))], axis=3)
        actionsPlaceDescriptors = np.concatenate(
            [np.zeros(np.shape(moveDescriptors)), moveDescriptors], axis=3)
        actionDescriptors = np.r_[actionsPickDescriptors,
                                  actionsPlaceDescriptors]
        actionDescriptors = np.reshape(actionDescriptors, [
            -1, deicticActionShape[0] * deicticActionShape[1] *
            deicticActionShape[2]
        ]) == 1

        # Get q-values
        qCurr = getTabular(actionDescriptors)

        # select action
        qCurrNoise = qCurr + np.random.random(np.shape(
            qCurr)) * 0.01  # add small amount of noise to break ties randomly
        action = np.argmax(qCurrNoise[:, stateDeictic])
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(num_actions)

        # take action
        new_obs, rew, done, _ = env.step(action)
        replay_buffer.add(stateDeictic, actionDescriptors[action, :], rew,
                          new_obs, float(done))

        if t > learning_starts and t % train_freq == 0:

            states_t, actions, rewards, images_tp1, states_tp1, dones = replay_buffer.sample(
                batch_size)

            moveDescriptorsNext1 = getMoveActionDescriptors(images_tp1)
            actionsPickDescriptorsNext1 = np.concatenate([
                moveDescriptorsNext1,
                np.zeros(np.shape(moveDescriptorsNext1))
            ],
                                                         axis=3)
            actionsPlaceDescriptorsNext1 = np.concatenate([
                np.zeros(np.shape(moveDescriptorsNext1)), moveDescriptorsNext1
            ],
                                                          axis=3)
            actionDescriptorsNext1 = np.stack(
                [actionsPickDescriptorsNext1, actionsPlaceDescriptorsNext1],
                axis=0)
            actionDescriptorsNextFlat1 = np.reshape(
                actionDescriptorsNext1,
                [batch_size * num_patches * num_actions_discrete, -1]) == 1
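            # Note: the pick/place descriptors are stacked along axis 0 here
            # (the DQN variants above stack along axis 1); with batch_size = 1
            # the max over both the patch and action-type axes below yields the
            # same qNextmax1 either way.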

            qNextFlat1 = getTabular(actionDescriptorsNextFlat1)
            qNext1 = np.reshape(
                qNextFlat1,
                [batch_size, num_patches, num_actions_discrete, num_states])
            qNextmax1 = np.max(
                np.max(qNext1[range(batch_size), :, :, states_tp1], 2), 1)
            targets1 = rewards + (1 - dones) * gamma * qNextmax1

            qCurrTarget1 = getTabular(actions)
            qCurrTarget1[range(batch_size), states_t] = targets1
            trainTabular(actions, qCurrTarget1)

        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(
                episode_rewards) % print_freq == 0:
            timerFinal = time.time()
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " +
                  str(int(100 * exploration.value(t))) + ", time elapsed: " +
                  str(timerFinal - timerStart))
            timerStart = timerFinal

        obs = new_obs

    # display value function
    obs = env.reset()
    moveDescriptors = getMoveActionDescriptors([obs[0]])
    actionsPickDescriptors = np.concatenate(
        [moveDescriptors, np.zeros(np.shape(moveDescriptors))], axis=3)
    actionsPlaceDescriptors = np.concatenate(
        [np.zeros(np.shape(moveDescriptors)), moveDescriptors], axis=3)

    print(str(obs[0][:, :, 0]))

    #    qPick = getq(actionsPickDescriptors)
    qPick = getTabular(
        np.reshape(actionsPickDescriptors, [num_patches, -1]) == 1)
    print("Value function for pick action in hold-nothing state:")
    print(str(np.reshape(qPick[:, 0], [8, 8])))

    #    qPlace = getq(actionsPlaceDescriptors)
    qPlace = getTabular(
        np.reshape(actionsPlaceDescriptors, [num_patches, -1]) == 1)
    print("Value function for place action in hold-1 state:")
    print(str(np.reshape(qPlace[:, 1], [8, 8])))

    print("Value function for place action in hold-2 state:")
    print(str(np.reshape(qPlace[:, 2], [8, 8])))
import gym
import random
import torch
import numpy as np
from dqn_agent import DQNAgent
from replay_buffer2 import ReplayBuffer
from iql_agent import mkdir

env = gym.make('LunarLander-v2')
env.seed(0)

print('State shape: ', env.observation_space.shape)
print('Number of actions: ', env.action_space.n)
agent = DQNAgent(state_size=8, action_size=4, seed=0)

agent.qnetwork_local.load_state_dict(torch.load('checkpoint.pth'))
memory = ReplayBuffer((8, ), (1, ), 20000, 'cuda')
n_episodes = 40
max_t = 500
eps = 0
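# With eps = 0 the pretrained agent acts greedily; its transitions are stored
# in the replay buffer, presumably to serve as expert demonstrations.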
for i_episode in range(1, n_episodes + 1):
    state = env.reset()
    score = 0
    for t in range(max_t):
        action = agent.act(state, eps)
        next_state, reward, done, _ = env.step(action)
        score += reward
        memory.add(state, action, reward, next_state, done, done)
        state = next_state
        # env.render()
        if done:
            print("Episode {}  Reward {}".format(i_episode, score))