Example #1
    def __init__(self,sess,settings,netConfigOverride,stateShape,actionSize,nTrajs=1,**kwargs):
        """
        Initializes I/O placeholders used for Tensorflow session runs.
        Initializes an Actor and a Critic network to be used for RL.
        """
        #Creating appropriate buffer for the method.
        self.buffer = [Trajectory(depth=5) for _ in range(nTrajs)]

        #Placeholders
        self.sess=sess
        self.HPs = settings["NetworkHPs"]

        self.s = tf.placeholder(dtype=tf.float32, shape=[None]+stateShape, name="state")
        self.a = tf.placeholder(tf.int32, [None,1], "act")
        # self.td_error = tf.placeholder(tf.float32, None, "td_error")  # TD_error
        self.v_ = tf.placeholder(tf.float32, [None, 1], "v_next")
        self.r = tf.placeholder(tf.float32, [None,1], 'r')


        #These need to be returned in the call function of a tf.keras.Model class.
        self.Model = NetworkBuilder(networkConfig=settings["NetworkConfig"],netConfigOverride=netConfigOverride,actionSize=actionSize)

        inputs = {"state":self.s}
        out = self.Model(inputs)
        self.acts_prob = out["actor"]
        self.critic = out["critic"]

        #Defining Training Operations which will be called in the Update Function.
        with tf.variable_scope('Update_Operation'):
            with tf.name_scope('squared_TD_error'):
                self.td_error = self.r + self.HPs["Gamma"] * self.v_ - self.critic
                self.c_loss = tf.reduce_mean(tf.square(self.td_error))    # TD_error = (r+gamma*V_next) - V_eval

            with tf.name_scope('train_critic'):
                self.c_params = self.Model.GetVariables("Critic")
                self.c_grads = tf.gradients(self.c_loss, self.c_params)
                self.update_c_op = tf.train.AdamOptimizer(self.HPs["Critic LR"]).apply_gradients(zip(self.c_grads, self.c_params))

            with tf.name_scope('exp_v'):
                log_prob = tf.log(self.acts_prob + 1e-5) * tf.one_hot(self.a, actionSize, dtype=tf.float32)
                self.a_loss = -tf.reduce_mean(log_prob * self.td_error)  # advantage (TD_error) guided loss

            with tf.name_scope('train_actor'):
                self.a_params = self.Model.GetVariables("Actor")
                print(self.a_params)
                self.a_grads = tf.gradients(self.a_loss, self.a_params)
                self.update_a_op = tf.train.AdamOptimizer(self.HPs["Actor LR"]).apply_gradients(zip(self.a_grads, self.a_params))

            self.update_ops=[self.update_c_op,self.update_a_op]

            self.entropy = -tf.reduce_mean(self.acts_prob * tf.log(self.acts_prob + 1e-5), name='entropy')

            self.logging_ops = [self.a_loss,self.c_loss,self.entropy]
            self.labels = ["Loss Actor","Loss Critic","Entropy"]
            self.logging_MA = [MovingAverage(400) for i in range(len(self.logging_ops))]
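
The update and logging ops above are only defined here; a minimal sketch (not part of the original listing) of how a training step might feed them, with hypothetical NumPy batch arrays:

def train_step(method, states, actions, rewards, next_values):
    # Hypothetical helper: 'method' is an instance of the class in Example #1.
    feed_dict = {
        method.s: states,                        # [batch] + stateShape
        method.a: actions.reshape(-1, 1),        # [batch, 1] integer actions
        method.r: rewards.reshape(-1, 1),        # [batch, 1] rewards
        method.v_: next_values.reshape(-1, 1),   # [batch, 1] V(s') estimates
    }
    results = method.sess.run(method.update_ops + method.logging_ops, feed_dict)
    # Append the logged scalars (actor loss, critic loss, entropy) to the moving averages.
    for ma, value in zip(method.logging_MA, results[len(method.update_ops):]):
        ma.append(value)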
Example #2
    def __init__(self,
                 sess,
                 settings,
                 netConfigOverride,
                 stateShape,
                 actionSize,
                 nTrajs=1,
                 **kwargs):
        """
        Initializes a training method for a neural network.

        Parameters
        ----------
        Model : Keras Model Object
            A Keras model object with fully defined layers and a call function. See examples in networks module.
        sess : Tensorflow Session
            Initialized Tensorflow session
        stateShape : list
            List of integers describing the input shape, e.g. [39,39,6]
        actionSize : int
            Output size of the network.
        HPs : dict
            Dictionary that contains all hyperparameters to be used in the method's training
        nTrajs : int (Optional)
            Number that specifies the number of trajectories to be created for collecting training data.
        scope : str (Optional)
            Name of the PPO method. Used to group and differentiate variables between other networks.

        Returns
        -------
        N/A
        """

        # Creating nested Method that will be updated.
        N = settings["NumOptions"]  # number of sub-policy options
        network = NetworkBuilder(networkConfig=settings["NetworkConfig"],
                                 netConfigOverride=netConfigOverride,
                                 actionSize=N)
        Method = GetFunction(settings["SubMethod"])
        self.nestedMethod = Method(sess,
                                   settings,
                                   netConfigOverride,
                                   stateShape=stateShape,
                                   actionSize=N,
                                   nTrajs=nTrajs)
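
Example #2 resolves the sub-method class by name through GetFunction; a minimal registry-style sketch of such a lookup (hypothetical, not the repository's implementation):

# Hypothetical name-to-class registry illustrating what GetFunction is assumed to do.
METHOD_REGISTRY = {}

def RegisterMethod(cls):
    METHOD_REGISTRY[cls.__name__] = cls
    return cls

def GetFunction(name):
    try:
        return METHOD_REGISTRY[name]
    except KeyError:
        raise ValueError("Unknown SubMethod: " + name)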
Example #3
    def __init__(self,
                 sess,
                 settings,
                 netConfigOverride,
                 stateShape,
                 actionSize,
                 env,
                 nTrajs=1,
                 **kwargs):
        """
        Initializes a training method for a neural network.

        Parameters
        ----------
        Model : Keras Model Object
            A Keras model object with fully defined layers and a call function. See examples in networks module.
        sess : Tensorflow Session
            Initialized Tensorflow session
        stateShape : list
            List of integers describing the input shape, e.g. [39,39,6]
        actionSize : int
            Output size of the network.
        HPs : dict
            Dictionary that contains all hyperparameters to be used in the method's training
        nTrajs : int (Optional)
            Number that specifies the number of trajectories to be created for collecting training data.
        scope : str (Optional)
            Name of the PPO method. Used to group and differentiate variables between other networks.

        Returns
        -------
        N/A
        """
        EXP_NAME = settings["RunName"]
        LoadName = settings["LoadName"]
        MODEL_PATH_ = './models/' + EXP_NAME + '/'
        MODEL_PATH = './models/' + LoadName + '/'
        LOG_PATH = './logs/' + EXP_NAME + '/'
        CreatePath(LOG_PATH)
        CreatePath(MODEL_PATH_)
        self.sess = sess
        self.env = env

        N = settings["NumOptions"]

        #Create the Q Maps

        if "LoadQMaps" in settings:
            #Loading the Q-tables for the sub-policies
            loadedData = np.load('./models/' + settings["LoadQMaps"] +
                                 '/options.npz')
            opt = loadedData["options"]
            options = []
            for i in range(opt.shape[0]):
                options.append(opt[i, :, :, :, :])
        else:
            if "LoadSamples" in settings:
                pass
            else:
                print("Creating Samples")

                #Creating Instance of environment and running through it to generate samples
                def GetAction(state):
                    """
                    Contains the code to run the network based on an input.
                    """
                    p = 1 / actionSize
                    if len(state.shape) == 3:
                        probs = np.full((1, actionSize), p)
                    else:
                        probs = np.full((state.shape[0], actionSize), p)
                    actions = np.array([
                        np.random.choice(probs.shape[1], p=prob / sum(prob))
                        for prob in probs
                    ])
                    return actions

                s = []
                for i in range(settings["SampleEpisodes"]):
                    s0 = env.reset()

                    for j in range(settings["MAX_EP_STEPS"] + 1):

                        a = GetAction(state=s0)

                        s1, r, done, _ = env.step(a)
                        if not arreq_in_list(s0, s):
                            s.append(s0)

                        s0 = s1
                        if done:
                            break

            with open(MODEL_PATH + 'netConfigOverride.json') as json_file:
                networkOverrides = json.load(json_file)
            # if "DefaultParams" not in networkOverrides:
            #     networkOverrides["DefaultParams"] = {}
            # networkOverrides["DefaultParams"]["Trainable"]=False
            # print(settings["SFNetworkConfig"])
            # print(networkOverrides)
            SF1, SF2, SF3, SF4, SF5 = buildNetwork(settings["SFNetworkConfig"],
                                                   actionSize,
                                                   networkOverrides,
                                                   scope="Global")
            SF5.load_weights(MODEL_PATH + "model.h5")

            #Selecting the samples:
            psi = SF2.predict(np.vstack(s))  # [X,SF Dim]

            #test for approximate equality (for floating point types)
            def arreqclose_in_list(myarr, list_arrays):
                return next((True
                             for elem in list_arrays if elem.size == myarr.size
                             and np.allclose(elem, myarr, atol=1E-6)), False)

            print("Selecting Samples")
            if settings["Selection"] == "First":
                samples = []
                points = []
                i = 0
                while len(samples) < settings["TotalSamples"]:
                    if not arreqclose_in_list(psi[i, :], samples):
                        samples.append(psi[i, :])
                        points.append(i)
                    i += 1
            elif settings["Selection"] == "Random":
                samples = []
                points = []
                while len(samples) < settings["TotalSamples"]:
                    idx = randint(1, psi.shape[0] - 1)
                    if not arreqclose_in_list(psi[idx, :], samples):
                        samples.append(psi[idx, :])
                        points.append(idx)
            elif settings["Selection"] == "Random_sampling":
                #PCA Decomp to dimension:
                import pandas as pd
                from sklearn.decomposition import PCA
                feat_cols = ['pixel' + str(i) for i in range(psi.shape[1])]
                df = pd.DataFrame(psi, columns=feat_cols)
                np.random.seed(42)
                rndperm = np.random.permutation(df.shape[0])
                pca = PCA(n_components=2)
                pca_result = pca.fit_transform(df[feat_cols].values)

                from SampleSelection import SampleSelection_v1
                points = SampleSelection_v1(pca_result,
                                            settings["TotalSamples"],
                                            returnIndicies=True)
            elif settings["Selection"] == "Hull_pca":
                #PCA Decomp to dimension:
                import pandas as pd
                from sklearn.decomposition import PCA
                feat_cols = ['pixel' + str(i) for i in range(psi.shape[1])]
                df = pd.DataFrame(psi, columns=feat_cols)
                np.random.seed(42)
                rndperm = np.random.permutation(df.shape[0])
                pca = PCA(n_components=4)
                pca_result = pca.fit_transform(df[feat_cols].values)

                from SampleSelection import SampleSelection_v2
                points = SampleSelection_v2(pca_result,
                                            settings["TotalSamples"],
                                            returnIndicies=True)
            elif settings["Selection"] == "Hull_tsne":
                #t-SNE Decomp to dimension:
                import pandas as pd
                from sklearn.manifold import TSNE
                feat_cols = ['pixel' + str(i) for i in range(psi.shape[1])]
                df = pd.DataFrame(psi, columns=feat_cols)
                np.random.seed(42)
                rndperm = np.random.permutation(df.shape[0])
                tsne = TSNE(n_components=3,
                            verbose=1,
                            perplexity=10,
                            n_iter=1000)
                tsne_results = tsne.fit_transform(df[feat_cols].values)

                from SampleSelection import SampleSelection_v2
                points = SampleSelection_v2(tsne_results,
                                            settings["TotalSamples"],
                                            returnIndicies=True)
            elif settings["Selection"] == "Hull_cluster":
                #PCA Decomp to dimension:
                import pandas as pd
                from sklearn.decomposition import PCA
                feat_cols = ['pixel' + str(i) for i in range(psi.shape[1])]
                df = pd.DataFrame(psi, columns=feat_cols)
                np.random.seed(42)
                rndperm = np.random.permutation(df.shape[0])
                pca = PCA(n_components=4)
                pca_result = pca.fit_transform(df[feat_cols].values)

                from SampleSelection import SampleSelection_v3
                points = SampleSelection_v3(pca_result,
                                            settings["TotalSamples"],
                                            returnIndicies=True)
            else:
                print("Invalid Method selected")
                exit()

            psiSamples = []
            for point in points:
                psiSamples.append(psi[point, :])

            while len(psiSamples) < len(psiSamples[0]):
                psiSamples.extend(psiSamples)

            samps = np.stack(psiSamples)
            samps2 = samps[0:samps.shape[1], :]
            w_g, v_g = np.linalg.eig(samps2)

            # print("here")
            dim = samps2.shape[1]
            #Creating Sub-policies
            offset = 0
            options = []

            # QMapStructure = self.env.GetQMapStructure()
            print("Getting data for a Q-Map")
            grids = self.env.ConstructAllSamples()
            phis = SF3.predict(grids)

            for sample in range(int(N / 2)):
                print("Creating Option", sample)
                if sample + offset >= dim:
                    continue
                v_option, v_option_inv = self.env.ReformatSamples(
                    np.real(np.matmul(phis, v_g[:, sample + offset])))
                options.append(v_option)
                options.append(v_option_inv)
                if np.iscomplex(w_g[sample + offset]):
                    offset += 1
                if settings["PlotOptions"]:
                    imgplot = plt.imshow(v_option)
                    plt.title(" Option " + str(sample) +
                              " Value Estimate | Eigenvalue:" +
                              str(w_g[sample + offset]))
                    plt.savefig(LOG_PATH + "/option" + str(sample) + ".png")
                    plt.close()

                #Plotting the first couple samples with random enemy positions:

            #Saving the different options to the log:
            np.savez_compressed(MODEL_PATH_ + "options.npz",
                                options=np.stack(options))

            self.options = options

        # Creating nested Method that will be updated.
        network = NetworkBuilder(networkConfig=settings["NetworkConfig"],
                                 netConfigOverride=netConfigOverride,
                                 actionSize=N)
        Method = GetFunction(settings["SubMethod"])
        self.nestedMethod = Method(sess,
                                   settings,
                                   netConfigOverride,
                                   stateShape=stateShape,
                                   actionSize=N,
                                   nTrajs=nTrajs)
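
The sampling loop above skips states already collected via an arreq_in_list helper; a minimal sketch of what such a helper is assumed to do (exact array membership, complementing the approximate arreqclose_in_list defined inline):

import numpy as np

def arreq_in_list(myarr, list_arrays):
    # Assumed helper: True if an array exactly equal to myarr is already in the list.
    return any(np.array_equal(elem, myarr) for elem in list_arrays)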
Example #4
    def __init__(self,sess,settings,netConfigOverride,stateShape,actionSize,nTrajs=1,**kwargs):
        """
        Initializes a training method for a neural network.

        Parameters
        ----------
        Model : Keras Model Object
            A Keras model object with fully defined layers and a call function. See examples in networks module.
        sess : Tensorflow Session
            Initialized Tensorflow session
        stateShape : list
            List of integers describing the input shape, e.g. [39,39,6]
        actionSize : int
            Output size of the network.
        HPs : dict
            Dictionary that contains all hyperparameters to be used in the method's training
        nTrajs : int (Optional)
            Number that specifies the number of trajectories to be created for collecting training data.
        scope : str (Optional)
            Name of the PPO method. Used to group and differentiate variables between other networks.

        Returns
        -------
        N/A
        """
        EXP_NAME = settings["RunName"]
        LoadName = settings["LoadName"]
        MODEL_PATH = './models/'+LoadName+ '/'
        IMAGE_PATH = './images/SF/'+EXP_NAME+'/'
        MODEL_PATH_ = './models/'+EXP_NAME+'/'
        LOG_PATH = './logs/CTF_1v1/'+EXP_NAME
        CreatePath(LOG_PATH)
        CreatePath(IMAGE_PATH)
        CreatePath(MODEL_PATH)
        CreatePath(MODEL_PATH_)
        self.sess=sess

        N = settings["NumOptions"]
        for (dirpath, dirnames, filenames) in os.walk("configs/environment"):
            for filename in filenames:
                if settings["EnvConfig"] == filename:
                    envConfigFile = os.path.join(dirpath,filename)
                    break
        with open(envConfigFile) as json_file:
            envSettings = json.load(json_file)
        env,dFeatures,nActions,nTrajs = CreateEnvironment(envSettings)
        #Create the Q Maps

        if "LoadQMaps" in settings:
            #Loading the Q-tables for the sub-policies
            loadedData = np.load('./models/'+settings["LoadQMaps"]+ '/options.npz')
            opt = loadedData["options"]
            options=[]
            for i in range(opt.shape[0]):
                options.append(opt[i,:,:,:,:])
        else:
            if "LoadSamples" in settings:
                pass
            else:
                #Creating Instance of environment and running through it to generate samples
                def GetAction(state):
                    """
                    Contains the code to run the network based on an input.
                    """
                    p = 1/nActions
                    if len(state.shape)==3:
                        probs =np.full((1,nActions),p)
                    else:
                        probs =np.full((state.shape[0],nActions),p)
                    actions = np.array([np.random.choice(probs.shape[1], p=prob / sum(prob)) for prob in probs])
                    return actions


                s = []
                for i in range(settings["SampleEpisodes"]):
                    s0 = env.reset()

                    for j in range(settings["MAX_EP_STEPS"]+1):

                        a = GetAction(state=s0)

                        s1,r,done,_ = env.step(a)
                        if not arreq_in_list(s0, s):
                            s.append(s0)

                        s0 = s1
                        if done:
                            break

            #Creating and smoothing Q Maps
            def ConstructSamples(env,position2):
                grid = env.get_obs_blue
                locX,locY = np.unravel_index(np.argmax(grid[:,:,4], axis=None), grid[:,:,0].shape)
                locX2,locY2 = np.unravel_index(np.argmin(grid[:,:,4], axis=None), grid[:,:,0].shape)
                #Removing the agent
                grid[locX,locY,4] = 0
                grid[locX2,locY2,4] = 0

                stacked_grids = np.repeat(np.expand_dims(grid,0), grid.shape[0]*grid.shape[1],0)

                for i in range(stacked_grids.shape[1]):
                    for j in range(stacked_grids.shape[2]):
                        stacked_grids[i*stacked_grids.shape[2]+j,stacked_grids.shape[2]-i-1,j,4] = 5

                stacked_grids[:,position2[0],position2[1],4] = -5
                return stacked_grids

            def SmoothOption(option_, gamma =0.9):
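                # SmoothOption treats the non-zero cells of each option map as states of a
                # 4-neighbour random walk: x is the walk's transition matrix, w holds the raw
                # option values, and value = (I - gamma*x)^-1 w propagates those values over
                # the reachable cells, analogous to a successor-representation evaluation.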
                # option[option<0.0] = 0.0
                #Create the Adjacency Matrix
                v_option=np.full((dFeatures[0],dFeatures[1],dFeatures[0],dFeatures[1]),0,dtype=np.float32)
                for i2,j2 in itertools.product(range(dFeatures[0]),range(dFeatures[1])):
                    option = option_[:,:,i2,j2]
                    states_ = {}
                    count = 0
                    for i in range(option.shape[0]):
                        for j in range(option.shape[1]):
                            if option[i,j] != 0:
                                states_[count] = [i,j]
                                # states_.append([count, [i,j]])
                                count+=1
                    states=len(states_.keys())
                    x = np.zeros((states,states))
                    for i in range(len(states_)):
                        [locx,locy] = states_[i]
                        prob_sum = 0
                        for j in range(len(states_)):
                            if states_[j] == [locx+1,locy]:
                                x[i,j] = 0.25
                                prob_sum += 0.25
                            if states_[j] == [locx-1,locy]:
                                x[i,j] = 0.25
                                prob_sum += 0.25
                            if states_[j] == [locx,locy+1]:
                                x[i,j] = 0.25
                                prob_sum += 0.25
                            if states_[j] == [locx,locy-1]:
                                x[i,j] = 0.25
                                prob_sum += 0.25
                        x[i,i] = 1.0 - prob_sum

                    #Create W
                    w = np.zeros((states))
                    for count,loc in states_.items():
                        w[count] = option[loc[0],loc[1]]

                    # (I-gamma*Q)^-1
                    I = np.identity(states)
                    psi = np.linalg.inv(I-gamma*x)

                    smoothedOption = np.zeros_like(option,dtype=float)

                    value = np.matmul(psi,w)
                    for j,loc in states_.items():
                        smoothedOption[loc[0],loc[1]] = value[j]

                    v_option[:,:,i2,j2] = smoothedOption
                return v_option

            SF1,SF2,SF3,SF4,SF5 = buildNetwork(settings["SFNetworkConfig"],nActions,{},scope="Global")
            SF5.load_weights('./models/'+LoadName+ '/'+"model.h5")

            #Selecting the samples:
            psi = SF2.predict(np.vstack(s)) # [X,SF Dim]

            #test for approximate equality (for floating point types)
            def arreqclose_in_list(myarr, list_arrays):
                return next((True for elem in list_arrays if elem.size == myarr.size and np.allclose(elem, myarr,atol=1E-6)), False)
            if settings["Selection"]=="First":
                samples = [];points=[]
                i =0
                while len(samples) < settings["TotalSamples"]:
                    if not arreqclose_in_list(psi[i,:], samples):
                        samples.append(psi[i,:])
                        points.append(i)
                    i+=1
            elif settings["Selection"]=="Random":
                samples = [];points=[]
                while len(samples) < settings["TotalSamples"]:
                    idx = randint(1, psi.shape[0] - 1)
                    if not arreqclose_in_list(psi[idx,:], samples):
                        samples.append(psi[idx,:])
                        points.append(idx)
            elif settings["Selection"]=="Random_sampling":
                #PCA Decomp to dimension:
                import pandas as pd
                from sklearn.decomposition import PCA
                feat_cols = [ 'pixel'+str(i) for i in range(psi.shape[1]) ]
                df = pd.DataFrame(psi,columns=feat_cols)
                np.random.seed(42)
                rndperm = np.random.permutation(df.shape[0])
                pca = PCA(n_components=2)
                pca_result = pca.fit_transform(df[feat_cols].values)

                from SampleSelection import SampleSelection_v1
                points = SampleSelection_v1(pca_result,settings["TotalSamples"],returnIndicies=True)
            elif settings["Selection"]=="Hull_pca":
                #PCA Decomp to dimension:
                import pandas as pd
                from sklearn.decomposition import PCA
                feat_cols = [ 'pixel'+str(i) for i in range(psi.shape[1]) ]
                df = pd.DataFrame(psi,columns=feat_cols)
                np.random.seed(42)
                rndperm = np.random.permutation(df.shape[0])
                pca = PCA(n_components=4)
                pca_result = pca.fit_transform(df[feat_cols].values)

                from SampleSelection import SampleSelection_v2
                points = SampleSelection_v2(pca_result,settings["TotalSamples"],returnIndicies=True)
            elif settings["Selection"]=="Hull_tsne":
                #t-SNE Decomp to dimension:
                import pandas as pd
                from sklearn.manifold import TSNE
                feat_cols = [ 'pixel'+str(i) for i in range(psi.shape[1]) ]
                df = pd.DataFrame(psi,columns=feat_cols)
                np.random.seed(42)
                rndperm = np.random.permutation(df.shape[0])
                tsne = TSNE(n_components=3, verbose=1, perplexity=10, n_iter=1000)
                tsne_results = tsne.fit_transform(df[feat_cols].values)

                from SampleSelection import SampleSelection_v2
                points = SampleSelection_v2(tsne_results,settings["TotalSamples"],returnIndicies=True)
            elif settings["Selection"]=="Hull_cluster":
                #PCA Decomp to dimension:
                import pandas as pd
                from sklearn.decomposition import PCA
                feat_cols = [ 'pixel'+str(i) for i in range(psi.shape[1]) ]
                df = pd.DataFrame(psi,columns=feat_cols)
                np.random.seed(42)
                rndperm = np.random.permutation(df.shape[0])
                pca = PCA(n_components=4)
                pca_result = pca.fit_transform(df[feat_cols].values)

                from SampleSelection import SampleSelection_v3
                points = SampleSelection_v3(pca_result,settings["TotalSamples"],returnIndicies=True)
            else:
                print("Invalid Method selected")
                exit()

            psiSamples=[]
            for point in points:
                psiSamples.append(psi[point,:])

            while len(psiSamples) < len(psiSamples[0]):
                psiSamples.extend(psiSamples)

            samps = np.stack(psiSamples)
            samps2 = samps[0:samps.shape[1],:]
            w_g,v_g = np.linalg.eig(samps2)

            # print("here")
            dim = samps2.shape[1]
            #Creating Sub-policies
            offset = 0
            options = []
            for sample in range(int(N/2)):
                print("Creating Option",sample)
                v_option=np.full((dFeatures[0],dFeatures[1],dFeatures[0],dFeatures[1]),0,dtype=np.float32)
                for i2,j2 in itertools.product(range(dFeatures[0]),range(dFeatures[1])):
                    if sample+offset >= dim:
                        continue
                    grids = ConstructSamples(env,[i2,j2])
                    phi= SF3.predict(grids)
                    v_option[:,:,i2,j2]=np.real(np.matmul(phi,v_g[:,sample+offset])).reshape([dFeatures[0],dFeatures[1]])
                    if np.iscomplex(w_g[sample+offset]):
                        offset+=1
                print("Smoothing Option")
                v_option_ = SmoothOption(v_option)
                options.append(v_option_)
                options.append(-v_option_)
                #Plotting the first couple samples with random enemy positions:
                v_map = v_option_[:,:,10,10]
                imgplot = plt.imshow(v_map)
                plt.title(" Option "+str(sample)+" Value Estimate | Eigenvalue:" +str(w_g[sample+offset]))
                plt.savefig(IMAGE_PATH+"/option"+str(sample)+"_"+str(1)+".png")
                plt.close()
                v_map = v_option_[:,:,10,17]
                imgplot = plt.imshow(v_map)
                plt.title(" Option "+str(sample)+" Value Estimate | Eigenvalue:" +str(w_g[sample+offset]))
                plt.savefig(IMAGE_PATH+"/option"+str(sample)+"_"+str(2)+".png")
                plt.close()
                v_map = v_option_[:,:,17,10]
                imgplot = plt.imshow(v_map)
                plt.title(" Option "+str(sample)+" Value Estimate | Eigenvalue:" +str(w_g[sample+offset]))
                plt.savefig(IMAGE_PATH+"/option"+str(sample)+"_"+str(3)+".png")
                plt.close()
                v_map = v_option_[:,:,10,2]
                imgplot = plt.imshow(v_map)
                plt.title(" Option "+str(sample)+" Value Estimate | Eigenvalue:" +str(w_g[sample+offset]))
                plt.savefig(IMAGE_PATH+"/option"+str(sample)+"_"+str(4)+".png")
                plt.close()
                v_map = v_option_[:,:,2,10]
                imgplot = plt.imshow(v_map)
                plt.title(" Option "+str(sample)+" Value Estimate | Eigenvalue:" +str(w_g[sample+offset]))
                plt.savefig(IMAGE_PATH+"/option"+str(sample)+"_"+str(5)+".png")
                plt.close()

            #Saving the different options to the log:
            np.savez_compressed(MODEL_PATH_ +"options.npz", options=np.stack(options))

            self.options = options

        # Creating nested Method that will be updated.
        network = NetworkBuilder(networkConfig=settings["NetworkConfig"],netConfigOverride=netConfigOverride,actionSize=N)
        Method = GetFunction(settings["SubMethod"])
        self.nestedMethod = Method(sess,settings,netConfigOverride,stateShape=dFeatures,actionSize=N,nTrajs=nTrajs)
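
A minimal sketch of reading back the options archive saved above, mirroring the LoadQMaps branch earlier in this example (the run name in the path is illustrative only):

import numpy as np

# Hypothetical round trip: reload the saved Q-map options and split them per option.
loadedData = np.load('./models/SomeRunName/options.npz')   # illustrative path
opt = loadedData["options"]                                 # stacked as [nOptions, ...]
options = [opt[i] for i in range(opt.shape[0])]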
Example #5
    def __init__(self,
                 sess,
                 settings,
                 netConfigOverride,
                 stateShape,
                 actionSize,
                 nTrajs=1,
                 **kwargs):
        """
        Initializes a training method for a neural network.

        Parameters
        ----------
        Model : Keras Model Object
            A Keras model object with fully defined layers and a call function. See examples in networks module.
        sess : Tensorflow Session
            Initialized Tensorflow session
        stateShape : list
            List of integers describing the input shape, e.g. [39,39,6]
        actionSize : int
            Output size of the network.
        HPs : dict
            Dictionary that contains all hyperparameters to be used in the method's training
        nTrajs : int (Optional)
            Number that specifies the number of trajectories to be created for collecting training data.
        scope : str (Optional)
            Name of the PPO method. Used to group and differentiate variables between other networks.

        Returns
        -------
        N/A
        """
        #Processing inputs
        self.actionSize = actionSize
        self.sess = sess
        self.HPs = settings["NetworkHPs"]
        self.Model = NetworkBuilder(networkConfig=settings["NetworkConfig"],
                                    netConfigOverride=netConfigOverride,
                                    actionSize=actionSize)
        scope = "PPO"

        #Creating appropriate buffer for the method.
        self.buffer = [Trajectory(depth=8) for _ in range(nTrajs)]

        with self.sess.as_default(), self.sess.graph.as_default():
            with tf.name_scope(scope):
                #Placeholders
                self.s = tf.placeholder(tf.float32, [None] + stateShape, 'S')
                self.a_his = tf.placeholder(tf.int32, [
                    None,
                ], 'A')
                self.td_target_ = tf.placeholder(tf.float32, [None],
                                                 'TD_target')
                self.advantage_ = tf.placeholder(shape=[None],
                                                 dtype=tf.float32,
                                                 name='adv_hold')
                self.old_log_logits_ = tf.placeholder(shape=[None, actionSize],
                                                      dtype=tf.float32,
                                                      name='old_logit_hold')

                #Initializing Network I/O
                inputs = {"state": self.s}
                out = self.Model(inputs)
                self.a_prob = out["actor"]
                self.v = out["critic"]
                self.log_logits = out["log_logits"]

                # Entropy
                def _log(val):
                    return tf.log(tf.clip_by_value(val, 1e-10, 10.0))

                entropy = self.entropy = -tf.reduce_mean(
                    self.a_prob * _log(self.a_prob), name='entropy')

                # Critic Loss
                td_error = self.td_target_ - self.v
                critic_loss = self.critic_loss = tf.reduce_mean(
                    tf.square(td_error), name='critic_loss')

                # Actor Loss
                action_OH = tf.one_hot(self.a_his,
                                       actionSize,
                                       dtype=tf.float32)
                log_prob = tf.reduce_sum(self.log_logits * action_OH, 1)
                old_log_prob = tf.reduce_sum(self.old_log_logits_ * action_OH,
                                             1)

                # Clipped surrogate function
                ratio = tf.exp(log_prob - old_log_prob)
                surrogate = ratio * self.advantage_
                clipped_surrogate = tf.clip_by_value(
                    ratio, 1 - self.HPs["eps"],
                    1 + self.HPs["eps"]) * self.advantage_
                surrogate_loss = tf.minimum(surrogate,
                                            clipped_surrogate,
                                            name='surrogate_loss')
                actor_loss = self.actor_loss = -tf.reduce_mean(
                    surrogate_loss, name='actor_loss')

                actor_loss = actor_loss - entropy * self.HPs["EntropyBeta"]
                loss = actor_loss + critic_loss * self.HPs["CriticBeta"]

                # Build Trainer
                self.optimizer = tf.keras.optimizers.Adam(self.HPs["LR"])
                self.gradients = self.optimizer.get_gradients(
                    loss, self.Model.trainable_variables)
                self.update_ops = self.optimizer.apply_gradients(
                    zip(self.gradients, self.Model.trainable_variables))

        #Creating variables for logging.
        self.EntropyMA = MovingAverage(400)
        self.CriticLossMA = MovingAverage(400)
        self.ActorLossMA = MovingAverage(400)
        self.GradMA = MovingAverage(400)
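
The clipped surrogate defined above can be checked numerically; a small NumPy sketch for a single transition with illustrative values (eps plays the role of self.HPs["eps"]):

import numpy as np

eps = 0.2                                            # illustrative clip range
advantage = 1.5                                      # illustrative advantage estimate
log_prob, old_log_prob = np.log(0.6), np.log(0.4)    # new and old action log-probs

ratio = np.exp(log_prob - old_log_prob)              # 1.5
surrogate = ratio * advantage                        # 2.25
clipped_surrogate = np.clip(ratio, 1 - eps, 1 + eps) * advantage   # 1.2 * 1.5 = 1.8
actor_loss = -min(surrogate, clipped_surrogate)      # clipping caps the policy update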
Example #6
    def __init__(self,sess,settings,netConfigOverride,stateShape,actionSize,nTrajs=1,**kwargs):
        """
        Initializes I/O placeholders and the training process of a Multi-step DQN.
        The main principle is that, instead of a one-step TD difference, the loss is evaluated on a
        temporally extended basis:
        G = R_t + γ R_{t+1} + ... + γ^{n-1} R_{t+n-1} + γ^n max_a q(S_{t+n}, a, θ⁻)
        loss = MSE(G, q(S_t, A_t, θ))

        """
        #Placeholders
        self.actionSize = actionSize
        self.sess=sess
        self.scope="worker"
        self.HPs = settings["NetworkHPs"]
        self.Model = NetworkBuilder(networkConfig=settings["NetworkConfig"],netConfigOverride=netConfigOverride,actionSize=actionSize,scope="worker")
        self.Model_ = NetworkBuilder(networkConfig=settings["NetworkConfig"],netConfigOverride=netConfigOverride,actionSize=actionSize,scope="target")

        self.buffer = [Trajectory(depth=5) for _ in range(nTrajs)]
        with self.sess.as_default(), self.sess.graph.as_default():
            self.states_ = tf.placeholder(shape=[None]+stateShape, dtype=tf.float32, name='states')
            self.next_states_ = tf.placeholder(shape=[None]+stateShape, dtype=tf.float32, name='next_states')
            self.actions_ = tf.placeholder(shape=[None], dtype=tf.int32, name='actions_hold')
            self.rewards_ = tf.placeholder(shape=[None], dtype=tf.float32, name='rewards_hold')
            self.done_ = tf.placeholder(shape=[None], dtype=tf.float32, name='done_hold')
            with tf.name_scope("target"):
                out2 = self.Model_({"state":self.next_states_})
                q_next = out2["Q"]
                self.targetParams = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, "target")
            with tf.name_scope(self.scope):
                input = {"state":self.states_}
                out = self.Model(input)
                self.q = out["Q"]

                with tf.name_scope('current_Q'):
                    oh_action = tf.one_hot(self.actions_, actionSize, dtype=tf.float32) # [?, num_agent, action_size]
                    curr_q = tf.reduce_sum(tf.multiply(self.q, oh_action), axis=-1) # [?, num_agent]

                with tf.name_scope('target_Q'):
                    max_next_q = tf.reduce_max(q_next, axis=-1)
                    td_target = self.rewards_  + self.HPs["Gamma"] * max_next_q
                    # td_target = self.rewards_  + self.HPs["Gamma"] * max_next_q * (1. - self.done_)

                with tf.name_scope('td_error'):
                    loss = tf.keras.losses.MSE(td_target, curr_q)
                    softmax_q = tf.nn.softmax(curr_q)
                    self.entropy = -tf.reduce_mean(softmax_q * tf.log(softmax_q+ 1e-5))
                    self.loss=total_loss = loss + self.HPs["EntropyBeta"] * self.entropy

                if self.HPs["Optimizer"] == "Adam":
                    self.optimizer = tf.keras.optimizers.Adam(self.HPs["LR"])
                elif self.HPs["Optimizer"] == "RMS":
                    self.optimizer = tf.keras.optimizers.RMSprop(self.HPs["LR"])
                elif self.HPs["Optimizer"] == "Adagrad":
                    self.optimizer = tf.keras.optimizers.Adagrad(self.HPs["LR"])
                elif self.HPs["Optimizer"] == "Adadelta":
                    self.optimizer = tf.keras.optimizers.Adadelta(self.HPs["LR"])
                elif self.HPs["Optimizer"] == "Adamax":
                    self.optimizer = tf.keras.optimizers.Adamax(self.HPs["LR"])
                elif self.HPs["Optimizer"] == "Nadam":
                    self.optimizer = tf.keras.optimizers.Nadam(self.HPs["LR"])
                elif self.HPs["Optimizer"] == "SGD":
                    self.optimizer = tf.keras.optimizers.SGD(self.HPs["LR"])
                elif self.HPs["Optimizer"] == "Amsgrad":
                    self.optimizer = tf.keras.optimizers.Adam(self.HPs["LR"], amsgrad=True)
                else:
                    print("Not selected a proper Optimizer")
                    exit()
                self.workerParams = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope)

                self.gradients = self.optimizer.get_gradients(total_loss, self.workerParams)
                self.update_op = self.optimizer.apply_gradients(zip(self.gradients, self.workerParams))

                with tf.name_scope('push'):
                    self.push_ops = [l_p.assign(g_p) for l_p, g_p in zip(self.targetParams, self.workerParams)]

                self.grads=[self.gradients]
                self.losses=[self.loss]
                self.update_ops=[self.update_op]

        self.grad_MA = [MovingAverage(400) for i in range(len(self.grads))]
        self.loss_MA = [MovingAverage(400) for i in range(len(self.losses))]
        self.labels = ["Critic"]
Example #7
    def __init__(self,sess,settings,netConfigOverride,stateShape,actionSize,nTrajs=1,**kwargs):
        """
        Initializes I/O placeholders used for Tensorflow session runs.
        Initializes an Actor and a Critic network to be used for RL.
        """
        #Placeholders

        self.sess=sess
        self.scope="DQN"
        self.HPs = settings["NetworkHPs"]
        self.Model = NetworkBuilder(networkConfig=settings["NetworkConfig"],netConfigOverride=netConfigOverride,actionSize=actionSize)

        self.buffer = [Trajectory(depth=5) for _ in range(nTrajs)]
        with self.sess.as_default(), self.sess.graph.as_default():
            with tf.name_scope(self.scope):
                if len(stateShape) == 4:
                    self.states_ = tf.placeholder(shape=[None]+stateShape[1:4], dtype=tf.float32, name='states')
                    self.next_states_ = tf.placeholder(shape=[None]+stateShape[1:4], dtype=tf.float32, name='next_states')
                else:
                    self.states_ = tf.placeholder(shape=[None]+stateShape, dtype=tf.float32, name='states')
                    self.next_states_ = tf.placeholder(shape=[None]+stateShape, dtype=tf.float32, name='next_states')
                self.actions_ = tf.placeholder(shape=[None], dtype=tf.int32, name='actions_hold')
                self.rewards_ = tf.placeholder(shape=[None], dtype=tf.float32, name='rewards_hold')
                self.done_ = tf.placeholder(shape=[None], dtype=tf.float32, name='done_hold')

                input = {"state":self.states_}
                out = self.Model(input)
                self.q = out["Q"]

                out2 = self.Model({"state":self.next_states_})
                q_next = out2["Q"]

                with tf.name_scope('current_Q'):
                    oh_action = tf.one_hot(self.actions_, actionSize, dtype=tf.float32) # [?, num_agent, action_size]
                    curr_q = tf.reduce_sum(tf.multiply(self.q, oh_action), axis=-1) # [?, num_agent]

                with tf.name_scope('target_Q'):
                    max_next_q = tf.reduce_max(q_next, axis=-1)
                    td_target = self.rewards_  + self.HPs["Gamma"] * max_next_q
                    # td_target = self.rewards_  + self.HPs["Gamma"] * max_next_q * (1. - self.done_)

                with tf.name_scope('td_error'):
                    loss = tf.keras.losses.MSE(td_target, curr_q)
                    softmax_q = tf.nn.softmax(curr_q)
                    self.entropy = -tf.reduce_mean(softmax_q * tf.log(softmax_q+ 1e-5))
                    self.loss=total_loss = loss + self.HPs["EntropyBeta"] * self.entropy

                if self.HPs["Optimizer"] == "Adam":
                    self.optimizer = tf.keras.optimizers.Adam(self.HPs["LR"])
                elif self.HPs["Optimizer"] == "RMS":
                    self.optimizer = tf.keras.optimizers.RMSprop(self.HPs["LR"])
                elif self.HPs["Optimizer"] == "Adagrad":
                    self.optimizer = tf.keras.optimizers.Adagrad(self.HPs["LR"])
                elif self.HPs["Optimizer"] == "Adadelta":
                    self.optimizer = tf.keras.optimizers.Adadelta(self.HPs["LR"])
                elif self.HPs["Optimizer"] == "Adamax":
                    self.optimizer = tf.keras.optimizers.Adamax(self.HPs["LR"])
                elif self.HPs["Optimizer"] == "Nadam":
                    self.optimizer = tf.keras.optimizers.Nadam(self.HPs["LR"])
                elif self.HPs["Optimizer"] == "SGD":
                    self.optimizer = tf.keras.optimizers.SGD(self.HPs["LR"])
                elif self.HPs["Optimizer"] == "Amsgrad":
                    self.optimizer = tf.keras.optimizers.Adam(self.HPs["LR"], amsgrad=True)
                else:
                    print("Not selected a proper Optimizer")
                    exit()
                self.params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope)

                self.gradients = self.optimizer.get_gradients(total_loss, self.params)
                self.update_op = self.optimizer.apply_gradients(zip(self.gradients, self.params))

                self.grads=[self.gradients]
                self.losses=[self.loss]
                self.update_ops=[self.update_op]

        self.grad_MA = [MovingAverage(400) for i in range(len(self.grads))]
        self.loss_MA = [MovingAverage(400) for i in range(len(self.losses))]
        self.entropy_MA = MovingAverage(400)
        self.labels = ["Critic"]
Example #8
class PPO(Method):
    def __init__(self,
                 sess,
                 settings,
                 netConfigOverride,
                 stateShape,
                 actionSize,
                 nTrajs=1,
                 **kwargs):
        """
        Initializes a training method for a neural network.

        Parameters
        ----------
        Model : Keras Model Object
            A Keras model object with fully defined layers and a call function. See examples in networks module.
        sess : Tensorflow Session
            Initialized Tensorflow session
        stateShape : list
            List of integers describing the input shape, e.g. [39,39,6]
        actionSize : int
            Output size of the network.
        HPs : dict
            Dictionary that contains all hyperparameters to be used in the method's training
        nTrajs : int (Optional)
            Number that specifies the number of trajectories to be created for collecting training data.
        scope : str (Optional)
            Name of the PPO method. Used to group and differentiate variables between other networks.

        Returns
        -------
        N/A
        """
        #Processing inputs
        self.actionSize = actionSize
        self.sess = sess
        self.HPs = settings["NetworkHPs"]

        #Building the network.
        self.Model = NetworkBuilder(networkConfig=settings["NetworkConfig"],
                                    netConfigOverride=netConfigOverride,
                                    actionSize=actionSize)

        #Creating appropriate buffer for the method.
        self.buffer = [Trajectory(depth=7) for _ in range(nTrajs)]

        with self.sess.as_default(), self.sess.graph.as_default():
            with tf.name_scope("PPO"):
                #Placeholders
                if len(stateShape) == 4:
                    self.s = tf.placeholder(tf.float32,
                                            [None] + stateShape[0:4], 'S')
                else:
                    self.s = tf.placeholder(tf.float32, [None] + stateShape,
                                            'S')
                self.a_his = tf.placeholder(tf.int32, [
                    None,
                ], 'A')
                self.td_target_ = tf.placeholder(tf.float32, [None], 'Vtarget')
                self.advantage_ = tf.placeholder(shape=[None],
                                                 dtype=tf.float32,
                                                 name='adv_hold')
                self.old_log_logits_ = tf.placeholder(shape=[None, actionSize],
                                                      dtype=tf.float32,
                                                      name='old_logit_hold')

                #Initializing Network I/O
                inputs = {"state": self.s}
                out = self.Model(inputs)
                self.a_prob = out["actor"]
                self.v = out["critic"]
                self.log_logits = out["log_logits"]

                # Entropy
                def _log(val):
                    return tf.log(tf.clip_by_value(val, 1e-10, 10.0))

                self.entropy = -tf.reduce_mean(self.a_prob * _log(self.a_prob),
                                               name='entropy')

                # Critic Loss
                td_error = self.td_target_ - self.v
                self.critic_loss = tf.reduce_mean(tf.square(td_error),
                                                  name='critic_loss')

                # Actor Loss
                action_OH = tf.one_hot(self.a_his,
                                       actionSize,
                                       dtype=tf.float32)
                log_prob = tf.reduce_sum(self.log_logits * action_OH, 1)
                old_log_prob = tf.reduce_sum(self.old_log_logits_ * action_OH,
                                             1)

                # Clipped surrogate function
                ratio = tf.exp(log_prob - old_log_prob)
                surrogate = ratio * self.advantage_
                clipped_surrogate = tf.clip_by_value(
                    ratio, 1 - self.HPs["eps"],
                    1 + self.HPs["eps"]) * self.advantage_
                surrogate_loss = tf.minimum(surrogate,
                                            clipped_surrogate,
                                            name='surrogate_loss')
                self.actor_loss = -tf.reduce_mean(surrogate_loss,
                                                  name='actor_loss')

                loss = self.actor_loss + self.critic_loss * self.HPs[
                    "CriticBeta"]

                # Build Trainer
                if self.HPs["Optimizer"] == "Adam":
                    self.optimizerA = tf.keras.optimizers.Adam(
                        self.HPs["LR Actor"])
                    self.optimizerE = tf.keras.optimizers.Adam(
                        self.HPs["LR Entropy"])
                elif self.HPs["Optimizer"] == "RMS":
                    self.optimizerA = tf.keras.optimizers.RMSprop(
                        self.HPs["LR Actor"])
                    self.optimizerE = tf.keras.optimizers.RMSprop(
                        self.HPs["LR Entropy"])
                elif self.HPs["Optimizer"] == "Adagrad":
                    self.optimizerA = tf.keras.optimizers.Adagrad(
                        self.HPs["LR Actor"])
                    self.optimizerE = tf.keras.optimizers.Adagrad(
                        self.HPs["LR Entropy"])
                elif self.HPs["Optimizer"] == "Adadelta":
                    self.optimizerA = tf.keras.optimizers.Adadelta(
                        self.HPs["LR Actor"])
                    self.optimizerE = tf.keras.optimizers.Adadelta(
                        self.HPs["LR Entropy"])
                elif self.HPs["Optimizer"] == "Adamax":
                    self.optimizerA = tf.keras.optimizers.Adamax(
                        self.HPs["LR Actor"])
                    self.optimizerE = tf.keras.optimizers.Adamax(
                        self.HPs["LR Entropy"])
                elif self.HPs["Optimizer"] == "Nadam":
                    self.optimizerA = tf.keras.optimizers.Nadam(
                        self.HPs["LR Actor"])
                    self.optimizerE = tf.keras.optimizers.Nadam(
                        self.HPs["LR Entropy"])
                elif self.HPs["Optimizer"] == "SGD":
                    self.optimizerA = tf.keras.optimizers.SGD(
                        self.HPs["LR Actor"])
                    self.optimizerE = tf.keras.optimizers.SGD(
                        self.HPs["LR Entropy"])
                elif self.HPs["Optimizer"] == "Amsgrad":
                    self.optimizerA = tf.keras.optimizers.Adam(
                        self.HPs["LR Actor"], amsgrad=True)
                    self.optimizerE = tf.keras.optimizers.Adam(
                        self.HPs["LR Entropy"], amsgrad=True)
                else:
                    print("Not selected a proper Optimizer")
                    exit()
                a_params = self.Model.GetVariables("Actor")
                c_params = self.Model.GetVariables("Critic")
                self.gradients_a = self.optimizerA.get_gradients(
                    loss, self.Model.trainable_variables)
                self.update_op_a = self.optimizerA.apply_gradients(
                    zip(self.gradients_a, self.Model.trainable_variables))

                entropy_loss = -self.entropy * self.HPs["EntropyBeta"]
                self.gradients_e = self.optimizerE.get_gradients(
                    entropy_loss, a_params)
                self.update_op_e = self.optimizerE.apply_gradients(
                    zip(self.gradients_e, a_params))

                total_counter = 1
                vanish_counter = 0
                for gradient in self.gradients_a:
                    total_counter += np.prod(gradient.shape)
                    stuff = tf.reduce_sum(
                        tf.cast(
                            tf.math.less_equal(tf.math.abs(gradient),
                                               tf.constant(1e-8)), tf.int32))
                    vanish_counter += stuff
                self.vanishing_gradient = vanish_counter / total_counter

        self.update_ops = [self.update_op_a, self.update_op_e]
        self.logging_ops = [
            self.actor_loss, self.critic_loss, self.entropy,
            tf.reduce_mean(self.advantage_),
            tf.reduce_mean(ratio), loss, self.vanishing_gradient
        ]
        self.labels = [
            "Loss Actor", "Loss Critic", "Entropy", "Advantage", "PPO Ratio",
            "Loss Total", "Vanishing Gradient"
        ]
        self.logging_MA = [
            MovingAverage(400) for i in range(len(self.logging_ops))
        ]

    def GetAction(self, state, episode=1, step=0):
        """
        Method to run data through the neural network.

        Parameters
        ----------
        state : np.array
            Data with the shape of [N, self.stateShape] where N is the number of samples

        Returns
        -------
        actions : list[int]
            List of actions based on NN output.
        extraData : list
            List of data that is passed to the execution code to be bundled with state data.
        """
        try:
            probs, log_logits, v = self.sess.run(
                [self.a_prob, self.log_logits, self.v], {self.s: state})
        except ValueError:
            probs, log_logits, v = self.sess.run(
                [self.a_prob, self.log_logits, self.v],
                {self.s: np.expand_dims(state, axis=0)})
        actions = np.array([
            np.random.choice(probs.shape[1], p=prob / sum(prob))
            for prob in probs
        ])
        return actions, [v, log_logits]

    def Update(self, episode=0):
        """
        Processes the buffer and backpropagates the losses through the NN.

        Parameters
        ----------
        HPs : dict
            Hyperparameters for training.

        Returns
        -------
        N/A
        """
        samples = 0
        for i in range(len(self.buffer)):
            samples += len(self.buffer[i])
        if samples < self.HPs["BatchSize"]:
            return

        for traj in range(len(self.buffer)):

            #Determine whether there is more than one 'done' in the sequence; clip values if required.

            td_target, advantage = self.ProcessBuffer(traj)

            batches = len(
                self.buffer[traj][0]) // self.HPs["MinibatchSize"] + 1
            s = np.array_split(self.buffer[traj][0], batches)
            a_his = np.array_split(
                np.asarray(self.buffer[traj][1]).reshape(-1), batches)
            td_target_ = np.array_split(td_target, batches)
            advantage_ = np.array_split(np.reshape(advantage, [-1]), batches)
            old_log_logits_ = np.array_split(
                np.reshape(self.buffer[traj][6], [-1, self.actionSize]),
                batches)

            #Create a dictionary with all of the samples?
            #Use a sampler to feed the update operation?

            #Staging Buffer inputs into the entries to run through the network.
            # print(td_target)
            for epoch in range(self.HPs["Epochs"]):
                for i in range(batches):

                    feedDict = {
                        self.s: np.squeeze(np.asarray(s[i])),
                        self.a_his: np.asarray(a_his[i]),
                        self.td_target_: np.asarray(td_target_[i]),
                        self.advantage_: np.asarray(advantage_[i]),
                        self.old_log_logits_: np.asarray(old_log_logits_[i])
                    }

                    out = self.sess.run(
                        self.update_ops + self.logging_ops,
                        feedDict)  # local grads applied to global net.
                    logging = out[len(self.update_ops):]

                    for j, log in enumerate(logging):
                        self.logging_MA[j].append(log)

        self.ClearTrajectory()

    def GetStatistics(self):
        stats = {}
        for i, label in enumerate(self.labels):
            stats["Training Results/" + label] = self.logging_MA[i]()
        return stats

    def ProcessBuffer(self, traj):
        """
        Processes the buffer to compute temporal-difference targets and advantages.

        Parameters
        ----------
        Model : HPs
            Hyperparameters for training.
        traj : Trajectory
            Data stored by the neural network.
        clip : list[bool]
            List where the trajectory has finished.

        Returns
        -------
        td_target : list
            List Temporal Difference Target for particular states.
        advantage : list
            List of advantages for particular actions.
        """
        # print("Starting Processing Buffer\n")
        # tracker.print_diff()

        split_loc = [i + 1 for i, x in enumerate(self.buffer[traj][4]) if x]

        reward_lists = np.split(self.buffer[traj][2], split_loc)
        value_lists = np.split(self.buffer[traj][5], split_loc)

        td_target = []
        advantage = []
        for rew, value in zip(reward_lists, value_lists):
            td_target_i, advantage_i = gae(rew.reshape(-1),
                                           value.reshape(-1).tolist(), 0,
                                           self.HPs["Gamma"],
                                           self.HPs["lambda"])
            td_target.extend(td_target_i)
            advantage.extend(advantage_i)
        return td_target, advantage

    @property
    def getVars(self):
        return self.Model.getVars("PPO_Training")
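The `gae` helper called in ProcessBuffer is not defined in these snippets. A minimal sketch of what it might look like, assuming it implements the standard Generalized Advantage Estimation recursion with a bootstrap value for the state after the segment (the argument names here are illustrative):

import numpy as np

def gae(rewards, values, bootstrap_value, gamma, lambd):
    # Hypothetical GAE helper: returns per-step TD targets and advantages.
    values_ext = list(values) + [bootstrap_value]
    advantages = np.zeros(len(rewards), dtype=np.float32)
    running = 0.0
    for t in reversed(range(len(rewards))):
        delta = rewards[t] + gamma * values_ext[t + 1] - values_ext[t]
        running = delta + gamma * lambd * running
        advantages[t] = running
    td_targets = advantages + np.asarray(values, dtype=np.float32)
    return td_targets.tolist(), advantages.tolist()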
Exemplo n.º 9
0
    def __init__(self,sess,settings,netConfigOverride,stateShape,actionSize,nTrajs=1,**kwargs):
        """
        Initializes a training method for a neural network.

        Parameters
        ----------
        Model : Keras Model Object
            A Keras model object with fully defined layers and a call function. See examples in networks module.
        sess : Tensorflow Session
            Initialized Tensorflow session
        stateShape : list
            List of integers of the inputs shape size. Ex [39,39,6]
        actionSize : int
            Output size of the network.
        nTrajs : int (Optional)
            Number that specifies the number of trajectories to be created for collecting training data.
        scope : str (Optional)
            Name of the PPO method. Used to group and differentiate variables between other networks.

        Returns
        -------
        N/A
        """
        #Processing inputs
        self.actionSize = actionSize
        self.sess=sess
        self.HPs = settings["NetworkHPs"]
        self.Model = NetworkBuilder(networkConfig=settings["NetworkConfig"],netConfigOverride=netConfigOverride,actionSize=actionSize,scope="local")
        self.Model2 = NetworkBuilder(networkConfig=settings["NetworkConfig"],netConfigOverride=netConfigOverride,actionSize=actionSize,scope="global")
        self.scope = scope = "MAML"
        #Creating appropriate buffer for the method.
        self.buffer = [Trajectory(depth=7) for _ in range(nTrajs)]

        with self.sess.as_default(), self.sess.graph.as_default():
            with tf.name_scope("MAML"):
                #Placeholders
                if len(stateShape) == 4:
                    self.s = tf.placeholder(tf.float32, [None]+stateShape[1:4], 'S')
                else:
                    self.s = tf.placeholder(tf.float32, [None]+stateShape, 'S')
                self.a_his = tf.placeholder(tf.int32, [None, ], 'A')
                self.td_target_ = tf.placeholder(tf.float32, [None], 'Vtarget')
                self.advantage_ = tf.placeholder(shape=[None], dtype=tf.float32, name='adv_hold')
                self.old_log_logits_ = tf.placeholder(shape=[None, actionSize], dtype=tf.float32, name='old_logit_hold')

                #Initializing Network I/O
                inputs = {"state":self.s}
                out = self.Model(inputs)
                _ = self.Model2(inputs)
                self.a_prob = out["actor"]
                self.v = out["critic"]
                self.log_logits = out["log_logits"]
                # Entropy
                def _log(val):
                    return tf.log(tf.clip_by_value(val, 1e-10, 10.0))
                entropy = self.entropy = -tf.reduce_mean(self.a_prob * _log(self.a_prob), name='entropy')

                # Critic Loss
                td_error = self.td_target_ - self.v
                critic_loss = self.critic_loss = tf.reduce_mean(tf.square(td_error), name='critic_loss')

                # Actor Loss
                action_OH = tf.one_hot(self.a_his, actionSize, dtype=tf.float32)
                log_prob = tf.reduce_sum(self.log_logits * action_OH, 1)
                old_log_prob = tf.reduce_sum(self.old_log_logits_ * action_OH, 1)

                # Clipped surrogate function
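                # With ratio r_t = exp(log pi(a_t|s_t) - log pi_old(a_t|s_t)), PPO maximises
                # E[min(r_t * A_t, clip(r_t, 1 - eps, 1 + eps) * A_t)]; the negated mean of
                # the surrogate below is therefore minimised as the actor loss.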
                ratio = tf.exp(log_prob - old_log_prob)
                surrogate = ratio * self.advantage_
                clipped_surrogate = tf.clip_by_value(ratio, 1-self.HPs["eps"], 1+self.HPs["eps"]) * self.advantage_
                surrogate_loss = tf.minimum(surrogate, clipped_surrogate, name='surrogate_loss')
                actor_loss = self.actor_loss = -tf.reduce_mean(surrogate_loss, name='actor_loss')

                actor_loss = actor_loss - entropy * self.HPs["EntropyBeta"]
                loss = actor_loss + critic_loss * self.HPs["CriticBeta"]

                # Build Trainer
                if self.HPs["Optimizer"] == "Adam":
                    self.optimizer = tf.keras.optimizers.Adam(self.HPs["LR"])
                    self.metaOptimizer = tf.keras.optimizers.Adam(self.HPs["Meta LR"])
                elif self.HPs["Optimizer"] == "RMS":
                    self.optimizer = tf.keras.optimizers.RMSprop(self.HPs["LR"])
                    self.metaOptimizer = tf.keras.optimizers.RMSprop(self.HPs["Meta LR"])
                elif self.HPs["Optimizer"] == "Adagrad":
                    self.optimizer = tf.keras.optimizers.Adagrad(self.HPs["LR"])
                    self.metaOptimizer = tf.keras.optimizers.Adagrad(self.HPs["Meta LR"])
                elif self.HPs["Optimizer"] == "Adadelta":
                    self.optimizer = tf.keras.optimizers.Adadelta(self.HPs["LR"])
                    self.metaOptimizer = tf.keras.optimizers.Adadelta(self.HPs["Meta LR"])
                elif self.HPs["Optimizer"] == "Adamax":
                    self.optimizer = tf.keras.optimizers.Adamax(self.HPs["LR"])
                    self.metaOptimizer = tf.keras.optimizers.Adamax(self.HPs["Meta LR"])
                elif self.HPs["Optimizer"] == "Nadam":
                    self.optimizer = tf.keras.optimizers.Nadam(self.HPs["LR"])
                    self.metaOptimizer = tf.keras.optimizers.Nadam(self.HPs["Meta LR"])
                elif self.HPs["Optimizer"] == "SGD":
                    self.optimizer = tf.keras.optimizers.SGD(self.HPs["LR"])
                    self.metaOptimizer = tf.keras.optimizers.SGD(self.HPs["Meta LR"])
                elif self.HPs["Optimizer"] == "Amsgrad":
                    self.optimizer = tf.keras.optimizers.Adam(self.HPs["LR"],amsgrad=True)
                    self.metaOptimizer = tf.keras.optimizers.Adam(self.HPs["Meta LR"],amsgrad=True)
                else:
                    print("Not selected a proper Optimizer")
                    exit()

                vars1 = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope+'/local')
                self.gradients = self.optimizer.get_gradients(loss, vars1)
                self.update_ops = self.optimizer.apply_gradients(zip(self.gradients, vars1))

                with tf.name_scope("MetaUpdater"):
                    vars2 = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope+'/global')
                    self.meta_update_ops = self.metaOptimizer.apply_gradients(zip(self.gradients, vars2))

                with tf.name_scope('sync'):
                    self.pull_params_op = [l_p.assign(g_p) for l_p, g_p in zip(vars1,vars2)]

        #Creating variables for logging.
        self.EntropyMA = MovingAverage(400)
        self.CriticLossMA = MovingAverage(400)
        self.ActorLossMA = MovingAverage(400)
        self.GradMA = MovingAverage(400)
        self.counter = 0
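How the inner update, the meta update, and the parameter sync built above are meant to be sequenced is not shown in this snippet. A rough, hypothetical driver, assuming `update_ops` adapts the local network, `meta_update_ops` applies the gradients recomputed at the current local parameters to the global network, and `pull_params_op` copies the global weights back into the local copy (`method` and `task_feed_dicts` are illustrative names):

def maml_outer_step(method, task_feed_dicts):
    # One meta-iteration over a batch of task-specific feed dictionaries.
    for feed_dict in task_feed_dicts:
        # Inner adaptation of the local ("fast") network.
        method.sess.run(method.update_ops, feed_dict)
        # Apply the loss gradients, recomputed at the adapted parameters, to the global network.
        method.sess.run(method.meta_update_ops, feed_dict)
    # Reset the local network from the meta parameters before the next task batch.
    method.sess.run(method.pull_params_op)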
Exemplo n.º 10
0
def ApeXWorkers(sess, settings, netConfigOverride):

    EXP_NAME = settings["RunName"]
    LoadName = settings["LoadName"]
    MODEL_PATH = './models/' + LoadName + '/'
    IMAGE_PATH = './images/SF/' + EXP_NAME + '/'
    MODEL_PATH_ = './models/' + EXP_NAME + '/'
    LOG_PATH = './logs/CTF_1v1/' + EXP_NAME
    CreatePath(LOG_PATH)
    CreatePath(IMAGE_PATH)
    CreatePath(MODEL_PATH)
    CreatePath(MODEL_PATH_)

    N = settings["NumOptions"]
    with open("configs/environment/" + settings["EnvConfig"]) as json_file:
        envSettings = json.load(json_file)
    env, dFeatures, nActions, nTrajs = CreateEnvironment(envSettings)
    #Create the Q Maps

    if "LoadQMaps" in settings:
        #Loading the Q-tables for the sub-policies
        loadedData = np.load('./models/' + settings["LoadQMaps"] +
                             '/options.npz')
        opt = loadedData["options"]
        options = []
        for i in range(opt.shape[0]):
            options.append(opt[i, :, :, :, :])
    else:
        if "LoadSamples" in settings:
            pass
        else:
            #Creating Instance of environment and running through it to generate samples
            def GetAction(state):
                """
                Contains the code to run the network based on an input.
                """
                p = 1 / nActions
                if len(state.shape) == 3:
                    probs = np.full((1, nActions), p)
                else:
                    probs = np.full((state.shape[0], nActions), p)
                actions = np.array([
                    np.random.choice(probs.shape[1], p=prob / sum(prob))
                    for prob in probs
                ])
                return actions

            s = []
            for i in range(settings["SampleEpisodes"]):
                s0 = env.reset()

                for j in range(settings["MAX_EP_STEPS"] + 1):

                    a = GetAction(state=s0)

                    s1, r, done, _ = env.step(a)
                    if not arreq_in_list(s0, s):
                        s.append(s0)

                    s0 = s1
                    if done:
                        break

        #Creating and smoothing Q Maps
        def ConstructSamples(env, position2):
            grid = env.get_obs_blue
            locX, locY = np.unravel_index(np.argmax(grid[:, :, 4], axis=None),
                                          grid[:, :, 0].shape)
            locX2, locY2 = np.unravel_index(
                np.argmin(grid[:, :, 4], axis=None), grid[:, :, 0].shape)
            #Removing the agent
            grid[locX, locY, 4] = 0
            grid[locX2, locY2, 4] = 0

            stacked_grids = np.repeat(np.expand_dims(grid, 0),
                                      grid.shape[0] * grid.shape[1], 0)

            for i in range(stacked_grids.shape[1]):
                for j in range(stacked_grids.shape[2]):
                    stacked_grids[i * stacked_grids.shape[2] + j,
                                  stacked_grids.shape[2] - i - 1, j, 4] = 5

            stacked_grids[:, position2[0], position2[1], 4] = -5
            return stacked_grids

        def SmoothOption(option_, gamma=0.9):
            # option[option<0.0] = 0.0
            #Create the adjacency matrix
            v_option = np.full(
                (dFeatures[0], dFeatures[1], dFeatures[0], dFeatures[1]),
                0,
                dtype=np.float32)
            for i2, j2 in itertools.product(range(dFeatures[0]),
                                            range(dFeatures[1])):
                option = option_[:, :, i2, j2]
                states_ = {}
                count = 0
                for i in range(option.shape[0]):
                    for j in range(option.shape[1]):
                        if option[i, j] != 0:
                            states_[count] = [i, j]
                            # states_.append([count, [i,j]])
                            count += 1
                states = len(states_.keys())
                x = np.zeros((states, states))
                for i in range(len(states_)):
                    [locx, locy] = states_[i]
                    total = 0
                    for j in range(len(states_)):
                        if states_[j] == [locx + 1, locy]:
                            x[i, j] = 0.25
                            total += 0.25
                        if states_[j] == [locx - 1, locy]:
                            x[i, j] = 0.25
                            total += 0.25
                        if states_[j] == [locx, locy + 1]:
                            x[i, j] = 0.25
                            total += 0.25
                        if states_[j] == [locx, locy - 1]:
                            x[i, j] = 0.25
                            total += 0.25
                    x[i, i] = 1.0 - total

                #Create W
                w = np.zeros((states))
                for count, loc in states_.items():
                    w[count] = option[loc[0], loc[1]]

                # (I-gamma*Q)^-1
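                # Solving v = w + gamma * X v gives v = (I - gamma * X)^{-1} w: the expected
                # discounted sum of the option values w under the random-walk transition
                # matrix X constructed above, which is what smooths the option map.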
                I = np.identity(states)
                psi = np.linalg.inv(I - gamma * x)

                smoothedOption = np.zeros_like(option, dtype=float)

                value = np.matmul(psi, w)
                for j, loc in states_.items():
                    smoothedOption[loc[0], loc[1]] = value[j]

                v_option[:, :, i2, j2] = smoothedOption
            return v_option

        SF1, SF2, SF3, SF4, SF5 = buildNetwork(settings["SFNetworkConfig"],
                                               nActions, {},
                                               scope="Global")
        SF5.load_weights('./models/' + LoadName + '/' + "model.h5")

        #Selecting the samples:
        psi = SF2.predict(np.vstack(s))  # [X,SF Dim]

        #test for approximate equality (for floating point types)
        def arreqclose_in_list(myarr, list_arrays):
            return next((True
                         for elem in list_arrays if elem.size == myarr.size
                         and np.allclose(elem, myarr, atol=1E-6)), False)

        if settings["Selection"] == "First":
            samples = []
            points = []
            i = 0
            while len(samples) < settings["TotalSamples"]:
                if not arreqclose_in_list(psi[i, :], samples):
                    samples.append(psi[i, :])
                    points.append(i)
                i += 1
        elif settings["Selection"] == "Random":
            samples = []
            points = []
            while len(samples) < settings["TotalSamples"]:
                idx = randint(1, psi.shape[0])
                if not arreqclose_in_list(psi[idx, :], samples):
                    samples.append(psi[idx, :])
                    points.append(idx)
        elif settings["Selection"] == "Hull_pca":
            #PCA Decomp to dimension:
            import pandas as pd
            from sklearn.decomposition import PCA
            feat_cols = ['pixel' + str(i) for i in range(psi.shape[1])]
            df = pd.DataFrame(psi, columns=feat_cols)
            np.random.seed(42)
            rndperm = np.random.permutation(df.shape[0])
            pca = PCA(n_components=4)
            pca_result = pca.fit_transform(df[feat_cols].values)

            from SampleSelection import SampleSelection_v2
            points = SampleSelection_v2(pca_result,
                                        settings["TotalSamples"],
                                        returnIndicies=True)
        elif settings["Selection"] == "Hull_tsne":
            #t-SNE decomposition to dimension:
            import pandas as pd
            from sklearn.manifold import TSNE
            feat_cols = ['pixel' + str(i) for i in range(psi.shape[1])]
            df = pd.DataFrame(psi, columns=feat_cols)
            np.random.seed(42)
            rndperm = np.random.permutation(df.shape[0])
            tsne = TSNE(n_components=3, verbose=1, perplexity=10, n_iter=1000)
            tsne_results = tsne.fit_transform(df[feat_cols].values)

            from SampleSelection import SampleSelection_v2
            points = SampleSelection_v2(tsne_results,
                                        settings["TotalSamples"],
                                        returnIndicies=True)
        elif settings["Selection"] == "Hull_cluster":
            #PCA Decomp to dimension:
            import pandas as pd
            from sklearn.decomposition import PCA
            feat_cols = ['pixel' + str(i) for i in range(psi.shape[1])]
            df = pd.DataFrame(psi, columns=feat_cols)
            np.random.seed(42)
            rndperm = np.random.permutation(df.shape[0])
            pca = PCA(n_components=4)
            pca_result = pca.fit_transform(df[feat_cols].values)

            from SampleSelection import SampleSelection_v3
            points = SampleSelection_v3(pca_result,
                                        settings["TotalSamples"],
                                        returnIndicies=True)
        else:
            print("Invalid Method selected")
            exit()

        psiSamples = []
        for point in points:
            psiSamples.append(psi[point, :])

        while len(psiSamples) < len(psiSamples[0]):
            psiSamples.extend(psiSamples)

        samps = np.stack(psiSamples)
        samps2 = samps[0:samps.shape[1], :]
        w_g, v_g = np.linalg.eig(samps2)

        # print("here")
        dim = samps2.shape[1]
        #Creating Sub-policies
        offset = 0
        options = []
        for sample in range(int(N / 2)):
            print("Creating Option", sample)
            v_option = np.full(
                (dFeatures[0], dFeatures[1], dFeatures[0], dFeatures[1]),
                0,
                dtype=np.float32)
            for i2, j2 in itertools.product(range(dFeatures[0]),
                                            range(dFeatures[1])):
                if sample + offset >= dim:
                    continue
                grids = ConstructSamples(env, [i2, j2])
                phi = SF3.predict(grids)
                v_option[:, :, i2, j2] = np.real(
                    np.matmul(phi, v_g[:, sample + offset])).reshape(
                        [dFeatures[0], dFeatures[1]])
                if np.iscomplex(w_g[sample + offset]):
                    offset += 1
            print("Smoothing Option")
            v_option_ = SmoothOption(v_option)
            options.append(v_option_)
            options.append(-v_option_)
            #Plotting the first couple samples with random enemy positions:
            for fig_idx, (e1, e2) in enumerate([(10, 10), (10, 17), (17, 10),
                                                (10, 2), (2, 10)], start=1):
                v_map = v_option_[:, :, e1, e2]
                imgplot = plt.imshow(v_map)
                plt.title(" Option " + str(sample) +
                          " Value Estimate | Eigenvalue:" +
                          str(w_g[sample + offset]))
                plt.savefig(IMAGE_PATH + "/option" + str(sample) + "_" +
                            str(fig_idx) + ".png")
                plt.close()

        #Saving the different options to file:
        np.savez_compressed(MODEL_PATH_ + "options.npz",
                            options=np.stack(options))


    EXP_NAME = settings["RunName"]
    MODEL_PATH = './models/' + EXP_NAME
    LOG_PATH = './logs/' + EXP_NAME
    CreatePath(LOG_PATH)
    CreatePath(MODEL_PATH)

    with open("configs/environment/" + settings["EnvConfig"]) as json_file:
        envSettings = json.load(json_file)

    progbar = tf.keras.utils.Progbar(None,
                                     unit_name='Training',
                                     stateful_metrics=["Reward"])
    writer = tf.summary.FileWriter(LOG_PATH, graph=sess.graph)
    global_step = tf.Variable(0, trainable=False, name='global_step')
    global_step_next = tf.assign_add(global_step, 1)

    workers = []

    sharedBuffer = ApexBuffer()
    _, dFeatures, nActions, nTrajs = CreateEnvironment(envSettings,
                                                       multiprocessing=1)

    network = NetworkBuilder(settings["NetworkConfig"],
                             netConfigOverride,
                             scope="Global",
                             actionSize=nActions)
    targetNetwork = NetworkBuilder(settings["NetworkConfig"],
                                   netConfigOverride,
                                   scope="target",
                                   actionSize=nActions)
    Updater = ApeX(network,
                   sess,
                   stateShape=dFeatures,
                   actionSize=nActions,
                   scope="Global",
                   HPs=settings["NetworkHPs"],
                   sharedBuffer=sharedBuffer,
                   targetNetwork=targetNetwork)
    Updater.Model.summary()
    saver = tf.train.Saver(max_to_keep=3,
                           var_list=Updater.getVars + [global_step])
    Updater.InitializeVariablesFromFile(saver, MODEL_PATH)
    workers.append(
        WorkerLearner(Updater, sess, global_step, global_step_next, settings,
                      progbar, writer, MODEL_PATH, saver))

    i_name = "prioritizer"
    network = NetworkBuilder(settings["NetworkConfig"],
                             netConfigOverride,
                             scope=i_name,
                             actionSize=nActions)
    localNetwork = ApeX(network,
                        sess,
                        stateShape=dFeatures,
                        actionSize=nActions,
                        scope=i_name,
                        HPs=settings["NetworkHPs"],
                        globalAC=Updater,
                        nTrajs=nTrajs,
                        sharedBuffer=sharedBuffer)
    localNetwork.InitializeVariablesFromFile(saver, MODEL_PATH)
    workers.append(WorkerPrioritizer(localNetwork, sess, global_step,
                                     settings))

    # Create workers
    for i in range(settings["NumberENV"]):
        i_name = 'W_%i' % i  # worker name
        network = NetworkBuilder(settings["NetworkConfig"],
                                 netConfigOverride,
                                 scope=i_name,
                                 actionSize=nActions)
        localNetwork = ApeX(network,
                            sess,
                            stateShape=dFeatures,
                            actionSize=nActions,
                            scope=i_name,
                            HPs=settings["NetworkHPs"],
                            globalAC=Updater,
                            nTrajs=nTrajs,
                            sharedBuffer=sharedBuffer)
        localNetwork.InitializeVariablesFromFile(saver, MODEL_PATH)
        env, _, _, _ = CreateEnvironment(envSettings, multiprocessing=1)
        workers.append(
            WorkerActor(localNetwork, env, sess, global_step, global_step_next,
                        settings, progbar, writer, MODEL_PATH, saver))

    return workers
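`arreq_in_list`, used while collecting samples above, is not defined in this snippet; only its approximate counterpart `arreqclose_in_list` is. A minimal sketch of an exact-match version, assuming it simply checks whether an identical array already exists in the list:

import numpy as np

def arreq_in_list(myarr, list_arrays):
    # Hypothetical exact-equality analogue of arreqclose_in_list.
    return next((True for elem in list_arrays if np.array_equal(elem, myarr)), False)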
Exemplo n.º 11
0
    def __init__(self,
                 sess,
                 settings,
                 netConfigOverride,
                 stateShape,
                 actionSize,
                 nTrajs=1,
                 **kwargs):
        """
        Initializes a training method for a neural network.

        Parameters
        ----------
        Model : Keras Model Object
            A Keras model object with fully defined layers and a call function. See examples in networks module.
        sess : Tensorflow Session
            Initialized Tensorflow session
        stateShape : list
            List of integers of the inputs shape size. Ex [39,39,6]
        actionSize : int
            Output size of the network.
        HPs : dict
            Dictionary that contains all hyperparameters to be used in the methods training
        nTrajs : int (Optional)
            Number that specifies the number of trajectories to be created for collecting training data.
        scope : str (Optional)
            Name of the PPO method. Used to group and differentiate variables between other networks.

        Returns
        -------
        N/A
        """
        #Processing inputs
        self.actionSize = actionSize
        self.sess = sess
        self.Model = NetworkBuilder(networkConfig=settings["NetworkConfig"],
                                    netConfigOverride=netConfigOverride,
                                    actionSize=actionSize)
        self.method = "Confidence"  #Create input for this.
        self.HPs = settings["NetworkHPs"]
        self.subReward = False
        self.UpdateSubpolicies = True
        self.nTrajs = nTrajs

        #Creating buffer
        self.buffer = [Trajectory(depth=7) for _ in range(nTrajs)]
        #[s0,a,r,s1,done]+[HL_action]

        with self.sess.as_default(), self.sess.graph.as_default():
            with tf.name_scope("OptionCritic"):
                #Generic placeholders
                self.batch_size = tf.placeholder(tf.int32, 1, 'BS')
                self.s = tf.placeholder(tf.float32, [None] + stateShape, 'S')
                self.actions = tf.placeholder(tf.int32, [
                    None,
                ], 'A')
                self.rewards = tf.placeholder(tf.float32, [None], 'R')
                # self.advantage_ = tf.placeholder(shape=[None], dtype=tf.float32, name='adv_hold')
                self.options = tf.placeholder(shape=[None],
                                              dtype=tf.int32,
                                              name="options")

                batch_indexer = tf.range(tf.reshape(self.batch_size, []))

                #Initializing Network I/O
                inputs = {"state": self.s}
                out = self.Model(inputs)
                self.term = out["metaTermination"]
                self.q = out["metaCritic"]

                self.sub_a_prob = out["subActor"]
                self.sub_log_logits = out["subLogLogits"]

                self.nPolicies = len(self.sub_a_prob)

                # Creating the Loss and update calls for the Hierarchical policy
                # Indexers
                self.responsible_options = tf.stack(
                    [batch_indexer, self.options], axis=1)
                self.responsible_actions = tf.stack(
                    [batch_indexer, self.actions], axis=1)
                self.network_indexer = tf.stack([self.options, batch_indexer],
                                                axis=1)

                # Q Values OVER options
                self.disconnected_q_vals = tf.stop_gradient(self.q)

                # Q values of each option that was taken
                self.responsible_opt_q_vals = tf.gather_nd(
                    params=self.q, indices=self.responsible_options
                )  # Extract q values for each option
                self.disconnected_q_vals_option = tf.gather_nd(
                    params=self.disconnected_q_vals,
                    indices=self.responsible_options)

                # Termination probability of each option that was taken
                self.terminations = tf.gather_nd(
                    params=self.term, indices=self.responsible_options)

                # Q values for each action that was taken
                relevant_networks = tf.gather_nd(params=self.sub_a_prob,
                                                 indices=self.network_indexer)
                relevant_networks = tf.nn.softmax(relevant_networks, axis=1)

                self.action_values = tf.gather_nd(
                    params=relevant_networks, indices=self.responsible_actions)

                # Weighted average value
                option_eps = 0.001
                self.value = tf.reduce_max(self.q) * (1 - option_eps) + (
                    option_eps * tf.reduce_mean(self.q))
                disconnected_value = tf.stop_gradient(self.value)

                # Losses; TODO: Why reduce sum vs reduce mean?
                vf_coef = 0.5
                self.value_loss = vf_coef * tf.reduce_mean(
                    vf_coef * 0.5 *
                    tf.square(self.rewards - self.responsible_opt_q_vals))
                self.policy_loss = tf.reduce_mean(
                    _log(self.action_values) *
                    (self.rewards - self.disconnected_q_vals_option))
                self.deliberation_costs = 0.020
                self.termination_loss = tf.reduce_mean(
                    self.terminations *
                    ((self.disconnected_q_vals_option - disconnected_value) +
                     self.deliberation_costs))

                ent_coef = 0.01
                action_probabilities = self.sub_a_prob
                self.entropy = ent_coef * tf.reduce_mean(
                    action_probabilities * _log(action_probabilities))

                self.loss = -self.policy_loss - self.entropy - self.value_loss - self.termination_loss

                variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                              "OptionCritic")
                optimizer = tf.keras.optimizers.Adam(self.HPs["LR"])
                gradients = optimizer.get_gradients(self.loss, variables)
                self.update_op = optimizer.apply_gradients(
                    zip(gradients, variables))

            #Creating variables for the purpose of logging.
            self.SubpolicyDistribution = MovingAverage(1000)
Exemplo n.º 12
0
class OptionCritic(Method):
    def __init__(self,
                 sess,
                 settings,
                 netConfigOverride,
                 stateShape,
                 actionSize,
                 nTrajs=1,
                 **kwargs):
        """
        Initializes a training method for a neural network.

        Parameters
        ----------
        Model : Keras Model Object
            A Keras model object with fully defined layers and a call function. See examples in networks module.
        sess : Tensorflow Session
            Initialized Tensorflow session
        stateShape : list
            List of integers of the inputs shape size. Ex [39,39,6]
        actionSize : int
            Output size of the network.
        HPs : dict
            Dictionary that contains all hyperparameters to be used in the methods training
        nTrajs : int (Optional)
            Number that specifies the number of trajectories to be created for collecting training data.
        scope : str (Optional)
            Name of the PPO method. Used to group and differentiate variables between other networks.

        Returns
        -------
        N/A
        """
        #Processing inputs
        self.actionSize = actionSize
        self.sess = sess
        self.Model = NetworkBuilder(networkConfig=settings["NetworkConfig"],
                                    netConfigOverride=netConfigOverride,
                                    actionSize=actionSize)
        self.method = "Confidence"  #Create input for this.
        self.HPs = settings["NetworkHPs"]
        self.subReward = False
        self.UpdateSubpolicies = True
        self.nTrajs = nTrajs

        #Creating buffer
        self.buffer = [Trajectory(depth=7) for _ in range(nTrajs)]
        #[s0,a,r,s1,done]+[HL_action]

        with self.sess.as_default(), self.sess.graph.as_default():
            with tf.name_scope("OptionCritic"):
                #Generic placeholders
                self.batch_size = tf.placeholder(tf.int32, 1, 'BS')
                self.s = tf.placeholder(tf.float32, [None] + stateShape, 'S')
                self.actions = tf.placeholder(tf.int32, [
                    None,
                ], 'A')
                self.rewards = tf.placeholder(tf.float32, [None], 'R')
                # self.advantage_ = tf.placeholder(shape=[None], dtype=tf.float32, name='adv_hold')
                self.options = tf.placeholder(shape=[None],
                                              dtype=tf.int32,
                                              name="options")

                batch_indexer = tf.range(tf.reshape(self.batch_size, []))

                #Initializing Network I/O
                inputs = {"state": self.s}
                out = self.Model(inputs)
                self.term = out["metaTermination"]
                self.q = out["metaCritic"]

                self.sub_a_prob = out["subActor"]
                self.sub_log_logits = out["subLogLogits"]

                self.nPolicies = len(self.sub_a_prob)

                # Creating the Loss and update calls for the Hierarchical policy
                # Indexers
                self.responsible_options = tf.stack(
                    [batch_indexer, self.options], axis=1)
                self.responsible_actions = tf.stack(
                    [batch_indexer, self.actions], axis=1)
                self.network_indexer = tf.stack([self.options, batch_indexer],
                                                axis=1)

                # Q Values OVER options
                self.disconnected_q_vals = tf.stop_gradient(self.q)

                # Q values of each option that was taken
                self.responsible_opt_q_vals = tf.gather_nd(
                    params=self.q, indices=self.responsible_options
                )  # Extract q values for each option
                self.disconnected_q_vals_option = tf.gather_nd(
                    params=self.disconnected_q_vals,
                    indices=self.responsible_options)

                # Termination probability of each option that was taken
                self.terminations = tf.gather_nd(
                    params=self.term, indices=self.responsible_options)

                # Q values for each action that was taken
                relevant_networks = tf.gather_nd(params=self.sub_a_prob,
                                                 indices=self.network_indexer)
                relevant_networks = tf.nn.softmax(relevant_networks, axis=1)

                self.action_values = tf.gather_nd(
                    params=relevant_networks, indices=self.responsible_actions)

                # Weighted average value
                option_eps = 0.001
                self.value = tf.reduce_max(self.q) * (1 - option_eps) + (
                    option_eps * tf.reduce_mean(self.q))
                disconnected_value = tf.stop_gradient(self.value)

                # Losses; TODO: Why reduce sum vs reduce mean?
                vf_coef = 0.5
                self.value_loss = vf_coef * tf.reduce_mean(
                    vf_coef * 0.5 *
                    tf.square(self.rewards - self.responsible_opt_q_vals))
                self.policy_loss = tf.reduce_mean(
                    _log(self.action_values) *
                    (self.rewards - self.disconnected_q_vals_option))
                self.deliberation_costs = 0.020
                self.termination_loss = tf.reduce_mean(
                    self.terminations *
                    ((self.disconnected_q_vals_option - disconnected_value) +
                     self.deliberation_costs))

                ent_coef = 0.01
                action_probabilities = self.sub_a_prob
                self.entropy = ent_coef * tf.reduce_mean(
                    action_probabilities * _log(action_probabilities))

                self.loss = -self.policy_loss - self.entropy - self.value_loss - self.termination_loss

                variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                              "OptionCritic")
                optimizer = tf.keras.optimizers.Adam(self.HPs["LR"])
                gradients = optimizer.get_gradients(self.loss, variables)
                self.update_op = optimizer.apply_gradients(
                    zip(gradients, variables))

            #Creating variables for the purpose of logging.
            self.SubpolicyDistribution = MovingAverage(1000)

    def GetAction(self, state, step, episode=0):
        """
        Method to run data through the hierarchical network.

        First, the state is run through the meta network to select which subpolicy to use.
        Second, the state is run through the selected subpolicy.

        ToDo: Check whether it is faster to run the entire network and select the appropriate subpolicy afterwards, or to run only the required part.

        Parameters
        ----------
        state : np.array
            Data with the shape of [N, self.stateShape] where N is the number of samples

        Returns
        -------
        actions : list[int]
            List of actions based on NN output.
        extraData : list
            List of data that is passed to the execution code to be bundled with state data.
        """
        #Determine number of steps and whether to initiate confidence based on the length of the Buffer.
        if step == 0:
            self.pastActions = [None] * self.nTrajs

        # Run the Meta and Sub-policy Networks
        targets = [self.q, self.term] + self.sub_a_prob + self.sub_log_logits
        res = self.sess.run(targets, {self.s: state})
        q = res[0]
        terminations = res[1]
        sub_probs = res[2:2 + self.nPolicies]
        sub_log_logits = res[2 + self.nPolicies:2 + 2 * self.nPolicies]
        HL_actions = []
        for i, term in enumerate(terminations):
            if step == 0:
                action = np.argmax(q[i])
                HL_actions.append(action)
                self.pastActions[i] = action
            elif random.uniform(0, 1) < term[self.pastActions[i]]:
                # action = np.argmax(q[i])
                action = random.randint(0, 2)
                HL_actions.append(action)
                self.pastActions[i] = action
            else:
                action = random.randint(0, 2)
                HL_actions.append(action)
                # HL_actions.append(self.pastActions[i])
        self.traj_action = HL_actions
        print(q, HL_actions)

        # Run the Subpolicy Network
        actions = np.array([
            np.random.choice(self.actionSize,
                             p=sub_probs[mod][idx] / sum(sub_probs[mod][idx]))
            for idx, mod in enumerate(HL_actions)
        ])
        logits = [
            sub_log_logits[mod][idx] for idx, mod in enumerate(HL_actions)
        ]

        return actions, [HL_actions, q]

    def Update(self, HPs):
        """
        Process the buffer and backpropagate the losses through the NN.

        Parameters
        ----------
        HPs : dict
            Hyperparameters for training.

        Returns
        -------
        N/A
        """
        samples = 0
        for i in range(len(self.buffer)):
            samples += len(self.buffer[i])
        if samples < self.HPs["BatchSize"]:
            return

        for traj in range(len(self.buffer)):
            advantage = self.ProcessBuffer(traj)
            # Updating the Hierarchical Controller
            for epoch in range(self.HPs["Epochs"]):
                for batch in MultiBatchDivider([
                        self.buffer[traj][0], self.buffer[traj][1], advantage,
                        self.buffer[traj][5]
                ], self.HPs["MinibatchSize"]):

                    feed_dict = {
                        self.batch_size:
                        [np.asarray(batch[0]).squeeze().shape[0]],
                        self.s: np.asarray(batch[0]).squeeze(),
                        self.actions: np.asarray(batch[1]).squeeze(),
                        self.rewards: np.asarray(batch[2]).squeeze(),
                        self.options: np.reshape(batch[3], [-1])
                    }
                    self.sess.run(self.update_op, feed_dict)
            self.SubpolicyDistribution.extend(np.asarray(self.buffer[traj][5]))
        self.ClearTrajectory()

    def GetStatistics(self):
        stats = {}
        for i in range(self.nPolicies):
            length = len(self.SubpolicyDistribution.tolist())
            if length == 0:
                length = 1
            stats[
                "Subpolicy Use/" +
                str(i)] = self.SubpolicyDistribution.tolist().count(i) / length
        return stats

    def ProcessBuffer(self, traj):
        """
        Process the buffer to compute per-step advantages for the hierarchical controller.

        Parameters
        ----------
        traj : int
            Index of the trajectory buffer to process.

        Returns
        -------
        advantage : list
            List of advantages for particular actions.
        """
        #Splitting the buffer into different episodes based on the done tag.
        split_loc = [i + 1 for i, x in enumerate(self.buffer[traj][4]) if x]
        #Data that needs to be processed for the low-level controllers
        reward_lists = np.split(self.buffer[traj][2], split_loc[:-1])
        value_lists = np.split(self.buffer[traj][6], split_loc[:-1])

        HL_action_lists = np.split(self.buffer[traj][5], split_loc[:-1])

        td_target = []
        advantage = []

        for rew, value, options in zip(reward_lists, value_lists,
                                       HL_action_lists):
            # Calculating the per step advantage of each of the different sections
            val = []
            for i, option in enumerate(options):
                val.append(value[i, 0, option])
            td_target_i, advantage_i = gae(
                rew.reshape(-1).tolist(),
                np.asarray(val).reshape(-1).tolist(), 0, self.HPs["Gamma"],
                self.HPs["lambda"])
            td_target.extend(td_target_i)
            advantage.extend(advantage_i)

        return advantage

    @property
    def getVars(self):
        return self.Model.getVars("PPO_Training")
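`MultiBatchDivider`, used in the Update method above, is not shown in these snippets. A minimal sketch under the assumption that it yields aligned minibatch slices from several parallel sequences:

def MultiBatchDivider(sequences, minibatch_size):
    # Hypothetical helper: yield one aligned minibatch per iteration.
    length = len(sequences[0])
    for start in range(0, length, minibatch_size):
        yield [seq[start:start + minibatch_size] for seq in sequences]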
Exemplo n.º 13
0
class AC(Method):

    def __init__(self,sess,settings,netConfigOverride,stateShape,actionSize,nTrajs=1,**kwargs):
        """
        Initializes I/O placeholders used for Tensorflow session runs.
        Initializes an Actor and a Critic network to be used for the purpose of RL.
        """
        #Creating appropriate buffer for the method.
        self.buffer = [Trajectory(depth=5) for _ in range(nTrajs)]

        #Placeholders
        self.sess=sess
        self.HPs = settings["NetworkHPs"]

        self.s = tf.placeholder(dtype=tf.float32, shape=[None]+stateShape, name="state")
        self.a = tf.placeholder(tf.int32, [None,1], "act")
        # self.td_error = tf.placeholder(tf.float32, None, "td_error")  # TD_error
        self.v_ = tf.placeholder(tf.float32, [None, 1], "v_next")
        self.r = tf.placeholder(tf.float32, [None,1], 'r')


        #These need to be returned in the call function of a tf.keras.Model class.
        self.Model = NetworkBuilder(networkConfig=settings["NetworkConfig"],netConfigOverride=netConfigOverride,actionSize=actionSize)

        inputs = {"state":self.s}
        out = self.Model(inputs)
        self.acts_prob = out["actor"]
        self.critic = out["critic"]

        #Defining Training Operations which will be called in the Update Function.
        with tf.variable_scope('Update_Operation'):
            with tf.name_scope('squared_TD_error'):
                self.td_error = self.r + self.HPs["Gamma"] * self.v_ - self.critic
                self.c_loss = tf.reduce_mean(tf.square(self.td_error))    # TD_error = (r+gamma*V_next) - V_eval

            with tf.name_scope('train_critic'):
                self.c_params = self.Model.GetVariables("Critic")
                self.c_grads = tf.gradients(self.c_loss, self.c_params)
                self.update_c_op = tf.train.AdamOptimizer(self.HPs["Critic LR"]).apply_gradients(zip(self.c_grads, self.c_params))

            with tf.name_scope('exp_v'):
                log_prob = tf.log(self.acts_prob + 1e-5) * tf.one_hot(self.a, actionSize, dtype=tf.float32)
                self.a_loss = -tf.reduce_mean(log_prob * self.td_error)  # advantage (TD_error) guided loss

            with tf.name_scope('train_actor'):
                self.a_params = self.Model.GetVariables("Actor")
                print(self.a_params)
                self.a_grads = tf.gradients(self.a_loss, self.a_params)
                self.update_a_op = tf.train.AdamOptimizer(self.HPs["Actor LR"]).apply_gradients(zip(self.a_grads, self.a_params))

            self.update_ops=[self.update_c_op,self.update_a_op]

            self.entropy = -tf.reduce_mean(self.acts_prob * _log(self.acts_prob), name='entropy')

            self.logging_ops = [self.a_loss,self.c_loss,self.entropy]
            self.labels = ["Loss Actor","Loss Critic","Entropy"]
            self.logging_MA = [MovingAverage(400) for i in range(len(self.logging_ops))]



    def GetAction(self, state, episode=0, step=0):
        """
        Contains the code to run the network based on an input.
        """
        try:
            s = state[np.newaxis, :]
            probs = self.sess.run(self.acts_prob, {self.s: s})   # get probabilities for all actions
        except ValueError:
            probs = self.sess.run(self.acts_prob, {self.s: state})   # get probabilities for all actions
        return np.random.choice(np.arange(probs.shape[1]), p=probs.ravel()), []   # return an int

    def Update(self, episode=0):
        """
        Takes an input buffer and applies the updates to the networks through gradient
        backpropagation
        """
        samples=0
        for i in range(len(self.buffer)):
            samples +=len(self.buffer[i])
        if samples < 1:
            return

        for traj in range(len(self.buffer)):
            v_ = self.sess.run(self.critic, {self.s: np.vstack(self.buffer[traj][3])})
            feedDict = {self.s: np.vstack(self.buffer[traj][0]),
                        self.v_: v_,
                        self.r: np.vstack(self.buffer[traj][2]),
                        self.a:np.vstack(self.buffer[traj][1])
                        }
            out = self.sess.run(self.update_ops+self.logging_ops, feedDict)   # local grads applied to global net.
            logging = out[len(self.update_ops):]

            for i,log in enumerate(logging):
                self.logging_MA[i].append(log)

        #Clear or reset the buffer.
        self.ClearTrajectory()

    def GetStatistics(self):
        stats = {}
        for i, label in enumerate(self.labels):
            stats["Training Results/" + label] = self.logging_MA[i]()
        return stats

    @property
    def getVars(self):
        return self.Model.getVars("PPO_Training")
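`MovingAverage`, used for logging in every example, is also external to these snippets. Its usage (a fixed window size, append/extend, a call to read the running value, tolist) suggests a bounded running-average container; a minimal sketch under that assumption:

from collections import deque

class MovingAverage:
    # Hypothetical bounded moving average used for the logging statistics above.
    def __init__(self, maxlen):
        self._values = deque(maxlen=maxlen)

    def append(self, value):
        self._values.append(float(value))

    def extend(self, values):
        for v in values:
            self._values.append(float(v))

    def tolist(self):
        return list(self._values)

    def __call__(self):
        # Mean of the stored window, or 0.0 if nothing has been logged yet.
        return sum(self._values) / len(self._values) if self._values else 0.0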
Exemplo n.º 14
0
class A2C(Method):

    def __init__(self,sess,settings,netConfigOverride,stateShape,actionSize,nTrajs=1,**kwargs):
        """
        Initializes I/O placeholders used for Tensorflow session runs.
        Initializes an Actor and a Critic network to be used for the purpose of RL.
        """
        #Creating appropriate buffer for the method.
        self.buffer = [Trajectory(depth=5) for _ in range(nTrajs)]

        #Placeholders
        self.sess=sess
        self.HPs = settings["NetworkHPs"]

        self.s = tf.placeholder(dtype=tf.float32, shape=[None]+stateShape, name="state")
        self.a = tf.placeholder(tf.int32, [None,1], "act")
        # self.td_error = tf.placeholder(tf.float32, None, "td_error")  # TD_error
        self.v_ = tf.placeholder(tf.float32, [None, 1], "v_next")
        self.r = tf.placeholder(tf.float32, [None,1], 'r')


        #These need to be returned in the call function of a tf.keras.Model class.
        self.Model = NetworkBuilder(networkConfig=settings["NetworkConfig"],netConfigOverride=netConfigOverride,actionSize=actionSize)

        inputs = {"state":self.s}
        out = self.Model(inputs)
        self.acts_prob = out["actor"]
        self.critic = out["critic"]

        #Defining Training Operations which will be called in the Update Function.
        with tf.variable_scope('Update_Operation'):
            with tf.name_scope('squared_TD_error'):
                self.td_error = self.r + self.HPs["Gamma"] * self.v_ - self.critic
                self.loss = tf.reduce_mean(tf.square(self.td_error))    # TD_error = (r+gamma*V_next) - V_eval

            with tf.name_scope('train_critic'):
                self.train_op_c = tf.train.AdamOptimizer(self.HPs["Critic LR"]).minimize(self.loss)

            with tf.name_scope('exp_v'):
                log_prob = tf.log(self.acts_prob + 1e-5) * tf.one_hot(self.a, actionSize, dtype=tf.float32)
                self.exp_v = tf.reduce_mean(log_prob * self.td_error)  # advantage (TD_error) guided loss

            with tf.name_scope('train_actor'):
                self.train_op_a = tf.train.AdamOptimizer(self.HPs["Actor LR"]).minimize(-self.exp_v)  # minimize(-exp_v) = maximize(exp_v)

            self.update_ops=[self.train_op_c,self.train_op_a]

            self.entropy = -tf.reduce_mean(self.acts_prob * _log(self.acts_prob), name='entropy')

            self.logging_ops = [self.exp_v,self.loss,self.entropy]
            self.labels = ["Loss Actor","Loss Critic","Entropy"]
            self.logging_MA = [MovingAverage(400) for i in range(len(self.logging_ops))]



    def GetAction(self, state, episode=0, step=0):
        """
        Contains the code to run the network based on an input.
        """
        try:
            s = state[np.newaxis, :]
            probs, critic = self.sess.run([self.acts_prob, self.critic], {self.s: s})   # get action probabilities and the value estimate
        except ValueError:
            probs, critic = self.sess.run([self.acts_prob, self.critic], {self.s: state})   # get action probabilities and the value estimate
        return np.random.choice(np.arange(probs.shape[1]), p=probs.ravel()), [critic]   # return an int

    def Update(self, episode=0):
        """
        Takes an input buffer and applies the updates to the networks through gradient
        backpropagation
        """
        samples=0
        for i in range(len(self.buffer)):
            samples +=len(self.buffer[i])
        if samples < 1:
            return

        for traj in range(len(self.buffer)):
            td_target,advantage=self.ProcessBuffer(traj)
            v_ = self.sess.run(self.critic, {self.s: np.vstack(self.buffer[traj][3])})
            feedDict = {self.s: np.vstack(self.buffer[traj][0]),
                        self.v_: v_,
                        self.r: np.vstack(self.buffer[traj][2]),
                        self.a:np.vstack(self.buffer[traj][1])
                        }
            out = self.sess.run(self.update_ops+self.logging_ops, feedDict)   # local grads applied to global net.
            logging = out[len(self.update_ops):]

            for i,log in enumerate(logging):
                self.logging_MA[i].append(log)

        #Clear or reset the buffer.
        self.ClearTrajectory()

    def ProcessBuffer(self,traj):
        """
        Process the buffer to calculate td_target.

        Parameters
        ----------
        traj : int
            Index of the trajectory buffer to process.

        Returns
        -------
        td_target : list
            List Temporal Difference Target for particular states.
        advantage : list
            List of advantages for particular actions.
        """
        split_loc = [i+1 for i, x in enumerate(self.buffer[traj][4]) if x]

        reward_lists = np.split(self.buffer[traj][2],split_loc)
        value_lists = np.split(self.buffer[traj][5],split_loc)

        td_target=[]; advantage=[]
        for rew,value in zip(reward_lists,value_lists):
            td_target_i, advantage_i = gae(rew.reshape(-1),value.reshape(-1).tolist(),0,self.HPs["Gamma"],self.HPs["lambda"])
            td_target.extend(td_target_i); advantage.extend( advantage_i)
        return td_target, advantage

    def GetStatistics(self):
        stats = {}
        for i, label in enumerate(self.labels):
            stats["Training Results/" + label] = self.logging_MA[i]()
        return stats

    @property
    def getVars(self):
        return self.Model.getVars("PPO_Training")
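`Trajectory(depth=N)`, the per-worker buffer used throughout these examples, is likewise not defined here. Indexing by depth slot (`buffer[traj][0]` for states, `[1]` actions, `[2]` rewards, `[4]` done flags, and so on) together with `len()` suggests a list-of-lists container; a minimal sketch under that assumption, where the `append` signature is itself an assumption since sample insertion is not shown:

class Trajectory:
    # Hypothetical trajectory buffer: one list per data slot (state, action, reward, ...).
    def __init__(self, depth):
        self.buffer = [[] for _ in range(depth)]

    def append(self, sample):
        # `sample` is assumed to be a list with one entry per slot.
        for slot, value in zip(self.buffer, sample):
            slot.append(value)

    def clear(self):
        for slot in self.buffer:
            slot.clear()

    def __getitem__(self, index):
        return self.buffer[index]

    def __len__(self):
        return len(self.buffer[0])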
Exemplo n.º 15
0
class PPO(Method):
    def __init__(self,
                 sess,
                 settings,
                 netConfigOverride,
                 stateShape,
                 actionSize,
                 nTrajs=1,
                 **kwargs):
        """
        Initializes a training method for a neural network.

        Parameters
        ----------
        Model : Keras Model Object
            A Keras model object with fully defined layers and a call function. See examples in networks module.
        sess : Tensorflow Session
            Initialized Tensorflow session
        stateShape : list
            List of integers of the inputs shape size. Ex [39,39,6]
        actionSize : int
            Output size of the network.
        HPs : dict
            Dictionary that contains all hyperparameters to be used in the methods training
        nTrajs : int (Optional)
            Number that specifies the number of trajectories to be created for collecting training data.
        scope : str (Optional)
            Name of the PPO method. Used to group and differentiate variables between other networks.

        Returns
        -------
        N/A
        """
        #Processing inputs
        self.actionSize = actionSize
        self.sess = sess
        self.HPs = settings["NetworkHPs"]
        self.Model = NetworkBuilder(networkConfig=settings["NetworkConfig"],
                                    netConfigOverride=netConfigOverride,
                                    actionSize=actionSize)
        scope = "PPO"

        #Creating appropriate buffer for the method.
        self.buffer = [Trajectory(depth=8) for _ in range(nTrajs)]

        with self.sess.as_default(), self.sess.graph.as_default():
            with tf.name_scope(scope):
                #Placeholders
                self.s = tf.placeholder(tf.float32, [None] + stateShape, 'S')
                self.a_his = tf.placeholder(tf.int32, [
                    None,
                ], 'A')
                self.td_target_ = tf.placeholder(tf.float32, [None],
                                                 'TD_target')
                self.advantage_ = tf.placeholder(shape=[None],
                                                 dtype=tf.float32,
                                                 name='adv_hold')
                self.old_log_logits_ = tf.placeholder(shape=[None, actionSize],
                                                      dtype=tf.float32,
                                                      name='old_logit_hold')

                #Initializing Network I/O
                inputs = {"state": self.s}
                out = self.Model(inputs)
                self.a_prob = out["actor"]
                self.v = out["critic"]
                self.log_logits = out["log_logits"]

                # Entropy
                def _log(val):
                    return tf.log(tf.clip_by_value(val, 1e-10, 10.0))

                entropy = self.entropy = -tf.reduce_mean(
                    self.a_prob * _log(self.a_prob), name='entropy')

                # Critic Loss
                td_error = self.td_target_ - self.v
                critic_loss = self.critic_loss = tf.reduce_mean(
                    tf.square(td_error), name='critic_loss')

                # Actor Loss
                action_OH = tf.one_hot(self.a_his,
                                       actionSize,
                                       dtype=tf.float32)
                log_prob = tf.reduce_sum(self.log_logits * action_OH, 1)
                old_log_prob = tf.reduce_sum(self.old_log_logits_ * action_OH,
                                             1)

                # Clipped surrogate function
                ratio = tf.exp(log_prob - old_log_prob)
                surrogate = ratio * self.advantage_
                clipped_surrogate = tf.clip_by_value(
                    ratio, 1 - self.HPs["eps"],
                    1 + self.HPs["eps"]) * self.advantage_
                surrogate_loss = tf.minimum(surrogate,
                                            clipped_surrogate,
                                            name='surrogate_loss')
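                # For intuition: with eps = 0.2, an advantage of +1.0 and a ratio of 1.5,
                # the unclipped surrogate is 1.5 while the clipped term is 1.2 (ratio clipped
                # to [0.8, 1.2]); taking the element-wise minimum keeps the objective at 1.2,
                # limiting how far a single update can move the policy.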
                actor_loss = self.actor_loss = -tf.reduce_mean(
                    surrogate_loss, name='actor_loss')

                loss = self.actor_loss + self.critic_loss * self.HPs[
                    "CriticBeta"]

                # Build Trainer
                if self.HPs["Optimizer"] == "Adam":
                    self.optimizerA = tf.keras.optimizers.Adam(
                        self.HPs["LR Actor"])
                    self.optimizerE = tf.keras.optimizers.Adam(
                        self.HPs["LR Entropy"])
                elif self.HPs["Optimizer"] == "RMS":
                    self.optimizerA = tf.keras.optimizers.RMSprop(
                        self.HPs["LR Actor"])
                    self.optimizerE = tf.keras.optimizers.RMSprop(
                        self.HPs["LR Entropy"])
                elif self.HPs["Optimizer"] == "Adagrad":
                    self.optimizerA = tf.keras.optimizers.Adagrad(
                        self.HPs["LR Actor"])
                    self.optimizerE = tf.keras.optimizers.Adagrad(
                        self.HPs["LR Entropy"])
                elif self.HPs["Optimizer"] == "Adadelta":
                    self.optimizerA = tf.keras.optimizers.Adadelta(
                        self.HPs["LR Actor"])
                    self.optimizerE = tf.keras.optimizers.Adadelta(
                        self.HPs["LR Entropy"])
                elif self.HPs["Optimizer"] == "Adamax":
                    self.optimizerA = tf.keras.optimizers.Adamax(
                        self.HPs["LR Actor"])
                    self.optimizerE = tf.keras.optimizers.Adamax(
                        self.HPs["LR Entropy"])
                elif self.HPs["Optimizer"] == "Nadam":
                    self.optimizerA = tf.keras.optimizers.Nadam(
                        self.HPs["LR Actor"])
                    self.optimizerE = tf.keras.optimizers.Nadam(
                        self.HPs["LR Entropy"])
                elif self.HPs["Optimizer"] == "SGD":
                    self.optimizerA = tf.keras.optimizers.SGD(
                        self.HPs["LR Actor"])
                    self.optimizerE = tf.keras.optimizers.SGD(
                        self.HPs["LR Entropy"])
                elif self.HPs["Optimizer"] == "Amsgrad":
                    # AMSGrad is a variant of Adam; tf.keras.optimizers.Nadam does not
                    # accept an amsgrad argument.
                    self.optimizerA = tf.keras.optimizers.Adam(
                        self.HPs["LR Actor"], amsgrad=True)
                    self.optimizerE = tf.keras.optimizers.Adam(
                        self.HPs["LR Entropy"], amsgrad=True)
                else:
                    print("Not selected a proper Optimizer")
                    exit()
                a_params = self.Model.GetVariables("Actor")
                c_params = self.Model.GetVariables("Critic")
                self.gradients_a = self.optimizerA.get_gradients(
                    loss, self.Model.trainable_variables)
                # capped_gvs = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in self.gradients_a]
                self.update_op_a = self.optimizerA.apply_gradients(
                    zip(self.gradients_a, self.Model.trainable_variables))

                entropy_loss = -self.entropy * self.HPs["EntropyBeta"]
                self.gradients_e = self.optimizerE.get_gradients(
                    entropy_loss, a_params)
                self.update_op_e = self.optimizerE.apply_gradients(
                    zip(self.gradients_e, a_params))

                total_counter = 1
                vanish_counter = 0
                for gradient in self.gradients_a:
                    total_counter += np.prod(gradient.shape)
                    near_zero = tf.reduce_sum(
                        tf.cast(
                            tf.math.less_equal(tf.math.abs(gradient),
                                               tf.constant(1e-8)), tf.int32))
                    vanish_counter += near_zero
                self.vanishing_gradient = vanish_counter / total_counter
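                # Fraction of gradient elements with magnitude <= 1e-8; a value near 1
                # indicates that most gradients are effectively vanishing.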

        self.update_ops = [self.update_op_a, self.update_op_e]
        self.logging_ops = [
            self.actor_loss, self.critic_loss, self.entropy,
            tf.reduce_mean(self.advantage_),
            tf.reduce_mean(ratio), loss, self.vanishing_gradient
        ]
        self.labels = [
            "Loss Actor", "Loss Critic", "Entropy", "Advantage", "PPO Ratio",
            "Loss Total", "Vanishing Gradient"
        ]
        self.logging_MA = [
            MovingAverage(400) for i in range(len(self.logging_ops))
        ]
        self.count_MA = MovingAverage(400)

    def GetAction(self, state, episode=1, step=0):
        """
        Method to run data through the neural network.

        Parameters
        ----------
        state : np.array
            Data with shape [N] + stateShape, where N is the number of samples.

        Returns
        -------
        actions : list[int]
            List of actions based on NN output.
        extraData : list
            List of data that is passed to the execution code to be bundled with state data.
        """
        try:
            probs, log_logits, v = self.sess.run(
                [self.a_prob, self.log_logits, self.v], {self.s: state})
        except ValueError:
            probs, log_logits, v = self.sess.run(
                [self.a_prob, self.log_logits, self.v],
                {self.s: np.expand_dims(state, axis=0)})
        actions = np.array([
            np.random.choice(probs.shape[1], p=prob / sum(prob))
            for prob in probs
        ])

        confid = -np.mean(probs * np.log(probs), axis=1)
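        # confid is proportional to the policy entropy (-mean of p*log p per sample);
        # lower values mean the action distribution is more peaked, i.e. more confident.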
        if step == 0:
            self.store_actions = actions
            self.old_confid = confid
            self.count = 0
            return actions, [v, log_logits, True]
        else:
            if confid < self.old_confid:  # compare inverse entropy
                self.old_confid = confid
                self.store_actions = actions
                self.count_MA.append(self.count)
                self.count = 0
                return actions, [v, log_logits, True]
            else:
                if self.count >= 4:
                    self.old_confid = np.maximum(
                        self.old_confid + self.HPs["ConfidenceAnnealing"],
                        self.HPs["MinConfidence"])
                self.count += 1
                return self.store_actions, [v, log_logits, False]

    def Update(self, episode=0):
        """
        Processes the buffer and backpropagates the losses through the NN.

        Parameters
        ----------
        episode : int (Optional)
            Current episode number.

        Returns
        -------
        N/A
        """
        #Counting number of samples.
        samples = 0
        for i in range(len(self.buffer)):
            samples += len(self.buffer[i])
        if samples < self.HPs["BatchSize"]:
            return

        for traj in range(len(self.buffer)):

            td_target_hier, advantage_hier, actions_hier, ll_hier, s_hier = self.ProcessBuffer(
                traj)

            for epoch in range(self.HPs["Epochs"]):
                for batch in MultiBatchDivider([
                        s_hier, actions_hier, td_target_hier, advantage_hier,
                        ll_hier
                ], self.HPs["MinibatchSize"]):
                    #Staging Buffer inputs into the entries to run through the network.
                    feedDict = {
                        self.s: np.asarray(batch[0]).squeeze(),
                        self.a_his: np.asarray(batch[1]).squeeze(),
                        self.td_target_: np.asarray(batch[2]).squeeze(),
                        self.advantage_: np.reshape(batch[3], [-1]),
                        self.old_log_logits_: np.asarray(batch[4]).squeeze()
                    }
                    out = self.sess.run(
                        self.update_ops + self.logging_ops,
                        feedDict)  # run the update ops and collect logging values
                    logging = out[len(self.update_ops):]

                    for i, log in enumerate(logging):
                        self.logging_MA[i].append(log)

        self.ClearTrajectory()

    def GetStatistics(self):
        stats = {}
        for i, label in enumerate(self.labels):
            stats["Training Results/" + label] = self.logging_MA[i]()

        stats["Training Results/Average Traj Length"] = self.count_MA()
        return stats

    def ProcessBuffer(self, traj):
        """
        Processes the buffer for one trajectory, computing hierarchical TD targets and advantages.

        Parameters
        ----------
        traj : int
            Index of the trajectory in the buffer to process.

        Returns
        -------
        td_target_hier : list
            Temporal difference targets for the hierarchical controller states.
        advantage_hier : list
            Advantages for the hierarchical controller actions.
        actions : list
            Hierarchical actions at the decision points.
        ll : list
            Old log-logits at the decision points.
        s : list
            States at the decision points.
        """
        # Split into different episodes based on the "done" signal. Assumes that episode terminates at done.
        # Cannot account for instances where there are multiple done signals in a row.

        split_loc = [i + 1 for i, x in enumerate(self.buffer[traj][4]) if x]
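        # Example: done flags [False, False, True, False, True] give split_loc = [3, 5],
        # so np.split(..., split_loc[:-1]) below yields one array per completed episode.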

        # reward_lists = np.split(self.buffer[traj][2],split_loc)
        # value_lists = np.split(self.buffer[traj][5],split_loc)
        #
        # td_target=[]; advantage=[]
        # for rew,value in zip(reward_lists,value_lists):
        #     td_target_i, advantage_i = gae(rew.reshape(-1).tolist(),value.reshape(-1).tolist(),0,self.HPs["Gamma"],self.HPs["lambda"])
        #     td_target.extend(td_target_i); advantage.extend( advantage_i)
        # return td_target, advantage

        reward_lists = np.split(self.buffer[traj][2], split_loc[:-1])

        #Data needed for the hierarchical controller update.
        HL_S_lists = np.split(self.buffer[traj][0], split_loc[:-1])
        HL_Critic_lists = np.split(self.buffer[traj][5], split_loc[:-1])
        HL_Logits_lists = np.split(self.buffer[traj][6], split_loc[:-1])
        HL_action_lists = np.split(self.buffer[traj][1], split_loc[:-1])
        HL_flag_lists = np.split(self.buffer[traj][7], split_loc[:-1])

        td_target_hier = []
        advantage_hier = []
        ll = []
        actions = []
        s = []

        for rew, HL_critic, HL_ll, HL_a, HL_flag, HL_s in zip(
                reward_lists, HL_Critic_lists, HL_Logits_lists,
                HL_action_lists, HL_flag_lists, HL_S_lists):
            #Collapsing different trajectory lengths for the hierarchical controller
            split_loc_ = [i for i, x in enumerate(HL_flag[:-1]) if x][1:]
            rew_hier = [np.sum(l) for l in np.split(rew, split_loc_)]
            value_hier = [l[0] for l in np.split(HL_critic, split_loc_)]
            actions.extend([l[0] for l in np.split(HL_a, split_loc_)])
            ll.extend([l[0] for l in np.split(HL_ll, split_loc_)])
            s.extend([l[0] for l in np.split(HL_s, split_loc_)])
            #Calculating the td_target and advantage for the hierarchical controller.
            td_target_i_, advantage_i_ = gae(
                np.asarray(rew_hier).reshape(-1).tolist(),
                np.asarray(value_hier).reshape(-1).tolist(), 0,
                self.HPs["Gamma"], self.HPs["lambda"])
            td_target_hier.extend(td_target_i_)
            advantage_hier.extend(advantage_i_)

        return td_target_hier, advantage_hier, actions, ll, s

    @property
    def getVars(self):
        return self.Model.getVars("PPO_Training")
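
ProcessBuffer above delegates the return and advantage computation to gae(). The project's own gae() is not shown here; the following is a minimal sketch of a Generalized Advantage Estimation routine consistent with the call signature used above (rewards, values, a bootstrap value for the state after the last step, Gamma, lambda). The real implementation may differ, e.g. by normalizing advantages.

def gae_sketch(rewards, values, bootstrap, gamma, lam):
    """Hypothetical GAE helper: delta_t = r_t + gamma*V(s_{t+1}) - V(s_t),
    advantages are the discounted (gamma*lambda) sum of deltas accumulated
    backwards, and td_target_t = advantage_t + V(s_t)."""
    values = list(values) + [bootstrap]
    advantages = [0.0] * len(rewards)
    running = 0.0
    for t in reversed(range(len(rewards))):
        delta = rewards[t] + gamma * values[t + 1] - values[t]
        running = delta + gamma * lam * running
        advantages[t] = running
    td_target = [adv + v for adv, v in zip(advantages, values[:-1])]
    return td_target, advantages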
Exemplo n.º 16
0
class MAML(Method):

    def __init__(self,sess,settings,netConfigOverride,stateShape,actionSize,nTrajs=1,**kwargs):
        """
        Initializes a training method for a neural network.

        Parameters
        ----------
        Model : Keras Model Object
            A Keras model object with fully defined layers and a call function. See examples in networks module.
        sess : Tensorflow Session
            Initialized Tensorflow session
        stateShape : list
            List of integers of the inputs shape size. Ex [39,39,6]
        actionSize : int
            Output size of the network.
        nTrajs : int (Optional)
            Number that specifies the number of trajectories to be created for collecting training data.
        scope : str (Optional)
            Name of the PPO method. Used to group and differentiate variables between other networks.

        Returns
        -------
        N/A
        """
        #Processing inputs
        self.actionSize = actionSize
        self.sess=sess
        self.HPs = settings["NetworkHPs"]
        self.Model = NetworkBuilder(networkConfig=settings["NetworkConfig"],netConfigOverride=netConfigOverride,actionSize=actionSize,scope="local")
        self.Model2 = NetworkBuilder(networkConfig=settings["NetworkConfig"],netConfigOverride=netConfigOverride,actionSize=actionSize,scope="global")
        self.scope = scope = "MAML"
        #Creating appropriate buffer for the method.
        self.buffer = [Trajectory(depth=7) for _ in range(nTrajs)]

        with self.sess.as_default(), self.sess.graph.as_default():
            with tf.name_scope("MAML"):
                #Placeholders
                if len(stateShape) == 4:
                    self.s = tf.placeholder(tf.float32, [None]+stateShape[1:4], 'S')
                else:
                    self.s = tf.placeholder(tf.float32, [None]+stateShape, 'S')
                self.a_his = tf.placeholder(tf.int32, [None, ], 'A')
                self.td_target_ = tf.placeholder(tf.float32, [None], 'Vtarget')
                self.advantage_ = tf.placeholder(shape=[None], dtype=tf.float32, name='adv_hold')
                self.old_log_logits_ = tf.placeholder(shape=[None, actionSize], dtype=tf.float32, name='old_logit_hold')

                #Initializing Network I/O
                inputs = {"state":self.s}
                out = self.Model(inputs)
                _ = self.Model2(inputs)
                self.a_prob = out["actor"]
                self.v = out["critic"]
                self.log_logits = out["log_logits"]
                # Entropy
                def _log(val):
                    return tf.log(tf.clip_by_value(val, 1e-10, 10.0))
                entropy = self.entropy = -tf.reduce_mean(self.a_prob * _log(self.a_prob), name='entropy')

                # Critic Loss
                td_error = self.td_target_ - self.v
                critic_loss = self.critic_loss = tf.reduce_mean(tf.square(td_error), name='critic_loss')

                # Actor Loss
                action_OH = tf.one_hot(self.a_his, actionSize, dtype=tf.float32)
                log_prob = tf.reduce_sum(self.log_logits * action_OH, 1)
                old_log_prob = tf.reduce_sum(self.old_log_logits_ * action_OH, 1)

                # Clipped surrogate function
                ratio = tf.exp(log_prob - old_log_prob)
                surrogate = ratio * self.advantage_
                clipped_surrogate = tf.clip_by_value(ratio, 1-self.HPs["eps"], 1+self.HPs["eps"]) * self.advantage_
                surrogate_loss = tf.minimum(surrogate, clipped_surrogate, name='surrogate_loss')
                actor_loss = self.actor_loss = -tf.reduce_mean(surrogate_loss, name='actor_loss')

                actor_loss = actor_loss - entropy * self.HPs["EntropyBeta"]
                loss = actor_loss + critic_loss * self.HPs["CriticBeta"]

                # Build Trainer
                if self.HPs["Optimizer"] == "Adam":
                    self.optimizer = tf.keras.optimizers.Adam(self.HPs["LR"])
                    self.metaOptimizer = tf.keras.optimizers.Adam(self.HPs["Meta LR"])
                elif self.HPs["Optimizer"] == "RMS":
                    self.optimizer = tf.keras.optimizers.RMSprop(self.HPs["LR"])
                    self.metaOptimizer = tf.keras.optimizers.RMSprop(self.HPs["Meta LR"])
                elif self.HPs["Optimizer"] == "Adagrad":
                    self.optimizer = tf.keras.optimizers.Adagrad(self.HPs["LR"])
                    self.metaOptimizer = tf.keras.optimizers.Adagrad(self.HPs["Meta LR"])
                elif self.HPs["Optimizer"] == "Adadelta":
                    self.optimizer = tf.keras.optimizers.Adadelta(self.HPs["LR"])
                    self.metaOptimizer = tf.keras.optimizers.Adadelta(self.HPs["Meta LR"])
                elif self.HPs["Optimizer"] == "Adamax":
                    self.optimizer = tf.keras.optimizers.Adamax(self.HPs["LR"])
                    self.metaOptimizer = tf.keras.optimizers.Adamax(self.HPs["Meta LR"])
                elif self.HPs["Optimizer"] == "Nadam":
                    self.optimizer = tf.keras.optimizers.Nadam(self.HPs["LR"])
                    self.metaOptimizer = tf.keras.optimizers.Nadam(self.HPs["Meta LR"])
                elif self.HPs["Optimizer"] == "SGD":
                    self.optimizer = tf.keras.optimizers.SGD(self.HPs["LR"])
                    self.metaOptimizer = tf.keras.optimizers.SGD(self.HPs["Meta LR"])
                elif self.HPs["Optimizer"] == "Amsgrad":
                    # AMSGrad is a variant of Adam; Nadam does not accept amsgrad=True.
                    self.optimizer = tf.keras.optimizers.Adam(self.HPs["LR"],amsgrad=True)
                    self.metaOptimizer = tf.keras.optimizers.Adam(self.HPs["Meta LR"],amsgrad=True)
                else:
                    print("Not selected a proper Optimizer")
                    exit()

                vars1 = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope+'/local')
                self.gradients = self.optimizer.get_gradients(loss, vars1)
                self.update_ops = self.optimizer.apply_gradients(zip(self.gradients, vars1))

                with tf.name_scope("MetaUpdater"):
                    vars2 = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope+'/global')
                    self.meta_update_ops = self.metaOptimizer.apply_gradients(zip(self.gradients, vars2))

                with tf.name_scope('sync'):
                    self.pull_params_op = [l_p.assign(g_p) for l_p, g_p in zip(vars1,vars2)]
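                    # Copies the global (meta) parameters into the local network;
                    # next_task() runs this when switching to a new task.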

        #Creating variables for logging.
        self.EntropyMA = MovingAverage(400)
        self.CriticLossMA = MovingAverage(400)
        self.ActorLossMA = MovingAverage(400)
        self.GradMA = MovingAverage(400)
        self.counter = 0

    def next_task(self):
        if self.counter > 3:
            self.counter = 0
            # self.sess.run(self.update_op)
            self.sess.run(self.pull_params_op)
            return True
        else:
            return False


    def GetAction(self, state, episode=1,step=0):
        """
        Method to run data through the neural network.

        Parameters
        ----------
        state : np.array
            Data with shape [N] + stateShape, where N is the number of samples.

        Returns
        -------
        actions : list[int]
            List of actions based on NN output.
        extraData : list
            List of data that is passed to the execution code to be bundled with state data.
        """
        try:
            probs,log_logits,v = self.sess.run([self.a_prob,self.log_logits,self.v], {self.s: state})
        except ValueError:
            probs,log_logits,v = self.sess.run([self.a_prob,self.log_logits,self.v], {self.s: np.expand_dims(state,axis=0)})
        actions = np.array([np.random.choice(probs.shape[1], p=prob / sum(prob)) for prob in probs])
        return actions, [v,log_logits]

    def Update(self,episode=0):
        """
        Processes the buffer and backpropagates the losses through the NN.

        Parameters
        ----------
        episode : int (Optional)
            Current episode number.

        Returns
        -------
        N/A
        """
        samples=0
        for i in range(len(self.buffer)):
            samples +=len(self.buffer[i])
        if samples < self.HPs["BatchSize"]:
            return

        for traj in range(len(self.buffer)):

            #Finding if there are more than 1 done in the sequence. Clipping values if required.

            td_target, advantage = self.ProcessBuffer(traj)

            batches = len(self.buffer[traj][0])//self.HPs["MinibatchSize"]+1
            s = np.array_split( self.buffer[traj][0], batches)
            a_his = np.array_split( np.asarray(self.buffer[traj][1]).reshape(-1), batches)
            td_target_ = np.array_split( td_target, batches)
            advantage_ = np.array_split( np.reshape(advantage, [-1]), batches)
            old_log_logits_ = np.array_split( np.reshape(self.buffer[traj][6], [-1,self.actionSize]), batches)

            #Create a dictionary with all of the samples?
            #Use a sampler to feed the update operation?

            #Staging Buffer inputs into the entries to run through the network.
            # print(td_target)
            for epoch in range(self.HPs["Epochs"]):
                for i in range(batches):

                    feed_dict = {self.s: np.squeeze(np.asarray(s[i])),
                                 self.a_his: np.asarray(a_his[i]),
                                 self.td_target_:np.asarray(td_target_[i]),
                                 self.advantage_: np.asarray(advantage_[i]),
                                 self.old_log_logits_: np.asarray(old_log_logits_[i])}
                    # aLoss= self.sess.run([self.actor_loss], feed_dict)
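                    # Every fourth Update (self.counter == 3) applies the gradients to the
                    # global (meta) parameters via meta_update_ops; all other calls only
                    # adapt the local network.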
                    if self.counter == 3:
                        aLoss, cLoss, entropy,grads, _ = self.sess.run([self.actor_loss,self.critic_loss,self.entropy,self.gradients,self.meta_update_ops], feed_dict)
                    else:
                        aLoss, cLoss, entropy,grads, _ = self.sess.run([self.actor_loss,self.critic_loss,self.entropy,self.gradients,self.update_ops], feed_dict)

                    self.EntropyMA.append(entropy)
                    self.CriticLossMA.append(cLoss)
                    self.ActorLossMA.append(aLoss)
                    total_counter = 0
                    vanish_counter = 0
                    for grad in grads:
                        total_counter += np.prod(grad.shape)
                        vanish_counter += (np.absolute(grad)<1e-8).sum()
                    self.GradMA.append(vanish_counter/total_counter)
        self.counter += 1
        self.ClearTrajectory()


    def GetStatistics(self):
        dict = {"Training Results/Entropy":self.EntropyMA(),
        "Training Results/Loss Critic":self.CriticLossMA(),
        "Training Results/Loss Actor":self.ActorLossMA(),
        "Training Results/Vanishing Gradient":self.GradMA(),}
        return dict


    def ProcessBuffer(self,traj):
        """
        Processes the buffer for one trajectory, computing TD targets and advantages.

        Parameters
        ----------
        traj : int
            Index of the trajectory in the buffer to process.

        Returns
        -------
        td_target : list
            Temporal difference targets for the stored states.
        advantage : list
            Advantages for the stored actions.
        """

        split_loc = [i+1 for i, x in enumerate(self.buffer[traj][4]) if x]

        reward_lists = np.split(self.buffer[traj][2],split_loc)
        value_lists = np.split(self.buffer[traj][5],split_loc)

        td_target=[]; advantage=[]
        for rew,value in zip(reward_lists,value_lists):
            td_target_i, advantage_i = gae(rew.reshape(-1),value.reshape(-1).tolist(),0,self.HPs["Gamma"],self.HPs["lambda"])
            td_target.extend(td_target_i); advantage.extend( advantage_i)
        return td_target, advantage

    @property
    def getVars(self):
        return self.Model.getVars("PPO_Training")
Exemplo n.º 17
0
class PPO(Method):

    def __init__(self,sess,settings,netConfigOverride,stateShape,actionSize,nTrajs=1,**kwargs):
        """
        Initializes a training method for a neural network.

        Parameters
        ----------
        Model : Keras Model Object
            A Keras model object with fully defined layers and a call function. See examples in networks module.
        sess : Tensorflow Session
            Initialized Tensorflow session
        stateShape : list
            List of integers of the inputs shape size. Ex [39,39,6]
        actionSize : int
            Output size of the network.
        HPs : dict
            Dictionary that contains all hyperparameters to be used in the methods training
        nTrajs : int (Optional)
            Number that specifies the number of trajectories to be created for collecting training data.
        scope : str (Optional)
            Name of the PPO method. Used to group and differentiate variables between other networks.

        Returns
        -------
        N/A
        """
        #Processing inputs
        self.actionSize = actionSize
        self.sess=sess
        self.HPs = settings["NetworkHPs"]
        self.Model = NetworkBuilder(networkConfig=settings["NetworkConfig"],netConfigOverride=netConfigOverride,actionSize=actionSize)
        scope="PPO"

        #Creating appropriate buffer for the method.
        self.buffer = [Trajectory(depth=7) for _ in range(nTrajs)]

        with self.sess.as_default(), self.sess.graph.as_default():
            with tf.name_scope(scope):
                #Placeholders
                self.s = tf.placeholder(tf.float32, [None]+stateShape, 'S')
                self.a_his = tf.placeholder(tf.int32, [None, ], 'A')
                self.td_target_ = tf.placeholder(tf.float32, [None], 'TD_target')
                self.advantage_ = tf.placeholder(shape=[None], dtype=tf.float32, name='adv_hold')
                self.old_log_logits_ = tf.placeholder(shape=[None, actionSize], dtype=tf.float32, name='old_logit_hold')

                #Initializing Network I/O
                inputs = {"state":self.s}
                out = self.Model(inputs)
                self.a_prob = out["actor"]
                self.v = out["critic"]
                self.log_logits = out["log_logits"]

                # Entropy
                def _log(val):
                    return tf.log(tf.clip_by_value(val, 1e-10, 10.0))
                entropy = self.entropy = -tf.reduce_mean(self.a_prob * _log(self.a_prob), name='entropy')

                # Critic Loss
                td_error = self.td_target_ - self.v
                critic_loss = self.critic_loss = tf.reduce_mean(tf.square(td_error), name='critic_loss')

                # Actor Loss
                action_OH = tf.one_hot(self.a_his, actionSize, dtype=tf.float32)
                log_prob = tf.reduce_sum(self.log_logits * action_OH, 1)
                old_log_prob = tf.reduce_sum(self.old_log_logits_ * action_OH, 1)

                # Clipped surrogate function
                ratio =self.ratio= tf.exp(log_prob - old_log_prob)
                surrogate = ratio * self.advantage_
                clipped_surrogate = tf.clip_by_value(ratio, 1-self.HPs["eps"], 1+self.HPs["eps"]) * self.advantage_
                surrogate_loss = tf.minimum(surrogate, clipped_surrogate, name='surrogate_loss')
                actor_loss = self.actor_loss = -tf.reduce_mean(surrogate_loss, name='actor_loss')

                actor_loss = actor_loss - entropy * self.HPs["EntropyBeta"]
                loss = actor_loss + critic_loss * self.HPs["CriticBeta"]

                # Build Trainer
                self.optimizer = tf.keras.optimizers.Adam(self.HPs["LR"])
                self.gradients = self.optimizer.get_gradients(loss, self.Model.trainable_variables)
                self.update_ops = self.optimizer.apply_gradients(zip(self.gradients, self.Model.trainable_variables))

        #Creating variables for logging.
        self.EntropyMA = MovingAverage(400)
        self.CriticLossMA = MovingAverage(400)
        self.ActorLossMA = MovingAverage(400)
        self.GradMA = MovingAverage(400)

    def GetAction(self, state, episode=1,step=0):
        """
        Method to run data through the neural network.

        Parameters
        ----------
        state : np.array
            Data with shape [N] + stateShape, where N is the number of samples.

        Returns
        -------
        actions : list[int]
            List of actions based on NN output.
        extraData : list
            List of data that is passed to the execution code to be bundled with state data.
        """
        try:
            probs,log_logits,v = self.sess.run([self.a_prob,self.log_logits,self.v], {self.s: state})
        except ValueError:
            probs,log_logits,v = self.sess.run([self.a_prob,self.log_logits,self.v], {self.s: np.expand_dims(state,axis=0)})
        actions = np.array([np.random.choice(probs.shape[1], p=prob / sum(prob)) for prob in probs])
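
        # Frame skipping: a fresh action is sampled every self.HPs["FS"] steps;
        # in between, the previously stored action is repeated.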

        if step % self.HPs["FS"] == 0:
            self.store_actions = actions
            return actions, [v,log_logits]
        else:
            return self.store_actions, [v,log_logits]

    def Update(self,episode=0):
        """
        Processes the buffer and backpropagates the losses through the NN.

        Parameters
        ----------
        episode : int (Optional)
            Current episode number.

        Returns
        -------
        N/A
        """
        #Counting number of samples.
        samples=0
        for i in range(len(self.buffer)):
            samples +=len(self.buffer[i])
        if samples < self.HPs["BatchSize"]:
            return

        for traj in range(len(self.buffer)):

            td_target, advantage = self.ProcessBuffer(traj)

            batches = len(self.buffer[traj][0])//self.HPs["MinibatchSize"]+1
            s = np.array_split( self.buffer[traj][0], batches)
            a_his = np.array_split( np.asarray(self.buffer[traj][1]).reshape(-1), batches)
            td_target_ = np.array_split( td_target, batches)
            advantage_ = np.array_split( np.reshape(advantage, [-1]), batches)
            old_log_logits_ = np.array_split( np.reshape(self.buffer[traj][6], [-1,self.actionSize]), batches)

            for epoch in range(self.HPs["Epochs"]):
                for i in range(batches):
                    #Staging Buffer inputs into the entries to run through the network.
                    feed_dict = {self.s: np.squeeze(s[i]),
                                 self.a_his: a_his[i],
                                 self.td_target_:td_target_[i],
                                 self.advantage_: advantage_[i],
                                 self.old_log_logits_: old_log_logits_[i]}
                    aLoss, cLoss, entropy,grads, _ = self.sess.run([self.actor_loss,self.critic_loss,self.entropy,self.gradients,self.update_ops], feed_dict)

                    self.EntropyMA.append(entropy)
                    self.CriticLossMA.append(cLoss)
                    self.ActorLossMA.append(aLoss)
                    total_counter = 0
                    vanish_counter = 0
                    for grad in grads:
                        total_counter += np.prod(grad.shape)
                        vanish_counter += (np.absolute(grad)<1e-8).sum()
                    self.GradMA.append(vanish_counter/total_counter)

        self.ClearTrajectory()


    def GetStatistics(self):
        dict = {"Training Results/Entropy":self.EntropyMA(),
        "Training Results/Loss Critic":self.CriticLossMA(),
        "Training Results/Loss Actor":self.ActorLossMA(),
        "Training Results/Vanishing Gradient":self.GradMA(),}
        return dict


    def ProcessBuffer(self,traj):
        """
        Processes the buffer for one trajectory, computing TD targets and advantages.

        Parameters
        ----------
        traj : int
            Index of the trajectory in the buffer to process.

        Returns
        -------
        td_target : list
            Temporal difference targets for the stored states.
        advantage : list
            Advantages for the stored actions.
        """
        # Split into different episodes based on the "done" signal. Assumes that episode terminates at done.
        # Cannot account for instances where there are multiple done signals in a row.

        split_loc = [i+1 for i, x in enumerate(self.buffer[traj][4]) if x]

        reward_lists = np.split(self.buffer[traj][2],split_loc)
        value_lists = np.split(self.buffer[traj][5],split_loc)

        td_target=[]; advantage=[]
        for rew,value in zip(reward_lists,value_lists):
            td_target_i, advantage_i = gae(rew.reshape(-1).tolist(),value.reshape(-1).tolist(),0,self.HPs["Gamma"],self.HPs["lambda"])
            td_target.extend(td_target_i); advantage.extend( advantage_i)
        return td_target, advantage

    @property
    def getVars(self):
        return self.Model.getVars("PPO_Training")
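
A hypothetical driver loop for the PPO class above is sketched below. The trajectory layout is inferred from Update/ProcessBuffer (slot 0: state, 1: action, 2: reward, 4: done, 5: value, 6: old log-logits; slot 3 is assumed to hold the next state), and the env interface and Trajectory.append call are assumptions rather than the project's actual API.

# Hypothetical training loop; names such as env, nEpisodes and maxSteps are assumed.
ppo = PPO(sess, settings, netConfigOverride={}, stateShape=[39, 39, 6], actionSize=5)
sess.run(tf.global_variables_initializer())
for episode in range(nEpisodes):
    s0 = env.reset()
    for step in range(maxSteps):
        a, (v, log_logits) = ppo.GetAction(np.expand_dims(s0, 0), episode, step)
        s1, r, done, _ = env.step(a[0])
        # Assumed Trajectory API: one entry per buffer slot, matching depth=7.
        ppo.buffer[0].append([s0, a[0], r, s1, done, v[0], log_logits[0]])
        s0 = s1
        if done:
            break
    ppo.Update(episode)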
Exemplo n.º 18
0
def ApeXWorkers(sess, settings, netConfigOverride):

    EXP_NAME = settings["RunName"]
    MODEL_PATH = './models/' + EXP_NAME
    LOG_PATH = './logs/' + EXP_NAME
    CreatePath(LOG_PATH)
    CreatePath(MODEL_PATH)

    for (dirpath, dirnames, filenames) in os.walk("configs/environment"):
        for filename in filenames:
            if settings["EnvConfig"] == filename:
                envConfigFile = os.path.join(dirpath, filename)
                break
    with open(envConfigFile) as json_file:
        envSettings = json.load(json_file)

    progbar = tf.keras.utils.Progbar(None,
                                     unit_name='Training',
                                     stateful_metrics=["Reward"])
    writer = tf.summary.FileWriter(LOG_PATH, graph=sess.graph)
    global_step = tf.Variable(0, trainable=False, name='global_step')
    global_step_next = tf.assign_add(global_step, 1)

    workers = []

    sharedBuffer = ApexBuffer()
    _, dFeatures, nActions, nTrajs = CreateEnvironment(envSettings,
                                                       multiprocessing=1)

    network = NetworkBuilder(settings["NetworkConfig"],
                             netConfigOverride,
                             scope="Global",
                             actionSize=nActions)
    targetNetwork = NetworkBuilder(settings["NetworkConfig"],
                                   netConfigOverride,
                                   scope="target",
                                   actionSize=nActions)
    Updater = ApeX(network,
                   sess,
                   stateShape=dFeatures,
                   actionSize=nActions,
                   scope="Global",
                   HPs=settings["NetworkHPs"],
                   sharedBuffer=sharedBuffer,
                   targetNetwork=targetNetwork)
    Updater.Model.summary()
    saver = tf.train.Saver(max_to_keep=3,
                           var_list=Updater.getVars + [global_step])
    Updater.InitializeVariablesFromFile(saver, MODEL_PATH)
    workers.append(
        WorkerLearner(Updater, sess, global_step, global_step_next, settings,
                      progbar, writer, MODEL_PATH, saver))

    i_name = "prioritizer"
    network = NetworkBuilder(settings["NetworkConfig"],
                             netConfigOverride,
                             scope=i_name,
                             actionSize=nActions)
    localNetwork = ApeX(network,
                        sess,
                        stateShape=dFeatures,
                        actionSize=nActions,
                        scope=i_name,
                        HPs=settings["NetworkHPs"],
                        globalAC=Updater,
                        nTrajs=nTrajs,
                        sharedBuffer=sharedBuffer)
    localNetwork.InitializeVariablesFromFile(saver, MODEL_PATH)
    workers.append(WorkerPrioritizer(localNetwork, sess, global_step,
                                     settings))

    # Create workers
    for i in range(settings["NumberENV"]):
        i_name = 'W_%i' % i  # worker name
        network = NetworkBuilder(settings["NetworkConfig"],
                                 netConfigOverride,
                                 scope=i_name,
                                 actionSize=nActions)
        localNetwork = ApeX(network,
                            sess,
                            stateShape=dFeatures,
                            actionSize=nActions,
                            scope=i_name,
                            HPs=settings["NetworkHPs"],
                            globalAC=Updater,
                            nTrajs=nTrajs,
                            sharedBuffer=sharedBuffer)
        localNetwork.InitializeVariablesFromFile(saver, MODEL_PATH)
        env, _, _, _ = CreateEnvironment(envSettings, multiprocessing=1)
        workers.append(
            WorkerActor(localNetwork, env, sess, global_step, global_step_next,
                        settings, progbar, writer, MODEL_PATH, saver))

    return workers
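
ApeXWorkers only constructs the learner, prioritizer and actor workers; it does not start them. A hedged sketch of how they might be driven follows, assuming each worker exposes a Work() method and using one thread per worker (both assumptions, since the Worker classes are not shown here).

import threading

def run_apex_workers(sess, settings, netConfigOverride):
    # Hypothetical driver: build the workers, then run each worker's assumed
    # Work() loop on its own thread and wait for all of them to finish.
    workers = ApeXWorkers(sess, settings, netConfigOverride)
    threads = [threading.Thread(target=w.Work) for w in workers]
    for t in threads:
        t.start()
    for t in threads:
        t.join()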
Exemplo n.º 19
0
class DQN_ms_v2(Method):

    def __init__(self,sess,settings,netConfigOverride,stateShape,actionSize,nTrajs=1,**kwargs):
        """
        Initializes I/O placeholders and the training process of a Multi-step DQN.
        The main principle is that, instead of a one-step TD difference, the loss is
        evaluated on a temporally extended (n-step) return:
        G = R_t + γ R_{t+1} + ... + γ^{n-1} R_{t+n-1} + γ^n q(S_{t+n}, a*, θ-)
        loss = MSE(G, q(S_t, A_t, θ))

        """
        #Placeholders
        self.actionSize = actionSize
        self.sess=sess
        self.scope="worker"
        self.HPs = settings["NetworkHPs"]
        self.Model = NetworkBuilder(networkConfig=settings["NetworkConfig"],netConfigOverride=netConfigOverride,actionSize=actionSize,scope="worker")
        self.Model_ = NetworkBuilder(networkConfig=settings["NetworkConfig"],netConfigOverride=netConfigOverride,actionSize=actionSize,scope="target")

        self.buffer = [Trajectory(depth=5) for _ in range(nTrajs)]
        with self.sess.as_default(), self.sess.graph.as_default():
            self.states_ = tf.placeholder(shape=[None]+stateShape, dtype=tf.float32, name='states')
            self.next_states_ = tf.placeholder(shape=[None]+stateShape, dtype=tf.float32, name='next_states')
            self.actions_ = tf.placeholder(shape=[None], dtype=tf.int32, name='actions_hold')
            self.rewards_ = tf.placeholder(shape=[None], dtype=tf.float32, name='rewards_hold')
            self.done_ = tf.placeholder(shape=[None], dtype=tf.float32, name='done_hold')
            with tf.name_scope("target"):
                out2 = self.Model_({"state":self.next_states_})
                q_next = out2["Q"]
                self.targetParams = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, "target")
            with tf.name_scope(self.scope):
                input = {"state":self.states_}
                out = self.Model(input)
                self.q = out["Q"]

                with tf.name_scope('current_Q'):
                    oh_action = tf.one_hot(self.actions_, actionSize, dtype=tf.float32) # [?, num_agent, action_size]
                    curr_q = tf.reduce_sum(tf.multiply(self.q, oh_action), axis=-1) # [?, num_agent]

                with tf.name_scope('target_Q'):
                    max_next_q = tf.reduce_max(q_next, axis=-1)
                    td_target = self.rewards_  + self.HPs["Gamma"] * max_next_q
                    # td_target = self.rewards_  + self.HPs["Gamma"] * max_next_q * (1. - self.done_)

                with tf.name_scope('td_error'):
                    loss = tf.keras.losses.MSE(td_target, curr_q)
                    softmax_q = tf.nn.softmax(curr_q)
                    self.entropy = -tf.reduce_mean(softmax_q * tf.log(softmax_q+ 1e-5))
                    self.loss=total_loss = loss + self.HPs["EntropyBeta"] * self.entropy

                if self.HPs["Optimizer"] == "Adam":
                    self.optimizer = tf.keras.optimizers.Adam(self.HPs["LR"])
                elif self.HPs["Optimizer"] == "RMS":
                    self.optimizer = tf.keras.optimizers.RMSprop(self.HPs["LR"])
                elif self.HPs["Optimizer"] == "Adagrad":
                    self.optimizer = tf.keras.optimizers.Adagrad(self.HPs["LR"])
                elif self.HPs["Optimizer"] == "Adadelta":
                    self.optimizer = tf.keras.optimizers.Adadelta(self.HPs["LR"])
                elif self.HPs["Optimizer"] == "Adamax":
                    self.optimizer = tf.keras.optimizers.Adamax(self.HPs["LR"])
                elif self.HPs["Optimizer"] == "Nadam":
                    self.optimizer = tf.keras.optimizers.Nadam(self.HPs["LR"])
                elif self.HPs["Optimizer"] == "SGD":
                    self.optimizer = tf.keras.optimizers.SGD(self.HPs["LR"])
                elif self.HPs["Optimizer"] == "Amsgrad":
                    # AMSGrad is a variant of Adam; Nadam does not accept amsgrad=True.
                    self.optimizer = tf.keras.optimizers.Adam(self.HPs["LR"],amsgrad=True)
                else:
                    print("Not selected a proper Optimizer")
                    exit()
                self.workerParams = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope)

                self.gradients = self.optimizer.get_gradients(total_loss, self.workerParams)
                self.update_op = self.optimizer.apply_gradients(zip(self.gradients, self.workerParams))

                with tf.name_scope('push'):
                    self.push_ops = [l_p.assign(g_p) for l_p, g_p in zip(self.targetParams, self.workerParams)]

                self.grads=[self.gradients]
                self.losses=[self.loss]
                self.update_ops=[self.update_op]

        self.grad_MA = [MovingAverage(400) for i in range(len(self.grads))]
        self.loss_MA = [MovingAverage(400) for i in range(len(self.losses))]
        self.labels = ["Critic"]

    def GetAction(self, state,episode,step):
        """
        Contains the code to run the network based on an input.
        """
        if len(state.shape) == 3:
            state = state[np.newaxis, :]
        if len(state.shape) == 1:
            state = state[np.newaxis, :]
        q = self.sess.run(self.q, {self.states_: state})
        if "Exploration" in self.HPs:
            if self.HPs["Exploration"]=="EGreedy":
                prob = self.HPs["ExploreSS"] + (1-self.HPs["ExploreSS"])*(np.exp(-episode/self.HPs["ExplorationDecay"]))
                if random.uniform(0, 1) < prob:
                    actions = random.randint(0,self.actionSize-1)
                else:
                    actions = np.argmax(q, axis=-1)
            else:
                actions = np.argmax(q, axis=-1)
        else:
            actions = np.argmax(q, axis=-1)
        return actions ,[]  # return a int and extra data that needs to be fed to buffer.

    def Update(self,episode=0):
        """
        The main update function for the multi-step DQN: computes n-step returns for each
        trajectory, applies gradient updates to the worker network, and finally copies the
        worker weights to the target network.
        """
        #Checking that there is enough data for a batch
        samples=0
        for i in range(len(self.buffer)):
            samples +=len(self.buffer[i])
        if samples < self.HPs["BatchSize"]:
            return

        #Combining all trajs into 1:
        s_list = []
        a_list = []
        done_list = []
        g_list = []
        s_n_list = []
        for traj in range(len(self.buffer)):
            g,s_n=MultiStepDiscountProcessing(self.buffer[traj][2],self.buffer[traj][3],self.HPs["Gamma"],self.HPs["MultiStep"])
            s_list.extend(self.buffer[traj][0])
            a_list.extend(self.buffer[traj][1])
            g_list.extend(g)
            s_n_list.extend(s_n)
            done_list.extend(self.buffer[traj][4])

        #Separating into different batches
        batches = len(s_list)//self.HPs["MinibatchSize"]+1
        s = np.array_split( s_list, batches)
        a_his = np.array_split( np.asarray(a_list).reshape(-1), batches)
        r = np.array_split( np.asarray(g_list).reshape(-1), batches)
        s_next = np.array_split( s_n_list, batches)
        done = np.array_split( done_list, batches)

        #Running all batches through multiple epochs
        for epoch in range(self.HPs["Epochs"]):
            for i in range(batches):
            #Create a feedDict from the buffer
                feedDict = {
                    self.states_ : np.squeeze(np.asarray(s[i])),
                    self.next_states_ : np.squeeze(np.asarray(s_next[i])),
                    self.actions_ : np.squeeze(np.asarray(a_his[i])),
                    self.rewards_ : np.squeeze(np.asarray(r[i])),
                    self.done_ : np.squeeze(np.asarray(done[i],dtype=float))

                }
                out = self.sess.run(self.update_ops+self.losses+self.grads, feedDict)
                out = np.array_split(out,3)
                losses = out[1]
                grads = out[2]

                for i,loss in enumerate(losses):
                    self.loss_MA[i].append(loss)

                for i,grads_i in enumerate(grads):
                    total_counter = 1
                    vanish_counter = 0
                    for grad in grads_i:
                        total_counter += np.prod(grad.shape)
                        vanish_counter += (np.absolute(grad)<1e-8).sum()
                    self.grad_MA[i].append(vanish_counter/total_counter)

        self.ClearTrajectory()
        self.sess.run(self.push_ops, feedDict)


    def GetStatistics(self):
        stats = {}
        for i, label in enumerate(self.labels):
            stats["Training Results/Vanishing Gradient " + label] = self.grad_MA[i]()
            stats["Training Results/Loss " + label] = self.loss_MA[i]()
        return stats

    @property
    def getVars(self):
        return self.Model.getVars(self.scope)
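
Update above relies on MultiStepDiscountProcessing to turn per-step rewards into n-step returns and matching bootstrap states. That helper is not shown here; the following is a minimal sketch of what it presumably computes, under the assumption that it returns the discounted n-step reward sum for each timestep together with the state used for bootstrapping (the real implementation may handle trajectory ends or the bootstrap discount differently).

import numpy as np

def multi_step_discount_sketch(rewards, next_states, gamma, n):
    """Hypothetical n-step return helper: for each t, sum the next n rewards with
    discount gamma and pair the result with the state observed n steps ahead,
    clipped to the end of the trajectory."""
    returns, bootstrap_states = [], []
    T = len(rewards)
    for t in range(T):
        horizon = min(n, T - t)
        g = sum((gamma ** k) * rewards[t + k] for k in range(horizon))
        returns.append(g)
        bootstrap_states.append(next_states[min(t + n - 1, T - 1)])
    return returns, bootstrap_states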
Exemplo n.º 20
0
    def __init__(self,
                 sess,
                 settings,
                 netConfigOverride,
                 stateShape,
                 actionSize,
                 nTrajs=1,
                 **kwargs):
        """
        Initializes a training method for a neural network.

        Parameters
        ----------
        Model : Keras Model Object
            A Keras model object with fully defined layers and a call function. See examples in networks module.
        sess : Tensorflow Session
            Initialized Tensorflow session
        stateShape : list
            List of integers of the inputs shape size. Ex [39,39,6]
        actionSize : int
            Output size of the network.
        HPs : dict
            Dictionary that contains all hyperparameters to be used in the methods training
        nTrajs : int (Optional)
            Number that specifies the number of trajectories to be created for collecting training data.
        scope : str (Optional)
            Name of the PPO method. Used to group and differentiate variables between other networks.

        Returns
        -------
        N/A
        """
        #Processing inputs
        self.actionSize = actionSize
        self.sess = sess
        self.HPs = settings["NetworkHPs"]

        #Building the network.
        self.Model = NetworkBuilder(networkConfig=settings["NetworkConfig"],
                                    netConfigOverride=netConfigOverride,
                                    actionSize=actionSize)

        #Creating appropriate buffer for the method.
        self.buffer = [Trajectory(depth=7) for _ in range(nTrajs)]

        with self.sess.as_default(), self.sess.graph.as_default():
            with tf.name_scope("PPO"):
                #Placeholders
                if len(stateShape) == 4:
                    self.s = tf.placeholder(tf.float32,
                                            [None] + stateShape[0:4], 'S')
                else:
                    self.s = tf.placeholder(tf.float32, [None] + stateShape,
                                            'S')
                self.a_his = tf.placeholder(tf.int32, [
                    None,
                ], 'A')
                self.td_target_ = tf.placeholder(tf.float32, [None], 'Vtarget')
                self.advantage_ = tf.placeholder(shape=[None],
                                                 dtype=tf.float32,
                                                 name='adv_hold')
                self.old_log_logits_ = tf.placeholder(shape=[None, actionSize],
                                                      dtype=tf.float32,
                                                      name='old_logit_hold')

                #Initializing Network I/O
                inputs = {"state": self.s}
                out = self.Model(inputs)
                self.a_prob = out["actor"]
                self.v = out["critic"]
                self.log_logits = out["log_logits"]

                # Entropy
                def _log(val):
                    return tf.log(tf.clip_by_value(val, 1e-10, 10.0))

                self.entropy = -tf.reduce_mean(self.a_prob * _log(self.a_prob),
                                               name='entropy')

                # Critic Loss
                td_error = self.td_target_ - self.v
                self.critic_loss = tf.reduce_mean(tf.square(td_error),
                                                  name='critic_loss')

                # Actor Loss
                action_OH = tf.one_hot(self.a_his,
                                       actionSize,
                                       dtype=tf.float32)
                log_prob = tf.reduce_sum(self.log_logits * action_OH, 1)
                old_log_prob = tf.reduce_sum(self.old_log_logits_ * action_OH,
                                             1)

                # Clipped surrogate function
                ratio = tf.exp(log_prob - old_log_prob)
                surrogate = ratio * self.advantage_
                clipped_surrogate = tf.clip_by_value(
                    ratio, 1 - self.HPs["eps"],
                    1 + self.HPs["eps"]) * self.advantage_
                surrogate_loss = tf.minimum(surrogate,
                                            clipped_surrogate,
                                            name='surrogate_loss')
                self.actor_loss = -tf.reduce_mean(surrogate_loss,
                                                  name='actor_loss')

                loss = self.actor_loss + self.critic_loss * self.HPs[
                    "CriticBeta"]

                # Build Trainer
                if self.HPs["Optimizer"] == "Adam":
                    self.optimizerA = tf.keras.optimizers.Adam(
                        self.HPs["LR Actor"])
                    self.optimizerE = tf.keras.optimizers.Adam(
                        self.HPs["LR Entropy"])
                elif self.HPs["Optimizer"] == "RMS":
                    self.optimizerA = tf.keras.optimizers.RMSprop(
                        self.HPs["LR Actor"])
                    self.optimizerE = tf.keras.optimizers.RMSprop(
                        self.HPs["LR Entropy"])
                elif self.HPs["Optimizer"] == "Adagrad":
                    self.optimizerA = tf.keras.optimizers.Adagrad(
                        self.HPs["LR Actor"])
                    self.optimizerE = tf.keras.optimizers.Adagrad(
                        self.HPs["LR Entropy"])
                elif self.HPs["Optimizer"] == "Adadelta":
                    self.optimizerA = tf.keras.optimizers.Adadelta(
                        self.HPs["LR Actor"])
                    self.optimizerE = tf.keras.optimizers.Adadelta(
                        self.HPs["LR Entropy"])
                elif self.HPs["Optimizer"] == "Adamax":
                    self.optimizerA = tf.keras.optimizers.Adamax(
                        self.HPs["LR Actor"])
                    self.optimizerE = tf.keras.optimizers.Adamax(
                        self.HPs["LR Entropy"])
                elif self.HPs["Optimizer"] == "Nadam":
                    self.optimizerA = tf.keras.optimizers.Nadam(
                        self.HPs["LR Actor"])
                    self.optimizerE = tf.keras.optimizers.Nadam(
                        self.HPs["LR Entropy"])
                elif self.HPs["Optimizer"] == "SGD":
                    self.optimizerA = tf.keras.optimizers.SGD(
                        self.HPs["LR Actor"])
                    self.optimizerE = tf.keras.optimizers.SGD(
                        self.HPs["LR Entropy"])
                elif self.HPs["Optimizer"] == "Amsgrad":
                    # AMSGrad is a variant of Adam; Nadam does not accept amsgrad=True.
                    self.optimizerA = tf.keras.optimizers.Adam(
                        self.HPs["LR Actor"], amsgrad=True)
                    self.optimizerE = tf.keras.optimizers.Adam(
                        self.HPs["LR Entropy"], amsgrad=True)
                else:
                    print("Not selected a proper Optimizer")
                    exit()
                a_params = self.Model.GetVariables("Actor")
                c_params = self.Model.GetVariables("Critic")
                self.gradients_a = self.optimizerA.get_gradients(
                    loss, self.Model.trainable_variables)
                self.update_op_a = self.optimizerA.apply_gradients(
                    zip(self.gradients_a, self.Model.trainable_variables))

                entropy_loss = -self.entropy * self.HPs["EntropyBeta"]
                self.gradients_e = self.optimizerE.get_gradients(
                    entropy_loss, a_params)
                self.update_op_e = self.optimizerE.apply_gradients(
                    zip(self.gradients_e, a_params))

                total_counter = 1
                vanish_counter = 0
                for gradient in self.gradients_a:
                    total_counter += np.prod(gradient.shape)
                    near_zero = tf.reduce_sum(
                        tf.cast(
                            tf.math.less_equal(tf.math.abs(gradient),
                                               tf.constant(1e-8)), tf.int32))
                    vanish_counter += near_zero
                self.vanishing_gradient = vanish_counter / total_counter

        self.update_ops = [self.update_op_a, self.update_op_e]
        self.logging_ops = [
            self.actor_loss, self.critic_loss, self.entropy,
            tf.reduce_mean(self.advantage_),
            tf.reduce_mean(ratio), loss, self.vanishing_gradient
        ]
        self.labels = [
            "Loss Actor", "Loss Critic", "Entropy", "Advantage", "PPO Ratio",
            "Loss Total", "Vanishing Gradient"
        ]
        self.logging_MA = [
            MovingAverage(400) for i in range(len(self.logging_ops))
        ]
Exemplo n.º 21
0
class DQN(Method):

    def __init__(self,sess,settings,netConfigOverride,stateShape,actionSize,nTrajs=1,**kwargs):
        """
        Initializes I/O placeholders used for Tensorflow session runs.
        Initializes the Q-Network and training operations to be used for the purpose of RL.
        """
        #Placeholders

        self.sess=sess
        self.scope="DQN"
        self.HPs = settings["NetworkHPs"]
        self.Model = NetworkBuilder(networkConfig=settings["NetworkConfig"],netConfigOverride=netConfigOverride,actionSize=actionSize)

        self.buffer = [Trajectory(depth=5) for _ in range(nTrajs)]
        with self.sess.as_default(), self.sess.graph.as_default():
            with tf.name_scope(self.scope):
                if len(stateShape) == 4:
                    self.states_ = tf.placeholder(shape=[None]+stateShape[1:4], dtype=tf.float32, name='states')
                    self.next_states_ = tf.placeholder(shape=[None]+stateShape[1:4], dtype=tf.float32, name='next_states')
                else:
                    self.states_ = tf.placeholder(shape=[None]+stateShape, dtype=tf.float32, name='states')
                    self.next_states_ = tf.placeholder(shape=[None]+stateShape, dtype=tf.float32, name='next_states')
                self.actions_ = tf.placeholder(shape=[None], dtype=tf.int32, name='actions_hold')
                self.rewards_ = tf.placeholder(shape=[None], dtype=tf.float32, name='rewards_hold')
                self.done_ = tf.placeholder(shape=[None], dtype=tf.float32, name='done_hold')

                input = {"state":self.states_}
                out = self.Model(input)
                self.q = out["Q"]

                out2 = self.Model({"state":self.next_states_})
                q_next = out2["Q"]

                with tf.name_scope('current_Q'):
                    oh_action = tf.one_hot(self.actions_, actionSize, dtype=tf.float32) # [?, num_agent, action_size]
                    curr_q = tf.reduce_sum(tf.multiply(self.q, oh_action), axis=-1) # [?, num_agent]

                with tf.name_scope('target_Q'):
                    max_next_q = tf.reduce_max(q_next, axis=-1)
                    td_target = self.rewards_  + self.HPs["Gamma"] * max_next_q
                    # td_target = self.rewards_  + self.HPs["Gamma"] * max_next_q * (1. - self.done_)
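                    # Bellman target: r + Gamma * max_a' Q(s', a'); the commented-out
                    # variant masks the bootstrap term on terminal steps via done_,
                    # which is otherwise unused in the graph.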

                with tf.name_scope('td_error'):
                    loss = tf.keras.losses.MSE(td_target, curr_q)
                    softmax_q = tf.nn.softmax(curr_q)
                    self.entropy = -tf.reduce_mean(softmax_q * tf.log(softmax_q + 1e-5))
                    self.loss = total_loss = loss + self.HPs["EntropyBeta"] * self.entropy
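                    # The total loss above combines the mean-squared TD error with the
                    # entropy of the softmax over the selected Q-values, scaled by EntropyBeta.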

                if self.HPs["Optimizer"] == "Adam":
                    self.optimizer = tf.keras.optimizers.Adam(self.HPs["LR"])
                elif self.HPs["Optimizer"] == "RMS":
                    self.optimizer = tf.keras.optimizers.RMSprop(self.HPs["LR"])
                elif self.HPs["Optimizer"] == "Adagrad":
                    self.optimizer = tf.keras.optimizers.Adagrad(self.HPs["LR"])
                elif self.HPs["Optimizer"] == "Adadelta":
                    self.optimizer = tf.keras.optimizers.Adadelta(self.HPs["LR"])
                elif self.HPs["Optimizer"] == "Adamax":
                    self.optimizer = tf.keras.optimizers.Adamax(self.HPs["LR"])
                elif self.HPs["Optimizer"] == "Nadam":
                    self.optimizer = tf.keras.optimizers.Nadam(self.HPs["LR"])
                elif self.HPs["Optimizer"] == "SGD":
                    self.optimizer = tf.keras.optimizers.SGD(self.HPs["LR"])
                elif self.HPs["Optimizer"] == "Amsgrad":
                    self.optimizer = tf.keras.optimizers.Nadam(self.HPs["LR"],amsgrad=True)
                else:
                    print("Not selected a proper Optimizer")
                    exit()
                self.params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope)

                self.gradients = self.optimizer.get_gradients(total_loss, self.params)
                self.update_op = self.optimizer.apply_gradients(zip(self.gradients, self.params))

                self.grads=[self.gradients]
                self.losses=[self.loss]
                self.update_ops=[self.update_op]

        self.grad_MA = [MovingAverage(400) for i in range(len(self.grads))]
        self.loss_MA = [MovingAverage(400) for i in range(len(self.losses))]
        self.entropy_MA = MovingAverage(400)
        self.labels = ["Critic"]

    def GetAction(self, state,episode,step):
        """
        Runs the Q-network on the given state and selects an action, using
        epsilon-greedy exploration when enabled in the hyperparameters.
        """
        if len(state.shape) == 3:
            state = state[np.newaxis, :]
        if len(state.shape) == 1:
            state = state[np.newaxis, :]
        q = self.sess.run(self.q, {self.states_: state})
        if "Exploration" in self.HPs:
            if self.HPs["Exploration"]=="EGreedy":
                prob = 0.1 + 0.9*(np.exp(-episode/self.HPs["ExplorationDecay"]))
                if random.uniform(0, 1) < prob:
                    actions = random.randint(0,4)
                else:
                    actions = np.argmax(q, axis=-1)
        else:
            actions = np.argmax(q, axis=-1)
        return actions ,[]  # return a int and extra data that needs to be fed to buffer.
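        # Example of the schedule above (assuming ExplorationDecay = 100):
        #   episode 0   -> epsilon = 0.1 + 0.9*exp(0)   = 1.00
        #   episode 100 -> epsilon = 0.1 + 0.9*exp(-1) ~= 0.43
        #   episode 300 -> epsilon = 0.1 + 0.9*exp(-3) ~= 0.14, approaching the 0.1 floor.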

    def Update(self,episode=0):
        """
        The main update function for DQN. Once the trajectory buffers hold at least
        BatchSize samples, the data is split into minibatches and used to update the Q-network.
        """
        #Process the data from the buffer
        samples=0
        for i in range(len(self.buffer)):
            samples +=len(self.buffer[i])
        if samples < self.HPs["BatchSize"]:
            return

        for traj in range(len(self.buffer)):
            batches = len(self.buffer[traj][0])//self.HPs["MinibatchSize"]+1
            s = np.array_split( self.buffer[traj][0], batches)
            a_his = np.array_split( np.asarray(self.buffer[traj][1]).reshape(-1), batches)
            r = np.array_split( np.asarray(self.buffer[traj][2]).reshape(-1), batches)
            s_next = np.array_split( self.buffer[traj][3], batches)
            done = np.array_split( self.buffer[traj][4], batches)
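            # Trajectory layout (depth=5): [state, action, reward, next_state, done].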

            for epoch in range(self.HPs["Epochs"]):
                for i in range(batches):
                    #Create a feedDict from the buffer
                    feedDict = {
                        self.states_ : np.squeeze(np.asarray(s[i])),
                        self.next_states_ : np.squeeze(np.asarray(s_next[i])),
                        self.actions_ : np.squeeze(np.asarray(a_his[i])),
                        self.rewards_ : np.squeeze(np.asarray(r[i])),
                        self.done_ : np.squeeze(np.asarray(done[i],dtype=float))

                    }
                    out = self.sess.run(self.update_ops+self.losses+self.grads, feedDict)
                    # Split the fetched results back into [update results, losses, gradients].
                    out = np.array_split(out,3)
                    losses = out[1]
                    grads = out[2]

                    for j, loss_val in enumerate(losses):
                        self.loss_MA[j].append(loss_val)

                    for j, traj_grads in enumerate(grads):
                        total_counter = 1
                        vanish_counter = 0
                        for grad in traj_grads:
                            total_counter += np.prod(grad.shape)
                            vanish_counter += (np.absolute(grad)<1e-8).sum()
                        self.grad_MA[j].append(vanish_counter/total_counter)

                    ent = self.sess.run(self.entropy, feedDict)   # Evaluate entropy separately for logging.
                    entropy = np.average(np.asarray(ent))
                    self.entropy_MA.append(entropy)

        self.ClearTrajectory()


    def GetStatistics(self):
        stats = {}
        for i, label in enumerate(self.labels):
            stats["Training Results/Vanishing Gradient " + label] = self.grad_MA[i]()
            stats["Training Results/Loss " + label] = self.loss_MA[i]()
        stats["Training Results/Entropy"] = self.entropy_MA()
        return stats


    @property
    def getVars(self):
        return self.Model.getVars(self.scope)
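
# A minimal usage sketch (not from the original listing). The settings keys mirror
# those read in __init__ above; the concrete values, the network config name, and
# the state shape are assumptions for illustration only.
#
#   import numpy as np
#   import tensorflow as tf
#   sess = tf.Session()
#   settings = {
#       "NetworkConfig": "DQN.json",   # hypothetical config consumed by NetworkBuilder
#       "NetworkHPs": {"Gamma": 0.99, "EntropyBeta": 0.01, "Optimizer": "Adam",
#                      "LR": 1e-4, "BatchSize": 1024, "MinibatchSize": 32, "Epochs": 1,
#                      "Exploration": "EGreedy", "ExplorationDecay": 100},
#   }
#   dqn = DQN(sess, settings, netConfigOverride={}, stateShape=[84, 84, 3], actionSize=5)
#   sess.run(tf.global_variables_initializer())
#   state = np.zeros([84, 84, 3], dtype=np.float32)   # placeholder observation
#   action, _ = dqn.GetAction(state, episode=0, step=0)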