示例#1
0
    def step(self, action, outdir='test'):
        """
        Run one psa.simulate call for the given action and report the result.

        Each action entry is expected to lie in the continuous range
        [-1.0, 1.0]. That range is chosen so a hyperbolic tangent output
        activation can be used for the policy network, which is particularly
        helpful for the stochastic (Gaussian) policies often used in SAC.

        Only action[0] and action[1] are read here; any further entries are
        ignored. Values are not clipped, so an entry outside [-1, 1] maps
        outside [min_action, max_action].

        Args:
            action: Indexable sequence. action[0] is rescaled into
                real_cycle_time and action[1] into vent_time_fract, each via
                min + ([-1, 1] --> [0, 1]) * (max - min) using
                self.min_action / self.max_action at the same index.
            outdir: Forwarded unchanged to psa.simulate as ``outdir``.

        Returns:
            Tuple ``(state, reward, done, info)`` where state is
            ``psa.init(self.params, self.norm)``, reward is
            ``ret.prod_y[-1][1]`` from the simulation, done is always True
            and info is always an empty dict.
        """

        def _rescale(idx):
            # Linear map of action[idx] from [-1, 1] onto [min, max].
            low = self.min_action[idx]
            high = self.max_action[idx]
            return low + (0.5 * (action[idx] + 1)) * (high - low)

        # TODO Clean this routine
        # Translate action (list) to the override params dict.
        mods = AttrDict()
        mods.real_cycle_time = _rescale(0)
        mods.vent_time_fract = _rescale(1)

        # Hardcoded overrides (not driven by the action).
        mods.cycles = 31
        mods.real_vent_time = mods.vent_time_fract * mods.real_cycle_time
        mods.input_orifice = 2.78
        mods.vent_orifice = 1.56
        mods.blowdown_orifice = 3.5

        # Simulate. psa.simulate returns five values; only the second one
        # (ret) is needed here, so the rest are discarded.
        _, ret, _, _, _ = psa.simulate(
            mods,
            outdir=outdir,
            params_file='params',
            verbose=False,
        )

        # TODO Verify if we care about prod_y or counter_y? Also [-1] refers
        # to the last timestep but why [1]? We care about the middle cylinder?
        reward = ret.prod_y[-1][1]

        # New state
        state = psa.init(self.params, self.norm)

        return state, reward, True, {}
示例#2
0
 def reset(self):
     """Return the state produced by psa.init(self.params, self.norm)."""
     return psa.init(self.params, self.norm)