def step(self, action, outdir='test'):
    """Run one simulation for ``action`` and return the resulting transition.

    Action components are expected in the continuous range [-1.0, 1.0].
    That constraint lets the policy network use a hyperbolic tangent output
    activation, which is particularly helpful for the stochastic (Gaussian)
    policies often used in SAC.

    NOTE(review): the original description speaks of a list of 6 inputs, but
    only ``action[0]`` and ``action[1]`` are read here; the remaining
    simulation overrides are hardcoded below.

    Args:
        action: Indexable sequence. ``action[0]`` is rescaled into
            ``real_cycle_time`` and ``action[1]`` into ``vent_time_fract``
            using the matching entries of ``self.min_action`` and
            ``self.max_action``.
        outdir: Output directory forwarded to ``psa.simulate``.

    Returns:
        A 4-tuple ``(state, reward, True, {})``. ``state`` comes from
        ``psa.init(self.params, self.norm)``, ``reward`` is
        ``ret.prod_y[-1][1]`` of the simulation result, the third element is
        always ``True`` and the fourth is always an empty dict.
    """
    # TODO: Clean this routine.

    def rescale(idx):
        # Translation is min + ([-1, 1] --> [0, 1]) * (max - min).
        low = self.min_action[idx]
        unit_fraction = 0.5 * (action[idx] + 1)
        high = self.max_action[idx]
        return low + unit_fraction * (high - low)

    # Translate the action (list) into the override-params dict.
    mods = AttrDict()
    mods.real_cycle_time = rescale(0)
    mods.vent_time_fract = rescale(1)

    # Hardcoded overrides.
    mods.cycles = 31
    mods.real_vent_time = mods.vent_time_fract * mods.real_cycle_time
    mods.input_orifice = 2.78
    mods.vent_orifice = 1.56
    mods.blowdown_orifice = 3.5

    # Simulate; only the second of the five returned values is used.
    _data, ret, _param, _pickle_name, _out_place = psa.simulate(
        mods,
        outdir=outdir,
        params_file='params',
        verbose=False,
    )

    # TODO: Verify whether we care about prod_y or counter_y. [-1] refers to
    # the last timestep, but why [1]? Do we care about the middle cylinder?
    last_timestep = ret.prod_y[-1]
    reward = last_timestep[1]

    # New state.
    state = psa.init(self.params, self.norm)
    return state, reward, True, {}
def reset(self):
    """Return the state produced by ``psa.init(self.params, self.norm)``."""
    return psa.init(self.params, self.norm)