def playonce(self, noise_level, _env):
    t = time.time()
    skip = ENV_SKIP
    env = fastenv(_env, skip)

    # warm up the noise source before the episode starts
    noise_source = one_fsq_noise()
    for j in range(200):
        noise_source.one((DIM_ACTION * 2,), noise_level)

    state = env.reset()
    n_steps = 0
    ep_reward = 0
    warmup = BATCH_SIZE * 32
    noise_phase = int(np.random.uniform() * 999999)

    while True:
        action = self.agent.select_action(state)

        # phased noise: amplitude ramps down within each 100-step phase,
        # with a 0.01 floor so some exploration always remains
        phased_noise_anneal_duration = 100
        phased_noise_amplitude = ((-noise_phase - n_steps) % phased_noise_anneal_duration) / phased_noise_anneal_duration
        phased_noise_amplitude = max(0, phased_noise_amplitude * 2 - 1)
        phased_noise_amplitude = max(0.01, phased_noise_amplitude ** 2)

        exploration_noise = noise_source.one((DIM_ACTION * 2,), noise_level) * phased_noise_amplitude
        action += exploration_noise * 0.5
        action = np.clip(action, 0, 1)

        next_state, reward, done, info = env.step(action.tolist())
        self.agent.memory.push(deepcopy_all(state, action, [reward], next_state, [float(done)]))

        # only start learning once the replay memory holds enough transitions
        if len(self.agent.memory) >= warmup:
            with self.lock:
                self.agent.learn()

        state = next_state
        ep_reward += reward
        n_steps += 1
        if done:
            break

    with self.lock:
        t = time.time() - t
        print('reward: {}, n_steps: {}, explore: {:.5f}, n_mem: {}, time: {:.2f}'
              .format(ep_reward, n_steps, noise_level, len(self.agent.memory), t))
        global t0
        self.plotter.pushys([ep_reward, noise_level, (time.time() - t0) % 3600 / 3600 - 2])

    _env.rel()
    del env
def __init__(self, args):
    self.rpm = rpm(1000000)  # replay memory, 1M transitions
    self.render = True
    self.training = True
    self.noise_source = one_fsq_noise()
    self.train_multiplier = args.train_multiplier
    self.inputdims = args.observation_space_dims

    # action space is assumed continuous in [0, 1]
    low = 0.0
    high = 1.0
    num_of_actions = args.action_space
    self.action_bias = high / 2.0 + low / 2.0
    self.action_multiplier = high - self.action_bias

    def clamper(actions):
        return np.clip(actions, a_max=high, a_min=low)

    self.clamper = clamper
    self.outputdims = args.action_space
    self.discount_factor = args.gamma

    ids, ods = self.inputdims, self.outputdims
    print('inputdims:{}, outputdims:{}'.format(ids, ods))

    # actor/critic and their target networks, all on GPU
    self.actor = models.create_actor_network(ids, ods).cuda()
    self.critic = models.create_critic_network(ids, ods).cuda()
    self.actor_target = models.create_actor_network(ids, ods).cuda()
    self.critic_target = models.create_critic_network(ids, ods).cuda()

    self.critic_criterion = nn.MSELoss().cuda()
    self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=1e-4)
    self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=3e-4)

    self.plot_epoch = [0]
    self.plot_reward = [0]

    import threading as th
    self.lock = th.Lock()
def playonce(self, noise_level, _env):
    t = time.time()
    skip = ENV_SKIP
    env = fastenv(_env, skip)

    # warm up the noise source before the episode starts
    noise_source = one_fsq_noise()
    for j in range(200):
        noise_source.one((DIM_ACTION,), noise_level)

    state = env.reset()
    info = {'step': 0}
    n_steps = 0
    ep_reward = 0
    warmup = BATCH_SIZE * 32
    noise_phase = int(np.random.uniform() * 999999)

    while True:
        action = self.agent.select_action(state)

        # phased noise: amplitude ramps down within each 100-step phase,
        # with a 0.01 floor so some exploration always remains
        phased_noise_anneal_duration = 100
        phased_noise_amplitude = ((-noise_phase - n_steps) % phased_noise_anneal_duration) / phased_noise_anneal_duration
        phased_noise_amplitude = max(0, phased_noise_amplitude * 2 - 1)
        phased_noise_amplitude = max(0.01, phased_noise_amplitude ** 2)

        exploration_noise = noise_source.one((DIM_ACTION,), noise_level) * phased_noise_amplitude
        action += exploration_noise * 0.5
        action = np.clip(action, 0, 1)

        next_state, reward, done, info = env.step(action.tolist())
        # treat hitting the step limit as non-terminal so bootstrapping still applies
        done1 = False if info['step'] == MAX_EP_STEPS else done

        # always store the transition once; after the first ~100 env steps also store
        # a left/right mirrored copy, otherwise store the original a second time
        self.agent.memory.push(deepcopy_all(state, action, [reward], next_state, [done1]))
        if n_steps >= 100 / ENV_SKIP:
            self.agent.memory.push(deepcopy_all(mirror_s(state), mirror_a(action), [reward], mirror_s(next_state), [done1]))
        else:
            self.agent.memory.push(deepcopy_all(state, action, [reward], next_state, [done1]))

        # only start learning once the replay memory holds enough transitions
        if len(self.agent.memory) >= warmup:
            with self.lock:
                self.agent.learn()

        state = next_state
        ep_reward += reward
        n_steps += 1
        if done:
            break

    with self.lock:
        t = time.time() - t
        print('reward: {}, n_steps: {}, explore: {:.5f}, n_mem: {}, time: {:.2f}'
              .format(ep_reward, info['step'], noise_level, len(self.agent.memory), t))

    _env.rel()
    del env
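# mirror_s() and mirror_a() are not defined in this excerpt. Below is a minimal,
# hypothetical sketch of what such left/right mirroring helpers might look like,
# assuming the first 2*half entries of a vector are laid out as
# [left block | right block] and the remainder is shared between both sides.
# The actual index layout is environment specific and is purely an assumption here.
def mirror_vector(x, half):
    # swap the first `half` entries with the next `half` entries (assumed layout)
    x = np.asarray(x, dtype='float32').copy()
    left = x[:half].copy()
    x[:half] = x[half:2 * half]
    x[half:2 * half] = left
    return x

def mirror_a(action):
    # assumes the action vector is split evenly into left/right actuators
    return mirror_vector(action, DIM_ACTION // 2)

def mirror_s(state):
    # the number of mirrored observation entries is a placeholder here
    return mirror_vector(state, len(state) // 2)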
def __init__(
        self,
        observation_space_dims,
        action_space,
        stack_factor=1,
        discount_factor=.99,  # gamma
        # train_skip_every=1,
        train_multiplier=1,
):
    self.rpm = rpm(1000000)  # 1M history
    self.plotter = plotter(num_lines=3)
    self.render = True
    self.training = True
    self.noise_source = one_fsq_noise()
    self.train_counter = 0
    # self.train_skip_every = train_skip_every
    self.train_multiplier = train_multiplier
    self.observation_stack_factor = stack_factor

    # assume observation_space is continuous
    self.inputdims = observation_space_dims * self.observation_stack_factor

    self.is_continuous = True if isinstance(action_space, Box) else False
    if self.is_continuous:  # if action space is continuous
        low = action_space.low
        high = action_space.high
        num_of_actions = action_space.shape[0]

        self.action_bias = high / 2. + low / 2.
        self.action_multiplier = high - self.action_bias
        # say high,low -> [2,7], then bias -> 4.5
        # mult = 2.5. then [-1,1] multiplies 2.5 + bias 4.5 -> [2,7]

        def clamper(actions):
            return np.clip(actions, a_max=action_space.high, a_min=action_space.low)

        self.clamper = clamper
    else:
        num_of_actions = action_space.n
        self.action_bias = .5
        self.action_multiplier = .5  # map (-1,1) into (0,1)

        def clamper(actions):
            return np.clip(actions, a_max=1., a_min=0.)

        self.clamper = clamper

    self.outputdims = num_of_actions
    self.discount_factor = discount_factor
    ids, ods = self.inputdims, self.outputdims
    print('inputdims:{}, outputdims:{}'.format(ids, ods))

    self.actor = self.create_actor_network(ids, ods)
    self.critic = self.create_critic_network(ids, ods)
    self.actor_target = self.create_actor_network(ids, ods)
    self.critic_target = self.create_critic_network(ids, ods)
    # print(self.actor.get_weights())
    # print(self.critic.get_weights())

    self.feed, self.joint_inference, sync_target = self.train_step_gen()

    sess = ct.get_session()
    sess.run(tf.global_variables_initializer())
    sync_target()

    import threading as th
    self.lock = th.Lock()

    if not hasattr(self, 'wavegraph'):
        num_waves = self.outputdims * 2 + 1

        def rn():
            r = np.random.uniform()
            return 0.2 + r * 0.4

        colors = []
        for i in range(num_waves - 1):
            color = [rn(), rn(), rn()]
            colors.append(color)
        colors.append([0.2, 0.5, 0.9])
        self.wavegraph = wavegraph(num_waves, 'actions/noises/Q', np.array(colors))
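# Standalone worked example of the bias/multiplier rescaling described in the
# comment above (not part of the agent): with low = 2 and high = 7, bias = 4.5
# and multiplier = 2.5, so a raw network output in [-1, 1] maps onto [2, 7].
# `rescale` is a helper name introduced only for this illustration.
import numpy as np

def rescale(raw_action, low, high):
    bias = high / 2. + low / 2.
    multiplier = high - bias
    return raw_action * multiplier + bias

print(rescale(np.array([-1., 0., 1.]), low=2., high=7.))  # -> [2.  4.5 7. ]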
def play(self, env, max_steps=-1, realtime=False, noise_level=0.):
    # play 1 episode
    timer = time.time()

    # warm up the noise source
    noise_source = one_fsq_noise()
    for j in range(10):
        noise_source.one((self.outputdims,), noise_level)

    max_steps = max_steps if max_steps > 0 else 50000
    steps = 0
    total_reward = 0
    episode_memory = []

    # removed: state stacking
    # moved: observation processing

    try:
        observation = env.reset()
    except Exception as e:
        print('(agent) something wrong on reset(). episode terminates now')
        traceback.print_exc()
        print(e)
        return

    while True and steps <= max_steps:
        steps += 1

        observation_before_action = observation  # s1

        exploration_noise = noise_source.one((self.outputdims,), noise_level)
        # exploration_noise -= noise_level * 1

        # self.lock.acquire()  # please do not disrupt.
        action = self.act(observation_before_action, exploration_noise)  # a1
        # self.lock.release()

        if self.is_continuous:
            # add noise to our actions, since our policy by nature is deterministic
            exploration_noise *= self.action_multiplier
            # print(exploration_noise, exploration_noise.shape)
            action += exploration_noise
            action = self.clamper(action)
            action_out = action
        else:
            raise RuntimeError('this version of ddpg is for continuous only.')

        # o2, r1
        try:
            observation, reward, done, _info = env.step(action_out)  # takes a long time
        except Exception as e:
            print('(agent) something wrong on step(). episode terminates now')
            traceback.print_exc()
            print(e)
            return

        # d1
        isdone = 1 if done else 0
        total_reward += reward

        # collect transitions for the replay memory
        if self.training == True:
            episode_memory.append((observation_before_action, action, reward, isdone, observation))
            # don't feed here since you never know whether the episode will complete without error.
            # self.feed_one((observation_before_action, action, reward, isdone, observation))  # s1,a1,r1,isdone,s2

            # self.lock.acquire()
            self.train(verbose=2 if steps == 1 else 0)
            # self.lock.release()

        # if self.render == True and (steps % 30 == 0 or realtime == True):
        #     env.render()

        if done:
            break

    # print('episode done in', steps, 'steps', time.time() - timer, 'second total reward', total_reward)
    totaltime = time.time() - timer
    print('episode done in {} steps in {:.2f} sec, {:.4f} sec/step, got reward :{:.2f}'
          .format(steps, totaltime, totaltime / steps, total_reward))

    self.lock.acquire()
    for t in episode_memory:
        self.feed_one(t)
    self.plotter.pushys([total_reward, noise_level, (time.time() % 3600) / 3600 - 2])
    # self.noiseplotter.pushy(noise_level)
    self.lock.release()
    return
def play(self, env, max_steps=-1, realtime=False, noise_level=0.):
    # play 1 episode
    timer = time.time()

    # warm up the noise source
    noise_source = one_fsq_noise()
    noise_source.skip = 1  # freq adj
    for j in range(200):
        noise_source.one((self.outputdims,), noise_level)

    max_steps = max_steps if max_steps > 0 else 50000
    steps = 0
    total_reward = 0
    episode_memory = []

    # removed: state stacking
    # moved: observation processing

    noise_phase = int(np.random.uniform() * 999999)

    try:
        observation = env.reset()
    except Exception as e:
        print('(agent) something wrong on reset(). episode terminates now')
        traceback.print_exc()
        print(e)
        return

    while True and steps <= max_steps:
        steps += 1

        observation_before_action = observation  # s1

        # phased noise: amplitude ramps down within each 100-step phase,
        # with a 0.01 floor so some exploration always remains
        phased_noise_anneal_duration = 100
        # phased_noise_amplitude = ((-noise_phase-steps)%phased_noise_anneal_duration)/phased_noise_anneal_duration*2*np.pi
        # phased_noise_amplitude = max(0.1,np.sin(phased_noise_amplitude))
        phased_noise_amplitude = ((-noise_phase - steps) % phased_noise_anneal_duration) / phased_noise_anneal_duration
        phased_noise_amplitude = max(0, phased_noise_amplitude * 2 - 1)
        phased_noise_amplitude = max(0.01, phased_noise_amplitude ** 2)

        exploration_noise = noise_source.one((self.outputdims,), noise_level) * phased_noise_amplitude
        # exploration_noise = np.random.normal(size=(self.outputdims,))*noise_level*phased_noise_amplitude
        # exploration_noise -= noise_level * 1
        # exploration_noise = np.random.normal(size=(self.outputdims,))*0.

        # # we want to add some shot noise
        # shot_noise_prob = min(1, noise_level/5)  # 0.05 => 1% shot noise
        # shot_noise_replace = (np.random.uniform(size=exploration_noise.shape)<shot_noise_prob).astype('float32')  # 0 entries pass thru, 1 entries shot noise.
        # shot_noise_amplitude = np.random.uniform(size=exploration_noise.shape)*2-1  # [-1, 1]
        # # add shot noise!
        # exploration_noise = exploration_noise*(1-shot_noise_replace) + shot_noise_amplitude*shot_noise_replace

        # self.lock.acquire()  # please do not disrupt.
        action = self.act(observation_before_action, exploration_noise)  # a1
        # self.lock.release()

        if self.is_continuous:
            # add noise to our actions, since our policy by nature is deterministic
            exploration_noise *= self.action_multiplier
            # print(exploration_noise, exploration_noise.shape)
            action += exploration_noise
            action = self.clamper(action)  # don't clamp, see what happens.
            action_out = action
        else:
            raise RuntimeError('this version of ddpg is for continuous only.')

        # o2, r1
        try:
            observation, reward, done, _info = env.step(action_out)  # takes a long time
        except Exception as e:
            print('(agent) something wrong on step(). episode terminates now')
            traceback.print_exc()
            print(e)
            return

        # d1
        isdone = 1 if done else 0
        total_reward += reward

        # feed into replay memory
        if self.training == True:
            # episode_memory.append((
            #     observation_before_action, action, reward, isdone, observation
            # ))
            # don't feed here since you never know whether the episode will complete without error.
            # changed mind: let's feed here since this way the training dynamic is not disturbed
            self.feed_one((
                observation_before_action, action, reward, isdone, observation
            ))  # s1,a1,r1,isdone,s2

            # self.lock.acquire()
            self.train(verbose=2 if steps == 1 else 0)
            # self.lock.release()

        # if self.render == True and (steps % 30 == 0 or realtime == True):
        #     env.render()

        if done:
            break

    # print('episode done in', steps, 'steps', time.time() - timer, 'second total reward', total_reward)
    totaltime = time.time() - timer
    print('episode done in {} steps in {:.2f} sec, {:.4f} sec/step, got reward :{:.2f}'.format(
        steps, totaltime, totaltime / steps, total_reward))

    self.lock.acquire()
    # for t in episode_memory:
    #     if np.random.uniform() > 0.5:
    #         self.feed_one(t)
    self.plotter.pushys([total_reward, noise_level, (time.time() % 3600) / 3600 - 2, steps / 1000 - 1])
    # self.noiseplotter.pushy(noise_level)
    self.lock.release()
    return
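# Standalone illustration of the phased-noise amplitude schedule used above (not part
# of the agent): over each 100-step phase the amplitude decays from roughly 1 down to
# the 0.01 floor during the first half, sits at the floor for the second half, then
# jumps back up, so exploration arrives in bursts. The random noise_phase only shifts
# where each burst starts. `phased_noise_amplitude_at` is a helper name introduced
# for this example; it reproduces the formula in play() verbatim.
import numpy as np

def phased_noise_amplitude_at(step, noise_phase, duration=100):
    amp = ((-noise_phase - step) % duration) / duration
    amp = max(0, amp * 2 - 1)
    return max(0.01, amp ** 2)

noise_phase = int(np.random.uniform() * 999999)
for step in range(0, 200, 20):
    print(step, phased_noise_amplitude_at(step, noise_phase))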
def __init__(
        self,
        observation_space,
        action_space,
        stack_factor=1,
        discount_factor=.99,  # gamma
        train_skip_every=1,
):
    self.rpm = rpm(1000000)  # 1M history
    self.render = True
    self.noise_source = one_fsq_noise()
    self.train_counter = 0
    self.train_skip_every = train_skip_every
    self.observation_stack_factor = stack_factor

    # assume observation_space is continuous
    self.inputdims = observation_space.shape[0] * self.observation_stack_factor

    self.is_continuous = True if isinstance(action_space, Box) else False
    if self.is_continuous:  # if action space is continuous
        low = action_space.low
        high = action_space.high
        num_of_actions = action_space.shape[0]

        self.action_bias = high / 2. + low / 2.
        self.action_multiplier = high - self.action_bias
        # say high,low -> [2,7], then bias -> 4.5
        # mult = 2.5. then [-1,1] multiplies 2.5 + bias 4.5 -> [2,7]

        def clamper(actions):
            return np.clip(actions, a_max=action_space.high, a_min=action_space.low)

        self.clamper = clamper
    else:
        num_of_actions = action_space.n
        self.action_bias = .5
        self.action_multiplier = .5  # map (-1,1) into (0,1)

        def clamper(actions):
            return np.clip(actions, a_max=1., a_min=0.)

        self.clamper = clamper

    self.outputdims = num_of_actions
    self.discount_factor = discount_factor
    ids, ods = self.inputdims, self.outputdims
    print('inputdims:{}, outputdims:{}'.format(ids, ods))

    self.actor = self.create_actor_network(ids, ods)
    self.critic = self.create_critic_network(ids, ods)
    self.actor_target = self.create_actor_network(ids, ods)
    self.critic_target = self.create_critic_network(ids, ods)
    # print(self.actor.get_weights())
    # print(self.critic.get_weights())

    self.feed, self.joint_inference, sync_target = self.train_step_gen()

    sess = ct.get_session()
    sess.run(tf.global_variables_initializer())
    sync_target()
def run_episode(self, fenv, max_steps=-1, training=False, render=False, noise_level=0., ac_id=0):
    time_start = time.time()

    noise_source = None
    if noise_level > 0.0:
        noise_source = one_fsq_noise()
        # warm up noise source
        for _ in range(2000):
            noise_source.one((self.outputdims,), noise_level)

    max_steps = max_steps if max_steps > 0 else 50000
    steps = 0
    total_reward = 0

    try:
        # this might be a remote env
        observation = np.array(fenv.reset())
    except Exception as e:
        print('Bad things during reset. Episode terminated.', e)
        traceback.print_exc()
        return

    while True and steps <= max_steps:
        steps += 1

        observation_before_action = observation  # s1

        exploration_noise = 0.0
        if noise_level > 0.0:
            exploration_noise = noise_source.one((self.outputdims,), noise_level)

        # get action
        action = None
        with self.lock_swap:
            if training:
                action = self.get_action(observation_before_action, ac_id)
            else:
                action = self.get_max_action(observation_before_action)

        # add noise to our actions, since our policy is deterministic
        if noise_level > 0.0:
            exploration_noise *= self.action_multiplier
            action += exploration_noise
            action = self.clamp_action(action)

        # step
        try:
            # can't send/receive np arrays over pyro
            action_out = [float(action[i]) for i in range(len(action))]
            observation, reward, done, _info = fenv.step(action_out)
            observation = np.array(observation)
        except Exception as e:
            print('Bad things during step. Episode terminated.', e)
            traceback.print_exc()
            return

        # d1
        isdone = 1 if done else 0
        total_reward += reward

        # train
        if training == True:
            # The code works without this lock, but depending on training speed there is too much noise on updates.
            # The model always trains and is more stable with the lock here.
            with self.lock:
                self.append_memory(observation_before_action, action, reward, isdone, observation)  # s1,a1,r1,isdone,s2
                for i in range(self.nr_networks):
                    self.train_batch(i)
        else:
            if render:
                fenv.render()

        if done:
            break

    totaltime = time.time() - time_start
    if training == True:
        self.global_step += 1
        print(self.global_step,
              ': Episode done in {} steps in {:.2f} sec, {:.4f} sec/step, got reward :{:.2f}'
              .format(steps, totaltime, totaltime / steps, total_reward))
        self.history.append_train(total_reward, noise_level, steps)
    else:
        print('Test done in {} steps in {:.2f} sec, {:.4f} sec/step, got reward :{:.2f}'
              .format(steps, totaltime, totaltime / steps, total_reward))
        self.history.append_test(total_reward, self.global_step, steps)
        if render == False:  # background test
            if total_reward > self.max_reward:
                self.max_reward = total_reward
                self.save_weights("max_model")
                print("Saved new max model with score: ", total_reward)

    return total_reward
def play(self, env, max_steps=-1, realtime=False, noise_level=0.):
    timer = time.time()

    # warm up the noise source
    noise_source = one_fsq_noise()
    for j in range(200):
        noise_source.one((self.outputdims,), noise_level)

    max_steps = max_steps if max_steps > 0 else 50000
    steps = 0
    total_reward = 0
    episode_memory = []

    observation = env.reset()

    while True and steps <= max_steps:
        steps += 1

        observation_before_action = observation  # s1

        exploration_noise = noise_source.one((self.outputdims,), noise_level)
        action = self.act(observation_before_action, exploration_noise)  # a1

        # note: the noise is passed to act() but, unlike the other variants,
        # the scaled noise is not added to the action again afterwards
        exploration_noise *= self.action_multiplier
        action = self.clamper(action)
        action_out = action

        observation, reward, done, _info = env.step(action_out)  # o2, r1

        isdone = 1 if done else 0
        total_reward += reward

        if self.training == True:
            episode_memory.append((observation_before_action, action, reward, isdone, observation))
            self.train()

        if done:
            break

    totaltime = time.time() - timer
    print('episode done in {} steps in {:.2f} sec, {:.4f} sec/step, got reward :{:.2f}'
          .format(steps, totaltime, totaltime / steps, total_reward))

    self.plot_epoch.append(self.plot_epoch[-1] + 1)
    self.plot_reward.append(total_reward)
    # epoch = range(0, 3000)
    # rewards = range(0, 6000, 2)

    # feed the whole episode into the replay memory under the lock
    self.lock.acquire()
    for t in episode_memory:
        self.feed_one(t)
    # self.plotter.pushys([total_reward, noise_level, (time.time() % 3600) / 3600 - 2])
    # self.noiseplotter.pushy(noise_level)
    self.lock.release()
    return