def main():
    """Train a DDPG-style actor/critic with parallel rollout workers.

    Spawns ``args.n_threads - 2`` agent processes that collect episodes and
    push them over ``data_queue``; the main process owns the replay memory,
    runs training steps, anneals both learning rates, periodically saves the
    actor weights, and launches a test process for promising weights.
    Blocks until ``global_step`` reaches ``args.max_steps``.
    """
    args = get_args()

    # Create the save directory; back up a pre-existing one instead of
    # overwriting it.
    save_dir = os.path.join('weights', args.exp_name)
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    else:
        shutil.move(save_dir, save_dir + '.backup')
        os.makedirs(save_dir)

    state_transform = StateVelCentr(obstacles_mode='standard',
                                    exclude_centr=True,
                                    vel_states=[])
    num_actions = 18

    # Build model; the same params dict is shipped to every worker process
    # so they can rebuild an identical network locally.
    model_params = {
        'state_size': state_transform.state_size,
        'num_act': num_actions,
        'gamma': args.gamma,
        'actor_lr': args.actor_lr,
        'critic_lr': args.critic_lr,
        'layer_norm': args.layer_norm
    }
    train_fn, actor_fn, target_update_fn, params_actor, params_crit, actor_lr, critic_lr = \
        build_model(**model_params)
    actor = Agent(actor_fn, params_actor, params_crit)
    if args.weights is not None:
        actor.load(args.weights)

    # Per-environment-step decrement for linear lr annealing down to *_lr_end.
    actor_lr_step = (args.actor_lr - args.actor_lr_end) / args.max_steps
    critic_lr_step = (args.critic_lr - args.critic_lr_end) / args.max_steps

    # Initial actor weights to seed the workers with.
    weights = [p.get_value() for p in params_actor]

    # Replay memory sized to num_actions (was a hard-coded 18, kept in sync
    # with the variable above).
    memory = ReplayMemory(state_transform.state_size, num_actions, 5000000)

    # Shared counters visible to all worker processes.
    global_step = Value('i', 0)
    updates = Value('i', 0)
    best_reward = Value('f', -1e8)
    testing = Value('i', 0)

    # Spawn rollout agents; each gets its own queue for receiving weights.
    data_queue = Queue()
    workers = []
    weights_queues = []
    num_agents = args.n_threads - 2
    print('starting {} agents'.format(num_agents))
    for i in range(num_agents):
        w_queue = Queue()
        worker = Process(target=run_agent,
                         args=(model_params, weights, state_transform,
                               data_queue, w_queue, i, global_step, updates,
                               best_reward, args.param_noise_prob, save_dir,
                               args.max_steps)
                         )
        worker.daemon = True
        worker.start()
        sleep(args.sleep)  # stagger start-up to avoid a thundering herd
        workers.append(worker)
        weights_queues.append(w_queue)

    prev_steps = 0
    start_save = time()
    start_test = time()
    weights_rew_to_check = []
    while global_step.value < args.max_steps:
        # Drain at most one item per iteration from the data queue;
        # queue.Empty just means no worker finished an episode yet.
        try:
            i, batch, weights_check, reward = data_queue.get_nowait()
            if weights_check is not None:
                weights_rew_to_check.append((weights_check, reward))
            # Answer the worker with the freshest actor weights.
            weights_queues[i].put(weights)
            memory.add_samples(*batch)
        except queue.Empty:
            pass

        # Training step.
        # TODO: consider not training while a test process is running.
        if len(memory) > args.start_train_steps:
            batch = memory.random_batch(args.batch_size)

            # Data augmentation: with prob. flip_prob mirror states/actions
            # (left/right leg swap) and train on original + flipped jointly.
            if np.random.rand() < args.flip_prob:
                states, actions, rewards, terminals, next_states = batch
                states_flip = state_transform.flip_states(states)
                next_states_flip = state_transform.flip_states(next_states)
                actions_flip = np.zeros_like(actions)
                actions_flip[:, :num_actions//2] = actions[:, num_actions//2:]
                actions_flip[:, num_actions//2:] = actions[:, :num_actions//2]
                states_all = np.concatenate((states, states_flip))
                actions_all = np.concatenate((actions, actions_flip))
                rewards_all = np.tile(rewards.ravel(), 2).reshape(-1, 1)
                terminals_all = np.tile(terminals.ravel(), 2).reshape(-1, 1)
                next_states_all = np.concatenate((next_states, next_states_flip))
                batch = (states_all, actions_all, rewards_all,
                         terminals_all, next_states_all)

            actor_loss, critic_loss = train_fn(*batch)
            updates.value += 1
            # Fail fast on divergence. (Fixed: the original raised
            # `Value(...)`, the multiprocessing factory, which is not an
            # exception and would itself crash with a TypeError.)
            if np.isnan(actor_loss):
                raise ValueError('actor loss is nan')
            if np.isnan(critic_loss):
                raise ValueError('critic loss is nan')
            target_update_fn()
            weights = actor.get_actor_weights()

        # Anneal both learning rates by the number of env steps taken since
        # the previous loop iteration, clamped at the configured floor.
        delta_steps = global_step.value - prev_steps
        prev_steps += delta_steps

        actor_lr.set_value(lasagne.utils.floatX(
            max(actor_lr.get_value() - delta_steps*actor_lr_step, args.actor_lr_end)))
        critic_lr.set_value(lasagne.utils.floatX(
            max(critic_lr.get_value() - delta_steps*critic_lr_step, args.critic_lr_end)))

        # Periodic checkpoint.
        if (time() - start_save)/60. > args.save_period_min:
            fname = os.path.join(
                save_dir, 'weights_updates_{}.pkl'.format(updates.value))
            actor.save(fname)
            start_save = time()

        # Keep only candidate weights that still beat the best reward
        # (and are positive), best-last so .pop() returns the top one.
        weights_rew_to_check = [(w, r) for w, r in weights_rew_to_check
                                if r > best_reward.value and r > 0]
        weights_rew_to_check = sorted(weights_rew_to_check, key=lambda x: x[1])
        # Launch a single test process at a time (guarded by `testing`).
        if ((time() - start_test) / 60. > args.test_period_min
                or len(weights_rew_to_check) > 0) and testing.value == 0:
            testing.value = 1
            print('start test')
            if len(weights_rew_to_check) > 0:
                _weights, _ = weights_rew_to_check.pop()
            else:
                _weights = weights
            worker = Process(target=test_agent,
                             args=(testing, state_transform,
                                   args.num_test_episodes, model_params,
                                   _weights, best_reward, updates,
                                   global_step, save_dir)
                             )
            worker.daemon = True
            worker.start()
            start_test = time()

    # End all processes.
    for w in workers:
        w.join()
def run_agent(model_params, weights, state_transform, data_queue, weights_queue,
              process, global_step, updates, best_reward, param_noise_prob,
              save_dir, max_steps=10000000):
    """Rollout worker: collect episodes and exchange data/weights with main.

    Rebuilds the model locally from ``model_params``, runs episodes in
    ``RunEnv2`` with Ornstein-Uhlenbeck action noise (or parameter noise),
    pushes ``(process, data, weight_send, total_reward)`` onto ``data_queue``
    after every episode, then blocks on ``weights_queue`` for fresh actor
    weights. Runs until the shared ``global_step`` reaches ``max_steps``.

    NOTE(review): a second ``run_agent`` with a different signature is defined
    later in this file and shadows this one at import time — confirm which
    definition the spawning code is meant to use.
    """
    # Only the actor/target-update parts are used here; train_fn and the lr
    # shared variables are unused by workers.
    train_fn, actor_fn, target_update_fn, params_actor, params_crit, actor_lr, critic_lr = \
        build_model(**model_params)
    actor = Agent(actor_fn, params_actor, params_crit)
    actor.set_actor_weights(weights)
    env = RunEnv2(state_transform, max_obstacles=config.num_obstacles,
                  skip_frame=config.skip_frames)
    random_process = OrnsteinUhlenbeckProcess(theta=.1, mu=0., sigma=.2,
                                              size=env.noutput,
                                              sigma_min=0.05,
                                              n_steps_annealing=1e6)
    # prepare buffers for data
    states = []
    actions = []
    rewards = []
    terminals = []

    total_episodes = 0
    start = time()
    action_noise = True
    while global_step.value < max_steps:
        # Fresh random seed per episode so workers don't replay identical runs.
        seed = random.randrange(2**32-2)
        state = env.reset(seed=seed, difficulty=2)
        random_process.reset_states()

        total_reward = 0.
        total_reward_original = 0.
        terminal = False
        steps = 0

        while not terminal:
            state = np.asarray(state, dtype='float32')
            action = actor.act(state)
            if action_noise:
                action += random_process.sample()

            next_state, reward, next_terminal, info = env.step(action)
            total_reward += reward
            total_reward_original += info['original_reward']
            steps += 1
            global_step.value += 1

            # add data to buffers; note `terminal` here is the PREVIOUS
            # step's flag, so the terminal marker is stored one row late by
            # design — the post-episode append below closes the episode.
            states.append(state)
            actions.append(action)
            rewards.append(reward)
            terminals.append(terminal)

            state = next_state
            terminal = next_terminal

            if terminal:
                break

        total_episodes += 1

        # add data to buffers after episode end (dummy action/reward row
        # carrying the final state and terminal=True)
        states.append(state)
        actions.append(np.zeros(env.noutput))
        rewards.append(0)
        terminals.append(terminal)

        states_np = np.asarray(states).astype(np.float32)
        data = (states_np,
                np.asarray(actions).astype(np.float32),
                np.asarray(rewards).astype(np.float32),
                np.asarray(terminals),
                )
        # Only ship our weights back if this episode beat the global best —
        # the main process may then test them.
        weight_send = None
        if total_reward > best_reward.value:
            weight_send = actor.get_actor_weights()
        # send data for training
        data_queue.put((process, data, weight_send, total_reward))

        # receive weights and set params to weights (blocks until the main
        # process answers)
        weights = weights_queue.get()

        report_str = 'Global step: {}, steps/sec: {:.2f}, updates: {}, episode len {}, ' \
                     'reward: {:.2f}, original_reward {:.4f}; best reward: {:.2f} noise {}'. \
            format(global_step.value, 1. * global_step.value / (time() - start),
                   updates.value, steps, total_reward, total_reward_original,
                   best_reward.value, 'actions' if action_noise else 'params')
        print(report_str)

        with open(os.path.join(save_dir, 'train_report.log'), 'a') as f:
            f.write(report_str + '\n')

        actor.set_actor_weights(weights)
        # Choose the exploration mode for the next episode: action noise with
        # prob. (1 - param_noise_prob), otherwise perturb the actor params.
        action_noise = np.random.rand() < 1 - param_noise_prob
        if not action_noise:
            set_params_noise(actor, states_np, random_process.current_sigma)

        # clear buffers
        del states[:]
        del actions[:]
        del rewards[:]
        del terminals[:]

        # Recreate the environment periodically — presumably to work around
        # simulator state/memory leaks; confirm against RunEnv2 behavior.
        if total_episodes % 100 == 0:
            env = RunEnv2(state_transform, max_obstacles=config.num_obstacles,
                          skip_frame=config.skip_frames)
def run_agent(args, model_params, weights, data_queue, weights_queue, process,
              global_step, updates, best_reward, param_noise_prob, save_dir,
              max_steps=10000000):
    """Rollout worker (prosthetics variant): collect episodes, report results.

    Rebuilds the model from ``model_params``, runs episodes in a ``RunEnv2``
    configured from ``args`` (model dimension, prosthetic flag, difficulty),
    explores with a per-worker randomized Ornstein-Uhlenbeck process or with
    parameter noise, pushes ``(process, data, weight_send, total_reward)``
    onto ``data_queue`` after every episode, and blocks on ``weights_queue``
    for fresh actor weights. Runs until ``global_step`` reaches ``max_steps``.

    NOTE(review): this shadows the earlier ``run_agent`` definition in this
    file (different signature) — confirm the spawning code passes ``args``.
    """
    train_fn, actor_fn, target_update_fn, params_actor, params_crit, actor_lr, critic_lr = build_model(
        **model_params)
    actor = Agent(actor_fn, params_actor, params_crit)
    actor.set_actor_weights(weights)

    env = RunEnv2(model=args.modeldim, prosthetic=args.prosthetic,
                  difficulty=args.difficulty, skip_frame=config.skip_frames)
    env.spec.timestep_limit = 3000  # ndrw

    # Randomize exploration per worker so the pool covers a spread of noise
    # scales, correlation times and param-noise probabilities.
    sigma_rand = random.uniform(0.05, 0.5)
    dt_rand = random.uniform(0.002, 0.02)
    param_noise_prob = random.uniform(param_noise_prob * 0.25,
                                      min(param_noise_prob * 1.5, 1.))
    random_process = OrnsteinUhlenbeckProcess(theta=.1, mu=0.,
                                              sigma=sigma_rand, dt=dt_rand,
                                              size=env.noutput,
                                              sigma_min=0.05,
                                              n_steps_annealing=1e6)
    print('OUProcess_sigma = ' + str(sigma_rand) + ' OUProcess_dt = ' + str(dt_rand) + ' param_noise_prob = ' + str(param_noise_prob))

    # prepare buffers for data
    states = []
    actions = []
    rewards = []
    terminals = []

    total_episodes = 0
    start = time()
    action_noise = True
    while global_step.value < max_steps:
        # Fresh random seed per episode so workers don't replay identical runs.
        seed = random.randrange(2**32 - 2)
        state = env.reset(seed=seed, difficulty=args.difficulty)
        random_process.reset_states()

        total_reward = 0.
        total_reward_original = 0.
        terminal = False
        steps = 0

        while not terminal:
            state = np.asarray(state, dtype='float32')
            action = actor.act(state)
            if action_noise:
                action += random_process.sample()

            # NOTE(review): calls the private `_step` (the earlier run_agent
            # uses `env.step`) — confirm this bypass of the public API is
            # intentional for RunEnv2.
            next_state, reward, next_terminal, info = env._step(action)
            total_reward += reward
            total_reward_original += info['original_reward']
            steps += 1
            global_step.value += 1

            # add data to buffers; `terminal` is the PREVIOUS step's flag, so
            # the terminal marker lands one row late by design — the
            # post-episode append below closes the episode.
            states.append(state)
            actions.append(action)
            rewards.append(reward)
            terminals.append(terminal)

            state = next_state
            terminal = next_terminal

            if terminal:
                break

        total_episodes += 1

        # add data to buffers after episode end (dummy action/reward row
        # carrying the final state and terminal=True)
        states.append(state)
        actions.append(np.zeros(env.noutput))
        rewards.append(0)
        terminals.append(terminal)

        states_np = np.asarray(states).astype(np.float32)
        data = (
            states_np,
            np.asarray(actions).astype(np.float32),
            np.asarray(rewards).astype(np.float32),
            np.asarray(terminals),
        )
        # Only ship our weights back if this episode beat the global best.
        weight_send = None
        if total_reward > best_reward.value:
            weight_send = actor.get_actor_weights()
        # send data for training
        data_queue.put((process, data, weight_send, total_reward))

        # receive weights and set params to weights (blocks until answered)
        weights = weights_queue.get()

        report_str = 'Global step: {}, steps/sec: {:.2f}, updates: {}, episode len: {}, pelvis_X: {:.2f}, pelvis_Z: {:.2f}, reward: {:.2f}, best reward: {:.2f}, noise: {}'. \
            format(global_step.value, 1. * global_step.value / (time() - start),
                   updates.value, steps, info['pelvis'][0], info['pelvis'][2],
                   total_reward, best_reward.value,
                   'actions' if action_noise else 'params')
        print(report_str)

        # A failed log write must not kill the worker, but the original bare
        # `except:` also swallowed KeyboardInterrupt/SystemExit — narrowed to
        # I/O errors only.
        try:
            with open(os.path.join(save_dir, 'train_report.log'), 'a') as f:
                f.write(report_str + '\n')
        except OSError:
            print('#############################################')
            print(
                'except » with open(os.path.join(save_dir, train_report.log), a) as f:'
            )
            print('#############################################')

        actor.set_actor_weights(weights)
        # Choose exploration mode for the next episode: action noise with
        # prob. (1 - param_noise_prob), otherwise perturb the actor params.
        action_noise = np.random.rand() < 1 - param_noise_prob
        if not action_noise:
            set_params_noise(actor, states_np, random_process.current_sigma)

        # clear buffers
        del states[:]
        del actions[:]
        del rewards[:]
        del terminals[:]

        # Recreate the environment periodically — presumably to work around
        # simulator state/memory leaks; confirm against RunEnv2 behavior.
        if total_episodes % 100 == 0:
            env = RunEnv2(model=args.modeldim, prosthetic=args.prosthetic,
                          difficulty=args.difficulty,
                          skip_frame=config.skip_frames)