# Module-level imports assumed by this excerpt. The project-local modules
# (logger, tensorboard_logging, phero_turtlebot_exp2, ActorCritic, Model, Runner)
# ship with the Turtlebot3_Pheromone repository, and the constants HOME and path
# are assumed to be defined at module level elsewhere in this file.
import os
import os.path as osp
import sys
import csv
import time
import random
from collections import deque

import numpy as np
import scipy.io as sio


def main(args):
    time_str = time.strftime("%Y%m%d-%H%M%S")
    logger_ins = logger.Logger(
        HOME + '/catkin_ws/src/Turtlebot3_Pheromone/src/log',
        output_formats=[logger.HumanOutputFormat(sys.stdout)])
    board_logger = tensorboard_logging.Logger(
        os.path.join(logger_ins.get_dir(), "tf_board", time_str))

    ########################################################
    game_state = phero_turtlebot_exp2.Env()  # game_state has frame_step(action) function
    actor_critic = ActorCritic(game_state)
    random.seed(args.random_seed)
    ########################################################

    num_trials = 600
    trial_len = 256
    log_interval = 5
    train_indicator = 1
    tfirststart = time.time()

    # Reward logging: create the CSV file and write its header
    with open(HOME + '/catkin_ws/src/Turtlebot3_Pheromone/src/log/csv/{}.csv'.format(
            actor_critic.file_name), mode='w') as csv_file:
        csv_writer = csv.writer(csv_file, delimiter=',', quotechar='"',
                                quoting=csv.QUOTE_MINIMAL)
        csv_writer.writerow(['Episode', 'Average Reward'])

    # Double-ended queue with max size 100 to store episode info
    epinfobuf = deque(maxlen=100)

    # Experiment related
    num_robots = game_state.num_robots
    current_state = game_state.reset()
    # actor_critic.read_human_data()

    step_reward = np.array([0, 0]).reshape(1, 2)
    # step_Q = [0, 0]
    step = 0

    if train_indicator == 2:
        # Debug mode: roll the policy out for short trials without training
        for i in range(num_trials):
            print("trial:" + str(i))
            # game_state.step(0.3, 0.2, 0.0)
            # game_state.reset()
            current_state = game_state.reset()
            ##############################################################################################
            total_reward = 0
            for j in range(100):
                step = step + 1
                # print("step is %s", step)
                ###########################################################################################
                # print('wanted value is %s:', game_state.observation_space.shape[0])
                current_state = current_state.reshape(
                    (1, game_state.observation_space.shape[0]))
                action, eps = actor_critic.act(current_state)
                action = action.reshape((1, game_state.action_space.shape[0]))
                print("action is speed: %s, angular: %s" % (action[0][1], action[0][0]))
                # We get reward and state here, then we need to work out
                # whether the robot crashed for the 'dones' value.
                _, new_state, reward, done, _ = game_state.step(
                    0.1, action[0][1] * 5, action[0][0] * 5)
                total_reward = total_reward + reward

    if train_indicator == 1:
        # Training mode
        # actor_critic.actor_model.load_weights("actormodel-90-1000.h5")
        # actor_critic.critic_model.load_weights("criticmodel-90-1000.h5")
        step_reward = np.array([0, 0]).reshape(1, 2)
        for i in range(num_trials):
            print("trial:" + str(i))
            # game_state.step(0.3, 0.2, 0.0)
            # game_state.reset()

            # Get states of multiple robots (num_robots x num_states)
            _, current_states = game_state.reset()
            ##############################################################################################
            # total_reward = 0
            epinfos = []
            for j in range(trial_len):
                ###########################################################################################
                # print('wanted value is %s:', game_state.observation_space.shape[0])
                current_states = current_states.reshape(
                    (num_robots, game_state.observation_space.shape[0]))

                # One action per robot
                actions = []
                for k in range(num_robots):
                    action, eps = actor_critic.act(current_states[k])
                    action = action.reshape((1, game_state.action_space.shape[0]))
                    actions.append(action)
                actions = np.squeeze(np.asarray(actions))
                # print("Actions: {}".format(actions))
                # print("action is speed: %s, angular: %s" % (action[0][1], action[0][0]))

                # We get reward and state here, then we need to work out
                # whether the robot crashed for the 'dones' value.
                _, new_states, rewards, dones, infos = game_state.step(actions, 0.1)
                # print("Rewards: {}".format(rewards))
                # total_reward = total_reward + reward
                ###########################################################################################

                # Force termination on the last step of a trial
                if j == (trial_len - 1):
                    dones = np.array([True, True]).reshape(game_state.num_robots, 1)
                    # print("this is reward:", total_reward)
                    # print('eps is', eps)

                step = step + 1
                # plot_reward(step, reward, ax, fig)
                # step_reward = np.append(step_reward, [step, reward])
                # step_start = time.time()
                # sio.savemat('step_reward.mat', {'data': step_reward}, True, '5', False, False, 'row')
                # print("step is %s", step)
                # print("info: {}".format(info[0]['episode']['r']))
                # Q_values = actor_critic.read_Q_values(current_state, action)
                # step_Q = np.append(step_Q, [step, Q_values[0][0]])
                # print("step_Q is %s", Q_values[0][0])
                # sio.savemat('step_Q.mat', {'data': step_Q}, True, '5', False, False, 'row')
                # print("Train_step time: {}".format(time.time() - step_start))

                epinfos.append(infos[0]['episode'])

                # Train the networks every 5 steps and update the target networks
                start_time = time.time()
                if j % 5 == 0:
                    actor_critic.train(j)
                    actor_critic.update_target()
                end_time = time.time()
                print("Train time: {}".format(end_time - start_time))

                # print("new_state: {}".format(new_state))
                new_states = new_states.reshape(
                    (num_robots, game_state.observation_space.shape[0]))
                # print("current_state is %s", current_state)
                ##########################################################################################

                # Store the transition in memory and in the replay buffer
                actor_critic.remember(current_states, actions, rewards, new_states, dones)
                actor_critic.replay_buffer.add(current_states, actions, rewards, new_states, dones)
                current_states = new_states
                ##########################################################################################

            if i % 10 == 0:
                actor_critic.save_weight(i, trial_len)
            epinfobuf.extend(epinfos)
            tnow = time.time()
            # fps = int(nbatch / (tnow - tstart))

            ##################################################
            ## Logging and saving model & weights           ##
            ##################################################
            if i % log_interval == 0 or i == 0:
                # ev = explained_variance(values, returns)
                reward_mean = safemean([epinfo['r'] for epinfo in epinfobuf])
                logger_ins.logkv("serial_timesteps", i * trial_len)
                logger_ins.logkv("nupdates", i)
                logger_ins.logkv("total_timesteps", i * trial_len)
                logger_ins.logkv('eprewmean', reward_mean)
                logger_ins.logkv('eplenmean',
                                 safemean([epinfo['l'] for epinfo in epinfobuf]))
                logger_ins.logkv('time_elapsed', tnow - tfirststart)
                # for (lossval, lossname) in zip(lossvals, model.loss_names):
                #     logger_ins.logkv(lossname, lossval)
                # logger_ins.dumpkvs()
                # for (lossval, lossname) in zip(lossvals, model.loss_names):
                #     board_logger.log_scalar(lossname, lossval, update)
                board_logger.log_scalar("eprewmean", reward_mean, i)
                board_logger.flush()

                with open(HOME + '/catkin_ws/src/Turtlebot3_Pheromone/src/log/csv/{}.csv'.format(
                        actor_critic.file_name), mode='a') as csv_file:
                    csv_writer = csv.writer(csv_file, delimiter=',', quotechar='"',
                                            quoting=csv.QUOTE_MINIMAL)
                    csv_writer.writerow(['%i' % i, '%0.2f' % reward_mean])

                step_reward = np.append(step_reward, [[num_trials, reward_mean]], axis=0)
                sio.savemat(
                    HOME + '/catkin_ws/src/Turtlebot3_Pheromone/src/log/MATLAB/step_reward_{}.mat'.format(
                        actor_critic.time_str),
                    {'data': step_reward}, True, '5', False, False, 'row')

    if train_indicator == 0:
        # Test mode: load trained weights and play without training
        for i in range(num_trials):
            print("trial:" + str(i))
            current_state = game_state.reset()

            actor_critic.actor_model.load_weights(path + "actormodel-2950-256.h5")
            actor_critic.critic_model.load_weights(path + "criticrmodel-2950-256.h5")
            ##############################################################################################
            total_reward = 0
            for j in range(trial_len):
                ###########################################################################################
                current_state = current_state.reshape(
                    (1, game_state.observation_space.shape[0]))

                start_time = time.time()
                # Need to change the network input/output; do I need to change
                # the output to be [0, 2*pi]?
                action = actor_critic.play(current_state)
                action = action.reshape((1, game_state.action_space.shape[0]))
                end_time = time.time()
                print(1 / (end_time - start_time), "fps for calculating next step")

                # We get reward and state here, then we need to work out
                # whether the robot crashed for the 'dones' value.
                _, new_state, reward, done = game_state.step(0.1, action[0][1], action[0][0])
                total_reward = total_reward + reward
                ###########################################################################################

                if j == (trial_len - 1):
                    done = 1
                    print("this is reward:", total_reward)

                # if j % 5 == 0:
                #     actor_critic.train()
                #     actor_critic.update_target()

                new_state = new_state.reshape((1, game_state.observation_space.shape[0]))
                # actor_critic.remember(cur_state, action, reward, new_state, done)
                # remember all the data using memory; memory data will be sampled automatically
                # cur_state = new_state
                ##########################################################################################
                # actor_critic.remember(current_state, action, reward, new_state, done)
                current_state = new_state
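
# `safemean` is called by the logging block in main() above but is not defined
# in this excerpt. A minimal sketch, assuming the OpenAI-baselines-style helper
# that returns NaN instead of raising a warning while the episode buffer is
# still empty:
def safemean(xs):
    # Average of xs, or NaN when no episode has finished yet
    return np.nan if len(xs) == 0 else np.mean(xs)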
def learn(self):
    # For logging
    time_str = time.strftime("%Y%m%d-%H%M%S")
    logger_ins = logger.Logger(
        '/home/swn/catkin_ws/src/turtlebot3_waypoint_navigation/src/log',
        output_formats=[logger.HumanOutputFormat(sys.stdout)])
    board_logger = tensorboard_logging.Logger(
        os.path.join(logger_ins.get_dir(), "tf_board", time_str))

    # Reassign the members of the class to local names for simplicity
    total_timesteps = int(self.total_timesteps)
    nenvs = 1
    # nenvs = env.num_envs  # for multiple-instance training
    ob_space = self.env.observation_space
    ac_space = self.env.action_space
    nbatch = nenvs * self.nsteps
    nminibatches = self.nminibatches
    nbatch_train = nbatch // nminibatches
    noptepochs = self.noptepochs
    nsteps = self.nsteps
    save_interval = self.save_interval
    log_interval = self.log_interval
    restore_path = self.restore_path
    gamma = self.gamma
    lam = self.lam
    lr = self.lr
    cliprange = self.cliprange
    deterministic = self.deterministic
    step_reward = [[0.0, 0.0]]

    # Define a function that builds the Actor-Critic model
    make_model = lambda: Model(policy=self.policy, ob_space=ob_space, ac_space=ac_space,
                               nbatch_act=nenvs, nbatch_train=nbatch_train,
                               nsteps=self.nsteps, ent_coef=self.ent_coef,
                               vf_coef=self.vf_coef, max_grad_norm=self.max_grad_norm,
                               deterministic=self.deterministic)

    # Save function
    # if save_interval and logger_ins.get_dir():
    #     import cloudpickle
    #     with open(osp.join(logger_ins.get_dir(), 'make_model.pkl'), 'wb') as fh:
    #         fh.write(cloudpickle.dumps(make_model))

    # Make a model
    model = make_model()

    # Restore when the path is provided
    if restore_path is not None:
        model.restore(restore_path)

    # Create a runner instance (generating samples with nsteps)
    runner = Runner(env=self.env, model=model, nsteps=nsteps, gamma=gamma, lam=lam)

    # Double-ended queue with max size 100 to store episode info
    epinfobuf = deque(maxlen=100)

    # Get the start time
    tfirststart = time.time()

    # Calculate the number of updates (iterations)
    nupdates = total_timesteps // nbatch
    assert nupdates > 0

    '''
    PPO (iterating)
    1. Run the policy in the environment for T timesteps
    2. Compute advantage estimates (in Model class)
    3. Optimise the loss w.r.t. the policy weights, with K epochs and
       minibatch size M < N (# of actors) * T (timesteps)
    4. Update weights (in Model class)
    '''
    # In every update, one loop of the PPO algorithm is executed
    for update in range(1, nupdates + 1):
        # INITIALISE PARAMETERS
        assert nbatch % nminibatches == 0
        nbatch_train = nbatch // nminibatches
        tstart = time.time()
        frac = 1.0 - (update - 1.0) / nupdates
        lrnow = lr(frac)
        cliprangenow = cliprange(frac)

        # 1. Run the policy and get samples for nsteps
        ids, obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run()
        epinfobuf.extend(epinfos)
        mblossvals = []

        # Do not train or log if in deterministic mode
        if deterministic:
            continue

        # 3. Optimise the loss w.r.t. the policy weights, with K epochs and
        #    minibatch size M < N (# of actors) * T (timesteps)
        if states is None:  # nonrecurrent version
            inds = np.arange(nbatch)  # minibatch indices (shuffling left disabled)
            # Update weights using the optimiser for noptepochs epochs
            for _ in range(noptepochs):
                # np.random.shuffle(inds)
                # In each epoch, update the weights on one minibatch of the
                # total batch at a time (epoch = m(32) * minibatch(4))
                for start in range(0, nbatch, nbatch_train):
                    end = start + nbatch_train
                    mbinds = inds[start:end]
                    returns_np = np.asarray(returns[mbinds])
                    # 4. Update weights
                    mblossvals.append(model.train(lrnow, cliprangenow, ids[mbinds],
                                                  [obs[i] for i in mbinds],
                                                  returns[mbinds], masks[mbinds],
                                                  actions[mbinds], values[mbinds],
                                                  neglogpacs[mbinds]))
        else:  # recurrent version
            assert nenvs % nminibatches == 0
            envsperbatch = nenvs // nminibatches
            envinds = np.arange(nenvs)
            flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps)
            envsperbatch = nbatch_train // nsteps
            for _ in range(noptepochs):
                # np.random.shuffle(envinds)
                for start in range(0, nenvs, envsperbatch):
                    end = start + envsperbatch
                    mbenvinds = envinds[start:end]
                    mbflatinds = flatinds[mbenvinds].ravel()
                    print(mbflatinds)
                    slices = (arr[mbflatinds] for arr in
                              (obs, returns, masks, actions, values, neglogpacs))
                    mbstates = states[mbenvinds]
                    # 4. Update weights
                    mblossvals.append(model.train(lrnow, cliprangenow,
                                                  [obs[i] for i in mbflatinds],
                                                  returns[mbflatinds], masks[mbflatinds],
                                                  actions[mbflatinds], values[mbflatinds],
                                                  neglogpacs[mbflatinds], mbstates))

        # Calculate the mean loss
        lossvals = np.mean(mblossvals, axis=0)
        tnow = time.time()
        fps = int(nbatch / (tnow - tstart))

        '''
        Logging and saving model & weights
        '''
        if update % log_interval == 0 or update == 1:
            # ev = explained_variance(values, returns)
            logger_ins.logkv("serial_timesteps", update * nsteps)
            logger_ins.logkv("nupdates", update)
            logger_ins.logkv("total_timesteps", update * nbatch)
            logger_ins.logkv("fps", fps)
            # logger.logkv("explained_variance", float(ev))
            logger_ins.logkv('eprewmean',
                             self.safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger_ins.logkv('eplenmean',
                             self.safemean([epinfo['l'] for epinfo in epinfobuf]))
            logger_ins.logkv('time_elapsed', tnow - tfirststart)
            for (lossval, lossname) in zip(lossvals, model.loss_names):
                logger_ins.logkv(lossname, lossval)
            logger_ins.dumpkvs()
            for (lossval, lossname) in zip(lossvals, model.loss_names):
                board_logger.log_scalar(lossname, lossval, update)
            board_logger.log_scalar("eprewmean",
                                    self.safemean([epinfo['r'] for epinfo in epinfobuf]),
                                    update)
            board_logger.flush()

            reward_arr = np.asarray([epinfo['r'] for epinfo in epinfobuf])
            # reward_new = np.delete(reward_arr, np.where(reward_arr == 0.0))
            step_reward = np.append(step_reward,
                                    [[update, self.safemean([reward for reward in reward_arr])]],
                                    axis=0)
            sio.savemat(
                '/home/swn/catkin_ws/src/Turtlebot3_Pheromone/src/log/MATLAB/step_reward_{}.mat'.format(
                    self.time_str),
                {'data': step_reward}, True, '5', False, False, 'row')

        if save_interval and (update % save_interval == 0 or update == 1) and logger_ins.get_dir():
            checkdir = osp.join(logger_ins.get_dir(), 'checkpoints', '{}'.format(self.time_str))
            if not os.path.isdir(checkdir):
                os.makedirs(checkdir)
            savepath = osp.join(checkdir, '%.5i' % update + "r" + "{:.2f}".format(
                self.safemean([epinfo['r'] for epinfo in epinfobuf])))
            print('Saving to', savepath)
            model.save(savepath)

    print("Done with training. Exiting.")
    self.env.close()
    return model
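
# In learn() above, lr and cliprange are called as functions of the remaining
# progress fraction (lr(frac), cliprange(frac)). A minimal sketch of wrapping
# constant hyperparameters into that form; `constfn` is a hypothetical helper
# (named after the OpenAI baselines convention) and is not part of this file:
def constfn(val):
    def f(_):
        return val
    return f

# Assumed usage: agent.lr = constfn(3e-4); agent.cliprange = constfn(0.2)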