def cal_td_error(self, gamma, init_value):
    '''
    Compute the TD error: TD = r + gamma * (1 - done) * v(s') - v(s)
    '''
    assert 'value' in self.buffer.keys()
    self.buffer['td_error'] = sth.discounted_sum_minus(
        self.buffer['r'],
        gamma,
        init_value,
        self.buffer['done'],
        self.buffer['value'])
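# A minimal sketch, not the project's actual helper: one plausible reading of
# the sth.discounted_sum_minus call above, assuming it returns the one-step
# TD error r_t + gamma * (1 - done_t) * v(s_{t+1}) - v(s_t), with init_value
# standing in for the value of the state that follows the last stored step.
def discounted_sum_minus_sketch(r, gamma, init_value, dones, values):
    td_errors = []
    next_value = init_value
    # walk the trajectory backwards so v(s') is always the successor value;
    # an episode end (done == 1) zeroes out the bootstrap term
    for reward, done, value in zip(reversed(list(r)), reversed(list(dones)),
                                   reversed(list(values))):
        td_errors.append(reward + gamma * (1 - done) * next_value - value)
        next_value = value
    td_errors.reverse()
    return td_errors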
def calculate_statistics(self):
    self.data['total_reward'] = sth.discounted_sum(
        self.data.r.values, 1, 0, self.data.done.values)
    self.data['discounted_reward'] = sth.discounted_sum(
        self.data.r.values, self.gamma, self.data.next_value.values[-1],
        self.data.done.values)
    self.data['td_error'] = sth.discounted_sum_minus(
        self.data.r.values, self.gamma, self.data.next_value.values[-1],
        self.data.done.values, self.data.value.values)
    # GAE
    self.data['advantage'] = sth.discounted_sum(
        self.data.td_error.values, self.lambda_ * self.gamma, 0,
        self.data.done.values)
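# A minimal sketch, not the library's implementation: a discounted_sum
# compatible with the calls above, assuming it accumulates
# x_t + gamma * (1 - done_t) * running backwards through the trajectory,
# bootstrapped from init_value. Called with the TD errors and a decay of
# lambda_ * gamma, this backward recursion is exactly the GAE advantage.
def discounted_sum_sketch(x, gamma, init_value, dones):
    out = []
    running = init_value
    for value, done in zip(reversed(list(x)), reversed(list(dones))):
        running = value + gamma * (1 - done) * running
        out.append(running)
    out.reverse()
    return out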
def calculate_statistics(self):
    init_value = np.squeeze(
        self._get_value(self.s_, self.visual_s_).numpy())
    self.data['total_reward'] = sth.discounted_sum(
        self.data.r.values, 1, init_value, self.data.done.values)
    self.data['discounted_reward'] = sth.discounted_sum(
        self.data.r.values, self.gamma, init_value, self.data.done.values)
    self.data['td_error'] = sth.discounted_sum_minus(
        self.data.r.values, self.gamma, init_value,
        self.data.done.values, self.data.value.values)
    # GAE
    adv = np.asarray(
        sth.discounted_sum(self.data.td_error.values,
                           self.lambda_ * self.gamma, 0,
                           self.data.done.values))
    self.data['advantage'] = list(standardization(adv))
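# A minimal sketch, assuming standardization is the usual advantage
# normalization to zero mean and unit variance; the epsilon guards against
# division by zero when all advantages are equal.
import numpy as np

def standardization_sketch(x, eps=1e-8):
    x = np.asarray(x, dtype=np.float64)
    return (x - x.mean()) / (x.std() + eps)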
def calculate_statistics(self):
    self.data['total_reward'] = sth.discounted_sum(
        self.data.r.values, 1, 0, self.data.done.values)
    init_value = np.squeeze(
        self.sess.run(self.value,
                      feed_dict={
                          self.pl_visual_s: self.visual_s_,
                          self.pl_s: self.s_,
                          self.sigma_offset: np.full(self.a_counts, 0.01)
                      }))
    self.data['discounted_reward'] = sth.discounted_sum(
        self.data.r.values, self.gamma, init_value, self.data.done.values)
    self.data['td_error'] = sth.discounted_sum_minus(
        self.data.r.values, self.gamma, init_value,
        self.data.done.values, self.data.value.values)
    # GAE
    self.data['advantage'] = sth.discounted_sum(
        self.data.td_error.values, self.lambda_ * self.gamma, 0,
        self.data.done.values)
def train_OnPolicy(sess, env, brain_name, begin_episode, model, recorder,
                   cp_file, hyper_config, train_config):
    base_agents_num = train_config['reset_config']['copy']
    sigma_offset = np.zeros(model.a_counts) + hyper_config['base_sigma']
    for episode in range(begin_episode, train_config['max_episode']):
        recorder.logger.info('-' * 30 + str(episode) + ' ๑乛◡乛๑ ' +
                             train_config['algorithm'].name + '-' * 30)
        if EXIT:
            return
        if episode % train_config['save_frequency'] == 0:
            start = time.time()
            recorder.saver.save(sess,
                                cp_file,
                                global_step=episode,
                                write_meta_graph=False)
            end = time.time()
            recorder.logger.info(f'save checkpoint cost time: {end - start}')
        model_lr = model.decay_lr(episode)
        step = 0
        total_reward = 0.
        total_discounted_reward = 0.
        discounted_reward = 0
        flag = False
        # start = time.time()
        # for i in range(agents_num):
        #     data[f'{i}'].drop(data[f'{i}'].index, inplace=True)
        # end = time.time()
        # recorder.logger.info(f'clear dataframe cost time: {end - start}')
        start = time.time()
        obs = env.reset(config=train_config['reset_config'],
                        train_mode=True)[brain_name]
        agents_num = len(obs.agents)
        end = time.time()
        recorder.logger.info(f'reset envs cost time: {end - start}')
        state_ = obs.vector_observations
        dones_flag = np.zeros(agents_num)
        dones_flag_sup = np.full(
            agents_num,
            -1) if train_config['start_continuous_done'] else np.zeros(
                agents_num)
        if not train_config['use_trick']:
            sigma_offset = np.zeros(model.a_counts) + \
                hyper_config['base_sigma']
        start = time.time()
        data = {
            f'{i}': pd.DataFrame(columns=[
                'state', 'action', 'old_prob', 'reward', 'next_state', 'done'
            ])
            for i in range(agents_num)
        }
        end = time.time()
        recorder.logger.info(f'create dataframe cost time: {end - start}')
        # collect one batch of on-policy trajectories
        start = time.time()
        while True:
            state = state_
            prob, action = model.choose_action(s=state,
                                               sigma_offset=sigma_offset)
            obs = env.step(action)[brain_name]
            step += 1
            reward = obs.rewards
            state_ = obs.vector_observations
            dones_flag += obs.local_done
            dones_flag_sup += obs.local_done
            for i in range(agents_num):
                data[f'{i}'] = data[f'{i}'].append(
                    {
                        'state': state[i],
                        'action': action[i],
                        'old_prob': prob[i] + 1e-10,
                        'next_state': state_[i],
                        'reward': reward[i],
                        'done': obs.local_done[i]
                    },
                    ignore_index=True)
            if train_config['till_all_done']:
                sample_time = time.time() - start
                if all(dones_flag) and all(
                        dones_flag_sup
                ) or sample_time > train_config['max_sample_time']:
                    train_config['init_max_step'] = step
                    recorder.logger.info(
                        f'(interactive)collect data cost time: {sample_time}')
                    break
            elif step >= train_config['init_max_step']:
                sample_time = time.time() - start
                recorder.logger.info(
                    f'(interactive)collect data cost time: {sample_time}')
                if sample_time > train_config['max_sample_time']:
                    train_config['max_step'] = train_config['init_max_step']
                break
        # per-agent returns, TD errors and GAE advantages
        start = time.time()
        dones = 0
        hits = 0
        for i in range(agents_num):
            done_index = data[f'{i}'][data[f'{i}'].done == True].index.tolist()
            hit_index = data[f'{i}'][data[f'{i}'].reward > 0].index.tolist()
            dones += len(done_index)
            hits += len(hit_index)
            if len(done_index):
                recorder.logger.info(
                    f'[Agent {i}] dones: {len(done_index)} \thits: {len(hit_index)} \thit ratio: {len(hit_index)/len(done_index):.2%}'
                )
            else:
                recorder.logger.info(f'[Agent {i}] no done.')
                flag = True
            data[f'{i}']['value'] = model.get_state_value(
                s=data[f'{i}']['state'].values.tolist(),
                sigma_offset=sigma_offset)
            value_ = model.get_state_value(s=[state_[i]],
                                           sigma_offset=sigma_offset)
            if not data[f'{i}']['done'].values[-1]:
                discounted_reward = value_
            data[f'{i}']['total_reward'] = sth.discounted_sum(
                data[f'{i}']['reward'], 1, data[f'{i}']['reward'].values[-1],
                done_index, train_config['init_max_step'])
            if train_config['algorithm'].value <= 3:
                data[f'{i}']['discounted_reward'] = sth.discounted_sum(
                    data[f'{i}']['reward'], hyper_config['gamma'],
                    discounted_reward, done_index,
                    train_config['init_max_step'])
                data[f'{i}']['td_error'] = sth.discounted_sum_minus(
                    data[f'{i}']['reward'].values, hyper_config['gamma'],
                    value_, done_index, data[f'{i}']['value'].values,
                    train_config['init_max_step'])
                data[f'{i}']['advantage'] = sth.discounted_sum(
                    data[f'{i}']['td_error'],
                    hyper_config['lambda'] * hyper_config['gamma'], 0,
                    done_index, train_config['init_max_step'])
            else:
                data[f'{i}']['discounted_reward'] = sth.discounted_sum(
                    data[f'{i}']['reward'], hyper_config['gamma'],
                    discounted_reward, done_index,
                    train_config['init_max_step'], data[f'{i}']['value'])
                data[f'{i}']['advantage'] = None
            total_reward += (data[f'{i}']['total_reward'][0] / agents_num)
            total_discounted_reward += (
                data[f'{i}']['discounted_reward'][0] / agents_num)
        if dones:
            recorder.logger.info(
                f'#Agents Num#: {agents_num} \ttotal_dones: {dones} \ttotal_hits: {hits} \tratio: {hits/dones:.2%}'
            )
        else:
            recorder.logger.info(
                f'#Agents Num#: {agents_num} \tOMG! ALL AGENTS NO DONE.')
        end = time.time()
        recorder.logger.info(f'calculate cost time: {end - start}')
        '''
        excel record
        '''
        if train_config['excel_record'] and episode % train_config[
                'excel_record_frequency'] == 0:
            start = time.time()
            data['0'].to_excel(recorder.excel_writer,
                               sheet_name=f'{episode}',
                               index=True)
            recorder.excel_writer.save()
            end = time.time()
            recorder.logger.info(
                f'save data to excel cost time: {end - start}')
        '''
        mongodb record
        '''
        if train_config['mongo_record'] and episode % train_config[
                'mongo_record_frequency'] == 0:
            start = time.time()
            if train_config['mongo_record_all']:
                for i in range(agents_num):
                    recorder.mongodb[f'e{episode}a{i}'].insert(
                        json.loads(data[f'{i}'].T.to_json()).values())
            else:
                recorder.mongodb[f'e{episode}a'].insert(
                    json.loads(data['0'].T.to_json()).values())
            end = time.time()
            recorder.logger.info(
                f'save data to MongoDB cost time: {end - start}')
        # mini-batch updates of the policy and value networks
        start = time.time()
        for j in range(agents_num):
            for _ in range(train_config['epoch']):
                for i in range(0, train_config['init_max_step'],
                               train_config['batchsize']):
                    if train_config['random_batch']:
                        i_data = data[f'{j}'].sample(
                            n=train_config['batchsize']
                        ) if train_config['batchsize'] < train_config[
                            'init_max_step'] else data[f'{j}']
                    else:
                        i_data = data[f'{j}'].iloc[
                            i:i + train_config['batchsize'], :]
                    model.learn(
                        s=i_data['state'].values.tolist(),
                        a=i_data['action'].values.tolist(),
                        r=i_data['reward'].values[:, np.newaxis],
                        s_=i_data['next_state'].values.tolist(),
                        dc_r=i_data['discounted_reward'].values[:, np.newaxis],
                        episode=episode,
                        sigma_offset=sigma_offset,
                        old_prob=i_data['old_prob'].values.tolist(),
                        advantage=i_data['advantage'].values[:, np.newaxis])
        learn_time = time.time() - start
        recorder.logger.info(f'learn cost time: {learn_time}')
        if train_config['dynamic_allocation']:
            train_config['reset_config']['copy'] += 1 if hits > (
                agents_num * 2 if train_config['start_continuous_done'] else
                agents_num) else (-2 if train_config['reset_config']['copy'] >
                                  base_agents_num else 0)
        # gather statistics for logging
        start = time.time()
        a_loss = np.array([
            model.get_actor_loss(
                s=data[f'{i}']['state'].values.tolist(),
                sigma_offset=sigma_offset,
                a=data[f'{i}']['action'].values.tolist(),
                old_prob=data[f'{i}']['old_prob'].values.tolist(),
                advantage=data[f'{i}']['advantage'].values[:, np.newaxis])
            for i in range(agents_num)
        ]).mean()
        c_loss = np.array([
            model.get_critic_loss(
                s=data[f'{i}']['state'].values.tolist(),
                a=data[f'{i}']['action'].values.tolist(),
                r=data[f'{i}']['reward'].values[:, np.newaxis],
                s_=data[f'{i}']['next_state'].values.tolist(),
                dc_r=data[f'{i}']['discounted_reward'].values[:, np.newaxis],
                sigma_offset=sigma_offset) for i in range(agents_num)
        ]).mean()
        entropy = np.array([
            model.get_entropy(s=data[f'{i}']['state'].values.tolist(),
                              sigma_offset=sigma_offset)
            for i in range(agents_num)
        ]).mean(axis=0)
        sigma = np.array([
            model.get_sigma(s=data[f'{i}']['state'].values.tolist(),
                            sigma_offset=sigma_offset)
            for i in range(agents_num)
        ]).mean(axis=0)
        sigma_offset = np.array(
            [np.log(c_loss + 1)] * model.a_counts) + hyper_config['base_sigma']
        end = time.time()
        recorder.logger.info(f'get statistics cost time: {end - start}')
        writer_summary(recorder.writer, episode, [{
            'tag': 'TIME/sample_time',
            'value': sample_time
        }, {
            'tag': 'TIME/steps',
            'value': step
        }, {
            'tag': 'TIME/agents_num',
            'value': agents_num
        }])
        writer_summary(recorder.writer, episode, [{
            'tag': 'REWARD/discounted_reward',
            'value': total_discounted_reward
        }, {
            'tag': 'REWARD/reward',
            'value': total_reward
        }, {
            'tag': 'REWARD/accuracy',
            'value': hits / dones if dones else dones
        }, {
            'tag': 'LEARNING_RATE/lr',
            'value': model_lr
        }, {
            'tag': 'LOSS/actor_loss',
            'value': a_loss
        }, {
            'tag': 'LOSS/critic_loss',
            'value': c_loss
        }, {
            'tag': 'LOSS/actor_entropy_max',
            'value': entropy.max()
        }, {
            'tag': 'LOSS/actor_entropy_min',
            'value': entropy.min()
        }, {
            'tag': 'LOSS/actor_entropy_mean',
            'value': entropy.mean()
        }, {
            'tag': 'PARAMETERS/sigma',
            'value': sigma.max()
        }])
        if flag and train_config['init_max_step'] < train_config['max_step']:
            train_config['init_max_step'] += 10
        else:
            train_config['init_max_step'] -= 10
        recorder.logger.info(
            'episode: {0} steps: {1} dc_reward: {2} reward: {3}'.format(
                episode, step, total_discounted_reward, total_reward))
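# A minimal sketch, assuming writer_summary is a thin TensorFlow 1.x helper
# that converts the [{'tag': ..., 'value': ...}] lists used above into scalar
# summaries on recorder.writer (a tf.summary.FileWriter); the project's real
# helper may differ.
import tensorflow as tf

def writer_summary_sketch(writer, global_step, summaries):
    summary = tf.Summary(value=[
        tf.Summary.Value(tag=item['tag'], simple_value=float(item['value']))
        for item in summaries
    ])
    writer.add_summary(summary, global_step)
    writer.flush()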
def train_OnPolicy(sess, env, brain_name, begin_episode, model, hyper_config,
                   train_config):
    sigma_offset = np.zeros(model.a_counts) + hyper_config['base_sigma']
    for episode in range(begin_episode, train_config['max_episode']):
        print('-' * 30 + str(episode) + ' ๑乛◡乛๑ ' +
              train_config['algorithm'].name + '-' * 30)
        if EXIT:
            return
        step = 0
        total_reward = 0.
        total_discounted_reward = 0.
        discounted_reward = 0
        start = time.time()
        obs = env.reset(train_mode=True)[brain_name]
        agents_num = len(obs.agents)
        end = time.time()
        print(f'reset envs cost time: {end - start}')
        state_ = obs.vector_observations
        dones_flag = np.zeros(agents_num)
        start = time.time()
        data = {
            f'{i}': pd.DataFrame(columns=[
                'state', 'action', 'old_prob', 'reward', 'next_state', 'done'
            ])
            for i in range(agents_num)
        }
        end = time.time()
        print(f'create dataframe cost time: {end - start}')
        # collect one batch of on-policy trajectories
        start = time.time()
        while True:
            state = state_
            prob, action = model.choose_action(s=state,
                                               sigma_offset=sigma_offset)
            obs = env.step(action)[brain_name]
            step += 1
            reward = obs.rewards
            state_ = obs.vector_observations
            dones_flag += obs.local_done
            for i in range(agents_num):
                data[f'{i}'] = data[f'{i}'].append(
                    {
                        'state': state[i],
                        'action': action[i],
                        'old_prob': prob[i] + 1e-10,
                        'next_state': state_[i],
                        'reward': reward[i],
                        'done': obs.local_done[i]
                    },
                    ignore_index=True)
            if train_config['till_all_done']:
                sample_time = time.time() - start
                if all(dones_flag
                       ) or sample_time > train_config['max_sample_time']:
                    train_config['init_max_step'] = step
                    print(
                        f'(interactive)collect data cost time: {sample_time}')
                    break
            elif step >= train_config['init_max_step']:
                sample_time = time.time() - start
                print(f'(interactive)collect data cost time: {sample_time}')
                if sample_time > train_config['max_sample_time']:
                    train_config['max_step'] = train_config['init_max_step']
                break
        # per-agent returns, TD errors and GAE advantages
        start = time.time()
        dones = 0
        hits = 0
        for i in range(agents_num):
            done_index = data[f'{i}'][data[f'{i}'].done == True].index.tolist()
            hit_index = data[f'{i}'][data[f'{i}'].reward > 0].index.tolist()
            dones += len(done_index)
            hits += len(hit_index)
            if len(done_index):
                print(
                    f'[Agent {i}] dones: {len(done_index)} \thits: {len(hit_index)} \thit ratio: {len(hit_index)/len(done_index):.2%}'
                )
            else:
                print(f'[Agent {i}] no done.')
            data[f'{i}']['value'] = model.get_state_value(
                s=data[f'{i}']['state'].values.tolist(),
                sigma_offset=sigma_offset)
            value_ = model.get_state_value(s=[state_[i]],
                                           sigma_offset=sigma_offset)
            if not data[f'{i}']['done'].values[-1]:
                discounted_reward = value_
            data[f'{i}']['total_reward'] = sth.discounted_sum(
                data[f'{i}']['reward'], 1, data[f'{i}']['reward'].values[-1],
                done_index, train_config['init_max_step'])
            if train_config['algorithm'].value <= 3:
                data[f'{i}']['discounted_reward'] = sth.discounted_sum(
                    data[f'{i}']['reward'], hyper_config['gamma'],
                    discounted_reward, done_index,
                    train_config['init_max_step'])
                data[f'{i}']['td_error'] = sth.discounted_sum_minus(
                    data[f'{i}']['reward'].values, hyper_config['gamma'],
                    value_, done_index, data[f'{i}']['value'].values,
                    train_config['init_max_step'])
                data[f'{i}']['advantage'] = sth.discounted_sum(
                    data[f'{i}']['td_error'],
                    hyper_config['lambda'] * hyper_config['gamma'], 0,
                    done_index, train_config['init_max_step'])
            else:
                data[f'{i}']['discounted_reward'] = sth.discounted_sum(
                    data[f'{i}']['reward'], hyper_config['gamma'],
                    discounted_reward, done_index,
                    train_config['init_max_step'], data[f'{i}']['value'])
                data[f'{i}']['advantage'] = None
            total_reward += (data[f'{i}']['total_reward'][0] / agents_num)
            total_discounted_reward += (
                data[f'{i}']['discounted_reward'][0] / agents_num)
        if dones:
            print(
                f'#Agents Num#: {agents_num} \ttotal_dones: {dones} \ttotal_hits: {hits} \tratio: {hits/dones:.2%}'
            )
        else:
            print(f'#Agents Num#: {agents_num} \tOMG! ALL AGENTS NO DONE.')
        end = time.time()
        print(f'calculate cost time: {end - start}')
        # mini-batch updates of the policy and value networks
        start = time.time()
        for j in range(agents_num):
            for _ in range(train_config['epoch']):
                for i in range(0, train_config['init_max_step'],
                               train_config['batchsize']):
                    if train_config['random_batch']:
                        i_data = data[f'{j}'].sample(
                            n=train_config['batchsize']
                        ) if train_config['batchsize'] < train_config[
                            'init_max_step'] else data[f'{j}']
                    else:
                        i_data = data[f'{j}'].iloc[
                            i:i + train_config['batchsize'], :]
                    model.learn(
                        s=i_data['state'].values.tolist(),
                        a=i_data['action'].values.tolist(),
                        r=i_data['reward'].values[:, np.newaxis],
                        s_=i_data['next_state'].values.tolist(),
                        dc_r=i_data['discounted_reward'].values[:, np.newaxis],
                        episode=episode,
                        sigma_offset=sigma_offset,
                        old_prob=i_data['old_prob'].values.tolist(),
                        advantage=i_data['advantage'].values[:, np.newaxis])
        learn_time = time.time() - start
        print(f'learn cost time: {learn_time}')
        print('episode: {0} steps: {1} dc_reward: {2} reward: {3}'.format(
            episode, step, total_discounted_reward, total_reward))