Example #1
 def cal_td_error(self, gamma, init_value):
     '''
     Compute the TD error:
     TD = r + gamma * (1 - done) * v(s') - v(s)
     '''
     assert 'value' in self.buffer.keys()
     self.buffer['td_error'] = sth.discounted_sum_minus(
         self.buffer['r'], gamma, init_value, self.buffer['done'],
         self.buffer['value'])
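
The helper sth.discounted_sum_minus is not shown on this page. Based on the docstring above, a minimal sketch of what it presumably computes is given below; this is a hypothetical re-implementation for a single trajectory, and the actual helper in the RLs repositories may differ in signature and return type:

import numpy as np

def discounted_sum_minus(r, gamma, init_value, done, value):
    # Assumed semantics: one-step TD errors,
    #   td[t] = r[t] + gamma * (1 - done[t]) * v(s_{t+1}) - v(s_t),
    # where the value of the state after the final transition is the
    # bootstrap estimate init_value.
    r = np.asarray(r, dtype=np.float64)
    done = np.asarray(done, dtype=np.float64)
    value = np.asarray(value, dtype=np.float64)
    next_value = np.append(value[1:], init_value)  # v(s') for every step
    return r + gamma * (1.0 - done) * next_value - value
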
Example #2
File: ppo.py  Project: familywei/RLs
 def calculate_statistics(self):
     self.data['total_reward'] = sth.discounted_sum(self.data.r.values, 1,
                                                    0,
                                                    self.data.done.values)
     self.data['discounted_reward'] = sth.discounted_sum(
         self.data.r.values, self.gamma, self.data.next_value.values[-1],
         self.data.done.values)
     self.data['td_error'] = sth.discounted_sum_minus(
         self.data.r.values, self.gamma, self.data.next_value.values[-1],
         self.data.done.values, self.data.value.values)
     # GAE
     self.data['advantage'] = sth.discounted_sum(self.data.td_error.values,
                                                 self.lambda_ * self.gamma,
                                                 0, self.data.done.values)
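
The GAE step above reuses the same discounted-sum helper: the advantage is the lambda * gamma discounted sum of the TD errors, reset at episode boundaries. A minimal sketch of such a backward accumulation, assuming sth.discounted_sum behaves like a standard bootstrapped return (the real helper may differ):

import numpy as np

def discounted_sum(x, gamma, init_value, done):
    # Assumed semantics: y[t] = x[t] + gamma * (1 - done[t]) * y[t+1],
    # seeded with init_value beyond the last step. With x = rewards this
    # yields bootstrapped returns; with x = TD errors and gamma replaced
    # by lambda * gamma it yields GAE advantages.
    x = np.asarray(x, dtype=np.float64)
    done = np.asarray(done, dtype=np.float64)
    y = np.zeros_like(x)
    running = init_value
    for t in reversed(range(len(x))):
        running = x[t] + gamma * (1.0 - done[t]) * running
        y[t] = running
    return y
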
Example #3
File: trpo.py  Project: Abluceli/RLs
 def calculate_statistics(self):
     init_value = np.squeeze(
         self._get_value(self.s_, self.visual_s_).numpy())
     self.data['total_reward'] = sth.discounted_sum(self.data.r.values, 1,
                                                    init_value,
                                                    self.data.done.values)
     self.data['discounted_reward'] = sth.discounted_sum(
         self.data.r.values, self.gamma, init_value, self.data.done.values)
     self.data['td_error'] = sth.discounted_sum_minus(
         self.data.r.values, self.gamma, init_value, self.data.done.values,
         self.data.value.values)
     # GAE
     adv = np.asarray(
         sth.discounted_sum(self.data.td_error.values,
                            self.lambda_ * self.gamma, 0,
                            self.data.done.values))
     self.data['advantage'] = list(standardization(adv))
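
standardization is another external helper; it presumably just rescales the GAE advantages to zero mean and unit variance before the TRPO update. A hedged sketch of that assumption:

import numpy as np

def standardization(x, eps=1e-8):
    # Assumed behaviour: center and scale the advantage estimates.
    x = np.asarray(x, dtype=np.float64)
    return (x - x.mean()) / (x.std() + eps)
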
Example #4
File: ppo.py  Project: kasimte/RLs
 def calculate_statistics(self):
     self.data['total_reward'] = sth.discounted_sum(self.data.r.values, 1,
                                                    0,
                                                    self.data.done.values)
     init_value = np.squeeze(
         self.sess.run(self.value,
                       feed_dict={
                           self.pl_visual_s: self.visual_s_,
                           self.pl_s: self.s_,
                           self.sigma_offset: np.full(self.a_counts, 0.01)
                       }))
     self.data['discounted_reward'] = sth.discounted_sum(
         self.data.r.values, self.gamma, init_value, self.data.done.values)
     self.data['td_error'] = sth.discounted_sum_minus(
         self.data.r.values, self.gamma, init_value, self.data.done.values,
         self.data.value.values)
     # GAE
     self.data['advantage'] = sth.discounted_sum(self.data.td_error.values,
                                                 self.lambda_ * self.gamma,
                                                 0, self.data.done.values)
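
Chaining the sketch helpers defined above on a toy trajectory shows how the three statistics relate (hypothetical numbers, not taken from any of the listed projects):

import numpy as np

r     = np.array([1.0, 0.0, 1.0])   # rewards
done  = np.array([0.0, 0.0, 1.0])   # the episode terminates at the last step
value = np.array([0.5, 0.4, 0.6])   # critic estimates v(s_t)
next_value = 0.0                    # bootstrap v(s') after the last step
gamma, lambda_ = 0.99, 0.95

dc_r = discounted_sum(r, gamma, next_value, done)               # returns
td   = discounted_sum_minus(r, gamma, next_value, done, value)  # TD errors
adv  = discounted_sum(td, lambda_ * gamma, 0.0, done)           # GAE
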
Example #5
def train_OnPolicy(sess, env, brain_name, begin_episode, model, recorder,
                   cp_file, hyper_config, train_config):
    base_agents_num = train_config['reset_config']['copy']
    sigma_offset = np.zeros(model.a_counts) + hyper_config['base_sigma']
    for episode in range(begin_episode, train_config['max_episode']):
        recorder.logger.info('-' * 30 + str(episode) + ' ๑乛◡乛๑ ' +
                             train_config['algorithm'].name + '-' * 30)
        if EXIT:
            return
        if (episode % train_config['save_frequency'] == 0):
            start = time.time()
            recorder.saver.save(sess,
                                cp_file,
                                global_step=episode,
                                write_meta_graph=False)
            end = time.time()
            recorder.logger.info(f'save checkpoint cost time: {end - start}')
        model_lr = model.decay_lr(episode)
        step = 0
        total_reward = 0.
        total_discounted_reward = 0.
        discounted_reward = 0
        flag = False
        # start = time.time()
        # for i in range(agents_num):
        #     data[f'{i}'].drop(data[f'{i}'].index, inplace=True)
        # end = time.time()
        # recorder.logger.info(f'clear dataframe cost time: {end - start}')
        start = time.time()
        obs = env.reset(config=train_config['reset_config'],
                        train_mode=True)[brain_name]
        agents_num = len(obs.agents)
        end = time.time()
        recorder.logger.info(f'reset envs cost time: {end - start}')
        state_ = obs.vector_observations
        dones_flag = np.zeros(agents_num)
        dones_flag_sup = np.full(
            agents_num, -1
        ) if train_config['start_continuous_done'] else np.zeros(agents_num)
        if not train_config['use_trick']:
            sigma_offset = np.zeros(model.a_counts) + \
                hyper_config['base_sigma']
        start = time.time()
        data = {
            f'{i}': pd.DataFrame(columns=[
                'state', 'action', 'old_prob', 'reward', 'next_state', 'done'
            ])
            for i in range(agents_num)
        }
        end = time.time()
        recorder.logger.info(f'create dataframe cost time: {end - start}')

        start = time.time()
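        # rollout: step the environment, appending one transition per agent per step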
        while True:
            state = state_
            prob, action = model.choose_action(s=state,
                                               sigma_offset=sigma_offset)
            obs = env.step(action)[brain_name]
            step += 1
            reward = obs.rewards
            state_ = obs.vector_observations
            dones_flag += obs.local_done
            dones_flag_sup += obs.local_done
            for i in range(agents_num):
                data[f'{i}'] = data[f'{i}'].append(
                    {
                        'state': state[i],
                        'action': action[i],
                        'old_prob': prob[i] + 1e-10,
                        'next_state': state_[i],
                        'reward': reward[i],
                        'done': obs.local_done[i]
                    },
                    ignore_index=True)
            if train_config['till_all_done']:
                sample_time = time.time() - start
                if all(dones_flag) and all(
                        dones_flag_sup
                ) or sample_time > train_config['max_sample_time']:
                    train_config['init_max_step'] = step
                    recorder.logger.info(
                        f'(interactive)collect data cost time: {sample_time}')
                    break
            elif step >= train_config['init_max_step']:
                sample_time = time.time() - start
                recorder.logger.info(
                    f'(interactive)collect data cost time: {sample_time}')
                if sample_time > train_config['max_sample_time']:
                    train_config['max_step'] = train_config['init_max_step']
                break
        start = time.time()

        # per-agent post-processing: count dones/hits, then compute returns,
        # TD errors and advantages with the sth helpers
        dones = 0
        hits = 0
        for i in range(agents_num):
            done_index = data[f'{i}'][data[f'{i}'].done == True].index.tolist()
            hit_index = data[f'{i}'][data[f'{i}'].reward > 0].index.tolist()
            dones += len(done_index)
            hits += len(hit_index)
            if len(done_index):
                recorder.logger.info(
                    f'[Agent {i}] dones: {len(done_index)} \thits: {len(hit_index)} \thit ratio: {len(hit_index)/len(done_index):.2%}'
                )
            else:
                recorder.logger.info(f'[Agent {i}] no done.')
                flag = True
            data[f'{i}']['value'] = model.get_state_value(
                s=data[f'{i}']['state'].values.tolist(),
                sigma_offset=sigma_offset)
            value_ = model.get_state_value(s=[state_[i]],
                                           sigma_offset=sigma_offset)
            if not data[f'{i}']['done'].values[-1]:
                discounted_reward = value_
            data[f'{i}']['total_reward'] = sth.discounted_sum(
                data[f'{i}']['reward'], 1, data[f'{i}']['reward'].values[-1],
                done_index, train_config['init_max_step'])
            if train_config['algorithm'].value <= 3:
                data[f'{i}']['discounted_reward'] = sth.discounted_sum(
                    data[f'{i}']['reward'], hyper_config['gamma'],
                    discounted_reward, done_index,
                    train_config['init_max_step'])
                data[f'{i}']['td_error'] = sth.discounted_sum_minus(
                    data[f'{i}']['reward'].values, hyper_config['gamma'],
                    value_, done_index, data[f'{i}']['value'].values,
                    train_config['init_max_step'])
                data[f'{i}']['advantage'] = sth.discounted_sum(
                    data[f'{i}']['td_error'],
                    hyper_config['lambda'] * hyper_config['gamma'], 0,
                    done_index, train_config['init_max_step'])
            else:
                data[f'{i}']['discounted_reward'] = sth.discounted_sum(
                    data[f'{i}']['reward'], hyper_config['gamma'],
                    discounted_reward, done_index,
                    train_config['init_max_step'], data[f'{i}']['value'])
                data[f'{i}']['advantage'] = None
            total_reward += (data[f'{i}']['total_reward'][0] / agents_num)
            total_discounted_reward += (data[f'{i}']['discounted_reward'][0] /
                                        agents_num)
        if dones:
            recorder.logger.info(
                f'#Agents Num#: {agents_num} \ttotal_dones: {dones} \ttotal_hits: {hits} \tratio: {hits/dones:.2%}'
            )
        else:
            recorder.logger.info(
                f'#Agents Num#: {agents_num} \tOMG! ALL AGENTS NO DONE.')
        end = time.time()
        recorder.logger.info(f'calculate cost time: {end - start}')
        '''
        excel record
        '''
        if train_config['excel_record'] and episode % train_config[
                'excel_record_frequency'] == 0:
            start = time.time()
            data['0'].to_excel(recorder.excel_writer,
                               sheet_name=f'{episode}',
                               index=True)
            recorder.excel_writer.save()
            end = time.time()
            recorder.logger.info(
                f'save data to excel cost time: {end - start}')
        '''
        mongodb record
        '''
        if train_config['mongo_record'] and episode % train_config[
                'mongo_record_frequency'] == 0:
            start = time.time()
            if train_config['mongo_record_all']:
                for i in range(agents_num):
                    recorder.mongodb[f'e{episode}a{i}'].insert(
                        json.loads(data[f'{i}'].T.to_json()).values())
            else:
                recorder.mongodb[f'e{episode}a'].insert(
                    json.loads(data['0'].T.to_json()).values())
            end = time.time()
            recorder.logger.info(
                f'save data to MongoDB cost time: {end - start}')

        start = time.time()
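        # update: for each agent, run several epochs over mini-batches of its trajectory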
        for j in range(agents_num):
            for _ in range(train_config['epoch']):
                for i in range(0, train_config['init_max_step'],
                               train_config['batchsize']):
                    if train_config['random_batch']:
                        i_data = data[f'{j}'].sample(
                            n=train_config['batchsize']
                        ) if train_config['batchsize'] < train_config[
                            'init_max_step'] else data[f'{j}']
                    else:
                        i_data = data[f'{j}'].iloc[
                            i:i + train_config['batchsize'], :]
                    model.learn(
                        s=i_data['state'].values.tolist(),
                        a=i_data['action'].values.tolist(),
                        r=i_data['reward'].values[:, np.newaxis],
                        s_=i_data['next_state'].values.tolist(),
                        dc_r=i_data['discounted_reward'].values[:, np.newaxis],
                        episode=episode,
                        sigma_offset=sigma_offset,
                        old_prob=i_data['old_prob'].values.tolist(),
                        advantage=i_data['advantage'].values[:, np.newaxis])
        learn_time = time.time() - start
        recorder.logger.info(f'learn cost time: {learn_time}')

        if train_config['dynamic_allocation']:
            # add one agent copy when there are enough hits, otherwise shrink
            # back towards the base number of copies
            if hits > (agents_num * 2
                       if train_config['start_continuous_done'] else agents_num):
                train_config['reset_config']['copy'] += 1
            elif train_config['reset_config']['copy'] > base_agents_num:
                train_config['reset_config']['copy'] -= 2

        start = time.time()
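        # gather per-agent diagnostics (actor/critic loss, entropy, sigma) for the summaries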
        a_loss = np.array([
            model.get_actor_loss(
                s=data[f'{i}']['state'].values.tolist(),
                sigma_offset=sigma_offset,
                a=data[f'{i}']['action'].values.tolist(),
                old_prob=data[f'{i}']['old_prob'].values.tolist(),
                advantage=data[f'{i}']['advantage'].values[:, np.newaxis])
            for i in range(agents_num)
        ]).mean()
        c_loss = np.array([
            model.get_critic_loss(
                s=data[f'{i}']['state'].values.tolist(),
                a=data[f'{i}']['action'].values.tolist(),
                r=data[f'{i}']['reward'].values[:, np.newaxis],
                s_=data[f'{i}']['next_state'].values.tolist(),
                dc_r=data[f'{i}']['discounted_reward'].values[:, np.newaxis],
                sigma_offset=sigma_offset) for i in range(agents_num)
        ]).mean()
        entropy = np.array([
            model.get_entropy(s=data[f'{i}']['state'].values.tolist(),
                              sigma_offset=sigma_offset)
            for i in range(agents_num)
        ]).mean(axis=0)
        sigma = np.array([
            model.get_sigma(s=data[f'{i}']['state'].values.tolist(),
                            sigma_offset=sigma_offset)
            for i in range(agents_num)
        ]).mean(axis=0)
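        # adapt sigma_offset from the critic loss; reused next episode when use_trick is enabled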
        sigma_offset = np.array(
            [np.log(c_loss + 1)] * model.a_counts) + hyper_config['base_sigma']
        end = time.time()
        recorder.logger.info(f'get statistics cost time: {end - start}')

        writer_summary(recorder.writer, episode, [{
            'tag': 'TIME/sample_time',
            'value': sample_time
        }, {
            'tag': 'TIME/steps',
            'value': step
        }, {
            'tag': 'TIME/agents_num',
            'value': agents_num
        }])
        writer_summary(recorder.writer, episode,
                       [{
                           'tag': 'REWARD/discounted_reward',
                           'value': total_discounted_reward
                       }, {
                           'tag': 'REWARD/reward',
                           'value': total_reward
                       }, {
                           'tag': 'REWARD/accuracy',
                           'value': hits / dones if dones else dones
                       }, {
                           'tag': 'LEARNING_RATE/lr',
                           'value': model_lr
                       }, {
                           'tag': 'LOSS/actor_loss',
                           'value': a_loss
                       }, {
                           'tag': 'LOSS/critic_loss',
                           'value': c_loss
                       }, {
                           'tag': 'LOSS/actor_entropy_max',
                           'value': entropy.max()
                       }, {
                           'tag': 'LOSS/actor_entropy_min',
                           'value': entropy.min()
                       }, {
                           'tag': 'LOSS/actor_entropy_mean',
                           'value': entropy.mean()
                       }, {
                           'tag': 'PARAMETERS/sigma',
                           'value': sigma.max()
                       }])
        if flag and train_config['init_max_step'] < train_config['max_step']:
            train_config['init_max_step'] += 10
        else:
            train_config['init_max_step'] -= 10
        recorder.logger.info(
            'episode: {0} steps: {1} dc_reward: {2} reward: {3}'.format(
                episode, step, total_discounted_reward, total_reward))
Example #6
def train_OnPolicy(sess, env, brain_name, begin_episode, model, hyper_config,
                   train_config):
    sigma_offset = np.zeros(model.a_counts) + hyper_config['base_sigma']
    for episode in range(begin_episode, train_config['max_episode']):
        print('-' * 30 + str(episode) + ' ๑乛◡乛๑ ' +
              train_config['algorithm'].name + '-' * 30)
        if EXIT:
            return
        step = 0
        total_reward = 0.
        total_discounted_reward = 0.
        discounted_reward = 0
        start = time.time()
        obs = env.reset(train_mode=True)[brain_name]
        agents_num = len(obs.agents)
        end = time.time()
        print(f'reset envs cost time: {end - start}')
        state_ = obs.vector_observations
        dones_flag = np.zeros(agents_num)
        start = time.time()
        data = {
            f'{i}': pd.DataFrame(columns=[
                'state', 'action', 'old_prob', 'reward', 'next_state', 'done'
            ])
            for i in range(agents_num)
        }
        end = time.time()
        print(f'create dataframe cost time: {end - start}')

        start = time.time()
        while True:
            state = state_
            prob, action = model.choose_action(s=state,
                                               sigma_offset=sigma_offset)
            obs = env.step(action)[brain_name]
            step += 1
            reward = obs.rewards
            state_ = obs.vector_observations
            dones_flag += obs.local_done
            for i in range(agents_num):
                data[f'{i}'] = data[f'{i}'].append(
                    {
                        'state': state[i],
                        'action': action[i],
                        'old_prob': prob[i] + 1e-10,
                        'next_state': state_[i],
                        'reward': reward[i],
                        'done': obs.local_done[i]
                    },
                    ignore_index=True)
            if train_config['till_all_done']:
                sample_time = time.time() - start
                if all(dones_flag
                       ) or sample_time > train_config['max_sample_time']:
                    train_config['init_max_step'] = step
                    print(
                        f'(interactive)collect data cost time: {sample_time}')
                    break
            elif step >= train_config['init_max_step']:
                sample_time = time.time() - start
                print(f'(interactive)collect data cost time: {sample_time}')
                if sample_time > train_config['max_sample_time']:
                    train_config['max_step'] = train_config['init_max_step']
                break
        start = time.time()

        dones = 0
        hits = 0
        for i in range(agents_num):
            done_index = data[f'{i}'][data[f'{i}'].done == True].index.tolist()
            hit_index = data[f'{i}'][data[f'{i}'].reward > 0].index.tolist()
            dones += len(done_index)
            hits += len(hit_index)
            if len(done_index):
                print(
                    f'[Agent {i}] dones: {len(done_index)} \thits: {len(hit_index)} \thit ratio: {len(hit_index)/len(done_index):.2%}'
                )
            else:
                print(f'[Agent {i}] no done.')
            data[f'{i}']['value'] = model.get_state_value(
                s=data[f'{i}']['state'].values.tolist(),
                sigma_offset=sigma_offset)
            value_ = model.get_state_value(s=[state_[i]],
                                           sigma_offset=sigma_offset)
            if not data[f'{i}']['done'].values[-1]:
                discounted_reward = value_
            data[f'{i}']['total_reward'] = sth.discounted_sum(
                data[f'{i}']['reward'], 1, data[f'{i}']['reward'].values[-1],
                done_index, train_config['init_max_step'])
            if train_config['algorithm'].value <= 3:
                data[f'{i}']['discounted_reward'] = sth.discounted_sum(
                    data[f'{i}']['reward'], hyper_config['gamma'],
                    discounted_reward, done_index,
                    train_config['init_max_step'])
                data[f'{i}']['td_error'] = sth.discounted_sum_minus(
                    data[f'{i}']['reward'].values, hyper_config['gamma'],
                    value_, done_index, data[f'{i}']['value'].values,
                    train_config['init_max_step'])
                data[f'{i}']['advantage'] = sth.discounted_sum(
                    data[f'{i}']['td_error'],
                    hyper_config['lambda'] * hyper_config['gamma'], 0,
                    done_index, train_config['init_max_step'])
            else:
                data[f'{i}']['discounted_reward'] = sth.discounted_sum(
                    data[f'{i}']['reward'], hyper_config['gamma'],
                    discounted_reward, done_index,
                    train_config['init_max_step'], data[f'{i}']['value'])
                data[f'{i}']['advantage'] = None
            total_reward += (data[f'{i}']['total_reward'][0] / agents_num)
            total_discounted_reward += (data[f'{i}']['discounted_reward'][0] /
                                        agents_num)
        if dones:
            print(
                f'#Agents Num#: {agents_num} \ttotal_dones: {dones} \ttotal_hits: {hits} \tratio: {hits/dones:.2%}'
            )
        else:
            print(f'#Agents Num#: {agents_num} \tOMG! ALL AGENTS NO DONE.')
        end = time.time()
        print(f'calculate cost time: {end - start}')

        start = time.time()
        for j in range(agents_num):
            for _ in range(train_config['epoch']):
                for i in range(0, train_config['init_max_step'],
                               train_config['batchsize']):
                    if train_config['random_batch']:
                        i_data = data[f'{j}'].sample(
                            n=train_config['batchsize']
                        ) if train_config['batchsize'] < train_config[
                            'init_max_step'] else data[f'{j}']
                    else:
                        i_data = data[f'{j}'].iloc[
                            i:i + train_config['batchsize'], :]
                    model.learn(
                        s=i_data['state'].values.tolist(),
                        a=i_data['action'].values.tolist(),
                        r=i_data['reward'].values[:, np.newaxis],
                        s_=i_data['next_state'].values.tolist(),
                        dc_r=i_data['discounted_reward'].values[:, np.newaxis],
                        episode=episode,
                        sigma_offset=sigma_offset,
                        old_prob=i_data['old_prob'].values.tolist(),
                        advantage=i_data['advantage'].values[:, np.newaxis])
        learn_time = time.time() - start
        print(f'learn cost time: {learn_time}')
        print('episode: {0} steps: {1} dc_reward: {2} reward: {3}'.format(
            episode, step, total_discounted_reward, total_reward))