Example #1
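    # Agent setup: stores the POI metadata and the user knowledge graph, and
    # builds the Environment model and the DQN policy.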
    def __init__(self, poi_info, user_KG, params):
        self.poi_info = poi_info
        self.user_KG = user_KG
        self.visit_counter = 0
        self.ll = params.ll
        self.lc = params.lc
        self.lp = params.lp
        self.poi_cat_dict = poi_info.poi_cat_dict
        self.poi_loc_dict = poi_info.poi_loc_dict
        self.poi_dist_mat = poi_info.poi_dist_mat
        self.cat_sim_mat = poi_info.cat_sim_mat

        self.memory_capacity = params.memory_capacity

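        # The DQN state presumably concatenates the user embedding with the KG
        # node features (hence the summed widths); one action per POI.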
        self.environment = Environment(user_KG.s_u.shape[1],
                                       self.poi_info.env_nt_1,
                                       self.poi_info.env_nt_2)

        self.dqn = DQN(self.environment,
                       user_KG.s_u.shape[1] + user_KG.s_KG.x.shape[1],
                       user_KG.s_KG.num_POI,
                       params.memory_capacity,
                       params.lr,
                       params.epsilon,
                       params.batch_size,
                       params.gamma,
                       params.target_replace_iter,
                       mode=params.priority_mode)

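        # Seed the agent with a random initial POI prediction and its reward.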
        self.predict_POI_index = np.random.randint(user_KG.s_KG.num_POI)

        self.r = reward(params.ll, params.lc, params.lp,
                        self.predict_POI_index, 0, poi_info.poi_cat_dict,
                        poi_info.poi_loc_dict, poi_info.poi_dist_mat,
                        poi_info.cat_sim_mat)
Example #2
    def fit(self, train_loader):
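        # One training pass over the check-in data in train_loader.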

        num_visits = len(train_loader) * train_loader.batch_size
        for batch_ndx, sample in enumerate(train_loader):
            for i in range(len(sample.user_list)):
                s_u_, s_KG_ = self.environment(
                    self.user_KG.s_u, self.user_KG.s_KG,
                    sample.temporal_list[i].view(self.poi_info.env_nt_1,
                                                 self.poi_info.env_nt_2),
                    sample.user_list[i], sample.poi_list[i])
                self.dqn.store_transition(
                    (self.user_KG.s_u[sample.user_list[i]], self.user_KG.s_KG),
                    self.predict_POI_index, self.r,
                    (s_u_[sample.user_list[i]], s_KG_))

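                # Train the DQN only once the replay memory has been filled.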
                if self.dqn.memory_counter > self.memory_capacity:
                    self.dqn.learn()

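                # Roll the state forward, choose the next POI with the policy
                # network, and score it against the recorded check-in.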
                if self.visit_counter < num_visits - 1:
                    self.user_KG.s_u = s_u_
                    self.user_KG.s_KG = s_KG_
                    self.visit_counter += 1
                    self.predict_POI_index = self.dqn.choose_action(
                        (self.user_KG.s_u[sample.user_list[i]],
                         self.user_KG.s_KG))
                    self.r = reward(
                        self.ll, self.lc, self.lp, self.predict_POI_index,
                        int(sample.poi_list[
                            self.visit_counter %
                            (train_loader.batch_size * batch_ndx + 1)]),
                        self.poi_cat_dict, self.poi_loc_dict,
                        self.poi_dist_mat, self.cat_sim_mat)

            logging.info("Training batch {} is done.".format(batch_ndx))
Example #3
    x[:, 0] = np.array([pi / 2, pi / 2, 0, 0])
    y = np.zeros([num_actions, max_it])
    mean_reward = 0
    success = False
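    # Roll out one episode: epsilon-greedy action selection, RK4 integration
    # of the arm dynamics, and a fitted-Q target for the chosen action.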
    for i in range(max_it - 1):
        state = x[:, i]
        qval = Q.predict(state.reshape([1, n]))
        if (random.random() < rand_eps):
            action = np.random.randint(0, num_actions)
        else:
            action = np.argmax(qval)
        u = actions[action]

        new_state = utils.rung_kutta4(myarm, state.reshape([n, 1]), u, dt)
        x[:, i + 1] = new_state.reshape([4])
        reward = utils.reward(new_state, u, target_state)
        mean_reward += reward / max_it

        newQ = Q.predict(new_state.reshape([1, n]))
        maxQ = np.max(newQ)
        y[:, i] = qval[:]

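        # Add a terminal bonus when the new state is close enough to the target.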
        if utils.close_enough(new_state, target_state):
            bonus = 1000
        else:
            bonus = 0
        y[action, i] = reward + gamma * maxQ + bonus
        if i != 0 and i % batch_size == 0:
            Q.update(np.transpose(x[:, i - batch_size:i]),
                     np.transpose(y[:, i - batch_size:i]))
Example #4
#agent2 = cfr_plus_agent2.CFRAPlusgent(env, isAbs=False, CFR_num=1, tra_num=2)
agent3 = RandomAgent(action_num=env.action_num)
l = []

from rlcard.utils.logger import Logger
root_path = './model_result/'
log_path = root_path + 'log.txt'
csv_path = root_path + 'performance.csv'
figure_path = root_path + 'figures/'
logger = Logger(xlabel='iteration',
                ylabel='exploitability',
                legend='DeepCFR+_model',
                log_path=log_path,
                csv_path=csv_path)

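# Reward helper used below to evaluate pairs of agents on eval_env.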
r = utils.reward()
'''
start = time.perf_counter()
e1 = np.mean(r.computer_reward(agent0, agent2, evaluate_num*20, Process_num, eval_env))
e2 = np.mean(r.computer_reward(agent1, agent2, evaluate_num*20, Process_num, eval_env))
end = time.perf_counter()
logger.log('eposide {}:{:.5f},{:.5f} test time:{}'.format(0, e1, e2, end-start))
'''

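# Training loop: run DeepCFR traversals for agent0 and evaluate it against the
# random agent (agent3) each iteration.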
for i in range(100):
    start = time.perf_counter()
    agent0.deepCFR(i, 8)
    #agent1.train(i,8)#20*8*1*1
    #agent2.train(i,8)
    e1 = np.mean(
        r.computer_reward(agent0, agent3, evaluate_num * 50, Process_num,
                          eval_env))
Example #5
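# Returns a (reward, click feedback, regret) triple for the chosen action.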
def play(action):
    ctr = s.realize(action)
    r, c = reward(ctr, s.gamma, s.disj)
    return r, (c if cascade else [int(click) for click in ctr]), s.regret(action)
Example #6
def play(action):
    ctr = s.realize(action)
    r, c = reward(ctr, s.gamma, s.disj)
    return r, (c if cascade else [int(click) for click in ctr]), s.regret(action)
Example #7
        action = utils.select_action(transition[pos[0], pos[1], :])

        pos = utils.get_neighbors(pos, level)[action]
        path.append(pos)
        actions.append(action)

        if all(pos == finish):
            completed = True
            break
        #
        #_ = input()
    #

    path = np.array(path)

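    # Score the finished path and update the transition table from the
    # visited positions and the reward.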
    rv = utils.reward(path, completed, maxit)
    transition = utils.update_transitions(transition, path, actions, rv)

    pathlens.append(len(path))

    if j % 1 == 0:
        utils.vis_path(ax, path)
        utils.vis_transition(ax, transition)

        for entry in ax.texts:
            entry.remove()

        bbox = dict(facecolor='k', edgecolor='black', alpha=0.3)
        ax.text(0, 0, 'iteration %i' % (j + 1), fontsize=14, va='center',
                ha='left', color='w', bbox=bbox)

        # save frames for animation.
Example #8
import numpy as np
import matlab.engine
from utils import reward

step_size = .2
epsilon = 1.0e-3
n_steps = 2000
frac_sec = 0.5

action_set = np.linspace(-.25, .25, 11)

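# Randomly perturb the burn fraction, simulate it in MATLAB, and report the
# reward computed from the simulation output.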
eng = matlab.engine.start_matlab()  # start one MATLAB session and reuse it

for i in range(10):

    a = np.random.choice(action_set, 1)[0]
    frac_sec += a

    if frac_sec > 1 or frac_sec < 0:
        continue

    pressure_seq, prms = eng.sim_main(float(frac_sec), n_steps, nargout=2)
    pressure_seq = np.array(pressure_seq)
    prms = np.array(prms)
    r = reward(prms)
    print("Iteration: {}, Burn Fraction: {}, reward: {}".format(
        i, frac_sec, r))

eng.quit()