class DDPGPrioritizedReplay(DDPG):
    def __init__(self,
                 s_dim, a_dim, a_bound,
                 a_lr=0.001,
                 c_lr=0.001,
                 tau=0.001,
                 gamma=0.9,
                 memory_capacity=5000,
                 batch_size=64,
                 train={'train': True, 'save_iter': None, 'load_point': -1},
                 model_dir='./model',
                 ):
        super(DDPGPrioritizedReplay, self).__init__(
            s_dim=s_dim, a_dim=a_dim, a_bound=a_bound,
            a_lr=a_lr, c_lr=c_lr,
            tau=tau, gamma=gamma,
            memory_capacity=memory_capacity, batch_size=batch_size,
            train=train, model_dir=model_dir,)
        self.memory = Memory(capacity=memory_capacity, batch_size=batch_size,
                             s_dim=s_dim, a_dim=a_dim)

    def learn(self, lock=None):
        # soft replacement of the target networks (tau-averaged copy of the online nets)
        self._soft_rep_target()

        self._check_save()

        for _ in range(self.update_times):
            self.learn_counter += 1
            if lock is not None:
                lock.acquire()
            tree_idx, bt, ISWeights = self.memory.sample()
            bs, ba, br, bs_ = bt['s'], bt['a'], bt['r'], bt['s_']
            Vbs, Vba, Vbr, Vbs_, VISW = \
                Variable(torch.from_numpy(bs).float()), Variable(torch.from_numpy(ba).float()), \
                Variable(torch.from_numpy(br).float()), Variable(torch.from_numpy(bs_).float()), \
                Variable(torch.from_numpy(ISWeights).float())

            # bootstrapped target from the target critic/actor; treated as a constant (no gradient)
            target_q = Vbr + self.gamma * self.cnet_(Vbs_, self.anet_(Vbs_)).detach()
            td_errors = self.cnet(Vbs, Vba) - target_q

            # update priorities in the replay tree with the new absolute TD errors
            abs_errors = torch.abs(td_errors).data.numpy()
            self.memory.batch_update(tree_idx, abs_errors)
            if lock is not None:
                lock.release()

            # importance-sampling-weighted critic loss
            c_loss = torch.mean(VISW * torch.pow(td_errors, 2))
            self.copt.zero_grad()
            c_loss.backward()
            self.copt.step()

            # actor loss: maximize the critic's estimate of Q(s, pi(s))
            policy_loss = -self.cnet(Vbs, self.anet(Vbs)).mean()
            self.aopt.zero_grad()
            policy_loss.backward()
            self.aopt.step()

    def store_transition(self, s, a, r, s_):
        if a.ndim < 2:
            a = a[:, None]
        if r.ndim < 2:
            r = r[:, None]
        s = np.ascontiguousarray(s)
        s_ = np.ascontiguousarray(s_)
        self.memory.store(s, a, r, s_)
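# ---------------------------------------------------------------------------
# The Memory used by DDPGPrioritizedReplay is not defined in this file.  For
# reference, here is a minimal sketch of a proportional prioritized-replay buffer
# with the same interface (store / sample / batch_update) and constructor
# signature.  Everything below -- the flat-array sum tree and the alpha/beta/epsilon
# constants -- is an assumption about how such a buffer is usually written,
# not the original code.
import numpy as np


class SumTree(object):
    """Flat-array sum tree: leaf i holds priority p_i, internal nodes hold sums."""

    def __init__(self, capacity):
        self.capacity = capacity
        self.tree = np.zeros(2 * capacity - 1)
        self.data_pointer = 0

    def add(self, priority):
        leaf = self.data_pointer + self.capacity - 1
        self.update(leaf, priority)
        self.data_pointer = (self.data_pointer + 1) % self.capacity

    def update(self, leaf, priority):
        change = priority - self.tree[leaf]
        self.tree[leaf] = priority
        while leaf != 0:                      # propagate the change up to the root
            leaf = (leaf - 1) // 2
            self.tree[leaf] += change

    def get_leaf(self, value):
        parent = 0
        while True:                           # descend towards the sampled leaf
            left, right = 2 * parent + 1, 2 * parent + 2
            if left >= len(self.tree):
                leaf = parent
                break
            if value <= self.tree[left]:
                parent = left
            else:
                value -= self.tree[left]
                parent = right
        return leaf, self.tree[leaf], leaf - (self.capacity - 1)

    def total(self):
        return self.tree[0]


class Memory(object):
    """P(i) = p_i^alpha / sum_k p_k^alpha,  w_i = (N * P(i))^-beta / max_j w_j."""

    epsilon = 0.01        # keeps priorities strictly positive
    alpha = 0.6           # how strongly priorities shape the sampling distribution
    beta = 0.4            # initial importance-sampling correction, annealed towards 1
    beta_increment = 0.001
    abs_err_upper = 1.0

    def __init__(self, capacity, batch_size, s_dim, a_dim):
        self.capacity, self.batch_size = capacity, batch_size
        self.tree = SumTree(capacity)
        self.data = {'s': np.zeros((capacity, s_dim)), 'a': np.zeros((capacity, a_dim)),
                     'r': np.zeros((capacity, 1)), 's_': np.zeros((capacity, s_dim))}
        self.size = 0

    def store(self, s, a, r, s_):
        for row in range(len(s)):             # new samples get the current max priority
            idx = self.tree.data_pointer
            for key, value in zip(('s', 'a', 'r', 's_'), (s, a, r, s_)):
                self.data[key][idx] = value[row]
            max_p = self.tree.tree[-self.capacity:].max()
            self.tree.add(max_p if max_p > 0 else self.abs_err_upper)
            self.size = min(self.size + 1, self.capacity)

    def sample(self):
        self.beta = min(1.0, self.beta + self.beta_increment)
        tree_idx = np.empty(self.batch_size, dtype=np.int32)
        priorities = np.empty(self.batch_size)
        data_idx = np.empty(self.batch_size, dtype=np.int32)
        segment = self.tree.total() / self.batch_size
        for i in range(self.batch_size):      # stratified sampling over the priority mass
            v = np.random.uniform(segment * i, segment * (i + 1))
            tree_idx[i], priorities[i], data_idx[i] = self.tree.get_leaf(v)
        probs = priorities / self.tree.total()
        ISWeights = np.power(self.size * probs, -self.beta)[:, None]
        ISWeights /= ISWeights.max()
        batch = {key: self.data[key][data_idx] for key in self.data}
        return tree_idx, batch, ISWeights

    def batch_update(self, tree_idx, abs_errors):
        priorities = np.power(np.minimum(abs_errors.flatten() + self.epsilon,
                                         self.abs_err_upper), self.alpha)
        for leaf, p in zip(tree_idx, priorities):
            self.tree.update(leaf, p)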
class DeepQNetwork: def __init__(self, n_action, n_width, n_height, n_channel, learning_rate=0.0001, reward_decay=0.9, e_greedy=0.9, replace_target_iter=200, memory_size=500, batch_size=32, e_greedy_increment=None, output_graph=True, double_q=True, dueling=True, prioritized=True, sess=None, load_memory=False): self.n_action = n_action self.n_width = n_width self.n_height = n_height self.n_channel = n_channel self.n_l1 = 64 self.lr = learning_rate self.gamma = reward_decay self.epsilon_max = e_greedy self.replace_target_iter = replace_target_iter self.memory_size = memory_size self.batch_size = batch_size self.epsilon_increment = e_greedy_increment self.epsilon = 0 if e_greedy_increment is not None else self.epsilon_max self.double_q = double_q self.dueling = dueling self.prioritized = prioritized self.output_graph = output_graph self.learn_step_counter = 0 if self.prioritized: self.memory = Memory(capacity=memory_size) else: self.memory = np.zeros((self.memory_size, n_width * 2 + 2)) self.graph = tf.Graph() self._build_net() with self.graph.as_default() as graph: self.init = tf.global_variables_initializer() self.saver = tf.train.Saver() t_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='target_net') e_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='eval_net') self.replace_target_op = [ tf.assign(t, e) for t, e in zip(t_params, e_params) ] if sess is None: self.sess = tf.Session(graph=self.graph) self.sess.run(self.init) # self.sess = tf.Session() # self.sess.run(tf.global_variables_initializer()) else: self.sess = sess if self.output_graph: self.summary_writer = tf.summary.FileWriter("log/", self.graph) # self.summary_writer = tf.summary.FileWriter("log/", self.sess.graph) self.cost_his = [] def _build_net(self): def build_layers(s, c_names, n_l1, w_initializer, b_initializer): # s = tf.reshape(s, [-1, 1, self.n_width, self.n_channel]) n_filter = 32 with tf.variable_scope('conv1') as scope: k1 = tf.get_variable('kernel1', shape=[1, 1, self.n_channel, n_filter], collections=c_names) conv1 = tf.nn.conv2d(s, k1, strides=[1, 1, 1, 1], padding='SAME') with tf.variable_scope('conv2') as scope: k2_1 = tf.get_variable('kernel2_1', shape=[1, 1, self.n_channel, n_filter], collections=c_names) conv2 = tf.nn.conv2d(s, k2_1, strides=[1, 1, 1, 1], padding='SAME') k2_2 = tf.get_variable('kernel2_2', shape=[3, 3, n_filter, n_filter], collections=c_names) conv2 = tf.nn.conv2d(conv2, k2_2, strides=[1, 1, 1, 1], padding='SAME') k2_3 = tf.get_variable('kernel2_3', shape=[3, 3, n_filter, n_filter], collections=c_names) conv2 = tf.nn.conv2d(conv2, k2_3, strides=[1, 1, 1, 1], padding='SAME') with tf.variable_scope('conv3') as scope: k3_1 = tf.get_variable('kernel3_1', shape=[1, 1, self.n_channel, n_filter], collections=c_names) conv3 = tf.nn.conv2d(s, k3_1, strides=[1, 1, 1, 1], padding='SAME') k3_2 = tf.get_variable('kernel3_2', shape=[5, 5, n_filter, n_filter], collections=c_names) conv3 = tf.nn.conv2d(conv3, k3_2, strides=[1, 1, 1, 1], padding='SAME') with tf.variable_scope('conv4') as scrope: conv4 = tf.layers.average_pooling2d(s, [1, 3], [1, 1], padding='SAME') k4 = tf.get_variable('kernel4', shape=[1, 1, self.n_channel, n_filter], collections=c_names) conv4 = tf.nn.conv2d(conv4, k4, strides=[1, 1, 1, 1], padding='SAME') with tf.variable_scope('concat') as scope: inception1 = tf.concat([conv1, conv2, conv3, conv4], axis=3) bias = tf.get_variable(name='biases', initializer=tf.constant_initializer(), shape=[4 * n_filter], collections=c_names) inception1 = 
tf.nn.relu(tf.nn.bias_add(inception1, bias)) fc = tf.layers.average_pooling2d(inception1, [1, 8], [1, 8], padding='SAME') # fc = tf.contrib.layers.flatten(fc) fc = tf.reshape(fc, (tf.shape(fc)[0], self.n_width, n_filter * 4)) with tf.variable_scope('rnn') as scope: cell = tf.contrib.rnn.BasicLSTMCell(num_units=n_filter, state_is_tuple=True) state_in = cell.zero_state(tf.shape(fc)[0], tf.float32) # cell = tf.nn.rnn_cell.DropoutWrapper(cell, output_keep_prob=keep_prob) rnn, state = tf.nn.dynamic_rnn(inputs=fc, cell=cell, dtype=tf.float32, initial_state=state_in) fc = state[1] # fc = tf.contrib.layers.flatten(rnn) with tf.variable_scope('l1'): w1 = tf.get_variable('w1', [n_filter, n_l1], initializer=w_initializer, collections=c_names) b1 = tf.get_variable('b1', [1, n_l1], initializer=b_initializer, collections=c_names) fc = tf.nn.relu(tf.matmul(fc, w1) + b1) if self.dueling: with tf.variable_scope('Value'): w_out = tf.get_variable('w_out', [n_l1, 1], initializer=w_initializer, collections=c_names) b_out = tf.get_variable('b_out', [1, 1], initializer=b_initializer, collections=c_names) self.V = tf.matmul(fc, w_out) + b_out with tf.variable_scope('Advantage'): w_out = tf.get_variable('w_out', [n_l1, self.n_action], initializer=w_initializer, collections=c_names) b_out = tf.get_variable('b_out', [1, self.n_action], initializer=b_initializer, collections=c_names) self.A = tf.matmul(fc, w_out) + b_out with tf.variable_scope('Q'): out = self.V + (self.A - tf.reduce_mean( self.A, axis=1, keep_dims=True)) else: with tf.variable_scope('l2'): w2 = tf.get_variable('w2', [n_l1, self.n_action], initializer=w_initializer, collections=c_names) b2 = tf.get_variable('b2', [1, self.n_action], initializer=b_initializer, collections=c_names) out = tf.matmul(l1, w2) + b2 return out # ------------------ build evaluate_net ------------------ with self.graph.as_default() as graph: if self.prioritized: self.ISWeights = tf.placeholder(tf.float32, [None, 1], name='IS_weights') self.s = tf.placeholder( tf.float32, [None, self.n_width, self.n_height, self.n_channel], name='s') # input self.q_target = tf.placeholder( tf.float32, [None, self.n_action], name='Q_target') # for calculating loss with tf.variable_scope('eval_net'): c_names, n_l1, w_initializer, b_initializer = \ ['eval_net_params', tf.GraphKeys.GLOBAL_VARIABLES], self.n_l1, \ tf.contrib.layers.xavier_initializer(), tf.random_normal_initializer() self.q_eval = build_layers(self.s, c_names, n_l1, w_initializer, b_initializer) with tf.variable_scope('loss'): if self.prioritized: self.abs_errors = tf.reduce_sum( tf.abs(self.q_target - self.q_eval), axis=1) # for updating Sumtree self.loss = tf.reduce_mean( self.ISWeights * tf.squared_difference(self.q_target, self.q_eval)) else: self.loss = tf.reduce_mean( tf.squared_difference(self.q_target, self.q_eval)) with tf.variable_scope('train'): self._train_op = tf.train.RMSPropOptimizer(self.lr).minimize( self.loss) # ------------------ build target_net ------------------ self.s_ = tf.placeholder( tf.float32, [None, self.n_width, self.n_height, self.n_channel], name='s_') # input with tf.variable_scope('target_net'): c_names = ['target_net_params', tf.GraphKeys.GLOBAL_VARIABLES] self.q_next = build_layers(self.s_, c_names, n_l1, w_initializer, b_initializer) with tf.variable_scope('summary') as scope: scalar_summary_tags = ['loss_avg', 'e_balance', \ 'q_max', 'q_total', 'epsilon', \ 'sharpe_ratio', 'n_trades', \ 'win', 'win_buy', 'win_sell', \ 'max_profit', 'avg_profit', 'max_loss', 'avg_loss', \ 'total_profit', 
'total_loss', \ 'max_holding_period', 'avg_holding_period', \ 'avg_profit_holding_period', 'avg_loss_holding_period', \ 'max_floating_profit', 'max_floating_loss', \ 'max_total_balance', 'profit_make_good', \ 'up_buy', 'down_sell', \ 'n_buy', 'n_sell', 'reward', 'diff_sharpe'] self.summary_placeholders = {} self.summary_ops = {} for tag in scalar_summary_tags: self.summary_placeholders[tag] = tf.placeholder( tf.float32, None, name=tag.replace(' ', '_') + '_0') self.summary_ops[tag] = tf.summary.scalar( tag, self.summary_placeholders[tag]) # with tf.variable_scope('training_step'): # training_step_mse = tf.summary.scalar('mse', self.loss) histogram_summary_tags = ['r_actions'] for tag in histogram_summary_tags: self.summary_placeholders[tag] = tf.placeholder( 'float32', None, name=tag.replace(' ', '_') + '_0') self.summary_ops[tag] = tf.summary.histogram( tag, self.summary_placeholders[tag]) def store_transition(self, s, a, r, s_): # transition = np.hstack((s, [a, r], s_)) transition = {'s': s, 'a': a, 'r': r, 's_': s_} if self.prioritized: # prioritized replay self.memory.store( transition) # have high priority for newly arrived transition else: if not hasattr(self, 'memory_counter'): self.memory_counter = 0 index = self.memory_counter % self.memory_size self.memory[index, :] = transition self.memory_counter += 1 def choose_action(self, observation, random=False): if np.random.uniform( ) > self.epsilon or random is True: # choosing action action = np.random.randint(0, self.n_action) else: observation = observation[np.newaxis, :] actions_value = self.sess.run(self.q_eval, feed_dict={self.s: observation}) action = np.argmax(actions_value) return action def learn(self): if self.learn_step_counter % self.replace_target_iter == 0: self.sess.run(self.replace_target_op) print('\ntarget_params_replaced\n') if self.prioritized: tree_idx, batch_memory, ISWeights = self.memory.sample( self.batch_size) else: if self.memory_counter > self.memory_size: sample_index = np.random.choice(self.memory_size, size=self.batch_size) else: sample_index = np.random.choice(self.memory_counter, size=self.batch_size) batch_memory = self.memory[sample_index, :] s = np.array([batch_memory[i]['s'] for i in range(self.batch_size)]) s_ = np.array([batch_memory[i]['s_'] for i in range(self.batch_size)]) q_next, q_eval4next = self.sess.run([self.q_next, self.q_eval], feed_dict={ self.s_: s_, self.s: s_ }) # feed_dict={self.s_: batch_memory[:, -self.n_width:], # next observation # self.s: batch_memory[:, -self.n_width:]}) # next observation # q_eval = self.sess.run(self.q_eval, {self.s: batch_memory[:, :self.n_width]}) q_eval = self.sess.run(self.q_eval, feed_dict={self.s: s}) q_target = q_eval.copy() batch_index = np.arange(self.batch_size, dtype=np.int32) # eval_act_index = batch_memory[:, self.n_width].astype(int) eval_act_index = np.array( [batch_memory[i]['a'] for i in range(self.batch_size)], dtype=np.int32) # reward = batch_memory[:, self.n_width + 1] reward = np.array( [batch_memory[i]['r'] for i in range(self.batch_size)]) if self.double_q: max_act4next = np.argmax( q_eval4next, axis=1 ) # the action that brings the highest value is evaluated by q_eval selected_q_next = q_next[ batch_index, max_act4next] # Double DQN, select q_next depending on above actions else: selected_q_next = np.max(q_next, axis=1) # the natural DQN q_target[batch_index, eval_act_index] = reward + self.gamma * selected_q_next if self.prioritized: _, abs_errors, self.cost = self.sess.run( [self._train_op, self.abs_errors, self.loss], # 
feed_dict={self.s: batch_memory[:, :self.n_width], feed_dict={ self.s: s, self.q_target: q_target, self.ISWeights: ISWeights }) self.memory.batch_update(tree_idx, abs_errors) else: _, self.cost = self.sess.run([self._train_op, self.loss], feed_dict={ self.s: batch_memory[:, :self.n_width], self.q_target: q_target }) self.cost_his.append(self.cost) self.epsilon = self.epsilon + self.epsilon_increment if self.epsilon < self.epsilon_max else self.epsilon_max self.learn_step_counter += 1 def inject_summary(self, tag_dict, episode): summary_str_lists = self.sess.run( [self.summary_ops[tag] for tag in tag_dict.keys()], { self.summary_placeholders[tag]: value for tag, value in tag_dict.items() }) for summary_str in summary_str_lists: self.summary_writer.add_summary(summary_str, episode) # self.summary_writer.add_summary(self.param_summary, episode) def finish_episode(self, episode, stat): if episode > 0: injectDict = { # scalar 'loss_avg': self.totalLoss, # 'r_balance': realBalance, 'epsilon': self.epsilon, 'q_max': self.totalMaxQ, 'q_total': self.totalQ, 'r_actions': self.r_actions } if self.output_graph: self.inject_summary(injectDict, episode) # self.saveParam(mode = 0) # if episode % self.ckptSavePeriod == 0: # self.saveParam(dir = '%d' % (episode), mode = 1) self.r_actions = deque() self.totalLoss = 0.0 self.totalQ = 0.0 self.totalMaxQ = 0.0 def load(self, step=0): print(sys.path) # checkpoint_dir = '/Users/cc/Project/Lean/Launcher/bin/Debug/python/oracle/data/' checkpoint_dir = './data' try: ckpt = tf.train.get_checkpoint_state(checkpoint_dir) self.learn_step_counter = int( os.path.basename(ckpt.model_checkpoint_path).split('-')[1]) except: ckpt = None if not (ckpt and ckpt.model_checkpoint_path): print('Cannot find any saved sess in checkpoint_dir') #sys.exit(2) else: try: # self.saver = tf.train.Saver() self.saver.restore(self.sess, ckpt.model_checkpoint_path) self.summary_writer.add_session_log( tf.SessionLog(status=tf.SessionLog.START), global_step=step) print('Sess restored successfully: {}'.format( ckpt.model_checkpoint_path)) except Exception as e: print('Failed to load sess: {}'.format(str(e))) # sys.exit(2) self.learn_step_counter = 1 def save(self, path=None): if (path is not None): save_path = path else: save_path = './data/sess.ckpt' self.saver.save(self.sess, save_path, global_step=self.learn_step_counter) print('Saving sess to {}: {}'.format(save_path, self.learn_step_counter))
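# ---------------------------------------------------------------------------
# Hedged usage sketch for the DeepQNetwork agent above.  The environment object,
# its reset()/step() signature and the warm-up length are hypothetical stand-ins;
# only the calls on the agent itself (choose_action, store_transition, learn)
# come from the class as written.
def run_episode(agent, env, warmup_steps=500):
    s = env.reset()                 # expected shape: (n_width, n_height, n_channel)
    done, total_steps = False, 0
    while not done:
        # act randomly during warm-up so the prioritized memory has something to sample
        a = agent.choose_action(s, random=total_steps < warmup_steps)
        s_, r, done, _ = env.step(a)
        agent.store_transition(s, a, r, s_)
        if total_steps > warmup_steps:
            agent.learn()           # one gradient step on a prioritized minibatch
        s, total_steps = s_, total_steps + 1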
class Agent(object): def __init__(self, n_s, n_a, hiddens=(128, 64), epsilon=1.0, epsilon_min=0.005, epsilon_decay=0.05, gamma=0.99, batch_size=64, memory_capacity=100000, lr=0.001, is_dueling=False, is_prioritize=True, replace_iter=100, is_soft=False, tau=0.01, e=0.01, a=0.6, b=0.4): self.n_s = n_s self.n_a = n_a self.epsilon = epsilon self.epsilon_min = epsilon_min self.epsilon_decay = epsilon_decay self.replace_iter = replace_iter self.lr = lr self.gamma = gamma self.batch_size = batch_size self.memory_capacity = memory_capacity self.is_soft = is_soft self.is_prioritize = is_prioritize self.tau = tau if use_gpu: self.eval_net = Net(n_s, n_a, hiddens=hiddens, is_dueling=is_dueling).cuda() self.target_net = Net(n_s, n_a, hiddens=hiddens, is_dueling=is_dueling).cuda() else: self.eval_net = Net(n_s, n_a, hiddens=hiddens, is_dueling=is_dueling) self.target_net = Net(n_s, n_a, hiddens=hiddens, is_dueling=is_dueling) if is_prioritize: self.memory = Memory(memory_capacity, e, a, b) else: self.memory = np.zeros((memory_capacity, self.n_s * 2 + 2)) self.memory_count = 0 self.learn_count = 0 self.loss_func = nn.MSELoss() self.optimizer = optim.Adam(self.eval_net.parameters(), lr=self.lr) def act(self, s): if np.random.random() <= self.epsilon: # random return np.random.randint(self.n_a) else: # max s = FloatTensor(s) action_value = self.eval_net(s) a = torch.max(action_value, 1)[1].data.cpu().numpy()[0] return a def step(self, s, a, r, s_, done): if self.is_prioritize: # experience = s, a, r, s_, done experience = np.hstack((s, [a, r], s_)) self.memory.store(experience) self.memory_count += 1 if np.count_nonzero(self.memory.tree.tree) > self.batch_size: tree_idx, batch, ISWeights_mb = self.memory.sample( self.batch_size) self.learn(batch, tree_idx, ISWeights_mb) else: transition = np.hstack((s, [a, r], s_)) # replace the old memory with new memory index = self.memory_count % self.memory_capacity self.memory[index, :] = transition self.memory_count += 1 if self.memory_count < self.memory_capacity: return # sample batch transitions sample_index = np.random.choice(self.memory_capacity, self.batch_size) batch = self.memory[sample_index, :] self.learn(batch) def learn(self, batch, tree_idx=None, ISWeights_mb=None): b_s = torch.squeeze(FloatTensor(batch[:, :self.n_s]), 0) b_a = torch.squeeze(LongTensor(batch[:, self.n_s:self.n_s + 1]), 0) b_r = torch.squeeze(FloatTensor(batch[:, self.n_s + 1:self.n_s + 2]), 0) b_s_ = torch.squeeze(FloatTensor(batch[:, -self.n_s:]), 0) temp = self.eval_net(b_s) eval_q = torch.gather(temp, 1, b_a) next_max_from_eval = self.eval_net(b_s_) next_max_from_eval_index = next_max_from_eval.max(1)[1].unsqueeze(1) next_actions = self.target_net(b_s_).detach() next_max = next_actions.gather(1, next_max_from_eval_index) target_q = b_r + self.gamma * next_max # * (1 - b_done) abs_errors = numpy(torch.sum(torch.abs(target_q - eval_q), dim=1)) loss = self.loss_func(eval_q, target_q) self.optimizer.zero_grad() loss.backward() self.optimizer.step() if self.is_prioritize: self.memory.batch_update(tree_idx=tree_idx, abs_errors=abs_errors) self.update() self.learn_count += 1 def update(self): next_epsilon = self.epsilon * self.epsilon_decay if next_epsilon <= self.epsilon_min: self.epsilon = self.epsilon_min else: self.epsilon = next_epsilon if self.is_soft: for target_param, local_param in zip(self.target_net.parameters(), self.eval_net.parameters()): target_param.data.copy_(self.tau * local_param.data + (1.0 - self.tau) * target_param.data) else: if self.learn_count % self.replace_iter == 
0: self.target_net.load_state_dict(self.eval_net.state_dict())

    # save the whole net
    def save(self, name):
        torch.save(self.eval_net, name)

    # load the whole net
    def load(self, name):
        return torch.load(name)
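# ---------------------------------------------------------------------------
# Hedged usage sketch for the Agent above on a Gym control task.  The environment
# name and the episode bookkeeping are illustrative assumptions; the agent calls
# (act with a batch dimension, step with the full transition) follow the class API.
import gym
import numpy as np


def train_cartpole(episodes=300):
    env = gym.make('CartPole-v1')
    agent = Agent(n_s=env.observation_space.shape[0], n_a=env.action_space.n)
    for ep in range(episodes):
        s, ep_return, done = env.reset(), 0.0, False
        while not done:
            a = agent.act(s[np.newaxis, :])      # act() expects a (1, n_s) batch
            s_, r, done, _ = env.step(a)
            agent.step(s, a, r, s_, done)        # store transition, maybe learn + target update
            s, ep_return = s_, ep_return + r
        print('episode %d  return %.1f  epsilon %.3f' % (ep, ep_return, agent.epsilon))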
# Double-DQN target computation over a prioritized minibatch:
# the online network picks the greedy next action, the target network evaluates it.
Qs_next_state = sess.run(dqn.output, feed_dict={dqn.inputs_: next_state_mb})
target_Qs_next_state = sess.run(targetnet.output,
                                feed_dict={targetnet.inputs_: next_state_mb})

target_Qs_batch = []
for i in range(0, batch_size):
    terminal = done_mb[i]
    action = np.argmax(Qs_next_state[i])
    if terminal:
        target = reward_mb[i]
    else:
        target = reward_mb[i] + gamma * target_Qs_next_state[i][action]
    target_Qs_batch.append(target)

target_mb = np.array([each for each in target_Qs_batch])

# Train on the importance-sampling-weighted loss and refresh the priorities
# with the new absolute TD errors.
_, loss, abs_errors = sess.run([dqn.optim, dqn.loss, dqn.absolute_errors],
                               feed_dict={dqn.inputs_: state_mb,
                                          dqn.actions_: action_mb,
                                          dqn.ISWeights: ISWeights_mb,
                                          dqn.target_Q: target_mb})
memory.batch_update(tree_idx, abs_errors)

if tau > max_tau:
    op_holder = update_target_graph()
    sess.run(op_holder)
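# ---------------------------------------------------------------------------
# The snippet above calls update_target_graph() without showing it.  A minimal
# sketch of the usual TF1 implementation is below; the variable-scope names
# "DQNetwork" and "TargetNetwork" are assumptions about how dqn and targetnet
# were built, not something taken from this file.
import tensorflow as tf  # TF1-style graph collections, as used throughout this file


def update_target_graph():
    # collect the trainable variables of the online and the target network ...
    from_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "DQNetwork")
    to_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "TargetNetwork")
    # ... and build one assign op per pair, so sess.run(op_holder) hard-copies
    # the online weights into the target network
    op_holder = [to_var.assign(from_var)
                 for from_var, to_var in zip(from_vars, to_vars)]
    return op_holder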
class Agent:
    def __init__(self, demo_transitions=None):
        replay_buffer_size = config.REPLAY_BUFFER_SIZE
        demo_buffer_size = config.DEMO_BUFFER_SIZE
        # replay_memory stores both demo data and generated data
        self.replay_memory = Memory(capacity=replay_buffer_size,
                                    permanent_size=len(demo_transitions))
        # demo_memory only stores demo data
        self.demo_memory = Memory(capacity=demo_buffer_size,
                                  permanent_size=demo_buffer_size)
        self.epsilon = config.INITIAL_EPSILON
        self.steps_done = 0
        # both networks are needed: policy_net is trained, target_net supplies the
        # bootstrapped targets in train() and is refreshed by update_target_net()
        self.target_net = DQN().to(device, dtype=torch.double)
        self.policy_net = DQN().to(device, dtype=torch.double)
        self.optimizer = optim.Adam(self.policy_net.parameters(),
                                    lr=config.LEARNING_RATE, weight_decay=1)

    def replay_memory_push(self, transitions):
        """
        Add transitions to replay_memory
        :param transitions: List of transitions
        :return:
        """
        for t in transitions:
            self.replay_memory.push(np.array(t, dtype=object))

    def demo_memory_push(self, transitions):
        """
        Add transitions to demo_memory
        :param transitions: List of transitions
        :return:
        """
        for t in transitions:
            self.demo_memory.push(np.array(t, dtype=object))

    def e_greedy_select_action(self, state):
        """
        Epsilon-greedy action selection with an exponentially decaying epsilon.
        :param state: current observation (numpy array or tensor), or None
        :return: action index
        """
        self.epsilon = config.FINAL_EPSILON + (config.INITIAL_EPSILON - config.FINAL_EPSILON) * \
            np.exp(-1. * self.steps_done / config.EPSILON_DECAY)
        self.steps_done += 1
        if random.random() <= self.epsilon or state is None:
            return random.randint(0, config.ACTION_DIM - 1)
        else:
            if isinstance(state, np.ndarray):
                state = torch.from_numpy(state).to(device, dtype=torch.double)
            return self.policy_net(state.to(device, dtype=torch.double)).max(1)[1].view(1, 1).item()

    # TODO:
    def pre_train(self):
        """
        Pre-train on demonstration data only.
        :return:
        """
        k = config.PRE_TRAIN_STEP_NUM
        print("Pre training for %d steps." % k)
        # for i in tqdm(range(k)):
        for i in range(k):
            self.train(pre_train=True)
            print('Pretrain steps: %d' % i)
            if i % config.TARGET_UPDATE == 0:
                self.update_target_net()
                print('Target network updated!')
        print("Pre training done for %d steps."
% k) def train(self, pre_train=False): """ train Q network :param pre_train: if used for pre train or not :return: """ # choose which memory to use mem = self.demo_memory if pre_train else self.replay_memory # sample batch_id, batch_data, batch_weight = mem.sample(config.BATCH_SIZE) # extract data from each column batch = Transition(*zip(*batch_data.tolist())) #array to list to transform # Compute a mask of non-final states and concatenate the batch elements non_final_mask = torch.tensor(tuple(map(lambda s: s is not None, batch.next_state)), device=device, dtype=torch.uint8)# TODO: change to target state when appropreiate non_final_next_states = torch.cat([torch.Tensor(s.double()) for s in batch.next_state if s is not None]).double() state_batch = torch.cat(batch.state).double() action_batch = torch.cat(batch.action) reward_batch = torch.cat(batch.reward).double() n_reward_batch = torch.cat(batch.n_reward).double() # # Compute Q(s_t, a) - the model computes Q(s_t), the action to take for the next state state_action_values = self.policy_net(state_batch).gather(1, action_batch) # calculate Q(s_t, a, \theta) under the current actions next_state_values = torch.zeros(config.BATCH_SIZE, device=device) # Compute V(s_{t+1}) for all # next_state_values[non_final_mask] = self.policy_net(non_final_next_states).data.max(1)[0] #next maximum state values #DQN action_batch_next_state = self.policy_net(non_final_next_states).max(1)[1].unsqueeze(1) #DDQN next_state_values[non_final_mask] = self.target_net(non_final_next_states).gather(1, action_batch_next_state).squeeze().detach() #DDQN expected_state_action_values = (next_state_values * config.Q_GAMMA) + reward_batch.squeeze(1) # calculating the q loss and n-step return loss q_loss = F.mse_loss(state_action_values, expected_state_action_values.unsqueeze(1), size_average=False) n_step_loss = F.mse_loss(state_action_values, n_reward_batch.unsqueeze(1), size_average=False) n_step_loss = 0 # calculating the supervised loss if pre_train: action_dim = config.ACTION_DIM margins = (torch.ones(action_dim, action_dim) - torch.eye(action_dim)) * config.SU_LOSS_MARGIN batch_margins = margins[action_batch.data.squeeze().cpu()] state_action_values_with_margin = self.policy_net(state_batch) + batch_margins supervised_loss = (state_action_values_with_margin.max(1)[0].unsqueeze(1) - state_action_values).pow(2).sum() else: supervised_loss = 0.0 loss = q_loss + config.SU_LOSS_LAMBDA * supervised_loss + config.N_STEP_LOSS_LAMBDA * n_step_loss # optimization step and logging self.optimizer.zero_grad() loss.backward() # torch.nn.utils.clip_grad_norm_(self.policy_net.parameters(), 100) # self.optimizer.step() for param in self.policy_net.parameters(): param.grad.data.clamp_(-1, 1) self.optimizer.step() with torch.no_grad(): abs_errors = torch.sum(torch.abs(state_action_values - expected_state_action_values.unsqueeze(1)), dim=1) abs_errors = abs_errors.detach().numpy() self.replay_memory.batch_update(batch_id, abs_errors) # update priorities for data in memory def update_target_net(self): """ :return: """ # Update the target network self.target_net.load_state_dict(self.policy_net.state_dict())
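# ---------------------------------------------------------------------------
# train() above unpacks minibatches with Transition(*zip(*batch_data.tolist())),
# which presumes a namedtuple-style record.  A minimal sketch consistent with the
# fields actually accessed (state, action, reward, next_state, n_reward) is given
# below; the exact field order in the original data files is an assumption.
from collections import namedtuple

Transition = namedtuple('Transition',
                        ('state', 'action', 'reward', 'next_state', 'n_reward'))

# Example of the unpacking pattern used in train():
# batch = Transition(*zip(*list_of_transitions))
# state_batch = torch.cat(batch.state)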
class Agent: sigma = 0.2 alpha = 1.01 epsilon = 0.5 min_epsilon = 0.01 name = ["iqn_e", "iqn.h5"] custom_objects = custom_objects def __init__(self, action_size=3, lr=1e-3, n=3, spread=5, step_size=1000, money=10000, leverage=500, restore=False): self.n = n self.spread = spread self.action_size = action_size self.step_size = step_size self.lr = lr self.money = money self.leverage = leverage self.restore = restore self.build_model = model self.memory = Memory(50000) self.state() self.build() self.w = self.model.get_weights() self.reset = 0 self.e = [] def build(self): if self.restore: self.i = np.load(f"{self.name[0]}.npy") self.model = tf.keras.models.load_model(self.name[1], custom_objects=self.custom_objects) else: self.i = 0 self.model = self.build_model(self.x.shape[-2:], self.action_size) opt = tfa.optimizers.Lookahead(tf.keras.optimizers.Nadam(self.lr)) # opt = self.model.compile(opt) self.target_model = self.build_model(self.x.shape[-2:], self.action_size) self.target_model.set_weights(self.model.get_weights()) get = self.model.get_layer self.q = tf.keras.backend.function([get("i").input,get("t").input], get("q").output) get = self.target_model.get_layer self.target_q = tf.keras.backend.function([get("i").input,get("t").input], get("q").output) def state(self): t = 1 x = np.load(f"x{t}.npy") shape = x.shape self.x = x.reshape((shape[0], -1, shape[-2], shape[-1])) y = np.load(f"target{t}.npy") shape = y.shape y = y.reshape((shape[0], y.shape[2], -1)) self.y, self.v, self.atr, self.high, self.low = \ y[:, 0], y[:, 1], y[:, 2], y[:, 3], y[:, 4] self.train_step = np.arange(0, int(self.x.shape[1] - self.x.shape[1] * 0.2 - self.step_size), self.step_size) # self.train_step = np.arange(0, int(self.x.shape[1] - self.x.shape[1] * 0.2 - self.step_size)) self.test_step = self.train_step[-1] + self.step_size, self.x.shape[1] - self.step_size self.test_step2 = np.arange(self.test_step[0], self.test_step[1], self.step_size) def train(self, b = 128): tree_idx, replay, isw = self.memory.sample(b) self.states = states = np.array([a[0][0] for a in replay], np.float32) new_states = np.array([a[0][3] for a in replay], np.float32) actions = np.array([a[0][1] for a in replay]).reshape((-1, 1)) rewards = np.array([a[0][2] for a in replay], np.float32).reshape((-1, 1)) gamma = np.array([a[0][4] for a in replay]).reshape((-1, 1)) self.tau = tau = np.random.uniform(0, 1, (len(tree_idx), 32)) target_tau = np.random.uniform(0, 1, (len(tree_idx), 32)) target_q = self.target_q([new_states, target_tau]) target_a = np.argmax(np.sum(self.q([new_states, tau]), -1), -1) with tf.GradientTape() as tape: q = self.model([states, tau]) q_backup = q.numpy() for i in range(len(tree_idx)): q_backup[i, actions[i]] = rewards[i] + gamma[i] * target_q[i, target_a[i]] error = q_backup - q tau = tau.reshape((-1, 1, 32)) huber = tf.where(abs(error) <= 2, error ** 2 * .5, .5 * 2 ** 2 + 2 * tf.abs(error) - 2) loss = tf.maximum(tau * huber, (tau - 1) * huber) error = tf.reduce_sum(tf.reduce_sum(loss, 1), -1) loss = tf.reduce_mean(error) # loss = tf.reduce_mean(error * isw) self.e.append(loss) gradients = tape.gradient(loss, self.model.trainable_variables) # gradients = [tf.clip_by_value(g, -1, 1) for g in gradients] self.model.optimizer.apply_gradients(zip(gradients, self.model.trainable_variables)) ae = error.numpy().reshape((-1,)) self.ae = ae self.memory.batch_update(tree_idx, ae) self.target_model.set_weights(0.005 * np.array(self.model.get_weights()) + 0.995 * np.array(self.target_model.get_weights())) def step(self, types=0): 
train = True if types == 0 else False step = range(25) if train else range(10) self.exp = [] for _ in step: s = 0 if types == 2: h = np.random.randint(self.test_step[0], self.test_step[1]) else: h = np.random.choice(self.train_step) self.df = df = self.x[s, h:h + self.step_size] self.trend = trend = self.y[s, h:h + self.step_size] v = self.v[s, h:h + self.step_size] if not train: old_a = 0 lot = 0 money = self.money self.pip = [] tau = np.random.uniform(0, 1, (self.step_size, 32)) q = self.q([df, tau]) q = np.mean(q, -1) / (np.sqrt(np.std(q, -1)) + 1e-10) self.a = action = np.argmax(q, -1) # action = np.argmax( np.sum( self.q([df, tau]), -1 ), -1) for idx, action in zip(range(len(trend) - 1), action): action = 0 if action == 0 else -1 if action == 1 else 1 if (action == 1 or action == -1) and lot == 0: lot = (money * 0.05 / (trend[idx] / self.leverage)) r = trend[idx + 1] - trend[idx] r = (action * r - self.spread * np.abs(old_a - action)) * lot money += r money = np.clip(money, 0, None) self.pip.append(r) if old_a != action: lot = 0 if money <= 0: break old_a = action g = ((money - self.money) / self.money) * 100 self.exp.append(g) else: gammas = [] position = 0 actions = [] rewards = [] old_a = 0 noise_w = [w + np.random.normal(0, self.sigma, w.shape) for w in self.w] noise = np.random.normal(0, 0.1, self.action_size) self.model.set_weights(noise_w) for idx in range(len(trend) -1): df_t = np.array([df[idx]]) df_t = np.random.normal(df_t, 0.005) if np.random.rand() > 0.1: tau = np.random.uniform(0, 1, (1, 32)) q = self.q([df_t, tau]) q = np.mean(q, -1) action = np.argmax(q, -1)[0] else: tau = np.random.uniform(0, 1, (1, 32)) q = self.q([df_t, tau]) q = np.mean(q, -1) q = np.abs(q) / np.sum(np.abs(q), 1).reshape((-1, 1)) * (np.abs(q) / q) q += noise action = np.argmax(q, -1)[0] action = int(action) actions.append(action) action = action if action == 0 else -1 if action == 1 else 1 if old_a == action: r = 0 # r = trend[idx + 1] - trend[idx] # r = action * r - self.spread * np.abs(old_a - action) gamma = 0.99 elif position != 0: r = trend[idx + 1] - position r = action * r - self.spread# * np.abs(old_a - action) gamma = 0 position = 0 else: r = 0 gamma = 0.99 if (action == -1 or action == 1) and position == 0: position = trend[idx] gammas.append(gamma) rewards.append(r) old_a = action if len(rewards) > self.n: r = np.sum(rewards[-self.n:]) * 0.99 ** self.n if gammas[idx - (self.n - 1)] == 0.99 and 0 in gammas[-self.n:]: gammas[idx - (self.n - 1)] = 0.1 try: e = df[idx - (self.n - 1)], actions[idx - (self.n - 1)], r, df[idx + self.n], gammas[ idx - (self.n - 1)] self.memory.store(e) if (self.restore + 1) % 64 == 0: self.model.set_weights(self.w) self.train() self.w = self.model.get_weights() noise = np.random.normal(0, 0.1, self.action_size) self.restore += 1 except: pass if (idx + 1) % (self.step_size // 2) == 0: # 計算コストが高い self.epsilon = np.clip(self.epsilon * 0.99999, 0.05, None) self.threshold = -np.log(1 - self.epsilon + self.epsilon / self.action_size) self.model.set_weights(self.w) q = self.q([self.states, self.tau]) q = tf.reduce_mean(q, -1) noise_w = [w + np.random.normal(0, self.sigma, w.shape) for w in self.w] self.model.set_weights(noise_w) qe = self.q([self.states, self.tau]) qe = tf.reduce_mean(qe, -1) kl = tf.reduce_sum( tf.nn.softmax(q) * ( tf.math.log(tf.nn.softmax(q) + 1e-10) - tf.math.log(tf.nn.softmax(qe) + 1e-10)), axis=-1) mean_kl = np.mean(kl.numpy()) self.sigma = self.alpha * self.sigma if mean_kl < self.threshold else 1 / self.alpha * self.sigma noise_w = [w + 
np.random.normal(0, self.sigma, w.shape) for w in self.w] self.model.set_weights(noise_w) self.i += 1 if train: self.model.set_weights(self.w) def run(self): train_h = [] test_h = [] for idx in range(10000): start = time.time() if idx % 10 == 0: self.h = np.random.choice(self.train_step) self.step(0) train = [] test = [] for _ in range(1): self.step(1) train.extend(self.exp) self.step(2) test.extend(self.exp) print(f"epoch {self.i}") print(f"speed {time.time() - start}sec") plt.cla() train_h.append(np.median(train)) test_h.append(np.median(test)) plt.plot(train_h, label="train") plt.plot(test_h, label="test") plt.show() df = pd.DataFrame({"train": np.array(train), "test": np.array(test)}) print(df.describe()) np.save(self.name[0], self.i) self.model.save(self.name[1]) try: _ = shutil.copy(f"/content/{self.name[1]}", "/content/drive/My Drive") _ = shutil.copy(f"/content/{self.name[0]}.npy", "/content/drive/My Drive") except: pass
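# ---------------------------------------------------------------------------
# For reference, a standalone sketch of the quantile Huber loss that distributional
# agents such as the IQN-style Agent above are built around.  The shapes and the
# kappa threshold are assumptions; this is the textbook formulation, written
# independently of Agent.train() rather than copied from it.
import tensorflow as tf


def quantile_huber_loss(td_errors, tau, kappa=1.0):
    """td_errors: [batch, n_quantiles] TD errors; tau: [batch, n_quantiles] quantile levels."""
    abs_err = tf.abs(td_errors)
    # Huber part: quadratic inside the kappa band, linear outside
    huber = tf.where(abs_err <= kappa,
                     0.5 * tf.square(td_errors),
                     kappa * (abs_err - 0.5 * kappa))
    # asymmetric quantile weight |tau - 1{td_error < 0}|
    weight = tf.abs(tau - tf.cast(td_errors < 0.0, tf.float32))
    return tf.reduce_mean(tf.reduce_sum(weight * huber, axis=-1))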
class Model(object): def __init__(self, id, env, action_noise=None, action_bounds=(-1., 1.)): self.reward_dict = defaultdict(float) self.id = id self.env = env self.pi_fn = PI_FEATURE_NUM self.critic_fn = CRITIC_FEATURE_NUM self.action_bounds = action_bounds self.actor = Actor(self.pi_fn, AGENT_ACTION_CNT) self.actor_optim = Adam(self.actor.parameters(), lr=ACTOR_LR) self.reward_net = RewardNet(self.critic_fn, CRITIC_ACTION_NUM) self.rn_optim = Adam(self.reward_net.parameters(), lr=CRITIC_LR) self.memory = Memory() self.action_noise = action_noise def pi(self, state, all_memory_ready, apply_noise=True, done=False): if done: return np.array([0.]) if not all_memory_ready and apply_noise: sigma = np.clip(self.memory.size() / float(MEMORY_MIN_SIZE), 0, 1) * ACTION_NOISE_STDDEV self.action_noise.set_sigma(sigma) return np.clip( np.array([0.]) + self.action_noise(), self.action_bounds[0], self.action_bounds[1]) with torch.no_grad(): obs = get_pi_obs(state, self.id) action = float(self.actor(to_tensor(obs)).detach().numpy()) if self.action_noise is not None and apply_noise: noise = self.action_noise() action += noise action = np.clip(action, self.action_bounds[0], self.action_bounds[1]) return action def is_memory_ready(self): return self.memory.size() > MEMORY_MIN_SIZE def get_joint_info(self, batch, cur_date): interval = int(batch.shape[1] / MAX_GC_CNT) index = list(self.env.date_gc_index[cur_date]) index.sort() index = np.array(index) if interval == 1: return batch[:, index] else: all_index = index for i in range(len(batch) - 1): all_index = np.concatenate( [all_index, index + (i + 1) * MAX_GC_CNT]) return np.reshape( np.reshape(batch, (-1, interval))[all_index], (-1, interval * len(index))) def train(self, all_agents, cur_date): if self.memory.size() < BATCH_SIZE * 50: return None # Get a batch. idx, batch, isw = self.memory.sample(batch_size=BATCH_SIZE) # Get latest reward, since the reward is updating during iteration get_latest_reward(self.reward_dict, batch, self.id) self.rn_optim.zero_grad() s = to_tensor(batch['obs0'], requires_grad=True) a = to_tensor(batch['actions'], requires_grad=True) r = to_tensor(batch['rewards'], requires_grad=True) q = self.reward_net([s, a + ACTION_SCALE]) q_loss = torch.nn.MSELoss()(q, r) q_loss.backward() self.rn_optim.step() tderr = np.abs((q - r).detach().numpy()) self.memory.batch_update(idx, tderr) self.actor_optim.zero_grad() s = to_tensor(batch['obs0'], requires_grad=True) step_left = s[:, 0] * (MAX_STEP + 1) a_loss = -(self.reward_net([s, self.actor(s) + ACTION_SCALE]) * step_left).mean() a_loss.backward() self.actor_optim.step() return q_loss.detach().numpy(), a_loss.detach().numpy()
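# ---------------------------------------------------------------------------
# Model.pi() above only relies on an action_noise object that is callable and has
# set_sigma(); the noise process itself is not shown in this file.  A minimal
# Gaussian noise process with that interface is sketched below as an assumption
# (the original may well use Ornstein-Uhlenbeck noise instead).
import numpy as np


class GaussianActionNoise(object):
    def __init__(self, mu=0.0, sigma=0.2, size=1):
        self.mu, self.sigma, self.size = mu, sigma, size

    def set_sigma(self, sigma):
        # Model.pi() scales sigma with how full the replay memory is
        self.sigma = sigma

    def __call__(self):
        return np.random.normal(self.mu, self.sigma, self.size)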
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed = 0, buffer_size = int(1e4), batch_size = 64, gamma = 0.99, tau = 1e-3, lr = 7e-4, update_every = 4): self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, seed, fc1_units=32, fc2_units=8).to(device) self.qnetwork_target = QNetwork(state_size, action_size, seed, fc1_units=32, fc2_units=8).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=lr) # Replay memory self.memory = Memory(buffer_size, state_size, alpha = 0.6) # replay buffer size # Parameters self.batch_size = batch_size # minibatch size self.gamma = gamma # discount factor self.tau = tau # for soft update of target parameters self.update_every = update_every # how often to update the network # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 def step(self, state, action, reward, next_state, done, i_episode): # Save experience in replay memory self.memory.store(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % self.update_every if self.t_step == 0: if (len(self.memory) >= self.batch_size): # If enough samples are available in memory, get radom subset and learn self.learn(self.gamma, i_episode) def act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, gamma, episode_n): """Update value parameters using given batch of experience tuples. Params ====== experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ tree_id, states, actions, rewards, next_states, dones, ISWeights = self.memory.sample(self.batch_size) # Double DQN # Use local network to select max Q for actions in every experience Q_expected_next_max = self.qnetwork_local(next_states).detach().argmax(1).unsqueeze(1) # Use gather to get the same actions but from the Q on target network Q_targets_next = self.qnetwork_target(next_states).gather(1, Q_expected_next_max) # Normal DQN # use target network for selecting next Q value # Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Get expected Q values from local model Q_expected = self.qnetwork_local(states).gather(1, actions) dt_errors = Q_targets - Q_expected self.memory.batch_update(tree_id, (abs(dt_errors) + 1e-5).cpu().detach().numpy().flatten()) # Compute loss loss = torch.mul(dt_errors.pow(2), ISWeights) loss = torch.mean(loss) # Minimize the loss self.optimizer.zero_grad() loss.backward() self.optimizer.step() # ------------------- update target network ------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target) def soft_update(self, local_model, target_model): """Soft update model parameters. 
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter, taken from self.tau
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(self.tau*local_param.data + (1.0-self.tau)*target_param.data)
class DQfD: def __init__(self, env, config, demo_transitions=None): self.sess = tf.InteractiveSession() self.config = config # replay_memory stores both demo data and generated data, while demo_memory only store demo data self.replay_memory = Memory(capacity=self.config.replay_buffer_size, permanent_data=len(demo_transitions)) self.demo_memory = Memory(capacity=self.config.demo_buffer_size, permanent_data=self.config.demo_buffer_size) self.add_demo_to_memory( demo_transitions=demo_transitions ) # add demo data to both demo_memory & replay_memory self.time_step = 0 self.epsilon = self.config.INITIAL_EPSILON self.state_dim = env.observation_space.shape[0] self.action_dim = env.action_space.n self.action_batch = tf.placeholder("int32", [None]) self.y_input = tf.placeholder("float", [None, self.action_dim]) self.ISWeights = tf.placeholder("float", [None, 1]) self.n_step_y_input = tf.placeholder( "float", [None, self.action_dim]) # for n-step reward self.isdemo = tf.placeholder("float", [None]) self.eval_input = tf.placeholder("float", [None, self.state_dim]) self.select_input = tf.placeholder("float", [None, self.state_dim]) self.Q_eval self.Q_select self.loss self.optimize self.update_target_net self.abs_errors self.saver = tf.train.Saver() self.sess.run(tf.global_variables_initializer()) self.save_model() self.restore_model() def add_demo_to_memory(self, demo_transitions): # add demo data to both demo_memory & replay_memory for t in demo_transitions: self.demo_memory.store(np.array(t, dtype=object)) self.replay_memory.store(np.array(t, dtype=object)) assert len(t) == 10 # use the expert-demo-data to pretrain def pre_train(self): print('Pre-training ...') for i in range(self.config.PRETRAIN_STEPS): self.train_Q_network(pre_train=True) if i % 200 == 0 and i > 0: print('{} th step of pre-train finish ...'.format(i)) self.time_step = 0 print('All pre-train finish.') # TODO: How to add the variable created in tf.layers.dense to the customed collection? 
# def build_layers(self, state, collections, units_1, units_2, w_i, b_i, regularizer=None): # with tf.variable_scope('dese1'): # dense1 = tf.layers.dense(tf.contrib.layers.flatten(state), activation=tf.nn.relu, units=units_1, # kernel_initializer=w_i, bias_initializer=b_i, # kernel_regularizer=regularizer, bias_regularizer=regularizer) # with tf.variable_scope('dens2'): # dense2 = tf.layers.dense(dense1, activation=tf.nn.relu, units=units_2, # kernel_initializer=w_i, bias_initializer=b_i, # kernel_regularizer=regularizer, bias_regularizer=regularizer) # with tf.variable_scope('dene3'): # dense3 = tf.layers.dense(dense2, activation=tf.nn.relu, units=self.action_dim, # kernel_initializer=w_i, bias_initializer=b_i, # kernel_regularizer=regularizer, bias_regularizer=regularizer) # return dense3 def build_layers(self, state, c_names, units_1, units_2, w_i, b_i, reg=None): a_d = self.action_dim with tf.variable_scope('l1'): w1 = tf.get_variable('w1', [a_d, units_1], initializer=w_i, collections=c_names, regularizer=reg) b1 = tf.get_variable('b1', [1, units_1], initializer=b_i, collections=c_names, regularizer=reg) dense1 = tf.nn.relu(tf.matmul(state, w1) + b1) with tf.variable_scope('l2'): w2 = tf.get_variable('w2', [units_1, units_2], initializer=w_i, collections=c_names, regularizer=reg) b2 = tf.get_variable('b2', [1, units_2], initializer=b_i, collections=c_names, regularizer=reg) dense2 = tf.nn.relu(tf.matmul(dense1, w2) + b2) with tf.variable_scope('l3'): w3 = tf.get_variable('w3', [units_2, a_d], initializer=w_i, collections=c_names, regularizer=reg) b3 = tf.get_variable('b3', [1, a_d], initializer=b_i, collections=c_names, regularizer=reg) dense3 = tf.matmul(dense2, w3) + b3 return dense3 @lazy_property def Q_select(self): with tf.variable_scope('select_net') as scope: c_names = ['select_net_params', tf.GraphKeys.GLOBAL_VARIABLES] w_i = tf.random_uniform_initializer(-0.1, 0.1) b_i = tf.constant_initializer(0.1) reg = tf.contrib.layers.l2_regularizer( scale=0.2) # Note: only parameters in select-net need L2 return self.build_layers(self.select_input, c_names, 24, 24, w_i, b_i, reg) @lazy_property def Q_eval(self): with tf.variable_scope('eval_net') as scope: c_names = ['eval_net_params', tf.GraphKeys.GLOBAL_VARIABLES] w_i = tf.random_uniform_initializer(-0.1, 0.1) b_i = tf.constant_initializer(0.1) return self.build_layers(self.eval_input, c_names, 24, 24, w_i, b_i) def loss_l(self, ae, a): return 0.0 if ae == a else 0.8 def loss_jeq(self, Q_select): jeq = 0.0 for i in range(self.config.BATCH_SIZE): ae = self.action_batch[i] max_value = float("-inf") for a in range(self.action_dim): max_value = tf.maximum(Q_select[i][a] + self.loss_l(ae, a), max_value) jeq += self.isdemo[i] * (max_value - Q_select[i][ae]) return jeq @lazy_property def loss(self): l_dq = tf.reduce_mean( tf.squared_difference(self.Q_select, self.y_input)) l_n_dq = tf.reduce_mean( tf.squared_difference(self.Q_select, self.n_step_y_input)) # l_n_step_dq = self.loss_n_step_dq(self.Q_select, self.n_step_y_input) l_jeq = self.loss_jeq(self.Q_select) l_l2 = tf.reduce_sum([ tf.reduce_mean(reg_l) for reg_l in tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES) ]) return self.ISWeights * tf.reduce_sum([ l * λ for l, λ in zip([l_dq, l_n_dq, l_jeq, l_l2], self.config.LAMBDA) ]) @lazy_property def abs_errors(self): return tf.reduce_sum(tf.abs(self.y_input - self.Q_select), axis=1) # only use 1-step R to compute abs_errors @lazy_property def optimize(self): optimizer = tf.train.AdamOptimizer(self.config.LEARNING_RATE) return 
optimizer.minimize( self.loss) # only parameters in select-net is optimized here @lazy_property def update_target_net(self): select_params = tf.get_collection('select_net_params') eval_params = tf.get_collection('eval_net_params') return [tf.assign(e, s) for e, s in zip(eval_params, select_params)] def save_model(self): print("Model saved in : {}".format( self.saver.save(self.sess, self.config.MODEL_PATH))) def restore_model(self): self.saver.restore(self.sess, self.config.MODEL_PATH) print("Model restored.") def perceive(self, transition): self.replay_memory.store(np.array(transition)) # epsilon->FINAL_EPSILON(min_epsilon) if self.replay_memory.full(): self.epsilon = max(self.config.FINAL_EPSILON, self.epsilon * self.config.EPSILIN_DECAY) def train_Q_network(self, pre_train=False, update=True): """ :param pre_train: True means should sample from demo_buffer instead of replay_buffer :param update: True means the action "update_target_net" executes outside, and can be ignored in the function """ if not pre_train and not self.replay_memory.full( ): # sampling should be executed AFTER replay_memory filled return self.time_step += 1 assert self.replay_memory.full() or pre_train actual_memory = self.demo_memory if pre_train else self.replay_memory tree_idxes, minibatch, ISWeights = actual_memory.sample( self.config.BATCH_SIZE) np.random.shuffle(minibatch) state_batch = [data[0] for data in minibatch] action_batch = [data[1] for data in minibatch] reward_batch = [data[2] for data in minibatch] next_state_batch = [data[3] for data in minibatch] done_batch = [data[4] for data in minibatch] demo_data = [data[5] for data in minibatch] n_step_reward_batch = [data[6] for data in minibatch] n_step_state_batch = [data[7] for data in minibatch] n_step_done_batch = [data[8] for data in minibatch] actual_n = [data[9] for data in minibatch] # provide for placeholder,compute first Q_select = self.Q_select.eval( feed_dict={self.select_input: next_state_batch}) Q_eval = self.Q_eval.eval( feed_dict={self.eval_input: next_state_batch}) n_step_Q_select = self.Q_select.eval( feed_dict={self.select_input: n_step_state_batch}) n_step_Q_eval = self.Q_eval.eval( feed_dict={self.eval_input: n_step_state_batch}) y_batch = np.zeros((self.config.BATCH_SIZE, self.action_dim)) n_step_y_batch = np.zeros((self.config.BATCH_SIZE, self.action_dim)) for i in range(self.config.BATCH_SIZE): # state, action, reward, next_state, done, demo_data, n_step_reward, n_step_state, n_step_done = t temp = self.Q_select.eval( feed_dict={ self.select_input: state_batch[i].reshape((-1, self.state_dim)) })[0] temp_0 = np.copy(temp) # add 1-step reward action = np.argmax(Q_select[i]) temp[action_batch[i]] = reward_batch[i] + ( 1 - int(done_batch[i])) * self.config.GAMMA * Q_eval[i][action] y_batch[i] = temp # add n-step reward action = np.argmax(n_step_Q_select[i]) q_n_step = ( 1 - int(n_step_done_batch[i]) ) * self.config.GAMMA**actual_n[i] * n_step_Q_eval[i][action] temp_0[action_batch[i]] = n_step_reward_batch[i] + q_n_step n_step_y_batch[i] = temp_0 _, abs_errors = self.sess.run( [self.optimize, self.abs_errors], feed_dict={ self.y_input: y_batch, self.n_step_y_input: n_step_y_batch, self.select_input: state_batch, self.action_batch: action_batch, self.isdemo: demo_data, self.ISWeights: ISWeights }) self.replay_memory.batch_update( tree_idxes, abs_errors) # update priorities for data in memory # 此例中一局步数有限,因此可以外部控制一局结束后update ,update为false时在外部控制 if update and self.time_step % self.config.UPDATE_TARGET_NET == 0: 
            self.sess.run(self.update_target_net)

    def egreedy_action(self, state):
        if random.random() <= self.epsilon:
            return random.randint(0, self.action_dim - 1)
        return np.argmax(
            self.Q_select.eval(feed_dict={self.select_input: [state]})[0])
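# ---------------------------------------------------------------------------
# The DQfD class above decorates Q_select, Q_eval, loss, optimize, update_target_net
# and abs_errors with @lazy_property and then touches them once in __init__ to build
# the graph.  The decorator itself is not shown; a standard cached-property sketch
# that matches that usage is given below (an assumption, not the original helper).
import functools


def lazy_property(func):
    attribute = '_cache_' + func.__name__

    @property
    @functools.wraps(func)
    def wrapper(self):
        # build the sub-graph on first access, then reuse the cached result
        if not hasattr(self, attribute):
            setattr(self, attribute, func(self))
        return getattr(self, attribute)

    return wrapper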
class Dqn_agent: def __init__(self, asset_num, division, feature_num, gamma, network_topology, learning_rate, epsilon, epsilon_Min, epsilon_decay_period, update_tar_period, history_length, memory_size, batch_size, save_period, name, save): self.epsilon = epsilon self.epsilon_min = epsilon_Min self.epsilon_decay_period = epsilon_decay_period self.asset_num = asset_num self.division = division self.gamma = gamma self.name = name self.update_tar_period = update_tar_period self.history_length = history_length self.feature_num = feature_num self.global_step = tf.Variable(0, trainable=False) self.lr = learning_rate self.cnn_trainable = True self.action_num, self.actions = action_discretization( self.asset_num, self.division) config = tf.ConfigProto() self.sess = tf.Session(config=config) network_topology['output_num'] = self.action_num self.network_config = network_topology self.initialize_graph() t_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='target_net') e_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='estm_net') # assign parameters of estimate Q-net to target Q-net self.update_target = [ tf.assign(t, l) for t, l in zip(t_params, e_params) ] self.sess.run(tf.global_variables_initializer()) self.memory = Memory(self.action_num, self.actions, memory_size=memory_size, batch_size=batch_size) if save: self.save = save self.save_period = save_period self.name = name self.saver = tf.train.Saver() else: self.save = False # initialize variables that will be used in the training process def initialize_graph(self): # current price tensor self.price_his = tf.placeholder(dtype=tf.float32, shape=[ None, self.asset_num - 1, self.history_length, self.feature_num ], name="ob") # price tensor of next step self.price_his_ = tf.placeholder(dtype=tf.float32, shape=[ None, self.asset_num - 1, self.history_length, self.feature_num ], name="ob_") # weight vector of current step self.addi_inputs = tf.placeholder(dtype=tf.float32, shape=[None, self.asset_num], name='addi_inputs') # weight vector of next step self.addi_inputs_ = tf.placeholder(dtype=tf.float32, shape=[None, self.asset_num], name='addi_inputs_') # the actions chose by the DQN agent self.a = tf.placeholder(dtype=tf.int32, shape=[ None, ], name='a') self.input_num = tf.placeholder(dtype=tf.int32, shape=[]) # weight of each memory from the memory pool self.ISWeights = tf.placeholder(tf.float32, [None, 1], name='IS_weights') # Q-values of extimate net with tf.variable_scope('estm_net'): self.fc_input, self.q_pred = self.build_graph( self.price_his, self.addi_inputs, self.cnn_trainable) # Q-values of target net with tf.variable_scope('target_net'): _, self.tar_pred = self.build_graph(self.price_his_, self.addi_inputs_, self.cnn_trainable) # a holder to contain the target Q-value with tf.variable_scope('q_tar'): self.q_target = tf.placeholder(dtype=tf.float32, shape=[None], name='q_target') # select the largest estimate Q-values with tf.variable_scope('q_estm_wa'): a_indices = tf.stack( [tf.range(tf.shape(self.a)[0], dtype=tf.int32), self.a], axis=1) self.q_estm_wa = tf.gather_nd(params=self.q_pred, indices=a_indices) # loss function with tf.name_scope('loss'): error = tf.abs(self.q_target - self.q_estm_wa) self.abs_errors = error square = tf.square(error) self.loss = tf.reduce_mean(self.ISWeights * square) # update the parameters of estimate Q-net with tf.name_scope('train'): self.optimizer = tf.train.AdamOptimizer(self.lr) self.train_op = self.optimizer.minimize( self.loss, global_step=self.global_step) # network 
topology def build_graph(self, price_his, addi_input, trainable): kernels = self.network_config['kernels'] strides = self.network_config['strides'] filters = self.network_config['filters'] fc1_size = self.network_config['fc1_size'] # choose activate function def set_activation(activation): if activation == 'relu': activation = tf.nn.relu elif activation == 'selu': activation = tf.nn.selu else: activation = tf.nn.leaky_relu return activation cnn_activation = set_activation(self.network_config['cnn_activation']) w_initializer = tf.random_uniform_initializer(-0.05, 0.05) b_initializer = tf.constant_initializer( self.network_config['b_initializer']) regularizer = layers.l2_regularizer(self.network_config['regularizer']) conv = price_his # first cnn layer conv = tf.layers.conv2d(conv, filters=filters[0], kernel_size=kernels[0], strides=strides[0], trainable=trainable, activation=cnn_activation, kernel_regularizer=regularizer, bias_regularizer=regularizer, kernel_initializer=w_initializer, bias_initializer=b_initializer, padding='same', name=self.name + 'conv' + str(0)) # second cnn layer conv = tf.layers.conv2d(conv, filters=filters[1], kernel_size=kernels[1], strides=strides[1], trainable=trainable, activation=cnn_activation, kernel_regularizer=regularizer, bias_regularizer=regularizer, kernel_initializer=w_initializer, bias_initializer=b_initializer, padding='same', name=self.name + 'conv' + str(1)) # weight vector with the weight of cash removed addi_input1 = addi_input[:, 1:] # insert weight vector into the feature maps conv = tf.concat([conv, addi_input1[:, :, np.newaxis, np.newaxis]], axis=3) # third cnn layer conv = tf.layers.conv2d(conv, filters=filters[2], kernel_size=kernels[2], strides=strides[2], trainable=trainable, activation=cnn_activation, kernel_regularizer=regularizer, bias_regularizer=regularizer, kernel_initializer=w_initializer, bias_initializer=b_initializer, padding='same', name=self.name + 'conv' + str(2)) cash_bias = tf.ones((self.input_num, 1)) conv = tf.layers.flatten(conv) fc_input = tf.concat([cash_bias, conv], 1) fc1 = layers.fully_connected(fc_input, num_outputs=fc1_size, activation_fn=None, weights_initializer=w_initializer, trainable=True, scope=self.name + 'fc1') output_state = layers.fully_connected( fc1, num_outputs=1, activation_fn=None, weights_initializer=w_initializer, trainable=True, scope=self.name + 'output_state') output_action = layers.fully_connected( fc1, num_outputs=self.action_num, activation_fn=None, weights_initializer=w_initializer, trainable=True, scope=self.name + 'output_action') output = output_state + (output_action - tf.reduce_mean( output_action, axis=1, keep_dims=True)) return fc_input, output def replay(self): obs, action_batch, reward_batch, obs_, tree_idx, ISWeights = self.memory.sample( ) q_values_next = self.sess.run(self.q_pred, feed_dict={ self.price_his: obs_['history'], self.addi_inputs: obs_['weights'], self.input_num: obs_['history'].shape[0] }) best_actions = np.argmax(q_values_next, axis=1) q_values_next_target = self.sess.run(self.tar_pred, feed_dict={ self.price_his_: obs_['history'], self.addi_inputs_: obs_['weights'], self.input_num: obs_['history'].shape[0] }) targets_batch = reward_batch + self.gamma * q_values_next_target[ np.arange(len(action_batch)), best_actions] fd = { self.q_target: targets_batch, self.price_his: obs['history'], self.addi_inputs: obs['weights'], self.a: action_batch, self.input_num: obs['history'].shape[0], self.ISWeights: ISWeights } _, abs_errors, global_step = self.sess.run( [self.train_op, 
self.abs_errors, self.global_step], feed_dict=fd) self.memory.batch_update(tree_idx, abs_errors) if global_step % self.update_tar_period == 0: self.sess.run(self.update_target) if self.save and global_step % self.save_period == 0: self.saver.save(self.sess, abspath + 'logs/checkpoint/' + self.name, global_step=global_step) if self.epsilon > self.epsilon_min: self.epsilon -= (1 - self.epsilon_min) / self.epsilon_decay_period def choose_action(self, observation, test): def action_max(): fc_input, action_values = self.sess.run( [self.fc_input, self.q_pred], feed_dict={ self.price_his: observation['history'][np.newaxis, :, :, :], self.addi_inputs: observation['weights'][np.newaxis, :], self.input_num: 1 }) return np.argmax(action_values), fc_input if not test: if np.random.rand() > self.epsilon: action_idx, fc_input = action_max() else: action_idx = np.random.randint(0, self.action_num) action_idx_, fc_input = action_max() else: action_idx, fc_input = action_max() action_weights = self.actions[action_idx] return action_idx, action_weights, fc_input def store(self, ob, a, r, ob_): self.memory.store(ob, a, r, ob_) def get_training_step(self): a = self.sess.run(self.global_step) return a def restore(self, name): self.saver.restore(self.sess, abspath + 'logs/checkpoint/' + name) def start_replay(self): return self.memory.start_replay() def memory_cnt(self): return self.memory.tree.data_pointer
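# ---------------------------------------------------------------------------
# Dqn_agent relies on action_discretization(asset_num, division) to enumerate the
# discrete portfolio weight vectors it can choose from; the helper is not defined
# in this file.  One plausible implementation -- allocating `division` equal chunks
# of weight across the assets (cash included), i.e. all weight vectors on a simplex
# grid -- is sketched below purely as an assumption about its behaviour.
import itertools

import numpy as np


def action_discretization(asset_num, division):
    actions = []
    # place `division` indistinguishable chunks of weight 1/division into asset_num bins
    for split in itertools.combinations_with_replacement(range(asset_num), division):
        weights = np.zeros(asset_num)
        for asset in split:
            weights[asset] += 1.0 / division
        actions.append(weights)
    actions = np.array(actions)
    return len(actions), actions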