def __init__(self, name, N_station, h_size, tau, sess, ckpt_path=None):
    # ckpt_path is the path used to load saved models
    self.name = name
    self.buffer = network.experience_buffer()  # each agent holds its own experience replay buffer
    self.action = -1  # remember the most recent action taken
    self.ckpt_path = ckpt_path
    self.sess = sess
    self.drqn_build(N_station, h_size, tau)  # build the network
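# The network.experience_buffer class referenced above is not shown in this
# snippet. As a point of reference only, a minimal replay buffer with the usual
# add/sample interface could look like the sketch below; the project's actual
# class may store and sample transitions differently.
import random

import numpy as np


class SimpleExperienceBuffer:
    """Fixed-size FIFO buffer of transitions with uniform random sampling (illustrative sketch)."""

    def __init__(self, buffer_size=10000):
        self.buffer = []
        self.buffer_size = buffer_size

    def add(self, experience):
        # Append one transition (or one episode) and evict the oldest entry when full.
        if len(self.buffer) >= self.buffer_size:
            self.buffer.pop(0)
        self.buffer.append(experience)

    def sample(self, size):
        # Uniformly sample `size` stored entries as a NumPy object array.
        return np.array(random.sample(self.buffer, size), dtype=object)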
e = startE
stepDrop = (startE - endE) / anneling_steps

# create lists to contain total rewards and steps per episode
jList = []
rList = []
total_steps = 0

# network number
nn = 0

# Make a path for our model to be saved in.
if not os.path.exists(path):
    os.makedirs(path)

linucb_agent = bandit.linucb_agent(N_station, N_station * 4)
exp_replay = network.experience_buffer(15000)  # a single buffer holds everything
bandit_buffer = network.bandit_buffer(15000)
bandit_swap_e = 1
linucb_agent_backup = bandit.linucb_agent(N_station, N_station * 4)

# # this step loads the model that has been saved previously
# if load_model == True:
#     print('Loading Model...')
#     ckpt = tf.train.get_checkpoint_state(path)
#     saver.restore(sess, ckpt.model_checkpoint_path)

# this example sets the target network equal to the main network after every few episodes;
# we may want to modify this
with tf.Session(config=config1) as sess:
    # one DRQN per station is needed; each network requires a different scope (name)
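# bandit.linucb_agent is constructed above but its implementation is not part of
# this snippet. For orientation, a generic (disjoint) LinUCB contextual bandit
# can be sketched as follows; the class and method names here are illustrative
# and are not the project's actual API. The context dimension is assumed to be
# N_station * 4, matching the constructor call above.
import numpy as np


class LinUCBSketch:
    """Disjoint LinUCB: one ridge-regression model (A, b) per arm."""

    def __init__(self, n_arms, context_dim, alpha=1.0):
        self.alpha = alpha
        self.A = [np.eye(context_dim) for _ in range(n_arms)]    # d x d design matrices
        self.b = [np.zeros(context_dim) for _ in range(n_arms)]  # d-dim response vectors

    def select(self, context):
        # Score each arm with p_a = theta_a^T x + alpha * sqrt(x^T A_a^-1 x) and pick the best.
        scores = []
        for A, b in zip(self.A, self.b):
            A_inv = np.linalg.inv(A)
            theta = A_inv @ b
            scores.append(theta @ context + self.alpha * np.sqrt(context @ A_inv @ context))
        return int(np.argmax(scores))

    def update(self, arm, context, reward):
        # Rank-one update of the chosen arm's statistics.
        self.A[arm] += np.outer(context, context)
        self.b[arm] += reward * context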
def train(self):
    init = tf.global_variables_initializer()
    saver = tf.train.Saver(max_to_keep=1, reshape=True)
    trainables = tf.trainable_variables()
    targetOps = updateTargetGraph(trainables, self.tau)
    rList = []
    # portfolio_list = []
    total_steps = 0
    myBuffer = experience_buffer(self.buffer_size)
    episode_buffer = experience_buffer()
    e = self.startE
    stepDrop = (self.startE - self.endE) / self.anneling_steps
    with tf.Session() as sess:
        # initialize the variables
        sess.run(init)
        if self.load_model == True:
            print('Loading Model...')
            # load the saved model
            ckpt = tf.train.get_checkpoint_state(self.path)
            saver.restore(sess, ckpt.model_checkpoint_path)
            e = self.endE
        # set the target network equal to the main network
        updateTarget(targetOps, sess)
        # start the episodes
        for ii in range(self.num_episodes):
            rAll = 0
            d = False
            j = 0
            episode_buffer.buffer = []
            episode_reward_buffer = []
            self.environment.reset()
            self.agent.reset()
            rnn_state = np.array([mainQN.state_init for mainQN in self.mainQN])
            # print('initializing episode %d :' % ii, self.environment.idx, self.environment.KOSPI_idx, 'total num :', total_steps, 'ticker code', self.environment.chart_code)
            s = [self.environment.get_image(days) for days in self.network_type]
            s_potfol = np.array(self.agent.get_states())
            episode_step = 1
            while j < self.max_epLength and not d:
                j += 1
                # choose an action from the inputs (Bayesian dropout + Boltzmann)
                all_Q_d = np.zeros([self.agent.NUM_ACTIONS])
                before_rnn_state = rnn_state[:]
                for i, mainQN in enumerate(self.mainQN):
                    Q_d, rnn_state[i] = sess.run(
                        [mainQN.Q_dist, mainQN.state_out],
                        feed_dict={
                            mainQN.inImage: [s[i]],
                            mainQN.portfolio_state: [s_potfol],
                            mainQN.state_in[0]: rnn_state[i][0],
                            mainQN.state_in[1]: rnn_state[i][1],
                            mainQN.temp: e,
                            mainQN.keep_per: (1 - e) + 0.1,
                            mainQN.phase: True
                        })
                    all_Q_d += Q_d[0]
                # sum the probabilities of every network, then normalize
                # print(np.sum(all_Q_d))
                all_Q_d /= len(self.network_type)
                all_Q_d /= np.sum(all_Q_d)
                # print(np.sum(all_Q_d))
                a = np.random.choice(all_Q_d, p=all_Q_d)
                action = np.argmax(all_Q_d == a)
                # hand the action to the agent (policy)
                delayed_reward = self.agent.act(action=action, confidence=all_Q_d[action])
                d = self.environment.step()
                if e > self.endE and total_steps > self.pre_train_steps:
                    e -= stepDrop
                '''
                immediate_reward, delayed_reward = self.agent.act(action=action, confidence=all_Q_d[action])
                if e > self.endE and total_steps > self.pre_train_steps:
                    e -= stepDrop
                # move on to the next index
                d = self.environment.step()
                if (delayed_reward == 0 and episode_step % 5 == 0) or d:
                    delayed_reward = immediate_reward
                    self.agent.base_portfolio_value = self.agent.portfolio_value
                '''
                # get the next image and portfolio state
                # print('total step :', total_steps, 'current episode step : ', j, 'idx :', self.environment.idx, 'kospi_idx', self.environment.KOSPI_idx, 'ticker code', self.environment.chart_code)
                s1 = [self.environment.get_image(days) for days in self.network_type]
                s1_potfol = np.array(self.agent.get_states())
                episode_reward_buffer.append(delayed_reward)
                # store in the buffer
                # original buffer order: state, action, reward, next state, done
                # revised buffer: current image, action, reward, next image, next portfolio state, done, previous LSTM state, LSTM state, current portfolio
                # re-revised buffer: current image, action, next image, next portfolio state, done, previous LSTM state, LSTM state, current portfolio, reward (discount factor applied later)
                # episode_buffer.add([s, action, delayed_reward, s1, s1_potfol, d, before_rnn_state, rnn_state, s_potfol])
                episode_buffer.add([s, action, s1, s1_potfol, d, before_rnn_state, rnn_state, s_potfol])
                if total_steps > self.pre_train_steps and total_steps % self.training_step == 0:
                    try:
                        # fetch data from the buffer
                        # in training mode, update the policy network whenever a delayed reward exists
                        # original buffer order: state, action, reward, next state, done
                        # revised buffer: current image, action, reward, next image, next portfolio state, done, previous LSTM state, LSTM state, current portfolio
                        # batch size of the training data
                        trainBatch, size = myBuffer.sample(self.replay_memory, rList)  # (self.batch_size)
                        # print('sampled training batch: ', trainBatch.shape)
                        # the reward must be multiplied by the discount factor so that it propagates to earlier actions
                        for i in range(len(self.network_type)):
                            # the block below performs Double-DQN to build the target Q-value:
                            # the action is chosen by the main network
                            # Bayesian dropout and Boltzmann exploration are not used during training
                            # for LSTM training, a random episode and a random start date are chosen and replay_memory consecutive steps are used
                            # re-revised buffer: current image, action, next image, next portfolio state, done, previous LSTM state, LSTM state, current portfolio, reward (discount factor applied)
                            feed_dict = {
                                self.mainQN[i].inImage: [datas[i] for datas in trainBatch[:, 2]],
                                self.mainQN[i].portfolio_state: [data for data in trainBatch[:, 3]],
                                self.mainQN[i].state_in[0]: trainBatch[0, 6][i][0],
                                self.mainQN[i].state_in[1]: trainBatch[0, 6][i][1],
                                self.mainQN[i].keep_per: 1.0,
                                self.mainQN[i].phase: True
                            }
                            Q1 = sess.run(self.mainQN[i].predict, feed_dict=feed_dict)
                            del feed_dict
                            feed_dict_2 = {
                                self.targetQN[i].inImage: [datas[i] for datas in trainBatch[:, 2]],
                                self.targetQN[i].portfolio_state: [data for data in trainBatch[:, 3]],
                                self.targetQN[i].state_in[0]: trainBatch[0, 6][i][0],
                                self.targetQN[i].state_in[1]: trainBatch[0, 6][i][1],
                                self.targetQN[i].keep_per: 1.0,
                                self.targetQN[i].phase: True
                            }
                            Q2 = sess.run(self.targetQN[i].Qout, feed_dict=feed_dict_2)
                            del feed_dict_2
                            '''
                            Q1 = sess.run(self.mainQN[i].predict,
                                          feed_dict={self.mainQN[i].inImage: np.vstack(trainBatch[:, 3])})
                            # get the Q values from the target network
                            Q2 = sess.run(self.targetQN[i].Qout,
                                          feed_dict={self.targetQN[i].inImage: np.vstack(trainBatch[:, 3])})
                            '''
                            # build a mask from the done flags
                            end_multiplier = -(trainBatch[:, 4] - 1)
                            # among the target network's Q values, take the one at the action chosen by the main network (this is the Double-Q step)
                            doubleQ = Q2[range(size), Q1]
                            # add the discounted double Q value to the reward; y is the discount factor
                            # targetQ = immediate reward + discounted value of the next state (doubleQ)
                            targetQ = trainBatch[:, 8] + (self.y * doubleQ * end_multiplier)
                            # update the main network toward the target values:
                            # the loss is the difference between targetQ and the Q values of the taken actions
                            # original buffer order: state, action, reward, next state, done
                            # revised buffer: current image, action, reward, next image, next portfolio state, done, previous LSTM state, LSTM state, current portfolio
                            feed_dict = {
                                self.mainQN[i].inImage: [datas[i] for datas in trainBatch[:, 0]],
                                self.mainQN[i].portfolio_state: [data for data in trainBatch[:, 7]],
                                self.mainQN[i].targetQ: targetQ,
                                self.mainQN[i].actions: trainBatch[:, 1],
                                self.mainQN[i].keep_per: 1.0,
                                self.mainQN[i].state_in[0]: trainBatch[0, 5][i][0],
                                self.mainQN[i].state_in[1]: trainBatch[0, 5][i][1],
                                self.mainQN[i].phase: True
                            }
                            _ = sess.run(self.mainQN[i].updateModel, feed_dict=feed_dict)
                            del feed_dict
                            '''
                            _ = sess.run(self.mainQN[i].updateModel,
                                         feed_dict={self.mainQN[i].inImage: np.vstack(trainBatch[:, 0]),
                                                    self.mainQN[i].targetQ: targetQ,
                                                    self.mainQN[i].actions: trainBatch[:, 1]})
                            '''
                        updateTarget(targetOps, sess)
                    except IndexError as err:  # use a separate name so the exploration rate e is not shadowed
                        print(trainBatch)
                rAll += delayed_reward
                # rAll = delayed_reward
                # advance to the next state
                del s
                s = s1
                del s_potfol
                s_potfol = s1_potfol
                total_steps += 1
                episode_step += 1
                # portfolio_list.append(self.agent.portfolio_value)
            # append the rewards with the discount factor applied to the episode buffer
            accumulate = 0
            episode_reward_buffer.reverse()
            # print('%s episode_reward_len : ' % ii, len(episode_reward_buffer), 'episode_buffer_len :', len(episode_buffer.buffer))
            for i, reward in enumerate(episode_reward_buffer):
                accumulate = self.discount_factor * accumulate + reward
                idx = -(i + 1)
                episode_buffer.buffer[idx] += [accumulate]
                # print(idx, len(episode_buffer.buffer[idx]))
            myBuffer.add(episode_buffer.buffer)
            if len(rList) + 1 >= self.buffer_size:
                # self.buffer[0:1] = []
                del rList[0]
            rList.append(rAll)
            self.environment.chartcode_value[self.environment.chart_code] += \
                1 if self.agent.portfolio_value > self.agent.initial_balance else -1
            print("%d %s %d %d %d %d" % (ii, self.environment.chart_code, rAll,
                                         self.agent.portfolio_value,
                                         self.agent.minimum_portfolio_value,
                                         self.agent.maximum_portfolio_value))
            # print("%d %4f %d %4f %4f %d %d" % (total_steps, np.mean(rList[-10:]), np.mean(portfolio_list), np.max(rList[-10:]), np.min(rList[-10:]), np.max(portfolio_list), np.min(portfolio_list)))  # e)
            # print(sys.getsizeof(myBuffer.buffer), sys.getsizeof(episode_buffer.buffer))
            # portfolio_list = []
            if total_steps > self.pre_train_steps and ii % 50 == 0:
                try:
                    saver.save(sess, self.path + '/model-' + str(ii) + '.cptk')
                    with open('./value_chart.txt', 'w') as f:
                        data = json.dumps(self.environment.chartcode_value)
                        f.write(data)
                        del data
                    # print("Saved Model")
                except:
                    pass
                sleep(2)
        # training is finished: save the final model and report the average reward
        saver.save(sess, self.path + '/model-' + str(ii) + '.cptk')
        print("average reward per episode: " + str(sum(rList) / self.num_episodes))
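# The Double-DQN target built inside train() above can be written compactly.
# The sketch below mirrors the variable names in the loop (Q1 = main-network
# argmax actions, Q2 = target-network Q values) and is illustrative only, not a
# drop-in replacement for the TensorFlow graph code.
import numpy as np


def double_dqn_targets(rewards, dones, Q1, Q2, gamma):
    """targetQ = r + gamma * Q_target[a*] * (1 - done), with a* chosen by the main network."""
    end_multiplier = 1.0 - dones.astype(np.float32)   # 0 where the episode ended
    doubleQ = Q2[np.arange(len(Q1)), Q1]              # target-net value of the main-net action
    return rewards + gamma * doubleQ * end_multiplier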
man = agent(n_control, train_set[train_index, 8:])

# network configurations
I_size = 12
O_size = 40

# learning objects
mainQN = Qnetwork(I_size, O_size, n_control)
targetQN = Qnetwork(I_size, O_size, n_control)

init = tf.global_variables_initializer()
saver = tf.train.Saver()
trainables = tf.trainable_variables()
targetOps = updateTargetGraph(trainables, tau)
myBuffer = experience_buffer()

# ----------------------------
# training starts here
# ----------------------------
# Set the rate of random action decrease.
e = startE
stepDrop = (startE - endE) / anneling_steps

# create lists to contain total rewards and steps per episode
jList = []
rList = []
total_steps = 0

# Make a path for our model to be saved in.
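# updateTargetGraph / updateTarget are used throughout this code but not defined
# in these snippets. A common TF1-style implementation is a Polyak ("soft")
# update that moves each target-network variable a fraction tau toward its
# main-network counterpart, assuming the target variables make up the second
# half of the trainable-variable list; this is a sketch and may differ from the
# project's actual helpers.
import tensorflow as tf


def updateTargetGraph_sketch(tfVars, tau):
    # Build assignment ops: target <- tau * main + (1 - tau) * target.
    total_vars = len(tfVars)
    op_holder = []
    for idx, var in enumerate(tfVars[0:total_vars // 2]):
        target_var = tfVars[idx + total_vars // 2]
        op_holder.append(target_var.assign(tau * var.value() + (1 - tau) * target_var.value()))
    return op_holder


def updateTarget_sketch(op_holder, sess):
    # Run all the assignment ops inside the active session.
    for op in op_holder:
        sess.run(op)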
def test(self):
    init = tf.global_variables_initializer()
    saver = tf.train.Saver()
    trainables = tf.trainable_variables()
    targetOps = updateTargetGraph(trainables, self.tau)
    rList = []
    total_steps = 0
    myBuffer = experience_buffer()
    episode_buffer = experience_buffer()
    e = self.startE
    stepDrop = (self.startE - self.endE) / self.anneling_steps
    with tf.Session() as sess:
        # initialize the variables first, then restore, so the restore is not overwritten
        sess.run(init)
        if self.load_model == True:
            print('Loading Model...')
            # load the saved model
            ckpt = tf.train.get_checkpoint_state(self.path)
            saver.restore(sess, ckpt.model_checkpoint_path)
        # set the target network equal to the main network
        updateTarget(targetOps, sess)
        # start the episodes
        for ii in range(self.num_episodes):
            rAll = 0
            d = False
            j = 0
            episode_buffer.buffer = []
            self.environment.reset()
            self.agent.reset()
            rnn_state = np.array([mainQN.state_init for mainQN in self.mainQN])
            print('initializing episode %d :' % ii, self.environment.idx,
                  self.environment.KOSPI_idx, 'total num :', total_steps,
                  'ticker code', self.environment.chart_code)
            s = [self.environment.get_image(days) for days in self.network_type]
            s_potfol = np.array(self.agent.get_states())
            while j < self.max_epLength and not d:
                j += 1
                # choose an action from the inputs (Bayesian dropout + Boltzmann)
                all_Q_d = np.zeros([self.agent.NUM_ACTIONS])
                before_rnn_state = rnn_state[:]
                for i, mainQN in enumerate(self.mainQN):
                    Q_d, rnn_state[i] = sess.run(
                        [mainQN.Q_dist, mainQN.state_out],
                        feed_dict={
                            mainQN.inImage: [s[i]],
                            mainQN.portfolio_state: [s_potfol],
                            mainQN.state_in[0]: rnn_state[i][0],
                            mainQN.state_in[1]: rnn_state[i][1],
                            mainQN.temp: e,
                            mainQN.keep_per: (1 - e) + 0.1,
                            mainQN.phase: True
                        })
                    all_Q_d += Q_d[0]
                # sum the probabilities of every network, then normalize
                # print(np.sum(all_Q_d))
                all_Q_d /= len(self.network_type)
                all_Q_d[0] += 1 - np.sum(all_Q_d)
                # print(np.sum(all_Q_d))
                a = np.random.choice(all_Q_d, p=all_Q_d)
                action = np.argmax(all_Q_d == a)
                # hand the action to the agent (policy)
                immediate_reward, delayed_reward = self.agent.act(action=action, confidence=all_Q_d[action])
                if e > self.endE and total_steps > self.pre_train_steps:
                    e -= stepDrop
                if delayed_reward == 0 and total_steps % 5 == 0:
                    delayed_reward = immediate_reward
                    self.agent.base_portfolio_value = self.agent.portfolio_value
                # move on to the next index
                d = self.environment.step()
                # get the next image and portfolio state
                # print('total step :', total_steps, 'current episode step : ', j, 'idx :', self.environment.idx, 'kospi_idx', self.environment.KOSPI_idx, 'ticker code', self.environment.chart_code)
                s1 = [self.environment.get_image(days) for days in self.network_type]
                s1_potfol = np.array(self.agent.get_states())
                # store in the buffer
                # original buffer order: state, action, reward, next state, done
                # revised buffer: current image, action, reward, next image, next portfolio state, done, previous LSTM state, LSTM state, current portfolio
                rAll += delayed_reward
                # advance to the next state
                del s
                s = s1
                del s_potfol
                s_potfol = s1_potfol
                total_steps += 1
            if total_steps > self.pre_train_steps and ii % 50 == 0:
                saver.save(sess, self.path + '/model-' + str(ii) + '.cptk')
                print("Saved Model")
                sleep(3)
            rList.append(rAll)
            myBuffer.add(episode_buffer.buffer)
            if len(rList) % 10 == 0:
                print(total_steps, np.mean(rList[-10:]), e)
                sleep(2)
        # saver.save(sess, self.path + '/model-' + str(i) + '.cptk')
        # display the average reward
        print("average reward per episode: " + str(sum(rList) / self.num_episodes))
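# Both train() and test() pick an action by averaging each network's Boltzmann
# distribution over Q values (Q_dist, fed with a temperature) and then sampling
# from it. A standalone sketch of temperature-scaled (Boltzmann) sampling,
# independent of the TensorFlow graph, is shown below; the function name is
# illustrative.
import numpy as np


def boltzmann_action(q_values, temperature):
    """Sample an action index from softmax(Q / T); a higher T gives more exploration."""
    logits = np.asarray(q_values, dtype=np.float64) / max(temperature, 1e-8)
    logits -= logits.max()  # numerical stability
    probs = np.exp(logits) / np.exp(logits).sum()
    return int(np.random.choice(len(probs), p=probs)), probs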