def replay(track, width, height, rand):
    sess = tf.Session()
    game = Game(track, width, height, show_game=True)
    brain = DQN(sess, width, height, CHANNEL, NUM_ACTION)

    saver = tf.train.Saver()
    ckpt = tf.train.get_checkpoint_state('model')
    saver.restore(sess, ckpt.model_checkpoint_path)

    for episode in range(MAX_EPISODE):
        terminal = False
        total_reward = 0

        state = game.reset()
        brain.init_state(state)

        while not terminal:
            if rand and np.random.rand() < 0.1:
                action = random.randrange(NUM_ACTION)
            else:
                action = brain.get_action()

            state, reward, terminal = game.step(action)
            total_reward += reward

            brain.remember(state, action, reward, terminal)

            time.sleep(0.15)

        print('Games: %d Score: %d' % (episode + 1, total_reward))

def replay():
    print('Waking up the brain cells..')
    sess = tf.Session()

    game = Game(SCREEN_WIDTH, SCREEN_HEIGHT, show_game=True)
    brain = DQN(sess, SCREEN_WIDTH, SCREEN_HEIGHT, NUM_ACTION)

    saver = tf.train.Saver()
    ckpt = tf.train.get_checkpoint_state('model')
    saver.restore(sess, ckpt.model_checkpoint_path)

    # Start the game.
    for episode in range(MAX_EPISODE):
        terminal = False
        total_reward = 0

        state = game.reset()
        brain.init_state(state)

        while not terminal:
            action = brain.get_action()

            # Run the game with the chosen action and receive the reward
            # and whether the game has ended.
            state, reward, terminal = game.step(action)
            total_reward += reward

            brain.remember(state, action, reward, terminal)

            # Slow the game down to a speed humans can follow. :)
            time.sleep(0.3)

        print('Games: %d Score: %d' % (episode + 1, total_reward))

def replay():
    sess = tf.Session()
    game = Game(SCREEN_WIDTH, SCREEN_HEIGHT, OBS_NUM, BUN_NUM, show_game=True)
    brain = DQN(sess, SCREEN_WIDTH, SCREEN_HEIGHT, CHANNEL, NUM_ACTION)

    saver = tf.train.Saver()
    ckpt = tf.train.get_checkpoint_state('model')
    saver.restore(sess, ckpt.model_checkpoint_path)

    for episode in range(MAX_EPISODE):
        terminal = False
        total_reward = 0

        state = game.reset()
        brain.init_state(state)

        while not terminal:
            action = brain.get_action()

            state, reward, terminal = game.step(action)
            total_reward += reward

            brain.remember(state, action, reward, terminal)

            time.sleep(0.3)

        print('Games: %d Score: %d' % (episode + 1, total_reward))

def replay():
    print('wake up the brain...')
    sess = tf.Session()
    game = Game(SCREEN_WIDTH, SCREEN_HEIGHT, show_game=True)
    brain = DQN(sess, SCREEN_WIDTH, SCREEN_HEIGHT, NUM_ACTION)

    saver = tf.train.Saver()
    ckpt = tf.train.get_checkpoint_state('model')
    saver.restore(sess, ckpt.model_checkpoint_path)

    for episode in range(MAX_EPISODE):
        terminal = False
        total_reward = 0

        state = game.reset()
        brain.init_state(state)

        while not terminal:
            action = brain.get_action()

            state, reward, terminal = game.step(action)
            total_reward += reward

            brain.remember(state, action, reward, terminal)

            time.sleep(0.3)

        print('episode: %d, score: %d' % (episode + 1, total_reward))

def train(): print("뇌세포 깨우는 중..") sess = tf.Session() game = Game(SCREEN_WIDTH, SCREEN_HEIGHT, show_game=False) brain = DQN(sess, SCREEN_WIDTH, SCREEN_HEIGHT, NUM_ACTION) rewards = tf.placeholder(tf.float32, [None]) tf.summary.scalar('avg.reward/ep.', tf.reduce_mean(rewards)) saver = tf.train.Saver() sess.run(tf.global_variables_initializer()) writer = tf.summary.FileWriter('logs', sess.graph) summary_merged = tf.summary.merge_all brain.update_target_network() epsilon = 1.0 time_step = 0 total_reward_list = [] for episode in range(MAX_EPISODE): terminal = False total_reward = 0 state = game.reset() brain.init_state(state) while not terminal: if np.random.rand() < epsilon: action = random.randrange(NUM_ACTION) else: action = brain.get_action() if episode > OBSERVE: rpdilon -= 1 / 1000 state, reward, terminal = game.step(action) total_reward += reward brain.remember(state, action, reward, terminal) if time_step > OBSERVE and time_step % TRAIN_INTERVAL == 0: brain.train() if time_step % TARGET_UPDATE_INTERVAL == 0: brain.update_target_network() time_step += 1 print('게임횟수 : %d, 점수 : %d' % (episode + 1, total_reward)) total_reward_list.append(total_reward) if episode % 10 == 0: summary = sess.run(summary_merged, feed_dict={rewards: total_reward_list}) writer.add_summary(summary, time_step) total_reward_list = [] if episode % 100 == 0: saver.save(sess, 'model/dqn.ckpt', global_step=time_step)
def replay():
    print('Waking up the brain cells..')
    sess = tf.Session()

    game = Game(SCREEN_WIDTH, SCREEN_HEIGHT, show_game=True)
    # NUM_ACTION sets the number of final outputs, i.e. the number of actions to choose from.
    brain = DQN(sess, SCREEN_WIDTH, SCREEN_HEIGHT, NUM_ACTION)

    saver = tf.train.Saver()
    # Load the saved model.
    ckpt = tf.train.get_checkpoint_state('model')
    saver.restore(sess, ckpt.model_checkpoint_path)

    for episode in range(MAX_EPISODE):
        terminal = False
        total_reward = 0

        state = game.reset()
        brain.init_state(state)

        while not terminal:
            action = brain.get_action()

            state, reward, terminal = game.step(action)
            total_reward += reward

            brain.remember(state, action, reward, terminal)

            time.sleep(0.3)

        print('Games: %d Score: %d' % (episode + 1, total_reward))

def train():
    print('Waking up the brain cells..')
    sess = tf.Session()

    game = Game(screenWidth, screenHeight, show_game=False)
    brain = DQN(sess, screenWidth, screenHeight, numAction)

    rewards = tf.placeholder(tf.float32, [None])
    tf.summary.scalar('avg.reward/ep.', tf.reduce_mean(rewards))

    saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())

    writer = tf.summary.FileWriter('logs', sess.graph)
    summaryMerged = tf.summary.merge_all()

    brain.updateTargetNetwork()

    timeStep = 0
    totalRewardList = []

    for episode in range(maxEpisode):
        terminal = False
        totalReward = 0
        epsilon = 1.0

        state = game.reset()
        brain.initState(state)

        while not terminal:
            if np.random.rand() < epsilon:
                action = random.randrange(numAction)
            else:
                action = brain.getAction()

            if episode > observe:
                epsilon -= 1 / 1000

            state, reward, terminal = game.step(action)
            totalReward += reward

            brain.remember(state, action, reward, terminal)

            if timeStep > observe and timeStep % trainInterval == 0:
                brain.train()

            if timeStep % targetUpdateInterval == 0:
                brain.updateTargetNetwork()

            timeStep += 1

        totalRewardList.append(totalReward)

        if episode % 10 == 0:
            summary = sess.run(summaryMerged, feed_dict={rewards: totalRewardList})
            writer.add_summary(summary, timeStep)

        if episode % 100 == 99:
            print("Games: {0}, Score: {1:.4f}".format(episode + 1, totalReward))
            saver.save(sess, './model/dqn.ckpt', global_step=timeStep)

def train():
    with tf.Session() as sess:
        tf.set_random_seed(GLOBAL_SEED)

        brain = DQN(sess, observation_size, action_size)

        rewards = tf.placeholder(tf.float32, [None])
        tf.summary.scalar('avg.reward/ep.', tf.reduce_mean(rewards))

        saver = tf.train.Saver()
        sess.run(tf.global_variables_initializer())

        writer = tf.summary.FileWriter('logs', sess.graph)
        summary_merged = tf.summary.merge_all()

        brain.update_target_network()

        time_step = 0
        total_reward_list = []

        for episode in range(MAX_EPISODE):
            done = False
            total_reward = 0
            epsilon = 1. / ((episode / 10) + 1)

            observation = env.reset()
            brain.init_state(observation)

            while not done:
                if np.random.rand() < epsilon:
                    action = random.randrange(action_size)
                else:
                    action = brain.get_action()

                observation, reward, done, _ = env.step(action)
                # print(observation, reward, done)
                total_reward += reward

                brain.remember(observation, action, reward, done)

                if time_step > 0:
                    if time_step % TRAIN_INTERVAL_FRAMES == 0:
                        _, loss = brain.train()

                    if time_step % TARGET_UPDATE_INTERVAL == 0:
                        brain.update_target_network()

                time_step += 1

            print('episode: %d total_reward: %d' % (episode, total_reward))

            total_reward_list.append(total_reward)

            if episode % 10 == 0:
                summary = sess.run(summary_merged, feed_dict={rewards: total_reward_list})
                writer.add_summary(summary, time_step)
                total_reward_list = []

            if episode % 100 == 0:
                saver.save(sess, 'model/dqn.ckpt', global_step=time_step)

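# Each variant stores transitions with brain.remember(...) and later samples them
# inside brain.train(). Below is a minimal sketch of that experience-replay buffer,
# assuming a fixed capacity and uniform sampling; the ReplayMemory name, capacity
# default, and explicit next_state argument are illustrative (the snippets' own
# remember() omits next_state because the DQN class tracks the frame stack itself).
import random
from collections import deque

class ReplayMemory:
    def __init__(self, capacity=50000):
        # Old transitions fall off the left end once capacity is reached.
        self.buffer = deque(maxlen=capacity)

    def remember(self, state, action, reward, next_state, terminal):
        self.buffer.append((state, action, reward, next_state, terminal))

    def sample(self, batch_size):
        # Uniformly sample a minibatch of past transitions for one train() step.
        batch = random.sample(self.buffer, min(batch_size, len(self.buffer)))
        # Transpose into column lists: states, actions, rewards, next_states, terminals.
        return [list(column) for column in zip(*batch)]
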
def train():
    print('Waking up the brain cells..')
    sess = tf.Session()
    game = Game(SCREEN_WIDTH, SCREEN_HEIGHT)
    brain = DQN(sess, SCREEN_WIDTH, SCREEN_HEIGHT, NUM_ACTION)

    rewards = tf.placeholder(tf.float32, [None])
    tf.summary.scalar('avg.reward/ep.', tf.reduce_mean(rewards))

    sess.run(tf.global_variables_initializer())

    # Initialize the target network.
    brain.update_target_network()

    time_step = 0
    epsilon = 1.0

    for episode in range(MAX_EPISODE):
        # Start a game.
        terminal = False

        # Reset the game and fetch the current state,
        # a screen of size screen_width x screen_height.
        _, state, _, _ = game.first_step()
        brain.init_state(state)

        while not terminal:
            # Fetch the game record.
            action, state, reward, terminal = game.step()

            # Store the current state in the brain; the stored states are used
            # for training and for choosing the action in the next state.
            brain.remember(state, action, reward, terminal)

            if (time_step > OBSERVE) and (time_step % TRAIN_INTERVAL) == 0:
                brain.train()

            # Update the target network.
            # if (time_step % TARGET_UPDATE_INTERVAL) == 0:
            #     brain.update_target_network()

            time_step += 1

        # if episode % 50 == 0:
        print(episode)
        save_model(sess)

def replay():
    print('Loading..')
    sess = tf.Session()

    global_step = tf.Variable(0, trainable=False, name='global_step')
    brain = DQN(sess, SCREEN_WIDTH, SCREEN_HEIGHT, NUM_ACTION, global_step)
    # brain = DQN(sess, SCREEN_WIDTH, SCREEN_HEIGHT, NUM_ACTION)

    saver = tf.train.Saver()
    ckpt = tf.train.get_checkpoint_state(MODEL_PATH)
    saver.restore(sess, ckpt.model_checkpoint_path)

    server.accept()

    # Start the game.
    for episode in range(MAX_EPISODE):
        terminal = False
        total_reward = 0

        # state = game.reset()
        id, _, _, _, state = server.readStatus()
        state = reshapeFromPacket(state)
        brain.init_state(state)

        while not terminal:
            action = brain.get_action()

            # Run the game with the chosen action and receive the reward
            # and whether the game has ended.
            # state, reward, terminal = game.step(action)
            server.sendX(id, action)
            id, reward, totalScore, terminal, state = server.readStatus()
            state = reshapeFromPacket(state)

            total_reward += reward

            brain.remember(state, action, reward, terminal)

            # Slow the game down to a speed humans can follow. :)
            # time.sleep(0.3)

        print('Count of Play: %d total reward: %d' % (episode + 1, total_reward),
              "Action", action)

def replay():
    sess = tf.Session()

    brain = DQN(sess, observation_size, action_size)

    saver = tf.train.Saver()
    ckpt = tf.train.get_checkpoint_state('model')
    saver.restore(sess, ckpt.model_checkpoint_path)

    for episode in range(MAX_EPISODE):
        done = False
        total_reward = 0

        observation = env.reset()
        brain.init_state(observation)

        while not done:
            action = brain.get_action()

            observation, reward, done, _ = env.step(action)
            total_reward += reward

            brain.remember(observation, action, reward, done)

            time.sleep(0.3)

        print('episode: %d total_reward: %d' % (episode, total_reward))

def train_rl(images, targets, folds, stochastic=False, test=False, base_rand=False):
    print('start train rl')
    # print(images.shape)
    # (X_train, y_train), (X_val, y_val), (X_test, y_test) = reformatInput_rl(images, targets, fold)
    # X_train = X_train.astype("float32", casting='unsafe')
    # X_val = X_val.astype("float32", casting='unsafe')
    # X_test = X_test.astype("float32", casting='unsafe')
    # print('check')
    # print(X_train.shape)

    with tf.Session() as sess:
        # config = get_config(FLAGS) or FLAGS
        model = DQN(sess, SCREEN_WIDTH, SCREEN_HEIGHT, n_act)

        rewards = tf.placeholder(tf.float32, [None])
        tf.summary.scalar('avg.reward/ep.', tf.reduce_mean(rewards))

        saver = tf.train.Saver()
        sess.run(tf.global_variables_initializer())

        writer = tf.summary.FileWriter('logs', sess.graph)
        summary_merged = tf.summary.merge_all()

        print('total %s folds' % len(folds))

        # Initialize the target network.
        model.update_target_network()

        # Epsilon decides when the next action comes from the DQN.
        epsilon = 1.0
        # Frame count.
        t_step = 0
        tot_reward_list = []

        MAX_EPISODE = 10000
        n_img = len(targets)
        n_epi = n_img
        if stochastic:
            n_epi = MAX_EPISODE

        # Precompute predictions and losses.
        n_test = 3
        if test:  # for debugging
            pred_all, loss_all = predict_all(images[0:n_test, :], targets[0:n_test, :])
            if not stochastic:
                n_epi = n_test
        else:
            pred_all, loss_all = predict_all(images, targets)
            # pred_all_train, loss_all_train = predict_all(X_train, y_train)
        # print(pred_all)

        # Run the simulation.
        pred_rl = []
        for epi in range(n_epi):
            terminal = False
            tot_reward = 0

            # Initialize the episode and parse the current state.
            state = np.expand_dims(images[epi], 0)
            # state = np.expand_dims(X_train[epi], 0)
            model.init_state(state)

            if np.random.rand() < epsilon:
                act = random.randrange(n_act)
            else:
                act = model.get_action()

            if epi > OBSERVE:
                epsilon -= 1 / 100

            if base_rand:
                act = random.randrange(n_act)

            # Stochastic mode samples a random image; otherwise walk through in order.
            if stochastic:
                ii = random.randrange(n_img)
                state = np.expand_dims(images[ii], 0)
                # state = np.expand_dims(X_train[ii], 0)
                state_i = ii
            else:
                state = np.expand_dims(images[epi], 0)
                # state = np.expand_dims(X_train[epi], 0)
                state_i = epi

            # Map the chosen action to a model.
            chosen_model = model_list[act]

            # Reward function.
            if pred_all[chosen_model][state_i] == 1:
                reward = 1
                pred_rl.append(1)
            else:
                reward = -2
                pred_rl.append(0)

            tot_reward += reward

            model.remember(state, act, reward, terminal)

            if t_step > OBSERVE and t_step % TRAIN_INTERVAL == 0:
                # Train the DQN.
                model.train()

            if t_step % TARGET_UPDATE_INTERVAL == 0:
                # Update the target network.
                model.update_target_network()

            t_step += 1

            print('epi: %d score: %d' % (epi + 1, tot_reward))

            tot_reward_list.append(tot_reward)

            if epi % 10 == 0:
                summary = sess.run(summary_merged, feed_dict={rewards: tot_reward_list})
                writer.add_summary(summary, t_step)
                tot_reward_list = []

            if epi % 100 == 0:
                saver.save(sess, 'model/dqn.ckpt', global_step=t_step)

        return tot_reward_list, pred_rl, pred_all

def train():
    print('Waking up the brain cells..')
    sess = tf.Session()
    game = Game(SCREEN_WIDTH, SCREEN_HEIGHT, show_game=False)
    # NUM_ACTION sets the number of final outputs, i.e. the number of actions to choose from.
    brain = DQN(sess, SCREEN_WIDTH, SCREEN_HEIGHT, NUM_ACTION)

    # Saving and inspecting training results:
    # record the score earned in each game.
    rewards = tf.placeholder(tf.float32, [None])
    tf.summary.scalar('avg.reward/ep.', tf.reduce_mean(rewards))

    # Checkpoint saving.
    saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())

    writer = tf.summary.FileWriter('logs', sess.graph)
    summary_merged = tf.summary.merge_all()

    # Initialize the target network.
    brain.update_target_network()

    # Decides when the DQN starts choosing the next action: before then,
    # actions are picked at random, and epsilon decays as play progresses.
    epsilon = 1.0
    # Number of frames elapsed, used to pace training.
    time_step = 0
    # Score history for inspecting training results.
    total_reward_list = []

    # Start training.
    for episode in range(MAX_EPISODE):
        terminal = False            # game-over flag
        total_reward = 0            # total score earned in one game

        state = game.reset()        # reset the game
        brain.init_state(state)     # reset the DQN state

        # Play until the green square collides with another square.
        while not terminal:
            # Early in training (before ~100 games), act at random.
            if np.random.rand() < epsilon:
                action = random.randrange(NUM_ACTION)
            else:
                action = brain.get_action()

            # After OBSERVE games, gradually reduce the share of random actions.
            if episode > OBSERVE:
                epsilon -= 1 / 1000

            # Receive the game state, the reward, and whether the game has ended.
            state, reward, terminal = game.step(action)
            total_reward += reward

            # Store the current state in the network object;
            # the stored memories are used to train the network.
            brain.remember(state, action, reward, terminal)

            # After 100 frames, train once every 4 frames.
            if time_step > OBSERVE and time_step % TRAIN_INTERVAL == 0:
                brain.train()

            # Refresh the target network once every 1000 frames.
            if time_step % TARGET_UPDATE_INTERVAL == 0:
                brain.update_target_network()

            time_step += 1

        # When the game ends, print and record the score.
        print('Games: %d Score: %d' % (episode + 1, total_reward))
        total_reward_list.append(total_reward)

        # Log the scores every 10 episodes; save the model every 100.
        if episode % 10 == 0:
            summary = sess.run(summary_merged, feed_dict={rewards: total_reward_list})
            writer.add_summary(summary, time_step)
            total_reward_list = []

        if episode % 100 == 0:
            saver.save(sess, 'model/dqn.ckpt', global_step=time_step)

def train(cont):
    sess = tf.Session()
    game = Game(SCREEN_WIDTH, SCREEN_HEIGHT, OBS_NUM, BUN_NUM, show_game=False)
    brain = DQN(sess, SCREEN_WIDTH, SCREEN_HEIGHT, CHANNEL, NUM_ACTION)

    rewards = tf.placeholder(tf.float32, [None])
    tf.summary.scalar('avg.reward/ep.', tf.reduce_mean(rewards))

    saver = tf.train.Saver()
    if cont:
        sess.run(tf.global_variables_initializer())

        # Pull the checkpoint path out of the CheckpointState's string form
        # and restore only the conv2d weights (excluding Adam slots).
        ckpt = str(tf.train.get_checkpoint_state('model'))
        i = ckpt.find("\"") + 1
        j = ckpt.find("\"", i)
        reader = pywrap_tensorflow.NewCheckpointReader(ckpt[i:j])
        var_to_shape_map = reader.get_variable_to_shape_map()
        target_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
        for key in var_to_shape_map:
            if "conv2d" in key and "Adam" not in key:
                for key_f in target_vars:
                    if key in key_f.name:
                        sess.run(key_f.assign(reader.get_tensor(key)))
                        break
        # saver.restore(sess, ckpt.model_checkpoint_path)
    else:
        sess.run(tf.global_variables_initializer())

    writer = tf.summary.FileWriter('logs', sess.graph)
    summary_merged = tf.summary.merge_all()

    # Initialize the target network.
    brain.update_target_network()

    # Decides when the DQN starts choosing the next action.
    epsilon = 1.0
    # Frame count.
    time_step = 0
    total_reward_list = []

    for episode in range(MAX_EPISODE):
        terminal = False
        total_reward = 0

        state = game.reset()
        brain.init_state(state)

        if episode > OBSERVE:
            epsilon = 0.01

        while not terminal:
            if np.random.rand() < epsilon:
                action = random.randrange(NUM_ACTION)
            else:
                action = brain.get_action()
                epsilon += 0.00001

            state, reward, terminal = game.step(action)
            total_reward += reward

            brain.remember(state, action, reward, terminal)

            if time_step > OBSERVE and time_step % TRAIN_INTERVAL == 0:
                brain.train()

            if time_step % TARGET_UPDATE_INTERVAL == 0:
                brain.update_target_network()

            time_step += 1

        if episode % 10 == 0:
            print('Games: %d Score: %d' % (episode + 1, total_reward))

        total_reward_list.append(total_reward)

        if episode % 10 == 0:
            summary = sess.run(summary_merged, feed_dict={rewards: total_reward_list})
            writer.add_summary(summary, time_step)
            total_reward_list = []

        if episode % 10000 == 0:
            saver.save(sess, 'model/dqn.ckpt', global_step=episode)

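# The string parsing above (find("\"")) recovers the checkpoint path from the
# printed repr of the CheckpointState protobuf. A sketch of the more direct route,
# assuming the same 'model' directory; model_checkpoint_path is the field the
# other snippets already pass to saver.restore():
ckpt_state = tf.train.get_checkpoint_state('model')
if ckpt_state is not None:
    reader = pywrap_tensorflow.NewCheckpointReader(ckpt_state.model_checkpoint_path)
    var_to_shape_map = reader.get_variable_to_shape_map()
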
def train():
    print('Waking up the brain cells..')
    sess = tf.Session()
    game = Game(SCREEN_WIDTH, SCREEN_HEIGHT, show_game=True)
    brain = DQN(sess, SCREEN_WIDTH, SCREEN_HEIGHT, NUM_ACTION)

    rewards = tf.placeholder(tf.float32, [None])
    tf.summary.scalar('avg.reward/ep.', tf.reduce_mean(rewards))

    saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())

    writer = tf.summary.FileWriter('logs', sess.graph)
    summary_merged = tf.summary.merge_all()

    # Initialize the target network.
    brain.update_target_network()

    # Decides when the DQN starts choosing the next action.
    epsilon = 1.0
    # Frame count.
    time_step = 0
    total_reward_list = []

    # Start the game.
    for episode in range(MAX_EPISODE):
        terminal = False
        total_reward = 0

        # Reset the game and fetch the current state,
        # a screen of size screen_width x screen_height.
        state = game.reset()
        brain.init_state(state)

        while not terminal:
            if game.previous_price == 0:
                now_price = driver.find_element_by_xpath(
                    '// *[ @ id = "cont_coin_info"] / div[1] / span[1]'
                ).text
                now_price = float(str(now_price).replace(",", ""))
                game.previous_price = now_price
                print("prepare..")
                time.sleep(0.5)

            # 1. Record the current price.
            now_price = driver.find_element_by_xpath(
                '// *[ @ id = "cont_coin_info"] / div[1] / span[1]'
            ).text
            now_price = float(str(now_price).replace(",", ""))
            game.now_price = now_price

            # 2. Total sell volume and total buy volume.
            total_sell = driver.find_element_by_xpath(
                '// *[ @ id = "txt_total_bid"]'
            ).text
            total_buy = driver.find_element_by_xpath(
                '//*[@id="txt_total_ask"]'
            ).text
            total_trade = float(str(total_sell).replace(",", "")) + float(str(total_buy).replace(",", ""))

            selling = [0 for _ in range(10)]
            buying = [0 for _ in range(10)]
            for num in range(1, 11):
                _xpath = '//*[@id="contSellCoin"]/li[' + str(num) + ']/div/p'
                bar = driver.find_element_by_xpath(_xpath).text
                percent = 100 * float(bar) / total_trade
                selling[num - 1] = percent
            for num in range(1, 11):
                _xpath = '//*[@id="contBuyCoin"]/li[' + str(num) + ']/div/p'
                bar = driver.find_element_by_xpath(_xpath).text
                percent = 100 * float(bar) / total_trade
                buying[num - 1] = percent

            # If epsilon is larger than a random draw, pick a random action;
            # otherwise let the DQN pick. Early in training the network has
            # learned little, so random actions dominate at first and fade
            # out over time until they are barely used.
            if np.random.rand() < epsilon:
                action = random.randrange(NUM_ACTION)
            else:
                action = brain.get_action()

            # Start decaying epsilon only after a while,
            # since nothing has been learned at the very beginning.
            if episode > OBSERVE:
                epsilon -= 1 / 1000

            # Run the game with the chosen action and receive the reward
            # and whether the game has ended.
            state, reward, terminal = game.step(action, selling, buying)
            total_reward += reward

            # Store the current state in the brain; the stored states are used
            # for training and for choosing the action in the next state.
            brain.remember(state, action, reward, terminal)
            time.sleep(0.3)

            if time_step > OBSERVE and time_step % TRAIN_INTERVAL == 0:
                # Train the DQN.
                brain.train()

            if time_step % TARGET_UPDATE_INTERVAL == 0:
                # Update the target network.
                brain.update_target_network()

            time_step += 1

        print('Games: %d Score: %d' % (episode + 1, total_reward), "({})".format(game.seq))

        total_reward_list.append(total_reward)

        if terminal:  # game over
            print("game over!")

        if episode % 10 == 0:
            summary = sess.run(summary_merged, feed_dict={rewards: total_reward_list})
            writer.add_summary(summary, time_step)
            total_reward_list = []

        if episode % 100 == 0:
            saver.save(sess, 'model/dqn.ckpt', global_step=time_step)

def test_simulation(data):
    print("Test mode")
    session = tf.Session()
    simulation = Simulation(data)
    network = DQN(session, data)

    saver = tf.train.Saver()
    checkpoint = tf.train.get_checkpoint_state('model')
    saver.restore(session, checkpoint.model_checkpoint_path)

    # Start testing.
    for episode in range(MAX_TEST):
        time = 0
        list_connection = [[] for i in range(data['NUM_AP'])]
        total_reward = 0
        before_reward = 0

        simulation.reset()
        simulation.make_state()
        network.init_state(simulation.state)

        start = timeit.default_timer()

        # Assign UEs to APs in turn.
        for ue in range(data['NUM_UE']):
            action = network.get_action()
            list_connection[action].append(ue)

            fairness, error = simulation.step(ue, action)
            reward = fairness - before_reward
            before_reward = fairness
            total_reward += reward

            if error:
                network.remember(simulation.state, action, reward, True)
            else:
                network.remember(simulation.state, action, reward, (ue == (data['NUM_UE'] - 1)))

            if error:
                break

        time += (timeit.default_timer() - start)

        print()
        print("Fairness:", total_reward)
        print()
        print("== Before adjustment ==")
        for ap in range(data['NUM_AP']):
            print("AP %d Timeslot: %.2f" % (ap, simulation.state[SUM_TIMESLOT][ap]))
            print("Connection:", end=" ")
            for ue in list_connection[ap]:
                print("UE %d(%dkbps)" % (ue, data['LIST_RATE'][int(
                    simulation.info[ue][ap][CONST_REQUEST])]), end=" ")
            print()

            # If the timeslots assigned to the AP exceed the allowed timeslots.
            if simulation.state[SUM_TIMESLOT][ap] > data['VAL_TIMESLOT']:
                start = timeit.default_timer()
                simulation.adjust_bitrate(ap, list_connection[ap])
                time += (timeit.default_timer() - start)
        print()

        total_dqn_psnr = 0
        total_ideal_psnr = 0

        print("== After adjustment ==")
        for ap in range(data['NUM_AP']):
            print("AP %d Timeslot: %.2f" % (ap, simulation.state[SUM_TIMESLOT][ap]))
            print("Connection:", end=" ")
            for ue in list_connection[ap]:
                support_index = int(simulation.info[ue][ap][CONST_SUPPORT])
                support_rate = data['LIST_RATE'][support_index]
                total_dqn_psnr += simulation.get_PSNR(support_rate)

                request_index = int(simulation.info[ue][ap][CONST_REQUEST])
                request_rate = data['LIST_RATE'][request_index]
                total_ideal_psnr += simulation.get_PSNR(request_rate)

                print("UE %d(%dkbps)" % (ue, support_rate), end=" ")
            print()
        print()

        list_dqn_psnr.append(total_dqn_psnr / data['NUM_UE'])
        list_dqn_time.append(time)
        print("%s\tPSNR: %.2f %.4f" % ("DQN".ljust(20), total_dqn_psnr / data['NUM_UE'], time))

        performance, time = simulation.solve_fract()
        print("%s\tPSNR: %.2f %.4f" % ("Fractional".ljust(20), performance / data['NUM_UE'], time))

        performance, time = simulation.solve_random()
        list_random_psnr.append(performance / data['NUM_UE'])
        list_random_time.append(time)
        print("%s\tPSNR: %.2f %.4f" % ("Random".ljust(20), performance / data['NUM_UE'], time))

        performance, time = simulation.solve_greedy()
        list_greedy_psnr.append(performance / data['NUM_UE'])
        list_greedy_time.append(time)
        print("%s\tPSNR: %.2f %.4f" % ("Greedy".ljust(20), performance / data['NUM_UE'], time))

        performance, time = simulation.solve_mthm()
        list_mthm_psnr.append(performance / data['NUM_UE'])
        list_mthm_time.append(time)
        print("%s\tPSNR: %.2f %.4f" % ("Knapsack(MTHM)".ljust(20), performance / data['NUM_UE'], time))

        #"""
        performance, time = simulation.solve_mtm()
        list_mtm_psnr.append(performance / data['NUM_UE'])
        list_mtm_time.append(time)
        print("%s\tPSNR: %.2f %.4f" % ("Knapsack(MTM)".ljust(20), performance / data['NUM_UE'], time))

        performance, time = simulation.solve_bb()
        list_bb_psnr.append(performance / data['NUM_UE'])
        list_bb_time.append(time)
        print("%s\tPSNR: %.2f %.4f" % ("Branch and Bound".ljust(20), performance / data['NUM_UE'], time))
        #"""

        list_ideal_psnr.append(total_ideal_psnr / data['NUM_UE'])
        print("%s\tPSNR: %.2f" % ("Ideal".ljust(20), total_ideal_psnr / data['NUM_UE']))
    # End of testing.

def train():
    print('wake up the brain...')
    sess = tf.Session()
    game = Game(SCREEN_WIDTH, SCREEN_HEIGHT, show_game=False)
    brain = DQN(sess, SCREEN_WIDTH, SCREEN_HEIGHT, NUM_ACTION)

    rewards = tf.placeholder(tf.float32, [None])
    tf.summary.scalar('avg.reward/ep.', tf.reduce_mean(rewards))

    saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())

    writer = tf.summary.FileWriter('logs', sess.graph)
    summary_merged = tf.summary.merge_all()

    brain.update_target_network()

    epsilon = 1.0
    time_step = 0
    total_reward_list = []

    for episode in range(MAX_EPISODE):
        terminal = False
        total_reward = 0

        state = game.reset()
        brain.init_state(state)

        while not terminal:
            if np.random.rand() < epsilon:
                action = random.randrange(NUM_ACTION)
            else:
                action = brain.get_action()

            if episode > OBSERVE:
                epsilon -= 1 / 1000.

            state, reward, terminal = game.step(action)
            total_reward += reward

            brain.remember(state, action, reward, terminal)

            if time_step > OBSERVE and time_step % TRAIN_INTERVAL == 0:
                brain.train()

            if time_step % TARGET_UPDATE_INTERVAL == 0:
                brain.update_target_network()

            time_step += 1

        print('episode: %d, score: %d' % (episode + 1, total_reward))

        total_reward_list.append(total_reward)

        if episode % 10 == 0:
            summary = sess.run(summary_merged, feed_dict={rewards: total_reward_list})
            writer.add_summary(summary, time_step)
            total_reward_list = []

        if episode % 100 == 0:
            saver.save(sess, 'model/dqn.ckpt', global_step=time_step)

def train(IS_IMPORT):
    print('Loading ...')
    sess = tf.Session()

    # Decides when the DQN starts choosing the next action.
    epsilon = 1.0
    # Frame count.
    time_step = 0

    global_step = tf.Variable(0, trainable=False, name='global_step')
    brain = DQN(sess, SCREEN_WIDTH, SCREEN_HEIGHT, NUM_ACTION, global_step)
    # brain = DQN(sess, 61, global_step)

    rewards = tf.placeholder(tf.float32, [None])
    tf.summary.scalar('avg.reward/ep.', tf.reduce_mean(rewards))
    totalScores = tf.placeholder(tf.float32, [None])
    tf.summary.scalar('avg.totalScore/ep.', tf.reduce_mean(totalScores))

    total_reward_list = []
    total_score_list = []

    saver = tf.train.Saver(tf.global_variables())
    ckpt = tf.train.get_checkpoint_state(MODEL_PATH)
    writer = tf.summary.FileWriter(LOG_PATH, sess.graph)
    if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
        saver.restore(sess, ckpt.model_checkpoint_path)
    else:
        sess.run(tf.global_variables_initializer())

    summary_merged = tf.summary.merge_all()

    if IS_IMPORT == True:
        # Raw string so the backslashes in the Windows path are not treated as escapes.
        fs = FileLoad(r'F:\work\cocos\dqnTest\Resources\scenario - Copy.sce')
    else:
        server.accept()

    brain.update_target_network()

    print('global_step:', sess.run(global_step))

    # Start the game.
    for episode in range(MAX_EPISODE):
        terminal = False
        total_reward = 0
        weight = 0

        # Reset the game and fetch the current state,
        # a screen of size screen_width x screen_height.
        # state = game.reset()
        if IS_IMPORT:
            id, _, _, _, state = fs.readState()
            if id == -1:
                sys.exit(1)
        else:
            id, _, _, _, state = server.readStatus()
            if id == -1:
                continue

        state = reshapeFromPacket(state)
        '''
        state.append(state[2])
        state.append(state[2])
        '''
        brain.init_state(state)

        while not terminal:
            actionType = "Action:"
            if IS_IMPORT:
                action = fs.readAction()
                if action == -1:
                    sys.exit(1)
                id, reward, totalScore, terminal, state = fs.readState()
                if id == -1:
                    sys.exit(1)
            else:
                if np.random.rand() < epsilon:
                    action = random.randrange(NUM_ACTION)
                    print("Random action:", action)
                    # action = -1
                    # action = random.uniform(-1, 1)
                else:
                    action = brain.get_action()
                # action = brain.get_action()

                if episode > OBSERVE:
                    epsilon -= 1 / 1000

                server.sendX(id, action)
                if action == -1:
                    id2, action = server.readAction()
                    actionType = "Random Action:"
                    if id != id2:
                        print("Invalid Packet", id, id2)

                id, reward, totalScore, terminal, state = server.readStatus()
                reward = reward + (weight * 0.1)
                weight = weight + 1
                print(time.strftime("%H:%M:%S", time.localtime()), id, actionType, action,
                      "totalScore:", totalScore, "reward:", reward, "terminal", terminal)
                if id == -1:
                    break

            if terminal == True:
                total_score_list.append(totalScore)

            state = reshapeFromPacket(state)
            total_reward += reward

            # Store the current state in the brain; the stored states are used
            # for training and for choosing the action in the next state.
            brain.remember(state, action, reward, terminal)

            if time_step > OBSERVE and time_step % TRAIN_INTERVAL == 0:
                # Train the DQN.
                brain.train()
                '''
                try:
                except:
                    print("Train Error!!")
                    time_step -= 1
                '''

            if time_step % TARGET_UPDATE_INTERVAL == 0:
                # Update the target network.
                brain.update_target_network()

            time_step += 1

        print('\t Count of Play: %d Score: %d' % (episode + 1, total_reward))

        total_reward_list.append(total_reward)

        if (episode) % 10 == 0:
            summary = sess.run(summary_merged,
                               feed_dict={
                                   rewards: total_reward_list,
                                   totalScores: total_score_list
                               })
            writer.add_summary(summary, sess.run(global_step))
            total_reward_list = []
            total_score_list = []

        if (episode + 1) % 100 == 0:
            saver.save(sess, MODEL_PATH + '/dqn.ckpt', global_step=global_step)

    # After training everything, export the model as a tflite file.
    converter = tf.lite.TFLiteConverter.from_session(sess, [brain.input_X], [brain.Q])
    tflite_model = converter.convert()
    open(MODEL_PATH + "/dqn.tflite", "wb").write(tflite_model)

    sys.exit(1)

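# A usage sketch for the dqn.tflite file exported above: loading it with the TFLite
# interpreter and running one inference. It assumes a single input tensor (the stacked
# game state) and a single output (the Q-values); MODEL_PATH comes from the snippet
# above, the rest of the names are illustrative.
import numpy as np
import tensorflow as tf

interpreter = tf.lite.Interpreter(model_path=MODEL_PATH + "/dqn.tflite")
interpreter.allocate_tensors()

input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

# Feed one state of the shape the model was frozen with.
state = np.zeros(input_details[0]['shape'], dtype=np.float32)
interpreter.set_tensor(input_details[0]['index'], state)
interpreter.invoke()

q_values = interpreter.get_tensor(output_details[0]['index'])
action = int(np.argmax(q_values))  # greedy action from the exported network
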
def train():
    print('Training... Waking up the brain cells..')
    sess = tf.Session()

    game = Game(SCREEN_WIDTH, SCREEN_HEIGHT, show_game=False)
    brain = DQN(sess, SCREEN_WIDTH, SCREEN_HEIGHT, NUM_ACTION)

    rewards = tf.placeholder(tf.float32, [None])
    tf.summary.scalar('avg.reward/ep.', tf.reduce_mean(rewards))

    saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())

    writer = tf.summary.FileWriter('logs', sess.graph)
    summary_merged = tf.summary.merge_all()

    # Initialize the target network.
    brain.update_target_network()

    # Decides when the DQN starts choosing the next action.
    epsilon = 1.0
    # Frame count.
    time_step = 0
    total_reward_list = []

    # Start the game.
    for episode in range(MAX_EPISODE):
        terminal = False
        total_reward = 0

        # Reset the game and fetch the current state,
        # a screen of size screen_width x screen_height.
        state = game.reset()
        brain.init_state(state)

        while not terminal:
            # If epsilon is larger than a random draw, pick a random action;
            # otherwise let the DQN pick. Early in training the network has
            # learned little, so random actions dominate at first and
            # gradually fade out until they are barely used.
            if np.random.rand() < epsilon:
                action = random.randrange(NUM_ACTION)
            else:
                action = brain.get_action()

            # Start decaying epsilon only after a while,
            # since nothing has been learned at the very beginning.
            if episode > OBSERVE:
                epsilon -= 1 / 1000

            # Run the game with the chosen action and receive the reward
            # and whether the game has ended.
            state, reward, terminal = game.step(action)
            total_reward += reward

            # Store the current state in the brain; the stored states are used
            # for training and for choosing the action in the next state.
            brain.remember(state, action, reward, terminal)

            if time_step > OBSERVE and time_step % TRAIN_INTERVAL == 0:
                # Train the DQN.
                brain.train()

            if time_step % TARGET_UPDATE_INTERVAL == 0:
                # Update the target network.
                brain.update_target_network()

            time_step += 1

        print('Games: %d Score: %d' % (episode + 1, total_reward))

        total_reward_list.append(total_reward)

        if episode % 10 == 0:
            summary = sess.run(summary_merged, feed_dict={rewards: total_reward_list})
            writer.add_summary(summary, time_step)
            total_reward_list = []

        if episode % 100 == 0:
            saver.save(sess, 'model/dqn.ckpt', global_step=time_step)

def train_simulation(data):
    print("Training mode")
    session = tf.Session()
    simulation = Simulation(data)
    network = DQN(session, data)

    rewards = tf.placeholder(tf.float32, [None])
    tf.summary.scalar('reward average / episode', tf.reduce_mean(rewards))

    saver = tf.train.Saver()
    session.run(tf.global_variables_initializer())

    writer = tf.summary.FileWriter('logs', session.graph)
    summary = tf.summary.merge_all()

    # Initialize the target network.
    network.update_target_network()

    epsilon = 1.0
    time = 0

    # Start training.
    for episode in range(MAX_EPISODE):
        total_reward = 0
        list_reward = []
        before_reward = 0

        simulation.reset()
        simulation.make_state()
        network.init_state(simulation.state)

        # Assign UEs to APs in turn.
        for ue in range(data['NUM_UE']):
            if np.random.rand() < epsilon:
                action = np.random.randint(data['NUM_AP'])
            else:
                action = network.get_action()
            epsilon -= 1 / DELTA_EPSILON

            fairness, error = simulation.step(ue, action)
            # The reward is the change in fairness caused by this assignment.
            reward = fairness - before_reward
            before_reward = fairness
            total_reward += reward

            if error:
                network.remember(simulation.state, action, reward, True)
            else:
                network.remember(simulation.state, action, reward, (ue == (data['NUM_UE'] - 1)))

            if time > THRESH_OBSERVE and (time % INTERVAL_TRAINING == 0):
                network.train()

            if time % INTERVAL_UPDATE == 0:
                network.update_target_network()

            time += 1

            if error:
                break

        list_reward.append(total_reward)
        print(episode, total_reward)

        if episode % 10 == 0:
            result = session.run(summary, feed_dict={rewards: list_reward})
            writer.add_summary(result, time)
            list_reward = []

        if episode % 100 == 0:
            saver.save(session, 'model/dqn.ckpt', global_step=time)

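# Both simulation functions reward the per-step change in a 'fairness' value that
# simulation.step() returns but whose definition is not shown here. A common choice
# for such a metric is Jain's fairness index over per-UE throughputs; the sketch
# below is an illustrative assumption, not the source's confirmed definition.
import numpy as np

def jain_fairness(throughputs):
    """Jain's index: (sum x)^2 / (n * sum x^2); 1.0 when all equal, 1/n when one UE gets everything."""
    x = np.asarray(throughputs, dtype=np.float64)
    if x.size == 0 or np.all(x == 0):
        return 0.0
    return float(np.sum(x) ** 2 / (x.size * np.sum(x ** 2)))

# e.g. jain_fairness([100, 100, 100]) == 1.0, while jain_fairness([300, 0, 0]) ≈ 0.333
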
def train(track, width, height, cont):
    sess = tf.Session()
    game = Game(track, width, height, show_game=False)
    brain = DQN(sess, width, height, CHANNEL, NUM_ACTION)

    rewards = tf.placeholder(tf.float32, [None])
    tf.summary.scalar('avg.reward/ep.', tf.reduce_mean(rewards))

    saver = tf.train.Saver()
    if cont:
        ckpt = tf.train.get_checkpoint_state('model')
        saver.restore(sess, ckpt.model_checkpoint_path)
    else:
        sess.run(tf.global_variables_initializer())

    writer = tf.summary.FileWriter('logs', sess.graph)
    summary_merged = tf.summary.merge_all()

    brain.update_target_network()

    epsilon = 1.0
    time_step = 0
    total_reward_list = []

    if cont:
        OBSERVE = 100
    else:
        OBSERVE = 5000

    for episode in range(MAX_EPISODE):
        terminal = False
        total_reward = 0

        state = game.reset()
        brain.init_state(state)

        if episode > OBSERVE:
            epsilon = 2000 / episode

        while not terminal:
            if np.random.rand() < epsilon:
                action = random.randrange(NUM_ACTION)
            else:
                action = brain.get_action()

            state, reward, terminal = game.step(action)
            total_reward += reward

            brain.remember(state, action, reward, terminal)

            if episode > OBSERVE and time_step % TRAIN_INTERVAL == 0:
                brain.train()

            if episode > OBSERVE and time_step % TARGET_UPDATE_INTERVAL == 0:
                brain.update_target_network()

            time_step += 1

        if episode % 10 == 0:
            print('Games: %d Score: %d' % (episode + 1, total_reward))

        total_reward_list.append(total_reward)

        if episode % 10 == 0:
            summary = sess.run(summary_merged, feed_dict={rewards: total_reward_list})
            writer.add_summary(summary, time_step)
            total_reward_list = []

        if episode > OBSERVE and episode % 10000 == 0:
            saver.save(sess, 'model/dqn.ckpt', global_step=episode)

def train():
    print('Waking up the brain cells..')
    sess = tf.Session()
    game = Game(SCREEN_WIDTH, SCREEN_HEIGHT, show_game=False)
    brain = DQN(sess, SCREEN_WIDTH, SCREEN_HEIGHT, NUM_ACTION)

    rewards = tf.placeholder(tf.float32, [None])
    tf.summary.scalar('avg.reward/ep.', tf.reduce_mean(rewards))

    saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())

    writer = tf.summary.FileWriter('logs', sess.graph)
    summary_merged = tf.summary.merge_all()

    # Initialize the target network.
    brain.update_target_network()

    # Decides when the DQN starts choosing the next action.
    epsilon = 1.0
    # Frame count.
    time_step = 0
    total_reward_list = []

    # Start the game.
    for episode in range(MAX_EPISODE):
        terminal = False
        total_reward = 0

        # Reset the game and fetch the current state,
        # a screen of size screen_width x screen_height.
        state = game.reset()
        brain.init_state(state)

        while not terminal:
            # If epsilon is larger than a random draw, pick a random action;
            # otherwise let the DQN pick. Early in training the network has
            # learned little, so random actions dominate at first and
            # gradually fade out until they are barely used.
            if np.random.rand() < epsilon:
                action = random.randrange(NUM_ACTION)
            else:
                action = brain.get_action()

            # Start decaying epsilon only after a while,
            # since nothing has been learned at the very beginning.
            if episode > OBSERVE:
                epsilon -= 1 / 1000

            # Run the game with the chosen action and receive the reward
            # and whether the game has ended.
            state, reward, terminal = game.step(action)
            total_reward += reward

            # Store the current state in the brain; the stored states are used
            # for training and for choosing the action in the next state.
            brain.remember(state, action, reward, terminal)

            if time_step > OBSERVE and time_step % TRAIN_INTERVAL == 0:
                # Train the DQN.
                brain.train()

            if time_step % TARGET_UPDATE_INTERVAL == 0:
                # Update the target network.
                brain.update_target_network()

            time_step += 1

        print('Games: %d Score: %d' % (episode + 1, total_reward))

        total_reward_list.append(total_reward)

        if episode % 10 == 0:
            summary = sess.run(summary_merged, feed_dict={rewards: total_reward_list})
            writer.add_summary(summary, time_step)
            total_reward_list = []

        if episode % 100 == 0:
            saver.save(sess, 'model/dqn.ckpt', global_step=time_step)

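# All of the snippets above program against the same five-method DQN interface.
# Here is a minimal skeleton that makes the assumed contract explicit; the method
# bodies are placeholders, not the original implementation.
class DQNInterface:
    def init_state(self, state):
        """Seed the frame stack with the first observation of an episode."""
        raise NotImplementedError

    def get_action(self):
        """Return argmax_a Q(current state, a) from the main network."""
        raise NotImplementedError

    def remember(self, state, action, reward, terminal):
        """Append one transition to the replay memory and advance the frame stack."""
        raise NotImplementedError

    def train(self):
        """Sample a minibatch from memory and take one gradient step on the main network."""
        raise NotImplementedError

    def update_target_network(self):
        """Copy the main network's weights into the target network."""
        raise NotImplementedError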