def train(): print("뇌세포 깨우는 중..") sess = tf.Session() game = Game(SCREEN_WIDTH, SCREEN_HEIGHT, show_game=False) brain = DQN(sess, SCREEN_WIDTH, SCREEN_HEIGHT, NUM_ACTION) rewards = tf.placeholder(tf.float32, [None]) tf.summary.scalar('avg.reward/ep.', tf.reduce_mean(rewards)) saver = tf.train.Saver() sess.run(tf.global_variables_initializer()) writer = tf.summary.FileWriter('logs', sess.graph) summary_merged = tf.summary.merge_all brain.update_target_network() epsilon = 1.0 time_step = 0 total_reward_list = [] for episode in range(MAX_EPISODE): terminal = False total_reward = 0 state = game.reset() brain.init_state(state) while not terminal: if np.random.rand() < epsilon: action = random.randrange(NUM_ACTION) else: action = brain.get_action() if episode > OBSERVE: rpdilon -= 1 / 1000 state, reward, terminal = game.step(action) total_reward += reward brain.remember(state, action, reward, terminal) if time_step > OBSERVE and time_step % TRAIN_INTERVAL == 0: brain.train() if time_step % TARGET_UPDATE_INTERVAL == 0: brain.update_target_network() time_step += 1 print('게임횟수 : %d, 점수 : %d' % (episode + 1, total_reward)) total_reward_list.append(total_reward) if episode % 10 == 0: summary = sess.run(summary_merged, feed_dict={rewards: total_reward_list}) writer.add_summary(summary, time_step) total_reward_list = [] if episode % 100 == 0: saver.save(sess, 'model/dqn.ckpt', global_step=time_step)
def train():
    with tf.Session() as sess:
        tf.set_random_seed(GLOBAL_SEED)

        brain = DQN(sess, observation_size, action_size)

        rewards = tf.placeholder(tf.float32, [None])
        tf.summary.scalar('avg.reward/ep.', tf.reduce_mean(rewards))

        saver = tf.train.Saver()
        sess.run(tf.global_variables_initializer())

        writer = tf.summary.FileWriter('logs', sess.graph)
        summary_merged = tf.summary.merge_all()

        brain.update_target_network()

        time_step = 0
        total_reward_list = []

        for episode in range(MAX_EPISODE):
            done = False
            total_reward = 0
            epsilon = 1. / ((episode / 10) + 1)

            observation = env.reset()
            brain.init_state(observation)

            while not done:
                if np.random.rand() < epsilon:
                    action = random.randrange(action_size)
                else:
                    action = brain.get_action()

                observation, reward, done, _ = env.step(action)
                # print(observation, reward, done)
                total_reward += reward

                brain.remember(observation, action, reward, done)

                if time_step > 0:
                    if time_step % TRAIN_INTERVAL_FRAMES == 0:
                        _, loss = brain.train()
                    if time_step % TARGET_UPDATE_INTERVAL == 0:
                        brain.update_target_network()

                time_step += 1

            print('episode: %d total_reward: %d' % (episode, total_reward))

            total_reward_list.append(total_reward)

            if episode % 10 == 0:
                summary = sess.run(summary_merged, feed_dict={rewards: total_reward_list})
                writer.add_summary(summary, time_step)
                total_reward_list = []

            if episode % 100 == 0:
                saver.save(sess, 'model/dqn.ckpt', global_step=time_step)

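# Unlike the linear `epsilon -= 1 / 1000` decay used by most loops here, the
# snippet above recomputes epsilon each episode with a hyperbolic schedule.
# For reference (illustrative printout only, assuming Python 3 division):
for episode in (0, 10, 20, 50, 100):
    print(episode, 1. / ((episode / 10) + 1))  # 1.0, 0.5, 0.33, 0.17, 0.09
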
def train():
    print('Waking up the brain cells..')

    sess = tf.Session()

    game = Game(SCREEN_WIDTH, SCREEN_HEIGHT)
    brain = DQN(sess, SCREEN_WIDTH, SCREEN_HEIGHT, NUM_ACTION)

    rewards = tf.placeholder(tf.float32, [None])
    tf.summary.scalar('avg.reward/ep.', tf.reduce_mean(rewards))

    sess.run(tf.global_variables_initializer())

    # Initialize the target network.
    brain.update_target_network()

    time_step = 0
    epsilon = 1.0

    for episode in range(MAX_EPISODE):
        # Start a game.
        terminal = False

        # Reset the game and fetch the current state.
        # The state is the screen layout of size screen_width x screen_height.
        _, state, _, _ = game.first_step()
        brain.init_state(state)

        while not terminal:
            # Fetch the game's step record.
            action, state, reward, terminal = game.step()

            # Store the current state in the Brain.
            # Learn from the remembered states and decide the next action.
            brain.remember(state, action, reward, terminal)

            if (time_step > OBSERVE) and (time_step % TRAIN_INTERVAL) == 0:
                brain.train()

            # Update the target network.
            # if (time_step % TARGET_UPDATE_INTERVAL) == 0:
            #     brain.update_target_network()

            time_step += 1

        # if episode % 50 == 0:
        print(episode)
        save_model(sess)

def train_rl(images, targets, folds, stochastic=False, test=False, base_rand=False):
    print('start train rl')
    #print(images.shape)
    #(X_train, y_train), (X_val, y_val), (X_test, y_test) = reformatInput_rl(images, targets, fold)
    #X_train = X_train.astype("float32", casting='unsafe')
    #X_val = X_val.astype("float32", casting='unsafe')
    #X_test = X_test.astype("float32", casting='unsafe')
    #print('check')
    #print(X_train.shape)

    with tf.Session() as sess:
        #config = get_config(FLAGS) or FLAGS
        model = DQN(sess, SCREEN_WIDTH, SCREEN_HEIGHT, n_act)

        rewards = tf.placeholder(tf.float32, [None])
        tf.summary.scalar('avg.reward/ep.', tf.reduce_mean(rewards))

        saver = tf.train.Saver()
        sess.run(tf.global_variables_initializer())

        writer = tf.summary.FileWriter('logs', sess.graph)
        summary_merged = tf.summary.merge_all()

        print('total %s folds' % len(folds))

        #(X_train, y_train), (X_val, y_val), (X_test, y_test) = reformatInput_rl(images, targets, fold)
        # X_train = X_train.astype("float32", casting='unsafe')
        # X_val = X_val.astype("float32", casting='unsafe')
        # X_test = X_test.astype("float32", casting='unsafe')

        # init target network
        model.update_target_network()

        # get next action from DQN
        epsilon = 1.0
        # frame number N
        t_step = 0
        tot_reward_list = []

        MAX_EPISODE = 10000
        n_img = len(targets)
        n_epi = n_img
        if stochastic:
            n_epi = MAX_EPISODE

        # call pred & loss
        n_test = 3
        if test:  # for debugging
            pred_all, loss_all = predict_all(images[0:n_test, :], targets[0:n_test, :])
            if not stochastic:
                n_epi = n_test
        else:
            pred_all, loss_all = predict_all(images, targets)
            #pred_all_train, loss_all_train = predict_all(X_train, y_train)
        #print(pred_all)

        # run simulation
        pred_rl = []
        for epi in range(n_epi):
            terminal = False
            tot_reward = 0

            # init game & get current state
            # state parsing
            state = np.expand_dims(images[epi], 0)
            #state = np.expand_dims(X_train[epi], 0)
            model.init_state(state)

            if np.random.rand() < epsilon:
                act = random.randrange(n_act)
            else:
                act = model.get_action()
            if epi > OBSERVE:
                epsilon -= 1 / 100
            if base_rand:
                act = random.randrange(n_act)

            # stochastic define
            if stochastic:
                ii = random.randrange(n_img)
                state = np.expand_dims(images[ii], 0)
                #state = np.expand_dims(X_train[ii], 0)
                state_i = ii
            else:
                state = np.expand_dims(images[epi], 0)
                #state = np.expand_dims(X_train[epi], 0)
                state_i = epi

            # get model str by act
            chosen_model = model_list[act]

            # reward function
            if pred_all[chosen_model][state_i] == 1:
                reward = 1
                pred_rl.append(1)
            else:
                reward = -2
                pred_rl.append(0)

            tot_reward += reward

            model.remember(state, act, reward, terminal)

            if t_step > OBSERVE and t_step % TRAIN_INTERVAL == 0:
                # DQN train
                model.train()

            if t_step % TARGET_UPDATE_INTERVAL == 0:
                # target update
                model.update_target_network()

            t_step += 1

            print('epi: %d score: %d' % ((epi + 1), tot_reward))

            tot_reward_list.append(tot_reward)

            if epi % 10 == 0:
                summary = sess.run(summary_merged, feed_dict={rewards: tot_reward_list})
                writer.add_summary(summary, t_step)
                tot_reward_list = []

            if epi % 100 == 0:
                saver.save(sess, 'model/dqn.ckpt', global_step=t_step)

        return tot_reward_list, pred_rl, pred_all

def train():
    print('Waking up the brain cells..')

    sess = tf.Session()

    game = Game(SCREEN_WIDTH, SCREEN_HEIGHT, show_game=False)
    # Set NUM_ACTION, the number of final outputs: the number of actions to choose from
    brain = DQN(sess, SCREEN_WIDTH, SCREEN_HEIGHT, NUM_ACTION)

    # Save and inspect training results:
    # store and check the score earned in each game
    rewards = tf.placeholder(tf.float32, [None])
    tf.summary.scalar('avg.reward/ep.', tf.reduce_mean(rewards))

    # Saving to file
    saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())

    writer = tf.summary.FileWriter('logs', sess.graph)
    summary_merged = tf.summary.merge_all()

    # Initialize the target network
    brain.update_target_network()

    # Decide when to start using the DQN to choose actions:
    # act randomly until some time has passed, shrinking epsilon as play goes on
    epsilon = 1.0
    # Number of frames elapsed, used to pace training
    time_step = 0
    # List of scores for checking training progress
    total_reward_list = []

    # Start training
    for episode in range(MAX_EPISODE):
        terminal = False   # game-over flag
        total_reward = 0   # total score earned in this game

        state = game.reset()      # reset the game
        brain.init_state(state)   # initialize the DQN with the reset state

        # Play until the green square collides with another square
        while not terminal:
            # Act randomly early in training (before 100 games)
            if np.random.rand() < epsilon:
                action = random.randrange(NUM_ACTION)
            else:
                action = brain.get_action()

            # After 100 games, gradually reduce the share of random actions
            if episode > OBSERVE:
                epsilon -= 1 / 1000

            # Receive the game state, reward, and terminal flag
            state, reward, terminal = game.step(action)
            total_reward += reward

            # Store the current state in the network object;
            # the remembered data is used to train the network
            brain.remember(state, action, reward, terminal)

            # After 100 frames, train once every 4 frames
            if time_step > OBSERVE and time_step % TRAIN_INTERVAL == 0:
                brain.train()

            # Refresh the target network every 1000 frames
            if time_step % TARGET_UPDATE_INTERVAL == 0:
                brain.update_target_network()

            time_step += 1

        # When the game ends, print and store the score earned
        print('Games: %d Score: %d' % (episode + 1, total_reward))
        total_reward_list.append(total_reward)

        # Every 10 episodes, log the scores; every 100, save the model
        if episode % 10 == 0:
            summary = sess.run(summary_merged, feed_dict={rewards: total_reward_list})
            writer.add_summary(summary, time_step)
            total_reward_list = []

        if episode % 100 == 0:
            saver.save(sess, 'model/dqn.ckpt', global_step=time_step)

def train():
    print('wake up the brain...')

    sess = tf.Session()

    game = Game(SCREEN_WIDTH, SCREEN_HEIGHT, show_game=False)
    brain = DQN(sess, SCREEN_WIDTH, SCREEN_HEIGHT, NUM_ACTION)

    rewards = tf.placeholder(tf.float32, [None])
    tf.summary.scalar('avg.reward/ep.', tf.reduce_mean(rewards))

    saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())

    writer = tf.summary.FileWriter('logs', sess.graph)
    summary_merged = tf.summary.merge_all()

    brain.update_target_network()

    epsilon = 1.0
    time_step = 0
    total_reward_list = []

    for episode in range(MAX_EPISODE):
        terminal = False
        total_reward = 0

        state = game.reset()
        brain.init_state(state)

        while not terminal:
            if np.random.rand() < epsilon:
                action = random.randrange(NUM_ACTION)
            else:
                action = brain.get_action()

            if episode > OBSERVE:
                epsilon -= 1 / 1000.

            state, reward, terminal = game.step(action)
            total_reward += reward

            brain.remember(state, action, reward, terminal)

            if time_step > OBSERVE and time_step % TRAIN_INTERVAL == 0:
                brain.train()

            if time_step % TARGET_UPDATE_INTERVAL == 0:
                brain.update_target_network()

            time_step += 1

        print('episode: %d, score: %d' % (episode + 1, total_reward))

        total_reward_list.append(total_reward)

        if episode % 10 == 0:
            summary = sess.run(summary_merged, feed_dict={rewards: total_reward_list})
            writer.add_summary(summary, time_step)
            total_reward_list = []

        if episode % 100 == 0:
            saver.save(sess, 'model/dqn.ckpt', global_step=time_step)

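# Caveat on the linear schedule above: nothing clamps it, so epsilon drifts
# below zero about a thousand episodes after OBSERVE (harmless in the random
# draw, but it means exploration stops entirely). A common guard is a floor
# value; FINAL_EPSILON below is a hypothetical constant, not original code:
FINAL_EPSILON = 0.01
epsilon = 1.0
for _ in range(5000):
    epsilon = max(FINAL_EPSILON, epsilon - 1 / 1000.)
assert epsilon == FINAL_EPSILON  # decay bottoms out at the floor
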
def train(IS_IMPORT):
    print('Loading ...')
    sess = tf.Session()

    # Decides when to start using the DQN to pick the next action.
    epsilon = 1.0
    # Frame count
    time_step = 0

    global_step = tf.Variable(0, trainable=False, name='global_step')

    brain = DQN(sess, SCREEN_WIDTH, SCREEN_HEIGHT, NUM_ACTION, global_step)
    #brain = DQN(sess, 61, global_step)

    rewards = tf.placeholder(tf.float32, [None])
    tf.summary.scalar('avg.reward/ep.', tf.reduce_mean(rewards))
    totalScores = tf.placeholder(tf.float32, [None])
    tf.summary.scalar('avg.totalScore/ep.', tf.reduce_mean(totalScores))

    total_reward_list = []
    total_score_list = []

    saver = tf.train.Saver(tf.global_variables())
    ckpt = tf.train.get_checkpoint_state(MODEL_PATH)
    writer = tf.summary.FileWriter(LOG_PATH, sess.graph)

    if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
        saver.restore(sess, ckpt.model_checkpoint_path)
    else:
        sess.run(tf.global_variables_initializer())

    summary_merged = tf.summary.merge_all()

    if IS_IMPORT:
        fs = FileLoad(r'F:\work\cocos\dqnTest\Resources\scenario - Copy.sce')
    else:
        server.accept()

    brain.update_target_network()

    print('global_step:', sess.run(global_step))

    # Start the game.
    for episode in range(MAX_EPISODE):
        terminal = False
        total_reward = 0
        weight = 0

        # Reset the game and fetch the current state.
        # The state is the screen layout of size screen_width x screen_height.
        #state = game.reset()
        if IS_IMPORT:
            id, _, _, _, state = fs.readState()
            if id == -1:
                sys.exit(1)
        else:
            id, _, _, _, state = server.readStatus()
            if id == -1:
                continue

        state = reshapeFromPacket(state)
        '''
        state.append(state[2])
        state.append(state[2])
        '''
        brain.init_state(state)

        while not terminal:
            actionType = "Action:"

            if IS_IMPORT:
                action = fs.readAction()
                if action == -1:
                    sys.exit(1)
                id, reward, totalScore, terminal, state = fs.readState()
                if id == -1:
                    sys.exit(1)
            else:
                if np.random.rand() < epsilon:
                    action = random.randrange(NUM_ACTION)
                    print("Random action:", action)
                    #action = -1
                    #action = random.uniform(-1, 1)
                else:
                    action = brain.get_action()
                #action = brain.get_action()

                if episode > OBSERVE:
                    epsilon -= 1 / 1000

                server.sendX(id, action)

                if action == -1:
                    id2, action = server.readAction()
                    actionType = "Random Action:"
                    if id != id2:
                        print("Invalid Packet", id, id2)

                id, reward, totalScore, terminal, state = server.readStatus()
                reward = reward + (weight * 0.1)
                weight = weight + 1
                print(time.strftime("%H:%M:%S", time.localtime()), id, actionType, action,
                      "totalScore:", totalScore, "reward:", reward, "terminal", terminal)

            if id == -1:
                break

            if terminal:
                total_score_list.append(totalScore)

            state = reshapeFromPacket(state)
            total_reward += reward

            # Store the current state in the Brain.
            # Learn from the remembered states and decide the next action.
            brain.remember(state, action, reward, terminal)

            if time_step > OBSERVE and time_step % TRAIN_INTERVAL == 0:
                # Train the DQN.
                brain.train()
                '''
                try:
                except:
                    print("Train Error!!")
                    time_step -= 1
                '''

            if time_step % TARGET_UPDATE_INTERVAL == 0:
                # Update the target network.
                brain.update_target_network()

            time_step += 1

        print('\t Count of Play: %d Score: %d' % (episode + 1, total_reward))

        total_reward_list.append(total_reward)

        if episode % 10 == 0:
            summary = sess.run(summary_merged,
                               feed_dict={rewards: total_reward_list,
                                          totalScores: total_score_list})
            writer.add_summary(summary, sess.run(global_step))
            total_reward_list = []
            total_score_list = []

        if (episode + 1) % 100 == 0:
            saver.save(sess, MODEL_PATH + '/dqn.ckpt', global_step=global_step)

    # After all training, save the model as a tflite file
    converter = tf.lite.TFLiteConverter.from_session(sess, [brain.input_X], [brain.Q])
    tflite_model = converter.convert()
    open(MODEL_PATH + "/dqn.tflite", "wb").write(tflite_model)
    sys.exit(1)

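# For completeness: the exported dqn.tflite can be loaded back with the TF Lite
# interpreter. A minimal sketch under the same MODEL_PATH assumption as above;
# the zeroed input is a placeholder for a real state tensor:
import numpy as np
import tensorflow as tf

interpreter = tf.lite.Interpreter(model_path=MODEL_PATH + "/dqn.tflite")
interpreter.allocate_tensors()
inp = interpreter.get_input_details()[0]
out = interpreter.get_output_details()[0]
interpreter.set_tensor(inp['index'], np.zeros(inp['shape'], dtype=np.float32))
interpreter.invoke()
q_values = interpreter.get_tensor(out['index'])  # one Q-value per action
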
def train():
    print('Training... waking up the brain cells..')

    sess = tf.Session()

    game = Game(SCREEN_WIDTH, SCREEN_HEIGHT, show_game=False)
    brain = DQN(sess, SCREEN_WIDTH, SCREEN_HEIGHT, NUM_ACTION)

    rewards = tf.placeholder(tf.float32, [None])
    tf.summary.scalar('avg.reward/ep.', tf.reduce_mean(rewards))

    saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())

    writer = tf.summary.FileWriter('logs', sess.graph)
    summary_merged = tf.summary.merge_all()

    # Initialize the target network.
    brain.update_target_network()

    # Decides when to start using the DQN to pick the next action.
    epsilon = 1.0
    # Frame count
    time_step = 0
    total_reward_list = []

    # Start the game.
    for episode in range(MAX_EPISODE):
        terminal = False
        total_reward = 0

        # Reset the game and fetch the current state.
        # The state is the screen layout of size screen_width x screen_height.
        state = game.reset()
        brain.init_state(state)

        while not terminal:
            # If a random draw is below epsilon, pick a random action;
            # otherwise pick the action with the DQN.
            # Early on the network has seen little training, so actions are
            # almost always random; the random share shrinks over time until
            # it is rarely used.
            if np.random.rand() < epsilon:
                action = random.randrange(NUM_ACTION)
            else:
                action = brain.get_action()

            # Start decaying epsilon only after some time has passed,
            # since there is no training at all at the beginning.
            if episode > OBSERVE:
                epsilon -= 1 / 1000

            # Advance the game with the chosen action, and receive the reward
            # and whether the game has ended.
            state, reward, terminal = game.step(action)
            total_reward += reward

            # Store the current state in the Brain.
            # Learn from the remembered states and decide the next action.
            brain.remember(state, action, reward, terminal)

            if time_step > OBSERVE and time_step % TRAIN_INTERVAL == 0:
                # Train the DQN.
                brain.train()

            if time_step % TARGET_UPDATE_INTERVAL == 0:
                # Update the target network.
                brain.update_target_network()

            time_step += 1

        print('Games: %d Score: %d' % (episode + 1, total_reward))

        total_reward_list.append(total_reward)

        if episode % 10 == 0:
            summary = sess.run(summary_merged, feed_dict={rewards: total_reward_list})
            writer.add_summary(summary, time_step)
            total_reward_list = []

        if episode % 100 == 0:
            saver.save(sess, 'model/dqn.ckpt', global_step=time_step)

def train(track, width, height, cont):
    sess = tf.Session()

    game = Game(track, width, height, show_game=False)
    brain = DQN(sess, width, height, CHANNEL, NUM_ACTION)

    rewards = tf.placeholder(tf.float32, [None])
    tf.summary.scalar('avg.reward/ep.', tf.reduce_mean(rewards))

    saver = tf.train.Saver()
    if cont:
        ckpt = tf.train.get_checkpoint_state('model')
        saver.restore(sess, ckpt.model_checkpoint_path)
    else:
        sess.run(tf.global_variables_initializer())

    writer = tf.summary.FileWriter('logs', sess.graph)
    summary_merged = tf.summary.merge_all()

    brain.update_target_network()

    epsilon = 1.0
    time_step = 0
    total_reward_list = []

    if cont:
        OBSERVE = 100
    else:
        OBSERVE = 5000

    for episode in range(MAX_EPISODE):
        terminal = False
        total_reward = 0

        state = game.reset()
        brain.init_state(state)

        if episode > OBSERVE:
            epsilon = 2000 / episode

        while not terminal:
            if np.random.rand() < epsilon:
                action = random.randrange(NUM_ACTION)
            else:
                action = brain.get_action()

            state, reward, terminal = game.step(action)
            total_reward += reward

            brain.remember(state, action, reward, terminal)

            if episode > OBSERVE and time_step % TRAIN_INTERVAL == 0:
                brain.train()

            if episode > OBSERVE and time_step % TARGET_UPDATE_INTERVAL == 0:
                brain.update_target_network()

            time_step += 1

        if episode % 10 == 0:
            print('Games: %d Score: %d' % (episode + 1, total_reward))

        total_reward_list.append(total_reward)

        if episode % 10 == 0:
            summary = sess.run(summary_merged, feed_dict={rewards: total_reward_list})
            writer.add_summary(summary, time_step)
            total_reward_list = []

        if episode > OBSERVE and episode % 10000 == 0:
            saver.save(sess, 'model/dqn.ckpt', global_step=episode)

def train_simulation(data):
    print("Training mode")

    session = tf.Session()

    simulation = Simulation(data)
    network = DQN(session, data)

    rewards = tf.placeholder(tf.float32, [None])
    tf.summary.scalar('reward average / episode', tf.reduce_mean(rewards))

    saver = tf.train.Saver()
    session.run(tf.global_variables_initializer())

    writer = tf.summary.FileWriter('logs', session.graph)
    summary = tf.summary.merge_all()

    # Initialize the target network
    network.update_target_network()

    epsilon = 1.0
    time = 0
    list_reward = []

    # Start training
    for episode in range(MAX_EPISODE):
        total_reward = 0
        before_reward = 0

        simulation.reset()
        simulation.make_state()
        network.init_state(simulation.state)

        # Assign UEs to APs in turn
        for ue in range(data['NUM_UE']):
            if np.random.rand() < epsilon:
                action = np.random.randint(data['NUM_AP'])
            else:
                action = network.get_action()

            epsilon -= 1 / DELTA_EPSILON

            fairness, error = simulation.step(ue, action)
            reward = fairness - before_reward
            before_reward = fairness
            total_reward += reward

            if error:
                network.remember(simulation.state, action, reward, True)
            else:
                network.remember(simulation.state, action, reward,
                                 (ue == (data['NUM_UE'] - 1)))

            if time > THRESH_OBSERVE and (time % INTERVAL_TRAINING == 0):
                network.train()

            if time % INTERVAL_UPDATE == 0:
                network.update_target_network()

            time += 1

            if error:
                break

        list_reward.append(total_reward)
        print(episode, total_reward)

        if episode % 10 == 0:
            result = session.run(summary, feed_dict={rewards: list_reward})
            writer.add_summary(result, time)
            list_reward = []

        if episode % 100 == 0:
            saver.save(session, 'model/dqn.ckpt', global_step=time)

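# The reward above is the change in fairness between consecutive assignments,
# so per-episode rewards telescope: the episode return equals the final
# fairness minus the starting value. A standalone illustration with made-up
# numbers (not taken from the simulation):
fairness_trace = [0.0, 0.4, 0.55, 0.7]  # fairness after each UE assignment
step_rewards = [b - a for a, b in zip(fairness_trace, fairness_trace[1:])]
assert abs(sum(step_rewards) - (fairness_trace[-1] - fairness_trace[0])) < 1e-9
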
class Agent:
    def __init__(self, n_action, is_render=True, is_load=False):
        self.sess = tf.Session()
        self.batch_size = 32
        self.model = DQN(self.sess, n_action, self.batch_size)
        self.model_name = "DQN"
        self.env = wrappers.wrap_dqn(gym.make("BreakoutDeterministic-v4"))
        self.is_render = is_render
        self.EPISODE = 600

        # epsilon parameters
        self.epsilon_s = 1.0
        self.epsilon_e = 0.1
        self.epsilon_decay = 100000
        self.epsilon = self.epsilon_s

        # train parameters
        self.train_start = 5000
        self.update_target_rate = 5000
        self.n_action = n_action
        self.loss = 0

        # info
        self.total_q_max, self.total_loss = 0., 0.

        # save parameters
        self.save_episode_rate = 5

        # load parameters
        self.is_load = is_load
        # saved_model = "./save/{}/{}_episode20.ckpt-{}".format("20180613-132735", self.model_name, "3741")
        self.saved_model = tf.train.latest_checkpoint("./save/20180614-180138")

    def preprocessing(self, img):
        '''
        args:
            img : (210 x 160 x 3)
        return:
            img : (1 x 84 x 84 x 1)
        '''
        # RGB to gray
        img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        # resize
        img = cv2.resize(img, (84, 84))
        # normalization
        img = (img - 127.5) / 127.5
        img = np.expand_dims(img, axis=0)
        img = np.expand_dims(img, axis=3)
        return img

    def get_action(self, state, is_play=False):
        if is_play:
            self.epsilon = self.play_epsilon

        if np.random.rand() < self.epsilon:
            action = self.env.action_space.sample()
        else:
            q_value = self.sess.run(self.model.main_q_value,
                                    feed_dict={self.model.input_M_Q: state})
            action = np.argmax(q_value, 1)[0]

        # decay epsilon
        if not is_play:
            self.epsilon -= (self.epsilon_s - self.epsilon_e) / self.epsilon_decay

        return action

    def setup_summary(self):
        episode_total_reward = tf.Variable(0.)
        episode_avg_max_q = tf.Variable(0.)
        episode_duration = tf.Variable(0.)
        episode_avg_loss = tf.Variable(0.)

        tf.summary.scalar('Total Reward/Episode', episode_total_reward)
        tf.summary.scalar('Average Max Q/Episode', episode_avg_max_q)
        tf.summary.scalar('Duration/Episode', episode_duration)
        tf.summary.scalar('Average Loss/Episode', episode_avg_loss)

        summary_vars = [episode_total_reward, episode_avg_max_q,
                        episode_duration, episode_avg_loss]
        summary_placeholders = [tf.placeholder(tf.float32)
                                for _ in range(len(summary_vars))]
        update_ops = [summary_vars[i].assign(summary_placeholders[i])
                      for i in range(len(summary_vars))]
        summary_op = tf.summary.merge_all()
        return summary_placeholders, update_ops, summary_op

    def train(self):
        # tensorboard
        self.summary_placeholders, self.update_ops, self.summary_op = self.setup_summary()
        self.summary_writer = tf.summary.FileWriter(
            'graphs/{}/{}'.format(self.model_name, NOWTIME), self.sess.graph)

        saver = tf.train.Saver()
        self.sess.run(tf.global_variables_initializer())

        if self.is_load:
            print(self.saved_model)
            saver.restore(self.sess, self.saved_model)

        print("Train Start...")
        global_step = 0
        for e in range(self.EPISODE):
            obs = self.env.reset()
            #obs = self.preprocessing(obs)
            '''
            print(self.env.action_space.n)
            print(self.env.unwrapped.get_action_meanings())
            print(np.shape(obs))
            '''
            obs = np.reshape(obs, [1, 84, 84, 1])
            state = np.concatenate((obs, obs, obs, obs), axis=3)

            is_terminal = False
            step = 0
            total_reward = 0
            s_t = time.time()

            while not is_terminal:
                global_step += 1
                step += 1

                action = self.get_action(state)
                observation, reward, is_terminal, info = self.env.step(action)
                if self.is_render:
                    self.env.render()

                observation = np.reshape(observation, [1, 84, 84, 1])
                next_state = np.append(observation, state[:, :, :, :3], axis=3)

                transition = [state, action, reward, next_state, is_terminal]
                self.model.replay_buffer.add_sample(transition)

                total_reward += reward
                self.total_q_max += np.max(
                    self.sess.run(self.model.main_q_value,
                                  feed_dict={self.model.input_M_Q: state}))

                state = next_state

                if self.model.replay_buffer.get_size() > self.train_start:
                    self.loss = self.model.train()
                    self.total_loss += self.loss

                if global_step % self.update_target_rate == 0:
                    self.model.update_target_network()

                if global_step % 20 == 0:
                    print("Episode: {} global_step: {} step: {} loss: {:.4f} reward: {} time: {}"
                          .format(e + 1, global_step, step, self.loss, total_reward,
                                  time.time() - s_t))

                if is_terminal:
                    # write tensorboard
                    if self.model.replay_buffer.get_size() > self.train_start:
                        avg_q_max = self.total_q_max / float(step)
                        avg_loss = self.total_loss / float(step)
                        stats = [total_reward, avg_q_max, step, avg_loss]
                        for i in range(len(stats)):
                            self.sess.run(self.update_ops[i], feed_dict={
                                self.summary_placeholders[i]: float(stats[i])})
                        summary_str = self.sess.run(self.summary_op)
                        self.summary_writer.add_summary(summary_str, e + 1)

                    print("Episode: {} global_step: {} step: {} loss: {:.4f} reward: {} time: {}"
                          .format(e + 1, global_step, step, self.loss, total_reward,
                                  time.time() - s_t))
                    self.total_loss, self.total_q_max = 0, 0

            if e % self.save_episode_rate == 0:
                saver.save(self.sess,
                           "./save/{0}/{1}_episode{2}.ckpt".format(NOWTIME, self.model_name, e),
                           global_step=global_step)

    def play(self):
        self.play_epsilon = 0.1
        saver = tf.train.Saver()
        self.sess.run(tf.global_variables_initializer())
        print(self.saved_model)
        saver.restore(self.sess, self.saved_model)

        print("Play Start...")
        for e in range(1):
            obs = self.env.reset()
            obs = np.reshape(obs, [1, 84, 84, 1])
            #obs = self.preprocessing(obs)
            self.env.render()
            state = np.concatenate((obs, obs, obs, obs), axis=3)

            is_terminal = False
            step = 0
            total_reward = 0

            while not is_terminal:
                step += 1
                action = self.get_action(state, is_play=True)
                print("action: {}".format(action))
                observation, reward, is_terminal, info = self.env.step(action)
                self.env.render()

                observation = np.reshape(observation, [1, 84, 84, 1])
                next_state = np.append(observation, state[:, :, :, :3], axis=3)

                total_reward += reward
                state = next_state

            print("step: {} total_reward: {}".format(step, total_reward))

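# Minimal usage sketch for the Agent above. BreakoutDeterministic-v4 exposes
# four actions; the flag values mirror the constructor defaults:
if __name__ == "__main__":
    agent = Agent(n_action=4, is_render=False, is_load=False)
    agent.train()  # or agent.play() once a checkpoint exists
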
class Agent: """Our Wasted Agent :P """ def __init__(self, sess, config, environment, evaluation_enviroment): # Get the session, config, environment, and create a replaymemory self.sess = sess self.config = config self.environment = environment self.evaluation_enviroment = evaluation_enviroment if config.prm: self.memory = PrioritizedExperienceReplay(sess, config) else: self.memory = ReplayMemory(config.state_shape, config.rep_max_size) self.init_dirs() self.init_cur_epsiode() self.init_global_step() self.init_epsilon() self.init_summaries() # Intialize the DQN graph which contain 2 Networks Target and Q self.estimator = DQN(sess, config, self.environment.n_actions) # To initialize all variables self.init = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer()) self.sess.run(self.init) self.saver = tf.train.Saver(max_to_keep=10) self.summary_writer = tf.summary.FileWriter(self.summary_dir, self.sess.graph) if config.is_train and not config.cont_training: pass elif config.is_train and config.cont_training: self.load() elif config.is_play: self.load() else: raise Exception("Please Set proper mode for training or playing") def load(self): latest_checkpoint = tf.train.latest_checkpoint(self.checkpoint_dir) if latest_checkpoint: print("Loading model checkpoint {}...\n".format(latest_checkpoint)) self.saver.restore(self.sess, latest_checkpoint) def save(self): self.saver.save(self.sess, self.checkpoint_dir, self.global_step_tensor) def init_dirs(self): # Create directories for checkpoints and summaries self.checkpoint_dir = os.path.join(self.config.experiment_dir, "checkpoints/") self.summary_dir = os.path.join(self.config.experiment_dir, "summaries/") def init_cur_epsiode(self): """Create cur episode tensor to totally save the process of the training""" with tf.variable_scope('cur_episode'): self.cur_episode_tensor = tf.Variable(-1, trainable=False, name='cur_epsiode') self.cur_epsiode_input = tf.placeholder('int32', None, name='cur_episode_input') self.cur_episode_assign_op = self.cur_episode_tensor.assign( self.cur_epsiode_input) def init_global_step(self): """Create a global step variable to be a reference to the number of iterations""" with tf.variable_scope('step'): self.global_step_tensor = tf.Variable(0, trainable=False, name='global_step') self.global_step_input = tf.placeholder('int32', None, name='global_step_input') self.global_step_assign_op = self.global_step_tensor.assign( self.global_step_input) def init_epsilon(self): """Create an epsilon variable""" with tf.variable_scope('epsilon'): self.epsilon_tensor = tf.Variable(self.config.initial_epsilon, trainable=False, name='epsilon') self.epsilon_input = tf.placeholder('float32', None, name='epsilon_input') self.epsilon_assign_op = self.epsilon_tensor.assign( self.epsilon_input) def init_summaries(self): """Create the summary part of the graph""" with tf.variable_scope('summary'): self.summary_placeholders = {} self.summary_ops = {} self.scalar_summary_tags = [ 'episode.total_reward', 'episode.length', 'evaluation.total_reward', 'evaluation.length', 'epsilon' ] for tag in self.scalar_summary_tags: self.summary_placeholders[tag] = tf.placeholder('float32', None, name=tag) self.summary_ops[tag] = tf.summary.scalar( tag, self.summary_placeholders[tag]) def init_replay_memory(self): # Populate the replay memory with initial experience print("initializing replay memory...") state = self.environment.reset() for i in itertools.count(): action = self.take_action(state) next_state, reward, done = self.observe_and_save( 
state, self.environment.valid_actions[action]) if done: if self.config.prm: if i >= self.config.prm_init_size: break else: if i >= self.config.replay_memory_init_size: break state = self.environment.reset() else: state = next_state print("finished initializing replay memory") def policy_fn(self, fn_type, estimator, n_actions): """Function that contain definitions to various number of policy functions and choose between them""" def epsilon_greedy(sess, observation, epsilon): actions = np.ones(n_actions, dtype=float) * epsilon / n_actions q_values = estimator.predict(np.expand_dims(observation, 0))[0] best_action = np.argmax(q_values) actions[best_action] += (1.0 - epsilon) return actions def greedy(sess, observation): q_values = estimator.predict(np.expand_dims(observation, 0), type="target")[0] best_action = np.argmax(q_values) return best_action if fn_type == 'epsilon_greedy': return epsilon_greedy elif fn_type == 'greedy': return greedy else: raise Exception("Please Select a proper policy function") def take_action(self, state): """Take the action based on the policy function""" action_probs = self.policy(self.sess, state, self.epsilon_tensor.eval(self.sess)) action = np.random.choice(np.arange(len(action_probs)), p=action_probs) return action def observe_and_save(self, state, action): """Function that observe the new state , reward and save it in the memory""" next_state, reward, done = self.environment.step(action) self.memory.push(state, next_state, action, reward, done) return next_state, reward, done def update_target_network(self): """Update Target network By copying paramter between the two networks in DQN""" self.estimator.update_target_network() def add_summary(self, summaries_dict, step): """Add the summaries to tensorboard""" summary_list = self.sess.run( [self.summary_ops[tag] for tag in summaries_dict.keys()], { self.summary_placeholders[tag]: value for tag, value in summaries_dict.items() }) for summary in summary_list: self.summary_writer.add_summary(summary, step) self.summary_writer.flush() def train_episodic(self): """Train the agent in episodic techniques""" # Initialize the epsilon step, it's step, the policy function, the replay memory self.epsilon_step = ( self.config.initial_epsilon - self.config.final_epsilon) / self.config.exploration_steps self.policy = self.policy_fn(self.config.policy_fn, self.estimator, self.environment.n_actions) self.init_replay_memory() for cur_episode in range( self.cur_episode_tensor.eval(self.sess) + 1, self.config.num_episodes, 1): # Save the current checkpoint self.save() # Update the Cur Episode tensor self.cur_episode_assign_op.eval( session=self.sess, feed_dict={ self.cur_epsiode_input: self.cur_episode_tensor.eval(self.sess) + 1 }) # Evaluate Now to see how it behave if cur_episode % self.config.evaluate_every == 0: self.evaluate(cur_episode / self.config.evaluate_every) state = self.environment.reset() total_reward = 0 # Take steps in the environment untill terminal state of epsiode for t in itertools.count(): # Update the Global step self.global_step_assign_op.eval( session=self.sess, feed_dict={ self.global_step_input: self.global_step_tensor.eval(self.sess) + 1 }) # time to update the target estimator if self.global_step_tensor.eval( self.sess ) % self.config.update_target_estimator_every == 0: self.update_target_network() # Calculate the Epsilon for this time step # Take an action ..Then observe and save self.epsilon_assign_op.eval( { self.epsilon_input: max( self.config.final_epsilon, self.epsilon_tensor.eval(self.sess) - 
self.epsilon_step) }, self.sess) action = self.take_action(state) next_state, reward, done = self.observe_and_save( state, self.environment.valid_actions[action]) # Sample a minibatch from the replay memory if self.config.prm: indices_batch, weights_batch, state_batch, next_state_batch, action_batch, reward_batch, done_batch = self.memory.sample( ) else: state_batch, next_state_batch, action_batch, reward_batch, done_batch = self.memory.get_batch( self.config.batch_size) # Calculate targets Then Compute the loss q_values_next = self.estimator.predict(next_state_batch, type="target") targets_batch = reward_batch + np.invert(done_batch).astype( np.float32) * self.config.discount_factor * np.amax( q_values_next, axis=1) if self.config.prm: _ = self.estimator.update(state_batch, action_batch, targets_batch, weights_batch) else: _ = self.estimator.update(state_batch, action_batch, targets_batch) total_reward += reward if done: # IF terminal state so exit the episode # Add summaries to tensorboard summaries_dict = { 'episode.total_reward': total_reward, 'episode.length': t, 'epsilon': self.epsilon_tensor.eval(self.sess) } self.add_summary(summaries_dict, self.global_step_tensor.eval(self.sess)) break state = next_state print("Training Finished") def train_continous(self): # TODO implement on global step only pass def play(self, n_episode=10): """Function that play greedily on the policy learnt""" # Play Greedily self.policy = self.policy_fn('greedy', self.estimator, self.environment.n_actions) for cur_episode in range(n_episode): state = self.environment.reset() total_reward = 0 for t in itertools.count(): best_action = self.policy(self.sess, state) next_state, reward, done = self.environment.step( self.environment.valid_actions[best_action]) total_reward += reward if done: print("Total Reward in Epsiode " + str(cur_episode) + " = " + str(total_reward)) print("Total Length in Epsiode " + str(cur_episode) + " = " + str(t)) break state = next_state def evaluate(self, local_step): print('evaluation #{0}'.format(local_step)) policy = self.policy_fn('greedy', self.estimator, self.evaluation_enviroment.n_actions) for cur_episode in range(self.config.evaluation_episodes): state = self.evaluation_enviroment.reset() total_reward = 0 for t in itertools.count(): best_action = policy(self.sess, state) next_state, reward, done = self.evaluation_enviroment.step( self.evaluation_enviroment.valid_actions[best_action]) total_reward += reward if done: # Add summaries to tensorboard summaries_dict = { 'evaluation.total_reward': total_reward, 'evaluation.length': t } self.add_summary(summaries_dict, local_step * 5 + cur_episode) break state = next_state print('Finished evaluation #{0}'.format(local_step))
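# The target computation in train_episodic above is the standard Q-learning
# backup: targets = r + (1 - done) * gamma * max_a Q_target(s', a).
# A standalone check with made-up numbers (discount factor assumed 0.99):
import numpy as np

reward_batch = np.array([1.0, 0.0])
done_batch = np.array([False, True])
q_values_next = np.array([[0.2, 0.5], [0.7, 0.1]])
targets = reward_batch + np.invert(done_batch).astype(np.float32) \
    * 0.99 * np.amax(q_values_next, axis=1)
print(targets)  # [1.495, 0.0] -- terminal transitions keep only the reward
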
def train():
    print('Waking up the brain cells..')

    sess = tf.Session()

    game = Game(SCREEN_WIDTH, SCREEN_HEIGHT, show_game=True)
    brain = DQN(sess, SCREEN_WIDTH, SCREEN_HEIGHT, NUM_ACTION)

    rewards = tf.placeholder(tf.float32, [None])
    tf.summary.scalar('avg.reward/ep.', tf.reduce_mean(rewards))

    saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())

    writer = tf.summary.FileWriter('logs', sess.graph)
    summary_merged = tf.summary.merge_all()

    # Initialize the target network.
    brain.update_target_network()

    # Decides when to start using the DQN to pick the next action.
    epsilon = 1.0
    # Frame count
    time_step = 0
    total_reward_list = []

    # Start the game.
    for episode in range(MAX_EPISODE):
        terminal = False
        total_reward = 0

        # Reset the game and fetch the current state.
        # The state is the screen layout of size screen_width x screen_height.
        state = game.reset()
        brain.init_state(state)

        while not terminal:
            if game.previous_price == 0:
                now_price = driver.find_element_by_xpath(
                    '// *[ @ id = "cont_coin_info"] / div[1] / span[1]'
                ).text
                now_price = float(str(now_price).replace(",", ""))
                game.previous_price = now_price
                print("prepare..")
                time.sleep(0.5)

            # 1. Save the current price
            now_price = driver.find_element_by_xpath(
                '// *[ @ id = "cont_coin_info"] / div[1] / span[1]'
            ).text
            now_price = float(str(now_price).replace(",", ""))
            game.now_price = now_price

            # 2. Total sell volume and total buy volume
            total_sell = driver.find_element_by_xpath(
                '// *[ @ id = "txt_total_bid"]'
            ).text
            total_buy = driver.find_element_by_xpath(
                '//*[@id="txt_total_ask"]'
            ).text
            total_trade = float(str(total_sell).replace(",", "")) + float(str(total_buy).replace(",", ""))

            selling = [0 for _ in range(10)]
            buying = [0 for _ in range(10)]

            for num in range(1, 11):
                _xpath = '//*[@id="contSellCoin"]/li[' + str(num) + ']/div/p'
                bar = driver.find_element_by_xpath(_xpath).text
                percent = 100 * float(bar) / total_trade
                selling[num - 1] = percent

            for num in range(1, 11):
                _xpath = '//*[@id="contBuyCoin"]/li[' + str(num) + ']/div/p'
                bar = driver.find_element_by_xpath(_xpath).text
                percent = 100 * float(bar) / total_trade
                buying[num - 1] = percent

            # If a random draw is below epsilon, pick a random action;
            # otherwise pick the action with the DQN.
            # Early on the network has seen little training, so actions are
            # almost always random; the random share shrinks over time until
            # it is rarely used.
            if np.random.rand() < epsilon:
                action = random.randrange(NUM_ACTION)
            else:
                action = brain.get_action()

            # Start decaying epsilon only after some time has passed,
            # since there is no training at all at the beginning.
            if episode > OBSERVE:
                epsilon -= 1 / 1000

            # Advance the game with the chosen action, and receive the reward
            # and whether the game has ended.
            state, reward, terminal = game.step(action, selling, buying)
            total_reward += reward

            # Store the current state in the Brain.
            # Learn from the remembered states and decide the next action.
            brain.remember(state, action, reward, terminal)
            time.sleep(0.3)

            if time_step > OBSERVE and time_step % TRAIN_INTERVAL == 0:
                # Train the DQN.
                brain.train()

            if time_step % TARGET_UPDATE_INTERVAL == 0:
                # Update the target network.
                brain.update_target_network()

            time_step += 1

        print('Games: %d Score: %d' % (episode + 1, total_reward), "({})".format(game.seq))
        total_reward_list.append(total_reward)

        if terminal:  # game over
            print("game over!")

        if episode % 10 == 0:
            summary = sess.run(summary_merged, feed_dict={rewards: total_reward_list})
            writer.add_summary(summary, time_step)
            total_reward_list = []

        if episode % 100 == 0:
            saver.save(sess, 'model/dqn.ckpt', global_step=time_step)

def train(cont):
    sess = tf.Session()

    game = Game(SCREEN_WIDTH, SCREEN_HEIGHT, OBS_NUM, BUN_NUM, show_game=False)
    brain = DQN(sess, SCREEN_WIDTH, SCREEN_HEIGHT, CHANNEL, NUM_ACTION)

    rewards = tf.placeholder(tf.float32, [None])
    tf.summary.scalar('avg.reward/ep.', tf.reduce_mean(rewards))

    saver = tf.train.Saver()
    if cont:
        sess.run(tf.global_variables_initializer())

        # Restore only the conv layers (excluding Adam slots) from the
        # latest checkpoint; the path is parsed out of the checkpoint state.
        ckpt = str(tf.train.get_checkpoint_state('model'))
        i = ckpt.find("\"") + 1
        j = ckpt.find("\"", i)
        reader = pywrap_tensorflow.NewCheckpointReader(ckpt[i:j])
        var_to_shape_map = reader.get_variable_to_shape_map()
        target_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
        for key in var_to_shape_map:
            if "conv2d" in key and "Adam" not in key:
                for key_f in target_vars:
                    if key in key_f.name:
                        sess.run(key_f.assign(reader.get_tensor(key)))
                        break
        # saver.restore(sess, ckpt.model_checkpoint_path)
    else:
        sess.run(tf.global_variables_initializer())

    writer = tf.summary.FileWriter('logs', sess.graph)
    summary_merged = tf.summary.merge_all()

    # Initialize the target network.
    brain.update_target_network()

    # Decides when to start using the DQN to pick the next action.
    epsilon = 1.0
    # Frame count
    time_step = 0
    total_reward_list = []

    for episode in range(MAX_EPISODE):
        terminal = False
        total_reward = 0

        state = game.reset()
        brain.init_state(state)

        if episode > OBSERVE:
            epsilon = 0.01

        while not terminal:
            if np.random.rand() < epsilon:
                action = random.randrange(NUM_ACTION)
            else:
                action = brain.get_action()
                epsilon += 0.00001

            state, reward, terminal = game.step(action)
            total_reward += reward

            brain.remember(state, action, reward, terminal)

            if time_step > OBSERVE and time_step % TRAIN_INTERVAL == 0:
                brain.train()

            if time_step % TARGET_UPDATE_INTERVAL == 0:
                brain.update_target_network()

            time_step += 1

        if episode % 10 == 0:
            print('Games: %d Score: %d' % (episode + 1, total_reward))

        total_reward_list.append(total_reward)

        if episode % 10 == 0:
            summary = sess.run(summary_merged, feed_dict={rewards: total_reward_list})
            writer.add_summary(summary, time_step)
            total_reward_list = []

        if episode % 10000 == 0:
            saver.save(sess, 'model/dqn.ckpt', global_step=episode)