Python DQN.update_target_network示例

def train():
    print("뇌세포 깨우는 중..")
    sess = tf.Session()

    game = Game(SCREEN_WIDTH, SCREEN_HEIGHT, show_game=False)
    brain = DQN(sess, SCREEN_WIDTH, SCREEN_HEIGHT, NUM_ACTION)
    rewards = tf.placeholder(tf.float32, [None])
    tf.summary.scalar('avg.reward/ep.', tf.reduce_mean(rewards))
    saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())
    writer = tf.summary.FileWriter('logs', sess.graph)
    summary_merged = tf.summary.merge_all

    brain.update_target_network()
    epsilon = 1.0

    time_step = 0
    total_reward_list = []

    for episode in range(MAX_EPISODE):
        terminal = False
        total_reward = 0

        state = game.reset()
        brain.init_state(state)

        while not terminal:
            if np.random.rand() < epsilon:
                action = random.randrange(NUM_ACTION)
            else:
                action = brain.get_action()

            if episode > OBSERVE:
                rpdilon -= 1 / 1000

            state, reward, terminal = game.step(action)
            total_reward += reward

            brain.remember(state, action, reward, terminal)

            if time_step > OBSERVE and time_step % TRAIN_INTERVAL == 0:
                brain.train()

            if time_step % TARGET_UPDATE_INTERVAL == 0:
                brain.update_target_network()

            time_step += 1

            print('게임횟수 : %d, 점수 :  %d' % (episode + 1, total_reward))

            total_reward_list.append(total_reward)

            if episode % 10 == 0:
                summary = sess.run(summary_merged,
                                   feed_dict={rewards: total_reward_list})
                writer.add_summary(summary, time_step)
                total_reward_list = []

            if episode % 100 == 0:
                saver.save(sess, 'model/dqn.ckpt', global_step=time_step)

示例#2

显示文件

def train():
    with tf.Session() as sess:
        tf.set_random_seed(GLOBAL_SEED)
        brain = DQN(sess, observation_size, action_size)
        rewards = tf.placeholder(tf.float32, [None])
        tf.summary.scalar('avg.reward/ep.', tf.reduce_mean(rewards))
        saver = tf.train.Saver()
        sess.run(tf.global_variables_initializer())
        writer = tf.summary.FileWriter('logs', sess.graph)
        summary_merged = tf.summary.merge_all()
        brain.update_target_network()
        time_step = 0
        total_reward_list = []

        for episode in range(MAX_EPISODE):
            done = False
            total_reward = 0
            epsilon = 1. / ((episode / 10) + 1)

            observation = env.reset()
            brain.init_state(observation)

            while not done:
                if np.random.rand() < epsilon:
                    action = random.randrange(action_size)
                else:
                    action = brain.get_action()

                observation, reward, done, _ = env.step(action)
                # print(observation, reward, done)
                total_reward += reward
                brain.remember(observation, action, reward, done)

                if time_step > 0:
                    if time_step % TRAIN_INTERVAL_FRAMES == 0:
                        _, loss = brain.train()
                    if time_step % TARGET_UPDATE_INTERVAL == 0:
                        brain.update_target_network()

                time_step += 1

            print('episode: %d total_reward: %d' % (episode, total_reward))

            total_reward_list.append(total_reward)

            if episode % 10 == 0:
                summary = sess.run(summary_merged,
                                   feed_dict={rewards: total_reward_list})
                writer.add_summary(summary, time_step)
                total_reward_list = []

            if episode % 100 == 0:
                saver.save(sess, 'model/dqn.ckpt', global_step=time_step)

示例#3

显示文件

def train():
    print('뇌세포 깨우는 중..')
    sess = tf.Session()

    game = Game(SCREEN_WIDTH, SCREEN_HEIGHT)
    brain = DQN(sess, SCREEN_WIDTH, SCREEN_HEIGHT, NUM_ACTION)

    rewards = tf.placeholder(tf.float32, [None])
    tf.summary.scalar('avg.reward/ep.', tf.reduce_mean(rewards))

    sess.run(tf.global_variables_initializer())

    # 타겟 네트웍을 초기화합니다.
    brain.update_target_network()

    time_step = 0
    epsilon = 1.0

    for episode in range(MAX_EPISODE):
        # 게임을 시작합니다.
        terminal = False

        # 게임을 초기화하고 현재 상태를 가져옵니다.
        # 상태는 screen_width x screen_height 크기의 화면 구성입니다.
        _, state, _, _ = game.first_step()
        brain.init_state(state)

        while not terminal:
            # 게임 기록을 가져옵니다.
            action, state, reward, terminal = game.step()

            # 현재 상태를 Brain에 기억시킵니다.
            # 기억한 상태를 이용해 학습하고, 다음 상태에서 취할 행동을 결정합니다.
            brain.remember(state, action, reward, terminal)
            if (time_step > OBSERVE) and (time_step % TRAIN_INTERVAL) == 0:
                brain.train()
            # 타겟 네트웍을 업데이트 해 줍니다.
            # if (time_step % TARGET_UPDATE_INTERVAL) == 0:
            #     brain.update_target_network()
            time_step += 1
        # if episode % 50 == 0:
        print(episode)
    save_model(sess)

示例#4

显示文件

def train_rl(images, targets, folds, stochastic = False, test = False, base_rand = False): 
    print('start train rl')


    #print(images.shape)
    #(X_train, y_train), (X_val, y_val), (X_test, y_test) = reformatInput_rl(images, targets, fold)

    #X_train = X_train.astype("float32", casting='unsafe')
    #X_val = X_val.astype("float32", casting='unsafe')
    #X_test = X_test.astype("float32", casting='unsafe')
    
    #print('check')
    #print(X_train.shape)
    with tf.Session() as sess:
        #onfig = get_config(FLAGS) or FLAGS
       
        model = DQN(sess, SCREEN_WIDTH, SCREEN_HEIGHT, n_act)
        
        rewards = tf.placeholder(tf.float32, [None])
        tf.summary.scalar('avg.reward/ep.', tf.reduce_mean(rewards))

        saver = tf.train.Saver()
        sess.run(tf.global_variables_initializer())
        
        writer = tf.summary.FileWriter('logs', sess.graph)
        summary_merged = tf.summary.merge_all()
        
        print('total %s folds', len(folds))
        
        #(X_train, y_train), (X_val, y_val), (X_test, y_test) = reformatInput_rl(images, targets, fold)
#        X_train = X_train.astype("float32", casting='unsafe')
#        X_val = X_val.astype("float32", casting='unsafe')
#        X_test = X_test.astype("float32", casting='unsafe')

        ###

        # init target network
        model.update_target_network()
        
        # get next action from DQN
        epsilon = 1.0
        # def frame N
        t_step = 0
        tot_reward_list = []


        MAX_EPISODE = 10000
        n_img = len(targets)
        
        n_epi = n_img
        if stochastic: n_epi = MAX_EPISODE
        

        # call pred & loss 
        n_test = 3 
        if test:  #for debugging
            pred_all, loss_all = predict_all(images[0:n_test, :], targets[0:test, :])
            if not stochastic: n_epi = n_test
        else: pred_all, loss_all = predict_all(images, targets)
        
        #pred_all_train, loss_all_train = predict_all(X_train, y_train)

        #print(pred_all)

        # run simulation
        pred_rl = []
        for epi in range(n_epi):
            terminal = False
            tot_reward = 0

            #init game & get current state
            
            #state parsing
            state = np.expand_dims(images[epi], 0)
            #state = np.expand_dims(X_train[epi], 0)
            model.init_state(state)

            if np.random.rand() < epsilon:
                act = random.randrange(n_act)
            else:
                act = model.get_action()

            if epi > OBSERVE: epsilon -= 1/100
            if base_rand: act = random.randrange(n_act)
            
            #stochastic define
            if stochastic:
                ii = random.randrange(n_img)
                state = np.expand_dims(images[ii], 0)  
                #state = np.expand_dims(X_train[ii], 0)
                state_i = ii

            else:
                state = np.expand_dims(images[epi], 0)
                #state = np.expand_dims(X_train[epi], 0)
                state_i = epi
            
            # get model str by act
            choosen_model = model_list[act]
            
            # reward function
            if pred_all[choosen_model][state_i] == 1:
                reward = 1
                pred_rl.append(1)
            else:
                reward = -2
                pred_rl.append(0)


            tot_reward += reward

            model.remember(state, act, reward, terminal)

            if t_step > OBSERVE and t_step % TRAIN_INTERVAL == 0:
                # DQN train
                model.train()

            if t_step % TARGET_UPDATE_INTERVAL == 0:
                # target update
                model.update_target_network()

            t_step += 1

            print('epi: %d score: %d' % ((epi+1), tot_reward))

            tot_reward_list.append(tot_reward)

            if epi % 10 == 0:
                summary = sess.run(summary_merged, feed_dict={rewards: tot_reward_list})
                writer.add_summary(summary, t_step)
                tot_reward_list = []

            if epi % 100 == 0:
                saver.save(sess, 'model/dqn.ckpt', global_step=t_step)

        return tot_reward_list, pred_rl, pred_all

示例#5

显示文件

def train():
    print('뇌세포 깨우는 중..')
    sess = tf.Session()
    game = Game(SCREEN_WIDTH, SCREEN_HEIGHT, show_game=False)
    # 최종 결과값 갯수 '선택할 행동의 갯수' NUM_ACTION 설정
    brain = DQN(sess, SCREEN_WIDTH, SCREEN_HEIGHT, NUM_ACTION)

    # 학습결과 저장 및 확인
    # 한판마다 얻는 점수를 저장하고 확인
    rewards = tf.placeholder(tf.float32, [None])
    tf.summary.scalar('avg.reward/ep.', tf.reduce_mean(rewards))

    # 파일 저장
    saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())

    writer = tf.summary.FileWriter('logs', sess.graph)
    summary_merged = tf.summary.merge_all()

    # 목표신경망 초기화
    brain.update_target_network()

    # 행동을 선택할떄 DQN을 이용할 시점 정함
    # 일정시간이 지나기전에 행동을 무작위 선택하고 게임 진행중 epsilon값 줄여 나감
    epsilon = 1.0

    # 학습진행 조절을 위한 진행된 프레임 횟수
    time_step = 0
    # 학습결과를 확인하기 위한 점수 저장 배열
    total_reward_list = []

    # 학습 시작
    for episode in range(MAX_EPISODE):
        terminal = False  # 게임 종료
        total_reward = 0  # 한게임당 얻은 총 점수

        state = game.reset()  # 게임 초기화
        brain.init_state(state)  # DQN에 게임 초기화

        # 녹색사각형이 다른 사각형에 충돌할때까지 게임 수행
        while not terminal:

            # 학습 초반 (100회 이전)은 무작위로 수행
            if np.random.rand() < epsilon:
                action = random.randrange(NUM_ACTION)
            else:
                action = brain.get_action()

            # 100회 이상이면 무작위값 사용비율을 줄여가면서 수행
            if episode > OBSERVE:
                epsilon -= 1 / 1000

            # 게임상태, 보상과 게임종료여부 받음
            state, reward, terminal = game.step(action)
            total_reward += reward

            # 현재상태를 신경망 객체에 기억
            # 기억된 정보를 이용하여 신경망 학습 시킴
            brain.remember(state, action, reward, terminal)

            # 프레임 100번이 넘으면 4프레임마다 한번씩 학습
            if time_step > OBSERVE and time_step % TRAIN_INTERVAL == 0:
                brain.train()
            # 1000프레임 마다 한번씩 목표 신경망 갱신
            if time_step % TARGET_UPDATE_INTERVAL == 0:
                brain.update_target_network()
            time_step += 1

        # 게임 종료시 획득점수 출력하고 점수 저장
        print('게임횟수: %d 점수: %d' % (episode + 1, total_reward))
        total_reward_list.append(total_reward)

        # 에피소드 10번마다 받은점수를 로그에 저장, 100마다 학습모델 저장
        if episode % 10 == 0:
            summary = sess.run(summary_merged,
                               feed_dict={rewards: total_reward_list})
            writer.add_summary(summary, time_step)
            total_reward_list = []

        if episode % 100 == 0:
            saver.save(sess, 'model/dqn.ckpt', global_step=time_step)

示例#6

显示文件

文件： agent.py 项目： zedoul/GolbinHacker-Tensorflow

def train():
	print('wake up the brain...')
	sess = tf.Session()

	game = Game(SCREEN_WIDTH, SCREEN_HEIGHT, show_game=False)
	brain = DQN(sess, SCREEN_WIDTH, SCREEN_HEIGHT, NUM_ACTION)

	rewards = tf.placeholder(tf.float32, [None])
	tf.summary.scalar('avg.reward/ep.', tf.reduce_mean(rewards))

	saver = tf.train.Saver()
	sess.run(tf.global_variables_initializer())

	writer = tf.summary.FileWriter('logs', sess.graph)
	summary_merged = tf.summary.merge_all()

	brain.update_target_network()

	epsilon = 1.0
	time_step = 0
	total_reward_list = []

	for episode in range(MAX_EPISODE):
		terminal = False
		total_reward = 0

		state = game.reset()
		brain.init_state(state)

		while not terminal:
			if np.random.rand() < epsilon:
				action = random.randrange(NUM_ACTION)
			else:
				action = brain.get_action()

			if episode > OBSERVE:
				epsilon -= 1 / 1000.

			state, reward, terminal = game.step(action)
			total_reward += reward

			brain.remember(state, action, reward, terminal)

			if time_step > OBSERVE and time_step % TRAIN_INTERVAL == 0:
				brain.train()

			if time_step % TARGET_UPDATE_INTERVAL == 0:
				brain.update_target_network()

			time_step += 1

		print('episode: %d, score: %d' % (episode + 1, total_reward))

		total_reward_list.append(total_reward)

		if episode % 10 == 0:
			summary = sess.run(summary_merged, feed_dict={rewards: total_reward_list})
			writer.add_summary(summary, time_step)
			total_reward_list = []

		if episode % 100 == 0:
			saver.save(sess, 'model/dqn.ckpt', global_step=time_step)

示例#7

显示文件

def train(IS_IMPORT):
    print('Loading ...')
    sess = tf.Session()

    # 다음에 취할 액션을 DQN 을 이용해 결정할 시기를 결정합니다.
    epsilon = 1.0
    # 프레임 횟수
    time_step = 0
    global_step = tf.Variable(0, trainable=False, name='global_step')

    brain = DQN(sess, SCREEN_WIDTH, SCREEN_HEIGHT, NUM_ACTION, global_step)
    #brain = DQN(sess, 61, global_step)

    rewards = tf.placeholder(tf.float32, [None])
    tf.summary.scalar('avg.reward/ep.', tf.reduce_mean(rewards))
    totalScores = tf.placeholder(tf.float32, [None])
    tf.summary.scalar('avg.totalScore/ep.', tf.reduce_mean(totalScores))

    total_reward_list = []
    total_score_list = []

    saver = tf.train.Saver(tf.global_variables())

    ckpt = tf.train.get_checkpoint_state(MODEL_PATH)
    writer = tf.summary.FileWriter(LOG_PATH, sess.graph)

    if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
        saver.restore(sess, ckpt.model_checkpoint_path)
    else:
        sess.run(tf.global_variables_initializer())

    summary_merged = tf.summary.merge_all()

    if IS_IMPORT == True:
        fs = FileLoad('F:\work\cocos\dqnTest\Resources\scenario - Copy.sce')
    else:
        server.accept()

    brain.update_target_network()
    print('global_step:', sess.run(global_step))

    # 게임을 시작합니다.
    for episode in range(MAX_EPISODE):
        terminal = False
        total_reward = 0
        weight = 0

        # 게임을 초기화하고 현재 상태를 가져옵니다.
        # 상태는 screen_width x screen_height 크기의 화면 구성입니다.
        #state = game.reset()
        if IS_IMPORT:
            id, _, _, _, state = fs.readState()
            if id == -1:
                sys.exit(1)
        else:
            id, _, _, _, state = server.readStatus()

        if id == -1:
            continue

        state = reshapeFromPacket(state)
        '''
        state.append(state[2])
        state.append(state[2])               
        '''

        brain.init_state(state)

        while not terminal:
            actionType = "Action:"

            if IS_IMPORT:
                action = fs.readAction()
                if action == -1: sys.exit(1)
                id, reward, totalScore, terminal, state = fs.readState()
                if id == -1: sys.exit(1)
            else:

                if np.random.rand() < epsilon:
                    action = random.randrange(NUM_ACTION)
                    print("Random action:", action)
                    #action = -1
                    #action = random.uniform(-1, 1)
                else:
                    action = brain.get_action()

                #action = brain.get_action()

                if episode > OBSERVE:
                    epsilon -= 1 / 1000

                server.sendX(id, action)

                if action == -1:
                    id2, action = server.readAction()
                    actionType = "Random Action:"

                    if id != id2:
                        print("Invalid Packet", id, id2)

                id, reward, totalScore, terminal, state = server.readStatus()

            reward = reward + (weight * 0.1)
            weight = weight + 1

            print(time.strftime("%H:%M:%S", time.localtime()), id, actionType,
                  action, "totalScore:", totalScore, "reward:", reward,
                  "terminal", terminal)

            if id == -1:
                break
            if terminal == True:
                total_score_list.append(totalScore)

            state = reshapeFromPacket(state)

            total_reward += reward

            # 현재 상태를 Brain에 기억시킵니다.
            # 기억한 상태를 이용해 학습하고, 다음 상태에서 취할 행동을 결정합니다.
            brain.remember(state, action, reward, terminal)

            if time_step > OBSERVE and time_step % TRAIN_INTERVAL == 0:
                # DQN 으로 학습을 진행합니다.

                brain.train()
                '''
                try:
                except:
                    print("Train Error!!")
                    time_step -= 1
                '''

            if time_step % TARGET_UPDATE_INTERVAL == 0:
                # 타겟 네트웍을 업데이트 해 줍니다.
                brain.update_target_network()

            time_step += 1

        print('\t Count of Play: %d Score: %d' % (episode + 1, total_reward))

        total_reward_list.append(total_reward)

        if (episode) % 10 == 0:
            summary = sess.run(summary_merged,
                               feed_dict={
                                   rewards: total_reward_list,
                                   totalScores: total_score_list
                               })
            writer.add_summary(summary, sess.run(global_step))
            total_reward_list = []
            total_score_list = []

        if (episode + 1) % 100 == 0:
            saver.save(sess, MODEL_PATH + '/dqn.ckpt', global_step=global_step)

    #모두 학습한 후에 tflite 파일로 저장
    converter = tf.lite.TFLiteConverter.from_session(sess, [brain.input_X],
                                                     [brain.Q])
    tflite_model = converter.convert()
    open(MODEL_PATH + "/dqn.tflite", "wb").write(tflite_model)
    sys.exit(1)

示例#8

显示文件

def train():
    print('Training... 뇌세포 깨우는 중..')
    sess = tf.Session()

    game = Game(SCREEN_WIDTH, SCREEN_HEIGHT, show_game=False)
    brain = DQN(sess, SCREEN_WIDTH, SCREEN_HEIGHT, NUM_ACTION)

    rewards = tf.placeholder(tf.float32, [None])
    tf.summary.scalar('avg.reward/ep.', tf.reduce_mean(rewards))

    saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())

    writer = tf.summary.FileWriter('logs', sess.graph)
    summary_merged = tf.summary.merge_all()

    # 타겟 네트웍을 초기화합니다.
    brain.update_target_network()

    # 다음에 취할 액션을 DQN 을 이용해 결정할 시기를 결정합니다.
    epsilon = 1.0
    # 프레임 횟수
    time_step = 0
    total_reward_list = []

    # 게임을 시작합니다.
    for episode in range(MAX_EPISODE):
        terminal = False
        total_reward = 0

        # 게임을 초기화하고 현재 상태를 가져옵니다.
        # 상태는 screen_width x screen_height 크기의 화면 구성입니다.
        state = game.reset()
        brain.init_state(state)

        while not terminal:
            # 입실론이 랜덤값보다 작은 경우에는 랜덤한 액션을 선택하고
            # 그 이상일 경우에는 DQN을 이용해 액션을 선택합니다.
            # 초반엔 학습이 적게 되어 있기 때문입니다.
            # 초반에는 거의 대부분 랜덤값을 사용하다가 점점 줄어들어
            # 나중에는 거의 사용하지 않게됩니다.
            if np.random.rand() < epsilon:
                action = random.randrange(NUM_ACTION)
            else:
                action = brain.get_action()

            # 일정 시간이 지난 뒤 부터 입실론 값을 줄입니다.
            # 초반에는 학습이 전혀 안되어 있기 때문입니다.
            if episode > OBSERVE:
                epsilon -= 1 / 1000

            # 결정한 액션을 이용해 게임을 진행하고, 보상과 게임의 종료 여부를 받아옵니다.
            state, reward, terminal = game.step(action)
            total_reward += reward

            # 현재 상태를 Brain에 기억시킵니다.
            # 기억한 상태를 이용해 학습하고, 다음 상태에서 취할 행동을 결정합니다.
            brain.remember(state, action, reward, terminal)

            if time_step > OBSERVE and time_step % TRAIN_INTERVAL == 0:
                # DQN 으로 학습을 진행합니다.
                brain.train()

            if time_step % TARGET_UPDATE_INTERVAL == 0:
                # 타겟 네트웍을 업데이트 해 줍니다.
                brain.update_target_network()

            time_step += 1

        print('게임횟수: %d 점수: %d' % (episode + 1, total_reward))

        total_reward_list.append(total_reward)

        if episode % 10 == 0:
            summary = sess.run(summary_merged,
                               feed_dict={rewards: total_reward_list})
            writer.add_summary(summary, time_step)
            total_reward_list = []

        if episode % 100 == 0:
            saver.save(sess, 'model/dqn.ckpt', global_step=time_step)

示例#9

显示文件

文件： agent.py 项目： superhg2012/TensorFlow-Tutorials

def train():
    print('뇌세포 깨우는 중..')
    sess = tf.Session()

    game = Game(SCREEN_WIDTH, SCREEN_HEIGHT, show_game=False)
    brain = DQN(sess, SCREEN_WIDTH, SCREEN_HEIGHT, NUM_ACTION)

    rewards = tf.placeholder(tf.float32, [None])
    tf.summary.scalar('avg.reward/ep.', tf.reduce_mean(rewards))

    saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())

    writer = tf.summary.FileWriter('logs', sess.graph)
    summary_merged = tf.summary.merge_all()

    # 타겟 네트웍을 초기화합니다.
    brain.update_target_network()

    # 다음에 취할 액션을 DQN 을 이용해 결정할 시기를 결정합니다.
    epsilon = 1.0
    # 프레임 횟수
    time_step = 0
    total_reward_list = []

    # 게임을 시작합니다.
    for episode in range(MAX_EPISODE):
        terminal = False
        total_reward = 0

        # 게임을 초기화하고 현재 상태를 가져옵니다.
        # 상태는 screen_width x screen_height 크기의 화면 구성입니다.
        state = game.reset()
        brain.init_state(state)

        while not terminal:
            # 입실론이 랜덤값보다 작은 경우에는 랜덤한 액션을 선택하고
            # 그 이상일 경우에는 DQN을 이용해 액션을 선택합니다.
            # 초반엔 학습이 적게 되어 있기 때문입니다.
            # 초반에는 거의 대부분 랜덤값을 사용하다가 점점 줄어들어
            # 나중에는 거의 사용하지 않게됩니다.
            if np.random.rand() < epsilon:
                action = random.randrange(NUM_ACTION)
            else:
                action = brain.get_action()

            # 일정 시간이 지난 뒤 부터 입실론 값을 줄입니다.
            # 초반에는 학습이 전혀 안되어 있기 때문입니다.
            if episode > OBSERVE:
                epsilon -= 1 / 1000

            # 결정한 액션을 이용해 게임을 진행하고, 보상과 게임의 종료 여부를 받아옵니다.
            state, reward, terminal = game.step(action)
            total_reward += reward

            # 현재 상태를 Brain에 기억시킵니다.
            # 기억한 상태를 이용해 학습하고, 다음 상태에서 취할 행동을 결정합니다.
            brain.remember(state, action, reward, terminal)

            if time_step > OBSERVE and time_step % TRAIN_INTERVAL == 0:
                # DQN 으로 학습을 진행합니다.
                brain.train()

            if time_step % TARGET_UPDATE_INTERVAL == 0:
                # 타겟 네트웍을 업데이트 해 줍니다.
                brain.update_target_network()

            time_step += 1

        print('게임횟수: %d 점수: %d' % (episode + 1, total_reward))

        total_reward_list.append(total_reward)

        if episode % 10 == 0:
            summary = sess.run(summary_merged, feed_dict={rewards: total_reward_list})
            writer.add_summary(summary, time_step)
            total_reward_list = []

        if episode % 100 == 0:
            saver.save(sess, 'model/dqn.ckpt', global_step=time_step)

示例#10

显示文件

文件： agent.py 项目： Medowhill/cs570proj

def train(track, width, height, cont):
    sess = tf.Session()

    game = Game(track, width, height, show_game=False)
    brain = DQN(sess, width, height, CHANNEL, NUM_ACTION)

    rewards = tf.placeholder(tf.float32, [None])
    tf.summary.scalar('avg.reward/ep.', tf.reduce_mean(rewards))

    saver = tf.train.Saver()
    if cont:
        ckpt = tf.train.get_checkpoint_state('model')
        saver.restore(sess, ckpt.model_checkpoint_path)
    else:
        sess.run(tf.global_variables_initializer())

    writer = tf.summary.FileWriter('logs', sess.graph)
    summary_merged = tf.summary.merge_all()

    brain.update_target_network()

    epsilon = 1.0
    time_step = 0
    total_reward_list = []

    if cont:
        OBSERVE = 100
    else:
        OBSERVE = 5000

    for episode in range(MAX_EPISODE):
        terminal = False
        total_reward = 0

        state = game.reset()
        brain.init_state(state)

        if episode > OBSERVE:
            epsilon = 2000 / episode

        while not terminal:
            if np.random.rand() < epsilon:
                action = random.randrange(NUM_ACTION)
            else:
                action = brain.get_action()

            state, reward, terminal = game.step(action)
            total_reward += reward

            brain.remember(state, action, reward, terminal)

            if episode > OBSERVE and time_step % TRAIN_INTERVAL == 0:
                brain.train()

            if episode > OBSERVE and time_step % TARGET_UPDATE_INTERVAL == 0:
                brain.update_target_network()

            time_step += 1

        if episode % 10 == 0:
            print('Games: %d Score: %d' % (episode + 1, total_reward))

        total_reward_list.append(total_reward)

        if episode % 10 == 0:
            summary = sess.run(summary_merged, feed_dict={rewards: total_reward_list})
            writer.add_summary(summary, time_step)
            total_reward_list = []

        if episode > OBSERVE and episode % 10000 == 0:
            saver.save(sess, 'model/dqn.ckpt', global_step=episode)

示例#11

显示文件

def train_simulation(data):
    print("Training mode")
    session = tf.Session()

    simulation = Simulation(data)
    network = DQN(session, data)

    rewards = tf.placeholder(tf.float32, [None])
    tf.summary.scalar('reward average / episode', tf.reduce_mean(rewards))

    saver = tf.train.Saver()
    session.run(tf.global_variables_initializer())

    writer = tf.summary.FileWriter('logs', session.graph)
    summary = tf.summary.merge_all()

    # 네트워크 초기화
    network.update_target_network()

    epsilon = 1.0
    time = 0

    # 학습 시작
    for episode in range(MAX_EPISODE):
        total_reward = 0
        list_reward = []

        before_reward = 0

        simulation.reset()
        simulation.make_state()

        network.init_state(simulation.state)

        # UE 차례로 AP에 할당
        for ue in range(data['NUM_UE']):

            if np.random.rand() < epsilon:
                action = np.random.randint(data['NUM_AP'])
            else:
                action = network.get_action()

            epsilon -= 1 / DELTA_EPSILON

            fairness, error = simulation.step(ue, action)
            reward = fairness - before_reward
            before_reward = fairness

            total_reward += reward

            if error:
                network.remember(simulation.state, action, reward, True)
            else:
                network.remember(simulation.state, action, reward,
                                 (ue == (data['NUM_UE'] - 1)))

            if time > THRESH_OBSERVE and (time % INTERVAL_TRAINING == 0):
                network.train()

            if time % INTERVAL_UPDATE == 0:
                network.update_target_network()

            time += 1

            if error:
                break

        list_reward.append(total_reward)
        print(episode, total_reward)

        if episode % 10 == 0:
            result = session.run(summary, feed_dict={rewards: list_reward})
            writer.add_summary(result, time)
            list_reward = []

        if episode % 100 == 0:
            saver.save(session, 'model/dqn.ckpt', global_step=time)

示例#12

显示文件

class Agent:
    def __init__(self, n_action, is_render=True, is_load=False):
        self.sess = tf.Session()

        self.batch_size = 32

        self.model = DQN(self.sess, n_action, self.batch_size)
        self.model_name = "DQN"

        self.env = wrappers.wrap_dqn(gym.make("BreakoutDeterministic-v4"))
        self.is_render = is_render

        self.EPISODE = 600

        # epsilon parameter
        self.epsilon_s = 1.0
        self.epsilon_e = 0.1
        self.epsilon_decay = 100000
        self.epsilon = self.epsilon_s

        # train parameter
        self.train_start = 5000
        self.update_target_rate = 5000

        self.n_action = n_action
        self.loss = 0

        # info
        self.total_q_max, self.total_loss = 0., 0.

        # save parameter
        self.save_episode_rate = 5

        # load parameter
        self.is_load = is_load
        # saved_model = "./save/{}/{}_episode20.ckpt-{}".format("20180613-132735", self.model_name, "3741")
        self.saved_model = tf.train.latest_checkpoint("./save/20180614-180138")

    def preprocessing(self, img):
        '''
        args :
        img : ( 210 x 160 x 3 )

        return :
        img : ( 1 x 84 x 84 x 1 )
        '''
        # RGB to gray
        img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

        # resize
        img = cv2.resize(img, (84, 84))

        # Normalization
        img = (img - 127.5) / 127.5

        img = np.expand_dims(img, axis=0)
        img = np.expand_dims(img, axis=3)

        return img

    def get_action(self, state, is_play=False):
        if is_play:
            self.epsilon = self.play_epsilon

        if np.random.rand() < self.epsilon:
            action = self.env.action_space.sample()
        else:
            q_value = self.sess.run(self.model.main_q_value, feed_dict={self.model.input_M_Q: state})
            action = np.argmax(q_value, 1)[0]

        # decay epsilon
        if not is_play:
            self.epsilon -= (self.epsilon_s - self.epsilon_e)/self.epsilon_decay

        return action

    def setup_summary(self):
        episode_total_reward = tf.Variable(0.)
        episode_avg_max_q = tf.Variable(0.)
        episode_duration = tf.Variable(0.)
        episode_avg_loss = tf.Variable(0.)

        tf.summary.scalar('Total Reward/Episode', episode_total_reward)
        tf.summary.scalar('Average Max Q/Episode', episode_avg_max_q)
        tf.summary.scalar('Duration/Episode', episode_duration)
        tf.summary.scalar('Average Loss/Episode', episode_avg_loss)

        summary_vars = [episode_total_reward, episode_avg_max_q,
                        episode_duration, episode_avg_loss]
        summary_placeholders = [tf.placeholder(tf.float32) for _ in
                                range(len(summary_vars))]
        update_ops = [summary_vars[i].assign(summary_placeholders[i]) for i in
                      range(len(summary_vars))]
        summary_op = tf.summary.merge_all()
        return summary_placeholders, update_ops, summary_op

    def train(self):
        # tensor board
        self.summary_placeholders, self.update_ops, self.summary_op = self.setup_summary()
        self.summary_writer = tf.summary.FileWriter('graphs/{}/{}'
                                                    .format(self.model_name, NOWTIME), self.sess.graph)

        saver = tf.train.Saver()
        self.sess.run(tf.global_variables_initializer())

        if self.is_load:
            print(self.saved_model)
            saver.restore(self.sess, self.saved_model)

        print("Train Start...")
        global_step = 0
        for e in range(self.EPISODE):
            obs = self.env.reset()
            #obs = self.preprocessing(obs)
            '''
            print(self.env.action_space.n)
            print(self.env.unwrapped.get_action_meanings())
            print (np.shape(obs))
            '''
            obs = np.reshape(obs, [1, 84, 84, 1])
            state = np.concatenate((obs, obs, obs, obs), axis=3)

            is_terminal = False
            step = 0
            total_reward = 0
            s_t = time.time()
            while not is_terminal:
                global_step += 1
                step += 1

                action = self.get_action(state)
                observation, reward, is_terminal, info = self.env.step(action)

                if self.is_render:
                    self.env.render()
                observation = np.reshape(observation, [1, 84, 84, 1])
                next_state = np.append(observation, state[:,:,:,:3], axis=3)

                transition = [state, action, reward, next_state, is_terminal]
                self.model.replay_buffer.add_sample(transition)

                total_reward += reward
                self.total_q_max += np.argmax(self.sess.run(self.model.main_q_value,
                                                            feed_dict={self.model.input_M_Q: state}), 1)
                state = next_state

                if self.model.replay_buffer.get_size() > self.train_start:

                    self.loss = self.model.train()
                    self.total_loss += self.loss

                if global_step % self.update_target_rate == 0:
                    self.model.update_target_network()

                if global_step % 20 == 0:
                    print("Episode: {}   global_step: {}  step: {}  loss: {:.4f}  reward: {}  time: {}".
                          format(e+1, global_step, step, self.loss, total_reward, time.time() - s_t))
                if is_terminal:
                    # write tensorboard
                    if self.model.replay_buffer.get_size() > self.train_start:
                        avg_q_max = self.total_q_max / float(step)
                        avg_loss = self.total_loss / float(step)

                        stats = [total_reward, avg_q_max, step, avg_loss]

                        for i in range(len(stats)):
                            self.sess.run(self.update_ops[i], feed_dict={
                                self.summary_placeholders[i]: float(stats[i])})
                        summary_str = self.sess.run(self.summary_op)
                        self.summary_writer.add_summary(summary_str, e + 1)

                    print("Episode: {}   global_step: {}  step: {}  loss: {:.4f}  reward: {}  time: {}".
                          format(e+1, global_step, step, self.loss, total_reward, time.time() - s_t))

                    self.total_loss, self.total_q_max = 0, 0

                if e % self.save_episode_rate == 0:
                    saver.save(self.sess, "./save/{0}/{1}_episode{2}.ckpt".format(NOWTIME, self.model_name, e), global_step=global_step)

    def play(self):
        self.play_epsilon = 0.1

        saver = tf.train.Saver()
        self.sess.run(tf.global_variables_initializer())

        print(self.saved_model)
        saver.restore(self.sess, self.saved_model)

        print("Play Start...")
        for e in range(1):
            obs = self.env.reset()
            obs = np.reshape(obs, [1, 84, 84, 1])
            #obs = self.preprocessing(obs)
            self.env.render()
            state = np.concatenate((obs, obs, obs, obs), axis=3)

            is_terminal = False
            step = 0
            total_reward = 0
            while not is_terminal:
                step += 1

                action = self.get_action(state, is_play=True)
                print("action: {}".format(action))
                observation, reward, is_terminal, info = self.env.step(action)
                self.env.render()
                observation = np.reshape(observation, [1, 84, 84, 1])
                next_state = np.append(observation, state[:,:,:,:3], axis=3)

                total_reward += reward
                state = next_state

            print("step: {}   total_reward: {}".format(step, total_reward))

示例#13

显示文件

文件： agent.py 项目： KokoMind/DQN-TF

class Agent:
    """Our Wasted Agent :P """
    def __init__(self, sess, config, environment, evaluation_enviroment):
        # Get the session, config, environment, and create a replaymemory
        self.sess = sess
        self.config = config
        self.environment = environment
        self.evaluation_enviroment = evaluation_enviroment

        if config.prm:
            self.memory = PrioritizedExperienceReplay(sess, config)
        else:
            self.memory = ReplayMemory(config.state_shape, config.rep_max_size)

        self.init_dirs()

        self.init_cur_epsiode()
        self.init_global_step()
        self.init_epsilon()
        self.init_summaries()

        # Intialize the DQN graph which contain 2 Networks Target and Q
        self.estimator = DQN(sess, config, self.environment.n_actions)

        # To initialize all variables
        self.init = tf.group(tf.global_variables_initializer(),
                             tf.local_variables_initializer())
        self.sess.run(self.init)

        self.saver = tf.train.Saver(max_to_keep=10)
        self.summary_writer = tf.summary.FileWriter(self.summary_dir,
                                                    self.sess.graph)

        if config.is_train and not config.cont_training:
            pass
        elif config.is_train and config.cont_training:
            self.load()
        elif config.is_play:
            self.load()
        else:
            raise Exception("Please Set proper mode for training or playing")

    def load(self):
        latest_checkpoint = tf.train.latest_checkpoint(self.checkpoint_dir)
        if latest_checkpoint:
            print("Loading model checkpoint {}...\n".format(latest_checkpoint))
            self.saver.restore(self.sess, latest_checkpoint)

    def save(self):
        self.saver.save(self.sess, self.checkpoint_dir,
                        self.global_step_tensor)

    def init_dirs(self):
        # Create directories for checkpoints and summaries
        self.checkpoint_dir = os.path.join(self.config.experiment_dir,
                                           "checkpoints/")
        self.summary_dir = os.path.join(self.config.experiment_dir,
                                        "summaries/")

    def init_cur_epsiode(self):
        """Create cur episode tensor to totally save the process of the training"""
        with tf.variable_scope('cur_episode'):
            self.cur_episode_tensor = tf.Variable(-1,
                                                  trainable=False,
                                                  name='cur_epsiode')
            self.cur_epsiode_input = tf.placeholder('int32',
                                                    None,
                                                    name='cur_episode_input')
            self.cur_episode_assign_op = self.cur_episode_tensor.assign(
                self.cur_epsiode_input)

    def init_global_step(self):
        """Create a global step variable to be a reference to the number of iterations"""
        with tf.variable_scope('step'):
            self.global_step_tensor = tf.Variable(0,
                                                  trainable=False,
                                                  name='global_step')
            self.global_step_input = tf.placeholder('int32',
                                                    None,
                                                    name='global_step_input')
            self.global_step_assign_op = self.global_step_tensor.assign(
                self.global_step_input)

    def init_epsilon(self):
        """Create an epsilon variable"""
        with tf.variable_scope('epsilon'):
            self.epsilon_tensor = tf.Variable(self.config.initial_epsilon,
                                              trainable=False,
                                              name='epsilon')
            self.epsilon_input = tf.placeholder('float32',
                                                None,
                                                name='epsilon_input')
            self.epsilon_assign_op = self.epsilon_tensor.assign(
                self.epsilon_input)

    def init_summaries(self):
        """Create the summary part of the graph"""
        with tf.variable_scope('summary'):
            self.summary_placeholders = {}
            self.summary_ops = {}
            self.scalar_summary_tags = [
                'episode.total_reward', 'episode.length',
                'evaluation.total_reward', 'evaluation.length', 'epsilon'
            ]
            for tag in self.scalar_summary_tags:
                self.summary_placeholders[tag] = tf.placeholder('float32',
                                                                None,
                                                                name=tag)
                self.summary_ops[tag] = tf.summary.scalar(
                    tag, self.summary_placeholders[tag])

    def init_replay_memory(self):
        # Populate the replay memory with initial experience
        print("initializing replay memory...")

        state = self.environment.reset()
        for i in itertools.count():
            action = self.take_action(state)
            next_state, reward, done = self.observe_and_save(
                state, self.environment.valid_actions[action])
            if done:
                if self.config.prm:
                    if i >= self.config.prm_init_size:
                        break
                else:
                    if i >= self.config.replay_memory_init_size:
                        break
                state = self.environment.reset()
            else:
                state = next_state
        print("finished initializing replay memory")

    def policy_fn(self, fn_type, estimator, n_actions):
        """Function that contain definitions to various number of policy functions and choose between them"""
        def epsilon_greedy(sess, observation, epsilon):
            actions = np.ones(n_actions, dtype=float) * epsilon / n_actions
            q_values = estimator.predict(np.expand_dims(observation, 0))[0]
            best_action = np.argmax(q_values)
            actions[best_action] += (1.0 - epsilon)
            return actions

        def greedy(sess, observation):
            q_values = estimator.predict(np.expand_dims(observation, 0),
                                         type="target")[0]
            best_action = np.argmax(q_values)
            return best_action

        if fn_type == 'epsilon_greedy':
            return epsilon_greedy
        elif fn_type == 'greedy':
            return greedy
        else:
            raise Exception("Please Select a proper policy function")

    def take_action(self, state):
        """Take the action based on the policy function"""
        action_probs = self.policy(self.sess, state,
                                   self.epsilon_tensor.eval(self.sess))
        action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
        return action

    def observe_and_save(self, state, action):
        """Function that observe the new state , reward and save it in the memory"""
        next_state, reward, done = self.environment.step(action)
        self.memory.push(state, next_state, action, reward, done)
        return next_state, reward, done

    def update_target_network(self):
        """Update Target network By copying paramter between the two networks in DQN"""
        self.estimator.update_target_network()

    def add_summary(self, summaries_dict, step):
        """Add the summaries to tensorboard"""
        summary_list = self.sess.run(
            [self.summary_ops[tag] for tag in summaries_dict.keys()], {
                self.summary_placeholders[tag]: value
                for tag, value in summaries_dict.items()
            })
        for summary in summary_list:
            self.summary_writer.add_summary(summary, step)
        self.summary_writer.flush()

    def train_episodic(self):
        """Train the agent in episodic techniques"""

        # Initialize the epsilon step, it's step, the policy function, the replay memory
        self.epsilon_step = (
            self.config.initial_epsilon -
            self.config.final_epsilon) / self.config.exploration_steps
        self.policy = self.policy_fn(self.config.policy_fn, self.estimator,
                                     self.environment.n_actions)
        self.init_replay_memory()

        for cur_episode in range(
                self.cur_episode_tensor.eval(self.sess) + 1,
                self.config.num_episodes, 1):

            # Save the current checkpoint
            self.save()

            # Update the Cur Episode tensor
            self.cur_episode_assign_op.eval(
                session=self.sess,
                feed_dict={
                    self.cur_epsiode_input:
                    self.cur_episode_tensor.eval(self.sess) + 1
                })

            # Evaluate Now to see how it behave
            if cur_episode % self.config.evaluate_every == 0:
                self.evaluate(cur_episode / self.config.evaluate_every)

            state = self.environment.reset()
            total_reward = 0

            # Take steps in the environment untill terminal state of epsiode
            for t in itertools.count():

                # Update the Global step
                self.global_step_assign_op.eval(
                    session=self.sess,
                    feed_dict={
                        self.global_step_input:
                        self.global_step_tensor.eval(self.sess) + 1
                    })

                # time to update the target estimator
                if self.global_step_tensor.eval(
                        self.sess
                ) % self.config.update_target_estimator_every == 0:
                    self.update_target_network()

                # Calculate the Epsilon for this time step
                # Take an action ..Then observe and save
                self.epsilon_assign_op.eval(
                    {
                        self.epsilon_input:
                        max(
                            self.config.final_epsilon,
                            self.epsilon_tensor.eval(self.sess) -
                            self.epsilon_step)
                    }, self.sess)
                action = self.take_action(state)
                next_state, reward, done = self.observe_and_save(
                    state, self.environment.valid_actions[action])

                # Sample a minibatch from the replay memory
                if self.config.prm:
                    indices_batch, weights_batch, state_batch, next_state_batch, action_batch, reward_batch, done_batch = self.memory.sample(
                    )
                else:
                    state_batch, next_state_batch, action_batch, reward_batch, done_batch = self.memory.get_batch(
                        self.config.batch_size)

                # Calculate targets Then Compute the loss
                q_values_next = self.estimator.predict(next_state_batch,
                                                       type="target")
                targets_batch = reward_batch + np.invert(done_batch).astype(
                    np.float32) * self.config.discount_factor * np.amax(
                        q_values_next, axis=1)

                if self.config.prm:
                    _ = self.estimator.update(state_batch, action_batch,
                                              targets_batch, weights_batch)
                else:
                    _ = self.estimator.update(state_batch, action_batch,
                                              targets_batch)

                total_reward += reward

                if done:  # IF terminal state so exit the episode
                    # Add summaries to tensorboard
                    summaries_dict = {
                        'episode.total_reward': total_reward,
                        'episode.length': t,
                        'epsilon': self.epsilon_tensor.eval(self.sess)
                    }
                    self.add_summary(summaries_dict,
                                     self.global_step_tensor.eval(self.sess))
                    break

                state = next_state

        print("Training Finished")

    def train_continous(self):
        # TODO implement on global step only
        pass

    def play(self, n_episode=10):
        """Function that play greedily on the policy learnt"""
        # Play Greedily
        self.policy = self.policy_fn('greedy', self.estimator,
                                     self.environment.n_actions)

        for cur_episode in range(n_episode):

            state = self.environment.reset()
            total_reward = 0

            for t in itertools.count():

                best_action = self.policy(self.sess, state)
                next_state, reward, done = self.environment.step(
                    self.environment.valid_actions[best_action])

                total_reward += reward

                if done:
                    print("Total Reward in Epsiode " + str(cur_episode) +
                          " = " + str(total_reward))
                    print("Total Length in Epsiode " + str(cur_episode) +
                          " = " + str(t))
                    break

                state = next_state

    def evaluate(self, local_step):

        print('evaluation #{0}'.format(local_step))

        policy = self.policy_fn('greedy', self.estimator,
                                self.evaluation_enviroment.n_actions)

        for cur_episode in range(self.config.evaluation_episodes):

            state = self.evaluation_enviroment.reset()
            total_reward = 0

            for t in itertools.count():

                best_action = policy(self.sess, state)
                next_state, reward, done = self.evaluation_enviroment.step(
                    self.evaluation_enviroment.valid_actions[best_action])

                total_reward += reward

                if done:
                    # Add summaries to tensorboard
                    summaries_dict = {
                        'evaluation.total_reward': total_reward,
                        'evaluation.length': t
                    }
                    self.add_summary(summaries_dict,
                                     local_step * 5 + cur_episode)
                    break

                state = next_state

        print('Finished evaluation #{0}'.format(local_step))

示例#14

显示文件

文件： agent.py 项目： KangHyunMoon/finance

def train():
    print('뇌세포 깨우는 중..')
    sess = tf.Session()

    game = Game(SCREEN_WIDTH, SCREEN_HEIGHT, show_game=True)
    brain = DQN(sess, SCREEN_WIDTH, SCREEN_HEIGHT, NUM_ACTION)

    rewards = tf.placeholder(tf.float32, [None])
    tf.summary.scalar('avg.reward/ep.', tf.reduce_mean(rewards))

    saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())

    writer = tf.summary.FileWriter('logs', sess.graph)
    summary_merged = tf.summary.merge_all()

    # 타겟 네트웍을 초기화합니다.
    brain.update_target_network()

    # 다음에 취할 액션을 DQN 을 이용해 결정할 시기를 결정합니다.
    epsilon = 1.0
    # 프레임 횟수
    time_step = 0
    total_reward_list = []


    # 게임을 시작합니다.
    for episode in range(MAX_EPISODE):

        terminal = False
        total_reward = 0

        # 게임을 초기화하고 현재 상태를 가져옵니다.
        # 상태는 screen_width x screen_height 크기의 화면 구성입니다.
        state = game.reset()
        brain.init_state(state)

        while not terminal:

            if game.previous_price == 0 :
                now_price = driver.find_element_by_xpath(
                    '// *[ @ id = "cont_coin_info"] / div[1] / span[1]'
                ).text
                now_price = float(str(now_price).replace(",", ""))

                game.previous_price = now_price

                print("prepare..")

                time.sleep(0.5)

            # 1. 현재 가격 저장

            now_price = driver.find_element_by_xpath(
                '// *[ @ id = "cont_coin_info"] / div[1] / span[1]'
            ).text
            now_price = float(str(now_price).replace(",", ""))

            game.now_price = now_price

            # 2. 전체 매도량, 전체 매수량

            total_sell = driver.find_element_by_xpath(
                '// *[ @ id = "txt_total_bid"]'
            ).text

            total_buy = driver.find_element_by_xpath(
                '//*[@id="txt_total_ask"]'
            ).text

            total_trade = float(str(total_sell).replace(",", "")) + float(str(total_buy).replace(",", ""))

            selling = [0 for _ in range(10)]
            buying = [0 for _ in range(10)]

            for num in range(1,11):
                _xpath = '//*[@id="contSellCoin"]/li['+ str(num) +']/div/p'
                bar = driver.find_element_by_xpath(
                    _xpath
                ).text

                percent = 100 * float(bar) / total_trade


                selling[num-1] = percent



            for num in range(1,11):
                _xpath = '//*[@id="contBuyCoin"]/li[' + str(num) + ']/div/p'
                bar = driver.find_element_by_xpath(
                    _xpath
                ).text

                percent = 100 * float(bar) / total_trade


                buying[num-1] = percent


            # 입실론이 랜덤값보다 작은 경우에는 랜덤한 액션을 선택하고
            # 그 이상일 경우에는 DQN을 이용해 액션을 선택합니다.
            # 초반엔 학습이 적게 되어 있기 때문입니다.
            # 초반에는 거의 대부분 랜덤값을 사용하다가 점점 줄어들어
            # 나중에는 거의 사용하지 않게됩니다.
            if np.random.rand() < epsilon:
                action = random.randrange(NUM_ACTION)

            else:
                action = brain.get_action()

            # 일정 시간이 지난 뒤 부터 입실론 값을 줄입니다.
            # 초반에는 학습이 전혀 안되어 있기 때문입니다.
            if episode > OBSERVE:
                epsilon -= 1 / 1000

            # 결정한 액션을 이용해 게임을 진행하고, 보상과 게임의 종료 여부를 받아옵니다.
            state, reward, terminal = game.step(action, selling, buying)
            total_reward += reward

            # 현재 상태를 Brain에 기억시킵니다.
            # 기억한 상태를 이용해 학습하고, 다음 상태에서 취할 행동을 결정합니다.
            brain.remember(state, action, reward, terminal)

            time.sleep(0.3)

            if time_step > OBSERVE and time_step % TRAIN_INTERVAL == 0:
                # DQN 으로 학습을 진행합니다.
                brain.train()

            if time_step % TARGET_UPDATE_INTERVAL == 0:
                # 타겟 네트웍을 업데이트 해 줍니다.
                brain.update_target_network()

            time_step += 1

            print('게임횟수: %d 점수: %d' % (episode + 1, total_reward), "({})".format(game.seq))



            total_reward_list.append(total_reward)

            if terminal == True :# 게임 종료
                print("game over!")


        if episode % 10 == 0:
            summary = sess.run(summary_merged, feed_dict={rewards: total_reward_list})
            writer.add_summary(summary, time_step)
            total_reward_list = []

        if episode % 100 == 0:
            saver.save(sess, 'model/dqn.ckpt', global_step=time_step)

示例#15

显示文件

def train(cont):
    sess = tf.Session()

    game = Game(SCREEN_WIDTH, SCREEN_HEIGHT, OBS_NUM, BUN_NUM, show_game=False)
    brain = DQN(sess, SCREEN_WIDTH, SCREEN_HEIGHT, CHANNEL, NUM_ACTION)

    rewards = tf.placeholder(tf.float32, [None])
    tf.summary.scalar('avg.reward/ep.', tf.reduce_mean(rewards))

    saver = tf.train.Saver()
    if cont:
        sess.run(tf.global_variables_initializer())

        ckpt = str(tf.train.get_checkpoint_state('model'))
        i = ckpt.find("\"") + 1
        j = ckpt.find("\"", i)
        reader = pywrap_tensorflow.NewCheckpointReader(ckpt[i:j])
        var_to_shape_map = reader.get_variable_to_shape_map()
        target_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
        for key in var_to_shape_map:
            if "conv2d" in key and "Adam" not in key:
                for key_f in target_vars:
                    if key in key_f.name:
                        sess.run(key_f.assign(reader.get_tensor(key)))
                        break


#        saver.restore(sess, ckpt.model_checkpoint_path)
    else:
        sess.run(tf.global_variables_initializer())

    writer = tf.summary.FileWriter('logs', sess.graph)
    summary_merged = tf.summary.merge_all()

    # 타겟 네트웍을 초기화합니다.
    brain.update_target_network()

    # 다음에 취할 액션을 DQN 을 이용해 결정할 시기를 결정합니다.
    epsilon = 1.0
    # 프레임 횟수
    time_step = 0
    total_reward_list = []

    for episode in range(MAX_EPISODE):
        terminal = False
        total_reward = 0

        state = game.reset()
        brain.init_state(state)

        if episode > OBSERVE:
            epsilon = 0.01

        while not terminal:
            if np.random.rand() < epsilon:
                action = random.randrange(NUM_ACTION)
            else:
                action = brain.get_action()
            epsilon += 0.00001

            state, reward, terminal = game.step(action)
            total_reward += reward

            brain.remember(state, action, reward, terminal)

            if time_step > OBSERVE and time_step % TRAIN_INTERVAL == 0:
                brain.train()

            if time_step % TARGET_UPDATE_INTERVAL == 0:
                brain.update_target_network()

            time_step += 1

        if episode % 10 == 0:
            print('Games: %d Score: %d' % (episode + 1, total_reward))

        total_reward_list.append(total_reward)

        if episode % 10 == 0:
            summary = sess.run(summary_merged,
                               feed_dict={rewards: total_reward_list})
            writer.add_summary(summary, time_step)
            total_reward_list = []

        if episode % 10000 == 0:
            saver.save(sess, 'model/dqn.ckpt', global_step=episode)