##################
scores = []
agent = ReinforceAgent(model, actions=ACTIONS, noise=EXPLORE_RATE)
for _ in range(TEST_EPISODES_PER_EPOCH):
  env.reset()
  agent.reset()
  # play one episode with the current policy and record its transitions
  replay = []
  done = False
  while not done:
    action = agent.process(env.state)
    _, reward, done, prevState = env.apply(action)
    replay.append((prevState, action, reward))
  ##
  # REINFORCE update: fit the policy on the episode's states, chosen actions
  # and their discounted returns
  states, actions, rewards = zip(*replay)
  actions = ACTIONS.toIndex(actions)
  trainable.fit(
    [np.array(states), np.array(actions), np.array(discountedReturns(rewards, GAMMA))],
    epochs=1, verbose=0
  )
  ##
  scores.append(env.score)
Utils.trackScores(scores, metrics)
##################
EXPLORE_RATE = max((0.001, EXPLORE_RATE * EXPLORE_RATE_DECAY))
plotData2file(metrics, 'chart.jpg')
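# NOTE: discountedReturns() is used above but not defined in this snippet.
# A minimal sketch of what it is assumed to compute: the discounted
# reward-to-go G_t = r_t + GAMMA * r_{t+1} + GAMMA^2 * r_{t+2} + ...
def discountedReturns(rewards, gamma):
  returns = []
  G = 0.0
  for r in reversed(rewards):  # accumulate from the end of the episode backwards
    G = r + gamma * G
    returns.insert(0, G)
  return returns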
TRAIN_EPISODES = 200
TEST_EPISODES = 1
EPOCHS = 100
NOISE_STD = 0.1
NOISE_STD_DECAY = 0.99
TAU = 0.005

memory = CebLinear(maxSize=50000, sampleWeight='same')
metrics = {}
for epoch in range(EPOCHS):
  print('Start of %d epoch. Noise std: %.3f' % (epoch, NOISE_STD))
  ##################
  # collect fresh experience: run the raw-action agent with exploration noise
  print('Testing...')
  scores = Utils.testAgent(
    RawActionAgent(model, processor=addNoise(NOISE_STD)),
    memory, TEST_EPISODES, env=RawPendulumEnvironment
  )
  Utils.trackScores(scores, metrics)
  ##################
  # train model
  lossesActor = []
  lossesCritic = []
  for _ in range(TRAIN_EPISODES):
    states, actions, rewards, nextStates, nextStateScoreMultiplier = memory.sampleBatch(BATCH_SIZE)
    nextStateScoreMultiplier = tf.convert_to_tensor(nextStateScoreMultiplier * GAMMA, dtype=tf.float32)
    rewards = tf.convert_to_tensor(rewards, dtype=tf.float32)
    lossCritic, lossActor = model.fit(states, actions, rewards, nextStates,
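# NOTE: TAU is declared above but not used in the visible part of this snippet.
# In actor-critic setups it usually parameterizes a Polyak (soft) update of the
# target networks; a sketch of that update, assuming Keras-style
# get_weights()/set_weights() and a hypothetical softUpdate() helper:
def softUpdate(targetModel, onlineModel, tau):
  mixed = [
    tau * w + (1.0 - tau) * tw
    for w, tw in zip(onlineModel.get_weights(), targetModel.get_weights())
  ]
  targetModel.set_weights(mixed)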
for epoch in range(EPOCHS):
  print('Start of %d epoch. Explore rate: %.3f' % (epoch, EXPLORE_RATE))
  # for stability, train against a frozen copy of the network (target network)
  modelClone.set_weights(model.get_weights())
  lossSum = 0
  for _ in range(TRAIN_EPISODES):
    states, actions, rewards, nextStates, nextStateScoreMultiplier = memory.sampleBatch(
      batch_size=BATCH_SIZE, maxSamplesFromEpisode=16
    )
    actions = ACTIONS.toIndex(actions)
    # Q-learning targets: r + GAMMA * max_a Q_target(s', a),
    # with the next-state term scaled by nextStateScoreMultiplier
    futureScores = modelClone.predict(nextStates).max(axis=-1) * nextStateScoreMultiplier
    targets = modelClone.predict(states)
    targets[np.arange(len(targets)), actions] = rewards + futureScores * GAMMA
    lossSum += model.fit(states, targets, epochs=1, verbose=0).history['loss'][0]

  print('Avg. train loss: %.4f' % (lossSum / TRAIN_EPISODES))
  ##################
  print('Testing...')
  scores = Utils.testAgent(
    DQNAgent(model, actions=ACTIONS, exploreRate=EXPLORE_RATE),
    memory, TEST_EPISODES
  )
  Utils.trackScores(scores, metrics)
  ##################
  if (epoch % 10) == 0:  # debug
    Utils.showAgentPlay(DQNAgent(model, actions=ACTIONS, exploreRate=0))
  ##################
  EXPLORE_RATE = max((0.001, EXPLORE_RATE * EXPLORE_RATE_DECAY))
plotData2file(metrics, 'chart.jpg')
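# NOTE: the DQNAgent class is not shown here; its exploreRate parameter is
# assumed to drive epsilon-greedy action selection. A minimal, hypothetical
# sketch of that policy (not necessarily the project's actual implementation):
def epsilonGreedyAction(model, state, numActions, exploreRate):
  if np.random.random() < exploreRate:
    return np.random.randint(numActions)        # explore: random action index
  qValues = model.predict(state[None, ...])[0]  # exploit: greedy w.r.t. predicted Q-values
  return int(np.argmax(qValues))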
STEPS_PER_EPISODE = 200
BOOTSTRAPPED_STEPS = 10
metrics = {}

env = PendulumEnvironment()
memory = EB.CebLinear(
  maxSize=10 * TEST_EPISODES * STEPS_PER_EPISODE, sampleWeight='abs'
)
# curiosity (intrinsic reward) model fed to the replay processor
curiosityModel = CCuriosityIRWatched(CCuriosityIR(layersSizes=[10, 10, 10]))
processor = replayProcessor(curiosityModel, rewardScale=1.0 / BOOTSTRAPPED_STEPS, normalize=True)

# collect random experience
for episodeN in range(2):
  Utils.testAgent(RandomAgent(low=-1, high=1), memory, episodes=100, processor=processor)
print('random experience collected')
####################
model = createFatModel(input_shape=(3,), output_size=ACTIONS.N)
model.compile(
  optimizer=tf.optimizers.Adam(lr=1e-4),
  loss=tf.keras.losses.Huber(delta=1.0)
)
ghostNetwork = GhostNetwork(model, mixer='hard')

for epoch in range(EPOCHS):
  print('Start of %d epoch. Explore rate: %.3f' % (epoch, EXPLORE_RATE))
  ##################
  # Training
  ghostNetwork.update()
  trainLoss = train(
    ghostNetwork, memory, {
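# NOTE: how BOOTSTRAPPED_STEPS is consumed is not shown in this snippet. A sketch
# of the usual N-step bootstrapped target it is assumed to feed, with hypothetical
# names (not the project's actual API):
def nStepTarget(rewards, bootstrapValue, gamma):
  # rewards: the N collected rewards r_t .. r_{t+N-1}
  # bootstrapValue: max_a Q(s_{t+N}, a) from the target ("ghost") network
  G = bootstrapValue
  for r in reversed(rewards):
    G = r + gamma * G
  return G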