# Tanks demo: build one TankAgent around a diff-mode QBrain, wire it into a
# synchronous multi-agent controller, then alternate Boltzmann training at a
# cycling ladder of temperatures with near-greedy test runs.
tanks = []      # agents driven by the controller (a single tank here)
model = create_model(env.observation_space.shape[-1], env.action_space.shape[-1])
brain = QBrain(model, kind="diff", v_selectivity=False, gamma=0.99)
brain.compile(Adam(lr=1e-3), ["mse"])
tanks.append(TankAgent(env, brain, train_sample_size=1000))

controller = SynchronousMultiAgentController(env, tanks,
        rounds_between_train=10000, episodes_between_train=1)

taus = [2.0, 1.0, 0.1, 0.01]            # exploration temperatures, cycled during training
ntaus = len(taus)
t = 0
test_policy = BoltzmannQPolicy(0.005)   # near-greedy policy for evaluation
test_run_logger = RunLogger("run_log.csv")

for _ in range(20000):
    for i in range(2 * ntaus):
        t += 1
        tau = taus[t % ntaus]
        policy = BoltzmannQPolicy(tau)
        print "Tau=%f, training..." % (tau,)
        controller.fit(max_episodes=10, callbacks=[RunLogger(), EpisodeLogger()], policy=policy)
    print "-- Testing..."
    controller.test(max_episodes=50, callbacks=[EpisodeLogger(), test_run_logger], policy=test_policy)
    controller.test(max_episodes=5, callbacks=[Visualizer(), EpisodeLogger()], policy=test_policy)
# Variant: a lower temperature ladder, plus 100 random moves at the start of
# every outer iteration before the temperature-cycling training passes.
# (run_log and agent are defined earlier in the full script, not shown here.)
taus = [1.0, 0.1, 0.01, 0.001]
ntaus = len(taus)
t = 0
test_policy = BoltzmannQPolicy(0.0001)
#print "test policy:", test_policy
train_run_logger = RunLogger()
test_run_logger = RunLogger(run_log, loss_info_from=train_run_logger)

for _ in range(20000):
    controller.randomMoves([agent], 100)
    for i in range(2 * ntaus):
        tau = taus[t % ntaus]
        policy = BoltzmannQPolicy(tau)
        print "-- Training with tau=%.4f..." % (tau,)
        controller.fit(max_episodes=20, callbacks=[train_run_logger], policy=policy)
        t += 1
    print "-- Testing..."
    controller.test(max_episodes=50, callbacks=[test_run_logger], policy=test_policy)
    controller.test(max_episodes=3, callbacks=[Visualizer(), EpisodeLogger()], policy=test_policy)
    #controller.test(max_episodes=20, callbacks=[], policy=test_policy)
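# Both loops above drive exploration by cycling BoltzmannQPolicy through a
# ladder of temperatures, then evaluating with a near-zero temperature.
# For reference, a Boltzmann Q-policy samples actions with probability
# proportional to exp(Q/tau): a large tau is close to uniform exploration,
# a tiny tau is close to greedy. A minimal NumPy sketch (an illustration of
# the idea, not this library's implementation):
import numpy as np

def boltzmann_action(q_values, tau):
    # shift by the max Q-value for numerical stability before exponentiating
    prefs = np.exp((np.asarray(q_values) - np.max(q_values)) / tau)
    probs = prefs / prefs.sum()
    return np.random.choice(len(prefs), p=probs)

q = np.array([1.0, 1.5, 0.2])
print boltzmann_action(q, 2.0)     # high temperature: near-uniform action choice
print boltzmann_action(q, 0.01)    # low temperature: almost always argmax(q)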
# LunarLander demo. The excerpt starts with the tail of the agent's constructor
# call (its training/testing configuration) and its updateState method.
        train_policy=self.TrainPolicy, test_policy=self.TestPolicy,
        steps_between_train=500, episodes_between_train=1,
        train_sample_size=20, train_rounds=100,
        trains_between_updates=1)

    def updateState(self, observation):
        return self.ValidActions

#env = gym.make("LunarLander-v2")
env = LunarLander()
model = create_model(env.observation_space.shape[-1], 4)
qnet = QNet(model, 0.01)
qnet.compile(Adam(lr=1e-3), ["mse"])
lander = Lander(env, qnet)

for t in range(2000):
    # alternate between a mostly-greedy (0.1) and a heavily exploratory (0.5) epoch
    lander.TrainPolicy.Epsilon = 0.1 if t % 2 == 0 else 0.5
    print "epsilon:", lander.TrainPolicy.Epsilon
    lander.fit(max_episodes=5, callbacks=[TrainCallback(), EpisodeLogger()])
    #lander.TrainPolicy.Epsilon = max(lander.TrainPolicy.Epsilon*0.95, 0.2)
    print "QNet train_samples=", qnet.TrainSamples, " memory age=", lander.age
    lander.test(max_episodes=1, callbacks=[TestLogger(), Visualizer(), TrainCallback()])
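# The LunarLander loop toggles TrainPolicy.Epsilon between 0.1 and 0.5 on
# alternating epochs rather than annealing it. For reference, a generic
# epsilon-greedy selector (an illustration, not this library's policy class):
import random
import numpy as np

def epsilon_greedy_action(q_values, epsilon):
    # with probability epsilon pick a uniformly random action,
    # otherwise pick the greedy (argmax-Q) action
    if random.random() < epsilon:
        return random.randrange(len(q_values))
    return int(np.argmax(q_values))

q = np.array([0.1, 0.7, -0.3, 0.2])
print epsilon_greedy_action(q, 0.5)   # exploratory epoch
print epsilon_greedy_action(q, 0.1)   # mostly-greedy epoch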