Example #1
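    # Per-agent setup (this snippet appears to come from inside a loop over tank agents):
    # build a Q-network, wrap it in a QBrain, and append a TankAgent driven by that brain.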
    model = create_model(env.observation_space.shape[-1], env.action_space.shape[-1])
    brain = QBrain(model, kind="diff", v_selectivity=False, gamma=0.99)
    brain.compile(Adam(lr=1e-3), ["mse"])
    tanks.append(TankAgent(env, brain, train_sample_size=1000))

controller = SynchronousMultiAgentController(env, tanks,
    rounds_between_train=10000, episodes_between_train=1)

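# Temperatures for the Boltzmann exploration policy, cycled round-robin during training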
taus = [2.0, 1.0, 0.1, 0.01]
ntaus = len(taus)
t = 0

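# Very low temperature -> near-greedy policy used for evaluation runs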
test_policy = BoltzmannQPolicy(0.005)

test_run_logger = RunLogger("run_log.csv")

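# Train briefly at each temperature, then evaluate (and occasionally visualize) with the test policy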
for _ in range(20000):
    for i in range(2*ntaus):
        t += 1
        tau = taus[t%ntaus]
        policy = BoltzmannQPolicy(tau)
        print("Tau=%f, training..." % (tau,))
        controller.fit(max_episodes=10, callbacks=[RunLogger(), EpisodeLogger()], policy=policy)
    print("-- Testing...")
    controller.test(max_episodes=50, callbacks=[EpisodeLogger(), test_run_logger], policy=test_policy)
    controller.test(max_episodes=5, callbacks=[Visualizer(), EpisodeLogger()], policy=test_policy)
    
Example #2
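# Same train/evaluate cycle as Example 1, but with separate RunLoggers for training and testing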
taus = [1.0, 0.1, 0.01, 0.001]
ntaus = len(taus)
t = 0

test_policy = BoltzmannQPolicy(0.0001)
#print "test policy:", test_policy
train_run_logger = RunLogger()
test_run_logger = RunLogger(run_log, loss_info_from=train_run_logger)

for _ in range(20000):

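    # Presumably seeds the agent's experience with 100 random moves before each training round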
    controller.randomMoves([agent], 100)

    for i in range(2 * ntaus):
        tau = taus[t % ntaus]
        policy = BoltzmannQPolicy(tau)
        print("-- Training with tau=%.4f..." % (tau,))
        controller.fit(max_episodes=20,
                       callbacks=[train_run_logger],
                       policy=policy)
        t += 1
    print("-- Testing...")
    controller.test(max_episodes=50,
                    callbacks=[test_run_logger],
                    policy=test_policy)
    controller.test(max_episodes=3,
                    callbacks=[Visualizer(), EpisodeLogger()],
                    policy=test_policy)
    #controller.test(max_episodes=20, callbacks=[], policy=test_policy)
Example #3
                          train_policy=self.TrainPolicy,
                          test_policy=self.TestPolicy,
                          steps_between_train=500,
                          episodes_between_train=1,
                          train_sample_size=20,
                          train_rounds=100,
                          trains_between_updates=1)

    def updateState(self, observation):
        return self.ValidActions


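# Build the LunarLander environment and a Q-network mapping its observation vector to 4 discrete actions,
# then wrap them in the Lander agent defined above.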
#env = gym.make("LunarLander-v2")
env = LunarLander()
model = create_model(env.observation_space.shape[-1], 4)
qnet = QNet(model, 0.01)
qnet.compile(Adam(lr=1e-3), ["mse"])

lander = Lander(env, qnet)

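# Training loop: alternate exploration epsilon between 0.1 and 0.5 on each outer iteration,
# fit for a few episodes, then run a single visualized test episode.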
for t in range(2000):
    lander.TrainPolicy.Epsilon = 0.1 if t % 2 == 0 else 0.5
    print("epsilon:", lander.TrainPolicy.Epsilon)
    lander.fit(max_episodes=5, callbacks=[TrainCallback(), EpisodeLogger()])
    #lander.TrainPolicy.Epsilon = max(lander.TrainPolicy.Epsilon*0.95, 0.2)
    print("QNet train_samples=", qnet.TrainSamples, "  memory age=", lander.age)
    lander.test(max_episodes=1,
                callbacks=[TestLogger(),
                           Visualizer(),
                           TrainCallback()])