Example #1

The snippet plays episodes in a Gym-style environment with two discrete actions (the 500-step win condition matches CartPole-v1): actions are chosen epsilon-greedily, every (state, action) pair is stored in short_mem, a perfect episode is reinforced step by step, and a failed one is unlearned in reverse order.
# Play one episode, recording every (state, action) pair
while not done:
    if random.random() < exploration_rate:
        # Explore: random action (0 or 1)
        action = random.randint(0, 1)
    else:
        # Exploit: take the action the network rates highest
        actions = mlp.frontprop(normalize(np.array(state)))
        action = np.argmax(actions)
        # action = np.argmin(actions)
    short_mem.append((state, action))
    state, reward, done, _ = env.step(action)  # classic 4-tuple Gym API
    if render: env.render()
    score += 1
# If we won (a full 500-step episode), reinforce every action taken
if score == 500:
    for state, action in short_mem:
        # Target is one-hot on the action that was actually chosen
        mlp.backprop(normalize(np.array(state)),
                     np.array([1, 0] if action == 0 else [0, 1]))
        mlp.fit()
        # long_mem.append((state, action))
# If we lost, unlearn the actions that led to the failure
else:
    while short_mem:  # pop every stored step, most recent first
        state, action = short_mem.pop()
        actions = mlp.frontprop(normalize(np.array(state)))
        # The chosen action was bad: drive its output toward 0,
        # keep the other output at its current value
        if action == 0:
            expected = np.array([0, actions[1]])
        else:
            expected = np.array([actions[0], 0])
        # Learn the corrected target
        mlp.backprop(normalize(np.array(state)), expected)
        mlp.fit()  # assuming, as in the win branch, fit() applies what backprop() accumulated
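
The fragment references several names defined elsewhere. Below is a minimal sketch of the scaffolding it appears to assume: a CartPole-v1 environment under the classic Gym API, a normalize helper, and a tiny one-hidden-layer network exposing the frontprop/backprop/fit interface the fragment calls. Everything here (SimpleMLP, the normalization bounds, the hyperparameters) is a hypothetical reconstruction, not the author's actual code.

# Hypothetical scaffolding for the fragment above; all names and
# hyperparameters here are assumptions, not the author's code.
import random
import numpy as np
import gym  # classic Gym (<0.26), matching the 4-tuple env.step() above

# Rough bounds for the four CartPole observations, used to squash states
# into [-1, 1] (assumed behaviour of the fragment's normalize()).
BOUNDS = np.array([2.4, 3.0, 0.21, 3.0])

def normalize(state):
    return np.clip(state / BOUNDS, -1.0, 1.0)

class SimpleMLP:
    # One hidden layer; backprop() accumulates gradients and fit() applies
    # them, which is one way to read the fragment's backprop/fit pattern.
    def __init__(self, n_in=4, n_hidden=16, n_out=2, lr=0.1):
        rng = np.random.default_rng(0)
        self.w1 = rng.normal(0.0, 0.5, (n_in, n_hidden))
        self.b1 = np.zeros(n_hidden)
        self.w2 = rng.normal(0.0, 0.5, (n_hidden, n_out))
        self.b2 = np.zeros(n_out)
        self.lr = lr
        self._grads = []

    def frontprop(self, x):
        # Forward pass: tanh hidden layer, sigmoid output layer
        self._h = np.tanh(x @ self.w1 + self.b1)
        self._out = 1.0 / (1.0 + np.exp(-(self._h @ self.w2 + self.b2)))
        return self._out

    def backprop(self, x, target):
        # Squared-error loss through the sigmoid output layer;
        # gradients are stored, not applied
        out = self.frontprop(x)
        d_out = (out - target) * out * (1.0 - out)
        d_h = (self.w2 @ d_out) * (1.0 - self._h ** 2)
        self._grads.append((np.outer(x, d_h), d_h,
                            np.outer(self._h, d_out), d_out))

    def fit(self):
        # Apply and clear whatever backprop() accumulated
        for d_w1, d_b1, d_w2, d_b2 in self._grads:
            self.w1 -= self.lr * d_w1
            self.b1 -= self.lr * d_b1
            self.w2 -= self.lr * d_w2
            self.b2 -= self.lr * d_b2
        self._grads.clear()

mlp = SimpleMLP()
env = gym.make("CartPole-v1")
exploration_rate, render = 0.1, False
state = env.reset()
short_mem, score, done = [], 0, False
# The fragment above would run from here, typically once per training episode.

Under this reading, backprop() only records a gradient and nothing is learned until fit() runs, which is why each backprop() call in the fragment is paired with a fit().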