# Play one episode: act epsilon-greedily, record (state, action) pairs,
# then train the MLP depending on whether the episode was won (score 500).
while not done:
    if random.random() < exploration_rate:
        # Explore: pick a random action (binary action space).
        action = random.randint(0, 1)
    else:
        # Exploit: pick the action with the highest network output.
        actions = mlp.frontprop(normalize(np.array(state)))
        action = np.argmax(actions)
        # action = np.argmin(actions)
    short_mem.append((state, action))
    state, reward, done, _ = env.step(action)
    if render:
        env.render()
    score += 1

# If win then learn short mem
if score == 500:
    # Reinforce every action taken: one-hot target for the chosen action.
    for state, action in short_mem:
        mlp.backprop(normalize(np.array(state)),
                     np.array([1, 0] if action == 0 else [0, 1]))
        mlp.fit()
        # long_mem.append((state, action))
# If lose, don't
else:
    # Replay the episode backwards, punishing each chosen action.
    # BUG FIX: the original looped on a `pop` flag that was never cleared,
    # so short_mem.pop() raised IndexError once the memory emptied.
    # Loop while short_mem is non-empty instead, terminating cleanly.
    while short_mem:
        state, action = short_mem.pop()
        actions = mlp.frontprop(normalize(np.array(state)))
        # Action chosen is bad: zero out its target, keep the other
        # output as-is so only the bad choice is suppressed.
        if action == 0:
            expected = np.array([0, actions[1]])
        else:
            expected = np.array([actions[0], 0])
        # Learn corrected action
        mlp.backprop(normalize(np.array(state)), expected)