def test_single_training():
    numberOfCells = 10  # in each axis
    startingPosition = (4, 5)  # head
    foodPosition = (3, 6)
    env = Environment(numberOfCells, deterministic=True)
    agent = DQNAgent(state_size=env.state_size, action_size=Actions.action_size,
                     deterministic=True, batch_size=24, memory_limit=2000)
    state = env.reset(startingPosition, foodPosition)
    agent.reset_convolutional_layers()
    full_state = agent.get_convolutional_layers(state)
    loss10 = -1
    action10 = -1
    maxsteps = 10
    for step in range(maxsteps):
        action = agent.get_exploration_action()
        next_state, reward, done = env.step(action, food_position=(1, 1))
        assert not done
        full_next_state = agent.get_convolutional_layers(next_state)
        assert full_next_state.shape == (1, numberOfCells, numberOfCells, agent.numberOfLayers)
        agent.save_transition(full_state, action, reward, full_next_state, done)
        current_loss = agent.train()
        full_state = full_next_state
        loss10 = current_loss
        action10 = action
    assert loss10 == 0.006804642267525196
    assert action10 == 0
def test_smoke():
    # just runs the code - no assertions on the results
    numberOfCells = 10  # in each axis
    startingPosition = (4, 5)  # head
    foodPosition = (3, 6)
    env = Environment(numberOfCells)
    agent = DQNAgent(state_size=env.state_size, action_size=Actions.action_size,
                     deterministic=True, batch_size=24, memory_limit=2000)
    state = env.reset(startingPosition, foodPosition)
    agent.reset_convolutional_layers()
    full_state = agent.get_convolutional_layers(state)
    maxsteps = 2
    for step in range(maxsteps):
        action = agent.get_exploration_action()
        next_state, reward, done = env.step(action, food_position=(1, 1))
        full_next_state = agent.get_convolutional_layers(next_state)
        assert full_next_state.shape == (1, numberOfCells, numberOfCells, agent.numberOfLayers)
        agent.save_transition(full_state, action, reward, full_next_state, done)
        current_loss = agent.train()
        if step == 0:
            action1 = action
            loss1 = current_loss
        full_state = full_next_state
    loss2 = current_loss
    action2 = action
def test_multiepisode_training():
    numberOfCells = 10  # in each axis
    startingPosition = (4, 5)  # head
    foodPosition = (3, 6)
    env = Environment(numberOfCells, deterministic=True)
    state_size = env.state_size
    action_size = Actions.action_size  # 3
    agent = DQNAgent(state_size=state_size, action_size=action_size,
                     deterministic=True, batch_size=24, memory_limit=2000)
    losses = [-1, -1, -1, -1]
    done = False
    episodes = 4
    maxsteps = 9
    for e in range(episodes):
        state = env.reset(startingPosition, foodPosition)
        agent.reset_convolutional_layers()
        full_state = agent.get_convolutional_layers(state)
        loss = 0
        for step in range(maxsteps):
            action = agent.get_exploration_action()
            # food generation on (1, 1) happens once over the test
            next_state, reward, done = env.step(action, food_position=(1, 1))
            full_next_state = agent.get_convolutional_layers(next_state)
            assert full_next_state.shape == (1, numberOfCells, numberOfCells, agent.numberOfLayers)
            agent.save_transition(full_state, action, reward, full_next_state, done)
            current_loss = agent.train()
            loss += current_loss
            full_state = full_next_state
        losses[e] = loss
    assert losses[0] == 3.9618697417899966
    assert losses[1] == 0.044194952584803104
    assert losses[2] == 0.1333141174982302
    assert losses[3] == 2.834151452407241
# We can use proportional or rank-based prioritized replay
# (proportional seems to be preferred by many papers).
# Simple, non-prioritized replay is also implemented.
alpha_scheduler = dqn.annealing_schedules.Constant(0.7)
beta_scheduler = dqn.annealing_schedules.Constant(0.5)
memory = dqn.experience_replay.Proportional(capacity=50000,
                                            alpha_scheduler=alpha_scheduler,
                                            beta_scheduler=beta_scheduler)
##memory = dqn.experience_replay.RankBased(capacity=50000, alpha_scheduler=alpha_scheduler, beta_scheduler=beta_scheduler)
##memory = dqn.experience_replay.Simple(capacity=50000)

# Below we add n-step learning with the parameter n_step.
# Not yet supported: frame skipping will be added in the future.
agent = DQNAgent(network=q_func,
                 observation_space=env.observation_space,
                 action_space=env.action_space,
                 action_selection=action_selection,
                 loss=loss,
                 update_target=update_target,
                 memory=memory,
                 n_step=3,
                 update_target_network_frequency=2000)

agent.train(env, num_timesteps=num_steps, render=False)

# We can save and load an agent.
# Note: Currently this only saves the weights of the network -- the entire agent
# must be recreated (or reused, as would happen here) before calling load.
##agent.save('/tmp/save_test/test')
##agent.load('/tmp/save_test/test')
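
# For reference, a minimal standalone sketch (not the library's implementation)
# of what the alpha and beta values above control in proportional prioritized
# replay, following Schaul et al. 2015. The function name and signature are
# illustrative only; `priorities` would normally be the absolute TD errors of
# the stored transitions.
import numpy as np

def proportional_sample(priorities, batch_size, alpha=0.7, beta=0.5, eps=1e-6):
    p = (np.asarray(priorities) + eps) ** alpha      # prioritization exponent alpha
    probs = p / p.sum()                              # sampling distribution P(i)
    idx = np.random.choice(len(probs), batch_size, p=probs)
    weights = (len(probs) * probs[idx]) ** (-beta)   # importance-sampling correction beta
    weights /= weights.max()                         # normalize weights for stability
    return idx, weights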
def train_snake():
    # todo: move all these parameters into a configuration file
    numberOfCells = 10  # in each axis
    startingPosition = (4, 5)  # head
    foodPosition = (3, 6)
    max_steps_allowed = 1000

    env = Environment(numberOfCells)
    state_size = env.state_size  # (numberOfCells x numberOfCells)
    action_size = Actions.action_size  # 3
    agent = DQNAgent(state_size=state_size, action_size=action_size,
                     batch_size=32, memory_limit=6000, number_of_channels=5)

    episodes = 30000
    decay = 0.9 / episodes * 2  # changes epsilon: explore vs exploit
    epochs = []
    losses = []
    steps_list = []

    with open('training_data', 'w') as f:
        for e in range(episodes):
            state = env.reset(startingPosition)
            #print('state array reset: \n', state)
            agent.reset_convolutional_layers()
            full_state = agent.get_convolutional_layers(state)
            loss = 0.0
            steps = 0
            done = False
            episode_reward = 0
            while not done:
                # state at this point is just a 2D array
                action = agent.get_action(full_state)
                #action = agent.get_raction()
                #print('action chosen: ', action)

                # step onto the next state
                next_state, reward, done = env.step(action)
                #print('state array after step ', steps, ' : \n', next_state)
                #print('reward returned: ', reward)
                #print('next state: ', next_state)

                # we store the next_state in (1, H, W, C)
                full_next_state = agent.get_convolutional_layers(next_state)
                #print('full next state: \n:', full_next_state)
                #assert(full_next_state.shape == (1, numberOfCells, numberOfCells, agent.numberOfLayers))

                # save S, A, R, S' to experience
                # full states are a snapshot - copies of the state
                agent.save_transition(full_state, action, reward, full_next_state, done)
                episode_reward += reward

                # use alternative policy to train model - rely on experience only
                current_loss = agent.train()
                #print('current_loss: ', current_loss)
                loss += current_loss
                full_state = full_next_state

                # limit max steps - avoid something bad
                steps += 1
                if steps >= max_steps_allowed:
                    done = True

            # next episode
            if agent.epsilon > 0.1:
                agent.epsilon -= decay  # agent slowly reduces exploring

            print('episode: {:5d} steps: {:3d} epsilon: {:.3f} loss: {:8.4f} reward: {:3d} fruits: {:2d}'
                  .format(e, steps, agent.epsilon, loss, episode_reward, env.fruits_eaten))
            f.write('{:5d} {:3d} {:8.4f} {:4d} {:2d}\n'.format(
                e, steps, loss, episode_reward, env.fruits_eaten))

    agent.model.save('trained_snake.model')
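
# A hypothetical sketch for the configuration-file todo above, using only the
# standard-library json module. The key names and defaults mirror the constants
# currently hard-coded in train_snake(); the file name 'snake_config.json' and
# the helper load_config() are assumptions, not part of the existing code.
import json

DEFAULTS = {
    "number_of_cells": 10,
    "starting_position": [4, 5],
    "food_position": [3, 6],
    "max_steps_allowed": 1000,
    "batch_size": 32,
    "memory_limit": 6000,
    "number_of_channels": 5,
    "episodes": 30000,
}

def load_config(path="snake_config.json"):
    try:
        with open(path) as f:
            # values from the file override the built-in defaults
            return {**DEFAULTS, **json.load(f)}
    except FileNotFoundError:
        return dict(DEFAULTS)  # fall back to the defaults above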
help="Please specify the agent you wish to use, either DQN or A3C", required=True) parser.add_argument( "-n", "--mode", type=str, action='store', help="Please specify the mode you wish to run, either train or eval", required=True) args = parser.parse_args() print(args) if args.model == 'DQN': agent = DQNAgent() if args.mode == 'train': agent.train() if args.mode == 'eval': agent.Evaluate() if args.model == 'A3C': agent = A3CGlobalAgent() if args.mode == 'train': agent.train() if args.mode == 'eval': agent.Evaluate()