# DQN baseline on an OpenAI Gym environment, with a CNTK or Chainer backend.
# EPOCH_SIZE, visualize_training and the malmopy imports used below
# (GymEnvironment, TemporalMemory, QLearnerAgent) are defined at module level
# and are not shown in this snippet.
def run_experiment(environment, backend, device_id, max_epoch, record, logdir, visualizer):
    env = GymEnvironment(environment, monitoring_path=logdir if record else None)

    # Build the Q-network for the requested backend.
    if backend == 'cntk':
        from malmopy.model.cntk import QNeuralNetwork as CntkDQN
        model = CntkDQN((4, 84, 84), env.available_actions, momentum=0.95,
                        device_id=device_id, visualizer=visualizer)
    else:
        from malmopy.model.chainer import DQNChain, QNeuralNetwork as ChainerDQN
        chain = DQNChain((4, 84, 84), env.available_actions)
        target_chain = DQNChain((4, 84, 84), env.available_actions)
        model = ChainerDQN(chain, target_chain, momentum=0.95, device_id=device_id)

    memory = TemporalMemory(1000000, model.input_shape[1:])
    agent = QLearnerAgent("DQN Agent", env.available_actions, model, memory,
                          0.99, 32, train_after=10000,
                          reward_clipping=(-1, 1), visualizer=visualizer)

    state = env.reset()
    reward = 0
    agent_done = False
    viz_rewards = []

    max_training_steps = max_epoch * EPOCH_SIZE
    for step in range(1, max_training_steps + 1):

        # check if env needs reset
        if env.done:
            visualize_training(visualizer, step, viz_rewards)
            agent.inject_summaries(step)
            viz_rewards = []
            state = env.reset()

        # select an action
        action = agent.act(state, reward, agent_done, is_training=True)
        # take a step
        state, reward, agent_done = env.do(action)
        viz_rewards.append(reward)

        # checkpoint the model once per epoch
        if (step % EPOCH_SIZE) == 0:
            model.save('%s-%s-dqn_%d.model' % (backend, environment, step // EPOCH_SIZE))
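

# ---------------------------------------------------------------------------
# Minimal sketch of how run_experiment above might be invoked. The flag names,
# defaults and the bare `visualizer=None` are assumptions for illustration
# only; the original script wires up its own argument parser and a concrete
# malmopy visualizer (e.g. console or TensorBoard).
def _example_cli():
    from argparse import ArgumentParser

    parser = ArgumentParser(description='DQN experiment on a Gym environment (sketch)')
    parser.add_argument('-e', '--environment', default='BreakoutDeterministic-v4')
    parser.add_argument('-b', '--backend', choices=['cntk', 'chainer'], default='cntk')
    parser.add_argument('-d', '--device', type=int, default=-1, help='GPU id, -1 for CPU')
    parser.add_argument('-n', '--epochs', type=int, default=50)
    parser.add_argument('-r', '--record', action='store_true')
    parser.add_argument('-l', '--logdir', default='logs')
    args = parser.parse_args()

    run_experiment(args.environment, args.backend, args.device, args.epochs,
                   args.record, args.logdir, visualizer=None)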
# Pig Chase agent factory: role 0 runs the (non-learning) challenge agent on a
# symbolic state, role 1 trains a DQN learner on 84x84 visual frames. The
# helpers used here (parse_clients_args, the PigChase* classes, TemporalMemory,
# LinearEpsilonGreedyExplorer, RandomAgent, EPOCH_SIZE, visualize_training,
# six) are module-level imports/constants not shown in this snippet.
def agent_factory(name, role, clients, backend, device, max_epochs, logdir, visualizer):

    assert len(clients) >= 2, 'Not enough clients (need at least 2)'
    clients = parse_clients_args(clients)

    if role == 0:
        builder = PigChaseSymbolicStateBuilder()
        env = PigChaseEnvironment(clients, builder, role=role,
                                  randomize_positions=True)

        agent = PigChaseChallengeAgent(name)
        if type(agent.current_agent) == RandomAgent:
            agent_type = PigChaseEnvironment.AGENT_TYPE_1
        else:
            agent_type = PigChaseEnvironment.AGENT_TYPE_2

        obs = env.reset(agent_type)
        reward = 0
        agent_done = False

        while True:
            if env.done:
                # re-check which sub-agent the challenge agent is currently
                # using, and report the matching agent type on reset
                if type(agent.current_agent) == RandomAgent:
                    agent_type = PigChaseEnvironment.AGENT_TYPE_1
                else:
                    agent_type = PigChaseEnvironment.AGENT_TYPE_2

                obs = env.reset(agent_type)
                while obs is None:
                    # this can happen if the episode ended with the first
                    # action of the other agent
                    print('Warning: received obs == None.')
                    obs = env.reset(agent_type)

            # select an action
            action = agent.act(obs, reward, agent_done, is_training=True)
            # take a step
            obs, reward, agent_done = env.do(action)

    else:
        env = PigChaseEnvironment(clients, MalmoALEStateBuilder(),
                                  role=role, randomize_positions=True)
        memory = TemporalMemory(100000, (84, 84))

        # Build the Q-network for the requested backend.
        if backend == 'cntk':
            from malmopy.model.cntk import QNeuralNetwork
            model = QNeuralNetwork((memory.history_length, 84, 84),
                                   env.available_actions, device)
        else:
            from malmopy.model.chainer import QNeuralNetwork, DQNChain
            chain = DQNChain((memory.history_length, 84, 84), env.available_actions)
            target_chain = DQNChain((memory.history_length, 84, 84), env.available_actions)
            model = QNeuralNetwork(chain, target_chain, device)

        explorer = LinearEpsilonGreedyExplorer(1, 0.1, 1000000)
        agent = PigChaseQLearnerAgent(name, env.available_actions,
                                      model, memory, 0.99, 32, 50000,
                                      explorer=explorer, visualizer=visualizer)

        obs = env.reset()
        reward = 0
        agent_done = False
        viz_rewards = []

        max_training_steps = EPOCH_SIZE * max_epochs
        for step in six.moves.range(1, max_training_steps + 1):

            # check if env needs reset
            if env.done:
                visualize_training(visualizer, step, viz_rewards)
                agent.inject_summaries(step)
                viz_rewards = []

                obs = env.reset()
                while obs is None:
                    # this can happen if the episode ended with the first
                    # action of the other agent
                    print('Warning: received obs == None.')
                    obs = env.reset()

            # select an action
            action = agent.act(obs, reward, agent_done, is_training=True)
            # take a step
            obs, reward, agent_done = env.do(action)
            viz_rewards.append(reward)

            # checkpoint the learner's model once per epoch
            if (step % EPOCH_SIZE) == 0:
                if 'model' in locals():
                    model.save('pig_chase-dqn_%d.model' % (step // EPOCH_SIZE))
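

# ---------------------------------------------------------------------------
# Sketch of how agent_factory above might be driven. Pig Chase is a two-agent
# task, so one process is typically started per role: role 0 runs the
# challenge (non-learning) agent, role 1 runs the DQN learner. The client
# endpoints, agent names, backend, epoch count and log directory below are
# assumptions for illustration only.
def _example_launch():
    from multiprocessing import Process

    clients = ['127.0.0.1:10000', '127.0.0.1:10001']
    processes = []
    for role, name in enumerate(['Agent_1', 'Agent_2']):
        p = Process(target=agent_factory,
                    args=(name, role, clients, 'cntk', -1, 5,
                          'results/pig_chase', None))
        # role 0 loops forever, so run both as daemons and wait on the learner
        p.daemon = True
        p.start()
        processes.append(p)

    processes[1].join()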
# DQN baseline on a Malmo maze environment. `mission`, `environment`,
# EPOCH-level helpers such as visualize_training, and the malmopy imports
# (MazeEnvironment, TemporalMemory, QLearnerAgent) are assumed to be defined
# at module level and are not shown in this snippet.
def run_experiment(backend, device_id, max_epoch, record, clients, logdir, visualizer):
    env = MazeEnvironment(mission, [str.split(client, ':') for client in clients])
    env.recording = False

    # Build the Q-network for the requested backend.
    if backend == 'cntk':
        from malmopy.model.cntk import QNeuralNetwork as CntkDQN
        model = CntkDQN((4, 84, 84), env.available_actions, momentum=0.95,
                        device_id=device_id, visualizer=visualizer)
    else:
        from malmopy.model.chainer import DQNChain, QNeuralNetwork as ChainerDQN
        chain = DQNChain((4, 84, 84), env.available_actions)
        target_chain = DQNChain((4, 84, 84), env.available_actions)
        model = ChainerDQN(chain, target_chain, momentum=0.95, device_id=device_id)

    memory = TemporalMemory(1000000, model.input_shape[1:])
    # the agent takes random (exploratory) actions until `train_after` steps
    # have been collected
    agent = QLearnerAgent("DQN Agent", env.available_actions, model, memory,
                          0.99, 32, train_after=10000,
                          reward_clipping=(-1, 1), visualizer=visualizer)

    # note: max_epoch is ignored here; the step budget is hard-coded
    EPOCH_SIZE = 250000
    max_training_steps = 50 * EPOCH_SIZE

    state = env.reset()
    reward = 0
    agent_done = False
    viz_rewards = []

    for step in range(1, max_training_steps + 1):

        # check if env needs reset
        if env.done:
            visualize_training(visualizer, step, viz_rewards)
            agent.inject_summaries(step)
            viz_rewards = []
            state = env.reset()

        # select an action
        action = agent.act(state, reward, agent_done, is_training=True)
        if isinstance(action, int):
            print('ACTION BEING TAKEN: ', action)
        else:
            # numpy scalar/array action; .item() replaces the deprecated
            # np.asscalar
            print('ACTION BEING TAKEN: ', action.item())

        # take a step
        state, reward, agent_done = env.do(action)
        viz_rewards.append(reward)

        # checkpoint the model once per epoch
        if (step % EPOCH_SIZE) == 0:
            model.save('%s-%s-dqn_%d.model' % (backend, environment, step // EPOCH_SIZE))
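

# ---------------------------------------------------------------------------
# All three training loops above call a visualize_training(...) helper that is
# not shown in this snippet. The sketch below illustrates what such a helper
# plausibly does, assuming the visualizer exposes an add_entry(step, tag, value)
# method; the real helper may differ in names and in the statistics it logs.
def _visualize_training_sketch(visualizer, step, rewards, tag='training'):
    if visualizer is None or len(rewards) == 0:
        return
    # summarize the episode that just finished
    visualizer.add_entry(step, '%s/reward per episode' % tag, sum(rewards))
    visualizer.add_entry(step, '%s/max reward' % tag, max(rewards))
    visualizer.add_entry(step, '%s/min reward' % tag, min(rewards))
    visualizer.add_entry(step, '%s/actions per episode' % tag, len(rewards))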