def __init__(self, mission_type, port, addr):
    self.env = malmoenv.make()
    self.mission_type = mission_type
    mission_param = self.load_mission_param(self.mission_type)
    # Discrete movement plus attack/use; strafe and attack-toggle variants
    # are deliberately left out of the action set.
    self.actions = ["movenorth", "movesouth", "movewest", "moveeast",
                    "attack", "use"]
    self.observation_space = spaces.Box(0, 13, shape=(1, 1, 9, 9))
    self.state_map = mission_param["state_map"]
    self.entity_map = mission_param["entity_map"]
    self.relevant_entities = mission_param["relevant_entities"]
    self.goal = mission_param["goal"]
    self.step_cost = mission_param["step_cost"]
    self.goal_reward = mission_param["goal_reward"]
    self.max_steps = mission_param["max_steps"]
    self.port = port
    self.addr = addr
    self.episode = 0
    mission = self.get_mission_xml(self.mission_type)
    self.env.init(mission, server=addr, port=self.port, exp_uid="test",
                  role=0, episode=self.episode, action_filter=self.actions)
    self.action_space = self.env.action_space

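# A minimal usage sketch for the wrapper above (hedged): the class name
# MalmoGridEnv and the "maze" mission type are illustrative assumptions, and
# the wrapper's own step()/reset() are not shown here, so this drives the
# inner malmoenv environment directly.
env = MalmoGridEnv(mission_type="maze", port=9000, addr="127.0.0.1")
obs = env.env.reset()
done = False
while not done:
    # Sample from the filtered discrete action space set up in __init__.
    obs, reward, done, info = env.env.step(env.action_space.sample())
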
def run(role):
    env = malmoenv.make()
    env.init(xml, args.port,
             server=args.server,
             server2=args.server2, port2=(args.port + role),
             role=role,
             exp_uid=args.experimentUniqueId,
             episode=args.episode, resync=args.resync)

    def log(message):
        print('[' + str(role) + '] ' + message)

    for r in range(args.episodes):
        log("reset " + str(r))
        env.reset()
        steps = 0
        done = False
        while not done:
            steps += 1
            action = env.action_space.sample()
            log(str(steps) + " action: " + str(action))
            obs, reward, done, info = env.step(action)
            # Uncomment to inspect the transition:
            # log("reward: " + str(reward))
            # log("done: " + str(done))
            # log("info: " + str(info))
            # log("obs: " + str(obs))
            time.sleep(.05)
    env.close()

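# A sketch of how run(role) is typically driven for a multi-agent mission:
# one thread per role, mirroring the MalmoEnv multi-agent examples. The role
# count of 2 is an illustrative assumption.
import threading

threads = [threading.Thread(target=run, args=(role,)) for role in range(2)]
for t in threads:
    t.start()
for t in threads:
    t.join()
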
def create_env(config):
    env = malmoenv.make()
    # Offset the command port by the RLlib worker index so each parallel
    # rollout worker talks to its own Malmo instance.
    env.init(xml, COMMAND_PORT + config.worker_index, reshape=True)
    env.reward_range = (-float('inf'), float('inf'))
    env = DownsampleObs(env, shape=(84, 84))
    return env

def create_env(config):
    env = malmoenv.make()
    env.init(xml, COMMAND_PORT, reshape=True)
    env.reward_range = (-float('inf'), float('inf'))
    # env = ScreenCapturer(env)
    env = DownsampleObs(env, shape=(84, 84))
    return env

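# A sketch of wiring these creators into RLlib (register_env is Ray Tune's
# registration API; the "malmo" name and trainer config are illustrative
# assumptions). The worker_index variant above relies on RLlib passing an
# EnvContext as `config`, which carries the rollout worker's index.
from ray.tune.registry import register_env

register_env("malmo", create_env)
# e.g. tune.run("PPO", config={"env": "malmo", "num_workers": 1})
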
def create_env(args):
    # Example environment creator.
    mission_xml = Path(args.mission).read_text()
    env = malmoenv.make()
    env.init(mission_xml, args.port,
             server=args.server,
             server2=args.server2, port2=args.port2,
             role=args.role,
             exp_uid=args.experimentUniqueId,
             episode=args.episode,
             resync=args.resync,
             reshape=True)
    return env

def env_factory(agent_id, xml, role, host_address, host_port,
                command_address, command_port):
    env = malmoenv.make()
    env.init(xml, host_port,
             server=host_address,
             server2=command_address, port2=command_port,
             role=role,
             exp_uid="multiagent",
             reshape=True)
    env = DownsampleObs(env, shape=(84, 84))
    return env

def env_factory(agent_id, xml, role, host_address, host_port,
                command_address, command_port):
    env = malmoenv.make()
    env.init(xml, host_port,
             server=host_address,
             server2=command_address, port2=command_port,
             role=role,
             exp_uid="multiagent",
             reshape=True)
    env = FrameStack(env, FRAME_STACK)
    # Keep the Minecraft client ticking between policy decisions by sending
    # an idle action while the agent is deciding.
    env = malmoenv.SyncEnv(env, idle_action=4, idle_delay=0.02)
    env = TrackingEnv(env)
    return env

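# A hedged sketch of calling the factory above for a two-agent session. The
# addresses and ports are illustrative, and idle_action=4 assumes index 4 is
# a benign action to repeat while a policy is deciding -- verify this against
# the mission's actual action list.
envs = [
    env_factory(agent_id=i, xml=xml, role=i,
                host_address="127.0.0.1", host_port=9000,
                command_address="127.0.0.1", command_port=9000 + i)
    for i in range(2)
]
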
def create_env(args):
    xml = Path(args.mission).read_text()
    env = malmoenv.make()
    print(f"create env listening on port {args.port}")
    env.init(xml, args.port,
             server=args.server,
             server2=args.server2, port2=args.port2,
             role=args.role,
             exp_uid=args.experimentUniqueId,
             episode=args.episode,
             resync=args.resync,
             reshape=True)
    env.reward_range = (-float('inf'), float('inf'))
    # env = DownsampleObs(env, shape=(84, 84))
    # env = MultiEntrySymbolicObs(env)
    return env

def create_malmo(env_config: dict):
    config = deepcopy(MALMO_DEFAULTS)
    config.update(env_config)
    if config['server2'] is None:
        config['server2'] = config['server']
    xml = Path(MALMO_MISSION_PATH + config["mission"]).read_text()
    env = malmoenv.make()
    env.init(xml, config["port"],
             server=config["server"],
             server2=config["server2"], port2=config["port2"],
             role=config["role"],
             exp_uid=config["experimentUniqueId"],
             episode=config["episode"], resync=config["resync"])
    return env

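# A usage sketch for create_malmo, assuming MALMO_DEFAULTS supplies
# server/port/role defaults; the mission filename and port override below
# are illustrative assumptions.
env = create_malmo({"mission": "mobchase_single_agent.xml", "port": 9000})
obs = env.reset()
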
def _default_env_factory(agent_id, xml, role, host_address, host_port,
                         command_address, command_port):
    """Default environment factory that fills out just enough settings to
    connect multiple game instances into a single game session.

    agent_id - The agent we're constructing the environment connection for.
    xml - The mission XML.
    role - The agent's role number. 0 == host agent.
    host_address, host_port - Connection details for the game session host.
    command_address, command_port - Connection details for the game instance
        the agent is controlling.
    """
    env = malmoenv.make()
    env.init(xml, host_port,
             server=host_address,
             server2=command_address, port2=command_port,
             role=role,
             exp_uid="default_experiment_id")
    return env

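# Example call with illustrative connection details; for the host agent
# (role 0) the command address/port typically match the host's.
env = _default_env_factory(
    agent_id=0, xml=mission_xml, role=0,
    host_address="127.0.0.1", host_port=9000,
    command_address="127.0.0.1", command_port=9000)
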
                    help='exit and re-sync every N resets'
                         ' - default is 0 meaning never.')
parser.add_argument('--synctick', action='store_true',
                    help='whether or not MalmoEnv'
                         ' will run synchronously')
parser.add_argument('--experimentUniqueId', type=str, default='test1',
                    help="the experiment's unique id.")
args = parser.parse_args()

if args.server2 is None:
    args.server2 = args.server

xml = Path(args.mission).read_text()
env = malmoenv.make()
env.init(xml, args.port,
         server=args.server,
         server2=args.server2, port2=args.port2,
         role=args.role,
         exp_uid=args.experimentUniqueId,
         episode=args.episode,
         resync=args.resync,
         synchronous=args.synctick)

for i in range(args.episodes):
    print("reset " + str(i))
    obs = env.reset()

parser.add_argument('--port2', type=int, default=9000,
                    help="(Multi-agent) role N's mission port")
parser.add_argument('--server2', type=str, default=None,
                    help="(Multi-agent) role N's server DNS or IP")
parser.add_argument('--episodes', type=int, default=1,
                    help='the number of resets to perform - default is 1')
parser.add_argument('--episode', type=int, default=0,
                    help='the start episode - default is 0')
parser.add_argument('--role', type=int, default=0,
                    help='the agent role - defaults to 0')
parser.add_argument('--episodemaxsteps', type=int, default=0,
                    help='max number of steps per episode')
parser.add_argument('--saveimagesteps', type=int, default=0,
                    help='save an image every N steps')
parser.add_argument('--resync', type=int, default=0,
                    help='exit and re-sync every N resets'
                         ' - default is 0 meaning never.')
parser.add_argument('--experimentUniqueId', type=str, default='test1',
                    help="the experiment's unique id.")
args = parser.parse_args()

if args.server2 is None:
    args.server2 = args.server

xml = Path(args.mission).read_text()
env = malmoenv.make()
env.init(xml, args.port,
         server=args.server,
         server2=args.server2, port2=args.port2,
         role=args.role,
         exp_uid=args.experimentUniqueId,
         episode=args.episode, resync=args.resync)

for i in range(args.episodes):
    print("reset " + str(i))
    obs = env.reset()
    steps = 0
    done = False
    while not done and (args.episodemaxsteps <= 0 or steps < args.episodemaxsteps):
        # Loop body restored to match the random-agent pattern used above.
        action = env.action_space.sample()
        obs, reward, done, info = env.step(action)
        steps += 1
        time.sleep(.05)

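# A typical invocation of this script; the flags match the parser above, and
# the mission path is an illustrative assumption:
#   python run.py --mission missions/mobchase_single_agent.xml \
#       --port 9000 --server 127.0.0.1 --episodes 10
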
def main():
    args = get_args()
    if args.server2 is None:
        args.server2 = args.server
    xml = Path(args.mission).read_text()

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    log_dir = os.path.expanduser(args.log_dir)
    eval_log_dir = log_dir + "_eval"
    utils.cleanup_log_dir(log_dir)
    utils.cleanup_log_dir(eval_log_dir)

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    env = malmoenv.make()
    env.init(xml, args.port,
             server=args.server,
             server2=args.server2, port2=args.port2,
             role=args.role,
             exp_uid=args.experimentUniqueId,
             episode=args.episode,
             resync=args.resync,
             reshape=True)

    obs_shape = env.observation_space.shape
    # (Optional: transpose HWC image observations to CHW with TransposeImage.)

    if args.guided:
        pass
    else:
        actor_critic = Policy(
            obs_shape,
            env.action_space,
            base_kwargs={'recurrent': args.recurrent_policy})
        actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(
            actor_critic,
            args.value_loss_coef,
            args.entropy_coef,
            lr=args.lr,
            eps=args.eps,
            alpha=args.alpha,
            max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(
            actor_critic,
            args.clip_param,
            args.ppo_epoch,
            args.num_mini_batch,
            args.value_loss_coef,
            args.entropy_coef,
            lr=args.lr,
            eps=args.eps,
            max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(
            actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              obs_shape, env.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = env.reset()
    obs = torch.from_numpy(obs).float().to(device)
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    num_updates = int(args.num_env_steps) // args.num_steps // args.num_processes
    for j in range(num_updates):

        if args.use_linear_lr_decay:
            # Decrease learning rate linearly.
            utils.update_linear_schedule(
                agent.optimizer, j, num_updates,
                agent.optimizer.lr if args.algo == "acktr" else args.lr)

        for step in range(args.num_steps):
            # Sample actions.
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs.
            obs, reward, done, infos = env.step(action)
            if reward is None:
                continue
            obs = torch.from_numpy(obs).float().to(device)

            if done or step > args.episodemaxsteps:
                episode_rewards.append(reward)
                done = False
                obs = env.reset()
                obs = torch.from_numpy(obs).float().to(device)
                break

            # If done then clean the history of observations.
            # masks/bad_masks are hardcoded for this single-env test setup:
            # the episode-reset handling above already breaks out of the loop,
            # so every stored transition is treated as non-terminal and valid.
            masks = torch.FloatTensor([[1.0]])      # always "not done"
            bad_masks = torch.FloatTensor([[1.0]])  # always a good transition
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value,
                            torch.FloatTensor([reward]), masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        total_num_steps = (j + 1) * args.num_processes * args.num_steps
        print("{} Steps, Value Loss: {}, Action Loss: {}".format(
            total_num_steps, value_loss, action_loss))

        rollouts.after_update()

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {}\n"
                " Last {} training episodes: mean/median reward {:.1f}/{:.1f},"
                " min/max reward {:.1f}/{:.1f}\n".format(
                    j, total_num_steps,
                    int(total_num_steps / (end - start)),
                    len(episode_rewards), np.mean(episode_rewards),
                    np.median(episode_rewards), np.min(episode_rewards),
                    np.max(episode_rewards)))

    # Optional debugging loop (disabled): roll out random actions and save a
    # frame every `saveimagesteps` steps.
    # steps = 0
    # done = False
    # while not done and (args.episodemaxsteps <= 0 or steps < args.episodemaxsteps):
    #     action = env.action_space.sample()
    #     obs, reward, done, info = env.step(action)
    #     obs = torch.from_numpy(obs).float().to(device)
    #     steps += 1
    #     print("reward: " + str(reward))
    #     if args.saveimagesteps > 0 and steps % args.saveimagesteps == 0:
    #         d, h, w = env.observation_space.shape
    #         img = Image.fromarray(obs.reshape(h, w, d))
    #         img.save('image' + str(args.role) + '_' + str(steps) + '.png')

    time.sleep(.05)
    env.close()

def run_sim(self, exp_role, num_episodes, port1, serv1, serv2, exp_id, epi, rsync):
    '''Code to actually run the simulation.'''
    env = malmoenv.make()
    env.init(self.get_mission_xml(MalmoSim.MOB_TYPE + " Apocalypse"),
             port1, server=serv1,
             server2=serv2, port2=(port1 + exp_role),
             role=exp_role,
             exp_uid=exp_id,
             episode=epi, resync=rsync,
             action_space=malmoenv.ActionSpace(self.create_actions()))

    max_num_steps = 1000

    for r in range(num_episodes):
        print("Reset [" + str(exp_role) + "] " + str(r))
        env.reset()
        num_steps = 0
        sim_done = False
        total_reward = 0
        total_commands = 0
        flash = False

        (obs, reward, sim_done, info) = env.step(0)
        while not sim_done:
            num_steps += 1
            if not (info is None or len(info) == 0):
                info_json = json.loads(info)
                agent_life = info_json["Life"]
                agent_yaw = info_json["Yaw"]
                if "entities" in info_json:
                    entities = [EntityInfo(**k) for k in info_json["entities"]]
                    self.draw_mobs(entities, flash)

                index, scores, best_yaw = self.get_best_angle(entities, agent_yaw, agent_life)
                self.training_out.append(best_yaw)
                self.training_obs.append(scores)

                difference = best_yaw - agent_yaw
                # Normalise the yaw difference into [-180, 180]; we sometimes
                # see differences above 360 before wrapping, which we still
                # haven't figured out.
                while difference < -180:
                    difference += 360
                while difference > 180:
                    difference -= 360

                # Our action id depends on the yaw angle to turn
                # (see create_actions for more info).
                action_id = int(difference + 360) + 2
                (obs, reward, sim_done, info) = env.step(action_id)
                total_commands += 1
                if reward is not None:
                    total_reward += reward
            else:
                (obs, reward, sim_done, info) = env.step(0)
            time.sleep(0.05)

        print("We stayed alive for " + str(num_steps) + " commands, and scored " + str(total_reward))
        time.sleep(1)  # Give the mod a little time to prepare for the next mission.

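# EntityInfo is not defined in this snippet. Since it is built via
# EntityInfo(**k) from the mission's JSON entity observations, a plausible
# definition is a namedtuple; the exact field set depends on what the
# mission emits and is an assumption here.
from collections import namedtuple

EntityInfo = namedtuple('EntityInfo',
                        'x y z yaw pitch name colour variation quantity life')
# If the JSON omits some fields, supply defaults so (**k) still constructs:
# EntityInfo.__new__.__defaults__ = (None,) * len(EntityInfo._fields)
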