def __init__(self, mission_type, port, addr):
        self.env = malmoenv.make()
        self.mission_type = mission_type
        mission_param = self.load_mission_param(self.mission_type)
        # Discrete commands exposed through MalmoEnv's action filter below;
        # commands such as "strafe 1"/"strafe -1" could be appended if needed.
        self.actions = [
            "movenorth", "movesouth", "movewest", "moveeast", "attack", "use"
        ]

        self.observation_space = spaces.Box(0, 13, shape=(1, 1, 9, 9))
        self.state_map = mission_param["state_map"]
        self.entity_map = mission_param["entity_map"]
        self.relevant_entities = mission_param["relevant_entities"]
        self.goal = mission_param["goal"]
        self.step_cost = mission_param["step_cost"]
        self.goal_reward = mission_param["goal_reward"]
        self.max_steps = mission_param["max_steps"]
        self.port = port
        self.addr = addr
        self.episode = 0
        mission = self.get_mission_xml(self.mission_type)
        self.env.init(mission,
                      server=addr,
                      port=self.port,
                      exp_uid="test",
                      role=0,
                      episode=self.episode,
                      action_filter=self.actions)
        self.action_space = self.env.action_space
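
A minimal sketch of how this wrapper might be driven once constructed. The class name MissionEnv and the mission type string are placeholders (the excerpt does not show them); the underlying MalmoEnv instance is reached through the self.env attribute set above.

# Hypothetical usage of the constructor above; MissionEnv and the mission
# type string are placeholders, not names from the original project.
env = MissionEnv(mission_type="find_goal", port=9000, addr="127.0.0.1")
obs = env.env.reset()
done = False
while not done:
    action = env.action_space.sample()            # one of the filtered commands
    obs, reward, done, info = env.env.step(action)
env.env.close()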
Example #2
    def run(role):
        env = malmoenv.make()
        env.init(xml,
                 args.port, server=args.server,
                 server2=args.server2, port2=(args.port + role),
                 role=role,
                 exp_uid=args.experimentUniqueId,
                 episode=args.episode, resync=args.resync)

        def log(message):
            print('[' + str(role) + '] ' + message)

        for r in range(args.episodes):
            log("reset " + str(r))
            env.reset()
            steps = 0

            done = False
            while not done:
                steps += 1
                action = env.action_space.sample()
                log(str(steps) + " action: " + str(action))
                obs, reward, done, info = env.step(action)
                # log("reward: " + str(reward))
                # log("done: " + str(done))
                # log("info: " + str(info))
                # log(" obs: " + str(obs))

                time.sleep(.05)

        env.close()
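
This per-role runner is usually started once per agent; a minimal launcher sketch, assuming two roles and plain threads (the thread-based launch is an assumption, not shown in this excerpt):

# Hypothetical launcher: one thread per agent role.
import threading

threads = [threading.Thread(target=run, args=(role,)) for role in range(2)]
for t in threads:
    t.start()
for t in threads:
    t.join()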
Example #3
    def run(role):
        env = malmoenv.make()
        env.init(xml,
                 args.port,
                 server=args.server,
                 server2=args.server2,
                 port2=(args.port + role),
                 role=role,
                 exp_uid=args.experimentUniqueId,
                 episode=args.episode,
                 resync=args.resync)

        def log(message):
            print('[' + str(role) + '] ' + message)

        for r in range(args.episodes):
            log("reset " + str(r))
            env.reset()
            steps = 0

            done = False
            while not done:
                steps += 1
                action = env.action_space.sample()
                log(str(steps) + " action: " + str(action))
                obs, reward, done, info = env.step(action)
                # log("reward: " + str(reward))
                # log("done: " + str(done))
                # log("info: " + str(info))
                # log(" obs: " + str(obs))

                time.sleep(.05)

        env.close()
Example #4
    def create_env(config):
        env = malmoenv.make()
        env.init(xml, COMMAND_PORT + config.worker_index, reshape=True)
        env.reward_range = (-float('inf'), float('inf'))

        env = DownsampleObs(env, shape=(84, 84))
        return env
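
The config.worker_index attribute suggests this factory is meant to be called by Ray RLlib with an EnvContext; a sketch of how it might be registered (the environment name string is arbitrary):

# Hypothetical wiring: let RLlib build one Malmo env per rollout worker.
from ray.tune.registry import register_env

register_env("malmo", create_env)  # RLlib passes an EnvContext carrying worker_index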
Example #5
def create_env(config):
    env = malmoenv.make()
    env.init(xml, COMMAND_PORT, reshape=True)
    env.reward_range = (-float('inf'), float('inf'))

    # env = ScreenCapturer(env)
    env = DownsampleObs(env, shape=(84, 84))
    return env
Example #6
File: utils.py Project: martinballa/malmo
def create_env(args):
    # Example environment creator
    mission_xml = Path(args.mission).read_text()
    env = malmoenv.make()
    env.init(mission_xml, args.port,
             server=args.server,
             server2=args.server2, port2=args.port2,
             role=args.role,
             exp_uid=args.experimentUniqueId,
             episode=args.episode, resync=args.resync,
             reshape=True)
    return env
Example #7
    def env_factory(agent_id, xml, role, host_address, host_port,
                    command_address, command_port):
        env = malmoenv.make()
        env.init(xml,
                 host_port,
                 server=host_address,
                 server2=command_address,
                 port2=command_port,
                 role=role,
                 exp_uid="multiagent",
                 reshape=True)
        env = DownsampleObs(env, shape=(84, 84))

        return env
Example #8
def env_factory(agent_id, xml, role, host_address, host_port, command_address,
                command_port):
    env = malmoenv.make()
    env.init(xml,
             host_port,
             server=host_address,
             server2=command_address,
             port2=command_port,
             role=role,
             exp_uid="multiagent",
             reshape=True)
    env = FrameStack(env, FRAME_STACK)
    env = malmoenv.SyncEnv(env, idle_action=4, idle_delay=0.02)
    env = TrackingEnv(env)
    return env
Example #9
File: utils.py Project: martinballa/malmo
def create_env(args):
    xml = Path(args.mission).read_text()
    env = malmoenv.make()
    print(f"create env listening on port {args.port}")
    env.init(xml, args.port,
             server=args.server,
             server2=args.server2, port2=args.port2,
             role=args.role,
             exp_uid=args.experimentUniqueId,
             episode=args.episode, resync=args.resync,
             reshape=True)
    env.reward_range = (-float('inf'), float('inf'))
    # env = DownsampleObs(env, shape=tuple((84, 84)))
    # env = MultiEntrySymbolicObs(env)
    return env
Example #10
def create_malmo(env_config: dict):
    config = deepcopy(MALMO_DEFAULTS)
    config.update(env_config)

    if config['server2'] is None:
        config['server2'] = config['server']

    xml = Path(MALMO_MISSION_PATH+config["mission"]).read_text()
    env = malmoenv.make()
    env.init(xml, config["port"],
             server=config["server"],
             server2=config["server2"], port2=config["port2"],
             role=config["role"],
             exp_uid=config["experimentUniqueId"],
             episode=config["episode"], resync=config["resync"])

    return env
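
MALMO_DEFAULTS is not shown in this excerpt; judging from the keys read above, a plausible shape for it is the following (all values are illustrative only):

# Illustrative defaults only; the real MALMO_DEFAULTS in the source project may differ.
MALMO_DEFAULTS = {
    "mission": "mission.xml",        # placeholder file name under MALMO_MISSION_PATH
    "port": 9000,
    "port2": 9000,
    "server": "127.0.0.1",
    "server2": None,                 # create_malmo() falls back to "server"
    "role": 0,
    "experimentUniqueId": "test1",
    "episode": 0,
    "resync": 0,
}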
Example #11
def _default_env_factory(agent_id, xml, role, host_address, host_port,
                         command_address, command_port):
    """
    Default environment factory that fills out just enough settings to connect multiple game
    instances into a single game session.
    agent_id - The agent we're constructing the environment connection for.
    xml - The mission XML.
    role - The agent's role number. 0 == host agent.
    host_address, host_port - Connection details for the game session host.
    command_address, command_port - Connection details for the game instance the agent is controlling.
    """
    env = malmoenv.make()
    env.init(xml,
             host_port,
             server=host_address,
             server2=command_address,
             port2=command_port,
             role=role,
             exp_uid="default_experiment_id")
    return env
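
Following the docstring, a minimal two-agent sketch where role 0 hosts the session and role 1 joins it; all addresses, ports, and the mission_xml variable are placeholders:

# Hypothetical two-agent setup; addresses and ports are placeholders.
host_env = _default_env_factory(agent_id="agent_0", xml=mission_xml, role=0,
                                host_address="127.0.0.1", host_port=9000,
                                command_address="127.0.0.1", command_port=9000)
peer_env = _default_env_factory(agent_id="agent_1", xml=mission_xml, role=1,
                                host_address="127.0.0.1", host_port=9000,
                                command_address="127.0.0.1", command_port=9001)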
Example #12
                        help='exit and re-sync every N resets'
                        ' - default is 0 meaning never.')
    parser.add_argument('--synctick',
                        action='store_true',
                        help='whether or not MalmoEnv'
                        ' will run synchronously')
    parser.add_argument('--experimentUniqueId',
                        type=str,
                        default='test1',
                        help="the experiment's unique id.")
    args = parser.parse_args()
    if args.server2 is None:
        args.server2 = args.server

    xml = Path(args.mission).read_text()
    env = malmoenv.make()

    env.init(xml,
             args.port,
             server=args.server,
             server2=args.server2,
             port2=args.port2,
             role=args.role,
             exp_uid=args.experimentUniqueId,
             episode=args.episode,
             resync=args.resync,
             synchronous=args.synctick)

    for i in range(args.episodes):
        print("reset " + str(i))
        obs = env.reset()
Example #13
File: run.py Project: Microsoft/malmo
    parser.add_argument('--port2', type=int, default=9000, help="(Multi-agent) role N's mission port")
    parser.add_argument('--server2', type=str, default=None, help="(Multi-agent) role N's server DNS or IP")
    parser.add_argument('--episodes', type=int, default=1, help='the number of resets to perform - default is 1')
    parser.add_argument('--episode', type=int, default=0, help='the start episode - default is 0')
    parser.add_argument('--role', type=int, default=0, help='the agent role - defaults to 0')
    parser.add_argument('--episodemaxsteps', type=int, default=0, help='max number of steps per episode')
    parser.add_argument('--saveimagesteps', type=int, default=0, help='save an image every N steps')
    parser.add_argument('--resync', type=int, default=0, help='exit and re-sync every N resets'
                                                              ' - default is 0 meaning never.')
    parser.add_argument('--experimentUniqueId', type=str, default='test1', help="the experiment's unique id.")
    args = parser.parse_args()
    if args.server2 is None:
        args.server2 = args.server

    xml = Path(args.mission).read_text()
    env = malmoenv.make()

    env.init(xml, args.port,
             server=args.server,
             server2=args.server2, port2=args.port2,
             role=args.role,
             exp_uid=args.experimentUniqueId,
             episode=args.episode, resync=args.resync)

    for i in range(args.episodes):
        print("reset " + str(i))
        obs = env.reset()

        steps = 0
        done = False
        while not done and (args.episodemaxsteps <= 0 or steps < args.episodemaxsteps):
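            # NOTE: the listing above is truncated inside the episode loop. A minimal
            # sketch of how the loop body typically continues (random actions, as in
            # the commented-out rollout in the next example):
            action = env.action_space.sample()
            obs, reward, done, info = env.step(action)
            steps += 1
            print("reward: " + str(reward) + " done: " + str(done))
            time.sleep(.05)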
Example #14
File: run_a2c.py Project: balloch/malmo
def main():
    args = get_args()
    if args.server2 is None:
        args.server2 = args.server

    xml = Path(args.mission).read_text()
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    log_dir = os.path.expanduser(args.log_dir)
    eval_log_dir = log_dir + "_eval"
    utils.cleanup_log_dir(log_dir)
    utils.cleanup_log_dir(eval_log_dir)

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")
    env = malmoenv.make()

    env.init(xml, args.port,
             server=args.server,
             server2=args.server2, port2=args.port2,
             role=args.role,
             exp_uid=args.experimentUniqueId,
             episode=args.episode, 
             resync=args.resync,
             reshape=True)

    #obs_shape = (env.observation_space.shape[2],env.observation_space.shape[0],env.observation_space.shape[1])
    obs_shape = env.observation_space.shape
    #    if len(obs_shape) == 3 and obs_shape[2] in [1, 3]:
    #        env = TransposeImage(env, op=[2, 0, 1])

    if args.guided:
        pass
    else:
        actor_critic = Policy(
            obs_shape,
            env.action_space,
            base_kwargs={'recurrent': args.recurrent_policy})
        actor_critic.to(device)
        

        if args.algo == 'a2c':
            agent = algo.A2C_ACKTR(
                actor_critic,
                args.value_loss_coef,
                args.entropy_coef,
                lr=args.lr,
                eps=args.eps,
                alpha=args.alpha,
                max_grad_norm=args.max_grad_norm)
        elif args.algo == 'ppo':
            agent = algo.PPO(
                actor_critic,
                args.clip_param,
                args.ppo_epoch,
                args.num_mini_batch,
                args.value_loss_coef,
                args.entropy_coef,
                lr=args.lr,
                eps=args.eps,
                max_grad_norm=args.max_grad_norm)
        elif args.algo == 'acktr':
            agent = algo.A2C_ACKTR(
                actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              obs_shape, env.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = env.reset()
    obs = torch.from_numpy(obs).float().to(device)
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes

    for j in range(num_updates):
        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(
                agent.optimizer, j, num_updates,
                agent.optimizer.lr if args.algo == "acktr" else args.lr)

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = env.step(action)
            if reward is None:
                continue #reward = 0.0
            obs = torch.from_numpy(obs).float().to(device)
            # for info in infos:
            #     if 'episode' in info.keys():
            #         episode_rewards.append(info['episode']['r'])
            if done or step > args.episodemaxsteps:
                episode_rewards.append(reward)
                done = False
                obs = env.reset()
                obs = torch.from_numpy(obs).float().to(device)
                break
            # If done then clean the history of observations.
#             masks = torch.FloatTensor(
#                 [[0.0] if done_ else [1.0] for done_ in done])
#             bad_masks = torch.FloatTensor(
#                 [[0.0] if 'bad_transition' in info.keys() else [1.0]
#                  for info in infos])
            # Hardcode for testing
            masks = torch.FloatTensor([[1.0]]) #always not done
            bad_masks = torch.FloatTensor([[1.0]]) #always good transitions
            
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, torch.FloatTensor([reward]), masks, bad_masks)
            #print(reward)
        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()
     
        # obs = env.reset()
        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)
        total_num_steps = (j + 1) * args.num_processes * args.num_steps
        print("{} Steps, Value Loss: {}, Action Loss: {}".format(total_num_steps, value_loss, action_loss))
        rollouts.after_update()

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))


        steps = 0
        done = False
       # while not done and (args.episodemaxsteps <= 0 or steps < args.episodemaxsteps):
       #     action = env.action_space.sample()

       #     obs, reward, done, info = env.step(action)
       #     obs = torch.from_numpy(obs).float().to(device) #unsqueeze?
       #     #reward = torch.from_numpy(reward).unsqueeze(dim=1).float()
       #     steps += 1
       #     print("reward: " + str(reward))
       #     # print("done: " + str(done))
       #     print("obs: " + str(obs))
       #     # print("info" + info)
       #     if args.saveimagesteps > 0 and steps % args.saveimagesteps == 0:
       #         d, h, w = env.observation_space.shape
       #         img = Image.fromarray(obs.reshape(h, w, d))
       #         img.save('image' + str(args.role) + '_' + str(steps) + '.png')

        time.sleep(.05)

    env.close()
Example #15
	def run_sim(self, exp_role, num_episodes, port1, serv1, serv2, exp_id, epi, rsync):
		'''Code to actually run simulation
		'''

		env = malmoenv.make()

		env.init(self.get_mission_xml(MalmoSim.MOB_TYPE + " Apocalypse"),
				 port1, server=serv1,
				 server2=serv2, port2=(port1 + exp_role),
				 role=exp_role,
				 exp_uid=exp_id,
				 episode=epi,
				 resync=rsync,
				 action_space = malmoenv.ActionSpace(self.create_actions()))

		max_num_steps = 1000

		for r in range(num_episodes):
			print("Reset [" + str(exp_role) + "] " + str(r) )

			env.reset()
			num_steps = 0

			sim_done = False
			total_reward = 0
			total_commands = 0

			flash = False

			(obs, reward, sim_done, info) = env.step(0)
			while not sim_done:
				num_steps += 1

				if (not (info is None or len(info) == 0)):
					info_json = json.loads(info)
					agent_life = info_json["Life"]
					agent_yaw = info_json["Yaw"]
					if "entities" in info_json:
						entities = [EntityInfo(**k) for k in info_json["entities"]]
						self.draw_mobs(entities, flash)
						# best_yaw_bin = self.agent_decision_net.classify_input(self.get_scores(entities, agent_yaw, agent_life))
						# best_yaw = sum((round(best_yaw_bin[x]) * (2**x)) for x in range(len(best_yaw_bin)))
						# num_bin_dig = int(math.ceil(math.log(self.agent_search_resolution,2)))
						# desired_output_bin = [int(x) for x in ('{0:0'+str(num_bin_dig)+'b}').format(desired_output)]

						index, scores, best_yaw = self.get_best_angle(entities, agent_yaw, agent_life)
						self.training_out.append(best_yaw)
						self.training_obs.append(scores)

						# self.training_out.append(desired_output_bin)
						# self.training_obs.append(inputs)
						difference = best_yaw - agent_yaw
						# Sometimes we seem to get a difference above 360; still haven't figured that one out.
						while difference < -180:
							difference += 360
						while difference > 180:
							difference -= 360
						# Our action id is dependent upon our yaw angle to turn (see create_actions for more info).
						action_id = int(difference + 360) + 2
						(obs, reward, sim_done, info) = env.step(action_id)

						total_commands += 1
						if (not(reward is None)):
							total_reward += reward
				else:
					(obs, reward, sim_done, info) = env.step(0)
				time.sleep(0.05)
			print("We stayed alive for " + str(num_steps) + " commands, and scored " + str(total_reward))
			time.sleep(1) # Give the mod a little time to prepare for the next mission.
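
A quick worked example of the yaw-to-action mapping used above; the yaw values are arbitrary and chosen only to illustrate the normalisation loop:

# Worked example of the mapping above (yaw values are arbitrary):
best_yaw, agent_yaw = 90, 300
difference = best_yaw - agent_yaw      # -210
while difference < -180:
    difference += 360                  # -> 150
while difference > 180:
    difference -= 360
action_id = int(difference + 360) + 2  # -> 512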