Example #1
class AgentTest(unittest.TestCase):
    def setUp(self):
        self.json_data = '{"observations": {"screen_features": ["height_map", "player_id", "player_relative", "unit_type"], ' \
                    '"minimap_features": ["player_id", "selected"], "nonspatial_features": ["player", "score_cumulative"], ' \
                    '"action_ids": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]}, "rewards": [1, 1, 1, 1]}'
        self.config = json.loads(self.json_data)
        self.sess = tf.Session()
        self.agent_modifier = AgentModifier(self.config, 32)
        self.agent = A2CAgent(self.sess, self.agent_modifier)
        # self.obs_spec = {}
        # self._builder = dummy_observation.Builder(self._obs_spec)
        # self.obs = self._builder.build().observation
        self.env = Environment()
        self.obs = self.env.reset()

    def testMakeAction(self):
        print("Testing Make Action")
        action = self.agent.act(self.obs)
        action_made_1 = self.agent.convert_actions(action)
        action_2 = self.agent.act(self.obs)
        self.obs = self.env.reset()
        action_made_2 = self.agent.convert_actions(action_2)
        self.assertNotEqual(action_made_1, action_made_2)

    def testGetObservationFeed(self):
        print("Testing Get Observation Feed")
        feed_dict = self.agent._get_observation_feed(self.obs)
        self.obs = self.env.reset()
        feed_dict_2 = self.agent._get_observation_feed(self.obs)
        self.assertNotEqual(feed_dict, feed_dict_2)
Example #2
class GameManager:
    def __init__(self, id):

        self.visualize = False

        if Config.VISUALIZE and int(id / len(Config.PATH_TO_WORLD)) == 0:
            self.visualize = True
        elif Config.PLAY_MODE:
            self.visualize = True

        world_name = Config.PATH_TO_WORLD[id % len(Config.PATH_TO_WORLD)]
        self.env = Environment(world_name)
        print("Env {} for Agent {} started.".format(world_name, id))

        self.env.set_mode(Config.MODE, Config.TERMINATE_AT_END)
        self.env.set_observation_rotation_size(Config.OBSERVATION_ROTATION_SIZE)
        self.env.use_observation_rotation_size(Config.USE_OBSERVATION_ROTATION)
        self.env.set_cluster_size(Config.CLUSTER_SIZE)

        self.reset()

    def reset(self):
        observation, _, _, _ = self.env.reset()
        input_laser, rotation = self.process_observation(observation)
        map = StateMap(input_laser)
        obs = np.array([ [map.S_image], [rotation] ])
        return obs

    def step(self, action):
        self._update_display()
        if action is None:
            observation, reward, done, info = self.env.step(0, 0, 20)

            input_laser, rotation = self.process_observation(observation)
            map = StateMap(input_laser)
            #obs = np.array([[map.States_map, map.Reward_map], [rotation]])
            obs = np.array([[map.S_image], [rotation]])
            reward = 0
            done = False
        else:

            linear, angular = map_action(action)
            observation, reward, done, info = self.env.step(linear, angular, 20)
            input_laser, rotation = self.process_observation(observation)
            map = StateMap(input_laser)
            obs = np.array([[map.S_image], [rotation]])

        return obs, reward, done, info

    def _update_display(self):
        if self.visualize:
            self.env.visualize()

    def observation_size(self):
        return self.env.observation_size()

    def process_observation(self, observation):
        laser_scan = np.array(observation[:Config.OBSERVATION_SIZE])
        orientation = np.array(observation[Config.OBSERVATION_SIZE:])
        return laser_scan, orientation
Example #3
def main():
	sess = tf.Session()
	K.set_session(sess)
	env = Environment("test")
	actor_critic = ActorCritic(env, sess)
	done = False
	num_trials = 10000
	trial_len  = 500

	steps = []
	state_size = env.observation_size()
	for trial in range(num_trials):

		cur_state,_,_,_ = env.reset()
		cur_state = np.reshape(cur_state, [1,state_size])
		
		for step in range(trial_len):
			action = actor_critic.act(cur_state)
			linear, angular = convert_action(action)
			new_state, reward, done, _ = env.step(linear, angular,10)
			new_state = np.reshape(new_state, [1, state_size])
			actor_critic.remember(cur_state, action, reward, new_state, done)
			actor_critic.train()
			cur_state = new_state
			env.visualize()
			if done:
				break
Example #4
class RewardModifierTest(unittest.TestCase):

    def setUp(self):
        json_data = '{"observations": {"screen_features": ["height_map", "player_id", "player_relative", "unit_type"], ' \
                   '"minimap_features": ["player_id", "selected"], "nonspatial_features": ["player", "score_cumulative"], ' \
                   '"action_ids": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]}, "rewards": [1, 1, 1, 1]}'
        config = json.loads(json_data)
        self._reward_mod = RewardModifier(config["rewards"])
        self.old_obs = [None]
        self.zero_obs = [None]
        # self._obs_spec = {}
        # self._builder = dummy_observation.Builder(self._obs_spec)
        # self.obs = self._builder.build()
        self.env = Environment()
        # self.obs = observations = [None] * 16
        self.obs = self.env.reset()

    def testModifyZero(self):
        print("Testing Zero")
        reward_1 = self._reward_mod.modify(self.obs[0], 0, self.zero_obs[0])
        self.assertEqual(reward_1, 0)

    def testModifyReset(self):
        print("Testing Reset")
        reward_1 = self._reward_mod.modify(self.obs[0], 0, self.old_obs[0])
        self.old_obs = self.obs
        self.obs = self.env.reset()
        reward_2 = self._reward_mod.modify(self.obs[0], 0, self.old_obs[0])
        self.assertEqual(reward_1, reward_2)

    def testModifySelectArmy(self):
        print("Testing Select Army")
        reward_1 = self._reward_mod.modify(self.obs[0], 0, self.old_obs[0])
        self.old_obs = self.obs
        new_obs = self.env.step([actions.FunctionCall(_SELECT_ARMY, [_NOT_QUEUED])])
        reward_2 = self._reward_mod.modify(new_obs[0], 0, self.old_obs[0])
        self.assertEqual(reward_1, reward_2)

    def testModifyAttack(self):
        print("Testing Attack")
        reward_1 = self._reward_mod.modify(self.obs[0], 0, self.old_obs[0])
        self.old_obs = self.obs
        new_obs = self.env.step([actions.FunctionCall(_SELECT_ARMY, [_NOT_QUEUED])])
        new_obs = self.env.step([actions.FunctionCall(_ATTACK_MINIMAP, [_QUEUED, [20, 20]])])
        reward_2 = self._reward_mod.modify(new_obs[0], 0, self.old_obs[0])
        self.assertEqual(reward_1, reward_2)
Example #5
def learn_flappyb():
    env = Environment(draw=DRAW, fps=1, debug=False,
                      dist_to_pipe=DIFFICULTY_LEARN,
                      dist_between_pipes=DIST_BETWEEN_PIPES,
                      obs_this_pipe=OBS_THIS_PIPE_LEARN)
    writer = None
    if WRITE:
        writer = SummaryWriter(comment=NAME)
    observation_space = env.get_observation_size_buffer()
    action_space = env.get_action_size()

    model = load_model('models/dqn/{}.h5'.format(LOAD_NAME))
    dqn_solver = DQNSolver(observation_space, action_space, model)
    run = 0

    if SAVE_MODEL:
        name = '{}-PART={}'.format(NAME, run)
        dqn_solver.model.save('models/dqn/{}.h5'.format(name))
    while True:
        run += 1
        state = env.reset()
        state = np.reshape(state, [1, observation_space])
        step = 0
        reward_score = 0

        while True:
            step += 1
            action = dqn_solver.act(state, env)
            state_next, reward, terminal, info = env.step_buffer(action)
            reward_score += reward
            state_next = np.reshape(state_next, [1, observation_space])
            dqn_solver.remember(state, action, reward, state_next, terminal)
            state = state_next
            if terminal:
                print("Run: " + str(run) + ", exploration: " +
                      str(dqn_solver.exploration_rate) + ", score: " +
                      str(reward_score))
                if WRITE:
                    writer.add_scalar("reward", reward_score, run)
                break
            dqn_solver.experience_replay()
        if (run % 100 == 0) and SAVE_MODEL:
            name = '{}-PART={}'.format(NAME, run)
            dqn_solver.model.save('models/dqn/{}.h5'.format(name))
    if WRITE:
        writer.close()
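Note: DQNSolver.act is not shown in this example. A minimal epsilon-greedy sketch, assuming a Keras model, the exploration_rate printed above, and the env.get_action_random() helper seen in Example #10, might be:

import numpy as np

def epsilon_greedy_act(model, state, env, exploration_rate):
    # Sketch of the epsilon-greedy policy presumably inside DQNSolver.act.
    if np.random.rand() < exploration_rate:
        # Explore: let the environment pick a random action.
        return env.get_action_random()
    # Exploit: greedy action from the model's Q-value predictions.
    q_values = model.predict(state)  # state is shaped (1, observation_space)
    return int(np.argmax(q_values[0]))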
Example #6
class ObservationModifierTest(unittest.TestCase):
    def setUp(self):
        json_data = '{"observations": {"screen_features": ["height_map", "player_id", "player_relative", "unit_type"], ' \
                    '"minimap_features": ["player_id", "selected"], "nonspatial_features": ["player", "score_cumulative"], ' \
                    '"action_ids": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]}, "rewards": [1, 1, 1, 1]}'
        config = json.loads(json_data)
        self._obs_mod = ObservationModifier(config["observations"], 32)
        #self.old_obs = [None]
        #self._obs_spec = {}
        #self._builder = dummy_observation.Builder(self._obs_spec)
        #self.obs = self._builder.build()
        self.env = Environment()
        #self.obs = observations = [None] * 16
        self.obs = self.env.reset()

    #def testModify(self):
    #    alt_obs = self._obs_mod.modify(self.obs[0], 0, self.old_obs[0])
    #    self._old_obs = self.obs
    #    self.obs = self.env.reset()
    #    alt_obs_2 = self._obs_mod.modify(self.obs[0], 0, alt_obs)
    #    self.assert(alt_obs[0], alt_obs_2[0])

    def testModifyScreen(self):
        print("Testing Screen Observations")
        alt_obs = self._obs_mod.modify(self.obs[0])
        self.assertIn("screen_features", alt_obs)

    def testModifyMinimap(self):
        print("Testing Minimap Observations")
        alt_obs = self._obs_mod.modify(self.obs[0])
        self.assertIn("minimap_features", alt_obs)

    def testModifyNonspatial(self):
        print("Testing Nonspatial Observations")
        alt_obs = self._obs_mod.modify(self.obs[0])
        self.assertIn("nonspatial_features", alt_obs)

    def testModifyAvailableMask(self):
        print("Testing Available Mask Observations")
        alt_obs = self._obs_mod.modify(self.obs[0])
        self.assertIn("available_mask", alt_obs)

    def testModifyAvailableActions(self):
        print("Testing Available Actions Observations")
        alt_obs = self._obs_mod.modify(self.obs[0])
        self.assertNotIn("available_actions", alt_obs)
Example #7
class GameManager:
    def __init__(self, id):

        self.visualize = False

        if Config.VISUALIZE and int(id / len(Config.PATH_TO_WORLD)) == 0:
            self.visualize = True
        elif Config.PLAY_MODE:
            self.visualize = True

        world_name = Config.PATH_TO_WORLD[id % len(Config.PATH_TO_WORLD)]
        self.env = Environment(world_name)
        print("Env {} for Agent {} started.".format(world_name, id))

        self.env.set_mode(Config.MODE, Config.TERMINATE_AT_END)
        self.env.set_observation_rotation_size(
            Config.OBSERVATION_ROTATION_SIZE)
        self.env.use_observation_rotation_size(Config.USE_OBSERVATION_ROTATION)
        self.env.set_cluster_size(Config.CLUSTER_SIZE)

        self.reset()

    def reset(self):
        observation, _, _, _ = self.env.reset()
        return observation

    def step(self, action):
        self._update_display()
        if action is None:
            observation, reward, done, info = self.env.step(0, 0, 20)
            reward = 0
            done = False
        else:
            linear, angular = map_action(action)
            observation, reward, done, info = self.env.step(
                linear, angular, 20)
        return observation, reward, done, info

    def _update_display(self):
        if self.visualize:
            self.env.visualize()

    def observation_size(self):
        return self.env.observation_size()
Example #8
def main():
	sess = tf.Session()
	K.set_session(sess)
	env = Environment("test")
	actor_critic = ActorCritic(env, sess)
	done = False
	num_trials = 10000
	trial_len  = 500

	steps = []
	state_size = env.observation_size()
	for trial in range(num_trials):
		reward_sum = 0
		cur_state,_,_,_ = env.reset()
		cur_state = np.reshape(cur_state, [1,state_size])
		
		for step in range(trial_len):
			action = actor_critic.act(cur_state)
			action2 = np.argmax(action[0])
			linear , angular =  convert_action(action2)
			print("action", action)
			#linear = action[0][0]
			#linear = np.array([linear])
			#linear = float(linear[0])
			#linear = (0.8/math.pi)*math.atan((linear-0.5))+0.45
			#2/pi*atan(50*(x-0.5))
			print("linear", linear)	
			#angular =action[0][1]# 0.77
			#angular = np.array([angular])
			#angular = float(angular[0])
			#1/pi*atan(15*(x-0.5))+0.5
			#angular = (2/math.pi)*math.atan((angular - 0.5))
			print("angular", angular)	
			new_state, reward, done, _ = env.step(linear, angular,20)
			new_state = np.reshape(new_state, [1, state_size])
			reward_sum = reward_sum + reward
			actor_critic.remember(cur_state, action, reward, new_state, done)
			cur_state = new_state
			env.visualize()
			if done:
				print("Break!")
				break
		actor_critic.train()
Example #9
def main():
    #env     = gym.make("MountainCar-v0")
    env = Environment("test")
    state_size = env.observation_size()
    gamma = 0.9
    epsilon = .95

    trials = 1000
    trial_len = 500

    # updateTargetNetwork = 1000
    dqn_agent = DQN(env=env)
    done = False
    batch_size = 32
    steps = []
    for trial in range(trials):

        reward_sum = 0
        cur_state, _, _, _ = env.reset()
        cur_state = np.reshape(cur_state, [1, state_size])
        for step in range(trial_len):
            action = dqn_agent.act(cur_state)
            linear, angular = convert_action(action)

            new_state, reward, done, _ = env.step(linear, angular, 10)
            # reward = reward if not done else -20
            new_state = np.reshape(new_state, [1, state_size])
            reward_sum = reward_sum + reward
            dqn_agent.remember(cur_state, action, reward, new_state, done)

            #dqn_agent.replay()       # internally iterates default (prediction) model
            dqn_agent.target_train()  # iterates target model

            cur_state = new_state

            env.visualize()
            if done:
                print("episode: {}/{}, score: {}, e: {:.2} time:{}".format(
                    trial, trials, reward_sum, dqn_agent.epsilon, step))
                break
        if len(dqn_agent.memory) > batch_size:
            dqn_agent.replay()
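Note: the DQN agent's remember and replay methods are not shown here. A minimal sketch of the usual Keras experience-replay pattern (hypothetical attribute names and buffer size) could be:

import random
from collections import deque

import numpy as np

class ReplaySketch:
    def __init__(self, model, gamma=0.9, batch_size=32):
        self.memory = deque(maxlen=2000)
        self.model = model
        self.gamma = gamma
        self.batch_size = batch_size

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def replay(self):
        # Fit the model toward TD targets built from a sampled minibatch.
        if len(self.memory) < self.batch_size:
            return
        for state, action, reward, next_state, done in random.sample(self.memory, self.batch_size):
            target = reward
            if not done:
                target += self.gamma * np.amax(self.model.predict(next_state)[0])
            target_f = self.model.predict(state)
            target_f[0][action] = target
            self.model.fit(state, target_f, epochs=1, verbose=0)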
Example #10
def play_flappyb():
    env = Environment(draw=True, fps=1, debug=True,
                      dist_to_pipe=DIFFICULTY_PLAY,
                      dist_between_pipes=DIST_BETWEEN_PIPES,
                      obs_this_pipe=OBS_THIS_PIPE_PLAY)

    observation_space = env.get_observation_size_buffer()
    action_space = env.get_action_size()

    model = keras.models.load_model('models/dqn/{}.h5'.format(LOAD_NAME))
    dqn_solver = DQNSolver(observation_space, action_space, model)

    for i in range(20):
        state = env.reset()
        state = np.reshape(state, [1, observation_space])
        is_done = False
        while not is_done:
            action = dqn_solver.act_free(state)
            # action = env.get_action_random()
            state_next, reward, terminal, info = env.step_buffer(action)
            is_done = terminal
            state = np.reshape(state_next, [1, observation_space])
Example #11
    def run(self):
        global episode
        env = Environment("test")
        while episode < EPISODES:
            state, _, _, _ = env.reset()
            score = 0
            while True:
                action = self.get_action(state)
                linear, angular = self.convert_action(action)
                next_state, reward, done, _ = env.step(linear, angular, 10)
                next_state = np.reshape(next_state, [1, state_size])
                score += reward

                self.memory(state, action, reward)

                state = next_state
                env.visualize()

                if done:
                    episode += 1
                    print("episode: ", episode, "/ score : ", score)
                    scores.append(score)
                    self.train_episode(score != 100)
                    break
Example #12
def main():

    # Check if the ROM is given through argv
    filename = './Super_Mario_Land_World.gb'

    env = Environment(filename, max_steps=N_STEPS, visualize=VISUALIZE)
    env.start()
    agent = A2C_Agent(discount=0.99, epsilon=0.9, learning_rate=1e-3)

    agent_is_setup = False

    entropy_term = 0
    all_rewards = []
    all_lengths = []
    average_lengths = []

    for episode in range(N_EPOCHS):
        print("\n ", "=" * 50)
        print("Epoch {}/{}".format(episode + 1, N_EPOCHS))
        env.reset()
        state = env.obs()

        log_probs = []
        values = []
        rewards = []

        if not agent_is_setup:
            agent.setup(env.observation_space, env.action_space, use_model)
            agent_is_setup = True

        for steps in range(N_STEPS):
            # Get action from agent
            with torch.no_grad():
                action, log_prob, entropy, value = agent.get_action(state, TRAINING)

            value = value.detach().numpy()[0, 0]

            new_state, reward, done = env.step(action, steps)

            rewards.append(reward)
            values.append(value)
            log_probs.append(log_prob)
            entropy_term += entropy

            # Set obs to the new state
            state = new_state

            if done or steps == N_STEPS - 1:
                Qval, _ = agent.model.forward(torch.Tensor(new_state))
                Qval = Qval.detach().numpy()[0, 0]
                all_rewards.append(np.sum(rewards))
                all_lengths.append(steps)
                average_lengths.append(np.mean(all_lengths[-10:]))
                if episode % 10 == 0:
                    sys.stdout.write(
                        "episode: {}, reward: {}, total length: {}, average length: {} \n".format(
                            episode, np.sum(rewards), steps, average_lengths[-1]))
                break

        print("Loss :", agent.train(values, rewards, log_probs, Qval, entropy_term))

    if SAVE_MODEL and TRAINING:
        date = datetime.datetime.now()
        model_name = str(date.day) + '_' + str(date.month) + '_' + str(date.hour) + '_' + agent.name + '.h5'
        agent.save_model(model_name)

    env.stop()
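Note: agent.train(values, rewards, log_probs, Qval, entropy_term) is not shown. A common way to turn the collected rewards and the bootstrap Qval into discounted return targets, offered as a sketch of the standard A2C recipe rather than this project's exact code, is:

import numpy as np

def discounted_returns(rewards, bootstrap_q, gamma=0.99):
    # Walk backwards from the bootstrap value of the final state,
    # accumulating gamma-discounted returns for every step.
    qvals = np.zeros(len(rewards), dtype=np.float32)
    running = bootstrap_q
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        qvals[t] = running
    return qvals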
Example #13
                  0,
                  target_policy,
                  behaviour_policy,
                  from_file=False,
                  double=True)
    actions = []
    action = 0
    actions.append(action)
    environment = Env(population_size=1000,
                      initial_sick=1,
                      contagion_rate=1000,
                      mortality_rate=0.1)

    for i in range(10000):
        reward, discount, next_state = environment.update(action)
        action = agent.step(reward, discount, next_state)
        actions.append(action)

        if not environment.ALIVE:
            environment = environment.reset(population_size=1000,
                                            initial_sick=1,
                                            contagion_rate=1,
                                            mortality_rate=0.1)

    agent.save_to_file()

    # heatmap(agent.q_values)
    # plot_history(array(environment.reward_history)*-1)
    print(agent.q_values)
    print(actions)
Example #14
    state_size = env.observation_size()
    action_size = action_mapper.ACTION_SIZE
    agent = DQNAgent(state_size, action_size)
    # agent.load("./save/cartpole-dqn.h5")
    done = False
    batch_size = 48

    print("START DQN")

    for e in range(EPISODES):

        visualize = (e % 5 == 0)

        reward_sum = 0

        state, _, _, _ = env.reset()

        state = np.reshape(state, [1, state_size])

        for iteration in range(100):
            action = agent.act(state)

            linear, angular = action_mapper.map_action(action)

            next_state, reward, done, _ = env.step(linear, angular, 20)

            next_state = np.reshape(next_state, [1, state_size])

            reward_sum = reward_sum + reward

            agent.remember(state, action, reward_sum, next_state, done)
Example #15
def main():
    # Check if the ROM is given through argv
    filename = './Super_Mario_Land_World.gb'
    env = Environment(filename, max_steps=N_STEPS, visualize=VISUALIZE)
    env.start()
    agent = DQN_Agent(discount=0.9, epsilon=0.9, learning_rate=1e-5)
    avg_loss = None
    agent_is_setup = False
    min_epsilon = 0.001
    max_epsilon = 0.001

    for episode in range(N_EPOCHS):
        print("\n ", "=" * 50)
        env.reset()
        state = torch.Tensor(env.obs())
        old_state = state
        old_old_state = state
        is_a_released = torch.ones(1)
        states = [
            torch.cat((state, old_state, old_old_state), 0).view(3, 16, 20),
            is_a_released, env.mario_size
        ]
        episode_reward = 0

        if not agent_is_setup:
            agent.setup(env.observation_space, env.action_space, use_model)
            agent_is_setup = True

        for steps in range(N_STEPS):
            # Get action from agent
            actions = agent.get_action(states, TRAINING)

            new_state, reward, done = env.step(actions)

            #env.print_obs(new_state.numpy().astype(int))

            if actions[1] == 0:
                is_a_released = torch.zeros(1)
            else:
                is_a_released = torch.ones(1)

            if steps + 1 == N_STEPS:
                done = True

            episode_reward += reward

            new_states = [
                torch.cat((new_state, states[0][0, :, :], states[0][1, :, :]),
                          0).view(3, 16, 20), is_a_released, env.mario_size
            ]

            agent.update_replay_memory(states, actions, reward, new_states,
                                       done)

            # Train the neural network
            if TRAINING:
                loss = agent.train(done)
                if avg_loss is None:
                    avg_loss = loss
                else:
                    avg_loss = 0.99 * avg_loss + 0.01 * loss
            else:
                avg_loss = 0

            states = new_states

            if (steps + 1) % 20 == 0:
                print("\rAverage loss : {:.5f} --".format(avg_loss),
                      "Episode rewards: {} --".format(episode_reward),
                      "epochs {}/{} --".format(episode, N_EPOCHS),
                      "steps {}/{}".format(steps + 1, N_STEPS),
                      end="")
            if done:
                print("\n", env.level_progress_max)
                break

        agent.epsilon = max(
            min_epsilon, min(max_epsilon, 1.0 - math.log10((episode + 1) / 5)))
    if SAVE_MODEL and TRAINING:
        date = datetime.datetime.now()
        model_name = str(date.day) + '_' + str(date.month) + '_' + str(
            date.hour) + '_' + agent.name + '.h5'
        agent.save_model(model_name)

    env.stop()
Example #16
from environment.action import Action
from environment.environment import Environment

from utils.constants import NUM_EPISODES

env = Environment()
for episode in range(0, NUM_EPISODES):
    state = env.reset()
    done = False
    score = 0

    while not done:

        action = Action(0,
                        env.action_space.sample() - 1,
                        env.action_space.sample() - 1)
        n_state, reward, done, info = env.step(action)
        score += reward

    print(f"Episode: {episode} Score: {score}")
Example #17
    def run_ddpg(self):
        torch.manual_seed(0)
        torch.cuda.manual_seed(0)
        np.random.seed(0)

        # define model
        agent = DDPGAgent(self.config.agent.ddpg)
        agent.cuda()
        env = Environment(self.config.env)
        env.reset()

        if self.config.env.model_load_path is not None:
            agent.load_state_dict(torch.load(self.config.env.model_load_path))

        # define data
        if self.config.env.dataset == 'cifar100':
            dataset = Cifar100(self.config.env)
        else:
            dataset = Cifar10(self.config.env)

        train_actor_val_loader = DataLoader(
            dataset.train_actor_val_dataset, batch_size=self.config.env.val_batch_size, shuffle=False,
            num_workers=self.config.env.workers)
        test_actor_test_loader = DataLoader(
            dataset.test_actor_test_dataset, batch_size=self.config.env.val_batch_size, shuffle=False,
            num_workers=self.config.env.workers)

        # define log out
        time_array = time.localtime(time.time())
        log_time = time.strftime("%Y_%m_%d_%H_%M_%S", time_array)
        # name log_dir with paras
        use_loss_norm = 'use_loss_norm' if self.config.env.features.use_loss_norm else 'no_loss_norm'
        use_logits = 'use_logits' if self.config.env.features.use_logits else 'no_logits'
        use_loss_abs = 'use_loss_abs' if self.config.env.features.use_loss_abs else 'no_loss_abs'
        use_loss_gain = 'use_loss_gain' if self.config.env.learn_lr_gain else 'no_loss_gain'
        log_dir_name = '_'.join([self.config.env.reward.reward_option, use_loss_norm, use_logits, use_loss_abs,
                                 str(self.config.env.reward.filter_loss_rate), str(self.config.agent.ddpg.buffer_size),
                                 str(self.config.agent.ddpg.a_learning_rate),
                                 str(self.config.agent.ddpg.c_learning_rate),
                                 self.config.agent.ddpg.weight_option, use_loss_gain, log_time])

        log_dir = os.path.join(self.config.log_root, self.config.env.log_dir, log_dir_name)
        if os.path.exists(log_dir) is False:
            os.makedirs(log_dir)

        save_dir = os.path.join(self.config.log_root, self.config.env.save_dir, log_dir_name)
        if os.path.exists(save_dir) is False:
            os.makedirs(save_dir)

        # params out
        paras_path = os.path.join(log_dir, 'paras.yaml')
        with open(paras_path, "w", encoding='utf-8') as f:
            yaml.dump(self.config, f)

        b_out = None
        for current_episode in range(self.config.env.num_episode):
            episode_log = 'sampler_episode_' + str(current_episode)
            if (current_episode or self.config.env.model_load_path is not None) and current_episode % \
                    self.config.agent.ddpg.test_actor_step == 0:
                episode_log = 'test_sampler_episode_' + str(current_episode)
            log_path = os.path.join(log_dir, episode_log)
            tb_logger = SummaryWriter(log_path)

            # if self.config.env.baseline.baseline_out:
            #     if b_out is not None:
            #         b_out.close()
            #     b_out = open('baseline_episode_' + str(current_episode), 'w')

            agent.reset_noise()
            env.reset()
            episode_step = 0
            pre_buffer = [None, None, None]

            if current_episode and current_episode % self.config.env.save_interval == 0:
                save_path = os.path.join(save_dir, 'episode_' + str(current_episode) + '.pth')
                agent.save_model(save_path)

            if (current_episode or self.config.env.model_load_path is not None) and current_episode % \
                    self.config.agent.ddpg.test_actor_step == 0:
                sampler = EpisodeGivenSampler(dataset.test_actor_train_dataset, self.config.env.num_stages,
                                              self.config.env.num_stage_step_test, self.config.env.num_candidates,
                                              total_iters=self.config.env.total_iters)
                test_actor_train_loader = DataLoader(
                    dataset.test_actor_train_dataset, batch_size=self.config.env.num_candidates, shuffle=False,
                    num_workers=self.config.env.workers, drop_last=True, sampler=sampler)

                episode_done_step = len(test_actor_train_loader) - 1
                for i, (input_image, target) in enumerate(test_actor_train_loader):
                    env.adjust_learning_rate_by_stage(episode_step, self.config.env.num_stage_step_test, env.optimizer,
                                                      self.config.env.lr_stages)
                    if self.config.env.reward.reward_option == 'sub_reference_model':
                        env.adjust_learning_rate_by_stage(episode_step, self.config.env.num_stage_step_test,
                                                          env.reference_optimizer, self.config.env.lr_stages)

                    input_image = input_image.cuda()
                    target = target.cuda()

                    current_epoch = episode_step // self.config.env.num_step_per_epoch_test

                    if episode_step < self.config.env.num_warmup_step_test:
                        self.warm_up(env, input_image, target, episode_step, current_epoch, test_actor_test_loader,
                                     tb_logger, b_out, self.config.env.num_stage_step_test)
                    else:
                        done = (episode_step == episode_done_step)
                        self.test_actor(agent, env, episode_step, input_image, target,
                                        test_actor_test_loader, tb_logger, pre_buffer, current_epoch, done)
                    episode_step += 1
            else:
                sampler = EpisodeGivenSampler(dataset.train_actor_train_dataset, self.config.env.num_stages,
                                              self.config.env.num_stage_step, self.config.env.num_candidates,
                                              total_iters=self.config.env.total_iters)
                train_actor_train_loader = DataLoader(
                    dataset.train_actor_train_dataset, batch_size=self.config.env.num_candidates, shuffle=False,
                    num_workers=self.config.env.workers, drop_last=True, sampler=sampler)

                episode_done_step = len(train_actor_train_loader) - 1
                for i, (input_image, target) in enumerate(train_actor_train_loader):
                    env.adjust_learning_rate_by_stage(episode_step, self.config.env.num_stage_step, env.optimizer,
                                                      self.config.env.lr_stages)
                    if self.config.env.reward.reward_option == 'sub_reference_model':
                        env.adjust_learning_rate_by_stage(episode_step, self.config.env.num_stage_step,
                                                          env.reference_optimizer, self.config.env.lr_stages)

                    input_image = input_image.cuda()
                    target = target.cuda()

                    current_epoch = episode_step // self.config.env.num_step_per_epoch
                    if episode_step < self.config.env.num_warmup_step:
                        self.warm_up(env, input_image, target, episode_step, current_epoch, train_actor_val_loader,
                                     tb_logger, b_out, self.config.env.num_stage_step)
                    else:
                        done = episode_step == episode_done_step - 1
                        self.train_actor(agent, env, episode_step, input_image, target, train_actor_val_loader,
                                         tb_logger, pre_buffer, current_epoch, done)
                    episode_step += 1

        if b_out is not None:
            b_out.close()
Example #18
class WorkerAgent(threading.Thread):
    def __init__(self, name, graph_ops, update_ops, world_name, use_target,
                 session, saver):
        super().__init__()

        self.name = name
        self.graph_ops = graph_ops
        self.session = session
        self.saver = saver

        self.graph_ops = graph_ops
        self.update_ops = update_ops

        self.env = Environment(world_name)
        self.env.use_observation_rotation_size(use_target)
        self.env.set_cluster_size(CLUSTER_SIZE)
        self.state_size = self.env.observation_size()
        self.action_size = action_mapper.ACTION_SIZE

    def run(self):
        global global_episode, global_step
        print('Thread {} started.'.format(self.name))

        local_episodes = 0
        accumulated_reward = 0
        best_reward = 0
        epsilon = INITIAL_EPSILON

        state_batch = []
        reward_batch = []
        action_batch = []

        period_start_time = time.time()

        while global_episode <= MAX_EPISODES:
            self.env.reset()
            state, _, _, _ = self.env.step(0, 0)
            state = self.reshape_state(state)

            episode_step = 0
            episode_reward = 0

            while True:
                q_output = self.graph_ops['network']['q_values'].eval(
                    session=self.session,
                    feed_dict={self.graph_ops['network']['input']: [state]})

                if random() <= epsilon:
                    action_index = randrange(self.action_size)
                else:
                    action_index = np.argmax(q_output)

                a_t = np.zeros([self.action_size])
                a_t[action_index] = 1

                if epsilon > final_epsilon:
                    epsilon -= (INITIAL_EPSILON -
                                final_epsilon) / anneal_epsilon_timesteps

                #print("Choosing Action {}".format(action_index))

                x1, x2 = action_mapper.map_action(action_index)
                next_state, reward, term, info = self.env.step(x1, x2, 10)
                next_state = self.reshape_state(next_state)
                episode_reward += reward

                if visualize:
                    self.env.visualize()

                #print("Reward: {} \n\n".format(reward))

                next_q_values = self.graph_ops['target_network'][
                    'q_values'].eval(
                        session=self.session,
                        feed_dict={
                            self.graph_ops['target_network']['input']:
                            [next_state]
                        })

                if not term:
                    reward = reward + gamma * np.amax(next_q_values)

                state_batch.append(state)
                action_batch.append(a_t)
                reward_batch.append(reward)

                if global_step % target_update_timestep == 0:
                    self.session.run(self.update_ops['reset_target_network'])
                    print("Target Net Resetted")

                # start = time.time()
                if episode_step % UPDATE_PERIOD == 0 or term:
                    self.session.run(self.update_ops['minimize'],
                                     feed_dict={
                                         self.update_ops['y']:
                                         reward_batch,
                                         self.update_ops['a']:
                                         action_batch,
                                         self.graph_ops['network']['input']:
                                         state_batch
                                     })

                    state_batch = []
                    action_batch = []
                    reward_batch = []

                # end = time.time()
                # print('Time for updating: ', end - start)

                if global_step % CHECKPOINT_PERIOD_TIMESTEPS == 0:
                    self.saver.save(self.session,
                                    CHECKPOINT_PATH,
                                    global_step=global_step)

                global_step += 1
                state = next_state
                episode_step += 1

                if term:
                    break

            accumulated_reward += episode_reward
            best_reward = episode_reward if (
                episode_reward > best_reward) else best_reward

            local_episodes += 1
            global_episode += 1

            if local_episodes % PRINT_EVERY == 0:
                period_end_time = time.time()
                #writer.add_summary(tf.summary.scalar('AVG Reward', accumulated_reward / PRINT_EVERY))
                print(
                    "Thread {0:}. Total Episodes {1:}. Reward AVG: {2:.3f}, Best Reward: {3:.3f}, Globalstep: {4:6d}, Epsilon: {5:f}, Time: {6:}"
                    .format(self.name, global_episode,
                            accumulated_reward / PRINT_EVERY, best_reward,
                            global_step, epsilon,
                            period_end_time - period_start_time))
                accumulated_reward = 0
                best_reward = -99999
                period_start_time = time.time()

    def reshape_state(self, state):
        return np.reshape(state, [self.state_size, 1])
Example #19
):  # Checking if there are previous training performances saved
    os.remove(performance_file_path)  # Deleting the old train performances
if os.path.exists(
        log):  # Checking if there are previous training performances saved
    os.remove(log)  # Deleting the old train performances

print(dt.now())
print("stop loss:", stop_loss_value)
print("pc: BH")
# ********************************************* Looping over all Episodes ***************-******************************
for ep in range(n_episodes - n_prev_iterations):
    time_start = dt.now()
    total_revenue = 0  # Counts the total reward for a single episode
    print("Iteration: " + str(ep + 1) + "/" +
          str(n_episodes - n_prev_iterations))
    env.reset()  # Resetting the environment
    agent.reset()  # Resetting the agent mini-batch memory
    state, reward = env.step(
        "Hold")  # Make a first neutral action to get the first state

    # ******************************************* Looping over all Instances *******************************************
    while not env.done:  # Loop until we finish all the instances
        action = agent.act(
            state)  # The agent chooses an action based on the current state
        next_state, reward = env.step(
            actions[action]
        )  # Get the next state and reward based on the chosen action
        '''with open(log, "a+") as file:
            file.write(str(actions[action]) + "\n")  # Saving the performance on a file
            if env.stop_loss_triggered:
                file.write("Stop Loss Triggered!" + "\n")  # Saving the stop loss taken on a file
Example #20
        return action

    def process_state_batch(self, batch):
        return batch[:, 0, :]


env = Environment("Simulation2d/svg/proto_1", 6)
env.use_observation_rotation_size(True)
env.set_observation_rotation_size(128)
env.set_mode(Mode.ALL_RANDOM)

processor = ManualProc()
states = env.observation_size()
actions = action_mapper.ACTION_SIZE

if DEBUG:
    print('states: {0}'.format(states))
    print('actions: {0}'.format(actions))

state, reward, done, _ = env.reset()
env.render()

done = False

while not done:
    value = input("Próxima ação: [0 - 6]: \n")
    action = int(value)
    state, reward, done, _ = env.step(action)
    #print ('reward: {0}'.format(reward))
    env.render()
Example #21
class Worker(object):
    def __init__(self, name, globalAC):
        if MULTIPLE_ROOMS:
            if name == "W_0" or name == "W_1" or name == "W_2":
                self.env = Environment(ENV_NAME)
            elif name == "W_3" or name == "W_4" or name == "W_5":
                self.env = Environment(ENV_NAME_2)
            else:
                self.env = Environment(ENV_NAME_3)
        else:
            self.env = Environment(ENV_NAME)

        self.env.set_cluster_size(CLUSTER_SIZE)
        self.env.set_observation_rotation_size(64)  # TODO
        self.env.use_observation_rotation_size(True)
        self.name = name
        self.AC = ACNet(name, globalAC)

    def convert_action(self, action):
        angular = 0
        linear = 0

        if action == 0:
            angular = 1.0
            linear = 0.5
        elif action == 1:
            angular = 0.5
            linear = 0.75
        elif action == 2:
            angular = 0.0
            linear = 1.0
        elif action == 3:
            angular = -0.5
            linear = 0.75
        else:
            angular = -1.0
            linear = 0.5

        return linear, angular

    def work(self):
        global GLOBAL_RUNNING_R, GLOBAL_EP
        total_step = 1
        buffer_s, buffer_a, buffer_r = [], [], []
        while not COORD.should_stop() and GLOBAL_EP < MAX_GLOBAL_EP:
            s, _, _, _ = self.env.reset()
            s = np.reshape(s, [1, N_S])
            ep_r = 0
            # rnn_state = SESS.run(self.AC.init_state)    # zero rnn state at beginning
            # keep_state = deepcopy(rnn_state)      # keep rnn state for updating global net
            for ep_t in range(MAX_EP_STEP):

                # a, rnn_state_ = self.AC.choose_action(s, rnn_state)  # get the action and next rnn state
                a = self.AC.choose_action(s)  # get the action
                b = np.asarray(a)
                b = b[0][0]

                action = np.argmax(b)

                linear, angular = self.convert_action(action)

                s_, r, done, _ = self.env.step(linear, angular, SKIP_LRF)
                s_ = np.reshape(s_, [1, N_S])

                # if (self.name == 'W_0' or self.name == "W_3") and VISUALIZE:
                if (self.name == 'W_0') and VISUALIZE:
                    self.env.visualize()

                done = True if ep_t == MAX_EP_STEP - 1 else done

                ep_r += r
                buffer_s.append(s)
                buffer_a.append(b)
                buffer_r.append(r)
                # buffer_r.append((r+8)/8)    # normalize

                if total_step % UPDATE_GLOBAL_ITER == 0 or done:  # update global and assign to local net
                    if done:
                        v_s_ = 0  # terminal
                    else:
                        # v_s_ = SESS.run(self.AC.v, {self.AC.s: s_, self.AC.init_state: rnn_state_})[0, 0]
                        v_s_ = SESS.run(self.AC.v, {self.AC.s: s_})[0, 0]
                    buffer_v_target = []
                    for r in buffer_r[::-1]:  # reverse buffer r
                        v_s_ = r + GAMMA * v_s_
                        buffer_v_target.append(v_s_)
                    buffer_v_target.reverse()

                    buffer_s, buffer_a, buffer_v_target = np.vstack(
                        buffer_s), np.vstack(buffer_a), np.vstack(
                            buffer_v_target)
                    feed_dict = {
                        self.AC.s: buffer_s,
                        self.AC.a_his: buffer_a,
                        self.AC.v_target: buffer_v_target,
                        # self.AC.init_state: keep_state,
                    }

                    self.AC.update_global(feed_dict)
                    buffer_s, buffer_a, buffer_r = [], [], []
                    self.AC.pull_global()

                    # keep_state = deepcopy(rnn_state_)   # replace the keep_state as the new initial rnn state_

                s = s_
                # rnn_state = rnn_state_  # renew rnn state
                total_step += 1

                if done:
                    if len(GLOBAL_RUNNING_R
                           ) == 0:  # record running episode reward
                        GLOBAL_RUNNING_R.append(ep_r)
                    else:
                        GLOBAL_RUNNING_R.append(0.9 * GLOBAL_RUNNING_R[-1] +
                                                0.1 * ep_r)

                    if self.name == "W_0":
                        print(self.name, "Ep:", GLOBAL_EP, "Ep_r:", ep_r)
                        # print(
                        #     self.name,
                        #     "Ep:", GLOBAL_EP,
                        #     "| Ep_r: %i" % GLOBAL_RUNNING_R[-1],
                        #       )
                    GLOBAL_EP += 1
                    if GLOBAL_EP % SAVE_INTERVAL == 0:
                        print("Versuche zu Speichern...")
                        self.AC.save_global()
                        print("...gespeichert!")
                    break
Example #22
class Worker(object):
    def __init__(self, name, globalAC):
        self.env = Environment(ENV_NAME)
        self.env.set_cluster_size(CLUSTER_SIZE)
        self.name = name
        self.AC = ACNet(name, globalAC)

    def convert_action(self, action):
        angular = 0
        linear = 0

        if action == 0:
            angular = 1.0
            linear = 0.5
        elif action == 1:
            angular = 0.5
            linear = 0.75
        elif action == 2:
            angular = 0.0
            linear = 1.0
        elif action == 3:
            angular = -0.5
            linear = 0.75
        else:
            angular = -1.0
            linear = 0.5

        return linear, angular

    def work(self):
        global GLOBAL_RUNNING_R, GLOBAL_EP
        total_step = 1
        buffer_s, buffer_a, buffer_r = [], [], []
        while not COORD.should_stop() and GLOBAL_EP < MAX_GLOBAL_EP:
            s, _, _, _ = self.env.reset()
            s = np.reshape(s, [1, N_S])
            ep_r = 0
            rnn_state = SESS.run(
                self.AC.init_state)  # zero rnn state at beginning
            keep_state = rnn_state.copy(
            )  # keep rnn state for updating global net
            for ep_t in range(MAX_EP_STEP):
                if self.name == 'W_0':
                    self.env.visualize()
                a, rnn_state_ = self.AC.choose_action(
                    s, rnn_state)  # get the action and next rnn state

                action = np.argmax(a)

                linear, angular = self.convert_action(action)

                s_, r, done, _ = self.env.step(
                    linear, angular,
                    10)  # The number means: skip that many laser scans
                s_ = np.reshape(s_, [1, N_S])

                done = True if ep_t == MAX_EP_STEP - 1 else done

                ep_r += r
                buffer_s.append(s)
                buffer_a.append(a)
                buffer_r.append(r)
                # buffer_r.append((r+8)/8)    # normalize

                if total_step % UPDATE_GLOBAL_ITER == 0 or done:  # update global and assign to local net
                    if done:
                        v_s_ = 0  # terminal
                    else:
                        v_s_ = SESS.run(self.AC.v, {
                            self.AC.s: s_,
                            self.AC.init_state: rnn_state_
                        })[0, 0]
                    buffer_v_target = []
                    for r in buffer_r[::-1]:  # reverse buffer r
                        v_s_ = r + GAMMA * v_s_
                        buffer_v_target.append(v_s_)
                    buffer_v_target.reverse()

                    buffer_s, buffer_a, buffer_v_target = np.vstack(
                        buffer_s), np.vstack(buffer_a), np.vstack(
                            buffer_v_target)

                    feed_dict = {
                        self.AC.s: buffer_s,
                        self.AC.a_his: buffer_a,
                        self.AC.v_target: buffer_v_target,
                        self.AC.init_state: keep_state,
                    }

                    self.AC.update_global(feed_dict)
                    buffer_s, buffer_a, buffer_r = [], [], []
                    self.AC.pull_global()
                    keep_state = rnn_state_.copy(
                    )  # replace the keep_state as the new initial rnn state_

                s = s_
                rnn_state = rnn_state_  # renew rnn state
                total_step += 1
                if self.name == 'W_0':
                    self.env.visualize()
                if done:
                    if len(GLOBAL_RUNNING_R
                           ) == 0:  # record running episode reward
                        GLOBAL_RUNNING_R.append(ep_r)
                    else:
                        GLOBAL_RUNNING_R.append(0.9 * GLOBAL_RUNNING_R[-1] +
                                                0.1 * ep_r)
                    print(
                        self.name,
                        "Ep:",
                        GLOBAL_EP,
                        "| Ep_r: %i" % GLOBAL_RUNNING_R[-1],
                    )
                    GLOBAL_EP += 1
                    break
Example #23
def main(test=False, chkpt=None, device='cuda'):
    """
    main is used to start and perform the training in non-render mode
    :param test: Not required
    :param chkpt: Not required
    :param device: string (cuda or cpu)
    :return: None
    """
    # device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    if not test:
        wandb.init(project="MultiSection Continum",
                   name="Reaching Task 32 Per Layer")

    robot = Robot()
    robot.newSection()
    robot.newSection()

    env = Environment(robot)
    if test:
        # env.staticPoint([-9.966711079379195, 99.3346653975306])
        env.render()
    # else:
    #     env.staticPoint([-9.966711079379195, 99.3346653975306])

    lastObs = env.getObservation()

    rb = ReplayBuffer()

    memorySize = 500000
    minRBSize = 20000

    sampleSize = 750

    envStepsBeforeTrain = 250

    targetModelUpdate = 500

    epsMin = 0.01
    epsDecay = 0.99999

    model = Model(len(lastObs.state), len(env.robot.actions)).to(device)
    if chkpt is not None:
        model.load_state_dict(torch.load(chkpt))

    targetModel = Model(len(lastObs.state), len(env.robot.actions)).to(device)
    updateTGTModel(model, targetModel)

    stepSinceTrain = 0
    # stepSinceTrain keeps track of the number of steps since the last main network training
    # in this case main network updates after every envStepsBeforeTrain

    stepSinceTGTUpdate = 0
    # stepSinceTGTUpdate keeps track of the number of steps since the last target network update (i.e. transferring main network weights)
    # in this case the target network updates after every targetModelUpdate

    stepNum = -1 * minRBSize

    episodeRewards = []
    rollingReward = 0

    # Copying over the weights
    tq = tqdm()
    # Work in progress
    while True:
        if test:
            env.render()
            time.sleep(0.05)
        tq.update(1)
        eps = epsDecay**(stepNum / 10)
        if test:
            eps = 0

        if random() < eps:
            # print("Taking random action")
            action = env.robot.randomAction()
        else:
            actNum = model(torch.tensor(
                lastObs.state).to(device)).max(-1)[-1].item()
            action = env.robot.actions[actNum]

        obs = env.robotStep(action[0], action[1])

        rollingReward = obs.reward

        # print(obs)
        # # env.render()
        # x = model(torch.Tensor(obs.state))
        # # print(x)
        #
        episodeRewards.append(rollingReward)
        #
        # if stepSinceTGTUpdate > targetModelUpdate:
        # # if env.done():
        #     episodeRewards.append(rollingReward)
        #     if test:
        #         print(rollingReward)
        #     print(episodeRewards)
        #     rollingReward = 0
        #     # env.reset()
        if env.done():
            env.reset()
            # env.staticPoint([-9.966711079379195, 99.3346653975306])

        # obs.reward = obs.reward / 100

        stepSinceTrain += 1
        stepNum += 1
        rb.insert(obs)
        if (
                not test
        ) and rb.index >= minRBSize and stepSinceTrain > envStepsBeforeTrain:
            stepSinceTGTUpdate += 1
            loss = trainStep(rb.sample(sampleSize), model, targetModel,
                             len(env.robot.actions), device)
            wandb.log(
                {
                    "Loss": loss.detach().cpu().item(),
                    "eps": eps,
                    "Step Rewards:": np.mean(episodeRewards)
                },
                step=stepNum)
            stepSinceTrain = 0

            if stepSinceTGTUpdate > targetModelUpdate:
                print("Updating Target Model")
                updateTGTModel(model, targetModel)
                stepSinceTGTUpdate = 0
                torch.save(
                    targetModel.state_dict(),
                    f"/u/meharabd/research/CRLMachineLearningProject/Models/{stepNum}.pth"
                )
                episodeRewards = []
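Note: trainStep is not included in this example. A rough PyTorch sketch of the DQN update it presumably performs (target-network bootstrap, MSE loss on the chosen action's Q-value; the transition layout and the extra optimizer argument are assumptions) might be:

import torch
import torch.nn.functional as F

def train_step_sketch(batch, model, target_model, optimizer, device, gamma=0.99):
    # Hypothetical transition layout: each element of batch carries
    # state, action index, reward, next_state and done.
    states = torch.tensor([t.state for t in batch], dtype=torch.float32, device=device)
    actions = torch.tensor([t.action for t in batch], dtype=torch.int64, device=device)
    rewards = torch.tensor([t.reward for t in batch], dtype=torch.float32, device=device)
    next_states = torch.tensor([t.next_state for t in batch], dtype=torch.float32, device=device)
    dones = torch.tensor([float(t.done) for t in batch], device=device)

    with torch.no_grad():
        # Bootstrap from the frozen target network.
        next_q = target_model(next_states).max(dim=1).values
    targets = rewards + gamma * next_q * (1.0 - dones)

    q = model(states).gather(1, actions.unsqueeze(1)).squeeze(1)
    loss = F.mse_loss(q, targets)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss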