Example #1
    def record(self, time, game_tick_packet):

        s = EasyGameState(game_tick_packet, self.team, self.index)
        # trace(s.car.on_ground)
        trace(s.car.pos[0])
        trace(s.car.pos[1])

        time = time - self.record_start_time

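        # Capture the controller's current inputs as a tuple; round() maps the
        # analog axes to -1/0/1 and the button flags to 0/1.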
        output_vector = (
            round(controller.fThrottle),
            round(controller.fSteer),
            round(controller.fPitch),
            round(controller.fYaw),
            round(controller.fRoll),
            round(controller.bJump),
            round(controller.bBoost),
            round(controller.bHandbrake),
        )

        history_item = historian.HistoryItem(
            float(time),
            # game_tick_packet,
            # output_vector,
        )
        history_item.output_vector = output_vector
        history_item.game_tick_packet = game_tick_packet
        self.history.append(history_item)
        # if self.first_time:
        #     print (history_item.encode())
        #     self.first_time = False

        return output_vector
Example #2
	def get_output_vector(self, game_tick_packet):

			# Uncomment a different line to switch the training mode
		output = self.reinforced_play(game_tick_packet)
		# output = self.supervised_play(game_tick_packet)
		# output = self.obvious_play(game_tick_packet)

			## Debugging output every 50 steps
		if self.debug and (self.state['Step']%50)==0:
			trace(np.mean(self.epi_rewards))
			trace(Car(Get_car(game_tick_packet,self.index)).loc.c_2d())

			## Count steps; at MaxStep, reset the counter and run a training pass
		if self.state['Step'] >= self.state['MaxStep']:

				# Uncomment a different line to switch the training mode
			self.reinforced_train()
			# self.supervised_train()
			# self.obvious_train()


			self.state['TrainStep'] += 1
			self.state['Step'] = 0

			if self.state['TrainStep'] >= self.state['SaveAt']:
				self.model.save("V3_S_{}".format(self.name))
				self.state['TrainStep'] = 0

		else:
			self.state['Step'] += 1

		return self.Format_Output(output)
Example #3
def optimize_model():
    if len(memory) < BATCH_SIZE:
        return
    episodes = memory.sample(BATCH_SIZE)
    opt.zero_grad()
    batch_errors = []

    for states, actions, rewards, next_states, dones in episodes:
        # opt.zero_grad()
        actions_tensor = torch.tensor(actions, device=device).long().unsqueeze(1)
        rewards_tensor = np_to_device(rewards)

        states_tensor = np_to_device(states).view((-1, 1, 4))
        # state_values, hidden = model(states_tensor, model.initial_hidden(1))
        # print(model(states_tensor, model.initial_hidden(1))[0].shape, actions_tensor.shape)
        # print(model(states_tensor, model.initial_hidden(1))[0].squeeze(1).shape)
        state_values = model(states_tensor, model.initial_hidden(1))[0].squeeze(1)
        state_action_values = state_values.gather(1, actions_tensor)
        # print(state_action_values)

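        # Bootstrapped targets: each step uses the max Q-value of the following
        # state in this sampled sequence; the final step has no successor, so
        # its next-state value stays zero.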
        next_state_values = torch.zeros(len(states), device=device)
        # print(state_values.shape)
        # print(state_values[1:].max(1)[0].shape)
        next_state_values[:-1] = state_values[1:].max(1)[0].detach()
        expected_state_action_values = (next_state_values * GAMMA) + rewards_tensor

        # import numpy as np
        # print(np.array([dones, state_action_values.squeeze().detach().cpu().numpy(), rewards_tensor.cpu().numpy(), expected_state_action_values.cpu().numpy()]).T)
        # print(expected_state_action_values)
        # print(state_action_values)
        errors = torch.abs(state_action_values - expected_state_action_values.unsqueeze(1))
        batch_errors.append(errors.flatten())
        # batch_errors.append(torch.mean(errors).flatten())

        # Compute Huber loss
        # loss = torch.mean(torch.where(errors < 1, 0.5 * errors ** 2, errors - 0.5))
        # loss = torch.mean(errors ** 2)
        # print(loss)
        # trace(loss.cpu().item())
        # loss.backward()
        # for param in model.parameters():
        #     param.grad.data.clamp_(-1, 1)
        # opt.step()
    # print(batch_errors)
    # print([_x.shape for _x in batch_errors])
    batch_errors = torch.cat(batch_errors)
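    # Huber (smooth L1) loss: quadratic for errors below 1, linear above, which
    # keeps large TD errors from dominating the gradient.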
    loss = torch.mean(torch.where(batch_errors < 1, 0.5 * batch_errors ** 2, batch_errors - 0.5))
    print(loss)
    trace(loss.cpu().item())
    loss.backward()
    # for param in model.parameters():
    #     param.grad.data.clamp_(-1, 1)
    opt.step()
Example #4
def optimize_model():
    if len(memory) < WARMUP:
        return
    if len(memory) == WARMUP:
        print("Memory warmed up")

    for i in range(5):
        episodes = memory.sample(BATCH_SIZE)
        opt.zero_grad()
        batch_errors = []

        for states, actions, rewards, next_states, dones in episodes:
            actions_tensor = torch.tensor(actions,
                                          device=device).long().unsqueeze(1)
            rewards_tensor = np_to_device(rewards)

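            # The stored observations are channels-last arrays (N, H, W, C);
            # transpose them to the channels-first (N, C, H, W) layout that
            # PyTorch conv layers expect.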
            states_tensor = torch.from_numpy(
                np.transpose(states, axes=[0, 3, 1, 2])).to(device).float()
            # print(states_tensor.shape)
            state_values = model(states_tensor,
                                 model.initial_hidden(1))[0].squeeze(1)
            state_action_values = state_values.gather(1, actions_tensor)
            # print(state_action_values)

            next_state_values = torch.zeros(len(states), device=device)
            next_state_values[:-1] = state_values[1:].max(1)[0].detach()
            expected_state_action_values = (next_state_values *
                                            GAMMA) + rewards_tensor

            # import numpy as np
            # print(np.array([dones, state_action_values.squeeze().detach().cpu().numpy(), rewards_tensor.cpu().numpy(), expected_state_action_values.cpu().numpy()]).T)
            errors = torch.abs(state_action_values -
                               expected_state_action_values.unsqueeze(1))
            batch_errors.append(errors.flatten())

        batch_errors = torch.cat(batch_errors)
        loss = torch.mean(
            torch.where(batch_errors < 1, 0.5 * batch_errors**2,
                        batch_errors - 0.5))
        print(loss)
        trace(loss.cpu().item())
        loss.backward()
        # for param in model.parameters():
        #     param.grad.data.clamp_(-1, 1)
        opt.step()
Example #5
    def optimize_model(self, batch_size, gamma, get_tensor_from_obs):
        for i in range(1):
            episodes = self.memory.sample(batch_size)
            self.optimizer.zero_grad()
            batch_errors = []

            for states, actions, rewards, next_states, dones in episodes:
                actions_tensor = torch.tensor(
                    actions, device=self.device).long().unsqueeze(1)
                rewards_tensor = self.np_to_device(rewards)

                states_tensor = get_tensor_from_obs(states)
                # print(states_tensor.shape)
                state_values = self.model(
                    states_tensor, self.model.initial_hidden(1))[0].squeeze(1)
                state_action_values = state_values.gather(1, actions_tensor)
                # print(state_action_values)

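                # Evaluate successor states with the separate target network,
                # so the bootstrapped targets are decoupled from the online
                # model being updated.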
                target_next_state_values = torch.zeros(len(states),
                                                       device=self.device)
                target_state_values = self.target_model(
                    states_tensor, self.model.initial_hidden(1))[0].squeeze(1)
                target_next_state_values[:-1] = target_state_values[1:].max(
                    1)[0].detach()
                expected_state_action_values = (target_next_state_values *
                                                gamma) + rewards_tensor

                # import numpy as np
                # print(np.array([dones, state_action_values.squeeze().detach().cpu().numpy(), rewards_tensor.cpu().numpy(), expected_state_action_values.cpu().numpy()]).T)
                errors = torch.abs(state_action_values -
                                   expected_state_action_values.unsqueeze(1))
                batch_errors.append(errors.flatten())

            batch_errors = torch.cat(batch_errors)
            loss = torch.mean(
                torch.where(batch_errors < 1, 0.5 * batch_errors**2,
                            batch_errors - 0.5))
            print(loss)
            trace(loss.cpu().item())
            loss.backward()
            # for param in model.parameters():
            #     param.grad.data.clamp_(-1, 1)
            self.optimizer.step()
Example #6
    def train_step(self,
                   formatted_input,
                   formatted_output,
                   rewards=None,
                   batch_size=1):
        self.optimizer.zero_grad()

        formatted_input = [
            self.torch.from_numpy(x).float() for x in formatted_input
        ]
        formatted_output = self.torch.from_numpy(formatted_output).float()

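        # Supervised step: run the actor network on the batch and regress its
        # output toward the provided targets.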
        network_output = self.actor_model.forward(*formatted_input)

        loss = self.loss_function(network_output, formatted_output)
        loss.backward()
        # for i in range(9):
        #     trace(self.loss_function(network_output[:, i], formatted_output[:, i]).item(), key=i)
        trace(loss.item(), key='loss')

        self.optimizer.step()
Example #7
    def episode(self):
        self.simulation.random_state()
        reward = torch.zeros((self.simulation.o.shape[0], steps),
                             device=device)

        for i in range(steps):
            self.simulation.step(delta_time)

            reward[:, i] = self.simulation.error().neg() + rotation_eps

        trace((reward > 0).float().sum(1).mean(0).item(),
              reset_on_parent_change=False,
              key='frames done')

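        # Fold the rewards backwards in time with self.andt, so each frame's
        # credit also depends on the frames that come after it.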
        reward[:, steps - 1] = self.andt(reward[:, steps - 1])
        for i in reversed(range(steps - 1)):
            reward[:, i] = self.andt(reward[:, i], reward[:, i + 1])

        loss = reward.sum(1).mean(0).neg()

        # if average_reward.item() > self.max_reward:
        #     self.max_reward = average_reward.item()
        #     torch.save(self.policy.state_dict(), f'{model_name}_{round(self.max_reward, 1)}.mdl')
        #     torch.save(self.optimizer.state_dict(), f'{model_name}_{round(self.max_reward, 1)}.state')

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        trace(loss.item(), reset_on_parent_change=False, key='loss')
        trace((reward < 0).sum(1).float().mean(0).item(),
              reset_on_parent_change=False,
              key='frame weight')
Example #8
def main():

    # Trace some dummy data
    for i in range(120*60):
        # Demo: Simple and common use case
        trace(30 * math.sin(i/30))

        # Demo: Vector display and having multiple displays in one view_box
        t = i / 100
        t += math.sin(t)
        trace([math.cos(t) * 2, math.cos(20*t)], view_box='Wave-like things')
        trace([math.cos(t) + 5, math.sin(t)   ], view_box='Wave-like things')

        # Custom display
        trace(almost_fizzbuzz(i), custom_display=StringCounter)

        time.sleep(1/60.)  # Simulate an external main loop
Example #9
    def get_output_vector(self, game_tick_packet):
        s = EasyGameState(game_tick_packet, self.team, self.index)
        speed = mag(s.car_vel)
        turn_rate = game_tick_packet.gamecars[
            self.index].AngularVelocity.Z  # rad/s
        turn_radius = speed / max(turn_rate, 0.001)

        if self.start_time is None:
            self.start_time = s.time
        time_elapsed = s.time - self.start_time
        desired_speed = 10 + time_elapsed * 100

        too_slow = desired_speed > speed
        should_boost = desired_speed > 1000 and too_slow
        pedal = too_slow
        if desired_speed < 500: pedal *= 0.5

        trace(speed)
        # trace(turn_rate)
        trace(turn_radius)
        # trace(desired_speed)
        trace(turn_radius - estimate_turn_radius(speed))

        output_vector = [
            pedal,  # fThrottle
            1,  # fSteer
            0,  # fPitch
            0,  # fYaw
            0,  # fRoll
            0,  # bJump
            should_boost,  # bBoost
            0,  # bHandbrake
        ]

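        # hat_toggle_west acts as a record switch: when it is off, hand control
        # back to the controller and dump the collected
        # (desired_speed, turn_radius) measurements; when it is on, keep
        # measuring.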
        if not controller.hat_toggle_west:
            if self.measurements:
                print('TADA:')
                print(repr(self.measurements))
            output_vector = (
                round(controller.fThrottle),
                round(controller.fSteer),
                round(controller.fPitch),
                round(controller.fYaw),
                round(controller.fRoll),
                round(controller.bJump),
                round(controller.bBoost),
                round(controller.bHandbrake),
            )
            self.start_time = None
            self.measurements = []
        else:
            # self.start_time = s.time
            self.measurements.append((desired_speed, turn_radius))

        return sanitize_output_vector(output_vector)
Example #10
    def episode(self):
        self.simulation.random_state()

        reward = torch.zeros((self.simulation.o.shape[0], ), device=device)
        framesDone = torch.zeros((self.simulation.o.shape[0], ), device=device)

        # profile()
        # sys.exit()

        for i in range(steps):
            self.simulation.step(delta_time)
            diff = rotation_eps - self.simulation.error()
            # reward *= 0.8
            # reward += diff.clamp(max=0)

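            # Accumulate a penalty whenever the rotation error exceeds
            # rotation_eps; once the policy usually reaches the target
            # (self.reachesEnd), clamp the per-step penalty at -rotation_eps/2.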
            reward += diff.clamp(
                max=0, min=-rotation_eps / 2 if self.reachesEnd else None)

            finished = (diff > 0).float()
            # reward = diff.clamp(max=0)
            framesDone += 1
            framesDone *= finished
            # reward = finished * (reward + 1)
            # if i == steps-1:
            #     framesDone = reward.clone().detach()
            #     reward += diff.clamp(max=0) / rotation_eps
        # reward = framesDone

        trace(((steps - framesDone) * delta_time * 120).mean(0).item(),
              reset_on_parent_change=False,
              key='game frames to destination')
        failed = (framesDone == 0).float().mean(0).item()
        self.reachesEnd = failed < 0.2
        trace(failed, reset_on_parent_change=False, key='amount failed')

        # reward[:, steps - 1] = self.andt(reward[:, steps - 1])
        # for i in reversed(range(steps - 1)):
        #     reward[:, i] = self.andt(reward[:, i], reward[:, i+1])

        loss = reward.mean(0).neg()

        # average_reward = sum(reward[:, steps - 1]) / len(reward[:, steps - 1])
        # if average_reward.item() > self.max_reward:
        #     self.max_reward = average_reward.item()
        #     torch.save(self.policy.state_dict(), f'out/{model_name}_{round(self.max_reward, 1)}.mdl')
        #     torch.save(self.optimizer.state_dict(), f'out/{model_name}_{round(self.max_reward, 1)}.state')

        self.optimizer.zero_grad()
        loss.backward()  # spits out error
        self.optimizer.step()
        trace(loss.item(), reset_on_parent_change=False, key='loss')
Example #11
def main():
    import math
    import time
    from quicktracer import trace

    # Demo: Trace some dummy data
    for i in range(40 * 60):
        # Simple and common use case
        trace(30 * math.sin(i / 30), view_box="view1")
        trace(30 * math.cos(i / 30), view_box="view1")

        # Vectors are supported
        t = i / 100
        t += math.sin(t)
        trace([math.cos(t), math.cos(30 * t)])

        # Custom display
        trace(almost_fizzbuzz(i), custom_display=StringCounter)

        time.sleep(1 / 60.)  # Simulate an external main loop
Example #12
def select_action(state, hidden):
    global steps_done

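    # Epsilon-greedy exploration: the threshold decays exponentially from
    # EPS_START toward EPS_END as steps_done grows.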
    sample = random.random()
    eps_threshold = EPS_END + (EPS_START - EPS_END) * math.exp(-1. * steps_done / EPS_DECAY)
    trace(eps_threshold)
    trace(steps_done)
    steps_done += 1
    # print(state.shape, "oipetreioreuio")
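    # The model is recurrent, so the hidden state is threaded through
    # successive calls.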
    out, hidden = agent.model(get_tensor_from_obs(state), hidden)

    if sample > eps_threshold:
        with torch.no_grad():
            action = out.max(2)[1].item()
            trace(action)
    else:
        action = random.getrandbits(2)
    return action, hidden
Example #13
def select_action(state, hidden):
    global steps_done

    sample = random.random()
    eps_threshold = EPS_END + (EPS_START - EPS_END) * \
                    math.exp(-1. * steps_done / EPS_DECAY)
    trace(eps_threshold)
    trace(steps_done)
    steps_done += 1
    out, hidden = model(np_to_device(state).view((1, 1, -1)), hidden)

    if sample > eps_threshold:
        with torch.no_grad():
            # out.max(2) takes the max over the action dimension; its second
            # element holds the argmax indices, so we pick the action with the
            # larger expected reward.
            print(out)
            action = out.max(2)[1].item()
            trace(action)
    else:
        action = random.getrandbits(1)
    return action, hidden
Example #14
episode_rewards = []
for i_episode in range(num_episodes):
    # Initialize the environment and state
    state = get_obs(env.reset())
    step = 0
    episode_reward = 0
    hidden = model.initial_hidden(1)
    while True:
        # env.render()

        action, hidden = select_action(state, hidden)
        next_state, reward, done, _ = env.step(action)
        reward = reward if not done else -1
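        # Override the reward with -1 on the terminal step so episode ends are
        # explicitly penalized.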
        used_action = action
        trace(used_action)
        next_state = get_obs(next_state)
        trace(reward)

        memory.add(state, action, reward, done)

        state = next_state

        episode_reward += reward
        step += 1

        if done:
            episode_rewards.append(episode_reward)
            print(
                f"Ep: {len(episode_rewards): 3d}, \tstep: {step: 4d}, \treward: {int(episode_reward): 2d}, \taverage_reward: {np.mean(episode_rewards[-20:]):.2f}"
            )
            break
Example #15
    def _trace_logs(self, logs):
        for metric in self.params['metrics']:
            if metric in logs:
                trace(float(logs[metric]), key=metric)
Example #16
    # Initialize the environment and state
    state = env.reset()
    step = 0
    hidden = model.initial_hidden(1)
    while True:
        # env.render()

        # Select and perform an action
        action, hidden = select_action(state, hidden)
        # print(f"a: {action}")
        next_state, reward, done, _ = env.step(action)
        # print(reward, done)
        # reward = 0 if done else reward
        # Store the transition in memory
        memory.add(state, action, reward, done)

        # Move to the next state
        state = next_state

        if done:
            print(step)
            episode_duration = step
            trace(episode_duration)
            break
        step += 1
    # Perform one step of the optimization
    optimize_model()
    # Update the target network, copying all weights and biases in DQN
    # if i_episode % TARGET_UPDATE == 0:
    #     target_net.load_state_dict(policy_net.state_dict())
Example #17
    state = get_obs(env.reset())
    step = 0
    while True:
        env.render()

        # Select and perform an action
        action = select_action(state)
        # print(f"a: {action}")
        next_state, reward, done, _ = env.step(action)
        next_state = get_obs(next_state)
        # print(reward, done)
        # reward = 0 if done else reward
        # Store the transition in memory
        memory.add(state, action, reward, next_state, done)

        # Move to the next state
        state = next_state

        # Perform one step of the optimization
        optimize_model()
        if done:
            # episode_durations.append(t + 1)
            # plot_durations()
            print(step)
            trace(step)
            break
        step += 1
    # Update the target network, copying all weights and biases in DQN
    # if i_episode % TARGET_UPDATE == 0:
    #     target_net.load_state_dict(policy_net.state_dict())