Example #1
def init_dqn(args):
    """Intitialises and returns the necessary objects for
       Deep Q-learning:
       Q-network, target network, replay buffer and optimizer.
    """
    logging.info(
        "Initialisaling DQN with architecture {} and optimizer {}".format(
            args.dqn_archi, args.optimizer_agent))
    if args.dqn_archi == 'mlp':
        q_net = DQN(args.obs_shape, args.n_actions, args)
        q_target = DQN(args.obs_shape, args.n_actions, args)
    else:
        assert args.dqn_archi == 'cnn'
        q_net = CnnDQN(args.obs_shape, args.n_actions, args)
        q_target = CnnDQN(args.obs_shape, args.n_actions, args)
    if args.optimizer_agent == 'RMSProp':
        optimizer_agent = optim.RMSprop(q_net.parameters(),
                                        lr=args.lr_agent,
                                        weight_decay=args.lambda_agent)
    else:
        assert args.optimizer_agent == 'Adam'
        optimizer_agent = optim.Adam(q_net.parameters(),
                                     lr=args.lr_agent,
                                     weight_decay=args.lambda_agent)
    q_target.load_state_dict(
        q_net.state_dict())  # set params of q_target to be the same
    replay_buffer = ReplayBuffer(args.replay_buffer_size)

    if args.epsilon_annealing_scheme == 'linear':
        epsilon_schedule = LinearSchedule(schedule_timesteps=int(
            args.exploration_fraction * args.n_agent_steps),
                                          initial_p=args.epsilon_start,
                                          final_p=args.epsilon_stop)
    else:
        assert args.epsilon_annealing_scheme == 'exp'
        epsilon_schedule = ExpSchedule(decay_rate=args.epsilon_decay,
                                       final_p=args.epsilon_stop,
                                       initial_p=args.epsilon_start)

    return q_net, q_target, replay_buffer, optimizer_agent, epsilon_schedule
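
The LinearSchedule and ExpSchedule classes referenced above are not shown in this example. Below is a minimal sketch of what they might look like, assuming a Baselines-style value(t) interface; the actual implementations in the original codebase may differ.

class LinearSchedule:
    """Linearly anneal epsilon from initial_p to final_p over schedule_timesteps."""

    def __init__(self, schedule_timesteps, initial_p=1.0, final_p=0.05):
        self.schedule_timesteps = schedule_timesteps
        self.initial_p = initial_p
        self.final_p = final_p

    def value(self, t):
        # fraction of the annealing period elapsed so far, capped at 1
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)


class ExpSchedule:
    """Exponentially decay epsilon from initial_p towards final_p."""

    def __init__(self, decay_rate, final_p, initial_p=1.0):
        self.decay_rate = decay_rate
        self.final_p = final_p
        self.initial_p = initial_p

    def value(self, t):
        # never decay below the floor final_p
        return max(self.final_p, self.initial_p * self.decay_rate ** t)
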
Example #2
def train_DQN(env: WrapIt, Q: DQN, Q_target: DQN, optimizer: namedtuple,
              replay_buffer: ReplayBuffer, exploration: Schedule):
    """
    @parameters
        Q:
        Q_target:
        optimizer: torch.nn.optim.Optimizer with parameters
        buffer: store the frame
    @return
        None
    """
    assert type(env.observation_space) == gym.spaces.Box
    assert type(env.action_space) == gym.spaces.Discrete

    optimizer = optimizer.constructor(Q.parameters(), **optimizer.kwargs)

    num_actions = env.action_space.n
    num_param_updates = 0
    mean_episode_reward = -float('nan')
    best_mean_episode_reward = -float('inf')
    LOG_EVERY_N_STEPS = 10000
    last_obs = env.reset(passit=True)

    # Q.getSummary()

    out_count = 0
    bar = tqdm(range(ARGS.timesteps))
    for t in bar:
        last_idx = replay_buffer.store_frame(last_obs)
        recent_observations = replay_buffer.encode_recent_observation()
        if t > ARGS.startepoch:
            value = select_epsilon_greedy_action(Q, recent_observations,
                                                 exploration, t, num_actions)
            action = value[0, 0]
        else:
            action = random.randrange(num_actions)
        obs, reward, done, _ = env.step(action)
        reward = max(-1.0, min(reward, 1.0))
        replay_buffer.store_effect(last_idx, action, reward, done)

        if done:
            obs = env.reset()
        last_obs = obs
        # bar.set_description(f"{obs.shape} {obs.dtype}")

        if (t > ARGS.startepoch and t % ARGS.dqn_freq == 0
                and replay_buffer.can_sample(ARGS.batchsize)):
            bar.set_description("backward")
            (obs_batch, act_batch, rew_batch, next_obs_batch,
             done_mask) = replay_buffer.sample(ARGS.batchsize)
            (obs_batch, act_batch, rew_batch, next_obs_batch,
             not_done_mask) = TENSOR(obs_batch, act_batch, rew_batch,
                                     next_obs_batch, 1 - done_mask)
            (obs_batch, act_batch, rew_batch, next_obs_batch,
             not_done_mask) = TO(obs_batch, act_batch, rew_batch,
                                 next_obs_batch, not_done_mask)

            values = Q(obs_batch)
            current_Q_values = values.gather(
                1,
                act_batch.unsqueeze(1).long()).squeeze()
            # Compute next Q value based on which action gives max Q values
            # Detach variable from the current graph since we don't want gradients for the next Q values to propagate
            next_max_q = Q_target(next_obs_batch).detach().max(1)[0]
            next_Q_values = not_done_mask * next_max_q
            # Compute the target of the current Q values
            Q_target_values = rew_batch + (ARGS.gamma * next_Q_values)
            # Compute Bellman error
            bellman_error = Q_target_values - current_Q_values
            # clip the bellman error between [-1 , 1]
            clipped_bellman_error = bellman_error.clamp(-1, 1)
            # Note: clipped_bellman_error * -1 is the correct gradient
            d_error = clipped_bellman_error * -1.0
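            # Passing d_error as the grad_output of the backward() call below
            # is equivalent to minimising the Huber (smooth L1, delta=1) loss
            # between current_Q_values and Q_target_values.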
            # Clear previous gradients before backward pass
            optimizer.zero_grad()
            # run backward pass
            # current_Q_values.backward(d_error.data.unsqueeze(1))
            current_Q_values.backward(d_error.data)

            # Perform the update
            optimizer.step()
            num_param_updates += 1

            if num_param_updates % ARGS.dqn_updatefreq == 0:
                bar.set_description("update")
                Q_target.load_state_dict(Q.state_dict())
Example #3
                            if receive.edgeTotalConnectInfo[
                                    edge] < give.edgeTotalConnectInfo[edge]:
                                receive.edgeTotalConnectInfo[
                                    edge] = give.edgeTotalConnectInfo[edge]
                                j.add(infomation)
                        if feat == 2:
                            if receive.edgeCountInfo[
                                    edge] < give.edgeCountInfo[edge]:
                                receive.edgeCountInfo[
                                    edge] = give.edgeCountInfo[edge]
                                j.add(infomation)
                    for i in range(num_agent):
                        if i != give.num and i != receive.num:
                            receive.featureUpdate[i] = receive.featureUpdate[
                                i].union(j)
                    give.featureUpdate[receive.num].clear()
                elif give.num == receive.num:
                    give.featureUpdate[receive.num].clear()

model = DQN(nfeat=num_feature)
# model.load_state_dict(torch.load(lists))  #retrain
model_target = DQN(nfeat=num_feature)
model_target.load_state_dict(model.state_dict())
loss_fn = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.0002)
replay = namedtuple('replay',
                    ('nextnode', 'state', 'action', 'reward', 'next_state'))
class Replay_buffer():
    def __init__(self, buffer_size):
        self.buffer_size = buffer_size
        # object array so each slot can hold a `replay` namedtuple
        self.buffer = np.empty(buffer_size, dtype=object)
        self.index = 0
        self.cur_size = 0
    def push(self, experience):
        self.buffer[self.index] = experience
        self.index = (self.index + 1) % self.buffer_size
        if self.cur_size < self.buffer_size:
            self.cur_size += 1
    def sample(self, batch_size):
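        # Assumed completion (the original snippet is truncated here):
        # uniformly sample batch_size stored transitions from the filled
        # portion of the ring buffer.
        idx = np.random.choice(self.cur_size, batch_size, replace=False)
        return self.buffer[idx]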
Example #4
class QAgent:
    def __init__(self, epsilon_start, epsilon_end, epsilon_anneal, nb_actions,
                 learning_rate, gamma, batch_size, replay_memory_size,
                 hidden_size, model_input_size, use_PER, use_ICM):

        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.epsilon_start = epsilon_start
        self.epsilon_end = epsilon_end
        self.epsilon_anneal_over_steps = epsilon_anneal

        self.num_actions = nb_actions

        self.gamma = gamma

        self.batch_size = batch_size

        self.learning_rate = learning_rate

        self.step_no = 0

        self.policy = DQN(hidden_size=hidden_size,
                          inputs=model_input_size,
                          outputs=nb_actions).to(self.device)
        self.target = DQN(hidden_size=hidden_size,
                          inputs=model_input_size,
                          outputs=nb_actions).to(self.device)
        self.target.load_state_dict(self.policy.state_dict())
        self.target.eval()
        self.hidden_size = hidden_size
        self.optimizer = torch.optim.AdamW(self.policy.parameters(),
                                           lr=self.learning_rate)

        self.use_PER = use_PER
        if use_PER:
            self.replay = Prioritized_Replay_Memory(replay_memory_size)
        else:
            self.replay = Replay_Memory(replay_memory_size)

        self.loss_function = torch.nn.MSELoss()
        self.use_ICM = use_ICM
        if use_ICM:
            self.icm = ICM(model_input_size, nb_actions)

    # Get the current epsilon value according to the start/end and annealing values
    def get_epsilon(self):
        eps = self.epsilon_end
        if self.step_no < self.epsilon_anneal_over_steps:
            eps = self.epsilon_start - self.step_no * \
                ((self.epsilon_start - self.epsilon_end) /
                 self.epsilon_anneal_over_steps)
        return eps

    # select an action with epsilon greedy
    def select_action(self, state):
        self.step_no += 1
        if np.random.uniform() > self.get_epsilon():
            with torch.no_grad():
                return torch.argmax(self.policy(state)).view(1)
        else:
            return torch.tensor([random.randrange(self.num_actions)],
                                device=self.device,
                                dtype=torch.long)

    # update the model according to one step td targets
    def update_model(self):
        if self.use_PER:
            batch_index, batch, ImportanceSamplingWeights = self.replay.sample(
                self.batch_size)
        else:
            batch = self.replay.sample(self.batch_size)

        batch_tuple = Transition(*zip(*batch))

        state = torch.stack(batch_tuple.state)
        action = torch.stack(batch_tuple.action)
        reward = torch.stack(batch_tuple.reward)
        next_state = torch.stack(batch_tuple.next_state)
        done = torch.stack(batch_tuple.done)

        self.optimizer.zero_grad()
        if self.use_ICM:
            self.icm.optimizer.zero_grad()
            forward_loss = self.icm.get_forward_loss(state, action, next_state)
            inverse_loss = self.icm.get_inverse_loss(state, action, next_state)
            icm_loss = ((1 - self.icm.beta) * inverse_loss.mean() +
                        self.icm.beta * forward_loss.mean())

        td_estimates = self.policy(state).gather(1, action).squeeze()

        td_targets = reward + (1 - done.float()) * self.gamma * \
            self.target(next_state).max(1)[0].detach_()

        if self.use_PER:
            # correct the sampling bias from prioritised replay by weighting
            # the element-wise TD errors with the importance-sampling weights
            weights = torch.tensor(ImportanceSamplingWeights,
                                   device=self.device)
            elementwise_loss = torch.nn.functional.mse_loss(td_estimates,
                                                            td_targets,
                                                            reduction='none')
            loss = (weights * elementwise_loss).mean()

            errors = td_estimates - td_targets
            self.replay.batch_update(batch_index,
                                     errors.detach().cpu().numpy())
        else:
            loss = self.loss_function(td_estimates, td_targets)

        if self.use_ICM:
            loss = self.icm.lambda_weight * loss + icm_loss

        loss.backward()

        for param in self.policy.parameters():
            param.grad.data.clamp_(-1, 1)

        if self.use_ICM:
            self.icm.optimizer.step()

        self.optimizer.step()

        return loss.item()

    # set target net parameters to policy net parameters
    def update_target(self):
        self.target.load_state_dict(self.policy.state_dict())

    # save model
    def save(self, path, name):
        dirname = os.path.dirname(__file__)
        filename = os.path.join(dirname, os.path.join(path, name + ".pt"))
        torch.save(self.policy.state_dict(), filename)

    # load a model
    def load(self, path):
        dirname = os.path.dirname(__file__)
        filename = os.path.join(dirname, path)
        self.policy.load_state_dict(torch.load(filename))

    # store experience in replay memory
    def cache(self, state, action, reward, next_state, done):
        self.replay.push(state, action, reward, next_state, done)
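
For context, here is a minimal sketch of how an agent like this might be driven end to end. The environment name, episode count, learning-start condition and target-update interval are illustrative placeholders, and it assumes the DQN here is an MLP over flat observations (matching model_input_size).

import gym
import torch

env = gym.make('CartPole-v1')  # placeholder environment
agent = QAgent(epsilon_start=1.0, epsilon_end=0.05, epsilon_anneal=10000,
               nb_actions=env.action_space.n, learning_rate=1e-3, gamma=0.99,
               batch_size=64, replay_memory_size=50000, hidden_size=128,
               model_input_size=env.observation_space.shape[0],
               use_PER=False, use_ICM=False)

for episode in range(500):
    state = torch.tensor(env.reset(), dtype=torch.float, device=agent.device)
    done = False
    while not done:
        action = agent.select_action(state)
        obs, reward, done, _ = env.step(action.item())
        next_state = torch.tensor(obs, dtype=torch.float, device=agent.device)
        agent.cache(state, action,
                    torch.tensor(reward, dtype=torch.float, device=agent.device),
                    next_state, torch.tensor(done, device=agent.device))
        state = next_state
        # start learning once enough transitions have been cached
        if agent.step_no > agent.batch_size:
            agent.update_model()
    if episode % 10 == 0:
        agent.update_target()
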
Example #5
class DQNAgent(object):
    def __init__(self,
                 gamma,
                 epsilon,
                 lr,
                 n_actions,
                 input_dims,
                 mem_size,
                 batch_size,
                 eps_min=0.01,
                 eps_dec=0.9999,
                 replace=1000,
                 algo=None,
                 env_name=None,
                 chkpt_dir='tmp/dqn',
                 device='cuda:0'):
        self.gamma = gamma
        self.epsilon = epsilon
        self.lr = lr
        self.n_actions = n_actions
        self.input_dims = input_dims
        self.batch_size = batch_size
        self.eps_min = eps_min
        self.eps_dec = eps_dec
        self.replace_target_cnt = replace
        self.algo = algo
        self.env_name = env_name
        self.chkpt_dir = chkpt_dir
        self.action_space = [i for i in range(n_actions)]
        self.learn_step_counter = 0
        self.device = device

        self.memory = ReplayBuffer(mem_size, input_dims, n_actions)

        # Create policy and target DQN models
        self.policy = DQN(self.n_actions,
                          input_dims=self.input_dims,
                          name=self.env_name + '_' + 'policy',
                          chkpt_dir=self.chkpt_dir)
        self.target = DQN(self.n_actions,
                          input_dims=self.input_dims,
                          name=self.env_name + '_' + 'target',
                          chkpt_dir=self.chkpt_dir)

        # put on correct device (GPU or CPU)
        self.policy.to(device)
        self.target.to(device)

        # Optimizer
        self.optimizer = optim.Adam(self.policy.parameters(), lr=lr)
        # Loss
        self.loss = nn.MSELoss()

    def choose_action(self, observation):
        # Choose an action
        if np.random.random() > self.epsilon:
            state = torch.tensor([observation],
                                 dtype=torch.float).to(self.device)
            actions = self.policy.forward(state)
            action = torch.argmax(actions).item()
        else:
            action = np.random.choice(self.action_space)

        return action

    def store_transition(self, state, action, reward, state_, done):
        self.memory.store_transition(state, action, reward, state_, done)

    def sample_memory(self):
        state, action, reward, new_state, done = \
                                self.memory.sample_buffer(self.batch_size)

        states = torch.tensor(state).to(self.device)
        rewards = torch.tensor(reward).to(self.device)
        dones = torch.tensor(done).to(self.device)
        actions = torch.tensor(action).to(self.device)
        states_ = torch.tensor(new_state).to(self.device)

        return states, actions, rewards, states_, dones

    def replace_target_network(self):
        if self.learn_step_counter % self.replace_target_cnt == 0:
            self.target.load_state_dict(self.policy.state_dict())

    def decrement_epsilon(self):
        if self.epsilon > self.eps_min:
            self.epsilon *= self.eps_dec

    def save_models(self):
        self.policy.save_checkpoint()

    def load_models(self):
        self.policy.load_checkpoint()

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return

        self.optimizer.zero_grad()

        self.replace_target_network()

        states, actions, rewards, states_, dones = self.sample_memory()
        indices = np.arange(self.batch_size)

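        # select Q(s, a) for each transition by indexing the batch of
        # Q-value rows with the sampled actions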
        q_pred = self.policy.forward(states)[indices, actions]
        q_next = self.target.forward(states_).max(dim=1)[0]

        q_next[dones] = 0.0
        q_target = rewards + self.gamma * q_next

        loss = self.loss(q_target, q_pred).to(self.device)
        loss.backward()
        self.optimizer.step()
        self.learn_step_counter += 1

        self.decrement_epsilon()
Example #6
                            if receive.edgeCountInfo[
                                    edge] < give.edgeCountInfo[edge]:
                                receive.edgeCountInfo[
                                    edge] = give.edgeCountInfo[edge]
                                j.add(infomation)
                    for i in range(num_agent):
                        if i != give.num and i != receive.num:
                            receive.featureUpdate[i] = receive.featureUpdate[
                                i].union(j)
                    give.featureUpdate[receive.num].clear()
                elif give.num == receive.num:
                    give.featureUpdate[receive.num].clear()


model = DQN(nfeat=num_feature)
model.load_state_dict(torch.load(lists))


def pick_edge(ag):
    X = feature_matrix(ag)
    output = model(torch.from_numpy(X))
    outputnum = -1
    outputmax = -math.inf
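    # greedily choose, among the nodes connected to the agent's target node,
    # the one with the highest model output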
    for i in range(num_node):
        if output[i] >= outputmax and i in node_ALL[
                ag.togonode].connected_node:
            outputmax = output[i]
            outputnum = i
    return outputnum

Example #7
def main(test=False, checkpoint=None, device='cuda'):
    if not test:
        wandb.init(project='dqn-breakout', name='test3')
    memory_size = 100000
    min_rb_size = 20000
    sample_size = 100
    lr = 0.0001

    eps_min = 0.05
    eps_decay = 0.99995
    env_steps_before_train = 10
    tgt_model_update = 5000

    env = gym.make('Breakout-v0')
    env = FrameStackingAndResizingEnv(env, 84, 84, 4)
    last_observation = env.reset()

    model = DQN(env.observation_space.shape, env.action_space.n, lr=lr).to(device)
    if checkpoint is not None:
        model.load_state_dict(torch.load(checkpoint))
    target = DQN(env.observation_space.shape, env.action_space.n).to(device)
    update_target_model(model, target)
    replay = ReplayBuffer(memory_size)
    steps_since_train = 0
    epochs_since_tgt = 0
    step_num = -1 * min_rb_size
    episode_rewards = []
    rolling_reward = 0

    tq = tqdm()
    while True:
        if test:
            env.render()
            time.sleep(0.05)
        tq.update(1)
        eps = max(eps_min, eps_decay ** (step_num))
        if test:
            eps = 0

        if random() < eps:
            action = env.action_space.sample()
        else:
            x = torch.Tensor(last_observation).unsqueeze(0).to(device)
            action = model(x).max(-1)[-1].item()

        observation, reward, done, info = env.step(action)
        rolling_reward += reward
        reward = reward * 0.1
        replay.insert(Sarsd(last_observation, action, reward, observation, done))
        last_observation = observation
        if done:
            episode_rewards.append(rolling_reward)
            if test:
                print(rolling_reward)
            rolling_reward = 0
            last_observation = env.reset()
        steps_since_train += 1
        step_num += 1
        if (not test) and (replay.idx > min_rb_size) and (steps_since_train > env_steps_before_train):
            loss = train_step(model, replay.sample(sample_size), target, env.action_space.n, device)
            wandb.log(
                {
                    "loss": loss.detach().cpu().item(),
                    "eps": eps,
                    "avg_reward": np.mean(episode_rewards)
                }
            )
            episode_rewards = []
            epochs_since_tgt += 1
            if epochs_since_tgt > tgt_model_update:
                print('updating target model')
                update_target_model(model, target)
                epochs_since_tgt = 0
                torch.save(target.state_dict(), f'target.model')
            steps_since_train = 0
    env.close()
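
The helpers update_target_model and train_step used above (and again in Example #10) are not shown in these snippets. The sketch below is one plausible implementation, assuming replay.sample returns a list of Sarsd tuples with state, action, reward, next_state and done fields, and assuming the DQN module built with lr=lr keeps its optimizer under a hypothetical opt attribute; the original helpers may differ.

import numpy as np
import torch
import torch.nn.functional as F


def update_target_model(model, target):
    # hard copy of the online network weights into the target network
    target.load_state_dict(model.state_dict())


def train_step(model, transitions, target, num_actions, device, gamma=0.99):
    # one-step TD update on a batch of Sarsd transitions (gamma is assumed)
    states = torch.Tensor(np.stack([t.state for t in transitions])).to(device)
    actions = torch.LongTensor([t.action for t in transitions]).to(device)
    rewards = torch.Tensor([t.reward for t in transitions]).to(device)
    next_states = torch.Tensor(
        np.stack([t.next_state for t in transitions])).to(device)
    mask = torch.Tensor([0.0 if t.done else 1.0
                         for t in transitions]).to(device)

    # bootstrapped targets from the frozen target network
    with torch.no_grad():
        next_q = target(next_states).max(-1)[0]
    targets = rewards + gamma * mask * next_q

    # Q-values of the actions that were actually taken
    qvals = model(states)
    one_hot = F.one_hot(actions, num_actions).float()
    action_qvals = (qvals * one_hot).sum(-1)

    loss = F.smooth_l1_loss(action_qvals, targets)
    model.opt.zero_grad()  # hypothetical optimizer attribute on the DQN module
    loss.backward()
    model.opt.step()
    return loss
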
Example #8
class Agent:
	def __init__(self):
		self.controller, self.target = DQN(), DQN() # For RL 
		self.vision = VAE()

		if USE_CUDA:
			self.controller.cuda()
			self.target.cuda()
			self.vision.cuda()

		# Init weights based on init function
		self.controller.apply(init_weights)
		self.vision.apply(init_weights)
		# Load model params into target
		self.target.load_state_dict(self.controller.state_dict())
		self.action_number = 0 # actions taken (to determine whether or not to update)
	
		# NOTE: DQN exp buffer should use embeddings generated by vision module
		# The vision module (aka the VAE) has memory consisting of game states
		self.exp_buffer = [] # exp buffer
		self.exp_number = 0 # size of exp buffer so far

		self.opt = torch.optim.Adam(self.controller.parameters(),lr=DQN_LEARNING_RATE)
		self.loss = nn.SmoothL1Loss()

	# Make an action given a state
	def act(self, state, explore=True):
		self.action_number += 1
		# Update target
		if self.action_number % TARGET_INTERVAL == 0:
			self.target.load_state_dict(self.controller.state_dict())

		if explore and np.random.rand() <= EPSILON:
			# Act randomly
			a = np.random.randint(NUM_ACTIONS)
			return a
		
		# Send state to model
		a_vec = self.controller(self.vision.encode(state))
		a = int(torch.argmax(torch.squeeze(a_vec)))
		return a

	def load_params(self):
		# Looks in current directory for params for model and for VAE		 
		if LOAD_CHECKPOINT_VAE:
			try:
				self.vision.load_state_dict(torch.load("VAEparams.pt"))
				print("Loaded checkpoint for VAE")
			except:
				print("Could not load VAE checkpoint")
		if LOAD_CHECKPOINT_DQN:
			try:
				self.controller.load_state_dict(torch.load("DQNparams.pt"))
				self.target.load_state_dict(torch.load("DQNparams.pt"))
				print("Loaded checkpoint for DQN")
			except:
				print("Could not load DQN checkpoint")

	def save_params(self):
		torch.save(self.controller.state_dict(), "DQNparams.pt")
		torch.save(self.vision.state_dict(), "VAEparams.pt")

	# clear the buffer
	def clear_exp_buffer(self):
		self.exp_buffer = []
		self.exp_number = 0
		self.vision.memory = []
		self.vision.memory_num = 0

	# Add experience to exp buffer
	def add_exp(self, exp):
		self.vision.remember(exp[0])

		if self.exp_number >= EXP_BUFFER_MAX:
			del self.exp_buffer[0]
		else:
			self.exp_number += 1

		exp[0] = self.vision.encode(exp[0])
		exp[3] = self.vision.encode(exp[3])

		self.exp_buffer.append(exp)

	# Replay gets batch and trains on it
	# Returns [vision loss, controller loss]
	def replay(self, batch_size):
		v_loss, q_loss = 0,0 # Init to 0 in case we need to return without any training

		# Train vision component first
		if self.action_number % VAE_UPDATE_INTERVAL == 0:
			v_loss = self.vision.replay()
	
		# If the experience buffer isn't the right size yet, don't do anything
		if self.exp_number < EXP_BUFFER_MIN or self.action_number % TRAINING_INTERVAL != 0:
			return [v_loss, q_loss]

		# Get batch from experience_buffer
		batch = random.sample(self.exp_buffer, batch_size)
		
		s,a,r,s_new,_ = zip(*batch)
		s_new = s_new[:-1] # Remove last

		# First turn batch into something we can run through model
		s = torch.cat(s)
		a = torch.LongTensor(a).unsqueeze(1)
		r = torch.FloatTensor(r)
		s_new = torch.cat(s_new)
		
		if USE_CUDA:
			a = a.cuda()
			r = r.cuda()

		# Get q vals for s (what model outputted) from a
		# .gather gets us q value for specific action a
		pred_q_vals = self.controller(s).gather(1, a).squeeze()

		# Having chosen a in s,
		# What is the highest possible reward we can get from s_new?
		# We add q of performing a in s then add best q from next state
		# cat 0 to end for the terminal state
		s_new_q_vals = self.target(s_new).max(1)[0]

		zero = torch.zeros(1)
		if USE_CUDA: zero = zero.cuda()
		s_new_q_vals = torch.cat((s_new_q_vals, zero))
		exp_q_vals = r + s_new_q_vals*GAMMA
		
		myloss = self.loss(pred_q_vals, exp_q_vals)
		self.opt.zero_grad()
		myloss.backward()
		

		if WEIGHT_CLIPPING:
			for param in self.controller.parameters():
				param.grad.data.clamp_(-1,1) # Weight clipping avoids exploding gradients

		self.opt.step()
		
		global EPSILON
		if EPSILON > EPSILON_MIN:
			EPSILON *= EPSILON_DECAY 
	
		return [v_loss, myloss.item()]
Example #9
        args.eps_start = 0.0
        args.eps_end = 0.0
        args.eps_steps = 1

    policy = EpsGreedyPolicy(args.eps_start, args.eps_end, args.eps_steps)

    opt_step = 0

    # pre-training
    if not args.no_train:
        print('Pre-training')
        for i in range(1000):
            opt_step += 1
            optimize_dqfd(args.bsz, 1.0, opt_step)
            if i % TARGET_UPDATE == 0:
                target_net.load_state_dict(policy_net.state_dict())
        print('Pre-training done')
    else:
        args.demo_prop = 0

    env = MyEnv()
    env.reset()

    # training loop
    ep_counter = count(1) if args.num_eps < 0 else range(args.num_eps)
    for i_episode in ep_counter:
        state = env.reset()
        total_reward = 0
        transitions = []
        q_vals = policy_net(state)
        for step_n in count():
Example #10
def main(test=False,
         checkpoint=None,
         device='cuda',
         project_name='dqn',
         run_name='example'):
    if not test:
        wandb.init(project=project_name, name=run_name)

    ## HYPERPARAMETERS
    memory_size = 500000
    min_rb_size = 50000
    sample_size = 64
    lr = 0.0001
    boltzmann_exploration = False
    eps_min = 0.05
    eps_decay = 0.999995
    train_interval = 4
    update_interval = 10000
    test_interval = 5000
    episode_reward = 0
    episode_rewards = []
    screen_flicker_probability = 0.5

    # additional hparams
    living_reward = -0.01
    same_frame_ctr = 0
    same_frame_limit = 200

    # replay buffer
    replay = ReplayBuffer(memory_size)
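    # step_num starts negative so that eps = eps_decay ** step_num stays >= 1
    # (pure random actions) until the replay buffer has collected min_rb_size
    # transitions, after which epsilon starts decaying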
    step_num = -1 * min_rb_size

    # environment creation
    env = gym.make('BreakoutDeterministic-v4')
    env = BreakoutFrameStackingEnv(env, 84, 84, 4)
    test_env = gym.make('BreakoutDeterministic-v4')
    test_env = BreakoutFrameStackingEnv(test_env, 84, 84, 4)
    last_observation = env.reset()

    # model creation
    model = DQN(env.observation_space.shape, env.action_space.n,
                lr=lr).to(device)
    if checkpoint is not None:
        model.load_state_dict(torch.load(checkpoint))
    target = DQN(env.observation_space.shape, env.action_space.n).to(device)
    update_target_model(model, target)

    # training loop
    tq = tqdm()
    while True:
        if test:
            env.render()
            time.sleep(0.05)
        tq.update(1)
        eps = max(eps_min, eps_decay**(step_num))
        if test:
            eps = 0
        if boltzmann_exploration:
            x = torch.Tensor(last_observation).unsqueeze(0).to(device)
            logits = model(x)
            action = torch.distributions.Categorical(
                logits=logits[0]).sample().item()
        else:
            # epsilon-greedy
            if random() < eps:
                action = env.action_space.sample()
            else:
                x = torch.Tensor(last_observation).unsqueeze(0).to(device)
                qvals = model(x)
                action = qvals.max(-1)[-1].item()

        # screen flickering
        # if random() < screen_flicker_probability:
        #    last_observation = np.zeros_like(last_observation)

        # observe and obtain reward
        observation, reward, done, info = env.step(action)
        episode_reward += reward

        # add to replay buffer
        replay.insert(
            Sarsd(last_observation, action, reward, observation, done))
        last_observation = observation

        # episode end logic
        if done:
            episode_rewards.append(episode_reward)
            if len(episode_rewards) > 100:
                del episode_rewards[0]
            if not test:
                # only log when a wandb run has been initialised
                wandb.log({
                    "reward_ep": episode_reward,
                    "avg_reward_100ep": np.mean(episode_rewards)
                })
            episode_reward = 0
            last_observation = env.reset()
        step_num += 1

        # testing, model updating and checkpointing
        if (not test) and (replay.idx > min_rb_size):
            if step_num % train_interval == 0:
                loss = train_step(model, replay.sample(sample_size), target,
                                  env.action_space.n, device)
                wandb.log({
                    "loss": loss.detach().cpu().item(),
                    "step": step_num
                })
                if not boltzmann_exploration:
                    wandb.log({"eps": eps})
            if step_num % update_interval == 0:
                print('updating target model')
                update_target_model(model, target)
                torch.save(target.state_dict(), f'target.model')
                model_artifact = wandb.Artifact("model_checkpoint",
                                                type="raw_data")
                model_artifact.add_file('target.model')
                wandb.log_artifact(model_artifact)
            if step_num % test_interval == 0:
                print('running test')
                avg_reward, best_reward, frames = policy_evaluation(
                    model, test_env, device)  # model or target?
                wandb.log({
                    'test_avg_reward':
                    avg_reward,
                    'test_best_reward':
                    best_reward,
                    'test_best_video':
                    wandb.Video(frames.transpose(0, 3, 1, 2),
                                str(best_reward),
                                fps=24)
                })
    env.close()
Example #11
def val_dist():
    file = open(thefile, 'r', encoding='UTF-8')
    line = file.readlines()

    num_node = int(line[0])
    num_edge = int(line[1])
    num_agent = int(line[num_node + num_edge + 2])
    constraint = int(line[num_node + num_edge + num_agent + 3])
    maxspeed = 0
    Cost = 0
    # lists = "Model\_3f_dist_no"
    # lists = "Model\_3f_dist_1"
    lists = "Model\_3f_dist_2"

    class Node:
        def __init__(self, pos, number):
            self.pos = pos
            self.number = number
            self.connected_node = []
            self.in_commu_range = []  # nodes within communication range (constraint)
            self.all_ag_here = []  # agents currently on this node

    class Edge:
        def __init__(self, distance, number):
            self.ox = 'x'
            self.distance = distance
            self.number = number
            self.count = 0

    class Agent:
        def __init__(self, cur, speed, number):
            self.currnode_ori = cur
            self.currnode = cur
            self.togonode = cur
            self.lastedge = 0
            self.togoedge = 0
            self.curedge_length = 0
            self.step = 0
            self.speed = speed
            self.cost = 0
            self.num = number
            self.historyaction = []
            self.reward = 0
            self.start = cur
            self.edgeLengthInfo = []
            self.alreadyVisitInfo = []
            self.edgeTotalConnectMap = [[0] * num_edge
                                        for i in range(num_edge)]
            self.edgeTotalConnectInfo = []
            self.totalAgentMap = [[0] * 2 for i in range(num_edge)]
            self.totalAgentInfo = []
            self.edgeCountInfo = []
            for i in range(num_edge):
                self.edgeLengthInfo.append(0)
                self.alreadyVisitInfo.append(0)
                self.edgeTotalConnectInfo.append(0)
                self.totalAgentInfo.append(0)
                self.edgeCountInfo.append(0)
            self.featureUpdate = []
            for i in range(num_agent):
                j = set()
                self.featureUpdate.append(j)

    node_ALL = []
    edge_ALL = {}
    agent_ALL = []

    for i in range(num_node):
        k = i + 2
        line[k] = line[k].split()
        for j in range(len(line[k])):
            line[k][j] = int(line[k][j])
        l = Node((line[k][1], line[k][2]), line[k][0])
        node_ALL.append(l)

    for i in range(num_edge):
        k = num_node + i + 2
        line[k] = line[k].split()
        for j in range(len(line[k])):
            line[k][j] = int(line[k][j])
        l = Edge(line[k][2], i)
        line[k].pop()
        edge_ALL[tuple(line[k])] = l
        start = line[k][0]
        end = line[k][1]
        node_ALL[start].connected_node.append(end)
        node_ALL[end].connected_node.append(start)

    for i in range(num_agent):
        k = num_node + num_edge + i + 3
        line[k] = line[k].split()
        for j in range(len(line[k])):
            line[k][j] = int(line[k][j])
        l = Agent(int(line[k][1]), int(line[k][2]), int(line[k][0]))
        agent_ALL.append(l)
        if (maxspeed < int(line[k][2])): maxspeed = int(line[k][2])
        node_ALL[l.currnode].all_ag_here.append(i)

    # compute which nodes are within communication range (constraint)
    def cal_dis(a, b):
        return np.sqrt(
            np.square(abs(a.pos[0] - b.pos[0])) +
            np.square(abs(a.pos[1] - b.pos[1])))

    for i in range(num_node):
        for j in range(num_node):
            if (cal_dis(node_ALL[i], node_ALL[j]) <= constraint):
                node_ALL[i].in_commu_range.append(j)

    def find_edge(a, b):
        if tuple([a, b]) in edge_ALL: return tuple([a, b])
        else: return tuple([b, a])

    # feature matrix (todo)
    num_feature = 3

    def feature_matrix(ag):
        X = np.zeros((num_node, num_feature))
        for k in node_ALL[ag.currnode].connected_node:
            ed = edge_ALL[find_edge(ag.currnode, k)].number
            # distance (edge length)
            if ag.edgeLengthInfo[ed] != 0:
                X[k][0] = ag.edgeLengthInfo[ed]
            # how many other edges this edge connects to
            X[k][1] = ag.edgeTotalConnectInfo[ed]
            # how many times this edge has been traversed
            X[k][2] = ag.edgeCountInfo[ed]
        X = np.around((X), decimals=3)
        return X

    def update_info():
        for u in range(num_agent):
            for give in agent_ALL:
                for receive in agent_ALL:
                    if receive.currnode in node_ALL[
                            give.
                            currnode].in_commu_range and give.num != receive.num:
                        j = set()
                        for infomation in set(give.featureUpdate[receive.num]):
                            feat, edge = infomation
                            if feat == 0:
                                if receive.edgeLengthInfo[edge] == 0:
                                    receive.edgeLengthInfo[
                                        edge] = give.edgeLengthInfo[edge]
                                    j.add(infomation)
                            if feat == 1:
                                if receive.edgeTotalConnectInfo[
                                        edge] < give.edgeTotalConnectInfo[edge]:
                                    receive.edgeTotalConnectInfo[
                                        edge] = give.edgeTotalConnectInfo[edge]
                                    j.add(infomation)
                            if feat == 2:
                                if receive.edgeCountInfo[
                                        edge] < give.edgeCountInfo[edge]:
                                    receive.edgeCountInfo[
                                        edge] = give.edgeCountInfo[edge]
                                    j.add(infomation)
                        for i in range(num_agent):
                            if i != give.num and i != receive.num:
                                receive.featureUpdate[
                                    i] = receive.featureUpdate[i].union(j)
                        give.featureUpdate[receive.num].clear()
                    elif give.num == receive.num:
                        give.featureUpdate[receive.num].clear()

    model = DQN(nfeat=num_feature)
    model.load_state_dict(torch.load(lists))

    def pick_edge(ag):
        X = feature_matrix(ag)
        output = model(torch.from_numpy(X))
        outputnum = -1
        outputmax = -math.inf
        for i in range(num_node):
            if output[i] >= outputmax and i in node_ALL[
                    ag.togonode].connected_node:
                outputmax = output[i]
                outputnum = i
        return outputnum

    def walking(ag):
        if ag.currnode_ori != ag.togonode:
            edge_ALL[find_edge(ag.currnode_ori, ag.togonode)].ox = 'o'
            ag.edgeLengthInfo[edge_ALL[ag.togoedge].number] = ag.curedge_length
            ag.alreadyVisitInfo[edge_ALL[ag.togoedge].number] = 1
            for i in range(num_agent):
                ag.featureUpdate[i].add(
                    tuple([0, edge_ALL[ag.togoedge].number]))
        ag.currnode = ag.togonode
        ag.currnode_ori = ag.togonode
        ag.lastedge = ag.togoedge
        ag.historyaction.append(ag.togonode)
        ag.step = ag.step - ag.curedge_length
        ag.togonode = pick_edge(ag)
        togo_edge = find_edge(ag.currnode, ag.togonode)
        ag.curedge_length = edge_ALL[togo_edge].distance
        ag.togoedge = togo_edge
        if ag.lastedge != ag.togoedge and ag.lastedge != 0:
            head = edge_ALL[ag.lastedge].number
            tail = edge_ALL[ag.togoedge].number
            ag.edgeTotalConnectMap[head][tail] = 1
            ag.edgeTotalConnectMap[tail][head] = 1
            ag.edgeTotalConnectInfo[head] = sum(ag.edgeTotalConnectMap[head])
            ag.edgeTotalConnectInfo[tail] = sum(ag.edgeTotalConnectMap[tail])
            for i in range(num_agent):
                ag.featureUpdate[i].add(tuple([1, head]))
                ag.featureUpdate[i].add(tuple([1, tail]))
        edge_ALL[ag.togoedge].count += 1
        ag.edgeCountInfo[edge_ALL[ag.togoedge].number] = edge_ALL[
            ag.togoedge].count
        for i in range(num_agent):
            ag.featureUpdate[i].add(tuple([2, edge_ALL[ag.togoedge].number]))

    k = 100000
    while not all(edge_ALL[r].ox == 'o' for r in edge_ALL):
        for ag in agent_ALL:
            ag.step += ag.speed
            ag.cost += ag.speed
            while ag.curedge_length <= ag.step:
                update_info()
                node_ALL[ag.currnode].all_ag_here.remove(ag.num)
                walking(ag)
                node_ALL[ag.currnode].all_ag_here.append(ag.num)
            if ag.step > ag.curedge_length / 2:
                node_ALL[ag.currnode].all_ag_here.remove(ag.num)
                ag.currnode = ag.togonode
                node_ALL[ag.currnode].all_ag_here.append(ag.num)
                update_info()
        Cost += maxspeed
        if Cost > k:
            print(Cost)
            k += 100000

    # Write all action to file
    fileforHistoryaction = "Animation/RL_dist" + str(num_node) + ".txt"
    f = open(fileforHistoryaction, "w")
    print(num_node, file=f)
    for i in agent_ALL:
        print(i.historyaction, file=f)

    print("Model_Dist        = ", Cost)
    thecost[4] += Cost
Example #12
class FixedDQNAgent(DQNAgent):
    """
    DQN Agent with a target network to compute Q-targets.
    Extends DQNAgent.
    """
    def __init__(self,
                 input_dim,
                 output_dim,
                 lr,
                 gamma,
                 max_memory_size,
                 batch_size,
                 eps_start,
                 eps_end,
                 eps_decay,
                 device,
                 target_update=100,
                 linear1_units=64,
                 linear2_units=64,
                 decay_type="linear"):

        super().__init__(input_dim, output_dim, lr, gamma, max_memory_size,
                         batch_size, eps_start, eps_end, eps_decay, device,
                         linear1_units, linear2_units, decay_type)

        self.model_name = "FixedDQN"

        self.target_update_freq = target_update
        # networks
        self.output_dim = output_dim
        self.target_net = DQN(input_dim, output_dim, linear1_units,
                              linear2_units).to(device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()

        self.updated = 0

    def learn(self):
        """
        Update the weights of the network, using
        target_net to compute Q-targets. Every self.target_update_freq
        updates, clone the policy_net.
        :return: the loss
        """
        states, next_states, actions, rewards, dones = self.memory.sample(
            self.batch_size)

        curr_q_vals = self.policy_net(states).gather(1, actions)
        next_q_vals = self.target_net(next_states).max(
            1, keepdim=True)[0].detach()
        target = (rewards + self.gamma * next_q_vals * (1 - dones)).to(
            self.device)
        loss = F.smooth_l1_loss(curr_q_vals, target)
        self.optim.zero_grad()
        loss.backward()

        self.optim.step()

        self.updated += 1

        if self.updated % self.target_update_freq == 0:
            self.target_hard_update()

        return loss.item()

    def target_hard_update(self):
        """ Clone the policy net weights into the target net """
        self.target_net.load_state_dict(self.policy_net.state_dict())
Example #13
        # Compute current Q values; q_func takes only the state and outputs a value for every state-action pair
        # We choose Q based on action taken.
        current_Q_values = Q(obs_batch).gather(1,
                                               act_batch.unsqueeze(1)).view(-1)
        # Compute next Q value based on which action gives max Q values
        # Detach variable from the current graph since we don't want gradients for the next Q values to propagate
        next_max_q = target_Q(next_obs_batch).detach().max(1)[0]
        next_Q_values = not_done_mask * next_max_q
        # Compute the target of the current Q values
        target_Q_values = rew_batch + (GAMMA * next_Q_values)
        # Compute Bellman error
        bellman_error = target_Q_values - current_Q_values
        # clip the bellman error between [-1 , 1]
        clipped_bellman_error = bellman_error.clamp(-1, 1)
        # Note: clipped_bellman_error * -1 is the correct gradient
        d_error = clipped_bellman_error * -1.0
        # Clear previous gradients before backward pass
        optimizer.zero_grad()
        # run backward pass
        current_Q_values.backward(d_error.data)

        # Perform the update
        optimizer.step()
        num_param_updates += 1

        # Periodically update the target network by Q network to target Q network
        if num_param_updates % TARGERT_UPDATE_FREQ == 0:
            target_Q.load_state_dict(Q.state_dict())

print(np.mean(episodes_rewards))