Example #1
 def __init__(self, env, conf):
     self.env = env
     self.conf = conf
     self.set_cuda()
     
     self.agent = DDPG(conf, self.device)
     self.memory = ReplayMemory(conf)
Example #2
	def __init__(self):

		self.BATCH_SIZE = 128
		self.GAMMA = 0.99
		self.EPS_START = 1.0
		self.EPS_END = 0.05
		self.EPS_DECAY = 0.000005
		self.TARGET_UPDATE = 5

		self.pretrain_length = self.BATCH_SIZE
		#self.state_size = [55,3]
		
		self.action_size = 3
		self.hot_actions = np.array(np.identity(self.action_size).tolist())
		#self.action_size = len(self.hot_actions)
		self.learning_rate = 0.0005
		#self.total_episodes = 12
		self.max_steps = 1000

		self.env = Environment()

		self.memory_maxsize = 10000

		self.DQNetwork = DQNetwork(learning_rate = self.learning_rate,name = 'DQNetwork')

		self.TargetNetwork = DQNetwork(learning_rate = self.learning_rate , name = 'TargetNetwork')

		self.memory = ReplayMemory(max_size=self.memory_maxsize)

		self.saver = tf.train.Saver()
Example #3
    def __init__(self, conf, device):
        self.conf = conf
        self.state_dim = conf['state_dim']
        self.action_dim = conf['action_dim']
        self.device = device

        self.q = DQNNetwork(self.state_dim, self.action_dim).to(self.device)
        self.q_target = DQNNetwork(self.state_dim,
                                   self.action_dim).to(self.device)
        self.q_target.load_state_dict(self.q.state_dict())
        self.q_target.eval()

        self.memory = ReplayMemory(self.conf)

        self.optimizer = optim.Adam(self.q.parameters(), lr=lr_dqn)

        self.loss = HuberLoss()
        self.loss = self.loss.to(self.device)
        self.currIteration = 0
Example #4
File: car.py Project: Tzeusy/RL_car
def create_player(load_weights=True, user_model=False):
    env = create_env()
    env.reset()

    # Get screen size so that we can initialize layers correctly based on shape
    # returned from AI gym. Typical dimensions at this point are close to 3x40x90
    # which is the result of a clamped and down-scaled render buffer in get_screen()
    init_screen = get_screen(env)
    _, n_channels, screen_height, screen_width = init_screen.shape  # 3, 40, 60

    if user_model:
        policy_net = DQNUser(screen_height, screen_width, n_actions,
                             KERNEL_SIZE, N_LAYERS).to(device)
        policy_net.eval()
        target_net = DQNUser(screen_height, screen_width, n_actions,
                             KERNEL_SIZE, N_LAYERS).to(device)
        target_net.eval()
    else:
        policy_net = DQN(screen_height, screen_width, n_actions).to(device)
        policy_net.eval()
        target_net = DQN(screen_height, screen_width, n_actions).to(device)
        target_net.eval()

    if load_weights:
        model_dir = "models"
        model_file_name = "mean100_659.pth"
        policy_net.load_state_dict(
            torch.load(f"{model_dir}/{model_file_name}", map_location='cpu'))
        target_net.load_state_dict(policy_net.state_dict())

    optimizer = optim.Adam(policy_net.parameters(),
                           lr=LEARNING_RATE,
                           weight_decay=1e-6)
    scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9999999)
    memory = ReplayMemory(REPLAY_MEM)
    fake_memory = ReplayMemory(REPLAY_MEM)

    player = Player(env, policy_net, target_net, optimizer, scheduler, memory,
                    fake_memory)
    return player
Example #5
    def __init__(
        self,
        length,
        box_dimensions,
        device,
        BATCH_SIZE=128,
        GAMMA=0.999,
        EPS_START=0.9,
        EPS_END=0.05,
        EPS_DECAY=200,
        TARGET_UPDATE=10,
    ):
        self.length = length
        self.box_dimensions = box_dimensions
        self.width = box_dimensions[0]
        self.height = box_dimensions[1]
        self.epsilon = 0.01
        self.orientation = 'LEFT'
        self.device = device

        self.policy_net = DQN(self.height, self.width,
                              len(self.actions)).to(device)
        self.target_net = DQN(self.height, self.width,
                              len(self.actions)).to(device)
        self.optimizer = optim.RMSprop(self.policy_net.parameters())
        self.memory = ReplayMemory(10000)
        self.steps_done = 0
        self.cumulative_reward = 0.0
        self.episode = 1
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()

        self.BATCH_SIZE = BATCH_SIZE
        self.GAMMA = GAMMA
        self.EPS_START = EPS_START
        self.EPS_END = EPS_END
        self.EPS_DECAY = EPS_DECAY
        self.TARGET_UPDATE = TARGET_UPDATE
        self._reset(setup=True)
Example #6
File: agent.py Project: wuyuup/dqn
    def train(self):
        # feature dimension
        state_dim = self.env.observation_space.shape[0]
        # number of actions
        n_actions = self.env.action_space.n

        self.q_net = DQN(state_dim, n_actions).to(self.args.device)
        self.target_net = DQN(state_dim, n_actions).to(self.args.device)
        self.target_net.load_state_dict(self.q_net.state_dict())
        self.target_net.train()
        self.q_net.train()
        # self.optimizer = optim.RMSprop(self.q_net.parameters())
        self.optimizer = optim.Adam(self.q_net.parameters(), self.args.LR)
        self.memory = ReplayMemory(self.args.memory_cap)
        self.steps_done = 0
        self.eps = self.args.EPS_START
        self.episode_durations = []

        for i_episode in range(self.args.num_episodes):
            self.state = self.env.reset()
            self.state = torch.Tensor(self.state).to(
                self.args.device).unsqueeze(0)
            for t in count():
                done = self.__step(t)
                if done or t >= self.args.END_EPSISODE:
                    self.episode_durations.append(t + 1)
                    if len(self.episode_durations) > 100:
                        ave = np.mean(self.episode_durations[-100:])
                    else:
                        ave = np.mean(self.episode_durations)
                    if i_episode % 10 == 0:
                        print("[Episode {:>5}]  steps: {:>5}   ave: {:>5}".
                              format(i_episode, t, ave))
                    break

        plt.figure()
        plt.clf()
        durations_t = torch.tensor(self.episode_durations, dtype=torch.float)
        means = durations_t.unfold(0, 100, 1).mean(1).view(-1)
        plt.plot(means.numpy())
        plt.title('training')
        plt.xlabel('episode')
        plt.ylabel('duration')
        plt.savefig('res.png')
Example #7
File: main.py Project: remit0/santa
def main():
    env = Environment()

    gamma = 0.99
    period = 100
    learning_rate = 1e-6
    min_memory_size = 1000
    max_memory_size = 10000
    batch_size = 32
    num_episodes = 100
    layers = [env.observation_space.n, 444, 222, env.action_space.n]

    memory = ReplayMemory(max_memory_size)
    dqn = DQN(layers, learning_rate)
    init_memory(env, memory, min_memory_size)

    iters = 0
    for n_ep in range(num_episodes):
        eps = compute_eps(n_ep, 10)
        reward, iters = train_one_episode(env, dqn, memory, gamma, batch_size,
                                          eps, period, iters)
        print(n_ep, reward)

    make_submission(env, dqn)
Example #8
TARGET_UPDATE = 10

init_screen = get_screen()
_, _, screen_height, screen_width = init_screen.shape

n_actions = env.action_space.n

policyNet = DQNAgent(screen_height, screen_width, n_actions).to(device)
targetNet = DQNAgent(screen_height, screen_width, n_actions).to(device)

# Use the parameters of policyNet to initialize targetNet
targetNet.load_state_dict(policyNet.state_dict())
targetNet.eval()

optimizer = optim.RMSprop(policyNet.parameters())
memory = ReplayMemory(10000)

steps_done = 0


def select_action(state):
    global steps_done
    sample = random.random()
    eps_threshold = EPS_END + (EPS_START - EPS_END) * \
        math.exp(-1. * steps_done / EPS_DECAY)
    steps_done += 1
    #print('select_action received state of dim: ', state.size())
    if sample > eps_threshold:
        # Not updating, using the policyNet to simply predict the next action to take
        with torch.no_grad():
            # t.max(1) will return largest column value of each row.
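The snippet above is cut off inside the torch.no_grad() block. Based on the same epsilon-greedy pattern in Example 10 (and the PyTorch DQN tutorial this code follows), a self-contained sketch of how such a function typically finishes is shown below; the helper name and the EPS_* defaults are assumptions, not the project's code.

import math
import random

import torch


# Standalone sketch of the epsilon-greedy choice used above; policyNet, n_actions,
# device and the EPS_* defaults are assumptions, not values taken from the project.
def select_action_sketch(state, policyNet, n_actions, device, steps_done,
                         EPS_START=0.9, EPS_END=0.05, EPS_DECAY=200):
    sample = random.random()
    eps_threshold = EPS_END + (EPS_START - EPS_END) * math.exp(-1.0 * steps_done / EPS_DECAY)
    if sample > eps_threshold:
        with torch.no_grad():
            # Greedy branch: index of the largest predicted Q-value.
            return policyNet(state).max(1)[1].view(1, 1)
    # Exploration branch: uniformly random action index.
    return torch.tensor([[random.randrange(n_actions)]], device=device, dtype=torch.long)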
Example #9
                    torch.save(self.target.state_dict(), target_PATH)

        policy_PATH = f"policy_episode_{self.episodes}_{self.steps}"
        target_PATH = f"target_episode_{self.episodes}_{self.steps}"
        torch.save(self.policy.state_dict(), policy_PATH)
        torch.save(self.target.state_dict(), target_PATH)

        env.close()


if __name__ == "__main__":
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    print(f"Using Device {device}")


    target = DQN(device=device).to(device)
    policy = DQN(device=device).to(device)
    model = torch.load("policy_episode_14300_8617296", map_location=torch.device("cpu"))

    policy.load_state_dict(model)
    target.load_state_dict(model)

    mem = ReplayMemory(TrainPongV0.MEMORY_SIZE)
    trainer = TrainPongV0(target, policy, mem, device)
    try:
        trainer.train(50000)
    finally:
        np.save("rewards", trainer.total_rewards, allow_pickle=True)

Example #10
class Snake(object):
    actions = ['FORWARD', 'LEFT', 'RIGHT']
    orientations = ['UP', 'DOWN', 'LEFT', 'RIGHT']

    def __init__(
        self,
        length,
        box_dimensions,
        device,
        BATCH_SIZE=128,
        GAMMA=0.999,
        EPS_START=0.9,
        EPS_END=0.05,
        EPS_DECAY=200,
        TARGET_UPDATE=10,
    ):
        self.length = length
        self.box_dimensions = box_dimensions
        self.width = box_dimensions[0]
        self.height = box_dimensions[1]
        self.epsilon = 0.01
        self.orientation = 'LEFT'
        self.device = device

        self.policy_net = DQN(self.height, self.width,
                              len(self.actions)).to(device)
        self.target_net = DQN(self.height, self.width,
                              len(self.actions)).to(device)
        self.optimizer = optim.RMSprop(self.policy_net.parameters())
        self.memory = ReplayMemory(10000)
        self.steps_done = 0
        self.cumulative_reward = 0.0
        self.episode = 1
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()

        self.BATCH_SIZE = BATCH_SIZE
        self.GAMMA = GAMMA
        self.EPS_START = EPS_START
        self.EPS_END = EPS_END
        self.EPS_DECAY = EPS_DECAY
        self.TARGET_UPDATE = TARGET_UPDATE
        self._reset(setup=True)

    def _reset(self, setup=False):
        self.i_pos = self.box_dimensions / 2
        self.body_position = [
            np.array([self.i_pos[0] - x, self.i_pos[1]], dtype=int)
            for x in range(self.length)
        ]
        self.cumulative_reward = 0.0

        if not setup:
            self.episode += 1

        print('On episode {}, step {}'.format(self.episode, self.steps_done))

    def act(self, state):
        a = self.select_action(state)
        return self.actions[a.item()]

    def _convert_action_to_tensor(self, action):
        return torch.tensor(self.actions.index(action)).view(1, -1)

    def _convert_move_to_point(self, move):
        if move == 'FORWARD':
            return self._handle_forward(move)
        elif move == 'LEFT':
            return self._handle_left(move)
        elif move == 'RIGHT':
            return self._handle_right(move)
        else:
            raise ValueError('Invalid move')

    def is_colliding(self, position):
        curr_body = list(self.body_position)
        curr_body.pop()

        for _, elt in enumerate(curr_body):
            if np.array_equal(elt, position):
                return True
        return False

    def select_action(self, state):
        sample = np.random.random()
        eps_threshold = self.EPS_END + (self.EPS_START - self.EPS_END) * \
            m.exp(-1. * self.steps_done / self.EPS_DECAY)
        self.steps_done += 1
        if sample > eps_threshold:
            with torch.no_grad():
                state_tensor = torch.tensor(state, device=self.device).double()
                state_shape = list(state_tensor.size())
                state_tensor = state_tensor.view(1, 1, state_shape[0],
                                                 state_shape[1])
                # t.max(1) will return largest column value of each row.
                # second column on max result is index of where max element was
                # found, so we pick action with the larger expected reward.
                return self.policy_net(state_tensor).max(1)[1].view(1, 1)
        else:
            return torch.tensor([[random.randrange(len(self.actions))]],
                                device=self.device,
                                dtype=torch.long)

    def _handle_forward(self, move):
        x_pos, y_pos = self.body_position[0]
        if self.orientation == 'UP':
            return np.array([x_pos, y_pos + 1])
        elif self.orientation == 'DOWN':
            return np.array([x_pos, y_pos - 1])
        elif self.orientation == 'LEFT':
            return np.array([x_pos + 1, y_pos])
        elif self.orientation == 'RIGHT':
            return np.array([x_pos - 1, y_pos])
        else:
            raise ValueError('Invalid orientation')

    def _handle_left(self, move):
        x_pos, y_pos = self.body_position[0]
        if self.orientation == 'UP':
            self.orientation = 'LEFT'
            return np.array([x_pos + 1, y_pos])

        elif self.orientation == 'DOWN':
            self.orientation = 'RIGHT'
            return np.array([x_pos - 1, y_pos])

        elif self.orientation == 'LEFT':
            self.orientation = 'DOWN'
            return np.array([x_pos, y_pos - 1])

        elif self.orientation == 'RIGHT':
            self.orientation = 'UP'
            return np.array([x_pos, y_pos + 1])
        else:
            raise ValueError('Invalid orientation')

    def _handle_right(self, move):
        x_pos, y_pos = self.body_position[0]
        if self.orientation == 'UP':
            self.orientation = 'RIGHT'
            return np.array([x_pos - 1, y_pos])

        elif self.orientation == 'DOWN':
            self.orientation = 'LEFT'
            return np.array([x_pos + 1, y_pos])

        elif self.orientation == 'LEFT':
            self.orientation = 'UP'
            return np.array([x_pos, y_pos + 1])

        elif self.orientation == 'RIGHT':
            self.orientation = 'DOWN'
            return np.array([x_pos, y_pos - 1])
        else:
            raise ValueError('Invalid orientation')

    def process_reward(self, reward):
        self.cumulative_reward += reward

    def optimize_model(self):
        if len(self.memory) < self.BATCH_SIZE:
            return
        transitions = self.memory.sample(self.BATCH_SIZE)
        # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
        # detailed explanation). This converts batch-array of Transitions
        # to Transition of batch-arrays.
        batch = Transition(*zip(*transitions))
        # Compute a mask of non-final states and concatenate the batch elements
        # (a final state would've been the one after which simulation ended)
        non_final_mask = torch.tensor(tuple(
            map(lambda s: s is not None, batch.next_state)),
                                      device=self.device,
                                      dtype=torch.bool)
        non_final_next_states = torch.cat(
            [torch.from_numpy(s) for s in batch.next_state if s is not None])
        state_batch = torch.cat(
            [torch.from_numpy(s) for s in batch.state if s is not None])
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)

        # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
        # columns of actions taken. These are the actions which would've been taken
        # for each batch state according to policy_net
        state_action_values = self.policy_net(
            state_batch.view(self.BATCH_SIZE, 1, 10,
                             10)).gather(1, action_batch)

        # Compute V(s_{t+1}) for all next states.
        # Expected values of actions for non_final_next_states are computed based
        # on the "older" target_net; selecting their best reward with max(1)[0].
        # This is merged based on the mask, such that we'll have either the expected
        # state value or 0 in case the state was final.
        next_state_values = torch.zeros(self.BATCH_SIZE, device=self.device)
        next_state_values[non_final_mask] = self.target_net(
            non_final_next_states.view(self.BATCH_SIZE, 1, 10,
                                       10)).float().max(1)[0].detach()
        # Compute the expected Q values
        expected_state_action_values = (next_state_values *
                                        self.GAMMA) + reward_batch

        # Compute Huber loss
        loss = F.smooth_l1_loss(
            state_action_values,
            expected_state_action_values.double().unsqueeze(1))

        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()
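optimize_model above samples Transition tuples from a capacity-bounded buffer, following the pattern of the official PyTorch DQN tutorial. The project's own ReplayMemory is not shown on this page; a minimal sketch compatible with the calls used here (ReplayMemory(10000), len(self.memory), self.memory.sample(...), Transition(*zip(*transitions))) could look like this.

import random
from collections import deque, namedtuple

# Field names match the attributes read in optimize_model above
# (batch.state, batch.action, batch.next_state, batch.reward).
Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward'))


class ReplayMemory:
    """Fixed-capacity cyclic buffer (a sketch in the style of the PyTorch DQN tutorial)."""

    def __init__(self, capacity):
        self.memory = deque([], maxlen=capacity)

    def push(self, *args):
        # Store one Transition; terminal steps may pass next_state=None,
        # which the non_final_mask in optimize_model filters out.
        self.memory.append(Transition(*args))

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)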
Example #11
            episode_reward += reward
            if render:
                env.render()
            if done:
                break
        eval_reward.append(episode_reward)
    return np.mean(eval_reward)


if __name__ == '__main__':
    # Create and wrap the environment
    env = gym.make('game-stock-exchange-continuous-v0')
    env = DummyVecEnv([lambda: env])
    action_dim = 2
    obs_shape = env.observation_space.shape
    rpm  = ReplayMemory(MEMORY_SIZE)

    model = Model(act_dim = action_dim)
    algorithm = DQN(model, act_dim = action_dim, gamma = GAMMA, lr = LEARNING_RATE)

    agent = Agent(algorithm, obs_shape[0],obs_shape[1],action_dim)

    while len(rpm) < MEMORY_WARMUP_SIZE:
        run_episode(env,agent,rpm)

    max_episode = 2000
    episode = 0
    while episode < max_episode:
        for i in range(0,50):
            total_reward = run_episode(env,agent,rpm)
            episode += 1
Example #12
class AgentTrainer():
	def __init__(self):

		self.BATCH_SIZE = 128
		self.GAMMA = 0.99
		self.EPS_START = 1.0
		self.EPS_END = 0.05
		self.EPS_DECAY = 0.000005
		self.TARGET_UPDATE = 5

		self.pretrain_length = self.BATCH_SIZE
		#self.state_size = [55,3]
		
		self.action_size = 3
		self.hot_actions = np.array(np.identity(self.action_size).tolist())
		#self.action_size = len(self.hot_actions)
		self.learning_rate = 0.0005
		#self.total_episodes = 12
		self.max_steps = 1000

		self.env = Environment()

		self.memory_maxsize = 10000

		self.DQNetwork = DQNetwork(learning_rate = self.learning_rate,name = 'DQNetwork')

		self.TargetNetwork = DQNetwork(learning_rate = self.learning_rate , name = 'TargetNetwork')

		self.memory = ReplayMemory(max_size=self.memory_maxsize)

		self.saver = tf.train.Saver()

		#self.TargetUpdate = update_target_graph()

	def select_action(self, sess,decay_step, state, actions):
		## EPSILON GREEDY STRATEGY Choose action a from state s using epsilon greedy.
		## First we randomize a number
		exp_exp_tradeoff = np.random.rand()

		
		explore_probability = self.EPS_END + (self.EPS_START - self.EPS_END) * np.exp(-self.EPS_DECAY * decay_step)

		if (explore_probability > exp_exp_tradeoff):
			# Make a random action (exploration)
			choice = random.randint(1,len(self.hot_actions))-1
			action = self.hot_actions[choice]
			#print('action_taken is random',action)

		else:
			# Get action from Q-network (exploitation)
			# Estimate the Qs values state
								
			Qs = sess.run(self.DQNetwork.output, feed_dict = {self.DQNetwork.inputs_: state.reshape((1,) + state.shape)})

			# Take the biggest Q value (= the best action)
			choice = np.argmax(Qs)
			action = self.hot_actions[choice]

		return action, explore_probability

	# This function copies one set of variables to another (DQNetwork -> TargetNetwork)
	def update_target_graph(self):
		from_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "DQNetwork")
		to_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "TargetNetwork")
		op_holder = [to_var.assign(from_var) for from_var, to_var in zip(from_vars, to_vars)]
		return op_holder

	def train(self,num_episodes,sess):

		# Instantiate memory
		#memory = Memory(max_size = memory_size)
		for i in range(self.pretrain_length):
			# If it's the first step
			if i == 0:
				state = self.env.reset()

			# Get the next_state, the rewards, done by taking a random action
			choice = random.randint(1,len(self.hot_actions))-1
			action = self.hot_actions[choice]
			next_state, reward, done = self.env.step(np.argmax(action))

			# If the episode is finished (we're dead 3x)
			if done:
				# We finished the episode
				next_state = np.zeros(state.shape)
				# Add experience to memory
				self.memory.add((state, action, reward, next_state, done))
				# Start a new episode
				state = self.env.reset()

			else:
				# Add experience to memory
				self.memory.add((state, action, reward, next_state, done))
				#print("adding to memory")
				sys.stdout.flush()
				# Our new state is now the next_state
				state = next_state
		


		decay_step = 0

		rewards_list = []

		total_steps = 0

		
		for episode in range(num_episodes):
			#print('episode', episode)

			# Set step to 0
			step = 0

			total_reward = 0

			# Initialize the rewards of the episode
			episode_rewards = []

			# Make a new episode and observe the first state
			state = self.env.reset()

			done = False

			#cv2.imshow(state)
			#cv2.waitKey(100)

			
			while not done:
				step += 1
				total_steps += 1

				# Increase decay_step
				decay_step += 1

				# Predict the action to take and take it
				action, explore_probability = self.select_action(sess, decay_step, state, self.hot_actions)

				# Perform the action and get the next_state, reward, and done information
				next_state, reward, done = self.env.step(np.argmax(action))

				# Add the reward to total reward
				episode_rewards.append(reward)

				# If the game is finished
				if done:
					# The episode ends so no next state
					next_state = np.zeros(state.shape, dtype=int)

					steps_taken = step

					# Get the total reward of the episode
					total_reward = np.sum(episode_rewards)

					rewards_list.append((episode, total_reward))

					# Store transition <st,at,rt+1,st+1> in memory D
					self.memory.add((state, action, reward, next_state, done))

				else:
					# Stack the frame of the next_state
					# next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)

					# Add experience to memory
					self.memory.add((state, action, reward, next_state, done))

					steps_taken = step

					# st+1 is now our current state
					state = next_state

				### LEARNING PART
				# Obtain random mini-batch from memory
				batch = self.memory.sample(self.BATCH_SIZE)
				states_mb = np.array([each[0] for each in batch], ndmin=3)
				actions_mb = np.array([each[1] for each in batch])
				rewards_mb = np.array([each[2] for each in batch]) 
				next_states_mb = np.array([each[3] for each in batch], ndmin=3)
				dones_mb = np.array([each[4] for each in batch])

				target_Qs_batch = []

				# Get Q values for next_state from the online network
				Qs_next_state = sess.run(self.DQNetwork.output, feed_dict = {self.DQNetwork.inputs_: next_states_mb})
				# Get the target Q values for next_state from the target network
				q_target_next_state = sess.run(self.TargetNetwork.output, feed_dict = {self.TargetNetwork.inputs_: next_states_mb})

				# Set Q_target = r if the episode ends at s+1, otherwise set Q_target = r + gamma*maxQ(s', a')
				for i in range(0, len(batch)):
					terminal = dones_mb[i]

					# If we are in a terminal state, only equals reward
					if terminal:
						target_Qs_batch.append(rewards_mb[i])

					else:
						target = rewards_mb[i] + self.GAMMA * np.max(q_target_next_state[i])
						#print (target)
						#print(Qs_next_state[i])
						target_Qs_batch.append(target)

				targets_mb = np.array([each for each in target_Qs_batch])

				#print(targets_mb)

				loss, _ = sess.run([self.DQNetwork.loss, self.DQNetwork.optimizer],
				                        feed_dict={self.DQNetwork.inputs_: states_mb,
				                                   self.DQNetwork.target_Q: targets_mb,
				                                   self.DQNetwork.actions_: actions_mb})


			if episode%self.TARGET_UPDATE==0:
				# Update the parameters of our TargetNetwork with DQN_weights
				#update_target = self.TargetUpdate()


				# Get the parameters of our DQNNetwork
				from_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "DQNetwork")

				# Get the parameters of our Target_network
				to_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "TargetNetwork")

				op_holder = []

				# Update our target_network parameters with DQNNetwork parameters
				for from_var,to_var in zip(from_vars,to_vars):
					op_holder.append(to_var.assign(from_var))
				
				sess.run(op_holder)

				print("Target Model updated")

	            # Write TF Summaries
	           # summary = sess.run(write_op, feed_dict={self.DQNetwork.inputs_: states_mb,
	            #                                       self.DQNetwork.target_Q: targets_mb,
	            #                                       self.DQNetwork.actions_: actions_mb})
	           # writer.add_summary(summary, episode)
	           # writer.flush()

			print('Total steps: {}'.format(total_steps),'Episode: {}'.format(episode),'Step: {}'.format(steps_taken),
					              'Total reward: {}'.format(np.sum(episode_rewards)),
					              'Explore P: {:.4f}'.format(explore_probability),
					            'Training Loss {:.4f}'.format(loss))


			# Save model every 100 episodes
			if episode % 100 == 0:
				save_path = self.saver.save(sess, "./models/model.ckpt")
				print("Model Saved")
Example #13
class DQN():
    def __init__(self, conf, device):
        self.conf = conf
        self.state_dim = conf['state_dim']
        self.action_dim = conf['action_dim']
        self.device = device

        self.q = DQNNetwork(self.state_dim, self.action_dim).to(self.device)
        self.q_target = DQNNetwork(self.state_dim,
                                   self.action_dim).to(self.device)
        self.q_target.load_state_dict(self.q.state_dict())
        self.q_target.eval()

        self.memory = ReplayMemory(self.conf)

        self.optimizer = optim.Adam(self.q.parameters(), lr=lr_dqn)

        self.loss = HuberLoss()
        self.loss = self.loss.to(self.device)
        self.currIteration = 0

    def update(self):
        for i in range(1):
            if self.memory.length() < self.conf['batch_size']:
                return
            transitions = self.memory.sample_batch(self.conf['batch_size'])
            one_batch = Transition(*zip(*transitions))

            action_batch = torch.cat(one_batch.action).view(
                -1, 1)  # [batch-size, 1]
            reward_batch = torch.cat(one_batch.reward).view(
                -1, 1)  # [batch-size, 1]
            state_batch = torch.cat(one_batch.state).view(
                -1, self.conf['state_dim'])
            next_state_batch = torch.cat(one_batch.next_state).view(
                -1, self.conf['state_dim'])

            # dones_var = to_tensor_var(batch.dones, self.use_cuda).view(-1, 1)

            # # compute Q(s_t, a) - the model computes Q(s_t), then we select the
            # # columns of actions taken
            current_q = self.q(state_batch).gather(1, action_batch)

            # # compute V(s_{t+1}) for all next states and all actions,
            # # and we then take max_a { V(s_{t+1}) }
            next_q = self.q_target(next_state_batch).max(1)[0].view(-1, 1)

            # # compute target q by: r + gamma * max_a { V(s_{t+1}) }
            target_q = reward_batch + (self.conf['gamma'] * next_q)
            # print("current_q:%s, target_q:%s"%(current_q[0].item(), target_q[0].item()))

            # optimizer step
            loss = self.loss(current_q, target_q)
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
            return loss.cpu().item()

    ### epsilon-greedy algorithm ###
    def select_action(self, state_ts):
        '''
        Input 'state_ts' is a tensor of shape [1, 2*num_plant, 1] (already unsqueezed).
        Output 'action' is a [1x1] tensor.
        '''

        sample = random.random()
        eps_threshold = self.conf['epsilon_end'] + (
            self.conf['epsilon_start'] - self.conf['epsilon_end']) * math.exp(
                -1. * self.currIteration / self.conf['epsilon_decay'])
        self.currIteration += 1
        # if (self.currIteration % 1000) == 0:
        #     print("currIteration:%s, eps_threshold:%s"%(self.currIteration, eps_threshold))

        if sample > eps_threshold:
            with torch.no_grad():
                # argmax_a Q(s)
                action = self.q.forward(state_ts).argmax().view(1)
                # .max(0)[1] would give the max value [0] / its index [1] along dim 0; .view(1) reshapes the scalar to a 1-element tensor
                return action
        else:
            action = torch.tensor(
                [random.randint(0, self.conf['action_dim'] - 1)],
                device=self.device,
                dtype=torch.long)  # [1X1]
            return action
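Examples 3, 13 and 14 read all hyperparameters from a single conf dict. A hypothetical configuration covering only the keys visible in those snippets is sketched below; every value is a placeholder, and 'memory_capacity' is a guessed key, since the snippets never show what ReplayMemory(conf) actually reads.

# Hypothetical conf dict; values are placeholders, keys limited to those read in Examples 3, 13 and 14.
conf = {
    'state_dim': 4,            # used by DQNNetwork (Examples 3 and 13)
    'action_dim': 2,           # used by DQNNetwork and select_action (Examples 3 and 13)
    'batch_size': 128,         # used by update() (Example 13)
    'gamma': 0.99,             # used by update() (Example 13)
    'epsilon_start': 0.9,      # used by select_action (Example 13)
    'epsilon_end': 0.05,       # used by select_action (Example 13)
    'epsilon_decay': 200,      # used by select_action (Example 13)
    'num_episode': 100,        # used by train() (Example 14)
    'target_update': 10,       # used by train() (Example 14)
    'memory_capacity': 10000,  # assumed key for the replay buffer size
}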
Example #14
class MultipendulumSim():
    def __init__(self, env, conf):
        self.env = env
        self.conf = conf
        self.set_cuda()
        
        self.agent = DDPG(conf, self.device)
        self.memory = ReplayMemory(conf)

    def train(self):
        self.epi_rewards = []
        self.epi_losses = []
        
        for epi in range(self.conf['num_episode']):  # episodes
            print("--- episode %s ---"%(epi))
            epi_reward = 0.0
            state = self.env.reset()                    # [2*num_plant, 1]
            state_ts = to_tensor(state).unsqueeze(0)    # [1, 2*num_plant, 1]; unsqueeze(0) is needed for the replay memory
            dataPlot = dataPlotter_v2(self.conf)
            
            t = P.t_start
            while t < P.t_end:        # one episode (simulation)
                t_next_plot = t + P.t_plot
                while t < t_next_plot:  # data plot period
                    if round(t,3)*1000 % 10 == 0: # every 10 ms, schedule update
                        
                        action = self.agent.select_action(state_ts)     # action type: tensor [1X1]
                        next_state, reward, done, info = self.env.step(action.item(), t) # shape of next_state : [(2*num_plant) X 1]
                        epi_reward += reward
                        # self.env.step(0, t)    # test for env.step() function
                        
                        if done: 
                            next_state_ts = None
                            break
                        else:
                            next_state_ts = to_tensor(next_state).unsqueeze(0)  # [1, 2*num_plant, 1]
                        reward_ts = to_tensor(np.asarray(reward).reshape(-1))   # its size should be [1] for the replay buffer
                        
                        # memory push
                        self.memory.push_transition(state_ts, action, next_state_ts, reward_ts)
                        
                        state_ts = next_state_ts
                        
                        # model optimization step
                        currLoss = self.agent.optimization_model(self.memory)
                    else:   # every 1 ms
                        self.env.update_plant_state(t) # plant status update
                    t = t + P.Ts
                # self.update_dataPlot(dataPlot, t) # update data plot
                if next_state_ts is None:   # episode terminates
                    dataPlot.close()
                    break
            
            # episode done
            self.epi_rewards.append(epi_reward)
            self.epi_losses.append(currLoss)
            # The target network has its weights kept frozen most of the time
            if epi % self.conf['target_update'] == 0:
                self.agent.scheduler_target.load_state_dict(self.agent.scheduler.state_dict())
    
        # Save state_dict
        torch.save(self.agent.scheduler.state_dict(), MODEL_PATH)
        self.save_log()
        self.load_log()
        
    def save_log(self):
        combined_stats = dict()
        combined_stats['rollout/return'] = np.mean(self.epi_rewards)
        combined_stats['rollout/return_history'] = self.epi_rewards
        combined_stats['train/loss'] = self.epi_losses
        with open(LOG_PATH + LOG_FILE, 'wb') as f:
            pickle.dump(combined_stats,f)
        # combined-stats['train/loss_scheduler'] = 
        
        
        # combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
        # combined_stats['rollout/return_history'] = np.mean(episode_rewards_history)
        # combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
        # combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
        # combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
        # combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
        # combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
        # combined_stats['total/duration'] = duration
        # combined_stats['total/steps_per_second'] = float(t) / float(duration)
        # combined_stats['total/episodes'] = episodes
        # combined_stats['rollout/episodes'] = epoch_episodes
        # combined_stats['rollout/actions_std'] = np.std(epoch_actions)
    
    def load_log(self):
        with open(LOG_PATH + LOG_FILE, 'rb') as f:
            data = pickle.load(f)
        print("data:", data)
    
    def set_cuda(self):
        self.is_cuda = torch.cuda.is_available()
        print("torch version: ", torch.__version__)
        print("is_cuda: ", self.is_cuda)
        if self.is_cuda:
            self.device = torch.device("cuda:0")
            print(torch.cuda.get_device_name(0))
            print("Program will run on *****GPU-CUDA***** ")
        else:
            self.device = torch.device("cpu")
            print("Program will run on *****CPU***** ")
            
    def update_dataPlot(self, dataPlot, t):
        r_buff, x_buff, u_buff = self.env.get_current_plant_status()
        for i in range(self.env.num_plant):
            dataPlot.update(i, t, r_buff[i], x_buff[i], u_buff[i])
        dataPlot.plot()
        plt.pause(0.0001)
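Examples 13 and 14 call self.memory.length(), self.memory.sample_batch(...) and self.memory.push_transition(...), but the project's ReplayMemory class is not included on this page. A minimal sketch compatible with those calls, under the same 'memory_capacity' assumption as in the conf example above, would be:

import random
from collections import deque, namedtuple

# Transition layout inferred from push_transition(state, action, next_state, reward) in Example 14.
Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward'))


class ReplayMemory:
    """Conf-driven replay buffer sketch matching the calls in Examples 13 and 14 (not the original class)."""

    def __init__(self, conf):
        # 'memory_capacity' is an assumed key; the snippets never show which key the real class reads.
        self.buffer = deque([], maxlen=conf.get('memory_capacity', 10000))

    def push_transition(self, state, action, next_state, reward):
        self.buffer.append(Transition(state, action, next_state, reward))

    def sample_batch(self, batch_size):
        return random.sample(self.buffer, batch_size)

    def length(self):
        return len(self.buffer)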