import torch
import torch.multiprocessing as mp
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

import Advantage
import Agent
import ClientSocket
import NeuralNet


def main():
    # Keep the global networks on CPU: their parameters are placed in shared
    # memory and updated from several worker processes.
    device = torch.device('cpu')

    num_action = 2
    num_state = 4
    num_process = 5

    global_Actor = NeuralNet.ActorNet(inputs=num_state, outputs=num_action,
                                      num_hidden_layers=2, hidden_dim=8).to(device)
    # summary(global_Actor, input_size=(10, num_state))
    global_Critic = NeuralNet.CriticNet(inputs=num_state, outputs=1,
                                        num_hidden_layers=2, hidden_dim=8).to(device)
    # summary(global_Critic, input_size=(10, num_state))

    batch_size = 64
    GAMMA = 0.95
    max_episodes = 5000
    max_step = 1000

    # Share the global parameters so every worker reads and writes the same tensors.
    global_Actor.share_memory()
    global_Critic.share_memory()

    processes = []
    processes_socket = []
    processes_agent = []

    mp.set_start_method('spawn')
    print("MP start method:", mp.get_start_method())

    ip = '110.76.78.109'
    port = 1111
    for rank in range(num_process):
        processes_socket.append(ClientSocket.MySocket(port, 'f', 'ffff?f'))
        processes_agent.append(Agent.Brain(GlobalActorNet=global_Actor,
                                           GlobalCriticNet=global_Critic,
                                           device=device,
                                           socket=processes_socket[rank],
                                           num_action=num_action,
                                           max_episodes=max_episodes,
                                           max_step=max_step,
                                           batch_size=batch_size,
                                           GAMMA=GAMMA))
        p = mp.Process(target=processes_agent[rank].train, args=())
        p.start()
        processes.append(p)

    for p in processes:
        p.join()
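ClientSocket is not shown anywhere in these snippets. Judging from the constructor call MySocket(port, 'f', 'ffff?f'), the two strings look like struct format strings for the outgoing frame (one float action) and the incoming frame (four state floats, a done flag, and the height). A minimal sketch under those assumptions; the host argument and the zero reward placeholder are guesses, not the project's actual API:

import socket
import struct

import numpy as np


class MySocket:
    """Hypothetical fixed-size struct framing over TCP (an assumption)."""

    def __init__(self, port, send_fmt, recv_fmt, host='127.0.0.1'):
        self.send_fmt = send_fmt   # e.g. 'f'      -> one float (the action)
        self.recv_fmt = recv_fmt   # e.g. 'ffff?f' -> 4 state floats, done flag, height
        self.sock = socket.create_connection((host, port))

    def senddata(self, *values):
        self.sock.sendall(struct.pack(self.send_fmt, *values))

    def getdata(self):
        size = struct.calcsize(self.recv_fmt)
        buf = b''
        while len(buf) < size:                      # read until one full frame arrives
            chunk = self.sock.recv(size - len(buf))
            if not chunk:
                raise ConnectionError('socket closed')
            buf += chunk
        *state, done, height = struct.unpack(self.recv_fmt, buf)
        # The frame carries no reward field; both train loops overwrite the
        # reward anyway, so a zero placeholder keeps the 4-tuple interface.
        return np.array(state, dtype=np.float32), 0.0, done, height

One caveat with the spawn start method (the default on Windows): main() must be called from under an if __name__ == '__main__': guard, otherwise every worker re-imports the module and re-executes its top level.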
def main():
    processes = []
    processes_socket = []
    processes_agent = []

    device = torch.device('cpu')
    num_action = 2
    num_state = 4
    num_process = 1
    batch_size = 64
    GAMMA = 0.95
    max_episodes = 5000
    max_step = 1000

    global_Actor = NeuralNet.ActorNet(inputs=num_state, outputs=num_action,
                                      num_hidden_layers=2, hidden_dim=8).to(device)
    global_Critic = NeuralNet.CriticNet(inputs=num_state, outputs=1,
                                        num_hidden_layers=2, hidden_dim=8).to(device)

    # Restore the trained weights before evaluation.
    global_Actor.load_state_dict(torch.load("D:/modelDict/actor/modelDict.pt"))
    global_Critic.load_state_dict(torch.load("D:/modelDict/critic/modelDict.pt"))

    port = 1111
    for rank in range(num_process):
        processes_socket.append(ClientSocket.MySocket(port, 'f', 'ffff?f'))
        processes_agent.append(Agent.Brain(GlobalActorNet=global_Actor,
                                           GlobalCriticNet=global_Critic,
                                           device=device,
                                           socket=processes_socket[rank],
                                           num_action=num_action,
                                           max_episodes=max_episodes,
                                           max_step=max_step,
                                           batch_size=batch_size,
                                           GAMMA=GAMMA))
        p = mp.Process(target=processes_agent[rank].test,
                       args=(global_Actor, global_Critic))
        p.start()
        processes.append(p)

    for p in processes:
        p.join()
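The saving side of those D:/modelDict/... checkpoints never appears in the snippets. Presumably the training run persists the global state_dicts at some point; a hypothetical helper (the name and signature are mine, not the project's) consistent with how the test main() restores them:

import os

import torch


def save_global_nets(global_actor, global_critic,
                     actor_path="D:/modelDict/actor/modelDict.pt",
                     critic_path="D:/modelDict/critic/modelDict.pt"):
    # Save only the state_dicts, matching the load_state_dict(torch.load(...))
    # calls in the test main().
    os.makedirs(os.path.dirname(actor_path), exist_ok=True)
    os.makedirs(os.path.dirname(critic_path), exist_ok=True)
    torch.save(global_actor.state_dict(), actor_path)
    torch.save(global_critic.state_dict(), critic_path)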
def train(global_actor, global_critic, num_state, num_action, device, socket,
          max_episodes, max_step, batch_size, GAMMA):
    # Local copies sized to match the global networks built in main()
    # (2 hidden layers, width 8), so the state_dicts are interchangeable.
    local_actor = NeuralNet.ActorNet(inputs=num_state, outputs=num_action,
                                     num_hidden_layers=2, hidden_dim=8).to(device)
    local_critic = NeuralNet.CriticNet(inputs=num_state, outputs=1,
                                       num_hidden_layers=2, hidden_dim=8).to(device)
    local_actor.load_state_dict(global_actor.state_dict())
    local_critic.load_state_dict(global_critic.state_dict())

    entropy_coef = 0.001
    actor_optimizer = optim.Adam(local_actor.parameters())
    critic_optimizer = optim.Adam(local_critic.parameters())

    memory = Advantage.AdvantageMemory(batch_size, num_state, device=device,
                                       GAMMA=GAMMA, kind_action=1)

    for epi in range(max_episodes):
        # Sync the local networks with the latest global parameters.
        local_actor.load_state_dict(global_actor.state_dict())
        local_critic.load_state_dict(global_critic.state_dict())

        state, reward, done, height = socket.getdata()
        local_actor.eval()
        local_critic.eval()

        for step in range(max_step):
            local_actor.eval()
            local_critic.eval()

            # Sample an action from the current policy.
            action_prob = local_actor(torch.from_numpy(state).float().to(device))
            action_distrib = Categorical(action_prob)
            action = action_distrib.sample()

            socket.senddata(float(action.item()))
            next_state, reward, done, height = socket.getdata()

            if done:
                reward = -10
                mask = 0
            else:
                reward = 0
                # reward = (height - 3) / 10
                mask = 1

            memory.input_data(state, next_state, reward, mask)
            state = next_state

            if memory.fill_batch():
                # Recompute the chosen actions' probabilities over the batch.
                action_prob = local_actor(memory.states)
                action_distrib = Categorical(action_prob)
                action = action_distrib.sample().unsqueeze(1)
                action_prob = action_prob.gather(1, action)

                state_value = local_critic(memory.states)
                next_state_value = local_critic(memory.next_states)

                # One-step TD target and advantage.
                Q = memory.rewards + GAMMA * next_state_value.detach() * memory.masks
                A = Q - state_value

                local_actor.train()
                local_critic.train()

                # Critic update.
                critic_optimizer.zero_grad()
                critic_loss = F.smooth_l1_loss(state_value, Q)
                critic_loss.backward()
                critic_optimizer.step()
                global_critic.load_state_dict(local_critic.state_dict())

                # Actor update with an entropy bonus for exploration;
                # the small epsilon guards against log(0).
                log_prob = torch.log(action_prob + 1e-3)
                entropy = -log_prob * action_prob
                actor_loss = -A.detach() * log_prob - entropy_coef * entropy
                actor_loss = torch.sum(actor_loss) / len(action_prob)

                actor_optimizer.zero_grad()  # clear accumulated gradients before the actor step
                actor_loss.backward()
                actor_optimizer.step()
                global_actor.load_state_dict(local_actor.state_dict())

            if done:
                print('epi : ', epi, " is end and step is :", step)
                break
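The Advantage module is also not shown. From the calls above, AdvantageMemory needs input_data(...), fill_batch(), and batched tensor attributes states, next_states, rewards, and masks shaped (batch, 1) for the TD target. A minimal sketch under those assumptions (field names and reset behavior are guesses):

import numpy as np
import torch


class AdvantageMemory:
    """Hypothetical batch buffer matching the interface the train loops use."""

    def __init__(self, batch_size, num_state, device, GAMMA, kind_action=1):
        self.batch_size = batch_size
        self.device = device
        self.GAMMA = GAMMA
        self.kind_action = kind_action
        self._buffer = []

    def input_data(self, state, next_state, reward, mask):
        self._buffer.append((state, next_state, reward, mask))

    def fill_batch(self):
        # Once a full batch is collected, expose it as tensors and reset.
        if len(self._buffer) < self.batch_size:
            return False
        states, next_states, rewards, masks = zip(*self._buffer)
        self.states = torch.tensor(np.array(states), dtype=torch.float32,
                                   device=self.device)
        self.next_states = torch.tensor(np.array(next_states), dtype=torch.float32,
                                        device=self.device)
        self.rewards = torch.tensor(rewards, dtype=torch.float32,
                                    device=self.device).unsqueeze(1)
        self.masks = torch.tensor(masks, dtype=torch.float32,
                                  device=self.device).unsqueeze(1)
        self._buffer.clear()
        return True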
def train(global_actor, global_critic, num_state, num_action, device, socket,
          max_episodes, max_step, batch_size, GAMMA):
    local_actor = NeuralNet.ActorNet(inputs=num_state, outputs=num_action,
                                     num_hidden_layers=2, hidden_dim=8).to(device)
    local_critic = NeuralNet.CriticNet(inputs=num_state, outputs=1,
                                       num_hidden_layers=2, hidden_dim=8).to(device)
    local_actor.load_state_dict(global_actor.state_dict())
    local_critic.load_state_dict(global_critic.state_dict())

    entropy_coef = 0.001
    # Optimize the local copies, whose parameters actually receive gradients
    # from backward(); the globals are refreshed via load_state_dict below.
    actor_optimizer = optim.Adam(local_actor.parameters())
    critic_optimizer = optim.Adam(local_critic.parameters())

    memory = Advantage.AdvantageMemory(batch_size, num_state, device=device,
                                       GAMMA=GAMMA, kind_action=1)

    for epi in range(max_episodes):
        state, reward, done, height = socket.getdata()
        local_actor.eval()
        local_critic.eval()

        for step in range(max_step):
            local_actor.eval()
            local_critic.eval()
            # This variant re-syncs with the global networks every step.
            local_actor.load_state_dict(global_actor.state_dict())
            local_critic.load_state_dict(global_critic.state_dict())

            action_prob = local_actor(torch.from_numpy(state).float().to(device))
            action_distrib = Categorical(action_prob)
            action = action_distrib.sample()

            socket.senddata(float(action.item()))
            next_state, reward, done, height = socket.getdata()

            # Height-based reward shaping; a fixed penalty on termination.
            reward = -10 if done else (height - 3) / 10
            mask = 0 if done else 1

            memory.input_data(state, next_state, reward, mask)
            state = next_state

            if memory.fill_batch():
                action_prob = local_actor(memory.states)
                action_distrib = Categorical(action_prob)
                action = action_distrib.sample().unsqueeze(1)
                # Select each row's probability for its chosen action.
                action_prob = action_prob.gather(1, action)

                state_value = local_critic(memory.states)
                next_state_value = local_critic(memory.next_states)

                Q = memory.rewards + GAMMA * next_state_value.detach() * memory.masks
                A = Q - state_value

                local_actor.train()
                local_critic.train()

                critic_optimizer.zero_grad()
                critic_loss = F.mse_loss(state_value, Q.detach())
                critic_loss.backward()
                critic_optimizer.step()
                global_critic.load_state_dict(local_critic.state_dict())

                log_prob = torch.log(action_prob + 1e-3)  # epsilon guards against log(0)
                entropy = -(log_prob * action_prob)
                actor_loss = -A.detach() * log_prob - entropy_coef * entropy
                actor_loss = torch.sum(actor_loss) / len(action_prob)

                actor_optimizer.zero_grad()  # clear accumulated gradients before the actor step
                actor_loss.backward()
                actor_optimizer.step()
                global_actor.load_state_dict(local_actor.state_dict())

            if done:
                print('epi : ', epi, " is end and step is :", step)
                break
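NeuralNet.ActorNet and NeuralNet.CriticNet are referenced throughout but never shown. Since the policy output feeds Categorical(action_prob) directly, the actor presumably ends in a softmax over actions; the layer layout below is a plausible reading of the (inputs, outputs, num_hidden_layers, hidden_dim) signature, not the project's actual module:

import torch.nn as nn


class ActorNet(nn.Module):
    """Hypothetical MLP policy ending in a softmax (an assumption)."""

    def __init__(self, inputs, outputs, num_hidden_layers, hidden_dim):
        super().__init__()
        layers = [nn.Linear(inputs, hidden_dim), nn.ReLU()]
        for _ in range(num_hidden_layers - 1):
            layers += [nn.Linear(hidden_dim, hidden_dim), nn.ReLU()]
        layers += [nn.Linear(hidden_dim, outputs), nn.Softmax(dim=-1)]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        return self.net(x)


class CriticNet(nn.Module):
    """Same body, but a plain linear value head."""

    def __init__(self, inputs, outputs, num_hidden_layers, hidden_dim):
        super().__init__()
        layers = [nn.Linear(inputs, hidden_dim), nn.ReLU()]
        for _ in range(num_hidden_layers - 1):
            layers += [nn.Linear(hidden_dim, hidden_dim), nn.ReLU()]
        layers += [nn.Linear(hidden_dim, outputs)]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        return self.net(x)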