def update(expbuffer, vin, oldvin, p=False):
    # Experience tuples: (action, state, place, next_state, next_place, reward, over)
    action, X, S1, S2, oldX, oldS1, oldS2, Y = expbuffer.sample()
    # Target-network pass; detach so no gradient flows back into oldvin
    # (the original assigned the detached tensor to a misspelled name, so the
    # detach had no effect).
    oldoutputs, _ = oldvin(oldX, oldS1, oldS2, myvin.Config())
    oldoutputs = oldoutputs.detach()
    Qmax = torch.max(oldoutputs, dim=1)[0].squeeze().cuda()
    outputs, _ = vin(X, S1, S2, myvin.Config())
    #print(outputs.shape, action.unsqueeze(1).shape)
    Qvalue = outputs.gather(index=action.unsqueeze(1).long(), dim=1).squeeze().cuda()
    # Note: terminal transitions are not masked here, so the target still
    # bootstraps from Qmax on the last step of an episode.
    TDtarget = (Y + gamma * Qmax).cuda()
    criterion = torch.nn.MSELoss(reduction='sum')  # size_average=False is deprecated
    # Regress Qvalue onto the TD target (the original compared against the raw
    # reward Y, leaving TDtarget unused).
    loss = criterion(Qvalue, TDtarget).cuda()
    # Recreating the optimizer each call resets its RMSprop state; ideally it
    # would be constructed once outside update(). Also use vin, not the global
    # VIN, so the function trains the network it was given.
    optimizer = optim.RMSprop(vin.parameters(), lr=myvin.Config().lr, eps=1e-6)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    if p:
        print(outputs[0], Qvalue[0], TDtarget[0], Y[0].cpu().numpy())
        grid.plot2(X[0].cpu().numpy(), int(S1[0].item()), int(S2[0].item()))
    return loss
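# update() above assumes an experience buffer whose sample() returns already-batched
# CUDA tensors in the order (action, X, S1, S2, oldX, oldS1, oldS2, Y). A minimal
# sketch of such a buffer (a hypothetical helper, not part of this repo), batching
# the same way the list-based update() variants below do:
class ExpBuffer:
    def __init__(self, capacity, batch_size):
        self.data = []  # list of (action, state, place, next_state, next_place, reward, over)
        self.capacity = capacity
        self.batch_size = batch_size

    def push(self, exp):
        self.data.append(exp)
        if len(self.data) > self.capacity:
            self.data.pop(0)  # drop the oldest experience

    def sample(self):
        batch = [self.data[np.random.randint(len(self.data))] for _ in range(self.batch_size)]
        action = torch.from_numpy(np.array([b[0] for b in batch])).long().cuda()
        X = torch.from_numpy(np.array([b[1] for b in batch])).float().cuda()
        S1 = torch.from_numpy(np.array([b[2][0] for b in batch])).float().cuda()
        S2 = torch.from_numpy(np.array([b[2][1] for b in batch])).float().cuda()
        oldX = torch.from_numpy(np.array([b[3] for b in batch])).float().cuda()
        oldS1 = torch.from_numpy(np.array([b[4][0] for b in batch])).float().cuda()
        oldS2 = torch.from_numpy(np.array([b[4][1] for b in batch])).float().cuda()
        Y = torch.from_numpy(np.array([b[5] for b in batch])).float().cuda()
        return action, X, S1, S2, oldX, oldS1, oldS2, Y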
def vinPredict(status, place, vin):
    # Epsilon-greedy action selection for an arbitrary vin instance.
    if np.random.random() < e:
        return np.random.randint(9)
    S1 = torch.Tensor([place[0]]).cuda()
    S2 = torch.Tensor([place[1]]).cuda()
    X = torch.Tensor(status).expand(1, len(status), status[0].shape[0], status[0].shape[1]).cuda()
    config = myvin.Config()
    q1, q2 = vin(X, S1, S2, config)
    # Return an action, matching the random branch above (the original returned
    # the raw q1 tensor here, making the return type inconsistent).
    _, action = torch.max(q1, dim=1)
    return int(action)
def update(experience, vin, oldvin):
    # Experience tuples: (action, state, place, next_state, next_place, reward, over)
    X, S1, S2 = [], [], []
    oldX, oldS1, oldS2 = [], [], []  # next state / next position
    action, Y = [], []
    for j in range(myvin.Config().batch_size):  # sample a minibatch from replay
        x = np.random.randint(len(experience))
        Y.append(experience[x][5])
        action.append(experience[x][0])
        X.append(experience[x][1])
        oldX.append(experience[x][3])
        S1.append(experience[x][2][0])
        S2.append(experience[x][2][1])
        oldS1.append(experience[x][4][0])
        oldS2.append(experience[x][4][1])
    # Batch via numpy first; do not change this to torch.Tensor(X).float(),
    # which is much slower on a list of arrays.
    X = torch.from_numpy(np.array(X)).float().cuda()
    S1 = torch.from_numpy(np.array(S1)).float().cuda()
    S2 = torch.from_numpy(np.array(S2)).float().cuda()
    oldS1 = torch.from_numpy(np.array(oldS1)).float().cuda()
    oldS2 = torch.from_numpy(np.array(oldS2)).float().cuda()
    oldX = torch.from_numpy(np.array(oldX)).float().cuda()
    action = torch.from_numpy(np.array(action)).unsqueeze(dim=1).long().cuda()
    Y = torch.from_numpy(np.array(Y)).float().cuda()
    # Target-network pass; detach so no gradient flows back into oldvin.
    oldoutputs, _ = oldvin(oldX, oldS1, oldS2, myvin.Config())
    Qmax = torch.max(oldoutputs.detach(), dim=1)[0].squeeze().cuda()
    outputs, _ = vin(X, S1, S2, myvin.Config())
    Qvalue = outputs.gather(index=action, dim=1).squeeze().cuda()
    TDtarget = (Y + gamma * Qmax).cuda()
    criterion = torch.nn.MSELoss(reduction='sum')  # size_average=False is deprecated
    # Regress Qvalue onto the TD target (the original compared against the raw
    # reward Y, leaving TDtarget unused).
    loss = criterion(Qvalue, TDtarget).cuda()
    # As above: the optimizer should ideally be built once outside update(),
    # and it must train vin rather than the global VIN.
    optimizer = optim.RMSprop(vin.parameters(), lr=myvin.Config().lr, eps=1e-6)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()  # update params
    return loss
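# oldvin acts as a frozen target network; the scripts below only copy weights
# once at startup via load_state_dict. A hedged sketch of the usual periodic
# hard update (sync_every is an assumed hyperparameter, not from this repo):
sync_every = 1000  # gradient steps between target refreshes (assumption)
def maybe_sync(step, vin, oldvin):
    if step % sync_every == 0:
        oldvin.load_state_dict(vin.state_dict())  # hard copy, as done at init below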
def vinPolicy(status, place):
    # Epsilon-greedy over the 9 discrete actions.
    if np.random.random() < e:
        return np.random.randint(9)
    S1 = torch.Tensor([place[0]])
    S2 = torch.Tensor([place[1]])
    X = torch.Tensor(status).expand(1, len(status), status[0].shape[0], status[0].shape[1])
    config = myvin.Config()
    q1, q2 = VIN(X, S1, S2, config)
    _, action = torch.max(q1, dim=1)
    action = int(action)
    assert 0 <= action < 9
    return action
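# A minimal rollout driving vinPolicy for one episode, mirroring the evaluate()
# loops later in this file (this assumes the 4-tuple reset()/step() signature of
# the GridWorld2_8dir / GridWorld3_8dir environments used below):
def rollout(env, policy, tmax=100):
    status, place, reward, over = env.reset()
    t = 0
    while not over and t < tmax:  # cap episode length as evaluate() does
        action = policy(status, place)
        status, place, reward, over = env.step(action)
        t += 1
    return env.total_reward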
#import torchvision.transforms as transforms
grid = gw.GridWorld_8dir(nobstacle=3)
#grid.show()
#grid.plot()
#for _ in range(100):
#    grid.step(grid.sample())
#grid.plot()

def randomWalk(status, place):
    # Baseline policy: uniform over the 9 actions.
    return np.random.randint(9)

#evaluate(grid, randomWalk)
VIN = myvin.VIN(myvin.Config())

def vinPolicy(status, place):
    if np.random.random() < e:
        return np.random.randint(9)
    S1 = torch.Tensor([place[0]])
    S2 = torch.Tensor([place[1]])
    X = torch.Tensor(status).expand(1, len(status), status[0].shape[0], status[0].shape[1])
    config = myvin.Config()
    q1, q2 = VIN(X, S1, S2, config)
    _, action = torch.max(q1, dim=1)
    return int(action)
def evaluate(env, policy, iters=500):
    total_reward = 0
    for i in range(iters):
        status, place, reward, over, action = env.reset()
        while over == False:
            status, place, reward, over, action = env.step(policy(status, place))
        total_reward += env.total_reward + 0.0
        if i % 100 == 0:
            print(i)
    return total_reward / iters

device = 0
if len(sys.argv) > 1:
    device = int(sys.argv[1])
with torch.cuda.device(device):
    VIN = myvin.VIN(myvin.Config())
    VIN.cuda()

    def vinPolicy(status, place):
        if np.random.random() < e:
            return np.random.randint(9)
        S1 = torch.Tensor([place[0]]).cuda()
        S2 = torch.Tensor([place[1]]).cuda()
        X = torch.Tensor(status).expand(1, len(status), status[0].shape[0], status[0].shape[1]).cuda()
        config = myvin.Config()
        q1, q2 = VIN(X, S1, S2, config)
        q1 = q1.cuda()
        q2 = q2.cuda()
        _, action = torch.max(q1, dim=1)
        return int(action)
def evaluate(env, policy, iters=500):
    total_reward = 0
    time2 = time.time()  # wall-clock start for the whole evaluation
    for i in range(iters):
        status, place, reward, over = env.reset()
        t = 0
        while over == False and t < 100:  # cap episode length at 100 steps
            action = policy(status, place)
            status, place, reward, over = env.step(action)
            t += 1
        total_reward += env.total_reward + 0.0
        if i % 100 == 0:
            print(i)
    return total_reward / iters, time.time() - time2

device = 0
VIN = myvin.VIN(myvin.Config())
#VIN.load_state_dict(torch.load("model/model1020.pkl"))
print(VIN)
oldVIN = myvin.VIN(myvin.Config())
oldVIN.load_state_dict(VIN.state_dict())  # initialize target net from VIN
grid = gw.GridWorld2_8dir(8, 8, nobstacle=4, moving=True)
e = 0
#print(evaluate(grid,vinPolicy,1000))
#print(evaluate(grid,randomWalk))
maxStep = 5000000
episodes = 20000
gamma = 0.99
Tmax = 1000
replay = []            # experience replay buffer
max_exp = 5000         # replay capacity
learning_begin = 1000  # start updates once the buffer holds this many transitions
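# e is the exploration rate read by vinPolicy; it is pinned to 0 above (fully
# greedy). A hedged sketch of a linear annealing schedule if exploration is
# wanted during training (e_start/e_end/anneal_steps are assumed values):
e_start, e_end, anneal_steps = 1.0, 0.1, 100000
def epsilon_at(step):
    frac = min(step / anneal_steps, 1.0)       # fraction of annealing completed
    return e_start + frac * (e_end - e_start)  # 1.0 -> 0.1 over anneal_steps
# usage: e = epsilon_at(global_step) before each action selection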
def evaluate(env, policy, iters=500):
    total_reward = 0
    for i in range(iters):
        status, place, reward, over, action = env.reset()
        while over == False:
            status, place, reward, over, action = env.step(policy(status, place))
        total_reward += env.total_reward + 0.0
        if i % 100 == 0:
            print(i)
    return total_reward / iters

device = 1
if len(sys.argv) > 1:
    device = int(sys.argv[1])
with torch.cuda.device(device):
    VIN = myvin.VIN(myvin.Config()).cuda()

    def vinPolicy(status, place):
        if np.random.random() < e:
            return np.random.randint(9)
        S1 = torch.Tensor([place[0]]).cuda()
        S2 = torch.Tensor([place[1]]).cuda()
        X = torch.Tensor(status).expand(1, len(status), status[0].shape[0], status[0].shape[1]).cuda()
        config = myvin.Config()
        q1, q2 = VIN(X, S1, S2, config)
        q1 = q1.cuda()
        q2 = q2.cuda()
        _, action = torch.max(q1, dim=1)
        return int(action)
def evaluate(env, policy, iters=500):
    total_reward = 0
    time2 = time.time()  # wall-clock start for the whole evaluation
    for i in range(iters):
        status, place, reward, over = env.reset()
        t = 0
        while over == False and t < 100:  # cap episode length at 100 steps
            action = policy(status, place)
            status, place, reward, over = env.step(action)
            t += 1
        total_reward += env.total_reward + 0.0
        if i % 100 == 0:
            print(i)
    return total_reward / iters, time.time() - time2

device = 0
if len(sys.argv) > 1:
    device = int(sys.argv[1])
with torch.cuda.device(device):
    VIN = myvin.VIN(myvin.Config()).cuda()
    #VIN.load_state_dict(torch.load("model/model1020.pkl"))
    print(VIN)
    oldVIN = myvin.VIN(myvin.Config()).cuda()
    oldVIN.load_state_dict(VIN.state_dict())  # initialize target net from VIN
    grid = gw.GridWorld2_8dir(8, 8, nobstacle=4, moving=True)
    e = 0
    #print(evaluate(grid,vinPolicy,1000))
    print(evaluate(grid, randomWalk))
    maxStep = 5000000
    episodes = 20000
    gamma = 0.99
    Tmax = 1000
    replay = []            # experience replay buffer
    max_exp = 5000         # replay capacity
    learning_begin = 1000  # start updates once the buffer holds this many transitions
def update(experience, vin, oldvin, p=False):
    # Experience tuples: (action, state, place, next_state, next_place, reward, over)
    X, S1, S2 = [], [], []
    oldX, oldS1, oldS2 = [], [], []  # next state / next position
    action, Y = [], []
    for j in range(myvin.Config().batch_size):  # sample a minibatch from replay
        x = np.random.randint(len(experience))
        while experience[x][6] == True:  # skip terminal transitions (no bootstrap target)
            x = np.random.randint(len(experience))
        Y.append(experience[x][5])
        action.append(experience[x][0])
        X.append(experience[x][1])
        oldX.append(experience[x][3])
        S1.append(experience[x][2][0])
        S2.append(experience[x][2][1])
        oldS1.append(experience[x][4][0])
        oldS2.append(experience[x][4][1])
    # Batch via numpy first; do not change this to torch.Tensor(X).float(),
    # which is much slower on a list of arrays.
    X = torch.from_numpy(np.array(X)).float()
    S1 = torch.from_numpy(np.array(S1)).float()
    S2 = torch.from_numpy(np.array(S2)).float()
    oldX = torch.from_numpy(np.array(oldX)).float()
    oldS1 = torch.from_numpy(np.array(oldS1)).float()
    oldS2 = torch.from_numpy(np.array(oldS2)).float()
    action = torch.from_numpy(np.array(action)).unsqueeze(dim=1).long()
    Y = torch.from_numpy(np.array(Y)).float()
    # Target-network pass; detach so no gradient flows back into oldvin
    # (the original assigned the detached tensor to a misspelled name).
    oldoutputs, _ = oldvin(oldX, oldS1, oldS2, myvin.Config())
    oldoutputs = oldoutputs.detach()
    Qmax = torch.max(oldoutputs, dim=1)[0].squeeze()
    outputs, _ = vin(X, S1, S2, myvin.Config())
    Qvalue = outputs.gather(index=action, dim=1).squeeze()
    TDtarget = Y + gamma * Qmax
    # Manual gradient injection: for L = 0.5*(TDtarget - Qvalue)^2 the gradient
    # dL/dQvalue is -(TDtarget - Qvalue), so backward(bellman_error) matches an
    # MSE loss on the TD target.
    bellman_error = -(TDtarget - Qvalue)
    # As above: the optimizer should ideally be built once outside update(),
    # and it must train vin rather than the global VIN.
    optimizer = optim.RMSprop(vin.parameters(), lr=myvin.Config().lr, eps=1e-6)
    optimizer.zero_grad()
    Qvalue.backward(bellman_error.detach())  # .detach() replaces the deprecated .data
    optimizer.step()
    if p:
        print(outputs[0], Qvalue[0], TDtarget[0], Y[0].cpu().numpy())
        grid.plot2(X[0].cpu().numpy(), int(S1[0].item()), int(S2[0].item()))
    return bellman_error
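# The manual Qvalue.backward(bellman_error) above injects dL/dQ by hand. A sketch
# of the same step written with ordinary autograd; 0.5 * summed MSE reproduces
# exactly the gradient -(TDtarget - Qvalue) used above (names match update()):
def autograd_step(Qvalue, TDtarget, optimizer):
    loss = 0.5 * torch.nn.functional.mse_loss(Qvalue, TDtarget.detach(), reduction='sum')
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss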
def evaluate(env, policy, iters=500):
    total_reward = 0
    for i in range(iters):
        status, place, reward, over, action = env.reset()
        while over == False:
            status, place, reward, over, action = env.step(policy(status, place))
        total_reward += env.total_reward + 0.0
        if i % 100 == 0:
            print(i)
    return total_reward / iters

device = 1
if len(sys.argv) > 1:
    device = int(sys.argv[1])
with torch.cuda.device(device):
    VIN = myvin.VIN(myvin.Config()).cuda()

    def vinPolicy(status, place):
        if np.random.random() < e:
            return np.random.randint(9)
        S1 = torch.Tensor([place[0]]).cuda()
        S2 = torch.Tensor([place[1]]).cuda()
        X = torch.Tensor(status).expand(1, len(status), status[0].shape[0], status[0].shape[1]).cuda()
        config = myvin.Config()
        q1, q2 = VIN(X, S1, S2, config)
        q1 = q1.cuda()
        q2 = q2.cuda()
        _, action = torch.max(q1, dim=1)
        action = int(action)
        return action
def evaluate(env, policy, iters=500):
    total_reward = 0
    time2 = time.time()  # wall-clock start for the whole evaluation
    for i in range(iters):
        status, place, reward, over = env.reset()
        t = 0
        while over == False and t < 100:  # cap episode length at 100 steps
            action = policy(status, place)
            status, place, reward, over = env.step(action)
            t += 1
        total_reward += env.total_reward + 0.0
        if i % 100 == 0:
            print(i)
    return total_reward / iters, time.time() - time2

device = 0
if len(sys.argv) > 1:
    device = int(sys.argv[1])
with torch.cuda.device(device):
    VIN = myvin.VIN(myvin.Config())
    VIN = VIN.cuda()
    #VIN.load_state_dict(torch.load("model2/moving-model-9-3920.pkl"))
    print(VIN)
    oldVIN = myvin.VIN(myvin.Config()).cuda()
    oldVIN.load_state_dict(VIN.state_dict())  # initialize target net from VIN
    grid = gw.GridWorld3_8dir(8, 8, nobstacle=4, moving=True)
    e = 0
    #print(evaluate(grid,vinPolicy,1000))
    #print(evaluate(grid,randomWalk))
    maxStep = 5000000
    episodes = 20000
    gamma = 0.99
    Tmax = 1000
    replay = []       # experience replay buffer
    max_exp = 10000   # replay capacity
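    # None of these snippets include the episode loop that fills replay and calls
    # update(); a hedged sketch of what it likely looks like, using the experience
    # layout (action, state, place, next_state, next_place, reward, over) documented
    # in update() and the hyperparameters above (learning_begin is taken from the
    # earlier script variants; the per-step and sync schedules are assumptions):
    learning_begin = 1000
    for episode in range(episodes):
        status, place, reward, over = grid.reset()
        t = 0
        while not over and t < Tmax:
            action = vinPolicy(status, place)
            next_status, next_place, reward, over = grid.step(action)
            replay.append((action, status, place, next_status, next_place, reward, over))
            if len(replay) > max_exp:
                replay.pop(0)                   # keep the buffer bounded
            if len(replay) > learning_begin:
                update(replay, VIN, oldVIN)     # one gradient step per env step
            status, place = next_status, next_place
            t += 1
        if episode % 100 == 0:
            oldVIN.load_state_dict(VIN.state_dict())  # refresh the target network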