Example #1
def update(expbuffer, vin, oldvin, p=False):
    #(action,state,place,next_state,next_place,reward,over)
    action, X, S1, S2, oldX, oldS1, oldS2, Y = expbuffer.sample()
    #Qmax=torch.Tensor([replay[x[0]][x[1]][4] for x in index]).float() .cuda()

    # Target-network Q-values for the next state; detach so no gradient flows back
    oldoutputs, _ = oldvin(oldX, oldS1, oldS2, myvin.Config())
    oldoutputs = oldoutputs.detach()
    Qmax = (torch.max(oldoutputs, dim=1)[0]).squeeze().cuda()

    outputs, _ = vin(X, S1, S2, myvin.Config())
    print(outputs.shape, action.unsqueeze(1).shape)
    Qvalue = outputs.gather(index=action.unsqueeze(1).long(),
                            dim=1).squeeze().cuda()
    #print(Qvalue.shape)
    #print(Y.shape)

    TDtarget = (Y + gamma * Qmax).cuda()

    criterion = torch.nn.MSELoss(reduction='sum')
    # Regress the chosen-action Q-values onto the TD target (reward + gamma * max Q')
    loss = criterion(Qvalue, TDtarget).cuda()
    optimizer = optim.RMSprop(vin.parameters(), lr=myvin.Config().lr, eps=1e-6)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if p:
        print(outputs[0], Qvalue[0], TDtarget[0], Y[0].cpu().numpy())
        grid.plot2(X[0].cpu().numpy(), int(S1[0].item()), int(S2[0].item()))
    return loss
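
A rough sketch of how update could be driven from an outer training loop, assuming the experience buffer also exposes an add() method and len(), and borrowing the reset()/step() signature and target-network sync from the later examples; the buffer method names and the sync interval are assumptions, not part of the original code:

# Sketch only: expbuffer.add(), len(expbuffer) and the sync interval are assumed.
for episode in range(episodes):
    status, place, reward, over = grid.reset()            # signature as in Example #7
    while not over:
        action = vinPolicy(status, place)                 # epsilon-greedy VIN policy
        next_status, next_place, reward, over = grid.step(action)
        # stored layout: (action, state, place, next_state, next_place, reward, over)
        expbuffer.add((action, status, place, next_status, next_place, reward, over))
        status, place = next_status, next_place
    if len(expbuffer) > learning_begin:
        loss = update(expbuffer, VIN, oldVIN)
    if episode % 100 == 0:
        oldVIN.load_state_dict(VIN.state_dict())          # refresh target network (interval assumed)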
Example #2
def vinPredict(status, place, vin):
    if np.random.random() < e:
        action = np.random.randint(9)
        return action
    S1 = torch.Tensor([place[0]]).cuda()
    S2 = torch.Tensor([place[1]]).cuda()
    X = torch.Tensor(status).expand(1, len(status), status[0].shape[0],
                                    status[0].shape[1]).cuda()
    config = myvin.Config()
    q1, q2 = vin(X, S1, S2, myvin.Config())
    q1 = q1.cuda()

    return q1
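
Note that the greedy branch of vinPredict returns the raw Q-value tensor q1, while the epsilon branch returns an integer action index, so a caller has to resolve the action itself. A minimal sketch (not part of the original code):

# Sketch: disambiguate the two return types of vinPredict.
result = vinPredict(status, place, VIN)
if isinstance(result, torch.Tensor):
    action = int(torch.argmax(result, dim=1))  # greedy action from the Q-values
else:
    action = int(result)                       # epsilon branch already chose an action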
Example #3
def update(experience, vin, oldvin):
    #(action,state,place,next_state,next_place,reward,over)
    X = []
    S1 = []
    S2 = []
    oldS1 = []  #next action
    oldS2 = []  #next action
    oldX = []
    action = []
    Y = []  #torch.Tensor(reward[::-1])
    index = []
    for j in range(myvin.Config().batch_size):  # sample experience from replay
        x = np.random.randint(len(experience))
        #status,place,reward,over,action

        Y.append(experience[x][5])
        action.append(experience[x][0])
        X.append(experience[x][1])
        oldX.append(experience[x][3])
        S1.append(experience[x][2][0])
        S2.append(experience[x][2][1])
        oldS1.append(experience[x][4][0])
        oldS2.append(experience[x][4][1])
        #index.append((x1,x2+1))

    X = torch.from_numpy(np.array(
        X)).float().cuda()  #do not change it to torch.Tensor(X).float()
    S1 = torch.from_numpy(np.array(S1)).float().cuda()
    S2 = torch.from_numpy(np.array(S2)).float().cuda()
    oldS1 = torch.from_numpy(np.array(oldS1)).float().cuda()
    oldS2 = torch.from_numpy(np.array(oldS2)).float().cuda()
    oldX = torch.from_numpy(np.array(oldX)).float().cuda()
    action = torch.from_numpy(np.array(action)).unsqueeze(dim=1).long().cuda()
    Y = torch.from_numpy(np.array(Y)).float().cuda()
    #Qmax=torch.Tensor([replay[x[0]][x[1]][4] for x in index]).float() .cuda()

    oldoutputs, _ = oldvin(oldX, oldS1, oldS2, myvin.Config())
    oldoutputs = oldoutputs.detach()  # target-network values; no gradient through oldvin
    Qmax = (torch.max(oldoutputs, dim=1)[0]).squeeze().cuda()

    outputs, _ = vin(X, S1, S2, myvin.Config())
    Qvalue = outputs.gather(index=action, dim=1).squeeze().cuda()
    #print(Qvalue.shape)
    #print(Y.shape)
    TDtarget = (Y + gamma * Qmax).cuda()
    criterion = torch.nn.MSELoss(reduction='sum')
    # Regress the chosen-action Q-values onto the TD target
    loss = criterion(Qvalue, TDtarget).cuda()
    optimizer = optim.RMSprop(vin.parameters(), lr=myvin.Config().lr, eps=1e-6)
    optimizer.zero_grad()
    loss.backward()
    # Update params
    optimizer.step()
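
The gather call above selects Q(s, a) for the action actually taken in each sampled transition. On a tiny hand-made tensor (an illustration only, not from the original code) it behaves like this:

# Illustration of outputs.gather(index=action, dim=1): one row per transition,
# one column per action; gather picks the column given by each action index.
import torch
outputs = torch.tensor([[0.1, 0.9, 0.2],
                        [0.5, 0.3, 0.7]])
action = torch.tensor([[1], [2]])
qvalue = outputs.gather(index=action, dim=1).squeeze()
print(qvalue)  # tensor([0.9000, 0.7000])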
Example #4
def vinPolicy(status, place):
    if np.random.random() < e:
        action = np.random.randint(9)
        return action
    S1 = torch.Tensor([place[0]])
    S2 = torch.Tensor([place[1]])
    X = torch.Tensor(status).expand(1, len(status), status[0].shape[0],
                                    status[0].shape[1])
    config = myvin.Config()
    q1, q2 = VIN(X, S1, S2, myvin.Config())
    #print(q1)
    #print(q2.shape)
    _, action = torch.max(q1, dim=1)
    action = int(action)
    #print(action)
    assert 0 <= action < 9
    return action
Example #5
#import torchvision.transforms as transforms
grid = gw.GridWorld_8dir(nobstacle=3)


#grid.show()
#grid.plot()
#for _ in range(100):
#    grid.step(grid.sample())
#grid.plot()
def randomWalk(status, place):
    return np.random.randint(9)


#evaluate(grid,randomWalk)

VIN = myvin.VIN(myvin.Config())


def vinPolicy(status, place):
    if np.random.random() < e:
        action = np.random.randint(9)
        return action
    S1 = torch.Tensor([place[0]])
    S2 = torch.Tensor([place[1]])
    X = torch.Tensor(status).expand(1, len(status), status[0].shape[0],
                                    status[0].shape[1])
    config = myvin.Config()
    q1, q2 = VIN(X, S1, S2, myvin.Config())
    #print(q1)
    #print(q2.shape)
    _, action = torch.max(q1, dim=1)
Example #6
    for i in range(iters):
        status, place, reward, over, action = env.reset()
        while over == False:
            status, place, reward, over, action = env.step(
                policy(status, place))
        total_reward += env.total_reward + 0.0
        if i % 100 == 0:
            print(i)
    return total_reward / iters


device = 0
if len(sys.argv) > 1:
    device = int(sys.argv[1])
with torch.cuda.device(device):
    VIN = myvin.VIN(myvin.Config())
    VIN.cuda()

    def vinPolicy(status, place):
        if np.random.random() < e:
            action = np.random.randint(9)
            return action
        S1 = torch.Tensor([place[0]]).cuda()
        S2 = torch.Tensor([place[1]]).cuda()
        X = torch.Tensor(status).expand(1, len(status), status[0].shape[0],
                                        status[0].shape[1]).cuda()
        config = myvin.Config()
        q1, q2 = VIN(X, S1, S2, myvin.Config())
        q1 = q1.cuda()
        q2 = q2.cuda()
        #print(q1)
Example #7
    for i in range(iters):
        status, place, reward, over = env.reset()
        t = 0
        while over == False and t < 100:
            action = policy(status, place)
            status, place, reward, over = env.step(action)
            t += 1
        total_reward += env.total_reward + 0.0
        if i % 100 == 0:
            print(i)
    return total_reward / iters, time.time() - time2


device = 0

VIN = myvin.VIN(myvin.Config())
#VIN.load_state_dict(torch.load("model/model1020.pkl"))
print(VIN)
oldVIN = myvin.VIN(myvin.Config())
oldVIN.load_state_dict(VIN.state_dict())
grid = gw.GridWorld2_8dir(8, 8, nobstacle=4, moving=True)
e = 0
#print(evaluate(grid,vinPolicy,1000))
#print(evaluate(grid,randomWalk))
maxStep = 5000000
episodes = 20000
gamma = 0.99
Tmax = 1000
replay = []
max_exp = 5000
learning_begin = 1000
Example #8
        status, place, reward, over, action = env.reset()
        while over == False:
            status, place, reward, over, action = env.step(
                policy(status, place))
        total_reward += env.total_reward + 0.0

        if i % 100 == 0:
            print(i)
    return total_reward / iters


device = 1
if len(sys.argv) > 1:
    device = int(sys.argv[1])
with torch.cuda.device(device):
    VIN = myvin.VIN(myvin.Config()).cuda()

    def vinPolicy(status, place):
        if np.random.random() < e:
            action = np.random.randint(9)
            return action
        S1 = torch.Tensor([place[0]]).cuda()
        S2 = torch.Tensor([place[1]]).cuda()
        X = torch.Tensor(status).expand(1, len(status), status[0].shape[0],
                                        status[0].shape[1]).cuda()
        config = myvin.Config()
        q1, q2 = VIN(X, S1, S2, myvin.Config())
        q1 = q1.cuda()
        q2 = q2.cuda()
        #print(q1)
        #print(q2.shape)
Example #9
        status, place, reward, over = env.reset()
        t = 0
        while over == False and t < 100:
            action = policy(status, place)
            status, place, reward, over = env.step(action)
            t += 1
        total_reward += env.total_reward + 0.0
        if i % 100 == 0:
            print(i)
    return total_reward / iters, time.time() - time2


device = 0
if len(sys.argv) > 1:
    device = int(sys.argv[1])
with torch.cuda.device(device):

    VIN = myvin.VIN(myvin.Config()).cuda()
    #VIN.load_state_dict(torch.load("model/model1020.pkl"))
    print(VIN)
    oldVIN = myvin.VIN(myvin.Config()).cuda()
    oldVIN.load_state_dict(VIN.state_dict())
    grid = gw.GridWorld2_8dir(8, 8, nobstacle=4, moving=True)
    e = 0
    #print(evaluate(grid,vinPolicy,1000))
    print(evaluate(grid, randomWalk))
    maxStep = 5000000
    episodes = 20000
    gamma = 0.99
    Tmax = 1000
    replay = []
    max_exp = 5000
    learning_begin = 1000
Example #10
def update(experience,vin,oldvin,p=False):
    #(action,state,place,next_state,next_place,reward,over)
    X=[]
    S1=[]
    S2=[]
    oldS1=[]#next action
    oldS2=[]#next action
    oldX=[]
    action=[]
    Y=[]#torch.Tensor(reward[::-1])
    index=[]
    for j in range(myvin.Config().batch_size):# sample experience from replay
        x=np.random.randint(len(experience))
        #status,place,reward,over,action
        while experience[x][6]==True:
            x=np.random.randint(len(experience))
        Y.append(experience[x][5])
        action.append(experience[x][0])
        X.append(experience[x][1])
        oldX.append(experience[x][3])
        S1.append(experience[x][2][0])
        S2.append(experience[x][2][1])
        oldS1.append(experience[x][4][0])
        oldS2.append(experience[x][4][1])
        #index.append((x1,x2+1))

    X=torch.from_numpy(np.array(X)).float()#do not change it to torch.Tensor(X).float()
    S1=torch.from_numpy(np.array(S1)).float()#.unsqueeze(1)
    S2=torch.from_numpy(np.array(S2)).float()#.unsqueeze(1)

    oldX=torch.from_numpy(np.array(oldX)).float()
    oldS1=torch.from_numpy(np.array(oldS1)).float()#.unsqueeze(1)
    oldS2=torch.from_numpy(np.array(oldS2)).float()#.unsqueeze(1)
    #print("here",S1.shape)
    action=torch.from_numpy(np.array(action)).unsqueeze(dim=1).long()

    Y=torch.from_numpy(np.array(Y)).float()
    #Qmax=torch.Tensor([replay[x[0]][x[1]][4] for x in index]).float() 


    oldoutputs, _ = oldvin(oldX,oldS1,oldS2, myvin.Config())
    oldoutputs=oldoutputs.detach()  # target-network values; no gradient through oldvin
    Qmax=(torch.max(oldoutputs,dim=1)[0]).squeeze()

    outputs, _ = vin(X,S1,S2, myvin.Config())
    Qvalue=outputs.gather(index=action,dim=1).squeeze()
    #print(Qvalue.shape)
    #print(Y.shape)

    TDtarget=(Y+gamma*Qmax)
    bellman_error=-(TDtarget-Qvalue)
    
    optimizer = optim.RMSprop(vin.parameters(), lr=myvin.Config().lr, eps=1e-6)
    optimizer.zero_grad()
    Qvalue.backward(bellman_error.data)  # use the Bellman error as the upstream gradient of Q(s,a)
    optimizer.step()

    if p:
        print(outputs[0],Qvalue[0],TDtarget[0],Y[0].cpu().numpy())
        grid.plot2(X[0].cpu().numpy(),int(S1[0].item()),int(S2[0].item()))
    return bellman_error
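
Here the squared TD error is never formed explicitly: Qvalue.backward(bellman_error.data) feeds Q(s, a) - TDtarget in as the upstream gradient, which is exactly the gradient of half the summed squared Bellman error with the target held fixed. An equivalent loss-based form would look roughly like this (a sketch, not part of the original code):

# Equivalent formulation: minimise 0.5 * sum((Q(s,a) - TDtarget)^2) with TDtarget
# treated as a constant; this produces the same gradient as the manual call above.
loss = 0.5 * (Qvalue - TDtarget.detach()).pow(2).sum()
optimizer.zero_grad()
loss.backward()
optimizer.step()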
Example #11
def evaluate(env, policy, iters=500):
    total_reward = 0
    for i in range(iters):
        status, place, reward, over, action = env.reset()
        while over == False:
            status, place, reward, over, action = env.step(policy(status, place))
        total_reward += env.total_reward + 0.0

        if i % 100 == 0:
            print(i)
    return total_reward / iters


device = 1
if len(sys.argv) > 1:
    device = int(sys.argv[1])
with torch.cuda.device(device):
    VIN = myvin.VIN(myvin.Config()).cuda()

    def vinPolicy(status, place):
        if np.random.random() < e:
            action = np.random.randint(9)
            return action
        S1 = torch.Tensor([place[0]]).cuda()
        S2 = torch.Tensor([place[1]]).cuda()
        X = torch.Tensor(status).expand(1, len(status), status[0].shape[0],
                                        status[0].shape[1]).cuda()
        config = myvin.Config()
        q1, q2 = VIN(X, S1, S2, myvin.Config())
        q1 = q1.cuda()
        q2 = q2.cuda()
        #print(q1)
        #print(q2.shape)
        _, action = torch.max(q1, dim=1)
        action = int(action)
Example #12
        while over == False and t < 100:
            action = policy(status, place)
            status, place, reward, over = env.step(action)
            t += 1
        total_reward += env.total_reward + 0.0
        if i % 100 == 0:
            print(i)
    return total_reward / iters, time.time() - time2


device = 0
if len(sys.argv) > 1:
    device = int(sys.argv[1])
with torch.cuda.device(device):

    VIN = myvin.VIN(myvin.Config())
    VIN = VIN.cuda()
    #VIN.load_state_dict(torch.load("model2/moving-model-9-3920.pkl"))
    print(VIN)
    oldVIN = myvin.VIN(myvin.Config()).cuda()
    oldVIN.load_state_dict(VIN.state_dict())
    grid = gw.GridWorld3_8dir(8, 8, nobstacle=4, moving=True)
    e = 0
    #print(evaluate(grid,vinPolicy,1000))
    #print(evaluate(grid,randomWalk))
    maxStep = 5000000
    episodes = 20000
    gamma = 0.99
    Tmax = 1000
    replay = []
    max_exp = 10000