#from Env.Environment import Environment
from Env.gymEnv_V2 import myGym
#from Env.gymEnv import myGym
from DQN.Improver_Q_Learning import Improver
from DQN.Evaluator_Dense_Q_Learning import Evaluator
from DQN.ReplayMemory import ReplayMemory
import multiprocessing
from multiprocessing.managers import SyncManager
import os

os.system("taskset -p 0xff %d" % os.getpid())
# https://stackoverflow.com/questions/15639779/why-does-multiprocessing-use-only-a-single-core-after-i-import-numpy

if __name__ == '__main__':
    # hyperparameters
    MEMORY_SIZE = 5000
    #MEMORY_SIZE = 5
    imp_net = DQN()

    # populate memory
    # let improver populate first
    manager = SyncManager()
    manager.start()
    memory = ReplayMemory(MEMORY_SIZE)
    s = multiprocessing.Semaphore(1)
    #memory = multiprocessing.Queue(MEMORY_SIZE)
    memory = manager.list()
    shared = manager.dict({'SENT_FLAG': True, 'weights': None})
    #shared = manager.dict({'memory':memory, 'SENT_FLAG':True, 'weights':None})
    #improver = Improver(imp_net, shared, myGym(), s)
    improver = Improver(imp_net, MEMORY_SIZE, memory, shared, myGym(), s)
    # improver is executed by the main process
    evaluator = Evaluator(memory, shared, s)
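# Illustrative sketch (not part of the original sources): a stripped-down
# version of the SENT_FLAG handshake used by the script above. The evaluator
# side waits while SENT_FLAG is set, publishes fresh weights through the
# manager dict, then sets the flag again; the other side clears the flag to
# request new weights. The function and dict values are stand-ins, assuming
# the same manager-dict layout as the script above.
import multiprocessing
import time
from multiprocessing.managers import SyncManager


def toy_evaluator(shared):
    while shared['SENT_FLAG']:      # wait until the consumer has taken the last weights
        time.sleep(0.1)
    shared['weights'] = {'w': 1.0}  # stand-in for net.state_dict()
    shared['SENT_FLAG'] = True      # announce that fresh weights are available


if __name__ == '__main__':
    manager = SyncManager()
    manager.start()
    shared = manager.dict({'SENT_FLAG': True, 'weights': None})
    p = multiprocessing.Process(target=toy_evaluator, args=(shared,))
    p.start()
    shared['SENT_FLAG'] = False     # request new weights, as the Improver would
    p.join()
    print(shared['weights'])        # -> {'w': 1.0}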
class Evaluator(multiprocessing.Process):
    def __init__(self, memory, shared, semaphore):
        multiprocessing.Process.__init__(self)
        # hyperparameters
        self.TRAIN_MAX = 10
        self.TRANSFER = 10
        self.BATCH_SIZE = 128
        #self.BATCH_SIZE = 5
        self.GAMMA = 0.99
        #self.SAMPLE_ALPHA = 0.5
        #self.SAMPLE_EPISLON = 0.
        #self.SAMPLE_BETA = 0.
        #self.SAMPLE_S = 44.8
        self.SAMPLE_S = 5.0
        self.SAMPLE_Q = 1.0
        LEARNING_RATE = 0.00025
        MOMENTUM = 0.95
        SQUARED_MOMENTUM = 0.95
        MIN_SQUARED_GRAD = 0.01
        self.net = DQN()  # Deep Net
        self.targetNet = DQN()
        self.copy_weights()
        self.net.setOptimizer(
            optim.RMSprop(self.net.parameters(),
                          lr=LEARNING_RATE,
                          momentum=MOMENTUM,
                          alpha=SQUARED_MOMENTUM,
                          eps=MIN_SQUARED_GRAD))
        self.memory = memory
        self.shared = shared  # shared resources, {'memory', 'SENT_FLAG'}
        self.semaphore = semaphore

    def minibatch(self, exp_replay, pretrain=False):
        #batch = exp_replay.sample(self.BATCH_SIZE)
        #print(batch)
        unzipped = list(zip(*exp_replay))
        state_batch = Variable(torch.from_numpy(np.array(unzipped[0])),
                               volatile=True)
        action_batch = Variable(torch.from_numpy(np.array(unzipped[1])).type(LongTensor),
                                volatile=True)
        reward_batch = Variable(torch.from_numpy(np.array(unzipped[2])).type(FloatTensor),
                                volatile=True)
        target_batch = None
        if pretrain:
            # only use reward
            target_batch = reward_batch
        else:
            term_batch = Variable(torch.from_numpy(np.array(unzipped[4])).type(FloatTensor),
                                  volatile=True)
            next_state_batch = Variable(torch.from_numpy(np.array(unzipped[3])),
                                        volatile=True)
            #print('average distance: {}' . format(dist_norm))
            #next_state_values = self.targetNet.evaluate(list(unzipped[3])).max(1)[0].unsqueeze(1)
            next_state_values = self.targetNet(next_state_batch).max(1)[0].unsqueeze(1)
            #prediction_state_values = self.targetNet(state_batch).gather(1, action_batch)
            #not_action_batch = Variable(torch.from_numpy(1-np.array(unzipped[1])).type(LongTensor), volatile=True)
            #prediction_state_nonterm_values = self.targetNet(state_batch).gather(1, not_action_batch)
            #print('term average value: {}' . format(torch.sum((1-term_batch) * prediction_state_values).data[0]/torch.sum(1-term_batch).data[0]))
            #print('nonterm average value: {}' . format(torch.sum((1-term_batch) * prediction_state_nonterm_values).data[0]/torch.sum(1-term_batch).data[0]))
            next_state_values = term_batch * next_state_values
            next_state_values.volatile = False
            target_batch = reward_batch + (self.GAMMA * next_state_values)

        # calculate the probability for each transition
        # calculate distance matrix
        state_feature_batch = self.targetNet.getstate(state_batch)
        inner_product = state_feature_batch.matmul(state_feature_batch.transpose(1, 0))
        state_feature_batch_l2 = (state_feature_batch**2).sum(dim=1, keepdim=True).expand_as(inner_product)
        distance_matrix = state_feature_batch_l2 + state_feature_batch_l2.transpose(1, 0) - 2 * inner_product
        #print('distance state')
        #print(distance_matrix.data)
        # calculate Q value distance matrix
        # here use the target value to calculate it
        Q_dist_matrix = target_batch.expand_as(distance_matrix)
        Q_dist_matrix = Q_dist_matrix - Q_dist_matrix.transpose(1, 0)  # not absolute value
        Q_dist_matrix = Q_dist_matrix.abs()
        #print('distance q')
        #print(Q_dist_matrix.data)
        # Number[i,j] = Number[i,j] + (D_f[i,j] <= sample_S^2 AND D_Q[i,j] <= sample_Q AND action[i] = action[j])
        # only consider same actions
        Action_Mask = (action_batch.expand_as(distance_matrix)) == \
                      (action_batch.transpose(1, 0).expand_as(distance_matrix))
        Mask = (distance_matrix.data <= (self.SAMPLE_S)) & \
               (Q_dist_matrix.data <= self.SAMPLE_Q) & Action_Mask.data
        Cluster = []
        #print('mask')
        counter = 0
        while True:
            # clustering by VERTEX-COVER-ALL-VERTEX, always find the largest degree
            #print('counter = {}' . format(counter))
            counter += 1
            Number = Mask.sum(dim=1)
            value, indx = Number.max(dim=0)
            #print('indx= {}' . format(indx))
            if value[0] == 0:
                # already empty
                break
            v = Mask[indx]
            #print(v)
            #print(Mask)
            Cluster.append(v)
            # delete vertices
            Delete = v.expand_as(Mask) | v.transpose(1, 0).expand_as(Mask)
            Delete = Delete ^ 1
            #Delete = v.transpose(1,0).matmul(v) ^ 1
            #print(Delete)
            Mask = Mask & Delete
        k = len(Cluster)
        Cluster = torch.cat(Cluster)
        #print('cluster')
        #print(Cluster)
        Number = Cluster.sum(dim=1).type(LongTensor)
        probability_batch = torch.ones(k) / float(k)
        cluster_is = torch.multinomial(probability_batch, self.BATCH_SIZE, replacement=True)
        # convert the cluster indices to the number of items drawn from each cluster
        Sample_num = torch.eye(k).index_select(0, cluster_is).sum(dim=0).type(LongTensor)
        #N = Cluster[0].size()[0]  # number of vertices
        state_sample = []
        action_sample = []
        target_sample = []
        for i in range(k):
            n = Sample_num[i]
            N = Number[i]
            if n == 0:
                continue
            cluster = Cluster[i]
            # get nonzero indices
            v_indices = cluster.nonzero().squeeze(1)
            if n == N:
                # pick up all
                state_sample.append(state_batch.index_select(0, v_indices))
                action_sample.append(action_batch.index_select(0, v_indices))
                target_sample.append(target_batch.index_select(0, v_indices))
                continue
            prob = torch.ones(v_indices.size()) / n
            if n < N:
                # uniformly pick
                v_indices_is = torch.multinomial(prob, n)
                v_indices = v_indices.index_select(0, v_indices_is)
                state_sample.append(state_batch.index_select(0, v_indices))
                action_sample.append(action_batch.index_select(0, v_indices))
                target_sample.append(target_batch.index_select(0, v_indices))
                continue
            # uniformly pick with replacement
            v_indices_is = torch.multinomial(prob, n, replacement=True)
            v_indices = v_indices.index_select(0, v_indices_is)
            state_sample.append(state_batch.index_select(0, v_indices))
            action_sample.append(action_batch.index_select(0, v_indices))
            target_sample.append(target_batch.index_select(0, v_indices))
        state_batch = torch.cat(state_sample)
        action_batch = torch.cat(action_sample)
        target_batch = torch.cat(target_sample)
        state_batch.volatile = False
        state_batch.requires_grad = True
        action_batch.volatile = False
        target_batch.volatile = False
        return state_batch, action_batch, target_batch

    def copy_weights(self):
        self.targetNet.load_state_dict(self.net.state_dict())

    def run(self):
        # keep two nets: Q-net, and target-net
        # keep looping:
        # 0. loop until SENT_FLAG is not set
        #
        # 1. loop for a fixed # of steps:
        #       minibatch, and get the target value for the batch
        #       optimize the net parameters by this batch
        #       every TRANSFER steps, copy weights from Q-net to target-net
        #
        # 2. copy weights from Q-net to the shared weights
        #    set SENT_FLAG to true
        # TODO: pretrain in the first loop
        os.system("taskset -p 0xff %d" % os.getpid())
        pretrain = True
        i = 0
        while True:
            while self.shared['SENT_FLAG']:  # loop until it is set to 0
                print('sleeping...')
                time.sleep(0.1)
            for step_i in range(1, self.TRAIN_MAX + 1):
                # minibatch, and get the target value
                #print('training... step {}' . format(step_i))
                #self.semaphore.acquire()
                #memory = copy.deepcopy(self.memory)
                memory = self.memory
                #self.semaphore.release()
                if len(memory) < self.BATCH_SIZE:
                    continue
                i += 1
                print('training... {}'.format(i))
                batch_tuple = self.minibatch(memory, pretrain)
                loss = self.net.optimize(batch_tuple)
                #print('loss: {}' . format(loss))
                #print('optimized')
                if step_i % self.TRANSFER == 0:
                    #self.semaphore.acquire()
                    self.copy_weights()
                    #self.semaphore.release()
            self.shared['weights'] = self.net.state_dict()
            self.shared['SENT_FLAG'] = True
            if i == 50:
                pretrain = False
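# Standalone sketch (illustrative names, current PyTorch boolean-tensor API) of
# the greedy largest-degree-first clustering used in minibatch() above
# ("VERTEX-COVER-ALL-VERTEX"): repeatedly take the vertex with the most
# neighbours, make its neighbourhood one cluster, and drop those vertices from
# the mask until nothing is left.
import torch


def greedy_clusters(mask: torch.Tensor):
    """mask: (N, N) boolean adjacency matrix (symmetric, True on the diagonal)."""
    mask = mask.clone()
    clusters = []
    while True:
        degree = mask.sum(dim=1)
        value, idx = degree.max(dim=0)
        if value.item() == 0:            # no vertices left to cover
            break
        members = mask[idx]              # boolean row: the cluster around vertex idx
        clusters.append(members.nonzero(as_tuple=False).squeeze(1))
        remove = members.unsqueeze(0) | members.unsqueeze(1)
        mask = mask & ~remove            # drop every edge touching a clustered vertex
    return clusters


if __name__ == '__main__':
    adj = torch.tensor([[1, 1, 0, 0],
                        [1, 1, 0, 0],
                        [0, 0, 1, 0],
                        [0, 0, 0, 1]], dtype=torch.bool)
    print(greedy_clusters(adj))          # -> [tensor([0, 1]), tensor([2]), tensor([3])]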
class Evaluator(multiprocessing.Process):
    def __init__(self, shared):
        multiprocessing.Process.__init__(self)
        # hyperparameters
        self.TRAIN_MAX = 500
        self.TRANSFER = 100
        self.BATCH_SIZE = 32
        self.GAMMA = 1.0
        LEARNING_RATE = 0.00025
        MOMENTUM = 0.95
        SQUARED_MOMENTUM = 0.95
        MIN_SQUARED_GRAD = 0.01
        self.net = DQN()  # Deep Net
        self.targetNet = DQN()
        self.copy_weights()
        self.net.setOptimizer(
            optim.RMSprop(self.net.parameters(),
                          lr=LEARNING_RATE,
                          momentum=MOMENTUM,
                          alpha=SQUARED_MOMENTUM,
                          eps=MIN_SQUARED_GRAD))
        self.shared = shared  # shared resources, {'memory', 'SENT_FLAG'}

    def minibatch(self, exp_replay, pretrain=False):
        batch = exp_replay.sample(self.BATCH_SIZE)
        unzipped = list(zip(*batch))
        state_batch = np.concatenate(list(unzipped[0]))
        state_batch = Variable(torch.from_numpy(state_batch))
        action_batch = np.concatenate(list(unzipped[1]))
        action_batch = Variable(torch.from_numpy(action_batch))
        reward_batch = np.concatenate(list(unzipped[2]))
        reward_batch = Variable(torch.from_numpy(reward_batch), requires_grad=False)
        #state_batch = Variable(torch.cat(list(unzipped[0])).clone())
        #action_batch = Variable(torch.cat(list(unzipped[1])).clone())
        #reward_batch = Variable(torch.cat(list(unzipped[2])).clone(), requires_grad=False)
        if pretrain:
            # only use reward
            return state_batch, action_batch, reward_batch
        else:
            term_batch = np.concatenate(list(unzipped[5]))
            term_batch = Variable(torch.from_numpy(term_batch), volatile=True)
            next_action_batch = np.concatenate(list(unzipped[4]))
            next_action_batch = Variable(torch.from_numpy(next_action_batch), volatile=True)
            next_state_batch = np.concatenate(list(unzipped[3]))
            next_state_batch = Variable(torch.from_numpy(next_state_batch), volatile=True)
            next_state_values = self.targetNet(next_state_batch).gather(1, next_action_batch)
            #term_batch = Variable(torch.cat(list(unzipped[5]).clone()), volatile=True)
            #next_action_batch = Variable(torch.cat(list(unzipped[4]).clone()), volatile=True)
            #next_state_values = self.targetNet.evaluate(list(unzipped[3]).clone()).gather(1, next_action_batch)
            next_state_values = term_batch * next_state_values
            print(next_state_values)
            next_state_values.volatile = False
            next_state_values.requires_grad = False
            target_batch = reward_batch + (self.GAMMA * next_state_values)
            return state_batch, action_batch, target_batch

    def copy_weights(self):
        self.targetNet.load_state_dict(self.net.state_dict())

    def run(self):
        # keep two nets: Q-net, and target-net
        # keep looping:
        # 0. loop until SENT_FLAG is not set
        #
        # 1. loop for a fixed # of steps:
        #       minibatch, and get the target value for the batch
        #       optimize the net parameters by this batch
        #       every TRANSFER steps, copy weights from Q-net to target-net
        #
        # 2. copy weights from Q-net to the shared weights
        #    set SENT_FLAG to true
        # TODO: pretrain in the first loop
        pretrain = True
        while True:
            #print('evaluator starts...')
            while self.shared['SENT_FLAG']:  # loop until it is set to 0
                print('sleeping... size: {}'.format(len(self.shared['memory'])))
                time.sleep(1)
            for step_i in range(self.TRAIN_MAX):
                # minibatch, and get the target value
                print('training... step {}'.format(step_i))
                #memory = deepcopy(self.shared['memory'])
                batch_tuple = self.minibatch(self.shared['memory'], pretrain)
                #print('got batch tuple')
                loss = self.net.optimize(batch_tuple)
                #print('optimized')
                if step_i % self.TRANSFER == 0:
                    self.copy_weights()
            self.shared['weights'] = self.net.state_dict()
            self.shared['SENT_FLAG'] = True
            pretrain = False
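# Minimal sketch of the target computed in minibatch() above, written against
# current PyTorch (the original uses the old Variable/volatile API). It assumes
# the term mask is 1 for non-terminal transitions and 0 for terminal ones,
# which is what the multiplication in the code implies; all names are local to
# the example.
import torch


@torch.no_grad()
def q_targets(reward, next_q, next_action, cont_mask, gamma=1.0):
    """reward: (B,1); next_q: (B,A); next_action: (B,1) long; cont_mask: (B,1)."""
    next_values = next_q.gather(1, next_action)   # Q_target(s', a') for the stored next action
    return reward + gamma * cont_mask * next_values


if __name__ == '__main__':
    reward = torch.tensor([[1.0], [0.0]])
    next_q = torch.tensor([[0.2, 0.8], [0.5, 0.1]])
    next_a = torch.tensor([[1], [0]])
    cont = torch.tensor([[1.0], [0.0]])            # second transition is terminal
    print(q_targets(reward, next_q, next_a, cont, gamma=0.99))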
class Evaluator(multiprocessing.Process):
    def __init__(self, memory, shared, semaphore):
        multiprocessing.Process.__init__(self)
        # hyperparameters
        self.TRAIN_MAX = 10
        self.TRANSFER = 10
        self.BATCH_SIZE = 32
        self.GAMMA = 0.99
        self.SAMPLE_ALPHA = 0.5
        self.SAMPLE_EPISLON = 0.
        self.SAMPLE_BETA = 0.
        LEARNING_RATE = 0.00025
        MOMENTUM = 0.95
        SQUARED_MOMENTUM = 0.95
        MIN_SQUARED_GRAD = 0.01
        self.net = DQN()  # Deep Net
        self.targetNet = DQN()
        self.copy_weights()
        self.net.setOptimizer(
            optim.RMSprop(self.net.parameters(),
                          lr=LEARNING_RATE,
                          momentum=MOMENTUM,
                          alpha=SQUARED_MOMENTUM,
                          eps=MIN_SQUARED_GRAD))
        self.memory = memory
        self.shared = shared  # shared resources, {'memory', 'SENT_FLAG'}
        self.semaphore = semaphore

    def minibatch(self, exp_replay, pretrain=False):
        #batch = exp_replay.sample(self.BATCH_SIZE)
        #print(batch)
        unzipped = list(zip(*exp_replay))
        state_batch = Variable(torch.from_numpy(np.array(unzipped[0])),
                               volatile=True)
        action_batch = Variable(torch.from_numpy(np.array(unzipped[1])).type(LongTensor),
                                volatile=True)
        reward_batch = Variable(torch.from_numpy(np.array(unzipped[2])).type(FloatTensor),
                                volatile=True)
        target_batch = None
        #state_batch = Variable(torch.cat(list(unzipped[0])).clone(), volatile=True)
        #action_batch = Variable(torch.cat(list(unzipped[1])).clone(), volatile=True)
        #reward_batch = Variable(torch.cat(list(unzipped[2])).clone(), volatile=True)
        #target_batch = None
        if pretrain:
            # only use reward
            target_batch = reward_batch
        else:
            term_batch = Variable(torch.from_numpy(np.array(unzipped[4])).type(FloatTensor),
                                  volatile=True)
            #term_batch = Variable(torch.cat(list(unzipped[4])).clone(), volatile=True)
            #next_action_batch = Variable(torch.cat(list(unzipped[4])).clone(), volatile=True)
            next_state_batch = Variable(torch.from_numpy(np.array(unzipped[3])),
                                        volatile=True)
            dist_norm = self.targetNet.getdistance(state_batch, next_state_batch)
            #print('average distance: {}' . format(dist_norm))
            #next_state_values = self.targetNet.evaluate(list(unzipped[3])).max(1)[0].unsqueeze(1)
            next_state_values = self.targetNet(next_state_batch).max(1)[0].unsqueeze(1)
            #prediction_state_values = self.targetNet(state_batch).gather(1, action_batch)
            #not_action_batch = Variable(torch.from_numpy(1-np.array(unzipped[1])).type(LongTensor), volatile=True)
            #prediction_state_nonterm_values = self.targetNet(state_batch).gather(1, not_action_batch)
            #print('term average value: {}' . format(torch.sum((1-term_batch) * prediction_state_values).data[0]/torch.sum(1-term_batch).data[0]))
            #print('nonterm average value: {}' . format(torch.sum((1-term_batch) * prediction_state_nonterm_values).data[0]/torch.sum(1-term_batch).data[0]))
            next_state_values = term_batch * next_state_values
            next_state_values.volatile = False
            target_batch = reward_batch + (self.GAMMA * next_state_values)

        # calculate the probability for each transition
        state_values = self.net(state_batch).gather(1, action_batch)
        probability_batch = torch.pow(torch.abs(target_batch - state_values),
                                      self.SAMPLE_ALPHA).squeeze(1)
        print(probability_batch)
        sample_is = torch.multinomial(probability_batch, self.BATCH_SIZE)
        state_batch = state_batch.index_select(0, sample_is)
        action_batch = action_batch.index_select(0, sample_is)
        target_batch = target_batch.index_select(0, sample_is)
        state_batch.volatile = False
        state_batch.requires_grad = True
        action_batch.volatile = False
        target_batch.volatile = False
        return state_batch, action_batch, target_batch

    def copy_weights(self):
        self.targetNet.load_state_dict(self.net.state_dict())

    def run(self):
        # keep two nets: Q-net, and target-net
        # keep looping:
        # 0. loop until SENT_FLAG is not set
        #
        # 1. loop for a fixed # of steps:
        #       minibatch, and get the target value for the batch
        #       optimize the net parameters by this batch
        #       every TRANSFER steps, copy weights from Q-net to target-net
        #
        # 2. copy weights from Q-net to the shared weights
        #    set SENT_FLAG to true
        # TODO: pretrain in the first loop
        os.system("taskset -p 0xff %d" % os.getpid())
        pretrain = True
        i = 0
        while True:
            while self.shared['SENT_FLAG']:  # loop until it is set to 0
                print('sleeping...')
                time.sleep(0.1)
            for step_i in range(1, self.TRAIN_MAX + 1):
                # minibatch, and get the target value
                #print('training... step {}' . format(step_i))
                #self.semaphore.acquire()
                #memory = copy.deepcopy(self.memory)
                memory = self.memory
                #self.semaphore.release()
                if len(memory) < self.BATCH_SIZE:
                    continue
                i += 1
                print('training... {}'.format(i))
                batch_tuple = self.minibatch(memory, pretrain)
                loss = self.net.optimize(batch_tuple)
                #print('loss: {}' . format(loss))
                #print('optimized')
                if step_i % self.TRANSFER == 0:
                    #self.semaphore.acquire()
                    self.copy_weights()
                    #self.semaphore.release()
            self.shared['weights'] = self.net.state_dict()
            self.shared['SENT_FLAG'] = True
            pretrain = False
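# Sketch of the |TD error|**SAMPLE_ALPHA sampling used in minibatch() above,
# written against current PyTorch. The small epsilon is an assumption added
# here so zero-error transitions keep a nonzero weight (the original adds no
# epsilon), and replacement=True lets this toy example draw more samples than
# it has transitions; the code above samples without replacement.
import torch


def prioritized_indices(q_values, targets, batch_size, alpha=0.5):
    """Sample batch_size indices with probability proportional to |TD error|**alpha."""
    td_error = (targets - q_values).abs().squeeze(-1)
    weights = td_error.pow(alpha) + 1e-6
    return torch.multinomial(weights, batch_size, replacement=True)


if __name__ == '__main__':
    q = torch.tensor([[0.1], [0.5], [0.9]])
    tgt = torch.tensor([[1.0], [0.6], [0.9]])
    print(prioritized_indices(q, tgt, batch_size=4))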
class Evaluator(multiprocessing.Process):
    def __init__(self, memory, shared, semaphore):
        multiprocessing.Process.__init__(self)
        # hyperparameters
        self.TRAIN_MAX = 1
        self.TRANSFER = 1
        self.BATCH_SIZE = 9
        #self.BATCH_SIZE = 5
        self.GAMMA = 0.99
        #self.SAMPLE_ALPHA = 0.5
        #self.SAMPLE_EPISLON = 0.
        #self.SAMPLE_BETA = 0.
        #self.SAMPLE_S = 44.8
        self.SAMPLE_S = 5.0
        self.SAMPLE_Q = 1.0
        LEARNING_RATE = 0.00025
        MOMENTUM = 0.95
        SQUARED_MOMENTUM = 0.95
        MIN_SQUARED_GRAD = 0.01
        self.net = DQN()  # Deep Net
        self.targetNet = DQN()
        self.copy_weights()
        self.net.setOptimizer(
            optim.RMSprop(self.net.parameters(),
                          lr=LEARNING_RATE,
                          momentum=MOMENTUM,
                          alpha=SQUARED_MOMENTUM,
                          eps=MIN_SQUARED_GRAD))
        self.memory = memory
        self.shared = shared  # shared resources, {'memory', 'SENT_FLAG'}
        self.semaphore = semaphore

    def minibatch(self, exp_replay, pretrain=False):
        #batch = exp_replay.sample(self.BATCH_SIZE)
        #print(batch)
        unzipped = list(zip(*exp_replay))
        state_batch = Variable(torch.from_numpy(np.array(unzipped[0])),
                               volatile=True)
        action_batch = Variable(torch.from_numpy(np.array(unzipped[1])).type(LongTensor),
                                volatile=True)
        reward_batch = Variable(torch.from_numpy(np.array(unzipped[2])).type(FloatTensor),
                                volatile=True)
        target_batch = None
        if pretrain:
            # only use reward
            target_batch = reward_batch
        else:
            term_batch = Variable(torch.from_numpy(np.array(unzipped[4])).type(FloatTensor),
                                  volatile=True)
            next_state_batch = Variable(torch.from_numpy(np.array(unzipped[3])),
                                        volatile=True)
            #print('average distance: {}' . format(dist_norm))
            #next_state_values = self.targetNet.evaluate(list(unzipped[3])).max(1)[0].unsqueeze(1)
            next_state_values = self.targetNet(next_state_batch).max(1)[0].unsqueeze(1)
            #prediction_state_values = self.targetNet(state_batch).gather(1, action_batch)
            #not_action_batch = Variable(torch.from_numpy(1-np.array(unzipped[1])).type(LongTensor), volatile=True)
            #prediction_state_nonterm_values = self.targetNet(state_batch).gather(1, not_action_batch)
            #print('term average value: {}' . format(torch.sum((1-term_batch) * prediction_state_values).data[0]/torch.sum(1-term_batch).data[0]))
            #print('nonterm average value: {}' . format(torch.sum((1-term_batch) * prediction_state_nonterm_values).data[0]/torch.sum(1-term_batch).data[0]))
            next_state_values = term_batch * next_state_values
            next_state_values.volatile = False
            target_batch = reward_batch + (self.GAMMA * next_state_values)

        # calculate the probability for each transition
        # calculate distance matrix
        state_feature_batch = self.targetNet.getstate(state_batch)
        inner_product = state_feature_batch.matmul(state_feature_batch.transpose(1, 0))
        state_feature_batch_l2 = (state_feature_batch**2).sum(dim=1, keepdim=True).expand_as(inner_product)
        distance_matrix = state_feature_batch_l2 + state_feature_batch_l2.transpose(1, 0) - 2 * inner_product
        #print('distance state')
        #print(distance_matrix.data)
        # calculate Q value distance matrix
        # here use the target value to calculate it
        Q_dist_matrix = target_batch.expand_as(distance_matrix)
        Q_dist_matrix = Q_dist_matrix - Q_dist_matrix.transpose(1, 0)  # not absolute value
        Q_dist_matrix = Q_dist_matrix.abs()
        #print('distance q')
        #print(Q_dist_matrix.data)
        # Number[i,j] = Number[i,j] + (D_f[i,j] <= sample_S^2 AND D_Q[i,j] <= sample_Q AND action[i] = action[j])
        # only consider same actions
        Action_Mask = (action_batch.expand_as(distance_matrix)) == \
                      (action_batch.transpose(1, 0).expand_as(distance_matrix))
        Mask = (distance_matrix.data <= (self.SAMPLE_S**2)) & \
               (Q_dist_matrix.data <= self.SAMPLE_Q) & Action_Mask.data
        Mask = Mask.type(FloatTensor)
        Number = Mask.sum(dim=1, keepdim=True)
        # using the mask to calculate the number used for each transition
        probability_batch = Mask.matmul(1. / Number) / Number
        probability_batch = probability_batch.squeeze(1)
        #print(probability_batch)
        sample_is = torch.multinomial(probability_batch, self.BATCH_SIZE)
        state_batch = state_batch.index_select(0, sample_is)
        action_batch = action_batch.index_select(0, sample_is)
        target_batch = target_batch.index_select(0, sample_is)
        state_batch.volatile = False
        state_batch.requires_grad = True
        action_batch.volatile = False
        target_batch.volatile = False
        return state_batch, action_batch, target_batch

    def copy_weights(self):
        self.targetNet.load_state_dict(self.net.state_dict())

    def run(self):
        # keep two nets: Q-net, and target-net
        # keep looping:
        # 0. loop until SENT_FLAG is not set
        #
        # 1. loop for a fixed # of steps:
        #       minibatch, and get the target value for the batch
        #       optimize the net parameters by this batch
        #       every TRANSFER steps, copy weights from Q-net to target-net
        #
        # 2. copy weights from Q-net to the shared weights
        #    set SENT_FLAG to true
        # TODO: pretrain in the first loop
        os.system("taskset -p 0xff %d" % os.getpid())
        pretrain = True
        i = 0
        while True:
            while self.shared['SENT_FLAG']:  # loop until it is set to 0
                print('sleeping...')
                time.sleep(0.1)
            for step_i in range(1, self.TRAIN_MAX + 1):
                # minibatch, and get the target value
                #print('training... step {}' . format(step_i))
                #self.semaphore.acquire()
                #memory = copy.deepcopy(self.memory)
                memory = self.memory
                #self.semaphore.release()
                if len(memory) < self.BATCH_SIZE:
                    continue
                i += 1
                print('training... {}'.format(i))
                batch_tuple = self.minibatch(memory, pretrain)
                loss = self.net.optimize(batch_tuple)
                #print('loss: {}' . format(loss))
                #print('optimized')
                if step_i % self.TRANSFER == 0:
                    #self.semaphore.acquire()
                    self.copy_weights()
                    #self.semaphore.release()
            self.shared['weights'] = self.net.state_dict()
            self.shared['SENT_FLAG'] = True
            if i == 50:
                pretrain = False
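# Small sketch of the sampling probability computed in minibatch() above:
# Number[i] counts the neighbours of transition i in the mask, each transition
# inherits 1/Number from every neighbour, and the result is divided by its own
# count again. Names are illustrative; torch.multinomial accepts unnormalized
# weights, so the result need not sum to 1.
import torch


def mask_probabilities(mask: torch.Tensor):
    """mask: (N, N) float 0/1 matrix with 1 on the diagonal."""
    number = mask.sum(dim=1, keepdim=True)            # neighbours per transition
    prob = mask.matmul(1.0 / number) / number         # same expression as the code above
    return prob.squeeze(1)


if __name__ == '__main__':
    mask = torch.tensor([[1., 1., 0.],
                         [1., 1., 0.],
                         [0., 0., 1.]])
    p = mask_probabilities(mask)
    print(p, p.sum())   # clustered transitions share mass; the singleton keeps weight 1.0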
class Evaluator(multiprocessing.Process):
    def __init__(self, shared, semaphore):
        multiprocessing.Process.__init__(self)
        # hyperparameters
        self.TRAIN_MAX = 50
        self.TRANSFER = 50
        self.BATCH_SIZE = 32
        self.GAMMA = 0.99
        self.SAMPLE_ALPHA = 0.5
        self.SAMPLE_EPISLON = 0.
        self.SAMPLE_BETA = 0.
        LEARNING_RATE = 0.00025
        MOMENTUM = 0.95
        SQUARED_MOMENTUM = 0.95
        MIN_SQUARED_GRAD = 0.01
        self.net = DQN()  # Deep Net
        self.targetNet = DQN()
        self.copy_weights()
        self.net.setOptimizer(
            optim.RMSprop(self.net.parameters(),
                          lr=LEARNING_RATE,
                          momentum=MOMENTUM,
                          alpha=SQUARED_MOMENTUM,
                          eps=MIN_SQUARED_GRAD))
        self.shared = shared  # shared resources, {'memory', 'SENT_FLAG'}
        self.semaphore = semaphore

    def minibatch(self, exp_replay, pretrain=False):
        batch = exp_replay.sample(self.BATCH_SIZE)
        #print(batch)
        unzipped = list(zip(*batch))
        #state_batch = torch.from_numpy(np.concatenate(list(unzipped[0])))
        #state_batch = Variable(state_batch)
        #action_batch = torch.from_numpy(np.concatenate(list(unzipped[1])))
        #action_batch = Variable(action_batch)
        #reward_batch = torch.from_numpy(np.concatenate(list(unzipped[2])))
        #reward_batch = Variable(reward_batch, requires_grad=False)
        state_batch = Variable(torch.cat(list(unzipped[0])).clone())
        action_batch = Variable(torch.cat(list(unzipped[1])).clone())
        reward_batch = Variable(torch.cat(list(unzipped[2])).clone(),
                                requires_grad=False)
        if pretrain:
            # only use reward
            return state_batch, action_batch, reward_batch
        else:
            #term_batch = torch.from_numpy(np.concatenate(list(unzipped[5])))
            #term_batch = Variable(term_batch, volatile=True)
            term_batch = Variable(torch.cat(list(unzipped[5])).clone(),
                                  volatile=True)
            #next_action_batch = torch.from_numpy(np.concatenate(list(unzipped[4])))
            #next_action_batch = Variable(next_action_batch, volatile=True)
            next_action_batch = Variable(torch.cat(list(unzipped[4])).clone(),
                                         volatile=True)
            #next_state = torch.from_numpy(np.concatenate(list(unzipped[3])))
            #next_state = Variable(next_state, volatile=True)
            #next_state_values = self.targetNet(next_state).gather(1, next_action_batch)
            next_state_values = self.targetNet.evaluate(list(unzipped[3])).gather(1, next_action_batch)
            #non_final_mask = ByteTensor(tuple(map(lambda s: s is not None, list(unzipped[3]))))
            #non_final_next_states = Variable(torch.cat([s for s in list(unzipped[3]) if s is not None]),
            #                                 volatile=True)
            #next_state_values = Variable(torch.zeros(self.BATCH_SIZE).type(Tensor))
            #next_state_values[non_final_mask] = self.targetNet(non_final_next_states).gather(1, next_action_batch)
            #next_state_values.volatile = False
            #print(next_state_values)
            #next_state_values = self.targetNet.evaluate(list(unzipped[3])).max(1)[0].unsqueeze(1)
            next_state_values = term_batch * next_state_values
            next_state_values.volatile = False
            #next_state_values.requires_grad = False
            target_batch = reward_batch + (self.GAMMA * next_state_values)
            return state_batch, action_batch, target_batch

    def copy_weights(self):
        self.targetNet.load_state_dict(self.net.state_dict())

    def run(self):
        # keep two nets: Q-net, and target-net
        # keep looping:
        # 0. loop until SENT_FLAG is not set
        #
        # 1. loop for a fixed # of steps:
        #       minibatch, and get the target value for the batch
        #       optimize the net parameters by this batch
        #       every TRANSFER steps, copy weights from Q-net to target-net
        #
        # 2. copy weights from Q-net to the shared weights
        #    set SENT_FLAG to true
        # TODO: pretrain in the first loop
        pretrain = True
        while True:
            while self.shared['SENT_FLAG']:  # loop until it is set to 0
                print('sleeping... size: {}'.format(len(self.shared['memory'])))
                time.sleep(0.1)
            print('training...')
            for step_i in range(1, self.TRAIN_MAX + 1):
                # minibatch, and get the target value
                #print('training... step {}' . format(step_i))
                self.semaphore.acquire()
                memory = copy.deepcopy(self.shared['memory'])
                self.semaphore.release()
                if len(memory) < self.BATCH_SIZE:
                    continue
                batch_tuple = self.minibatch(memory, pretrain)
                #print('got batch tuple')
                #print(batch_tuple[0].type)
                loss = self.net.optimize(batch_tuple)
                #print('loss: {}' . format(loss))
                #print('optimized')
                if step_i % self.TRANSFER == 0:
                    self.copy_weights()
            self.shared['weights'] = self.net.state_dict()
            self.shared['SENT_FLAG'] = True
            pretrain = False
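# Sketch of the semaphore-guarded snapshot taken in run() above: hold the
# semaphore while deep-copying the shared replay memory so the writer cannot
# mutate it mid-copy, then train on the local copy. Names are illustrative.
import copy
import multiprocessing


def snapshot(shared_memory, semaphore):
    """Copy the shared replay memory while holding the semaphore."""
    semaphore.acquire()
    try:
        local = copy.deepcopy(shared_memory)
    finally:
        semaphore.release()
    return local


if __name__ == '__main__':
    sem = multiprocessing.Semaphore(1)
    mem = [('state', 0, 1.0, 'next_state', 1.0)]   # toy transition list
    print(snapshot(mem, sem))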
from Env.Environment import Environment
from DQN.ReplayMemory import ReplayMemory
import torch.nn as nn
import torch.optim as optim
from Env.gymEnv import myGym
import time
import numpy as np
import cv2
import matplotlib
import matplotlib.pyplot as plt
from multiprocessing import Manager, Event
from multiprocessing.managers import SyncManager

# hyperparameters
MEMORY_SIZE = 100
imp_net = DQN()
#eval_net = DQN()
#eval_target_net = DQN()
#eval_net.setOptimizer(optim.RMSprop(eval_net.parameters(), lr=LEARNING_RATE,
#                                    momentum=MOMENTUM, alpha=SQUARED_MOMENTUM,
#                                    eps=MIN_SQUARED_GRAD))
imp_net.share_memory()
#eval_net.share_memory()
#eval_target_net.share_memory()
#env = Environment()
#env = myGym()

# populate memory
# let improver populate first
SyncManager.register('ReplayMemory', ReplayMemory,
from multiprocessing.managers import SyncManager
import torch.nn as nn
import time

from DQN.DQNcartpole import DQN
from Env.gymEnv import myGym
from DQN.CartPoleDQN import CartPoleDQN
from DQN.ReplayMemory import ReplayMemory

if __name__ == '__main__':
    demonet = DQN()
    #manager = SyncManager()
    #manager.start()
    memory = ReplayMemory(10000)
    #for i in range(memory.capacity):
    #    memory.push(torch.FloatTensor(1, 3, 40, 80))
    shared = dict({'memory': memory, 'SENT_FLAG': True, 'weights': None})
    p = CartPoleDQN(DQN(), shared, myGym())
    p.run()
class MyProcess(mp.Process):
    def __init__(self, inputs):
        mp.Process.__init__(self)
        self.BATCH_SIZE = 32
        self.TRAIN_MAX = 500
        self.TRANSFER = 100
        self.GAMMA = 1.0
        LEARNING_RATE = 0.00025
        MOMENTUM = 0.95
        SQUARED_MOMENTUM = 0.95
        MIN_SQUARED_GRAD = 0.01
        self.demonet = DQN()
        self.targetnet = DQN()
        self.copy_weights()
        self.demonet.setOptimizer(
            optim.RMSprop(self.demonet.parameters(),
                          lr=LEARNING_RATE,
                          momentum=MOMENTUM,
                          alpha=SQUARED_MOMENTUM,
                          eps=MIN_SQUARED_GRAD))
        self.inputs = inputs
        #self.demonet.setOptimizer(optim.Adam(params=self.demonet.parameters()))

    def copy_weights(self):
        self.targetnet.load_state_dict(self.demonet.state_dict())

    def minibatch(self, exp_replay, pretrain=False):
        batch = exp_replay.sample(self.BATCH_SIZE)
        unzipped = list(zip(*batch))
        #state_batch = np.concatenate(list(unzipped[0]))
        #state_batch = Variable(torch.from_numpy(state_batch))
        #action_batch = np.concatenate(list(unzipped[1]))
        #action_batch = Variable(torch.from_numpy(action_batch))
        #reward_batch = np.concatenate(list(unzipped[2]))
        #reward_batch = Variable(torch.from_numpy(reward_batch), requires_grad=False)
        state_batch = Variable(torch.cat(list(unzipped[0])).clone())
        action_batch = Variable(torch.cat(list(unzipped[1])).clone())
        reward_batch = Variable(torch.cat(list(unzipped[2])).clone(),
                                requires_grad=False)
        if pretrain:
            # only use reward
            return state_batch, action_batch, reward_batch
        else:
            #term_batch = np.concatenate(list(unzipped[5]))
            #term_batch = Variable(torch.from_numpy(term_batch), volatile=True)
            #next_action_batch = np.concatenate(list(unzipped[4]))
            #next_action_batch = Variable(torch.from_numpy(next_action_batch), volatile=True)
            #next_state_batch = np.concatenate(list(unzipped[3]))
            #next_state_batch = Variable(torch.from_numpy(next_state_batch), volatile=True)
            #next_state_values = self.targetnet(next_state_batch).gather(1, next_action_batch)
            term_batch = Variable(torch.cat(list(unzipped[5])).clone(),
                                  volatile=True)
            next_action_batch = Variable(torch.cat(list(unzipped[4])).clone(),
                                         volatile=True)
            next_state_values = self.targetnet.evaluate(list(unzipped[3])).gather(1, next_action_batch)
            next_state_values = term_batch * next_state_values
            print(next_state_values)
            next_state_values.volatile = False
            next_state_values.requires_grad = False
            target_batch = reward_batch + (self.GAMMA * next_state_values)
            return state_batch, action_batch, target_batch

    def run(self):
        pretrain = True
        while True:
            while self.inputs['SENT_FLAG']:
                print('sleeping... size: {}'.format(len(self.inputs['inputs'])))
                time.sleep(1)
            for step_i in range(self.TRAIN_MAX):
                sample = self.minibatch(self.inputs['inputs'], pretrain)
                self.demonet(sample[0])
                print('hello world')
                loss = self.demonet.optimize(sample)
                #time.sleep(1)
                if step_i % self.TRANSFER == 0:
                    self.copy_weights()
            self.inputs['weights'] = self.demonet.state_dict()
            self.inputs['SENT_FLAG'] = True