# Common imports assumed by the snippets in this section (Chainer v1-era API):
#   import numpy as np
#   import chainer.functions as F
#   from chainer import FunctionSet, Variable, optimizers, cuda

def main(log_file, h_sizes, improve_loss_min=0.001):
    # generate_cases(), forward() and LINE_MAX_CHAR are defined elsewhere in the source file.
    x_train, y_train, x_test, y_test = generate_cases(log_file)
    in_size = LINE_MAX_CHAR
    out_size = 2
    layers = [in_size] + h_sizes + [out_size]
    model = FunctionSet()
    for li in range(1, len(layers)):
        setattr(model, "l%d" % li, F.Linear(layers[li - 1], layers[li]))
    optimizer = optimizers.SGD()
    optimizer.setup(model.collect_parameters())

    last_loss = None
    for epoch in range(3000000):
        optimizer.zero_grads()
        loss, accuracy = forward(model, x_train, y_train)
        loss.backward()
        if epoch % 100 == 0:
            print "epoch: %s, loss: %s, accuracy: %s" % (epoch, loss.data, accuracy.data)
            # Stop once the loss no longer improves by at least improve_loss_min.
            if last_loss is not None and last_loss - improve_loss_min < loss.data:
                print "Finish Training"
                break
            last_loss = loss.data
        optimizer.update()
        if epoch % 1000 == 0:
            loss, accuracy = forward(model, x_test, y_test)
            print "epoch: %s, Try Test Result: loss: %s, accuracy: %s" % (epoch, loss.data, accuracy.data)

    # Final result
    loss, accuracy = forward(model, x_test, y_test)
    print "epoch: %s, Test Result: loss: %s, accuracy: %s" % (epoch, loss.data, accuracy.data)
    return epoch, accuracy.data
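# main() above calls a forward() helper that the snippet does not include. A
# plausible sketch, assuming sigmoid hidden layers and a softmax output over
# the "l%d" attributes set on the FunctionSet (the real helper may differ):
import chainer.functions as F
from chainer import Variable

def forward(model, x_data, t_data):
    x = Variable(x_data)
    t = Variable(t_data)
    h = x
    li = 1
    while hasattr(model, "l%d" % (li + 1)):
        h = F.sigmoid(getattr(model, "l%d" % li)(h))  # hidden layers
        li += 1
    y = getattr(model, "l%d" % li)(h)  # output layer, no activation
    return F.softmax_cross_entropy(y, t), F.accuracy(y, t)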
def main(args):
    def forward(x_data, y_data):
        x = Variable(x_data)
        t = Variable(y_data)
        h1 = F.relu(model.l1(x))   # activation function
        h2 = F.relu(model.l2(h1))  # ReLU has no parameters to optimize
        y = model.l3(h2)
        # Softmax cross-entropy loss and current accuracy
        return F.softmax_cross_entropy(y, t), F.accuracy(y, t)

    def evaluate():
        sum_loss, sum_accuracy = 0, 0
        for i in xrange(0, 10000, batchsize):
            x_batch = x_test[i:i + batchsize]
            y_batch = y_test[i:i + batchsize]
            loss, accuracy = forward(x_batch, y_batch)
            sum_loss += loss.data * batchsize
            sum_accuracy += accuracy.data * batchsize
        mean_loss = sum_loss / 10000
        mean_accuracy = sum_accuracy / 10000
        print(mean_loss[0], mean_accuracy)

    global debug, verbose
    debug = args.debug
    verbose = True if debug else args.verbose

    mnist = fetch_mldata('MNIST original')
    x_all = mnist.data.astype(np.float32) / 255  # scale features to [0, 1]
    y_all = mnist.target.astype(np.int32)
    x_train, x_test = np.split(x_all, [60000])  # 60000 for training, 10000 for test
    y_train, y_test = np.split(y_all, [60000])

    # Simple three-layer rectifier network
    model = FunctionSet(
        l1=F.Linear(784, 100),  # 784 pixels -> 100 units
        l2=F.Linear(100, 100),  # 100 units -> 100 units
        l3=F.Linear(100, 10),   # 100 units -> 10 digits
    )
    optimizer = optimizers.SGD()
    optimizer.setup(model.collect_parameters())

    batchsize = 100
    for epoch in xrange(20):
        if verbose:
            logger.info('epoch: {}'.format(epoch))  # logger is configured at module level
        indexes = np.random.permutation(60000)
        for i in xrange(0, 60000, batchsize):
            x_batch = x_train[indexes[i:i + batchsize]]
            y_batch = y_train[indexes[i:i + batchsize]]
            optimizer.zero_grads()                      # initialize gradient arrays
            loss, accuracy = forward(x_batch, y_batch)  # loss function
            loss.backward()                             # backpropagation
            optimizer.update()
        evaluate()
    return 0
class LinearModel(object):
    UNIT_NUM = 10
    BATCH_SIZE = 32
    EPOCH = 100

    def __init__(self, optimizer):
        # Linear, softmax_cross_entropy and accuracy are used unqualified,
        # i.e. imported directly from chainer.functions in the source file.
        self.model = FunctionSet(l=Linear(self.UNIT_NUM, 2))
        self.optimizer = optimizer
        # True parameters of the underlying linear separator
        self.w = np.random.uniform(-1, 1, (self.UNIT_NUM, 1)).astype(np.float32)
        self.b = np.random.uniform(-1, 1, (1, )).astype(np.float32)

    def _train_linear_classifier(self, model, optimizer, gpu):
        def _make_label(x):
            a = (np.dot(x, self.w) + self.b).reshape((self.BATCH_SIZE, ))
            t = np.empty_like(a).astype(np.int32)
            t[a >= 0] = 0
            t[a < 0] = 1
            return t

        def _make_dataset(batch_size, unit_num, gpu):
            x_data = np.random.uniform(-1, 1, (batch_size, unit_num)).astype(np.float32)
            t_data = _make_label(x_data)
            if gpu:
                x_data = cuda.to_gpu(x_data)
                t_data = cuda.to_gpu(t_data)
            x = Variable(x_data)
            t = Variable(t_data)
            return x, t

        for epoch in xrange(self.EPOCH):
            x, t = _make_dataset(self.BATCH_SIZE, self.UNIT_NUM, gpu)
            optimizer.zero_grads()
            y = model.l(x)
            loss = softmax_cross_entropy(y, t)
            loss.backward()
            optimizer.update()

        x_test, t_test = _make_dataset(self.BATCH_SIZE, self.UNIT_NUM, gpu)
        y_test = model.l(x_test)
        return accuracy(y_test, t_test)

    def _accuracy_cpu(self):
        self.optimizer.setup(self.model.collect_parameters())
        return self._train_linear_classifier(self.model, self.optimizer, False)

    def _accuracy_gpu(self):
        model = self.model
        optimizer = self.optimizer
        model.to_gpu()
        optimizer.setup(model.collect_parameters())
        return self._train_linear_classifier(model, optimizer, True)

    def accuracy(self, gpu):
        if gpu:
            return cuda.to_cpu(self._accuracy_gpu().data)
        else:
            return self._accuracy_cpu().data
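# A minimal usage sketch for LinearModel above (SGD is just one choice; any
# chainer optimizer can be passed in):
from chainer import optimizers

lm = LinearModel(optimizers.SGD())
print lm.accuracy(gpu=False)  # train on CPU, then report test accuracy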
def main():
    # P is this script's global parameter/config object; train() is defined elsewhere.
    if P.use_mean_var:
        conv6_output = 126
    else:
        conv6_output = 128

    if P.model_name is None:
        model = FunctionSet(
            conv1=F.Convolution2D(1, 128, 3, stride=1),
            conv2=F.Convolution2D(128, 128, 3, stride=1),
            conv3=F.Convolution2D(128, 128, 3, stride=1),
            conv4=F.Convolution2D(128, 128, 3, stride=1),
            conv5=F.Convolution2D(128, 128, 3, stride=1),
            conv6=F.Convolution2D(128, conv6_output, 3, stride=1),
            conv7=F.Convolution2D(128, 128, 1, stride=1),
            conv8=F.Convolution2D(128, 1, 1, stride=1)
        )
        if P.gpu >= 0:
            cuda.init(P.gpu)
            model.to_gpu()
    else:
        if P.gpu >= 0:
            cuda.init(P.gpu)
        model = pickle.load(open(os.path.join(P.model_dir, P.model_name), 'rb'))

    optimizer = optimizers.MomentumSGD(lr=P.lr, momentum=P.momentum)
    optimizer.setup(model.collect_parameters())
    train(model, optimizer)
def setup_model(n_dimention, n_units):
    # Two-layer autoencoder: n_dimention -> n_units -> n_dimention
    model = FunctionSet(l1=F.Linear(n_dimention, n_units),
                        l2=F.Linear(n_units, n_dimention))
    # Setup optimizer
    optimizer = optimizers.Adam()
    optimizer.setup(model.collect_parameters())
    return model, optimizer
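# A short usage sketch for setup_model() above. The forward pass here is an
# assumption (the snippet defines only the layers); it treats l1/l2 as an
# autoencoder with a ReLU hidden layer and random stand-in data:
import numpy as np
import chainer.functions as F
from chainer import Variable

model, optimizer = setup_model(n_dimention=784, n_units=100)
x = Variable(np.random.rand(32, 784).astype(np.float32))
optimizer.zero_grads()
h = F.relu(model.l1(x))            # encode
y = model.l2(h)                    # decode back to the input space
loss = F.mean_squared_error(y, x)
loss.backward()
optimizer.update()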
class ConvolutionalDenoisingAutoencoder(object):
    def __init__(self, imgsize, n_in_channels, n_out_channels, ksize,
                 stride=1, pad=0, use_cuda=False):
        # The decoder maps the flattened conv feature map back to the input size;
        # (imgsize + 2*pad - ksize) / stride + 1 is the conv output edge length.
        out_edge = int(math.floor((imgsize + 2 * pad - ksize) / stride) + 1)
        self.model = FunctionSet(
            encode=F.Convolution2D(n_in_channels, n_out_channels, ksize, stride, pad),
            decode=F.Linear(n_out_channels * out_edge ** 2,
                            n_in_channels * imgsize ** 2)
        )
        self.use_cuda = use_cuda
        if self.use_cuda:
            self.model.to_gpu()
        self.optimizer = optimizers.Adam()
        self.optimizer.setup(self.model.collect_parameters())

    def encode(self, x_var):
        return F.sigmoid(self.model.encode(x_var))

    def decode(self, x_var):
        return self.model.decode(x_var)

    def predict(self, x_data):
        if self.use_cuda:
            x_data = cuda.to_gpu(x_data)
        x = Variable(x_data)
        p = self.encode(x)
        if self.use_cuda:
            return cuda.to_cpu(p.data)
        else:
            return p.data

    def cost(self, x_data):
        x = Variable(x_data)
        # Target: the clean input, flattened to (batch, channels * height * width)
        t = Variable(x_data.reshape(x_data.shape[0],
                                    x_data.shape[1] * x_data.shape[2] * x_data.shape[3]))
        h = F.dropout(x)  # corrupt the input ("denoising")
        h = self.encode(h)
        y = self.decode(h)
        return F.mean_squared_error(y, t)

    def train(self, x_data):
        if self.use_cuda:
            x_data = cuda.to_gpu(x_data)
        self.optimizer.zero_grads()
        loss = self.cost(x_data)
        loss.backward()
        self.optimizer.update()
        if self.use_cuda:
            return float(cuda.to_cpu(loss.data))
        else:
            return loss.data

    def test(self, x_data):
        if self.use_cuda:
            x_data = cuda.to_gpu(x_data)
        loss = self.cost(x_data)
        return float(cuda.to_cpu(loss.data))
class NNQLearningPlayer(object):
    ALPHA = 0.1
    GAMMA = 0.99
    E_GREEDY = 0.3

    def __init__(self):
        self.actions = [1, 2, 3, 4]
        self.model = FunctionSet(
            l1=F.EmbedID(10, 10),
            l2=F.Linear(10, 10),
            l3=F.Linear(10, 4),
        )
        self.optimizer = optimizers.SGD()
        self.optimizer.setup(self.model.collect_parameters())
        self.last_action = None
        self.last_q_list = None
        self.training = True

    def action(self, state, last_reward):
        if self.last_action is not None and self.training:
            self.update_q_table(self.last_action, state, last_reward)
        next_action = self.select_action(state)
        self.last_action = next_action
        return self.actions[next_action]

    def forward(self, state):
        x = Variable(np.array([state], dtype=np.int32))
        y = None
        for i in range(1, 1000):  # 1000 is just an arbitrary upper bound
            if hasattr(self.model, "l%d" % i):
                x = getattr(self.model, "l%d" % i)(x)
            else:
                y = x
                break
        return y

    def select_action(self, state):
        self.last_q_list = self.forward(state)
        if self.training and random() < self.E_GREEDY:
            # http://www.sist.ac.jp/~kanakubo/research/reinforcement_learning.html
            return randint(0, len(self.actions) - 1)
        else:
            return np.argmax(self.last_q_list.data)

    def update_q_table(self, last_action, cur_state, last_reward):
        target_val = last_reward + self.GAMMA * np.max(self.forward(cur_state).data)
        self.optimizer.zero_grads()
        # The loss is computed rather forcibly here... not confident about this part.
        tt = np.copy(self.last_q_list.data)
        tt[0][last_action] = target_val
        target = Variable(tt)
        loss = 0.5 * (target - self.last_q_list) ** 2
        loss.grad = np.array([[self.ALPHA]], dtype=np.float32)
        loss.backward()
        self.optimizer.update()
class DenoisingAutoencoder:
    def __init__(
            self,
            n_input,
            n_hidden,
            tied=True,
            noise=None,
            ratio=None,
            optimizer=optimizers.Adam(),
            loss_function=F.sigmoid_cross_entropy,
            activation_function=F.sigmoid,
    ):
        self.model = FunctionSet(encoder=F.Linear(n_input, n_hidden),
                                 decoder=F.Linear(n_hidden, n_input))
        if tied:
            # Tied weights: the decoder shares the transposed encoder matrix
            self.model.decoder.W = self.model.encoder.W.T
        self.noise = noise
        self.ratio = ratio
        self.optimizer = optimizer
        self.optimizer.setup(self.model.collect_parameters())
        self.loss_function = loss_function
        self.activation_function = activation_function

    def train(self, x_data):
        self.optimizer.zero_grads()
        loss = self.autoencode(x_data, train=True)
        loss.backward()
        self.optimizer.update()
        return loss

    def test(self, x_data):
        return self.autoencode(x_data, train=False)

    def autoencode(self, x_data, train=True):
        x = Variable(x_data)
        if self.noise and train:
            # Corrupt the input with the supplied noise object
            nx = Variable(self.noise.noise(x_data))
        else:
            nx = Variable(x_data)
        if self.ratio:
            h = F.dropout(self.encode(nx), ratio=self.ratio, train=train)
        else:
            h = self.encode(nx)
        y = self.decode(h)
        return self.loss_function(y, x)

    def encode(self, x):
        return self.activation_function(self.model.encoder(x))

    def decode(self, x):
        return self.activation_function(self.model.decoder(x))
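# A minimal usage sketch for DenoisingAutoencoder. The default
# sigmoid_cross_entropy loss expects integer targets, so this sketch swaps in
# mean_squared_error for real-valued inputs (hypothetical data; no noise object):
import numpy as np
import chainer.functions as F

dae = DenoisingAutoencoder(784, 256, ratio=0.3, loss_function=F.mean_squared_error)
x = np.random.rand(100, 784).astype(np.float32)
for step in xrange(10):
    loss = dae.train(x)
print dae.test(x).data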
def setup_model(gpu_id, n_channel, n_output):
    # ChainerModel and forward() are defined elsewhere in the source file.
    model = FunctionSet(
        conv1=F.Convolution2D(n_channel, 32, 5, pad=2),
        conv2=F.Convolution2D(32, 32, 5, pad=2),
        conv3=F.Convolution2D(32, 64, 5, pad=2),
        fl5=F.Linear(960, 64),
        fl6=F.Linear(64, n_output),
    )
    # optimizer = optimizers.MomentumSGD(lr=1e-03)
    optimizer = optimizers.AdaGrad()
    optimizer.setup(model.collect_parameters())
    mlp = ChainerModel(model, optimizer, forward_function=forward)
    return mlp
class DeepLearning:
    def __init__(self, input_size, hidden_size, output_size):
        self.model = FunctionSet(l1=F.Linear(input_size, hidden_size),
                                 l2=F.Linear(hidden_size, hidden_size),
                                 l3=F.Linear(hidden_size, output_size))
        self.optimizer = optimizers.Adam()
        self.optimizer.setup(self.model.collect_parameters())

    def batch(self, X_train, y_train, batch_size, perm):
        train_size = X_train.shape[0]
        for i in xrange(0, train_size, batch_size):
            X_batch = X_train[perm[i: i + batch_size]]
            y_batch = y_train[perm[i: i + batch_size]]
            # Wrap in Variables for Chainer
            x = Variable(X_batch)
            t = Variable(y_batch)
            self.optimizer.zero_grads()
            y = self.forward(x)  # prediction
            loss = F.softmax_cross_entropy(y, t)
            loss.backward()
            self.optimizer.update()

    def forward(self, x, train=True):
        h1 = F.dropout(F.sigmoid(self.model.l1(x)), train=train)
        h2 = F.dropout(F.sigmoid(self.model.l2(h1)), train=train)
        return self.model.l3(h2)

    def predicate(self, x_data):
        x = np.array([x_data], dtype=np.float32)
        x = Variable(x)
        y = self.forward(x, train=False)
        return np.argmax(y.data)

    def save(self, fpath):
        pickle.dump(self.model, open(fpath, 'wb'), -1)

    def load(self, fpath):
        self.model = pickle.load(open(fpath, 'rb'))
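# A short usage sketch for DeepLearning (random stand-in data; in practice
# X and y come from a real dataset):
import numpy as np

dl = DeepLearning(input_size=4, hidden_size=8, output_size=3)
X = np.random.rand(120, 4).astype(np.float32)
y = np.random.randint(0, 3, 120).astype(np.int32)
for epoch in xrange(5):
    dl.batch(X, y, batch_size=20, perm=np.random.permutation(len(X)))
print dl.predicate(X[0])  # predicted class index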
def main(n_bit, h_sizes):
    # in_size = 2 * n_bit + 1 inputs; out_size = 2**(n_bit + 1) output classes
    in_size = n_bit + n_bit + 1
    out_size = 2 ** (n_bit + 1)
    layers = [in_size] + h_sizes + [out_size]
    model = FunctionSet()
    for li in range(1, len(layers)):
        setattr(model, "l%d" % li, F.Linear(layers[li - 1], layers[li]))
    optimizer = optimizers.SGD()
    optimizer.setup(model.collect_parameters())

    x_data, t_data = generate_training_cases(n_bit)
    for epoch in range(3000000):
        optimizer.zero_grads()
        loss, accuracy = forward(model, x_data, t_data)
        loss.backward()
        if epoch % 100 == 0:
            print "epoch: %s, loss: %s, accuracy: %s" % (epoch, loss.data, accuracy.data)
            if accuracy.data == 1:
                break
        optimizer.update()

    print "epoch: %s, loss: %s, accuracy: %s" % (epoch, loss.data, accuracy.data)
    return epoch, accuracy.data
class TestNestedFunctionSet(TestCase):
    def setUp(self):
        self.fs1 = FunctionSet(
            a=MockFunction((1, 2)))
        self.fs2 = FunctionSet(
            fs1=self.fs1,
            b=MockFunction((3, 4)))

    def test_get_sorted_funcs(self):
        self.assertItemsEqual(
            [k for (k, v) in self.fs2._get_sorted_funcs()], ('b', 'fs1'))

    def test_collect_parameters(self):
        p_b = np.zeros((3, 4)).astype(np.float32)
        p_a = np.zeros((1, 2)).astype(np.float32)
        gp_b = np.ones((3, 4)).astype(np.float32)
        gp_a = np.ones((1, 2)).astype(np.float32)

        actual = self.fs2.collect_parameters()
        self.assertTrue(map(len, actual) == [2, 2])
        self.assertTrue((actual[0][0] == p_b).all())
        self.assertTrue((actual[0][1] == p_a).all())
        self.assertTrue((actual[1][0] == gp_b).all())
        self.assertTrue((actual[1][1] == gp_a).all())

    def test_pickle_cpu(self):
        fs2_serialized = pickle.dumps(self.fs2)
        fs2_loaded = pickle.loads(fs2_serialized)
        self.assertTrue((self.fs2.b.p == fs2_loaded.b.p).all())
        self.assertTrue((self.fs2.fs1.a.p == fs2_loaded.fs1.a.p).all())

    @attr.gpu
    def test_pickle_gpu(self):
        self.fs2.to_gpu()
        fs2_serialized = pickle.dumps(self.fs2)
        fs2_loaded = pickle.loads(fs2_serialized)
        fs2_loaded.to_cpu()
        self.fs2.to_cpu()
        self.assertTrue((self.fs2.b.p == fs2_loaded.b.p).all())
        self.assertTrue((self.fs2.fs1.a.p == fs2_loaded.fs1.a.p).all())
class MNISTNet():
    def __init__(self):
        n_in = 28 * 28
        n_hidden = 100
        self.model = FunctionSet(
            encode=F.Linear(n_in, n_hidden),
            decode=F.Linear(n_hidden, n_in)
        )
        self.optimizer = optimizers.Adam()
        self.optimizer.setup(self.model.collect_parameters())

    def encode(self, x_var):
        return F.sigmoid(self.model.encode(x_var))

    def decode(self, x_var):
        return F.sigmoid(self.model.decode(x_var))

    def predict(self, x_data):
        x = Variable(x_data)
        p = self.encode(x)
        return p.data

    def cost(self, x_data, dropout=True):
        x = Variable(x_data)
        t = Variable(x_data)
        if dropout:
            # Corrupt the input; the reconstruction target stays clean
            x_n = F.dropout(x, ratio=0.4)
        else:
            x_n = x
        h = self.encode(x_n)
        y = self.decode(h)
        return F.mean_squared_error(y, t)

    def train(self, x_data):
        self.optimizer.zero_grads()
        loss = self.cost(x_data)
        loss.backward()
        self.optimizer.update()
        return float(loss.data)
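# A minimal training-loop sketch for MNISTNet (minibatch size and epoch count
# are arbitrary; x_train is assumed to be float32 in [0, 1] with shape (N, 784)):
import numpy as np

net = MNISTNet()
batchsize = 100
for epoch in xrange(5):
    perm = np.random.permutation(len(x_train))
    for i in xrange(0, len(x_train), batchsize):
        loss = net.train(x_train[perm[i:i + batchsize]])
    print "epoch %d, last minibatch loss %f" % (epoch, loss)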
def main(n_bit, h1_size):
    if h1_size > 0:
        model = FunctionSet(
            l1=F.Linear(n_bit, h1_size),
            l2=F.Linear(h1_size, 2 ** n_bit)
        )
    else:
        # No hidden layer: a single linear map from n_bit inputs
        model = FunctionSet(
            l1=F.Linear(n_bit, 2 ** n_bit)
        )
    optimizer = optimizers.SGD()
    optimizer.setup(model.collect_parameters())

    x_data, t_data = generate_training_cases(n_bit)
    for epoch in range(100000):
        optimizer.zero_grads()
        loss, accuracy = forward(model, x_data, t_data)
        loss.backward()
        if epoch % 100 == 0:
            print "epoch: %s, loss: %s, accuracy: %s" % (epoch, loss.data, accuracy.data)
            if accuracy.data == 1:
                break
        optimizer.update()

    print "epoch: %s, loss: %s, accuracy: %s" % (epoch, loss.data, accuracy.data)
    return epoch, accuracy.data
class DQN_class:
    gamma = 0.99
    initial_exploration = 10**2
    replay_size = 32                  # Replay (batch) size
    target_model_update_freq = 10**4  # Target update frequency. original: 10^4
    data_size = 10**6

    def __init__(self, enable_controller=[0, 1, 2, 3, 4, 5, 6, 7, 8]):
        # The nine actions index (linear, angular) velocity pairs:
        # [0,0], [0,1], [0,-1], [1,0], [1,1], [1,-1], [-1,0], [-1,1], [-1,-1]
        self.num_of_actions = len(enable_controller)
        self.enable_controller = enable_controller

        print "Initializing DQN..."
        # print "CUDA init"
        # cuda.init()

        print "Model Building"
        # A deeper stack (l1: INPUT_SIZE -> 5000 -> 1000 -> 500 -> 100) is kept
        # commented out in the source; only the two-layer version is active.
        self.model = FunctionSet(
            l1=F.Linear(INPUT_SIZE, 100),  # input: map[100, 100] + v[2] + w[1] + wp[2]
            l2=F.Linear(100, self.num_of_actions,
                        initialW=np.zeros((self.num_of_actions, 100),
                                          dtype=np.float32))
        ).to_gpu()

        self.model_target = copy.deepcopy(self.model)

        print "Initializing Optimizer"
        # Important: RMSpropGraves!
        self.optimizer = optimizers.RMSpropGraves(lr=0.00025, alpha=0.95,
                                                  momentum=0.95, eps=0.0001)
        self.optimizer.setup(self.model.collect_parameters())

        # History data: D = [s, a, r, s', end_episode_flag]
        self.D = [np.zeros((self.data_size, INPUT_SIZE), dtype=np.float32),
                  np.zeros(self.data_size, dtype=np.uint8),
                  np.zeros((self.data_size, 1), dtype=np.float32),
                  np.zeros((self.data_size, INPUT_SIZE), dtype=np.float32),
                  np.zeros((self.data_size, 1), dtype=np.bool)]

    def forward(self, state, action, Reward, state_dash, episode_end):
        num_of_batch = state.shape[0]
        s = Variable(state)
        s_dash = Variable(state_dash)

        Q = self.Q_func(s)  # Get Q-value

        # Generate target signals
        tmp = self.Q_func_target(s_dash)         # Q(s', *)
        tmp = list(map(np.max, tmp.data.get()))  # max_a Q(s', a)
        max_Q_dash = np.asanyarray(tmp, dtype=np.float32)
        target = np.asanyarray(Q.data.get(), dtype=np.float32)

        for i in xrange(num_of_batch):
            if not episode_end[i][0]:
                tmp_ = np.sign(Reward[i]) + self.gamma * max_Q_dash[i]
            else:
                tmp_ = np.sign(Reward[i])
            target[i, action[i]] = tmp_

        # TD-error clipping
        td = Variable(cuda.to_gpu(target)) - Q  # TD error
        td_tmp = td.data + 1000.0 * (abs(td.data) <= 1)  # avoid zero division
        td_clip = td * (abs(td.data) <= 1) + td / abs(td_tmp) * (abs(td.data) > 1)

        zero_val = Variable(cuda.to_gpu(
            np.zeros((self.replay_size, self.num_of_actions))).astype(np.float32))
        loss = F.mean_squared_error(td_clip, zero_val)
        return loss, Q

    # Store a transition
    def stockExperience(self, time, state, action, reward, state_dash, episode_end_flag):
        data_index = time % self.data_size
        if episode_end_flag is True:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = reward
        else:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = reward
            self.D[3][data_index] = state_dash
            self.D[4][data_index] = episode_end_flag

    # Minibatch training
    def experienceReplay(self, time):
        if self.initial_exploration < time:
            # Pick up replay_size samples from the history data
            if time < self.data_size:  # during the first sweep of the history
                replay_index = np.random.randint(0, time, (self.replay_size, 1))
            else:
                replay_index = np.random.randint(0, self.data_size, (self.replay_size, 1))

            s_replay = np.ndarray(shape=(self.replay_size, INPUT_SIZE), dtype=np.float32)
            a_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.uint8)
            r_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.float32)
            s_dash_replay = np.ndarray(shape=(self.replay_size, INPUT_SIZE), dtype=np.float32)
            episode_end_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.bool)
            for i in xrange(self.replay_size):
                s_replay[i] = np.asarray(self.D[0][replay_index[i]], dtype=np.float32)
                a_replay[i] = self.D[1][replay_index[i]]
                r_replay[i] = self.D[2][replay_index[i]]
                s_dash_replay[i] = np.array(self.D[3][replay_index[i]], dtype=np.float32)
                episode_end_replay[i] = self.D[4][replay_index[i]]
                if i == 0:
                    print "s", s_replay[0][0], s_replay[0][1] * 180 / np.pi
                    print "a", a_replay[0]
                    print "s'", s_dash_replay[0][0], s_dash_replay[0][1] * 180 / np.pi
                    print "r", r_replay[0]

            s_replay = cuda.to_gpu(s_replay)
            s_dash_replay = cuda.to_gpu(s_dash_replay)

            # Gradient-based update
            self.optimizer.zero_grads()
            loss, _ = self.forward(s_replay, a_replay, r_replay,
                                   s_dash_replay, episode_end_replay)
            loss.backward()          # backpropagation
            self.optimizer.update()  # update the network

    def Q_func(self, state):
        # Input scaling (state / 254.0 into [0.0, 1.0]) is kept commented out.
        h1 = F.relu(self.model.l1(state))
        Q = self.model.l2(h1)
        return Q

    def Q_func_target(self, state):
        h1 = F.relu(self.model_target.l1(state))
        Q = self.model_target.l2(h1)  # fixed: the original read self.model.l2 here
        return Q

    def e_greedy(self, state, epsilon):
        s = Variable(state)
        Q = self.Q_func(s)
        Q = Q.data
        if np.random.rand() < epsilon:
            action = np.random.randint(0, self.num_of_actions)
            print "RANDOM"
        else:
            action = np.argmax(Q.get())
            print "GREEDY"
        return action, Q

    def action_to_vec(self, action, vec):
        # Map the action index to a Twist-like velocity command (see the action
        # table in __init__); an incremental scheme (vec.linear.x += 0.1) is
        # kept commented out in the source. Components are clamped to [-1, 1].
        if action in (3, 4, 5):
            vec.linear.x = 0.3
        elif action in (6, 7, 8):
            vec.linear.x = -0.3
        else:
            vec.linear.x = 0.0
        if action in (1, 4, 7):
            vec.angular.z = 0.3
        elif action in (2, 5, 8):
            vec.angular.z = -0.3
        else:
            vec.angular.z = 0.0
        vec.linear.x = max(-1, min(1, vec.linear.x))
        vec.angular.z = max(-1, min(1, vec.angular.z))
        return vec
class DQN_class:
    # Hyper-parameters
    gamma = 0.99                      # Discount factor
    initial_exploration = 10**3      # Initial exploration. original
    replay_size = 32                 # Replay (batch) size
    target_model_update_freq = 10**2  # Target update frequency. original
    data_size = 10**5                # Data size of history. original

    # Actions are 0 => do nothing, 1 -> buy, -1 sell
    def __init__(self, input_vector_length, enable_controller=[0, 1, 2]):
        self.num_of_actions = len(enable_controller)
        self.enable_controller = enable_controller  # Default setting : "Pong"
        self.input_vector_length = input_vector_length

        print "Initializing DQN..."
        # Initialization for Chainer 1.1.0 or older:
        # print "CUDA init"
        # cuda.init()

        # Inputs: 5 * 14 (with 10 temporality) + 5 (of last one hour) + 5 (of last 24 hours)
        print "Model Building"
        self.model = FunctionSet(
            l1=F.Linear(input_vector_length, 500),
            l2=F.Linear(500, 250),
            l3=F.Linear(250, 80),
            q_value=F.Linear(80, self.num_of_actions,
                             initialW=np.zeros((self.num_of_actions, 80),
                                               dtype=np.float32))
        ).to_gpu()

        print "Initializing Optimizer"
        self.optimizer = optimizers.RMSpropGraves(lr=0.0002, alpha=0.3, momentum=0.2)
        self.optimizer.setup(self.model.collect_parameters())

        # History data: D = [s, a, r, s', end_episode_flag]
        self.D = [np.zeros((self.data_size, self.input_vector_length), dtype=np.uint8),
                  np.zeros(self.data_size, dtype=np.uint8),
                  np.zeros((self.data_size, 1), dtype=np.int8),
                  np.zeros((self.data_size, self.input_vector_length), dtype=np.uint8),
                  np.zeros((self.data_size, 1), dtype=np.bool)]

    def forward(self, state, action, Reward, state_dash, episode_end):
        num_of_batch = state.shape[0]
        s = Variable(state)
        s_dash = Variable(state_dash)

        Q = self.Q_func(s)  # Get Q-value

        # Generate target signals (note: uses the online network, not model_target)
        max_Q_dash_ = self.Q_func(s_dash)
        tmp = list(map(np.max, max_Q_dash_.data.get()))
        max_Q_dash = np.asanyarray(tmp, dtype=np.float32)
        target = np.asanyarray(Q.data.get(), dtype=np.float32)

        for i in xrange(num_of_batch):
            if not episode_end[i][0]:
                tmp_ = np.sign(Reward[i]) + self.gamma * max_Q_dash[i]
            else:
                tmp_ = np.sign(Reward[i])
            target[i, self.action_to_index(action[i])] = tmp_

        loss = F.mean_squared_error(Variable(cuda.to_gpu(target)), Q)
        return loss, Q

    def stockExperience(self, time, state, action, reward, state_dash, episode_end_flag):
        data_index = time % self.data_size
        if episode_end_flag is True:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = reward
        else:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = reward
            self.D[3][data_index] = state_dash
            self.D[4][data_index] = episode_end_flag

    def experienceReplay(self, time):
        if self.initial_exploration < time:
            # Pick up replay_size samples from the history data
            if time < self.data_size:  # during the first sweep of the history
                replay_index = np.random.randint(0, time, (self.replay_size, 1))
            else:
                replay_index = np.random.randint(0, self.data_size, (self.replay_size, 1))

            s_replay = np.ndarray(shape=(self.replay_size, self.input_vector_length), dtype=np.float32)
            a_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.uint8)
            r_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.float32)
            s_dash_replay = np.ndarray(shape=(self.replay_size, self.input_vector_length), dtype=np.float32)
            episode_end_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.bool)
            for i in xrange(self.replay_size):
                s_replay[i] = np.asarray(self.D[0][replay_index[i]], dtype=np.float32)
                a_replay[i] = self.D[1][replay_index[i]]
                r_replay[i] = self.D[2][replay_index[i]]
                s_dash_replay[i] = np.array(self.D[3][replay_index[i]], dtype=np.float32)
                episode_end_replay[i] = self.D[4][replay_index[i]]

            s_replay = cuda.to_gpu(s_replay)
            s_dash_replay = cuda.to_gpu(s_dash_replay)

            # Gradient-based update
            self.optimizer.zero_grads()
            loss, _ = self.forward(s_replay, a_replay, r_replay,
                                   s_dash_replay, episode_end_replay)
            loss.backward()
            self.optimizer.update()

    def Q_func(self, state):
        # TODO: might want to normalize the input, but for now that is done outside this class
        h1 = F.relu(self.model.l1(state))
        h2 = F.relu(self.model.l2(h1))
        h3 = F.relu(self.model.l3(h2))
        Q = self.model.q_value(h3)
        return Q

    def e_greedy(self, state, epsilon):
        s = Variable(state)
        Q = self.Q_func(s)
        Q = Q.data
        if np.random.rand() < epsilon:
            index_action = np.random.randint(0, self.num_of_actions)
            print "RANDOM"
        else:
            index_action = np.argmax(Q.get())
            print "GREEDY"
        return self.index_to_action(index_action), Q

    def target_model_update(self):
        self.model_target = copy.deepcopy(self.model)

    def index_to_action(self, index_of_action):
        return self.enable_controller[index_of_action]

    def action_to_index(self, action):
        return self.enable_controller.index(action)
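# A hypothetical interaction loop for the DQN class above. env, its
# reset()/step() API, and the epsilon schedule are placeholders, not part of
# the original code; 80 matches the input breakdown in the comment above:
import numpy as np
from chainer import cuda

dqn = DQN_class(input_vector_length=80)
epsilon = 1.0
state = env.reset()
for time in xrange(10 ** 6):
    s_gpu = cuda.to_gpu(state[np.newaxis].astype(np.float32))
    action, Q = dqn.e_greedy(s_gpu, epsilon)
    state_dash, reward, episode_end = env.step(action)
    dqn.stockExperience(time, state, action, reward, state_dash, episode_end)
    dqn.experienceReplay(time)
    if time % dqn.target_model_update_freq == 0:
        dqn.target_model_update()
    epsilon = max(0.1, epsilon - 1e-6)  # anneal exploration
    state = state_dash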
class DQN_class:
    # Hyper-parameters
    gamma = 0.99  # Discount factor

    def __init__(self, enable_controller=range(18)):  # actions 0..17
        self.num_of_actions = len(enable_controller)
        self.enable_controller = enable_controller  # Default setting : "Pong"

        print "Initializing DQN..."
        # Initialization for Chainer 1.1.0 or older:
        # print "CUDA init"
        # cuda.init()

        print "Model Building"
        w = math.sqrt(2)  # MSRA scaling
        # Residual bottleneck stack with block counts (3, 8, 36, 3), as in
        # ResNet-152. Each block b of stage s has 1x1 / 3x3 / 1x1 layers
        # conv{s}_{b}_{1..3}; conv{s}_1_ex is the projection shortcut.
        links = dict(conv1=F.Convolution2D(3, 64, 7, wscale=w, stride=2, pad=3))
        in_ch = 64
        for stage, n_blocks, mid_ch, out_ch in [(2, 3, 64, 256),
                                                (3, 8, 128, 512),
                                                (4, 36, 256, 1024),
                                                (5, 3, 512, 2048)]:
            first_stride = 1 if stage == 2 else 2
            links['conv%d_1_ex' % stage] = F.Convolution2D(
                in_ch, out_ch, 1, wscale=w, stride=first_stride)
            for block in xrange(1, n_blocks + 1):
                block_in = in_ch if block == 1 else out_ch
                stride = first_stride if block == 1 else 1
                links['conv%d_%d_1' % (stage, block)] = F.Convolution2D(
                    block_in, mid_ch, 1, wscale=w, stride=stride)
                links['conv%d_%d_2' % (stage, block)] = F.Convolution2D(
                    mid_ch, mid_ch, 3, wscale=w, stride=1, pad=1)
                links['conv%d_%d_3' % (stage, block)] = F.Convolution2D(
                    mid_ch, out_ch, 1, wscale=w, stride=1)
            in_ch = out_ch
        links['q_value'] = F.Linear(2048, self.num_of_actions,
                                    initialW=np.zeros((self.num_of_actions, 2048),
                                                      dtype=np.float32))
        self.model = FunctionSet(**links)
        self.model_target = copy.deepcopy(self.model)

        print "Initializing Optimizer"
        self.optimizer = optimizers.Adam()
        self.optimizer.setup(self.model.collect_parameters())

    def forward(self, state, action, Reward, state_dash, episode_end):
        num_of_batch = state.shape[0]
        s = Variable(state)
        s_dash = Variable(state_dash)

        Q = self.Q_func(s)  # Get Q-value

        # Generate target signals
        tmp = self.Q_func_target(s_dash)         # Q(s', *)
        tmp = list(map(np.max, tmp.data.get()))  # max_a Q(s', a)
        max_Q_dash = np.asanyarray(tmp, dtype=np.float32)
        target = np.asanyarray(Q.data.get(), dtype=np.float32)

        for i in xrange(num_of_batch):
            if not episode_end[i][0]:
                tmp_ = np.sign(Reward[i]) + self.gamma * max_Q_dash[i]
            else:
                tmp_ = np.sign(Reward[i])
            action_index = self.action_to_index(action[i])
            target[i, action_index] = tmp_

        # TD-error clipping
        td = Variable(cuda.to_gpu(target)) - Q  # TD error
        td_tmp = td.data + 1000.0 * (abs(td.data) <= 1)  # avoid zero division
        td_clip = td * (abs(td.data) <= 1) + td / abs(td_tmp) * (abs(td.data) > 1)

        zero_val = Variable(cuda.to_gpu(
            np.zeros((num_of_batch, self.num_of_actions), dtype=np.float32)))
        loss = F.mean_squared_error(td_clip, zero_val)
        return loss, Q

    def _resnet_forward(self, model, state):
        # Walks the conv{stage}_{block}_{1..3} layers built in __init__,
        # adding the (projected) shortcut before each block's final ReLU.
        h = F.relu(model.conv1(state))
        h = F.max_pooling_2d(h, 3, stride=2)
        for stage, n_blocks in [(2, 3), (3, 8), (4, 36), (5, 3)]:
            for block in xrange(1, n_blocks + 1):
                if block == 1:
                    h_rem = getattr(model, 'conv%d_1_ex' % stage)(h)
                else:
                    h_rem = h
                h = F.relu(getattr(model, 'conv%d_%d_1' % (stage, block))(h))
                h = F.relu(getattr(model, 'conv%d_%d_2' % (stage, block))(h))
                h = getattr(model, 'conv%d_%d_3' % (stage, block))(h)
                h = F.relu(h + h_rem)
        h = F.average_pooling_2d(h, 7)
        return model.q_value(h)

    def Q_func(self, state):
        return self._resnet_forward(self.model, state)

    def Q_func_target(self, state):
        return self._resnet_forward(self.model_target, state)

    def e_greedy(self, state, epsilon):
        s = Variable(state)
        Q = self.Q_func(s)
        Q = Q.data
        if np.random.rand() < epsilon:
            index_action = np.random.randint(0, self.num_of_actions)
            print "RANDOM"
        else:
            index_action = np.argmax(Q.get())
            print "GREEDY"
        return self.index_to_action(index_action)

    def target_model_update(self):
        self.model_target = copy.deepcopy(self.model)

    def index_to_action(self, index_of_action):
        return self.enable_controller[index_of_action]

    def action_to_index(self, action):
        return self.enable_controller.index(action)
class DQN_class:
    # Hyper-Parameters
    gamma = 0.99  # Discount factor
    initial_exploration = 10**4  # Initial exploration. original: 5x10^4
    replay_size = 32  # Replay (batch) size
    target_model_update_freq = 10**4  # Target update frequency. original: 10^4
    data_size = 10**5  # Data size of history. original: 10^6

    def __init__(self, enable_controller=[0, 3, 4]):
        self.num_of_actions = len(enable_controller)
        self.enable_controller = enable_controller  # Default setting : "Pong"

        print "Initializing DQN..."
        # Initialization for Chainer 1.1.0 or older.
        # print "CUDA init"
        # cuda.init()

        print "Model Building"
        self.model = FunctionSet(
            l1=F.Convolution2D(4, 16, ksize=8, stride=4, wscale=np.sqrt(2)),
            l2=F.Convolution2D(16, 32, ksize=4, stride=2, wscale=np.sqrt(2)),
            l3=F.Linear(2592, 256),
            q_value=F.Linear(256, self.num_of_actions,
                             initialW=np.zeros((self.num_of_actions, 256), dtype=np.float32))
        ).to_gpu()

        print "Initializing Optimizer"
        self.optimizer = optimizers.RMSpropGraves(lr=0.0002, alpha=0.3, momentum=0.2)
        self.optimizer.setup(self.model.collect_parameters())

        # History Data : D=[s, a, r, s_dash, end_episode_flag]
        self.D = [np.zeros((self.data_size, 4, 84, 84), dtype=np.uint8),
                  np.zeros(self.data_size, dtype=np.uint8),
                  np.zeros((self.data_size, 1), dtype=np.int8),
                  np.zeros((self.data_size, 4, 84, 84), dtype=np.uint8),
                  np.zeros((self.data_size, 1), dtype=np.bool)]

    def forward(self, state, action, Reward, state_dash, episode_end):
        num_of_batch = state.shape[0]
        s = Variable(state)
        s_dash = Variable(state_dash)

        Q = self.Q_func(s)  # Get Q-value

        # Generate Target Signals
        max_Q_dash_ = self.Q_func(s_dash)
        tmp = list(map(np.max, max_Q_dash_.data.get()))
        max_Q_dash = np.asanyarray(tmp, dtype=np.float32)
        target = np.asanyarray(Q.data.get(), dtype=np.float32)

        for i in xrange(num_of_batch):
            if not episode_end[i][0]:
                tmp_ = np.sign(Reward[i]) + self.gamma * max_Q_dash[i]
            else:
                tmp_ = np.sign(Reward[i])
            target[i, self.action_to_index(action[i])] = tmp_

        loss = F.mean_squared_error(Variable(cuda.to_gpu(target)), Q)
        return loss, Q

    def stockExperience(self, time, state, action, reward, state_dash, episode_end_flag):
        data_index = time % self.data_size
        if episode_end_flag is True:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = reward
        else:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = reward
            self.D[3][data_index] = state_dash
        self.D[4][data_index] = episode_end_flag  # store the terminal flag in both cases

    def experienceReplay(self, time):
        if self.initial_exploration < time:
            # Pick up replay_size number of samples from the Data
            if time < self.data_size:  # during the first sweep of the History Data
                replay_index = np.random.randint(0, time, (self.replay_size, 1))
            else:
                replay_index = np.random.randint(0, self.data_size, (self.replay_size, 1))

            s_replay = np.ndarray(shape=(self.replay_size, 4, 84, 84), dtype=np.float32)
            a_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.uint8)
            r_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.float32)
            s_dash_replay = np.ndarray(shape=(self.replay_size, 4, 84, 84), dtype=np.float32)
            episode_end_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.bool)
            for i in xrange(self.replay_size):
                s_replay[i] = np.asarray(self.D[0][replay_index[i]], dtype=np.float32)
                a_replay[i] = self.D[1][replay_index[i]]
                r_replay[i] = self.D[2][replay_index[i]]
                s_dash_replay[i] = np.array(self.D[3][replay_index[i]], dtype=np.float32)
                episode_end_replay[i] = self.D[4][replay_index[i]]

            s_replay = cuda.to_gpu(s_replay)
            s_dash_replay = cuda.to_gpu(s_dash_replay)

            # Gradient-based update
            self.optimizer.zero_grads()
            loss, _ = self.forward(s_replay, a_replay, r_replay, s_dash_replay, episode_end_replay)
            loss.backward()
            self.optimizer.update()

    def Q_func(self, state):
        h1 = F.relu(self.model.l1(state / 254.0))  # scale inputs in [0.0, 1.0]
        h2 = F.relu(self.model.l2(h1))
        h3 = F.relu(self.model.l3(h2))
        Q = self.model.q_value(h3)
        return Q

    def e_greedy(self, state, epsilon):
        s = Variable(state)
        Q = self.Q_func(s)
        Q = Q.data

        if np.random.rand() < epsilon:
            index_action = np.random.randint(0, self.num_of_actions)
            print "RANDOM"
        else:
            index_action = np.argmax(Q.get())
            print "GREEDY"
        return self.index_to_action(index_action), Q

    def index_to_action(self, index_of_action):
        return self.enable_controller[index_of_action]

    def action_to_index(self, action):
        return self.enable_controller.index(action)
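# The forward() above regresses Q(s, .) toward a copy of itself in which only the taken action's
# entry is overwritten with sign(r) + gamma * max_a' Q(s', a'), so the untouched actions contribute
# zero error. A minimal NumPy-only sketch of that target construction (batch_q, batch_q_dash,
# actions, rewards, terminal are hypothetical stand-ins for the Variables used above):
import numpy as np

def build_q_targets(batch_q, batch_q_dash, actions, rewards, terminal, gamma=0.99):
    """DQN regression targets for a minibatch; shapes: (B, A), (B, A), (B,), (B,), (B,)."""
    target = batch_q.copy()                    # start from the current predictions
    max_q_dash = batch_q_dash.max(axis=1)      # max_a' Q(s', a')
    for i in range(batch_q.shape[0]):
        tmp = np.sign(rewards[i])              # reward clipping to {-1, 0, 1}, as in forward()
        if not terminal[i]:
            tmp += gamma * max_q_dash[i]       # bootstrap only for non-terminal s'
        target[i, actions[i]] = tmp            # only the taken action's entry changes
    return target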
class DQN_class:
    # Hyper-Parameters
    gamma = 0.99  # Discount factor
    initial_exploration = 100  # 10**4 in the base DQN; original: 5x10^4
    replay_size = 32  # Replay (batch) size
    target_model_update_freq = 10**4  # Target update frequency. original: 10^4
    data_size = 10**5  # Data size of history. original: 10^6

    def __init__(self, enable_controller=[0, 3, 4]):
        self.num_of_actions = len(enable_controller)
        self.enable_controller = enable_controller  # Default setting : "Pong"

        print "Initializing DQN..."
        print "Model Building"
        self.CNN_model = FunctionSet(
            l1=F.Convolution2D(4, 32, ksize=8, stride=4, nobias=False, wscale=np.sqrt(2)),
            l2=F.Convolution2D(32, 64, ksize=4, stride=2, nobias=False, wscale=np.sqrt(2)),
            l3=F.Convolution2D(64, 64, ksize=3, stride=1, nobias=False, wscale=np.sqrt(2)),
        )
        self.model = FunctionSet(
            l4=F.Linear(3136, 512, wscale=np.sqrt(2)),
            q_value=F.Linear(512, self.num_of_actions,
                             initialW=np.zeros((self.num_of_actions, 512), dtype=np.float32))
        ).to_gpu()

        d = 'elite/'
        self.CNN_model.l1.W.data = np.load(d+'l1_W.npy')  # .astype(np.float32)
        self.CNN_model.l1.b.data = np.load(d+'l1_b.npy')  # .astype(np.float32)
        self.CNN_model.l2.W.data = np.load(d+'l2_W.npy')  # .astype(np.float32)
        self.CNN_model.l2.b.data = np.load(d+'l2_b.npy')  # .astype(np.float32)
        self.CNN_model.l3.W.data = np.load(d+'l3_W.npy')  # .astype(np.float32)
        self.CNN_model.l3.b.data = np.load(d+'l3_b.npy')  # .astype(np.float32)

        self.CNN_model = self.CNN_model.to_gpu()
        self.CNN_model_target = copy.deepcopy(self.CNN_model)
        self.model_target = copy.deepcopy(self.model)

        print "Initializing Optimizer"
        self.optimizer = optimizers.RMSpropGraves(lr=0.00025, alpha=0.95, momentum=0.95, eps=0.0001)
        self.optimizer.setup(self.model.collect_parameters())

        # History Data : D=[s, a, r, s_dash, end_episode_flag]
        self.D = [np.zeros((self.data_size, 4, 84, 84), dtype=np.uint8),
                  np.zeros(self.data_size, dtype=np.uint8),
                  np.zeros((self.data_size, 1), dtype=np.int8),
                  np.zeros((self.data_size, 4, 84, 84), dtype=np.uint8),
                  np.zeros((self.data_size, 1), dtype=np.bool),
                  np.zeros((self.data_size, 1), dtype=np.uint8)]

    def forward(self, state, action, Reward, state_dash, episode_end):
        num_of_batch = state.shape[0]
        s = Variable(state)
        s_dash = Variable(state_dash)

        Q = self.Q_func(s)  # Get Q-value

        # Generate Target Signals
        tmp = self.Q_func_target(s_dash)  # Q(s',*)
        tmp = list(map(np.max, tmp.data.get()))  # max_a Q(s',a)
        max_Q_dash = np.asanyarray(tmp, dtype=np.float32)
        target = np.asanyarray(Q.data.get(), dtype=np.float32)

        for i in xrange(num_of_batch):
            if not episode_end[i][0]:
                tmp_ = np.sign(Reward[i]) + self.gamma * max_Q_dash[i]
            else:
                tmp_ = np.sign(Reward[i])
            action_index = self.action_to_index(action[i])
            target[i, action_index] = tmp_

        # TD-error clipping
        td = Variable(cuda.to_gpu(target)) - Q  # TD error
        td_tmp = td.data + 1000.0 * (abs(td.data) <= 1)  # Avoid zero division
        td_clip = td * (abs(td.data) <= 1) + td/abs(td_tmp) * (abs(td.data) > 1)

        zero_val = Variable(cuda.to_gpu(np.zeros((self.replay_size, self.num_of_actions), dtype=np.float32)))
        loss = F.mean_squared_error(td_clip, zero_val)
        return loss, Q

    def stockExperience(self, time, state, action, lstm_reward, state_dash, episode_end_flag, ale_reward):
        data_index = time % self.data_size
        if episode_end_flag is True:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = lstm_reward
            self.D[5][data_index] = ale_reward
        else:
            self.D[0][data_index] = state
            self.D[1][data_index] = action
            self.D[2][data_index] = lstm_reward
            self.D[3][data_index] = state_dash
            self.D[5][data_index] = ale_reward
        self.D[4][data_index] = episode_end_flag  # store the terminal flag in both cases

    def experienceReplay(self, time):
        if self.initial_exploration < time:
            # Pick up replay_size number of samples from the Data
            if time < self.data_size:  # during the first sweep of the History Data
                replay_index = np.random.randint(0, time, (self.replay_size, 1))
            else:
                replay_index = np.random.randint(0, self.data_size, (self.replay_size, 1))

            s_replay = np.ndarray(shape=(self.replay_size, 4, 84, 84), dtype=np.float32)
            a_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.uint8)
            r_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.float32)
            s_dash_replay = np.ndarray(shape=(self.replay_size, 4, 84, 84), dtype=np.float32)
            episode_end_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.bool)
            for i in xrange(self.replay_size):
                s_replay[i] = np.asarray(self.D[0][replay_index[i]], dtype=np.float32)
                a_replay[i] = self.D[1][replay_index[i]]
                r_replay[i] = self.D[2][replay_index[i]]
                s_dash_replay[i] = np.array(self.D[3][replay_index[i]], dtype=np.float32)
                episode_end_replay[i] = self.D[4][replay_index[i]]

            s_replay = cuda.to_gpu(s_replay)
            s_dash_replay = cuda.to_gpu(s_dash_replay)

            # Gradient-based update
            self.optimizer.zero_grads()
            loss, _ = self.forward(s_replay, a_replay, r_replay, s_dash_replay, episode_end_replay)
            loss.backward()
            self.optimizer.update()

    def Q_func(self, state):
        h1 = F.relu(self.CNN_model.l1(state / 254.0))  # scale inputs in [0.0, 1.0]
        h2 = F.relu(self.CNN_model.l2(h1))
        h3 = F.relu(self.CNN_model.l3(h2))
        h4 = F.relu(self.model.l4(h3))
        Q = self.model.q_value(h4)
        return Q

    def Q_func_LSTM(self, state):
        h1 = F.relu(self.CNN_model.l1(state / 254.0))  # scale inputs in [0.0, 1.0]
        h2 = F.relu(self.CNN_model.l2(h1))
        h3 = F.relu(self.CNN_model.l3(h2))
        return h3.data.get()

    def Q_func_target(self, state):
        h1 = F.relu(self.CNN_model_target.l1(state / 254.0))  # scale inputs in [0.0, 1.0]
        h2 = F.relu(self.CNN_model_target.l2(h1))
        h3 = F.relu(self.CNN_model_target.l3(h2))
        h4 = F.relu(self.model_target.l4(h3))  # fixed: use the target net's l4, not the online model's
        Q = self.model_target.q_value(h4)
        return Q

    def LSTM_reward(self, lstm_out, state_next):
        lstm_reward = np.sign((self.lstm_loss - (lstm_out - state_next)**2))
        return lstm_reward

    def e_greedy(self, state, epsilon):
        s = Variable(state)
        Q = self.Q_func(s)
        Q = Q.data

        if np.random.rand() < epsilon:
            index_action = np.random.randint(0, self.num_of_actions)
            print "RANDOM"
        else:
            index_action = np.argmax(Q.get())
            print "GREEDY"
        return self.index_to_action(index_action), Q

    def target_model_update(self):
        self.model_target = copy.deepcopy(self.model)

    def index_to_action(self, index_of_action):
        return self.enable_controller[index_of_action]

    def action_to_index(self, action):
        return self.enable_controller.index(action)
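# This variant clips the TD error before the squared loss: inside |td| <= 1 the error is kept,
# outside it is replaced by its sign. The "td_tmp = td.data + 1000.0 * (abs(td.data) <= 1)" line
# only shifts the in-band entries away from zero so that td/|td_tmp| never divides by zero.
# The same arithmetic on a plain array:
import numpy as np

def clip_td(td):
    """Identity inside [-1, 1], sign(td) outside."""
    inside = np.abs(td) <= 1
    td_tmp = td + 1000.0 * inside                            # in-band entries moved far from 0
    return td * inside + td / np.abs(td_tmp) * (~inside)     # td/|td| = sign(td) out of band

# Taking the mean squared error of the clipped TD against zeros afterwards makes large errors
# contribute only linearly to the gradient, similar in spirit to a Huber loss.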
# Neural net architecture
def forward(x_data, y_data, train=True):
    x, t = Variable(x_data), Variable(y_data)
    h1 = F.dropout(F.relu(model.l1(x)), train=train)
    h2 = F.dropout(F.relu(model.l2(h1)), train=train)
    y = model.l3(h2)
    # Multi-class classification, so derive the error with the
    # softmax cross-entropy loss
    return F.softmax_cross_entropy(y, t), F.accuracy(y, t)

# Setup optimizer
optimizer = optimizers.Adam()
optimizer.setup(model.collect_parameters())

train_loss = []
train_acc = []
test_loss = []
test_acc = []

l1_W = []
l2_W = []
l3_W = []

# Learning loop
for epoch in xrange(1, n_epoch + 1):
    print 'epoch', epoch

    # training
class QNet:
    # Hyper-Parameters
    gamma = 0.99  # Discount factor
    initial_exploration = 10**3  # Initial exploration. original: 5x10^4
    replay_size = 32  # Replay (batch) size
    target_model_update_freq = 10**4  # Target update frequency. original: 10^4
    data_size = 10**5  # Data size of history. original: 10^6
    hist_size = 4

    def __init__(self, use_gpu, num_of_action_type, num_of_pad, dim):
        self.use_gpu = use_gpu
        self.num_of_action_type = num_of_action_type
        self.num_of_pad = num_of_pad
        self.num_of_actions = num_of_action_type * num_of_pad
        self.dim = dim

        print("Initializing Q-Network...\n")

        self.q_net_filename = "q_net.pickle"
        if os.path.exists(self.q_net_filename):
            print("Loading Q-Network Model...\n")
            self.model = self.load_model()
        else:
            hidden_dim = 256
            self.model = FunctionSet(
                l4=F.Linear(self.dim * self.hist_size, hidden_dim, wscale=np.sqrt(2)),
                q_value=F.Linear(hidden_dim, self.num_of_actions,
                                 initialW=np.zeros((self.num_of_actions, hidden_dim), dtype=np.float32)))
        if self.use_gpu >= 0:
            self.model.to_gpu()

        self.model_target = copy.deepcopy(self.model)

        self.optimizer = optimizers.RMSpropGraves(lr=0.00025, alpha=0.95, momentum=0.95, eps=0.0001)
        self.optimizer.setup(self.model.collect_parameters())

        # History Data : D=[s, a, r, s_dash, end_episode_flag]
        self.d = [
            np.zeros((self.data_size, self.hist_size, self.dim), dtype=np.uint8),
            np.zeros((self.data_size, self.num_of_pad), dtype=np.uint8),
            np.zeros((self.data_size, 1), dtype=np.int8),
            np.zeros((self.data_size, self.hist_size, self.dim), dtype=np.uint8),
            np.zeros((self.data_size, 1), dtype=np.bool)
        ]

    def load_model(self):
        return pickle.load(open(self.q_net_filename, 'rb'))

    def dump_model(self):
        pickle.dump(self.model, open(self.q_net_filename, 'wb'))

    def forward(self, state, action, reward, state_dash, episode_end):
        num_of_batch = state.shape[0]
        s = Variable(state)
        s_dash = Variable(state_dash)

        q = self.q_func(s)  # Get Q-value

        # Generate Target Signals
        tmp = self.q_func_target(s_dash)  # Q(s',*)
        if self.use_gpu >= 0:
            tmp = list(map(np.max, tmp.data.get()))  # max_a Q(s',a)
        else:
            tmp = list(map(np.max, tmp.data))  # max_a Q(s',a)

        max_q_dash = np.asanyarray(tmp, dtype=np.float32)
        if self.use_gpu >= 0:
            target = np.asanyarray(q.data.get(), dtype=np.float32)
        else:
            # make new array
            target = np.array(q.data, dtype=np.float32)

        for i in xrange(num_of_batch):
            if not episode_end[i][0]:
                tmp_ = reward[i] + self.gamma * max_q_dash[i]
            else:
                tmp_ = reward[i]
            action_indexes = self.action_to_indexes(action[i])
            for index in action_indexes:
                target[i, index] = tmp_

        # TD-error clipping
        if self.use_gpu >= 0:
            target = cuda.to_gpu(target)
        td = Variable(target) - q  # TD error
        td_tmp = td.data + 1000.0 * (abs(td.data) <= 1)  # Avoid zero division
        td_clip = td * (abs(td.data) <= 1) + td / abs(td_tmp) * (abs(td.data) > 1)

        zero_val = np.zeros((self.replay_size, self.num_of_actions), dtype=np.float32)
        if self.use_gpu >= 0:
            zero_val = cuda.to_gpu(zero_val)
        zero_val = Variable(zero_val)
        loss = F.mean_squared_error(td_clip, zero_val)
        return loss, q

    def stock_experience(self, time, state, action, reward, state_dash, episode_end_flag):
        data_index = time % self.data_size
        if episode_end_flag is True:
            self.d[0][data_index] = state
            self.d[1][data_index] = action
            self.d[2][data_index] = reward
        else:
            self.d[0][data_index] = state
            self.d[1][data_index] = action
            self.d[2][data_index] = reward
            self.d[3][data_index] = state_dash
            self.d[4][data_index] = episode_end_flag

    def experience_replay(self, time):
        if self.initial_exploration < time:
            # Pick up replay_size number of samples from the Data
            if time < self.data_size:  # during the first sweep of the History Data
                replay_index = np.random.randint(0, time, (self.replay_size, 1))
            else:
                replay_index = np.random.randint(0, self.data_size, (self.replay_size, 1))

            s_replay = np.ndarray(shape=(self.replay_size, self.hist_size, self.dim), dtype=np.float32)
            a_replay = np.ndarray(shape=(self.replay_size, self.num_of_pad), dtype=np.uint8)
            r_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.float32)
            s_dash_replay = np.ndarray(shape=(self.replay_size, self.hist_size, self.dim), dtype=np.float32)
            episode_end_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.bool)
            for i in xrange(self.replay_size):
                s_replay[i] = np.asarray(self.d[0][replay_index[i]], dtype=np.float32)
                a_replay[i] = self.d[1][replay_index[i]]
                r_replay[i] = self.d[2][replay_index[i]]
                s_dash_replay[i] = np.array(self.d[3][replay_index[i]], dtype=np.float32)
                episode_end_replay[i] = self.d[4][replay_index[i]]

            if self.use_gpu >= 0:
                s_replay = cuda.to_gpu(s_replay)
                s_dash_replay = cuda.to_gpu(s_dash_replay)

            # Gradient-based update
            self.optimizer.zero_grads()
            loss, _ = self.forward(s_replay, a_replay, r_replay, s_dash_replay, episode_end_replay)
            loss.backward()
            self.optimizer.update()

    def q_func(self, state):
        h4 = F.relu(self.model.l4(state / 255.0))  # fixed: scale the input, matching q_func_target
        q = self.model.q_value(h4)
        return q

    def q_func_target(self, state):
        h4 = F.relu(self.model_target.l4(state / 255.0))
        q = self.model_target.q_value(h4)
        return q

    def e_greedy(self, state, epsilon):
        s = Variable(state)
        q = self.q_func(s)
        q = q.data

        if np.random.rand() < epsilon:
            print(" Random")
            action = [np.random.randint(0, self.num_of_action_type) for i in range(self.num_of_pad)]
        else:
            print("#Greedy")
            if self.use_gpu >= 0:
                action = self.indexes_to_action([np.argmax(sq) for sq in np.split(q.get()[0], self.num_of_pad)])
            else:
                action = self.indexes_to_action([np.argmax(sq) for sq in np.split(q[0], self.num_of_pad)])
        return action, q

    def target_model_update(self):
        self.model_target = copy.deepcopy(self.model)

    def indexes_to_action(self, indexes_of_action):
        return [index % self.num_of_action_type for index in indexes_of_action]

    def action_to_indexes(self, action):
        return [self.num_of_action_type * i + a for (i, a) in enumerate(action)]
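# stock_experience()/experience_replay() above implement the replay memory as preallocated arrays
# indexed by time % data_size: a ring buffer that overwrites the oldest transition once data_size
# entries have been written. The same idea in isolation (field shapes chosen only for illustration):
import numpy as np

class RingReplay(object):
    def __init__(self, capacity, dim):
        self.capacity = capacity
        self.s = np.zeros((capacity, dim), dtype=np.float32)
        self.a = np.zeros(capacity, dtype=np.uint8)
        self.r = np.zeros(capacity, dtype=np.float32)
        self.filled = 0                                       # number of slots holding valid data

    def store(self, time, s, a, r):
        i = time % self.capacity                              # wraps around: oldest entry overwritten
        self.s[i], self.a[i], self.r[i] = s, a, r
        self.filled = min(self.filled + 1, self.capacity)

    def sample(self, batch_size):
        idx = np.random.randint(0, self.filled, batch_size)   # only sample slots already written
        return self.s[idx], self.a[idx], self.r[idx]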
def Main(): import argparse import numpy as np from chainer import cuda, Variable, FunctionSet, optimizers import chainer.functions as F parser = argparse.ArgumentParser(description='Chainer example: regression') parser.add_argument('--gpu', '-g', default=-1, type=int, help='GPU ID (negative value indicates CPU)') args = parser.parse_args() batchsize = 10 n_epoch = NEpoch n_units = 300 #TEST # Prepare dataset data_x, data_y = LoadData() batchsize= max(1,min(batchsize, len(data_y)/20)) #TEST: adjust batchsize #dx2,dy2=GenData(300, noise=0.0); data_x.extend(dx2); data_y.extend(dy2) data = np.array(data_x).astype(np.float32) target = np.array(data_y).astype(np.int32) #DIFF_REG N= len(data) #batchsize * 30 x_train= data y_train= target #For test: mi,ma,me= GetStat(data_x) f_reduce=lambda xa:[xa[0],xa[1]] f_repair=lambda xa:[xa[0],xa[1]] nt= 20+1 N_test= nt*nt x_test= np.array(sum([[f_repair([x1,x2]) for x2 in FRange1(f_reduce(mi)[1],f_reduce(ma)[1],nt)] for x1 in FRange1(f_reduce(mi)[0],f_reduce(ma)[0],nt)],[])).astype(np.float32) y_test= np.array([0.0 for x in x_test]).astype(np.int32) #DIFF_REG #No true test data (just for plotting) print 'Num of samples for train:',len(y_train),'batchsize:',batchsize # Dump data for plot: DumpData('/tmp/nn/smpl_train.dat', x_train, [[y] for y in y_train], f_reduce) #DIFF_REG # Prepare multi-layer perceptron model model = FunctionSet(l1=F.Linear(2, n_units), l2=F.Linear(n_units, n_units), l3=F.Linear(n_units, 3)) #TEST: Random bias initialization #, bias=Rand() #model.l1.b[:]= [Rand() for k in range(n_units)] #model.l2.b[:]= [Rand() for k in range(n_units)] #model.l3.b[:]= [Rand() for k in range(1)] #print model.l2.__dict__ if args.gpu >= 0: cuda.init(args.gpu) model.to_gpu() # Neural net architecture def forward(x_data, y_data, train=True): #train= False #TEST: Turn off dropout dratio= 0.2 #0.5 #TEST: Dropout ratio x, t = Variable(x_data), Variable(y_data) h1 = F.dropout(F.relu(model.l1(x)), ratio=dratio, train=train) h2 = F.dropout(F.relu(model.l2(h1)), ratio=dratio, train=train) #h1 = F.dropout(F.leaky_relu(model.l1(x),slope=0.2), ratio=dratio, train=train) #h2 = F.dropout(F.leaky_relu(model.l2(h1),slope=0.2), ratio=dratio, train=train) #h1 = F.dropout(F.sigmoid(model.l1(x)), ratio=dratio, train=train) #h2 = F.dropout(F.sigmoid(model.l2(h1)), ratio=dratio, train=train) #h1 = F.dropout(F.tanh(model.l1(x)), ratio=dratio, train=train) #h2 = F.dropout(F.tanh(model.l2(h1)), ratio=dratio, train=train) #h1 = F.dropout(model.l1(x), ratio=dratio, train=train) #h2 = F.dropout(model.l2(h1), ratio=dratio, train=train) #h1 = F.relu(model.l1(x)) #h2 = F.relu(model.l2(h1)) #h1 = model.l1(x) #h2 = model.l2(h1) y = model.l3(h2) #return F.mean_squared_error(y, t), y return F.softmax_cross_entropy(y, t), F.softmax(y) #DIFF_REG # Setup optimizer optimizer = optimizers.AdaDelta(rho=0.9) #optimizer = optimizers.AdaGrad(lr=0.5) #optimizer = optimizers.RMSprop() #optimizer = optimizers.MomentumSGD() #optimizer = optimizers.SGD(lr=0.8) optimizer.setup(model.collect_parameters()) # Learning loop for epoch in xrange(1, n_epoch+1): print 'epoch', epoch # training perm = np.random.permutation(N) sum_loss = 0 for i in xrange(0, N, batchsize): x_batch = x_train[perm[i:i+batchsize]] y_batch = y_train[perm[i:i+batchsize]] if args.gpu >= 0: x_batch = cuda.to_gpu(x_batch) y_batch = cuda.to_gpu(y_batch) optimizer.zero_grads() loss, pred = forward(x_batch, y_batch) loss.backward() #Computing gradients optimizer.update() sum_loss += float(cuda.to_cpu(loss.data)) * batchsize print 'train mean 
loss={}'.format( sum_loss / N) if epoch%10==0: #''' # testing all data preds = [] x_batch = x_test[:] y_batch = y_test[:] if args.gpu >= 0: x_batch = cuda.to_gpu(x_batch) y_batch = cuda.to_gpu(y_batch) loss, pred = forward(x_batch, y_batch, train=False) preds = cuda.to_cpu(pred.data) sum_loss = float(cuda.to_cpu(loss.data)) * len(y_test) #''' print 'test mean loss={}'.format( sum_loss / N_test) # Dump data for plot: y_pred= [[y.index(max(y))]+y for y in preds.tolist()] #DIFF_REG DumpData('/tmp/nn/nn_test%04i.dat'%epoch, x_test, y_pred, f_reduce, lb=nt+1)
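# LoadData, GetStat, DumpData, NEpoch and FRange1 are project helpers that this snippet does not
# define. From its use above (an nt x nt grid of test points), FRange1(lo, hi, nt) plausibly returns
# nt evenly spaced values from lo to hi inclusive; a stand-in under that assumption:
def FRange1(lo, hi, nt):
    return [lo + (hi - lo) * i / float(nt - 1) for i in range(nt)]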
class QNet: # Hyper-Parameters gamma = 0.99 # Discount factor initial_exploration = 10**3 # Initial exploratoin. original: 5x10^4 replay_size = 32 # Replay (batch) size target_model_update_freq = 10**4 # Target update frequancy. original: 10^4 data_size = 10**5 # Data size of history. original: 10^6 hist_size = 1 # original: 4 def __init__(self, use_gpu, enable_controller, dim, epsilon, epsilon_delta, min_eps): self.use_gpu = use_gpu self.num_of_actions = len(enable_controller) self.enable_controller = enable_controller self.dim = dim self.epsilon = epsilon self.epsilon_delta = epsilon_delta self.min_eps = min_eps self.time = 0 app_logger.info("Initializing Q-Network...") hidden_dim = 256 self.model = FunctionSet( l4=F.Linear(self.dim*self.hist_size, hidden_dim, wscale=np.sqrt(2)), q_value=F.Linear(hidden_dim, self.num_of_actions, initialW=np.zeros((self.num_of_actions, hidden_dim), dtype=np.float32)) ) if self.use_gpu >= 0: self.model.to_gpu() self.model_target = copy.deepcopy(self.model) self.optimizer = optimizers.RMSpropGraves(lr=0.00025, alpha=0.95, momentum=0.95, eps=0.0001) self.optimizer.setup(self.model.collect_parameters()) # History Data : D=[s, a, r, s_dash, end_episode_flag] self.d = [np.zeros((self.data_size, self.hist_size, self.dim), dtype=np.uint8), np.zeros(self.data_size, dtype=np.uint8), np.zeros((self.data_size, 1), dtype=np.int8), np.zeros((self.data_size, self.hist_size, self.dim), dtype=np.uint8), np.zeros((self.data_size, 1), dtype=np.bool)] def forward(self, state, action, reward, state_dash, episode_end): num_of_batch = state.shape[0] s = Variable(state) s_dash = Variable(state_dash) q = self.q_func(s) # Get Q-value # Generate Target Signals tmp = self.q_func_target(s_dash) # Q(s',*) if self.use_gpu >= 0: tmp = list(map(np.max, tmp.data.get())) # max_a Q(s',a) else: tmp = list(map(np.max, tmp.data)) # max_a Q(s',a) max_q_dash = np.asanyarray(tmp, dtype=np.float32) if self.use_gpu >= 0: target = np.asanyarray(q.data.get(), dtype=np.float32) else: # make new array target = np.array(q.data, dtype=np.float32) for i in xrange(num_of_batch): if not episode_end[i][0]: tmp_ = reward[i] + self.gamma * max_q_dash[i] else: tmp_ = reward[i] action_index = self.action_to_index(action[i]) target[i, action_index] = tmp_ # TD-error clipping if self.use_gpu >= 0: target = cuda.to_gpu(target) td = Variable(target) - q # TD error td_tmp = td.data + 1000.0 * (abs(td.data) <= 1) # Avoid zero division td_clip = td * (abs(td.data) <= 1) + td/abs(td_tmp) * (abs(td.data) > 1) zero_val = np.zeros((self.replay_size, self.num_of_actions), dtype=np.float32) if self.use_gpu >= 0: zero_val = cuda.to_gpu(zero_val) zero_val = Variable(zero_val) loss = F.mean_squared_error(td_clip, zero_val) return loss, q def q_func(self, state): h4 = F.relu(self.model.l4(state / 255.0)) q = self.model.q_value(h4) return q def q_func_target(self, state): h4 = F.relu(self.model_target.l4(state / 255.0)) q = self.model_target.q_value(h4) return q def e_greedy(self, state, epsilon): s = Variable(state) q = self.q_func(s) q = q.data if np.random.rand() < epsilon: index_action = np.random.randint(0, self.num_of_actions) app_logger.info(" Random") else: if self.use_gpu >= 0: index_action = np.argmax(q.get()) else: index_action = np.argmax(q) app_logger.info("#Greedy") return self.index_to_action(index_action), q def target_model_update(self): self.model_target = copy.deepcopy(self.model) def index_to_action(self, index_of_action): return self.enable_controller[index_of_action] def action_to_index(self, action): return 
self.enable_controller.index(action) def start(self, feature): self.state = np.zeros((self.hist_size, self.dim), dtype=np.uint8) self.state[0] = feature state_ = np.asanyarray(self.state.reshape(1, self.hist_size, self.dim), dtype=np.float32) if self.use_gpu >= 0: state_ = cuda.to_gpu(state_) # Generate an Action e-greedy action, q_now = self.e_greedy(state_, self.epsilon) return_action = action return return_action def update_model(self, replayed_experience): if replayed_experience[0]: self.optimizer.zero_grads() loss, _ = self.forward(replayed_experience[1], replayed_experience[2], replayed_experience[3], replayed_experience[4], replayed_experience[5]) loss.backward() self.optimizer.update() # Target model update if replayed_experience[0] and np.mod(self.time, self.target_model_update_freq) == 0: app_logger.info("Model Updated") self.target_model_update() self.time += 1 app_logger.info("step: {}".format(self.time)) def step(self, features): if self.hist_size == 4: self.state = np.asanyarray([self.state[1], self.state[2], self.state[3], features], dtype=np.uint8) elif self.hist_size == 2: self.state = np.asanyarray([self.state[1], features], dtype=np.uint8) elif self.hist_size == 1: self.state = np.asanyarray([features], dtype=np.uint8) else: app_logger.error("self.DQN.hist_size err") state_ = np.asanyarray(self.state.reshape(1, self.hist_size, self.dim), dtype=np.float32) if self.use_gpu >= 0: state_ = cuda.to_gpu(state_) # Exploration decays along the time sequence if self.initial_exploration < self.time: self.epsilon -= self.epsilon_delta if self.epsilon < self.min_eps: self.epsilon = self.min_eps eps = self.epsilon else: # Initial Exploation Phase app_logger.info("Initial Exploration : {}/{} steps".format(self.time, self.initial_exploration)) eps = 1.0 # Generate an Action by e-greedy action selection action, q_now = self.e_greedy(state_, eps) if self.use_gpu >= 0: q_max = np.max(q_now.get()) else: q_max = np.max(q_now) return action, eps, q_max
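# step() above decays epsilon linearly by epsilon_delta per step once the initial exploration phase
# is over and floors it at min_eps; the concrete values are passed in by the caller and not shown
# here, so the defaults below are illustrative assumptions only:
def epsilon_at(t, initial_exploration=10**3, start_eps=1.0, epsilon_delta=1e-5, min_eps=0.1):
    """Linear epsilon schedule: flat at 1.0 while the replay memory fills, then decaying."""
    if t <= initial_exploration:
        return 1.0
    return max(min_eps, start_eps - epsilon_delta * (t - initial_exploration))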
class ConvolutionalDenoisingAutoencoder():
    def __init__(self, imgsize, n_in_channels, n_out_channels, ksize, stride=1, pad=0, use_cuda=False):
        self.model = FunctionSet(
            encode=F.Convolution2D(n_in_channels, n_out_channels, ksize, stride, pad),
            decode=F.Linear(int(n_out_channels * (math.floor((imgsize + 2 * pad - ksize) / stride) + 1)**2),
                            n_in_channels * imgsize**2))  # int(): math.floor returns a float on Python 2
        self.use_cuda = use_cuda

        if self.use_cuda:
            self.model.to_gpu()

        self.optimizer = optimizers.Adam()
        self.optimizer.setup(self.model.collect_parameters())

    def encode(self, x_var):
        return F.sigmoid(self.model.encode(x_var))

    def decode(self, x_var):
        return self.model.decode(x_var)

    def predict(self, x_data):
        if self.use_cuda:
            x_data = cuda.to_gpu(x_data)
        x = Variable(x_data)
        p = self.encode(x)
        if self.use_cuda:
            return cuda.to_cpu(p.data)
        else:
            return p.data

    def cost(self, x_data):
        x = Variable(x_data)
        t = Variable(x_data.reshape(x_data.shape[0], x_data.shape[1] * x_data.shape[2] * x_data.shape[3]))
        h = F.dropout(x)  # corrupt the input (denoising criterion)
        h = self.encode(h)
        y = self.decode(h)
        return F.mean_squared_error(y, t)

    def train(self, x_data):
        if self.use_cuda:
            x_data = cuda.to_gpu(x_data)
        self.optimizer.zero_grads()
        loss = self.cost(x_data)
        loss.backward()
        self.optimizer.update()
        if self.use_cuda:
            return float(cuda.to_cpu(loss.data))
        else:
            return loss.data

    def test(self, x_data):
        if self.use_cuda:
            x_data = cuda.to_gpu(x_data)
        loss = self.cost(x_data)
        return float(cuda.to_cpu(loss.data))
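# The decoder input size above is n_out_channels * (floor((imgsize + 2*pad - ksize)/stride) + 1)**2,
# i.e. the standard output-width formula for a square convolution, squared and multiplied by the
# number of feature maps. A small helper with a worked example:
def conv_out_size(imgsize, ksize, stride=1, pad=0):
    return (imgsize + 2 * pad - ksize) // stride + 1

assert conv_out_size(28, 5) == 24   # a 28x28 image, ksize=5, stride=1, pad=0 -> 24x24 feature maps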
# Prepare multi-layer perceptron model
model = FunctionSet(l1=F.Linear(784, n_units),
                    l2=F.Linear(n_units, n_units),
                    l3=F.Linear(n_units, 10))

# Neural net architecture
def forward(x_data, y_data, train=True):
    x, t = Variable(x_data), Variable(y_data)
    h1 = F.dropout(F.relu(model.l1(x)), train=train)
    h2 = F.dropout(F.relu(model.l2(h1)), train=train)
    y = model.l3(h2)
    return F.softmax_cross_entropy(y, t), F.accuracy(y, t)

# Setup optimizer
optimizer = optimizers.Adam()
optimizer.setup(model.collect_parameters())

# Learning loop
for epoch in xrange(1, n_epoch+1):
    print 'epoch', epoch

    # training
    perm = np.random.permutation(N)
    sum_accuracy = 0
    sum_loss = 0
    for i in xrange(0, N, batchsize):
        x_batch = x_train[perm[i:i+batchsize]]
        y_batch = y_train[perm[i:i+batchsize]]

        optimizer.zero_grads()
        loss, acc = forward(x_batch, y_batch)
        loss.backward()
        optimizer.update()

        sum_loss += float(loss.data) * batchsize
        sum_accuracy += float(acc.data) * batchsize
    print 'train mean loss={}, accuracy={}'.format(sum_loss / N, sum_accuracy / N)
class QNet: # Hyper-Parameters gamma = 0.99 # Discount factor initial_exploration = 10**3 # Initial exploratoin. original: 5x10^4 replay_size = 32 # Replay (batch) size target_model_update_freq = 10**4 # Target update frequancy. original: 10^4 data_size = 10**5 # Data size of history. original: 10^6 hist_size = 1 #original: 4 def __init__(self, use_gpu, enable_controller, dim): self.use_gpu = use_gpu self.num_of_actions = len(enable_controller) self.enable_controller = enable_controller self.dim = dim print("Initializing Q-Network...") #hidden_dim = 256 hidden_dim128 = 128 self.model = FunctionSet( l4=F.Linear(self.dim * self.hist_size, hidden_dim128, wscale=np.sqrt(2)), l5=F.Linear(self.dim * self.hist_size, hidden_dim128, wscale=np.sqrt(2)), l6=F.Linear(hidden_dim128, 1, wscale=np.sqrt(2), initialW=np.zeros((1, hidden_dim128), dtype=np.float32)), #V(s,a) l7=F.Linear(hidden_dim128, self.num_of_actions, wscale=np.sqrt(2), initialW=np.zeros((self.num_of_actions, hidden_dim128), dtype=np.float32)), #A(a) q_value=DN_out.DN_out(1, self.num_of_actions, self.num_of_actions, nobias=True)) if self.use_gpu >= 0: self.model.to_gpu() self.model_target = copy.deepcopy(self.model) self.optimizer = optimizers.RMSpropGraves(lr=0.00025, alpha=0.95, momentum=0.95, eps=0.0001) self.optimizer.setup(self.model.collect_parameters()) # History Data : D=[s, a, r, s_dash, end_episode_flag] self.d = [ np.zeros((self.data_size, self.hist_size, self.dim), dtype=np.uint8), np.zeros(self.data_size, dtype=np.uint8), np.zeros((self.data_size, 1), dtype=np.int8), np.zeros((self.data_size, self.hist_size, self.dim), dtype=np.uint8), np.zeros((self.data_size, 1), dtype=np.bool) ] def forward(self, state, action, reward, state_dash, episode_end): num_of_batch = state.shape[0] s = Variable(state) s_dash = Variable(state_dash) q = self.q_func(s) # Get Q-value # Generate Target Signals tmp = self.q_func_target(s_dash) # Q(s',*) if self.use_gpu >= 0: tmp = list(map(np.max, tmp.data.get())) # max_a Q(s',a) else: tmp = list(map(np.max, tmp.data)) # max_a Q(s',a) max_q_dash = np.asanyarray(tmp, dtype=np.float32) if self.use_gpu >= 0: target = np.asanyarray(q.data.get(), dtype=np.float32) else: # make new array target = np.array(q.data, dtype=np.float32) for i in xrange(num_of_batch): if not episode_end[i][0]: tmp_ = reward[i] + self.gamma * max_q_dash[i] else: tmp_ = reward[i] action_index = self.action_to_index(action[i]) target[i, action_index] = tmp_ # TD-error clipping if self.use_gpu >= 0: target = cuda.to_gpu(target) td = Variable(target) - q # TD error td_tmp = td.data + 1000.0 * (abs(td.data) <= 1) # Avoid zero division td_clip = td * (abs(td.data) <= 1) + td / abs(td_tmp) * (abs(td.data) > 1) zero_val = np.zeros((self.replay_size, self.num_of_actions), dtype=np.float32) if self.use_gpu >= 0: zero_val = cuda.to_gpu(zero_val) zero_val = Variable(zero_val) loss = F.mean_squared_error(td_clip, zero_val) return loss, q def stock_experience(self, time, state, action, reward, state_dash, episode_end_flag): data_index = time % self.data_size if episode_end_flag is True: self.d[0][data_index] = state self.d[1][data_index] = action self.d[2][data_index] = reward else: self.d[0][data_index] = state self.d[1][data_index] = action self.d[2][data_index] = reward self.d[3][data_index] = state_dash self.d[4][data_index] = episode_end_flag def experience_replay(self, time): if self.initial_exploration < time: # Pick up replay_size number of samples from the Data if time < self.data_size: # during the first sweep of the History Data 
replay_index = np.random.randint(0, time, (self.replay_size, 1)) else: replay_index = np.random.randint(0, self.data_size, (self.replay_size, 1)) s_replay = np.ndarray(shape=(self.replay_size, self.hist_size, self.dim), dtype=np.float32) a_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.uint8) r_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.float32) s_dash_replay = np.ndarray(shape=(self.replay_size, self.hist_size, self.dim), dtype=np.float32) episode_end_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.bool) for i in xrange(self.replay_size): s_replay[i] = np.asarray(self.d[0][replay_index[i]], dtype=np.float32) a_replay[i] = self.d[1][replay_index[i]] r_replay[i] = self.d[2][replay_index[i]] s_dash_replay[i] = np.array(self.d[3][replay_index[i]], dtype=np.float32) episode_end_replay[i] = self.d[4][replay_index[i]] if self.use_gpu >= 0: s_replay = cuda.to_gpu(s_replay) s_dash_replay = cuda.to_gpu(s_dash_replay) # Gradient-based update self.optimizer.zero_grads() loss, _ = self.forward(s_replay, a_replay, r_replay, s_dash_replay, episode_end_replay) loss.backward() self.optimizer.update() def q_func(self, state): h4 = F.relu(self.model.l4(state / 255.0)) h5 = F.relu(self.model.l5(state / 255.0)) #h6 = F.relu(self.model.l6(h4)) #h7 = relu_l7.relu(self.model.l7(h5)) h6 = self.model.l6(h4) h7 = self.model.l7(h5) q = self.model.q_value(h6, h7) return q def q_func_target(self, state): #h4 = F.relu(self.model_target.l4(state / 255.0)) #q = self.model_target.q_value(h4) h4 = F.relu(self.model_target.l4(state / 255.0)) h5 = F.relu(self.model_target.l5(state / 255.0)) #h6 = F.relu(self.model_target.l6(h4)) #h7 = relu_l7.relu(self.model_target.l7(h5)) h6 = self.model_target.l6(h4) h7 = self.model_target.l7(h5) q = self.model_target.q_value(h6, h7) return q def e_greedy(self, state, epsilon): s = Variable(state) q = self.q_func(s) q = q.data if np.random.rand() < epsilon: index_action = np.random.randint(0, self.num_of_actions) print(" Random"), else: if self.use_gpu >= 0: index_action = np.argmax(q.get()) else: index_action = np.argmax(q) print("#Greedy"), return self.index_to_action(index_action), q def target_model_update(self): self.model_target = copy.deepcopy(self.model) def index_to_action(self, index_of_action): return self.enable_controller[index_of_action] def action_to_index(self, action): return self.enable_controller.index(action)
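# DN_out above is a custom output layer that this snippet does not define. Given that l6 produces a
# single value per state and l7 one value per action, it presumably combines a state value V(s) with
# advantages A(s, a) in the dueling-network style. One common aggregation, shown purely as an
# assumption about what DN_out computes:
import numpy as np

def dueling_q(v, a):
    """V(s) with shape (B, 1) plus mean-centred advantages A(s, a) with shape (B, num_actions)."""
    return v + a - a.mean(axis=1, keepdims=True)   # Q(s,a) = V(s) + (A(s,a) - mean_a A(s,a))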
def Main(): import argparse import numpy as np from sklearn.datasets import load_diabetes from chainer import cuda, Variable, FunctionSet, optimizers import chainer.functions as F parser = argparse.ArgumentParser(description='Chainer example: regression') parser.add_argument('--gpu', '-g', default=-1, type=int, help='GPU ID (negative value indicates CPU)') args = parser.parse_args() batchsize = 13 n_epoch = 100 n_units = 30 # Prepare dataset print 'fetch diabetes dataset' diabetes = load_diabetes() data = diabetes['data'].astype(np.float32) target = diabetes['target'].astype(np.float32).reshape( len(diabetes['target']), 1) N = batchsize * 30 #Number of training data x_train, x_test = np.split(data, [N]) y_train, y_test = np.split(target, [N]) N_test = y_test.size print 'Num of samples for train:', len(y_train) print 'Num of samples for test:', len(y_test) # Dump data for plot: fp1 = file('/tmp/smpl_train.dat', 'w') for x, y in zip(x_train, y_train): fp1.write('%s #%i# %s\n' % (' '.join(map(str, x)), len(x) + 1, ' '.join(map(str, y)))) fp1.close() # Dump data for plot: fp1 = file('/tmp/smpl_test.dat', 'w') for x, y in zip(x_test, y_test): fp1.write('%s #%i# %s\n' % (' '.join(map(str, x)), len(x) + 1, ' '.join(map(str, y)))) fp1.close() # Prepare multi-layer perceptron model model = FunctionSet(l1=F.Linear(10, n_units), l2=F.Linear(n_units, n_units), l3=F.Linear(n_units, 1)) if args.gpu >= 0: cuda.init(args.gpu) model.to_gpu() # Neural net architecture def forward(x_data, y_data, train=True): x, t = Variable(x_data), Variable(y_data) h1 = F.dropout(F.relu(model.l1(x)), train=train) h2 = F.dropout(F.relu(model.l2(h1)), train=train) y = model.l3(h2) return F.mean_squared_error(y, t), y # Setup optimizer optimizer = optimizers.AdaDelta(rho=0.9) optimizer.setup(model.collect_parameters()) # Learning loop for epoch in xrange(1, n_epoch + 1): print 'epoch', epoch # training perm = np.random.permutation(N) sum_loss = 0 for i in xrange(0, N, batchsize): x_batch = x_train[perm[i:i + batchsize]] y_batch = y_train[perm[i:i + batchsize]] if args.gpu >= 0: x_batch = cuda.to_gpu(x_batch) y_batch = cuda.to_gpu(y_batch) optimizer.zero_grads() loss, pred = forward(x_batch, y_batch) loss.backward() optimizer.update() sum_loss += float(cuda.to_cpu(loss.data)) * batchsize print 'train mean loss={}'.format(sum_loss / N) ''' # testing per batch sum_loss = 0 preds = [] for i in xrange(0, N_test, batchsize): x_batch = x_test[i:i+batchsize] y_batch = y_test[i:i+batchsize] if args.gpu >= 0: x_batch = cuda.to_gpu(x_batch) y_batch = cuda.to_gpu(y_batch) loss, pred = forward(x_batch, y_batch, train=False) preds.extend(cuda.to_cpu(pred.data)) sum_loss += float(cuda.to_cpu(loss.data)) * batchsize pearson = np.corrcoef(np.asarray(preds).reshape(len(preds),), np.asarray(y_test).reshape(len(preds),)) #''' #''' # testing all data preds = [] x_batch = x_test[:] y_batch = y_test[:] if args.gpu >= 0: x_batch = cuda.to_gpu(x_batch) y_batch = cuda.to_gpu(y_batch) loss, pred = forward(x_batch, y_batch, train=False) preds = cuda.to_cpu(pred.data) sum_loss = float(cuda.to_cpu(loss.data)) * len(y_test) pearson = np.corrcoef( np.asarray(preds).reshape(len(preds), ), np.asarray(y_test).reshape(len(preds), )) #''' print 'test mean loss={}, corrcoef={}'.format(sum_loss / N_test, pearson[0][1]) # Dump data for plot: fp1 = file('/tmp/nn_test%04i.dat' % epoch, 'w') for x, y in zip(x_test, preds): fp1.write( '%s #%i# %s\n' % (' '.join(map(str, x)), len(x) + 1, ' '.join(map(str, y)))) fp1.close()
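# np.corrcoef used above returns the full 2x2 correlation matrix, so pearson[0][1] picks out the
# off-diagonal entry: the Pearson r between predictions and targets. A toy illustration:
import numpy as np

preds = np.array([1.0, 2.0, 3.0])       # hypothetical predictions
targets = np.array([1.1, 1.9, 3.2])     # hypothetical ground truth
r = np.corrcoef(preds, targets)[0][1]   # close to 1.0 for these nearly collinear values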
da3 = FunctionSet(  # the variable name follows the later references to da3.l1 etc.
    l1=da1.model.encoder,
    l2=da2.model.encoder,
    l3=F.Linear(49, 10),
)

# Define the forward computation
def forward(x_data, y_data, train=True):
    x, t = Variable(x_data), Variable(y_data)
    h1 = F.dropout(F.sigmoid(da3.l1(x)), train=train)
    h2 = F.dropout(F.sigmoid(da3.l2(h1)), train=train)
    y = da3.l3(h2)
    return F.softmax_cross_entropy(y, t), F.accuracy(y, t)

# Define the optimizer
optimizer = optimizers.Adam()
optimizer.setup(da3.collect_parameters())

# Training
n_epoch = 200
all_loss_accuracy = []
for epoch in xrange(n_epoch):
    print 'epoch', epoch
    indexes = np.random.permutation(N)
    loss_accuracy = []
    sum_loss, sum_accuracy = 0, 0
    for i in xrange(0, N, batchsize):
        x_batch = x_train[indexes[i:i+batchsize]]
        y_batch = y_train[indexes[i:i+batchsize]]
        optimizer.zero_grads()
        loss, accuracy = forward(x_batch, y_batch)
        loss.backward()
        optimizer.update()
class QNet: # Hyper-Parameters gamma = 0.99 # Discount factor initial_exploration = 10**3 # Initial exploratoin. original: 5x10^4 replay_size = 32 # Replay (batch) size target_model_update_freq = 10**4 # Target update frequancy. original: 10^4 data_size = 10**5 # Data size of history. original: 10^6 hist_size = 1 #original: 4 def __init__(self, use_gpu, enable_controller, dim): self.use_gpu = use_gpu self.num_of_actions = len(enable_controller) self.enable_controller = enable_controller self.dim = dim print("Initializing Q-Network...") hidden_dim = 256 self.model = FunctionSet( l4=F.Linear(self.dim*self.hist_size, hidden_dim, wscale=np.sqrt(2)), q_value=F.Linear(hidden_dim, self.num_of_actions, initialW=np.zeros((self.num_of_actions, hidden_dim), dtype=np.float32)) ) if self.use_gpu >= 0: self.model.to_gpu() self.model_target = copy.deepcopy(self.model) self.optimizer = optimizers.RMSpropGraves(lr=0.00025, alpha=0.95, momentum=0.95, eps=0.0001) self.optimizer.setup(self.model.collect_parameters()) # History Data : D=[s, a, r, s_dash, end_episode_flag] self.d = [np.zeros((self.data_size, self.hist_size, self.dim), dtype=np.uint8), np.zeros(self.data_size, dtype=np.uint8), np.zeros((self.data_size, 1), dtype=np.int8), np.zeros((self.data_size, self.hist_size, self.dim), dtype=np.uint8), np.zeros((self.data_size, 1), dtype=np.bool)] def forward(self, state, action, reward, state_dash, episode_end): num_of_batch = state.shape[0] s = Variable(state) s_dash = Variable(state_dash) q = self.q_func(s) # Get Q-value # Generate Target Signals tmp = self.q_func_target(s_dash) # Q(s',*) if self.use_gpu >= 0: tmp = list(map(np.max, tmp.data.get())) # max_a Q(s',a) else: tmp = list(map(np.max, tmp.data)) # max_a Q(s',a) max_q_dash = np.asanyarray(tmp, dtype=np.float32) if self.use_gpu >= 0: target = np.asanyarray(q.data.get(), dtype=np.float32) else: # make new array target = np.array(q.data, dtype=np.float32) for i in xrange(num_of_batch): if not episode_end[i][0]: tmp_ = reward[i] + self.gamma * max_q_dash[i] else: tmp_ = reward[i] action_index = self.action_to_index(action[i]) target[i, action_index] = tmp_ # TD-error clipping if self.use_gpu >= 0: target = cuda.to_gpu(target) td = Variable(target) - q # TD error td_tmp = td.data + 1000.0 * (abs(td.data) <= 1) # Avoid zero division td_clip = td * (abs(td.data) <= 1) + td/abs(td_tmp) * (abs(td.data) > 1) zero_val = np.zeros((self.replay_size, self.num_of_actions), dtype=np.float32) if self.use_gpu >= 0: zero_val = cuda.to_gpu(zero_val) zero_val = Variable(zero_val) loss = F.mean_squared_error(td_clip, zero_val) return loss, q def stock_experience(self, time, state, action, reward, state_dash, episode_end_flag): data_index = time % self.data_size if episode_end_flag is True: self.d[0][data_index] = state self.d[1][data_index] = action self.d[2][data_index] = reward else: self.d[0][data_index] = state self.d[1][data_index] = action self.d[2][data_index] = reward self.d[3][data_index] = state_dash self.d[4][data_index] = episode_end_flag def experience_replay(self, time): if self.initial_exploration < time: # Pick up replay_size number of samples from the Data if time < self.data_size: # during the first sweep of the History Data replay_index = np.random.randint(0, time, (self.replay_size, 1)) else: replay_index = np.random.randint(0, self.data_size, (self.replay_size, 1)) s_replay = np.ndarray(shape=(self.replay_size, self.hist_size, self.dim), dtype=np.float32) a_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.uint8) r_replay = 
np.ndarray(shape=(self.replay_size, 1), dtype=np.float32) s_dash_replay = np.ndarray(shape=(self.replay_size, self.hist_size, self.dim), dtype=np.float32) episode_end_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.bool) for i in xrange(self.replay_size): s_replay[i] = np.asarray(self.d[0][replay_index[i]], dtype=np.float32) a_replay[i] = self.d[1][replay_index[i]] r_replay[i] = self.d[2][replay_index[i]] s_dash_replay[i] = np.array(self.d[3][replay_index[i]], dtype=np.float32) episode_end_replay[i] = self.d[4][replay_index[i]] if self.use_gpu >= 0: s_replay = cuda.to_gpu(s_replay) s_dash_replay = cuda.to_gpu(s_dash_replay) # Gradient-based update self.optimizer.zero_grads() loss, _ = self.forward(s_replay, a_replay, r_replay, s_dash_replay, episode_end_replay) loss.backward() self.optimizer.update() def q_func(self, state): h4 = F.relu(self.model.l4(state / 255.0)) q = self.model.q_value(h4) return q def q_func_target(self, state): h4 = F.relu(self.model_target.l4(state / 255.0)) q = self.model_target.q_value(h4) return q def e_greedy(self, state, epsilon): s = Variable(state) q = self.q_func(s) q = q.data if np.random.rand() < epsilon: index_action = np.random.randint(0, self.num_of_actions) print(" Random"), else: if self.use_gpu >= 0: index_action = np.argmax(q.get()) else: index_action = np.argmax(q) print("#Greedy"), return self.index_to_action(index_action), q def target_model_update(self): self.model_target = copy.deepcopy(self.model) def index_to_action(self, index_of_action): return self.enable_controller[index_of_action] def action_to_index(self, action): return self.enable_controller.index(action)
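# e_greedy() above draws a uniformly random action with probability epsilon and the argmax of the
# Q-values otherwise. The same policy as a standalone function over one row of Q-values:
import numpy as np

def epsilon_greedy(q, epsilon):
    """q is a 1-D array of Q-values; returns an action index."""
    if np.random.rand() < epsilon:
        return np.random.randint(0, q.shape[0])   # explore
    return int(np.argmax(q))                      # exploit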
class QNet:
    # Hyper-Parameters
    gamma = 0.99  # Discount factor (values tried: 0.99, 0.39)
    initial_exploration = 10**4  # Initial exploration. original: 5x10^4
    replay_size = 32  # Replay (batch) size
    target_model_update_freq = 10**4  # Target update frequency. original: 10^4
    data_size = 10**5  # Data size of history. original: 10^6
    hist_size = 1  # original: 4

    def __init__(self, use_gpu, enable_controller, dim):
        self.use_gpu = use_gpu
        self.num_of_actions = len(enable_controller)
        self.enable_controller = enable_controller
        self.dim = dim

        print("Initializing Q-Network...")

        hidden_dim = 256  # values tried: 256, 128
        self.model = FunctionSet(
            l4=F.Linear(self.dim * self.hist_size, hidden_dim, wscale=np.sqrt(2)),  # also tried without wscale
            q_value=F.Linear(hidden_dim, self.num_of_actions,
                             initialW=np.zeros((self.num_of_actions, hidden_dim), dtype=np.float32)))
        if self.use_gpu >= 0:
            self.model.to_gpu()

        self.model_target = copy.deepcopy(self.model)

        # self.optimizer = optimizers.Adam(alpha=0.001, beta1=0.9, beta2=0.999, eps=1e-08)  # alpha tried: 0.0015, 0.0125, 0.005, 0.001
        self.optimizer = optimizers.RMSpropGraves(lr=0.00025, alpha=0.95, momentum=0.95, eps=0.0001)
        self.optimizer.setup(self.model.collect_parameters())

        # History Data : D=[s, a, r, s_dash, end_episode_flag]; rewards stored as float32 (int8 in other variants)
        self.d = [
            np.zeros((self.data_size, self.hist_size, self.dim), dtype=np.uint8),
            np.zeros(self.data_size, dtype=np.uint8),
            np.zeros((self.data_size, 1), dtype=np.float32),
            np.zeros((self.data_size, self.hist_size, self.dim), dtype=np.uint8),
            np.zeros((self.data_size, 1), dtype=np.bool)
        ]

    def forward(self, state, action, reward, state_dash, episode_end):
        num_of_batch = state.shape[0]  # equals replay_size (=32) when called from experience_replay
        s = Variable(state)
        s_dash = Variable(state_dash)

        q = self.q_func(s)  # Get Q-value

        # Generate Target Signals
        tmp = self.q_func_target(s_dash)  # Q(s',*)
        if self.use_gpu >= 0:
            tmp = list(map(np.max, tmp.data.get()))  # max_a Q(s',a)
        else:
            tmp = list(map(np.max, tmp.data))  # max_a Q(s',a)

        max_q_dash = np.asanyarray(tmp, dtype=np.float32)
        if self.use_gpu >= 0:
            target = np.asanyarray(q.data.get(), dtype=np.float32)
        else:
            # make new array
            target = np.array(q.data, dtype=np.float32)

        for i in xrange(num_of_batch):
            if not episode_end[i][0]:
                tmp_ = reward[i] + self.gamma * max_q_dash[i]
            else:
                tmp_ = reward[i]
            action_index = self.action_to_index(action[i])
            target[i, action_index] = tmp_

        # TD-error clipping
        if self.use_gpu >= 0:
            target = cuda.to_gpu(target)
        td = Variable(target) - q  # TD error
        td_tmp = td.data + 1000.0 * (abs(td.data) <= 1)  # Avoid zero division
        td_clip = td * (abs(td.data) <= 1) + td / abs(td_tmp) * (abs(td.data) > 1)

        zero_val = np.zeros((self.replay_size, self.num_of_actions), dtype=np.float32)
        if self.use_gpu >= 0:
            zero_val = cuda.to_gpu(zero_val)
        zero_val = Variable(zero_val)
        loss = F.mean_squared_error(td_clip, zero_val)
        return loss, q

    def stock_experience(self, time, state, action, reward, state_dash, episode_end_flag):
        self.data_index = time % self.data_size  # ring-buffer index
        if episode_end_flag is True:
            self.d[0][self.data_index] = state
            self.d[1][self.data_index] = action
            self.d[2][self.data_index] = reward
        else:
            self.d[0][self.data_index] = state
            self.d[1][self.data_index] = action
            self.d[2][self.data_index] = reward
            self.d[3][self.data_index] = state_dash
            self.d[4][self.data_index] = episode_end_flag

    def experience_replay(self, time):
        if self.initial_exploration < time:
            # Pick up replay_size number of samples from the Data
            if time < self.data_size:  # during the first sweep of the History Data
                replay_index = np.random.randint(0, time, (self.replay_size, 1))
            else:
                replay_index = np.random.randint(0, self.data_size, (self.replay_size, 1))

            s_replay = np.ndarray(shape=(self.replay_size, self.hist_size, self.dim), dtype=np.float32)
            a_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.uint8)
            r_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.float32)
            s_dash_replay = np.ndarray(shape=(self.replay_size, self.hist_size, self.dim), dtype=np.float32)
            episode_end_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.bool)
            for i in xrange(self.replay_size):
                s_replay[i] = np.asarray(self.d[0][replay_index[i]], dtype=np.float32)
                a_replay[i] = self.d[1][replay_index[i]]
                r_replay[i] = self.d[2][replay_index[i]]
                # disabled experiment: penalise non-rewarding transitions
                # if not (r_replay[i] == 1.):
                #     r_replay[i] = -3.
                s_dash_replay[i] = np.array(self.d[3][replay_index[i]], dtype=np.float32)
                episode_end_replay[i] = self.d[4][replay_index[i]]

            if self.use_gpu >= 0:
                s_replay = cuda.to_gpu(s_replay)
                s_dash_replay = cuda.to_gpu(s_dash_replay)

            # Gradient-based update
            self.optimizer.zero_grads()
            loss, _ = self.forward(s_replay, a_replay, r_replay, s_dash_replay, episode_end_replay)
            loss.backward()
            self.optimizer.update()

    def q_func(self, state):
        h4 = F.relu(self.model.l4(state / 255.0))  # also tried F.leaky_relu
        # dp4 = F.dropout(h4, ratio=0.4, train=True)  # disabled dropout experiment (ratio tried: 0.3, 0.4)
        q = self.model.q_value(h4)
        return q

    def q_func_target(self, state):
        h4 = F.relu(self.model_target.l4(state / 255.0))  # also tried F.leaky_relu
        # dp4 = F.dropout(h4, ratio=0.3, train=True)  # disabled dropout experiment
        q = self.model_target.q_value(h4)
        return q

    def e_greedy(self, state, epsilon):  # eps is 1.0 during the initial exploration phase
        s = Variable(state)
        q = self.q_func(s)
        q = q.data

        if np.random.rand() < epsilon:
            index_action = np.random.randint(0, self.num_of_actions)
            print(" Random"),
        else:
            if self.use_gpu >= 0:
                index_action = np.argmax(q.get())
            else:
                index_action = np.argmax(q)
            print("#Greedy"),
        return self.index_to_action(index_action), q

    def target_model_update(self):
        self.model_target = copy.deepcopy(self.model)

    def index_to_action(self, index_of_action):
        return self.enable_controller[index_of_action]  # index_of_action comes from np.argmax(q)

    def action_to_index(self, action):
        return self.enable_controller.index(action)
class DN_class:
    # Hyper-Parameters
    gamma = 0.99                      # Discount factor
    initial_exploration = 100         # Initial exploration steps. original: 5x10^4
    replay_size = 32                  # Replay (batch) size
    target_model_update_freq = 10**4  # Target update frequency. original: 10^4
    data_size = 10**5                 # Data size of history. original: 10^6

    def __init__(self, enable_controller=[0, 1, 3, 4]):
        self.num_of_actions = len(enable_controller)
        self.enable_controller = enable_controller  # Default setting : "Breakout"

        print "Initializing DN..."
        # Initialization for Chainer 1.1.0 or older:
        # print "CUDA init"
        # cuda.init()

        print "Model Building"
        self.model = FunctionSet(
            l1=F.Convolution2D(4, 32, ksize=8, stride=4, nobias=False, wscale=np.sqrt(2)),
            l2=F.Convolution2D(32, 64, ksize=4, stride=2, nobias=False, wscale=np.sqrt(2)),
            l3=F.Convolution2D(64, 64, ksize=3, stride=1, nobias=False, wscale=np.sqrt(2)),
            l4=F.Linear(3136, 256, wscale=np.sqrt(2)),  # state-value stream
            l5=F.Linear(3136, 256, wscale=np.sqrt(2)),  # advantage stream
            l6=F.Linear(256, 1, initialW=np.zeros((1, 256), dtype=np.float32)),
            l7=F.Linear(256, self.num_of_actions,
                        initialW=np.zeros((self.num_of_actions, 256), dtype=np.float32)),
            q_value=DN_out.DN_out(1, self.num_of_actions, self.num_of_actions, nobias=True)
        ).to_gpu()

        if args.resumemodel:  # load a saved model
            serializers.load_npz(args.resumemodel, self.model)
            print "load model from %s" % args.resumemodel

        self.model_target = copy.deepcopy(self.model)

        print "Initializing Optimizer"
        self.optimizer = optimizers.RMSpropGraves(lr=0.00025, alpha=0.95, momentum=0.95, eps=0.0001)
        self.optimizer.setup(self.model.collect_parameters())

        # History Data : D=[s, a, r, s_dash, end_episode_flag]
        if args.resumeD1 and args.resumeD2:  # load saved D1 and D2
            npz_tmp1 = np.load(args.resumeD1)
            print "finished loading half of D data"
            npz_tmp2 = np.load(args.resumeD2)
            self.D = [npz_tmp1['D0'], npz_tmp1['D1'], npz_tmp1['D2'],
                      npz_tmp2['D3'], npz_tmp2['D4']]
            npz_tmp1.close()
            npz_tmp2.close()
            print "loaded all stored D data"
        else:
            self.D = [np.zeros((self.data_size, 4, 84, 84), dtype=np.uint8),
                      np.zeros(self.data_size, dtype=np.uint8),
                      np.zeros((self.data_size, 1), dtype=np.int8),
                      np.zeros((self.data_size, 4, 84, 84), dtype=np.uint8),
                      np.zeros((self.data_size, 1), dtype=np.bool)]
            print "initialized D data"

    def forward(self, state, action, Reward, state_dash, episode_end):
        num_of_batch = state.shape[0]
        s = Variable(state)
        s_dash = Variable(state_dash)

        Q = self.Q_func(s)  # Get Q-value

        # Generate target signals (Double DQN style): the online network
        # selects the next action, the target network evaluates it.
        tmp2 = self.Q_func(s_dash)
        tmp2 = list(map(np.argmax, tmp2.data.get()))  # argmax_a Q(s',a)
        tmp = self.Q_func_target(s_dash)              # Q'(s',*)
        tmp = list(tmp.data.get())
        res1 = []
        for i in range(num_of_batch):
            res1.append(tmp[i][tmp2[i]])              # Q'(s', argmax_a Q(s',a))
        max_Q_dash = np.asanyarray(res1, dtype=np.float32)
        target = np.asanyarray(Q.data.get(), dtype=np.float32)

        for i in xrange(num_of_batch):
            if not episode_end[i][0]:
                tmp_ = np.sign(Reward[i]) + self.gamma * max_Q_dash[i]
            else:
                tmp_ = np.sign(Reward[i])
            action_index = self.action_to_index(action[i])
            target[i, action_index] = tmp_

        # TD-error clipping
        td = Variable(cuda.to_gpu(target)) - Q           # TD error
        td_tmp = td.data + 1000.0 * (abs(td.data) <= 1)  # Avoid zero division
        td_clip = td * (abs(td.data) <= 1) + td / abs(td_tmp) * (abs(td.data) > 1)

        zero_val = Variable(cuda.to_gpu(np.zeros((self.replay_size, self.num_of_actions),
                                                 dtype=np.float32)))
        loss = F.mean_squared_error(td_clip, zero_val)
        return loss, Q

    def stockExperience(self, time, state, action, reward, state_dash, episode_end_flag):
        data_index = time % self.data_size
        self.D[0][data_index] = state
        self.D[1][data_index] = action
        self.D[2][data_index] = reward
        if not episode_end_flag:  # on episode end, the stored s' stays all zeros
            self.D[3][data_index] = state_dash
        self.D[4][data_index] = episode_end_flag

    def experienceReplay(self, time):
        if self.initial_exploration < time:
            # Pick up replay_size samples from the stored history data
            if time < self.data_size:  # during the first sweep of the history data
                replay_index = np.random.randint(0, time, (self.replay_size, 1))
            else:
                replay_index = np.random.randint(0, self.data_size, (self.replay_size, 1))

            s_replay = np.ndarray(shape=(self.replay_size, 4, 84, 84), dtype=np.float32)
            a_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.uint8)
            r_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.float32)
            s_dash_replay = np.ndarray(shape=(self.replay_size, 4, 84, 84), dtype=np.float32)
            episode_end_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.bool)
            for i in xrange(self.replay_size):
                s_replay[i] = np.asarray(self.D[0][replay_index[i]], dtype=np.float32)
                a_replay[i] = self.D[1][replay_index[i]]
                r_replay[i] = self.D[2][replay_index[i]]
                s_dash_replay[i] = np.array(self.D[3][replay_index[i]], dtype=np.float32)
                episode_end_replay[i] = self.D[4][replay_index[i]]

            s_replay = cuda.to_gpu(s_replay)
            s_dash_replay = cuda.to_gpu(s_dash_replay)

            # Gradient-based update
            self.optimizer.zero_grads()
            loss, _ = self.forward(s_replay, a_replay, r_replay, s_dash_replay, episode_end_replay)
            loss.backward()
            self.optimizer.update()

    def Q_func(self, state):
        h1 = F.relu(self.model.l1(state / 255.0))  # scale inputs to [0.0, 1.0]
        h2 = F.relu(self.model.l2(h1))
        h3 = F.relu(self.model.l3(h2))
        h4 = F.relu(self.model.l4(h3))  # left side: state-value stream
        h5 = F.relu(self.model.l5(h3))  # right side: advantage stream
        h6 = self.model.l6(h4)          # V(s)
        h7 = self.model.l7(h5)          # A(s,a)
        Q = self.model.q_value(h6, h7)  # aggregated Q-value
        return Q

    def Q_func_target(self, state):
        h1 = F.relu(self.model_target.l1(state / 255.0))  # scale inputs to [0.0, 1.0]
        h2 = F.relu(self.model_target.l2(h1))
        h3 = F.relu(self.model_target.l3(h2))
        h4 = F.relu(self.model_target.l4(h3))  # left side: state-value stream
        h5 = F.relu(self.model_target.l5(h3))  # right side: advantage stream
        h6 = self.model_target.l6(h4)          # V(s)
        h7 = self.model_target.l7(h5)          # A(s,a)
        Q = self.model_target.q_value(h6, h7)  # aggregated Q-value
        return Q

    def e_greedy(self, state, epsilon):
        s = Variable(state)
        Q = self.Q_func(s)
        Q = Q.data
        if np.random.rand() < epsilon:
            index_action = np.random.randint(0, self.num_of_actions)
            print "RANDOM"
        else:
            index_action = np.argmax(Q.get())
            print "GREEDY"
        return self.index_to_action(index_action), Q

    def target_model_update(self):
        self.model_target = copy.deepcopy(self.model)

    def index_to_action(self, index_of_action):
        return self.enable_controller[index_of_action]

    def action_to_index(self, action):
        return self.enable_controller.index(action)
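DN_class only defines the network and the update rule; the agent-environment loop that drives it is elsewhere. For orientation, here is a minimal sketch of such a driver. The env object, its step()/reset() interface, and the epsilon schedule are assumptions for illustration, not part of the class above; observations are assumed to be preprocessed 4x84x84 uint8 frame stacks, matching the shape of D[0].

# Hypothetical driver loop; `env` and its interface are assumptions.
agent = DN_class()
state = env.reset()  # assumed: 4x84x84 uint8 frame stack
for time in xrange(10**6):
    epsilon = max(0.1, 1.0 - 0.9 * time / 10**4)  # illustrative schedule
    action, q = agent.e_greedy(
        cuda.to_gpu(np.asarray([state], dtype=np.float32)), epsilon)
    state_dash, reward, episode_end = env.step(action)  # assumed interface
    agent.stockExperience(time, state, action, reward, state_dash, episode_end)
    agent.experienceReplay(time)  # no-op until initial_exploration has passed
    if time % agent.target_model_update_freq == 0:
        agent.target_model_update()  # sync the frozen target network
    state = env.reset() if episode_end else state_dash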
class QNet:
    # Hyper-Parameters
    gamma = 0.99                      # Reward discount factor
    initial_exploration = 10**3       # Initial exploration steps. original: 5x10^4
    replay_size = 32                  # Replay (batch) size
    target_model_update_freq = 10**4  # Target update frequency. original: 10^4
    data_size = 10**5                 # Data size of history. original: 10^6
    hist_size = 1                     # original: 4
    save_model_freq = 10**4           # Frequency of saving the model

    def __init__(self, use_gpu, enable_controller, dim):
        self.use_gpu = use_gpu
        self.num_of_actions = len(enable_controller)
        self.enable_controller = enable_controller
        self.dim = dim

        print("Initializing Q-Network...")
        print("Input Dim of Q-Network : %d" % (self.dim * self.hist_size))

        hidden_dim = 256
        self.model = FunctionSet(
            l4=F.Linear(self.dim * self.hist_size, hidden_dim, wscale=np.sqrt(2)),
            q_value=F.Linear(hidden_dim, self.num_of_actions,
                             initialW=np.zeros((self.num_of_actions, hidden_dim),
                                               dtype=np.float32))
        )
        if self.use_gpu >= 0:
            self.model.to_gpu()

        self.model_target = copy.deepcopy(self.model)

        self.optimizer = optimizers.RMSpropGraves(lr=0.00025, alpha=0.95, momentum=0.95, eps=0.0001)
        self.optimizer.setup(self.model.collect_parameters())

        # History Data : D=[s, a, r, s_dash, end_episode_flag]
        self.d = [np.zeros((self.data_size, self.hist_size, self.dim), dtype=np.uint8),
                  np.zeros(self.data_size, dtype=np.uint8),
                  np.zeros((self.data_size, 1), dtype=np.int8),
                  np.zeros((self.data_size, self.hist_size, self.dim), dtype=np.uint8),
                  np.zeros((self.data_size, 1), dtype=np.bool)]

    def forward(self, state, action, reward, state_dash, episode_end):
        num_of_batch = state.shape[0]
        s = Variable(state)
        s_dash = Variable(state_dash)

        q = self.q_func(s)  # Get Q-value

        # Generate Target Signals
        tmp = self.q_func_target(s_dash)  # Q(s',*)
        if self.use_gpu >= 0:
            tmp = list(map(np.max, tmp.data.get()))  # max_a Q(s',a)
        else:
            tmp = list(map(np.max, tmp.data))        # max_a Q(s',a)
        max_q_dash = np.asanyarray(tmp, dtype=np.float32)
        if self.use_gpu >= 0:
            target = np.asanyarray(q.data.get(), dtype=np.float32)
        else:
            target = np.array(q.data, dtype=np.float32)  # make a new array

        for i in xrange(num_of_batch):
            if not episode_end[i][0]:
                tmp_ = reward[i] + self.gamma * max_q_dash[i]
            else:
                tmp_ = reward[i]
            action_index = self.action_to_index(action[i])
            target[i, action_index] = tmp_

        # TD-error clipping
        if self.use_gpu >= 0:
            target = cuda.to_gpu(target)
        td = Variable(target) - q                        # TD error
        td_tmp = td.data + 1000.0 * (abs(td.data) <= 1)  # Avoid zero division
        td_clip = td * (abs(td.data) <= 1) + td / abs(td_tmp) * (abs(td.data) > 1)

        zero_val = np.zeros((self.replay_size, self.num_of_actions), dtype=np.float32)
        if self.use_gpu >= 0:
            zero_val = cuda.to_gpu(zero_val)
        zero_val = Variable(zero_val)
        loss = F.mean_squared_error(td_clip, zero_val)
        return loss, q

    def stock_experience(self, time, state, action, reward, state_dash, episode_end_flag):
        data_index = time % self.data_size  # indexing by time makes d a ring buffer
        self.d[0][data_index] = state
        self.d[1][data_index] = action
        self.d[2][data_index] = reward
        if not episode_end_flag:  # on episode end, the stored s' stays all zeros
            self.d[3][data_index] = state_dash
        self.d[4][data_index] = episode_end_flag

    def experience_replay(self, time):
        if self.initial_exploration < time:
            # Pick up replay_size samples from the stored history
            if time < self.data_size:  # during the first sweep of the history
                replay_index = np.random.randint(0, time, (self.replay_size, 1))
            else:
                replay_index = np.random.randint(0, self.data_size, (self.replay_size, 1))

            s_replay = np.ndarray(shape=(self.replay_size, self.hist_size, self.dim), dtype=np.float32)
            a_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.uint8)
            r_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.float32)
            s_dash_replay = np.ndarray(shape=(self.replay_size, self.hist_size, self.dim), dtype=np.float32)
            episode_end_replay = np.ndarray(shape=(self.replay_size, 1), dtype=np.bool)
            for i in xrange(self.replay_size):
                s_replay[i] = np.asarray(self.d[0][replay_index[i]], dtype=np.float32)
                a_replay[i] = self.d[1][replay_index[i]]
                r_replay[i] = self.d[2][replay_index[i]]
                s_dash_replay[i] = np.array(self.d[3][replay_index[i]], dtype=np.float32)
                episode_end_replay[i] = self.d[4][replay_index[i]]

            if self.use_gpu >= 0:
                s_replay = cuda.to_gpu(s_replay)
                s_dash_replay = cuda.to_gpu(s_dash_replay)

            # Gradient-based update
            self.optimizer.zero_grads()
            loss, _ = self.forward(s_replay, a_replay, r_replay, s_dash_replay, episode_end_replay)
            loss.backward()
            self.optimizer.update()

    def q_func(self, state):
        h4 = F.relu(self.model.l4(state / 255.0))
        q = self.model.q_value(h4)
        return q

    def q_func_target(self, state):
        h4 = F.relu(self.model_target.l4(state / 255.0))
        q = self.model_target.q_value(h4)
        return q

    def e_greedy(self, state, epsilon):
        s = Variable(state)
        q = self.q_func(s)
        q = q.data
        if np.random.rand() < epsilon:
            index_action = np.random.randint(0, self.num_of_actions)
            print(" Random"),
        else:
            if self.use_gpu >= 0:
                index_action = np.argmax(q.get())
            else:
                index_action = np.argmax(q)
            print("#Greedy"),
        return self.index_to_action(index_action), q

    def target_model_update(self):
        self.model_target = copy.deepcopy(self.model)

    def index_to_action(self, index_of_action):
        return self.enable_controller[index_of_action]

    def action_to_index(self, action):
        return self.enable_controller.index(action)

    def save_model(self, folder, time):
        try:
            model_path = "./%s/%dmodel" % (folder, time)
            serializers.save_npz(model_path, self.model)
        except Exception:
            import traceback
            import sys
            traceback.print_exc()
            sys.exit()
        print "model is saved!! (Model_Path=%s)" % model_path
        print "----------------------------------------------"

    def load_model(self, folder, model_num):
        try:
            model_path = "./%s/%dmodel" % (folder, model_num)
            serializers.load_npz(model_path, self.model)
        except Exception:
            import traceback
            import sys
            traceback.print_exc()
            sys.exit()
        print "model load is done!! (Model_Path=%s)" % model_path
        print "----------------------------------------------"
        self.target_model_update()
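The TD-error clipping used in both forward() implementations deserves a note: the value fed into F.mean_squared_error keeps the TD error unchanged where |td| <= 1 and replaces it with its sign where |td| > 1, so a single outlier transition cannot dominate the batch loss; the +1000.0 term only keeps the division well-defined on the untouched branch. A standalone numpy illustration of the same arithmetic, with made-up TD errors:

import numpy as np

td = np.array([0.3, -0.5, 2.0, -4.0], dtype=np.float32)  # made-up TD errors
td_tmp = td + 1000.0 * (np.abs(td) <= 1)                 # avoid dividing by ~0
td_clip = td * (np.abs(td) <= 1) + td / np.abs(td_tmp) * (np.abs(td) > 1)
print td_clip  # [ 0.3 -0.5  1. -1. ] -- errors are capped at +/-1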