def create_model():
    inp = Input((28, 28), "images")
    top = Flatten()(inp)
    top = Tanh()(Linear(28 * 28 * 10, name="l1")(top))
    top = Tanh()(Linear(1000, name="l2")(top))
    top = Linear(10, name="out")(top)
    loss = CrossEntropyLoss("LogLoss")(top)
    return Model(inp, loss)
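# Hedged usage sketch (not from the original file): compile the model returned by
# create_model() with the same SGD settings used in the MNIST scripts below, then
# round-trip its architecture through the config API (Model.config() / Model.from_config()),
# which is the pattern the other snippets rely on. Assumes the nnet imports shown in a later
# snippet (Model, SGD, ...) are in scope.
model = create_model()
model.compile(trainer=SGD(0.01, momentum=0.9))
cfg = model.config()
clone = Model.from_config(cfg)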
def __init__(self, params, bulk, xcolumn, ycolumn):
    self.Bulk = bulk
    self.Columns = params["columns"]
    model = Model.from_config(params["_model"]["config"])
    trainer = SGD(params["lr"], params.get("momentum", 0.5))
    model.compile(trainer=trainer)
    self.Model = model
    weights = [p for n, p in sorted(bulk.items()) if n.startswith("weight_")]
    self.Weights0 = weights
    self.Grads = map(np.zeros_like, weights)
    self.Samples = 0
    self.SumLoss = 0.0
    self.SumMetric = 0.0
def build_graph(seed=123, build_decoder=True, batch_size=256, padlen=40):
    print("\n Building graph...")
    np.random.seed(seed)
    tf.set_random_seed(seed)  # reproducibility
    tf.reset_default_graph()

    # Build tensorflow graph from config.
    # NOTE: relies on a module-level `weights` array (the embedding matrix); it is not passed in here.
    model = Model(embedding_weights=weights, build_decoder=build_decoder,
                  batch_size=batch_size, padlen=padlen)

    # Savers to save & restore all the variables.
    variables_to_save1 = [v for v in tf.global_variables()
                          if 'Adam' not in v.name and 'global_step' not in v.name
                          and 'vad' not in v.name]
    variables_to_save2 = [v for v in tf.global_variables()
                          if 'Adam' not in v.name and 'global_step' not in v.name
                          and 'clf' not in v.name]
    saver1 = tf.train.Saver(var_list=variables_to_save1, keep_checkpoint_every_n_hours=1.0)  # CLF saver
    saver2 = tf.train.Saver(var_list=variables_to_save2, keep_checkpoint_every_n_hours=1.0)  # VAD saver
    return model, saver1, saver2
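# Hedged usage sketch (not from the original file): driving build_graph() with a TF1 session.
# Assumes `import tensorflow as tf` as in the snippet above and the module-level `weights`
# array it relies on; the checkpoint path arguments are placeholders.
def restore_models(clf_ckpt_path, vad_ckpt_path):
    model, clf_saver, vad_saver = build_graph()
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    clf_saver.restore(sess, clf_ckpt_path)  # everything except the 'vad' variables
    vad_saver.restore(sess, vad_ckpt_path)  # everything except the 'clf' variables
    return model, sess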
class AOCAgent_THEANO():
    def __init__(self, num_actions, id_num, shared_arr=None, num_moves=None, args=None):
        print "USING OPTION CRITIC"
        self.args = args
        self.id_num = id_num
        self.num_actions = num_actions
        self.num_moves = num_moves
        self.reset_storing()
        self.rng = np.random.RandomState(100 + id_num)

        # input is 8x8
        model_network = [
            {"model_type": "conv", "filter_size": [4, 4], "pool": [1, 1], "stride": [2, 2], "out_size": 32, "activation": "relu"},
            {"model_type": "conv", "filter_size": [3, 3], "pool": [1, 1], "stride": [2, 2], "out_size": 64, "activation": "relu"},
            {"model_type": "mlp", "out_size": 48, "activation": "relu"},
            {"model_type": "mlp", "out_size": 32, "activation": "relu"},
        ]
        out = [None, model_network[-1]["out_size"]]
        self.conv = Model(model_network,
                          input_size=[None, args.concat_frames * (1 if args.grayscale else 3), 8, 8])
        self.termination_model = Model(
            [{"model_type": "mlp", "out_size": args.num_options, "activation": "sigmoid", "W": 0}],
            input_size=out)
        self.Q_val_model = Model(
            [{"model_type": "mlp", "out_size": args.num_options, "activation": "linear", "W": 0}],
            input_size=out)
        self.options_model = MLP3D(input_size=out[1], num_options=args.num_options,
                                   out_size=num_actions, activation="softmax")
        self.params = (self.conv.params + self.Q_val_model.params +
                       self.options_model.params + self.termination_model.params)
        self.set_rms_shared_weights(shared_arr)

        x = T.ftensor4()
        y = T.fvector()
        a = T.ivector()
        o = T.ivector()
        delib = T.fscalar()

        s = self.conv.apply(x / np.float32(255))
        intra_option_policy = self.options_model.apply(s, o)
        q_vals = self.Q_val_model.apply(s)
        disc_q = theano.gradient.disconnected_grad(q_vals)
        current_option_q = q_vals[T.arange(o.shape[0]), o]
        disc_opt_q = disc_q[T.arange(o.shape[0]), o]
        terms = self.termination_model.apply(s)
        o_term = terms[T.arange(o.shape[0]), o]
        V = (T.max(q_vals, axis=1) * (1 - self.args.option_epsilon) +
             self.args.option_epsilon * T.mean(q_vals, axis=1))
        disc_V = theano.gradient.disconnected_grad(V)

        aggr = T.mean  # T.sum
        log_eps = 0.0001
        critic_cost = aggr(args.critic_coef * 0.5 * T.sqr(y - current_option_q))
        termination_grad = aggr(o_term * ((disc_opt_q - disc_V) + delib))
        entropy = -aggr(T.sum(intra_option_policy * T.log(intra_option_policy + log_eps),
                              axis=1)) * args.entropy_reg
        pg = aggr(T.log(intra_option_policy[T.arange(a.shape[0]), a] + log_eps) * (y - disc_opt_q))
        cost = pg + entropy - critic_cost - termination_grad

        grads = T.grad(cost * args.update_freq, self.params)
        # grads = T.grad(cost, self.params)
        updates, grad_rms, self.rms_weights = rmsprop(self.params, grads, clip=args.clip,
                                                      clip_type=args.clip_type)
        self.share_rms(shared_arr)

        self.get_state = theano.function([x], s, on_unused_input='warn')
        self.get_policy = theano.function([s, o], intra_option_policy)
        self.get_termination = theano.function([x], terms)
        self.get_q = theano.function([x], q_vals)
        self.get_q_from_s = theano.function([s], q_vals)
        self.get_V = theano.function([x], V)
        self.rms_grads = theano.function([x, a, y, o, delib], grad_rms, updates=updates,
                                         on_unused_input='warn')
        print "ALL COMPILED"
        if not self.args.testing:
            self.init_tracker()
        self.initialized = False

    def update_weights(self, x, a, y, o, moves, delib):
        args = self.args
        self.num_moves.value += moves
        lr = np.max([args.init_lr * (args.max_num_frames - self.num_moves.value) /
                     args.max_num_frames, 0]).astype("float32")
        cumul = self.rms_grads(x, a, y, o, delib)
        for i in range(len(cumul)):
            self.shared_arr[i] += lr * cumul[i]
            self.params[i].set_value(self.shared_arr[i])
        return

    def load_values(self, values):
        assert (len(self.params + self.rms_weights) == len(values))
        for p, v in zip(self.params + self.rms_weights, values):
            p.set_value(v)

    def save_values(self, folder_name):
        pickle.dump([p.get_value() for p in self.params + self.rms_weights],
                    open(folder_name + "/tmp_model.pkl", "wb"))
        os.system("mv " + folder_name + "/tmp_model.pkl " + folder_name + "/model.pkl")
        # try:  # server creates too many core files
        #     os.system("rm ./core*")
        # except:
        #     pass

    def get_param_vals(self):
        return [m.get_value() for m in self.params + self.rms_weights]

    def set_rms_shared_weights(self, shared_arr):
        if shared_arr is not None:
            self.shared_arr = [np.frombuffer(s, dtype="float32").reshape(p.get_value().shape)
                               for s, p in zip(shared_arr, self.params)]
            self.rms_shared_arr = shared_arr[len(self.params):]
            if self.args.init_num_moves > 0:
                for s, p in zip(shared_arr, self.params):
                    p.set_value(np.frombuffer(s, dtype="float32").reshape(p.get_value().shape))
                print "LOADED VALUES"

    def share_rms(self, shared_arr):
        # Ties rms params between threads with borrow=True flag
        if self.args.rms_shared and shared_arr is not None:
            assert (len(self.rms_weights) == len(self.rms_shared_arr))
            for rms_w, s_rms_w in zip(self.rms_weights, self.rms_shared_arr):
                rms_w.set_value(np.frombuffer(s_rms_w, dtype="float32").reshape(
                    rms_w.get_value().shape), borrow=True)

    def get_action(self, x):
        p = self.get_policy([self.current_s], [self.current_o])
        return self.rng.choice(range(self.num_actions), p=p[-1])

    def get_policy_over_options(self, s):
        return (self.get_q_from_s(s)[0].argmax()
                if self.rng.rand() > self.args.option_epsilon
                else self.rng.randint(self.args.num_options))

    def update_internal_state(self, x):
        self.current_s = self.get_state([x])[0]
        self.delib = self.args.delib_cost
        if self.terminated:
            self.current_o = self.get_policy_over_options([self.current_s])
            self.o_tracker_chosen[self.current_o] += 1
        self.o_tracker_steps[self.current_o] += 1

    def init_tracker(self):
        csv_things = ["moves", "reward", "term_prob"]
        csv_things += ["opt_chosen" + str(ccc) for ccc in range(self.args.num_options)]
        csv_things += ["opt_steps" + str(ccc) for ccc in range(self.args.num_options)]
        with open(self.args.folder_name + "/data.csv", "a") as myfile:
            myfile.write(",".join([str(cc) for cc in csv_things]) + "\n")

    def tracker(self):
        term_prob = float(self.termination_counter) / self.frame_counter * 100
        csv_things = [self.num_moves.value, self.total_reward, round(term_prob, 1)] + \
            list(self.o_tracker_chosen) + list(self.o_tracker_steps)
        print self.o_tracker_steps
        with open(self.args.folder_name + "/data.csv", "a") as myfile:
            myfile.write(",".join([str(cc) for cc in csv_things]) + "\n")

    def reset_tracker(self):
        self.termination_counter = 0
        self.frame_counter = 0
        self.o_tracker_chosen = np.zeros(self.args.num_options, )
        self.o_tracker_steps = np.zeros(self.args.num_options, )

    def reset(self, x):
        if not self.args.testing and self.initialized:
            self.tracker()
        self.total_reward = 0
        self.terminated = True
        self.reset_tracker()
        self.update_internal_state(x)
        self.initialized = True

    def reset_storing(self):
        self.a_seq = np.zeros((self.args.max_update_freq, ), dtype="int32")
        self.o_seq = np.zeros((self.args.max_update_freq, ), dtype="int32")
        self.r_seq = np.zeros((self.args.max_update_freq, ), dtype="float32")
        self.x_seq = np.zeros((self.args.max_update_freq,
                               self.args.concat_frames * (1 if self.args.grayscale else 3), 8, 8),
                              dtype="float32")
        self.t_counter = 0

    def store(self, x, new_x, action, raw_reward, done, death):
        end_ep = done or (death and self.args.death_ends_episode)
        self.frame_counter += 1
        self.total_reward += raw_reward
        reward = np.clip(raw_reward, -1, 1)
        self.terminated = self.get_termination([new_x])[0][self.current_o] > self.rng.rand()
        self.termination_counter += self.terminated

        self.x_seq[self.t_counter] = np.copy(x)
        self.o_seq[self.t_counter] = np.copy(self.current_o)
        self.a_seq[self.t_counter] = np.copy(action)
        self.r_seq[self.t_counter] = np.copy(float(reward)) - (
            float(self.terminated) * self.delib * (1 - float(end_ep)))
        self.t_counter += 1

        # do n-step return to option termination.
        # cut off at self.args.max_update_freq
        # min steps: self.args.update_freq (usually 5 like a3c)
        # this doesn't make option length a minimum of 5 (they can still terminate). only batch size
        option_term = (self.terminated and self.t_counter >= self.args.update_freq)
        if self.t_counter == self.args.max_update_freq or end_ep or option_term:
            if not self.args.testing:
                V = self.get_V([new_x])[0] if self.terminated else self.get_q([new_x])[0][self.current_o]
                R = 0 if end_ep else V
                V = []
                for j in range(self.t_counter - 1, -1, -1):
                    R = np.float32(self.r_seq[j] + self.args.gamma * R)
                    V.append(R)
                self.update_weights(self.x_seq[:self.t_counter], self.a_seq[:self.t_counter],
                                    V[::-1], self.o_seq[:self.t_counter], self.t_counter,
                                    self.delib + self.args.margin_cost)
            self.reset_storing()
        if not end_ep:
            self.update_internal_state(new_x)
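# Minimal sketch (not part of the agent): the discounted n-step return computed at the end of
# AOCAgent_THEANO.store(), written as a standalone function. `rewards` corresponds to
# r_seq[:t_counter], `bootstrap` to V(s') or Q(s', o) (0 when the episode ended), and `gamma`
# to args.gamma. Assumes numpy imported as np, as in the agent code.
def n_step_returns(rewards, bootstrap, gamma):
    R = bootstrap
    returns = []
    for r in reversed(rewards):
        R = np.float32(r + gamma * R)
        returns.append(R)
    return returns[::-1]  # oldest step first, matching V[::-1] in store()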
def __init__(self, num_actions, id_num, shared_arr=None, num_moves=None, args=None):
    print "USING OPTION CRITIC"
    self.args = args
    self.id_num = id_num
    self.num_actions = num_actions
    self.num_moves = num_moves
    self.reset_storing()
    self.rng = np.random.RandomState(100 + id_num)

    model_network = [
        {"model_type": "conv", "filter_size": [8, 8], "pool": [1, 1], "stride": [4, 4], "out_size": 16, "activation": "relu"},
        {"model_type": "conv", "filter_size": [4, 4], "pool": [1, 1], "stride": [2, 2], "out_size": 32, "activation": "relu"},
        {"model_type": "mlp", "out_size": 256, "activation": "relu"},
    ]
    out = [None, model_network[-1]["out_size"]]
    self.conv = Model(model_network,
                      input_size=[None, args.concat_frames * (1 if args.grayscale else 3), 84, 84])
    self.termination_model = Model(
        [{"model_type": "mlp", "out_size": args.num_options, "activation": "sigmoid", "W": 0}],
        input_size=out)
    self.Q_val_model = Model(
        [{"model_type": "mlp", "out_size": args.num_options, "activation": "linear", "W": 0}],
        input_size=out)
    self.Sigma_val_model = Model(
        [{"model_type": "mlp", "out_size": args.num_options, "activation": "linear", "W": 0}],
        input_size=out)  # Sigma: Variance(state, option)
    self.options_model = MLP3D(input_size=out[1], num_options=args.num_options,
                               out_size=num_actions, activation="softmax")
    self.params = (self.conv.params + self.Q_val_model.params + self.options_model.params +
                   self.termination_model.params + self.Sigma_val_model.params)
    self.set_rms_shared_weights(shared_arr)

    x = T.ftensor4()       # observation
    y = T.fvector()        # G
    y_sigma = T.fvector()  # true variance (s,o), like G is the true value for Q(s,o)
    a = T.ivector()        # action
    o = T.ivector()        # option
    delib = T.fscalar()

    s = self.conv.apply(x / np.float32(255))  # states
    intra_option_policy = self.options_model.apply(s, o)
    q_vals = self.Q_val_model.apply(s)
    sigma_vals = self.Sigma_val_model.apply(s)
    disc_q = theano.gradient.disconnected_grad(q_vals)
    disc_sigma = theano.gradient.disconnected_grad(sigma_vals)
    current_option_q = q_vals[T.arange(o.shape[0]), o]
    current_option_sigma = sigma_vals[T.arange(o.shape[0]), o]
    disc_opt_q = disc_q[T.arange(o.shape[0]), o]
    disc_opt_sigma = disc_sigma[T.arange(o.shape[0]), o]
    terms = self.termination_model.apply(s)
    o_term = terms[T.arange(o.shape[0]), o]  # termination at option (s,o)

    V = (T.max(q_vals, axis=1) * (1 - self.args.option_epsilon) +
         self.args.option_epsilon * T.mean(q_vals, axis=1))
    # Ideal V_sigma is min[sigma(s,o)]; that is what we want to achieve
    V_sigma = (T.min(sigma_vals, axis=1) * (1 - self.args.option_epsilon) +
               self.args.option_epsilon * T.mean(sigma_vals, axis=1))
    disc_V = theano.gradient.disconnected_grad(V)
    disc_V_sigma = theano.gradient.disconnected_grad(V_sigma)

    aggr = T.mean  # T.sum
    log_eps = 0.0001
    critic_cost = aggr(args.critic_coef * 0.5 * T.sqr(y - current_option_q))  # update for Q(s,o)
    critic_sigma_cost = aggr(args.critic_sigma_coef * 0.5 *
                             T.sqr(y_sigma - current_option_sigma))  # update for sigma(s,o)
    # termination_grad = aggr(o_term*((disc_opt_q-disc_V)+delib))
    advantage_q = disc_opt_q - disc_V                 # advantage function for the Q value
    advantage_sigma = disc_opt_sigma - disc_V_sigma   # advantage function for sigma
    termination_grad = aggr(o_term * ((advantage_q - self.args.psi * advantage_sigma) + delib))  # CHANGE
    entropy = -aggr(T.sum(intra_option_policy * T.log(intra_option_policy + log_eps),
                          axis=1)) * args.entropy_reg
    pg = aggr(T.log(intra_option_policy[T.arange(a.shape[0]), a] + log_eps) *
              ((y - disc_opt_q) - self.args.psi * (y_sigma - disc_opt_sigma)))
    cost = pg + entropy - critic_cost - critic_sigma_cost - termination_grad

    grads = T.grad(cost * args.update_freq, self.params)
    # grads = T.grad(cost, self.params)
    updates, grad_rms, self.rms_weights = rmsprop(self.params, grads, clip=args.clip,
                                                  clip_type=args.clip_type)
    self.share_rms(shared_arr)

    self.get_state = theano.function([x], s, on_unused_input='warn')
    self.get_policy = theano.function([s, o], intra_option_policy)
    self.get_termination = theano.function([x], terms)
    self.get_q = theano.function([x], q_vals)
    self.get_sigma = theano.function([x], sigma_vals)
    self.get_q_from_s = theano.function([s], q_vals)
    self.get_sigma_from_s = theano.function([s], sigma_vals)
    self.get_V = theano.function([x], V)
    self.get_V_sigma = theano.function([x], V_sigma)
    self.rms_grads = theano.function([x, a, y, y_sigma, o, delib], grad_rms,
                                     updates=updates, on_unused_input='warn')
    print "ALL COMPILED"
    if not self.args.testing:
        self.init_tracker()
    self.initialized = False
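# Reading of the objective assembled above (restating the code, not an external reference):
#   cost = pg + entropy - critic_cost - critic_sigma_cost - termination_grad
# where pg weights log pi(a|s,o) by (y - Q(s,o)) - psi * (y_sigma - sigma(s,o)),
# termination_grad weights the termination probability beta(s,o) by (A_Q - psi * A_sigma) + delib
# with A_Q = Q(s,o) - V(s) and A_sigma = sigma(s,o) - V_sigma(s), and the two critic terms are
# squared errors of Q(s,o) and sigma(s,o) against the targets y and y_sigma.
# args.psi therefore trades off expected return against the estimated return variance.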
def unpack_model(self, params):
    self.Model = Model.from_config(params["config"])
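# Hedged example of the argument unpack_model() expects: a dict whose "config" entry is the
# serialized architecture produced by Model.config() elsewhere in these snippets. `some_model`
# is a placeholder for any already-built nnet Model.
#   self.unpack_model({"config": some_model.config()})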
nx = 2
ny = 2
nb = 123
w = 0.1

# model = Sequential()
# model.add(Dense(3, input_shape=(nx,), activation="tanh"))
# model.add(Dense(ny, activation="softmax"))

inp = Input((nx, ), "input")
l1 = Linear(3, name="l1")(inp)
act = Tanh()(l1)
l2 = Linear(ny, name="l2")(act)
out = CrossEntropyLoss(ny)(l2)

model = Model(inp, out)
sgd = SGD(0.1, momentum=0.95)
model.compile(trainer=sgd)

import pprint
cfg = model.config()
pprint.pprint(cfg)

m1 = Model.from_config(cfg)
pprint.pprint(m1.config())
pprint.pprint(model.get_params())
from nnet import Model
from nnet.core_layers import Linear, Input, Flatten
from nnet.activations import Tanh, Sigmoid, SoftPlus
from nnet.losses import CrossEntropyLoss, L2Loss
from nnet.callbacks import Callback, Callbacks

import numpy as np
import random

from nnet.trainers import SGD

inp = Input((28, 28), "images")
top = Flatten()(inp)
top = Tanh()(Linear(28 * 28 * 10, name="l1")(top))
top = Tanh()(Linear(1000, name="l2")(top))
top = Linear(10, name="out")(top)
loss = CrossEntropyLoss("LogLoss")(top)

model = Model(inp, loss)
cfg = model.config()
m1 = Model.from_config(cfg)
def __init__(self):
    super().__init__()
    self.model = Model()
class AOCAgent_THEANO():
    def __init__(self, num_actions, id_num, shared_arr=None, num_moves=None, args=None):
        print "USING OPTION CRITIC"
        self.args = args
        self.id_num = id_num
        self.num_actions = num_actions
        self.num_moves = num_moves
        self.reset_storing()
        self.rng = np.random.RandomState(100 + id_num)

        model_network = [
            {"model_type": "conv", "filter_size": [8, 8], "pool": [1, 1], "stride": [4, 4], "out_size": 16, "activation": "relu"},
            {"model_type": "conv", "filter_size": [4, 4], "pool": [1, 1], "stride": [2, 2], "out_size": 32, "activation": "relu"},
            {"model_type": "mlp", "out_size": 256, "activation": "relu"},
        ]
        out = [None, model_network[-1]["out_size"]]
        self.conv = Model(model_network,
                          input_size=[None, args.concat_frames * (1 if args.grayscale else 3), 84, 84])
        self.termination_model = Model(
            [{"model_type": "mlp", "out_size": args.num_options, "activation": "sigmoid", "W": 0}],
            input_size=out)
        self.Q_val_model = Model(
            [{"model_type": "mlp", "out_size": args.num_options, "activation": "linear", "W": 0}],
            input_size=out)
        self.options_model = MLP3D(input_size=out[1], num_options=args.num_options,
                                   out_size=num_actions, activation="softmax")
        self.params = (self.conv.params + self.Q_val_model.params +
                       self.options_model.params + self.termination_model.params)
        self.set_rms_shared_weights(shared_arr)

        x = T.ftensor4()   # State
        y = T.fvector()    # Onestep Return?
        a = T.ivector()    # Action
        o = T.ivector()    # Option
        delib = T.fscalar()

        s = self.conv.apply(x / np.float32(255))
        intra_option_policy = self.options_model.apply(s, o)
        q_vals = self.Q_val_model.apply(s)  # Gets all of the Q values, given a state
        disc_q = theano.gradient.disconnected_grad(q_vals)  # Calculate all gradients (simultaneously learning)
        current_option_q = q_vals[T.arange(o.shape[0]), o]  # Given that we are in option o (and s, from above), get all q values for each action
        disc_opt_q = disc_q[T.arange(o.shape[0]), o]  # get all relevant gradients for each action
        terms = self.termination_model.apply(s)
        o_term = terms[T.arange(o.shape[0]), o]  # get all terminations for each option
        V = (T.max(q_vals, axis=1) * (1 - self.args.option_epsilon) +
             self.args.option_epsilon * T.mean(q_vals, axis=1))  # same as Value function in A3C; has value for each policy, argmax it
        disc_V = theano.gradient.disconnected_grad(V)

        aggr = T.mean  # T.sum -- function call
        log_eps = 0.0001
        critic_cost = aggr(args.critic_coef * 0.5 * T.sqr(y - current_option_q))  # Value Loss - How much better was actual reward than q value; Same as A3c, but again, becomes q value
        termination_grad = aggr(o_term * ((disc_opt_q - disc_V) + delib))  # NOTE: Delib always <= 0
        entropy = -aggr(T.sum(intra_option_policy * T.log(intra_option_policy + log_eps),
                              axis=1)) * args.entropy_reg  # Traditional entropy; discourages actions that dominate too quickly
        pg = aggr(T.log(intra_option_policy[T.arange(a.shape[0]), a] + log_eps) *
                  (y - disc_opt_q))  # Policy loss
        cost = pg + entropy - critic_cost - termination_grad  # NOTE: DO THIS AS TF DOES WITH THREADRUNNER

        grads = T.grad(cost * args.update_freq, self.params)  # update gradients
        # grads = T.grad(cost, self.params)
        updates, grad_rms, self.rms_weights = rmsprop(self.params, grads, clip=args.clip,
                                                      clip_type=args.clip_type)
        self.share_rms(shared_arr)

        # Get functions
        self.get_state = theano.function([x], s, on_unused_input='warn')
        self.get_policy = theano.function([s, o], intra_option_policy)
        self.get_termination = theano.function([s], terms)
        self.get_q = theano.function([s], q_vals)
        self.get_V = theano.function([s], V)
        # Compute RMS gradients; `updates` applies the rmsprop() updates to all variables.
        self.rms_grads = theano.function(
            [x, a, y, o, delib], grad_rms, updates=updates, on_unused_input='warn'
        )  # http://deeplearning.net/software/theano/tutorial/examples.html#basictutexamples
        print "ALL COMPILED"
        if not self.args.testing:
            self.init_tracker()
        self.initialized = False

    def update_weights(self, x, a, y, o, moves, delib):
        args = self.args
        self.num_moves.value += moves
        lr = np.max([args.init_lr * (args.max_num_frames - self.num_moves.value) /
                     args.max_num_frames, 0]).astype("float32")
        cumul = self.rms_grads(x, a, y, o, delib)
        for i in range(len(cumul)):
            self.shared_arr[i] += lr * cumul[i]
            self.params[i].set_value(self.shared_arr[i])
        return

    def load_values(self, values):
        assert (len(self.params + self.rms_weights) == len(values))
        for p, v in zip(self.params + self.rms_weights, values):
            p.set_value(v)

    def save_values(self, folder_name):
        pickle.dump([p.get_value() for p in self.params + self.rms_weights],
                    open(folder_name + "/tmp_model.pkl", "wb"))
        os.system("mv " + folder_name + "/tmp_model.pkl " + folder_name + "/model.pkl")
        # try:  # server creates too many core files
        #     os.system("rm ./core*")
        # except:
        #     pass

    def get_param_vals(self):
        return [m.get_value() for m in self.params + self.rms_weights]

    def set_rms_shared_weights(self, shared_arr):
        if shared_arr is not None:
            self.shared_arr = [np.frombuffer(s, dtype="float32").reshape(p.get_value().shape)
                               for s, p in zip(shared_arr, self.params)]
            self.rms_shared_arr = shared_arr[len(self.params):]
            if self.args.init_num_moves > 0:
                for s, p in zip(shared_arr, self.params):
                    p.set_value(np.frombuffer(s, dtype="float32").reshape(p.get_value().shape))
                print "LOADED VALUES"

    def share_rms(self, shared_arr):
        # Ties rms params between threads with borrow=True flag
        if self.args.rms_shared and shared_arr is not None:
            assert (len(self.rms_weights) == len(self.rms_shared_arr))
            for rms_w, s_rms_w in zip(self.rms_weights, self.rms_shared_arr):
                rms_w.set_value(np.frombuffer(s_rms_w, dtype="float32").reshape(
                    rms_w.get_value().shape), borrow=True)

    def get_action(self, x):
        p = self.get_policy([self.current_s], [self.current_o])
        return self.rng.choice(range(self.num_actions), p=p[-1])

    def get_policy_over_options(self, s):
        return (self.get_q(s)[0].argmax()
                if self.rng.rand() > self.args.option_epsilon
                else self.rng.randint(self.args.num_options))

    def update_internal_state(self, x):
        self.current_s = self.get_state([x])[0]
        self.delib = self.args.delib_cost
        if self.terminated:
            self.current_o = self.get_policy_over_options([self.current_s])
            self.o_tracker_chosen[self.current_o] += 1
        self.o_tracker_steps[self.current_o] += 1

    def init_tracker(self):
        csv_things = ["moves", "reward", "term_prob"]
        csv_things += ["opt_chosen" + str(ccc) for ccc in range(self.args.num_options)]
        csv_things += ["opt_steps" + str(ccc) for ccc in range(self.args.num_options)]
        with open(self.args.folder_name + "/data.csv", "a") as myfile:
            myfile.write(",".join([str(cc) for cc in csv_things]) + "\n")

    def tracker(self):
        term_prob = float(self.termination_counter) / self.frame_counter * 100
        csv_things = [self.num_moves.value, self.total_reward, round(term_prob, 1)] + \
            list(self.o_tracker_chosen) + list(self.o_tracker_steps)
        with open(self.args.folder_name + "/data.csv", "a") as myfile:
            myfile.write(",".join([str(cc) for cc in csv_things]) + "\n")

    def reset_tracker(self):
        self.termination_counter = 0
        self.frame_counter = 0
        self.o_tracker_chosen = np.zeros(self.args.num_options, )
        self.o_tracker_steps = np.zeros(self.args.num_options, )

    def reset(self, x):
        if not self.args.testing and self.initialized:
            self.tracker()
        self.total_reward = 0
        self.terminated = True
        self.reset_tracker()
        self.update_internal_state(x)
        self.initialized = True

    def reset_storing(self):
        self.a_seq = np.zeros((self.args.max_update_freq, ), dtype="int32")
        self.o_seq = np.zeros((self.args.max_update_freq, ), dtype="int32")
        self.r_seq = np.zeros((self.args.max_update_freq, ), dtype="float32")
        self.x_seq = np.zeros((self.args.max_update_freq,
                               self.args.concat_frames * (1 if self.args.grayscale else 3), 84, 84),
                              dtype="float32")
        self.t_counter = 0

    def store(self, x, new_x, action, raw_reward, done, death):
        end_ep = done or (death and self.args.death_ends_episode)
        self.frame_counter += 1
        self.total_reward += raw_reward
        reward = np.clip(raw_reward, -1, 1)

        self.x_seq[self.t_counter] = np.copy(x)
        self.o_seq[self.t_counter] = np.copy(self.current_o)
        self.a_seq[self.t_counter] = np.copy(action)
        self.r_seq[self.t_counter] = np.copy(float(reward)) - (
            float(self.terminated) * self.delib * float(self.frame_counter > 1))
        self.terminated = self.get_termination([self.current_s])[0][self.current_o] > self.rng.rand()
        self.termination_counter += self.terminated
        self.t_counter += 1

        # do n-step return to option termination.
        # cut off at self.args.max_update_freq
        # min steps: self.args.update_freq (usually 5 like a3c)
        # this doesn't make option length a minimum of 5 (they can still terminate). only batch size
        option_term = (self.terminated and self.t_counter >= self.args.update_freq)
        if self.t_counter == self.args.max_update_freq or end_ep or option_term:
            # Time to update
            if not self.args.testing:
                d = (self.delib * float(self.frame_counter > 1))  # add delib if termination because it isn't part of V
                V = self.get_V([self.current_s])[0] - d if self.terminated \
                    else self.get_q([self.current_s])[0][self.current_o]
                R = 0 if end_ep else V
                V = []
                for j in range(self.t_counter - 1, -1, -1):  # Easy way to reset to 0
                    R = np.float32(self.r_seq[j] + self.args.gamma * R)  # discount
                    V.append(R)
                self.update_weights(self.x_seq[:self.t_counter], self.a_seq[:self.t_counter],
                                    V[::-1], self.o_seq[:self.t_counter], self.t_counter,
                                    self.delib + self.args.margin_cost)
            self.reset_storing()
        if not end_ep:
            self.update_internal_state(new_x)
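# Hedged sketch (not from the original file) of how one worker thread might drive this agent.
# `env` is a hypothetical ALE-style wrapper: env.reset() -> frame, env.step(a) ->
# (frame, reward, done, death); the agent methods used (reset, get_action, store) are the ones
# defined above.
def run_episode(agent, env):
    x = env.reset()
    agent.reset(x)
    done = False
    while not done:
        a = agent.get_action(x)
        new_x, raw_reward, done, death = env.step(a)
        # store() accumulates the rollout and triggers update_weights() internally
        agent.store(x, new_x, a, raw_reward, done, death)
        x = new_x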
def __init__(self, model_network=None, gamma=0.99, learning_method="rmsprop",
             batch_size=32, input_size=None, learning_params=None, dnn_type=True,
             clip_delta=0, scale=255., double_q=False, prioritized_exp_replay=False,
             heads_num=1, action_num=0):
    x = T.ftensor4()
    next_x = T.ftensor4()
    a = T.ivector()
    r = T.fvector()
    terminal = T.ivector()

    self.heads_num = heads_num
    self.action_num = action_num
    self.x_shared = theano.shared(np.zeros(tuple([batch_size] + input_size[1:]), dtype='float32'))
    self.next_x_shared = theano.shared(np.zeros(tuple([batch_size] + input_size[1:]), dtype='float32'))
    self.a_shared = theano.shared(np.zeros((batch_size), dtype='int32'))
    self.terminal_shared = theano.shared(np.zeros((batch_size), dtype='int32'))
    self.r_shared = theano.shared(np.zeros((batch_size), dtype='float32'))

    self.Q_model = Model(model_network, input_size=input_size, dnn_type=dnn_type)
    self.Q_prime_model = Model(model_network, input_size=input_size, dnn_type=dnn_type)

    if double_q:
        alt_actions = T.argmax(self.Q_model.apply(next_x / scale), axis=1)
        alt_actions = theano.gradient.disconnected_grad(alt_actions)
        y = r + (T.ones_like(terminal) - terminal) * gamma * \
            self.Q_prime_model.apply(next_x / scale)[T.arange(alt_actions.shape[0]), alt_actions]
    else:
        q_stack = self.Q_prime_model.apply(next_x / scale)
        q_list = [q_stack[T.arange(a.shape[0]), k * self.action_num:(k + 1) * self.action_num]
                  for k in range(self.heads_num)]
        y_list = [r + (T.ones_like(terminal) - terminal) * gamma * T.max(q_list[k], axis=1)
                  for k in range(self.heads_num)]
        y_concat = theano.tensor.concatenate(y_list, axis=0)
        y = r + (T.ones_like(terminal) - terminal) * gamma * \
            T.max(self.Q_prime_model.apply(next_x / scale), axis=1)

    all_q_vals = self.Q_model.apply(x / scale)
    q_vals = all_q_vals[T.arange(a.shape[0]), a]
    q_vals_list = [all_q_vals[T.arange(a.shape[0]), a + k * self.heads_num]
                   for k in range(self.heads_num)]
    q_vals_concat = theano.tensor.concatenate(q_vals_list, axis=0)

    # td_errors = y - q_vals
    td_errors = y_concat - q_vals_concat
    """
    if clip_delta > 0:
        td_errors = td_errors.clip(-clip_delta, clip_delta)
    cost = 0.5*td_errors**2
    """
    if clip_delta > 0:
        # TOOK THIS FROM GITHUB CODE
        # If we simply take the squared clipped diff as our loss,
        # then the gradient will be zero whenever the diff exceeds
        # the clip bounds. To avoid this, we extend the loss
        # linearly past the clip point to keep the gradient constant
        # in that regime.
        #
        # This is equivalent to declaring d loss/d q_vals to be
        # equal to the clipped diff, then backpropagating from
        # there, which is what the DeepMind implementation does.
        quadratic_part = T.minimum(abs(td_errors), clip_delta)
        linear_part = abs(td_errors) - quadratic_part
        cost = 0.5 * quadratic_part**2 + clip_delta * linear_part
    else:
        cost = 0.5 * td_errors**2
    #"""
    cost = T.sum(cost)

    print self.Q_model.params
    self.learning_method = self.Q_model.get_learning_method(learning_method, **learning_params)
    grads = T.grad(cost, self.Q_model.params)
    param_updates = self.learning_method.apply(self.Q_model.params, grads)

    target_updates = OrderedDict()
    for t, b in zip(self.Q_prime_model.params, self.Q_model.params):
        target_updates[t] = b

    givens = {x: self.x_shared, a: self.a_shared, r: self.r_shared,
              terminal: self.terminal_shared, next_x: self.next_x_shared}

    # print 'fast compile'
    # theano.config.mode = 'FAST_COMPILE'
    print "building"
    self.train_model = theano.function([], td_errors, updates=param_updates, givens=givens)
    print "compiled train_model (1/3)"
    self.pred_score = theano.function([], all_q_vals, givens={x: self.x_shared})
    print "compiled pred_score (2/3)"
    self.update_target_params = theano.function([], [], updates=target_updates)
    print "compiled update_target_params (3/3)"
    self.update_target_params()
    print "updated target params"
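# Hedged usage sketch (not from the original file): one training step driven through the
# compiled functions above. `learner` is an instance of the class this __init__ belongs to;
# `states`, `actions`, `rewards`, `next_states`, `terminals` are NumPy minibatch arrays shaped
# like the shared variables; `target_update_freq` is a placeholder hyperparameter.
def train_step(learner, states, actions, rewards, next_states, terminals,
               step, target_update_freq=10000):
    learner.x_shared.set_value(states.astype("float32"))
    learner.next_x_shared.set_value(next_states.astype("float32"))
    learner.a_shared.set_value(actions.astype("int32"))
    learner.r_shared.set_value(rewards.astype("float32"))
    learner.terminal_shared.set_value(terminals.astype("int32"))
    td = learner.train_model()           # gradient step on the (clipped) TD loss
    if step % target_update_freq == 0:
        learner.update_target_params()   # copy Q_model params into Q_prime_model
    return td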
def onBtnSolveClick(self):
    """ Solve the IVP and plot the solution (if plotting is checked). """
    condition = eval(self.ui.editInitialCondition.text())
    if self.data is None:
        try:
            self.interval = np.array(eval(self.ui.editSolutionInterval.text()))
            self.interval = self.interval.reshape(len(self.interval), 1)
        except:
            message(self.ui.statusBar, 'Invalid solution interval format')
            return
    else:
        self.interval = self.data
    try:
        ivp = IVP(self.ui.editEquation.text(), condition[0], condition[1])
    except:
        message(self.ui.statusBar, 'Invalid equation type')
        return
    try:
        model = Model(self.ui.cbModel.currentText(),
                      int(self.ui.editNeurons.text()),
                      self.ui.cbActivationFunction.currentText())
        self.network = NNet(model.get(), self.ui.cbOptimizer.currentText(),
                            int(self.ui.editIterations.text()),
                            float(self.ui.editAccuracy.text()),
                            self.ui.cbTensorBoard.isChecked())
    except:
        message(self.ui.statusBar, 'An error has occurred creating the model')
        return
    self.network.set_updatable_widgets(self.progress_bar, self.lb_loss)
    enable_widgets(self.widgets, False)

    # Measure the duration of the training session
    start_t = time.time()
    update_status_bar(self.ui.statusBar, self.progress_bar, self.lb_loss)
    self.h = self.network.solve_ivp(ivp, self.interval)
    hide([self.progress_bar])
    end_t = time.time()

    # Show the elapsed time
    self.lb_loss.setText(self.lb_loss.text() + '. The training took ' +
                         '{:.2f}'.format(end_t - start_t) + ' secs.')
    try:
        self.values = self.network(self.interval)
        np.savetxt(self.path + "result.txt", self.values, fmt='%10.4f')
    except:
        message(self.ui.statusBar, 'An error occurred trying to save data!')
        return
    self.check_fo_saving_graph()
    enable_widgets(self.widgets, True)
    self.onCbModelSelectionChange(self.ui.cbModel.currentIndex())
mnist_data = np.load("mnist.npz")
train_images, train_labels = mnist_data["train_images"], mnist_data["train_labels"]
test_images, test_labels = mnist_data["test_images"], mnist_data["test_labels"]

print "Loaded %d MNIST images: %s %s" % (len(train_images), train_images.shape, train_labels.shape)

inp = Input((28, 28), "images")
top = Flatten()(inp)
top = Tanh()(Linear(28 * 28 * 10, name="l1")(top))
top = Tanh()(Linear(1000, name="l2")(top))
top = Linear(10, name="out")(top)
loss = CrossEntropyLoss("LogLoss")(top)

model = Model(inp, loss)
model.compile(trainer=SGD(0.01, momentum=0.9))


def print_image(img):
    for row in img:
        line = ''.join(['#' if x > 0.7 else ('+' if x > 0.3 else ('.' if x > 0 else ' '))
                        for x in row])
        print line


class cb(Callback):
    def onEpochEnd(self, epoch, samples, total_samples, losses, metrics):
        print "epoch end:", epoch, samples, total_samples, losses, metrics
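# Quick check of the ASCII renderer above, assuming train_images holds pixel values in [0, 1]
# (as the 0.3/0.7 thresholds suggest); prints the first training digit as text art along with
# its label.
print_image(train_images[0])
print train_labels[0]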
mnist_data = np.load("mnist.npz")
train_images, train_labels = mnist_data["train_images"], mnist_data["train_labels"]
test_images, test_labels = mnist_data["test_images"], mnist_data["test_labels"]

print "Loaded %d MNIST images: %s %s" % (len(train_images), train_images.shape, train_labels.shape)

inp = Input((28, 28), "images")
top = Flatten()(inp)
top = Tanh()(Linear(28 * 28 * 10, name="l1")(top))
top = Tanh()(Linear(1000, name="l2")(top))
top = Linear(10, name="out")(top)
loss = CrossEntropyLoss("LogLoss")(top)

model = Model(inp, loss)
model.compile(trainer=SGD(0.01, momentum=0.9))


def calc_grads(model, mbsize, x, y_):
    assert len(x) == len(y_)
    sumgrads = [np.zeros_like(p) for p in model.get_params()]
    sumloss = 0.0
    N = len(x)
    bar = Bar("Calculating gradients...",
              suffix="%(index)d/%(max)d - %(percent)d%% - loss:%(loss)f - acc:%(acc).1f%%",
              max=N)