Example #1
def create_model():

	inp = Input((28, 28), "images")
	top = Flatten()(inp)
	top = Tanh()(Linear(28*28*10, name="l1")(top))
	top = Tanh()(Linear(1000, name="l2")(top))
	top = Linear(10, name="out")(top)
	loss = CrossEntropyLoss("LogLoss")(top)

	return Model(inp, loss)
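A minimal usage sketch (not part of the original example), assuming the nnet API as it appears in Examples #9 and #15: compile the returned model with an SGD trainer and round-trip its configuration.

# Usage sketch (assumed API, mirroring Examples #9 and #15)
model = create_model()
model.compile(trainer=SGD(0.01, momentum=0.9))  # SGD trainer as in Example #15

cfg = model.config()               # serialize the architecture to a config dict
restored = Model.from_config(cfg)  # rebuild an equivalent model from the config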
Example #2
    def __init__(self, params, bulk, xcolumn, ycolumn):
        self.Bulk = bulk
        self.Columns = params["columns"]

        model = Model.from_config(params["_model"]["config"])
        trainer = SGD(params["lr"], params.get("momentum", 0.5))
        model.compile(trainer=trainer)
        self.Model = model

        weights = [
            p for n, p in sorted(bulk.items()) if n.startswith("weight_")
        ]
        self.Weights0 = weights

        self.Grads = [np.zeros_like(w) for w in weights]  # materialize as a list so it can be indexed and accumulated
        self.Samples = 0
        self.SumLoss = 0.0
        self.SumMetric = 0.0
Example #3
def build_graph(seed=123, build_decoder=True, batch_size=256, padlen=40):
    print("\n Building graph...")
    np.random.seed(seed)
    tf.set_random_seed(seed)  # reproducibility
    tf.reset_default_graph()
    model = Model(embedding_weights=weights,
                  build_decoder=build_decoder,
                  batch_size=batch_size,
                  padlen=padlen)  # Build tensorflow graph from config

    variables_to_save1 = [
        v for v in tf.global_variables() if 'Adam' not in v.name
        and 'global_step' not in v.name and 'vad' not in v.name
    ]  # Saver to save & restore all the variables.
    variables_to_save2 = [
        v for v in tf.global_variables() if 'Adam' not in v.name
        and 'global_step' not in v.name and 'clf' not in v.name
    ]
    saver1 = tf.train.Saver(var_list=variables_to_save1,
                            keep_checkpoint_every_n_hours=1.0)  # CLF saver
    saver2 = tf.train.Saver(var_list=variables_to_save2,
                            keep_checkpoint_every_n_hours=1.0)  # VAD saver
    return model, saver1, saver2
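A hedged usage sketch (not from the original code; the checkpoint paths and training loop are hypothetical), showing how the two returned savers could be used with a standard TF1 session. It assumes the module-level `weights` embedding matrix that build_graph references is already defined.

# Usage sketch (hypothetical paths; standard tf.Session / tf.train.Saver calls)
model, saver1, saver2 = build_graph(seed=123, build_decoder=True)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    saver1.restore(sess, "ckpt/clf.ckpt")  # restore the non-VAD subset (CLF saver)
    # ... training steps ...
    saver2.save(sess, "ckpt/vad.ckpt")     # checkpoint the non-CLF subset (VAD saver)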
Example #4
    def __init__(self,
                 num_actions,
                 id_num,
                 shared_arr=None,
                 num_moves=None,
                 args=None):
        print "USING OPTION CRITIC"
        self.args = args
        self.id_num = id_num
        self.num_actions = num_actions
        self.num_moves = num_moves
        self.reset_storing()
        self.rng = np.random.RandomState(100 + id_num)
        # input is 8x8
        model_network = [{
            "model_type": "conv",
            "filter_size": [4, 4],
            "pool": [1, 1],
            "stride": [2, 2],
            "out_size": 32,
            "activation": "relu"
        }, {
            "model_type": "conv",
            "filter_size": [3, 3],
            "pool": [1, 1],
            "stride": [2, 2],
            "out_size": 64,
            "activation": "relu"
        }, {
            "model_type": "mlp",
            "out_size": 48,
            "activation": "relu"
        }, {
            "model_type": "mlp",
            "out_size": 32,
            "activation": "relu"
        }]
        out = [None, model_network[-1]["out_size"]]
        self.conv = Model(model_network,
                          input_size=[
                              None, args.concat_frames *
                              (1 if args.grayscale else 3), 8, 8
                          ])
        self.termination_model = Model([{
            "model_type": "mlp",
            "out_size": args.num_options,
            "activation": "sigmoid",
            "W": 0
        }],
                                       input_size=out)
        self.Q_val_model = Model([{
            "model_type": "mlp",
            "out_size": args.num_options,
            "activation": "linear",
            "W": 0
        }],
                                 input_size=out)
        self.options_model = MLP3D(input_size=out[1],
                                   num_options=args.num_options,
                                   out_size=num_actions,
                                   activation="softmax")
        self.params = self.conv.params + self.Q_val_model.params + self.options_model.params + self.termination_model.params
        self.set_rms_shared_weights(shared_arr)

        x = T.ftensor4()
        y = T.fvector()
        a = T.ivector()
        o = T.ivector()
        delib = T.fscalar()

        s = self.conv.apply(x / np.float32(255))
        intra_option_policy = self.options_model.apply(s, o)

        q_vals = self.Q_val_model.apply(s)
        disc_q = theano.gradient.disconnected_grad(q_vals)
        current_option_q = q_vals[T.arange(o.shape[0]), o]
        disc_opt_q = disc_q[T.arange(o.shape[0]), o]
        terms = self.termination_model.apply(s)
        o_term = terms[T.arange(o.shape[0]), o]
        V = T.max(q_vals, axis=1) * (1 - self.args.option_epsilon) + (
            self.args.option_epsilon * T.mean(q_vals, axis=1))
        disc_V = theano.gradient.disconnected_grad(V)

        aggr = T.mean  # T.sum
        log_eps = 0.0001

        critic_cost = aggr(args.critic_coef * 0.5 *
                           T.sqr(y - current_option_q))
        termination_grad = aggr(o_term * ((disc_opt_q - disc_V) + delib))
        entropy = -aggr(
            T.sum(intra_option_policy * T.log(intra_option_policy + log_eps),
                  axis=1)) * args.entropy_reg
        pg = aggr(
            (T.log(intra_option_policy[T.arange(a.shape[0]), a] + log_eps)) *
            (y - disc_opt_q))
        cost = pg + entropy - critic_cost - termination_grad

        grads = T.grad(cost * args.update_freq, self.params)
        # grads = T.grad(cost, self.params)
        updates, grad_rms, self.rms_weights = rmsprop(self.params,
                                                      grads,
                                                      clip=args.clip,
                                                      clip_type=args.clip_type)
        self.share_rms(shared_arr)

        self.get_state = theano.function([x], s, on_unused_input='warn')
        self.get_policy = theano.function([s, o], intra_option_policy)
        self.get_termination = theano.function([x], terms)
        self.get_q = theano.function([x], q_vals)
        self.get_q_from_s = theano.function([s], q_vals)
        self.get_V = theano.function([x], V)

        self.rms_grads = theano.function([x, a, y, o, delib],
                                         grad_rms,
                                         updates=updates,
                                         on_unused_input='warn')
        print "ALL COMPILED"

        if not self.args.testing:
            self.init_tracker()
        self.initialized = False
Example #5
class AOCAgent_THEANO():
    def __init__(self,
                 num_actions,
                 id_num,
                 shared_arr=None,
                 num_moves=None,
                 args=None):
        print "USING OPTION CRITIC"
        self.args = args
        self.id_num = id_num
        self.num_actions = num_actions
        self.num_moves = num_moves
        self.reset_storing()
        self.rng = np.random.RandomState(100 + id_num)
        # input is 8x8
        model_network = [{
            "model_type": "conv",
            "filter_size": [4, 4],
            "pool": [1, 1],
            "stride": [2, 2],
            "out_size": 32,
            "activation": "relu"
        }, {
            "model_type": "conv",
            "filter_size": [3, 3],
            "pool": [1, 1],
            "stride": [2, 2],
            "out_size": 64,
            "activation": "relu"
        }, {
            "model_type": "mlp",
            "out_size": 48,
            "activation": "relu"
        }, {
            "model_type": "mlp",
            "out_size": 32,
            "activation": "relu"
        }]
        out = [None, model_network[-1]["out_size"]]
        self.conv = Model(model_network,
                          input_size=[
                              None, args.concat_frames *
                              (1 if args.grayscale else 3), 8, 8
                          ])
        self.termination_model = Model([{
            "model_type": "mlp",
            "out_size": args.num_options,
            "activation": "sigmoid",
            "W": 0
        }],
                                       input_size=out)
        self.Q_val_model = Model([{
            "model_type": "mlp",
            "out_size": args.num_options,
            "activation": "linear",
            "W": 0
        }],
                                 input_size=out)
        self.options_model = MLP3D(input_size=out[1],
                                   num_options=args.num_options,
                                   out_size=num_actions,
                                   activation="softmax")
        self.params = self.conv.params + self.Q_val_model.params + self.options_model.params + self.termination_model.params
        self.set_rms_shared_weights(shared_arr)

        x = T.ftensor4()
        y = T.fvector()
        a = T.ivector()
        o = T.ivector()
        delib = T.fscalar()

        s = self.conv.apply(x / np.float32(255))
        intra_option_policy = self.options_model.apply(s, o)

        q_vals = self.Q_val_model.apply(s)
        disc_q = theano.gradient.disconnected_grad(q_vals)
        current_option_q = q_vals[T.arange(o.shape[0]), o]
        disc_opt_q = disc_q[T.arange(o.shape[0]), o]
        terms = self.termination_model.apply(s)
        o_term = terms[T.arange(o.shape[0]), o]
        V = T.max(q_vals, axis=1) * (1 - self.args.option_epsilon) + (
            self.args.option_epsilon * T.mean(q_vals, axis=1))
        disc_V = theano.gradient.disconnected_grad(V)

        aggr = T.mean  # T.sum
        log_eps = 0.0001

        critic_cost = aggr(args.critic_coef * 0.5 *
                           T.sqr(y - current_option_q))
        termination_grad = aggr(o_term * ((disc_opt_q - disc_V) + delib))
        entropy = -aggr(
            T.sum(intra_option_policy * T.log(intra_option_policy + log_eps),
                  axis=1)) * args.entropy_reg
        pg = aggr(
            (T.log(intra_option_policy[T.arange(a.shape[0]), a] + log_eps)) *
            (y - disc_opt_q))
        cost = pg + entropy - critic_cost - termination_grad

        grads = T.grad(cost * args.update_freq, self.params)
        # grads = T.grad(cost, self.params)
        updates, grad_rms, self.rms_weights = rmsprop(self.params,
                                                      grads,
                                                      clip=args.clip,
                                                      clip_type=args.clip_type)
        self.share_rms(shared_arr)

        self.get_state = theano.function([x], s, on_unused_input='warn')
        self.get_policy = theano.function([s, o], intra_option_policy)
        self.get_termination = theano.function([x], terms)
        self.get_q = theano.function([x], q_vals)
        self.get_q_from_s = theano.function([s], q_vals)
        self.get_V = theano.function([x], V)

        self.rms_grads = theano.function([x, a, y, o, delib],
                                         grad_rms,
                                         updates=updates,
                                         on_unused_input='warn')
        print "ALL COMPILED"

        if not self.args.testing:
            self.init_tracker()
        self.initialized = False

    def update_weights(self, x, a, y, o, moves, delib):
        args = self.args
        self.num_moves.value += moves
        lr = np.max([
            args.init_lr * (args.max_num_frames - self.num_moves.value) /
            args.max_num_frames, 0
        ]).astype("float32")

        cumul = self.rms_grads(x, a, y, o, delib)
        for i in range(len(cumul)):
            self.shared_arr[i] += lr * cumul[i]
            self.params[i].set_value(self.shared_arr[i])
        return

    def load_values(self, values):
        assert (len(self.params + self.rms_weights) == len(values))
        for p, v in zip(self.params + self.rms_weights, values):
            p.set_value(v)

    def save_values(self, folder_name):
        pickle.dump([p.get_value() for p in self.params + self.rms_weights],
                    open(folder_name + "/tmp_model.pkl", "wb"))
        os.system("mv " + folder_name + "/tmp_model.pkl " + folder_name +
                  "/model.pkl")
        # try: # server creates too many core files
        #  os.system("rm ./core*")
        # except:
        #  pass

    def get_param_vals(self):
        return [m.get_value() for m in self.params + self.rms_weights]

    def set_rms_shared_weights(self, shared_arr):
        if shared_arr is not None:
            self.shared_arr = [
                np.frombuffer(s, dtype="float32").reshape(p.get_value().shape)
                for s, p in zip(shared_arr, self.params)
            ]
            self.rms_shared_arr = shared_arr[len(self.params):]
            if self.args.init_num_moves > 0:
                for s, p in zip(shared_arr, self.params):
                    p.set_value(
                        np.frombuffer(s, dtype="float32").reshape(
                            p.get_value().shape))
                print "LOADED VALUES"

    def share_rms(self, shared_arr):
        # Ties rms params between threads with borrow=True flag
        if self.args.rms_shared and shared_arr is not None:
            assert (len(self.rms_weights) == len(self.rms_shared_arr))
            for rms_w, s_rms_w in zip(self.rms_weights, self.rms_shared_arr):
                rms_w.set_value(np.frombuffer(
                    s_rms_w, dtype="float32").reshape(rms_w.get_value().shape),
                                borrow=True)

    def get_action(self, x):
        p = self.get_policy([self.current_s], [self.current_o])
        return self.rng.choice(range(self.num_actions), p=p[-1])

    def get_policy_over_options(self, s):
        return self.get_q_from_s(s)[0].argmax(
        ) if self.rng.rand() > self.args.option_epsilon else self.rng.randint(
            self.args.num_options)

    def update_internal_state(self, x):
        self.current_s = self.get_state([x])[0]
        self.delib = self.args.delib_cost

        if self.terminated:
            self.current_o = self.get_policy_over_options([self.current_s])
            self.o_tracker_chosen[self.current_o] += 1

        self.o_tracker_steps[self.current_o] += 1

    def init_tracker(self):
        csv_things = ["moves", "reward", "term_prob"]
        csv_things += [
            "opt_chosen" + str(ccc) for ccc in range(self.args.num_options)
        ]
        csv_things += [
            "opt_steps" + str(ccc) for ccc in range(self.args.num_options)
        ]
        with open(self.args.folder_name + "/data.csv", "a") as myfile:
            myfile.write(",".join([str(cc) for cc in csv_things]) + "\n")

    def tracker(self):
        term_prob = float(self.termination_counter) / self.frame_counter * 100
        csv_things = [
            self.num_moves.value, self.total_reward,
            round(term_prob, 1)
        ] + list(self.o_tracker_chosen) + list(self.o_tracker_steps)
        print self.o_tracker_steps
        with open(self.args.folder_name + "/data.csv", "a") as myfile:
            myfile.write(",".join([str(cc) for cc in csv_things]) + "\n")

    def reset_tracker(self):
        self.termination_counter = 0
        self.frame_counter = 0
        self.o_tracker_chosen = np.zeros(self.args.num_options, )
        self.o_tracker_steps = np.zeros(self.args.num_options, )

    def reset(self, x):
        if not self.args.testing and self.initialized: self.tracker()
        self.total_reward = 0
        self.terminated = True
        self.reset_tracker()
        self.update_internal_state(x)
        self.initialized = True

    def reset_storing(self):
        self.a_seq = np.zeros((self.args.max_update_freq, ), dtype="int32")
        self.o_seq = np.zeros((self.args.max_update_freq, ), dtype="int32")
        self.r_seq = np.zeros((self.args.max_update_freq, ), dtype="float32")
        self.x_seq = np.zeros(
            (self.args.max_update_freq, self.args.concat_frames *
             (1 if self.args.grayscale else 3), 8, 8),
            dtype="float32")
        self.t_counter = 0

    def store(self, x, new_x, action, raw_reward, done, death):
        end_ep = done or (death and self.args.death_ends_episode)
        self.frame_counter += 1

        self.total_reward += raw_reward
        reward = np.clip(raw_reward, -1, 1)

        self.terminated = self.get_termination(
            [new_x])[0][self.current_o] > self.rng.rand()
        self.termination_counter += self.terminated

        self.x_seq[self.t_counter] = np.copy(x)
        self.o_seq[self.t_counter] = np.copy(self.current_o)
        self.a_seq[self.t_counter] = np.copy(action)
        self.r_seq[self.t_counter] = np.copy(
            float(reward)) - (float(self.terminated) * self.delib *
                              (1 - float(end_ep)))

        self.t_counter += 1

        # do n-step return to option termination.
        # cut off at self.args.max_update_freq
        # min steps: self.args.update_freq (usually 5 like a3c)
        # this doesn't make option length a minimum of 5 (they can still terminate). only batch size
        option_term = (self.terminated
                       and self.t_counter >= self.args.update_freq)
        if self.t_counter == self.args.max_update_freq or end_ep or option_term:
            if not self.args.testing:
                V = self.get_V([new_x])[0] if self.terminated else self.get_q(
                    [new_x])[0][self.current_o]
                R = 0 if end_ep else V
                V = []
                for j in range(self.t_counter - 1, -1, -1):
                    R = np.float32(self.r_seq[j] + self.args.gamma * R)
                    V.append(R)
                self.update_weights(self.x_seq[:self.t_counter],
                                    self.a_seq[:self.t_counter], V[::-1],
                                    self.o_seq[:self.t_counter],
                                    self.t_counter,
                                    self.delib + self.args.margin_cost)
            self.reset_storing()
        if not end_ep:
            self.update_internal_state(new_x)
Example #6
    def __init__(self,
                 num_actions,
                 id_num,
                 shared_arr=None,
                 num_moves=None,
                 args=None):
        print "USING OPTION CRITIC"
        self.args = args
        self.id_num = id_num
        self.num_actions = num_actions
        self.num_moves = num_moves
        self.reset_storing()
        self.rng = np.random.RandomState(100 + id_num)
        model_network = [{
            "model_type": "conv",
            "filter_size": [8, 8],
            "pool": [1, 1],
            "stride": [4, 4],
            "out_size": 16,
            "activation": "relu"
        }, {
            "model_type": "conv",
            "filter_size": [4, 4],
            "pool": [1, 1],
            "stride": [2, 2],
            "out_size": 32,
            "activation": "relu"
        }, {
            "model_type": "mlp",
            "out_size": 256,
            "activation": "relu"
        }]
        out = [None, model_network[-1]["out_size"]]
        self.conv = Model(model_network,
                          input_size=[
                              None, args.concat_frames *
                              (1 if args.grayscale else 3), 84, 84
                          ])
        self.termination_model = Model([{
            "model_type": "mlp",
            "out_size": args.num_options,
            "activation": "sigmoid",
            "W": 0
        }],
                                       input_size=out)
        self.Q_val_model = Model([{
            "model_type": "mlp",
            "out_size": args.num_options,
            "activation": "linear",
            "W": 0
        }],
                                 input_size=out)
        self.Sigma_val_model = Model(
            [{
                "model_type": "mlp",
                "out_size": args.num_options,
                "activation": "linear",
                "W": 0
            }],
            input_size=out)  #Sigma: Variance(state,option)
        self.options_model = MLP3D(input_size=out[1],
                                   num_options=args.num_options,
                                   out_size=num_actions,
                                   activation="softmax")
        self.params = self.conv.params + self.Q_val_model.params + self.options_model.params + self.termination_model.params + self.Sigma_val_model.params
        self.set_rms_shared_weights(shared_arr)

        x = T.ftensor4()  #observation
        y = T.fvector()  #G
        y_sigma = T.fvector()  # target variance for (s, o); like G is the target for Q(s, o)
        a = T.ivector()  #action
        o = T.ivector()  #option
        delib = T.fscalar()

        s = self.conv.apply(x / np.float32(255))  #states
        intra_option_policy = self.options_model.apply(s, o)

        q_vals = self.Q_val_model.apply(s)
        sigma_vals = self.Sigma_val_model.apply(s)
        disc_q = theano.gradient.disconnected_grad(q_vals)
        disc_sigma = theano.gradient.disconnected_grad(sigma_vals)
        current_option_q = q_vals[T.arange(o.shape[0]), o]
        current_option_sigma = sigma_vals[T.arange(o.shape[0]), o]
        disc_opt_q = disc_q[T.arange(o.shape[0]), o]
        disc_opt_sigma = disc_sigma[T.arange(o.shape[0]), o]
        terms = self.termination_model.apply(s)
        o_term = terms[T.arange(o.shape[0]), o]  #termination at option (s,o)
        V = T.max(q_vals, axis=1) * (1 - self.args.option_epsilon) + (
            self.args.option_epsilon * T.mean(q_vals, axis=1))
        V_sigma = T.min(sigma_vals, axis=1)*(1-self.args.option_epsilon) \
                  + (self.args.option_epsilon*T.mean(sigma_vals, axis=1))  # ideal V_sigma is min[sigma(s,o)] over options, which is what we want to reach
        disc_V = theano.gradient.disconnected_grad(V)
        disc_V_sigma = theano.gradient.disconnected_grad(V_sigma)

        aggr = T.mean  #T.sum
        log_eps = 0.0001

        critic_cost = aggr(args.critic_coef * 0.5 *
                           T.sqr(y - current_option_q))  # update for Q(s,o)
        critic_sigma_cost = aggr(
            args.critic_sigma_coef * 0.5 *
            T.sqr(y_sigma - current_option_sigma))  # update for sigma(s,o)
        # termination_grad = aggr(o_term*((disc_opt_q-disc_V)+delib))
        advantage_q = disc_opt_q - disc_V  # advantage function for Q value
        advantage_sigma = disc_opt_sigma - disc_V_sigma  #advantage function for sigma
        termination_grad = aggr(
            o_term *
            ((advantage_q - self.args.psi * advantage_sigma) + delib))  #CHANGE
        entropy = -aggr(
            T.sum(intra_option_policy * T.log(intra_option_policy + log_eps),
                  axis=1)) * args.entropy_reg
        pg = aggr(
            (T.log(intra_option_policy[T.arange(a.shape[0]), a] + log_eps)) *
            ((y - disc_opt_q) - self.args.psi * (y_sigma - disc_opt_sigma)))
        cost = pg + entropy - critic_cost - critic_sigma_cost - termination_grad

        grads = T.grad(cost * args.update_freq, self.params)
        #grads = T.grad(cost, self.params)
        updates, grad_rms, self.rms_weights = rmsprop(self.params,
                                                      grads,
                                                      clip=args.clip,
                                                      clip_type=args.clip_type)
        self.share_rms(shared_arr)

        self.get_state = theano.function([x], s, on_unused_input='warn')
        self.get_policy = theano.function([s, o], intra_option_policy)
        self.get_termination = theano.function([x], terms)
        self.get_q = theano.function([x], q_vals)
        self.get_sigma = theano.function([x], sigma_vals)
        self.get_q_from_s = theano.function([s], q_vals)
        self.get_sigma_from_s = theano.function([s], sigma_vals)
        self.get_V = theano.function([x], V)
        self.get_V_sigma = theano.function([x], V_sigma)

        self.rms_grads = theano.function([x, a, y, y_sigma, o, delib],
                                         grad_rms,
                                         updates=updates,
                                         on_unused_input='warn')
        print "ALL COMPILED"

        if not self.args.testing:
            self.init_tracker()
        self.initialized = False
Example #7
 def unpack_model(self, params):
     self.Model = Model.from_config(params["config"])
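A possible counterpart, sketched here under a hypothetical pack_model name; it assumes the config round-trip shown in Examples #8 and #9, where model.config() produces the dict that Model.from_config consumes.

 # Hypothetical inverse of unpack_model (sketch, not from the original source)
 def pack_model(self):
     return {"config": self.Model.config()}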
Example #8
nx = 2
ny = 2
nb = 123
w = 0.1

#model = Sequential()
#model.add(Dense(3, input_shape=(nx,), activation="tanh"))
#model.add(Dense(ny, activation="softmax"))

inp = Input((nx, ), "input")
l1 = Linear(3, name="l1")(inp)
act = Tanh()(l1)
l2 = Linear(ny, name="l2")(act)
out = CrossEntropyLoss(ny)(l2)
model = Model(inp, out)

sgd = SGD(0.1, momentum=0.95)

model.compile(trainer=sgd)

import pprint

cfg = model.config()
pprint.pprint(cfg)

m1 = Model.from_config(cfg)
pprint.pprint(m1.config())

pprint.pprint(model.get_params())
Example #9
from nnet import Model
from nnet.core_layers import Linear, Input, Flatten
from nnet.activations import Tanh, Sigmoid, SoftPlus
from nnet.losses import CrossEntropyLoss, L2Loss
from nnet.callbacks import Callback, Callbacks
import numpy as np
import random
from nnet.trainers import SGD

inp = Input((28, 28), "images")
top = Flatten()(inp)
top = Tanh()(Linear(28 * 28 * 10, name="l1")(top))
top = Tanh()(Linear(1000, name="l2")(top))
top = Linear(10, name="out")(top)
loss = CrossEntropyLoss("LogLoss")(top)

model = Model(inp, loss)

cfg = model.config()

m1 = Model.from_config(cfg)
Example #10
 def __init__(self):
     super().__init__()
     self.model = Model()
Example #11
    def __init__(self,
                 num_actions,
                 id_num,
                 shared_arr=None,
                 num_moves=None,
                 args=None):
        print "USING OPTION CRITIC"
        self.args = args
        self.id_num = id_num
        self.num_actions = num_actions
        self.num_moves = num_moves
        self.reset_storing()
        self.rng = np.random.RandomState(100 + id_num)
        model_network = [{
            "model_type": "conv",
            "filter_size": [8, 8],
            "pool": [1, 1],
            "stride": [4, 4],
            "out_size": 16,
            "activation": "relu"
        }, {
            "model_type": "conv",
            "filter_size": [4, 4],
            "pool": [1, 1],
            "stride": [2, 2],
            "out_size": 32,
            "activation": "relu"
        }, {
            "model_type": "mlp",
            "out_size": 256,
            "activation": "relu"
        }]
        out = [None, model_network[-1]["out_size"]]
        self.conv = Model(model_network,
                          input_size=[
                              None, args.concat_frames *
                              (1 if args.grayscale else 3), 84, 84
                          ])
        self.termination_model = Model([{
            "model_type": "mlp",
            "out_size": args.num_options,
            "activation": "sigmoid",
            "W": 0
        }],
                                       input_size=out)
        self.Q_val_model = Model([{
            "model_type": "mlp",
            "out_size": args.num_options,
            "activation": "linear",
            "W": 0
        }],
                                 input_size=out)
        self.options_model = MLP3D(input_size=out[1],
                                   num_options=args.num_options,
                                   out_size=num_actions,
                                   activation="softmax")
        self.params = self.conv.params + self.Q_val_model.params + self.options_model.params + self.termination_model.params
        self.set_rms_shared_weights(shared_arr)

        x = T.ftensor4()  # State
        y = T.fvector()  # Onestep Return?
        a = T.ivector()  # Action
        o = T.ivector()  # Option
        delib = T.fscalar()

        s = self.conv.apply(x / np.float32(255))
        intra_option_policy = self.options_model.apply(s, o)

        q_vals = self.Q_val_model.apply(
            s)  # Gets all of the Q values, given a state
        disc_q = theano.gradient.disconnected_grad(
            q_vals)  # Calculate all gradients (simultaneously learning)
        current_option_q = q_vals[T.arange(
            o.shape[0]
        ), o]  # Given that we are in option o (and s, from above), get all q values for each action
        disc_opt_q = disc_q[T.arange(o.shape[0]),
                            o]  # get all relevant gradients for each action
        terms = self.termination_model.apply(s)
        o_term = terms[T.arange(o.shape[0]),
                       o]  # get all terminations for each option
        V = T.max(q_vals, axis=1) * (1 - self.args.option_epsilon) + (
            self.args.option_epsilon * T.mean(q_vals, axis=1)
        )  # same as Value function in A3C; has value for each policy, argmax it
        disc_V = theano.gradient.disconnected_grad(V)

        aggr = T.mean  #T.sum -- function call
        log_eps = 0.0001

        critic_cost = aggr(
            args.critic_coef * 0.5 * T.sqr(y - current_option_q)
        )  # Value Loss - How much better was actual reward than q value; Same as A3c, but again, becomes q value
        termination_grad = aggr(
            o_term *
            ((disc_opt_q - disc_V) + delib))  # NOTE: Delib always <= 0
        entropy = -aggr(
            T.sum(intra_option_policy * T.log(intra_option_policy + log_eps),
                  axis=1)
        ) * args.entropy_reg  # Traditional entropy; discourages actions that dominate too quickly
        pg = aggr(
            (T.log(intra_option_policy[T.arange(a.shape[0]), a] + log_eps)) *
            (y - disc_opt_q))  # Policy loss
        cost = pg + entropy - critic_cost - termination_grad

        # NOTE: DO THIS AS TF DOES WITH THREADRUNNER
        grads = T.grad(cost * args.update_freq,
                       self.params)  # update gradients
        #grads = T.grad(cost, self.params)
        updates, grad_rms, self.rms_weights = rmsprop(self.params,
                                                      grads,
                                                      clip=args.clip,
                                                      clip_type=args.clip_type)
        self.share_rms(shared_arr)

        # Get functions
        self.get_state = theano.function([x], s, on_unused_input='warn')
        self.get_policy = theano.function([s, o], intra_option_policy)
        self.get_termination = theano.function([s], terms)
        self.get_q = theano.function([s], q_vals)
        self.get_V = theano.function([s], V)

        # Compute RMS gradients
        # By default, updates = computing, updating all variables using rmsprop() function
        self.rms_grads = theano.function(
            [x, a, y, o, delib],
            grad_rms,
            updates=updates,
            on_unused_input='warn'
        )  # http://deeplearning.net/software/theano/tutorial/examples.html#basictutexamples
        print "ALL COMPILED"

        if not self.args.testing:
            self.init_tracker()
        self.initialized = False
Example #12
class AOCAgent_THEANO():
    def __init__(self,
                 num_actions,
                 id_num,
                 shared_arr=None,
                 num_moves=None,
                 args=None):
        print "USING OPTION CRITIC"
        self.args = args
        self.id_num = id_num
        self.num_actions = num_actions
        self.num_moves = num_moves
        self.reset_storing()
        self.rng = np.random.RandomState(100 + id_num)
        model_network = [{
            "model_type": "conv",
            "filter_size": [8, 8],
            "pool": [1, 1],
            "stride": [4, 4],
            "out_size": 16,
            "activation": "relu"
        }, {
            "model_type": "conv",
            "filter_size": [4, 4],
            "pool": [1, 1],
            "stride": [2, 2],
            "out_size": 32,
            "activation": "relu"
        }, {
            "model_type": "mlp",
            "out_size": 256,
            "activation": "relu"
        }]
        out = [None, model_network[-1]["out_size"]]
        self.conv = Model(model_network,
                          input_size=[
                              None, args.concat_frames *
                              (1 if args.grayscale else 3), 84, 84
                          ])
        self.termination_model = Model([{
            "model_type": "mlp",
            "out_size": args.num_options,
            "activation": "sigmoid",
            "W": 0
        }],
                                       input_size=out)
        self.Q_val_model = Model([{
            "model_type": "mlp",
            "out_size": args.num_options,
            "activation": "linear",
            "W": 0
        }],
                                 input_size=out)
        self.options_model = MLP3D(input_size=out[1],
                                   num_options=args.num_options,
                                   out_size=num_actions,
                                   activation="softmax")
        self.params = self.conv.params + self.Q_val_model.params + self.options_model.params + self.termination_model.params
        self.set_rms_shared_weights(shared_arr)

        x = T.ftensor4()  # State
        y = T.fvector()  # Onestep Return?
        a = T.ivector()  # Action
        o = T.ivector()  # Option
        delib = T.fscalar()

        s = self.conv.apply(x / np.float32(255))
        intra_option_policy = self.options_model.apply(s, o)

        q_vals = self.Q_val_model.apply(
            s)  # Gets all of the Q values, given a state
        disc_q = theano.gradient.disconnected_grad(
            q_vals)  # Calculate all gradients (simultaneously learning)
        current_option_q = q_vals[T.arange(
            o.shape[0]
        ), o]  # Given that we are in option o (and s, from above), get all q values for each action
        disc_opt_q = disc_q[T.arange(o.shape[0]),
                            o]  # get all relevant gradients for each action
        terms = self.termination_model.apply(s)
        o_term = terms[T.arange(o.shape[0]),
                       o]  # get all terminations for each option
        V = T.max(q_vals, axis=1) * (1 - self.args.option_epsilon) + (
            self.args.option_epsilon * T.mean(q_vals, axis=1)
        )  # same as Value function in A3C; has value for each policy, argmax it
        disc_V = theano.gradient.disconnected_grad(V)

        aggr = T.mean  #T.sum -- function call
        log_eps = 0.0001

        critic_cost = aggr(
            args.critic_coef * 0.5 * T.sqr(y - current_option_q)
        )  # Value Loss - How much better was actual reward than q value; Same as A3c, but again, becomes q value
        termination_grad = aggr(
            o_term *
            ((disc_opt_q - disc_V) + delib))  # NOTE: Delib always <= 0
        entropy = -aggr(
            T.sum(intra_option_policy * T.log(intra_option_policy + log_eps),
                  axis=1)
        ) * args.entropy_reg  # Traditional entropy; discourages actions that dominate too quickly
        pg = aggr(
            (T.log(intra_option_policy[T.arange(a.shape[0]), a] + log_eps)) *
            (y - disc_opt_q))  # Policy loss
        cost = pg + entropy - critic_cost - termination_grad

        # NOTE: DO THIS AS TF DOES WITH THREADRUNNER
        grads = T.grad(cost * args.update_freq,
                       self.params)  # update gradients
        #grads = T.grad(cost, self.params)
        updates, grad_rms, self.rms_weights = rmsprop(self.params,
                                                      grads,
                                                      clip=args.clip,
                                                      clip_type=args.clip_type)
        self.share_rms(shared_arr)

        # Get functions
        self.get_state = theano.function([x], s, on_unused_input='warn')
        self.get_policy = theano.function([s, o], intra_option_policy)
        self.get_termination = theano.function([s], terms)
        self.get_q = theano.function([s], q_vals)
        self.get_V = theano.function([s], V)

        # Compute RMS gradients
        # By default, updates = computing, updating all variables using rmsprop() function
        self.rms_grads = theano.function(
            [x, a, y, o, delib],
            grad_rms,
            updates=updates,
            on_unused_input='warn'
        )  # http://deeplearning.net/software/theano/tutorial/examples.html#basictutexamples
        print "ALL COMPILED"

        if not self.args.testing:
            self.init_tracker()
        self.initialized = False

    def update_weights(self, x, a, y, o, moves, delib):
        args = self.args
        self.num_moves.value += moves
        lr = np.max([
            args.init_lr * (args.max_num_frames - self.num_moves.value) /
            args.max_num_frames, 0
        ]).astype("float32")

        cumul = self.rms_grads(x, a, y, o, delib)
        for i in range(len(cumul)):
            self.shared_arr[i] += lr * cumul[i]
            self.params[i].set_value(self.shared_arr[i])
        return

    def load_values(self, values):
        assert (len(self.params + self.rms_weights) == len(values))
        for p, v in zip(self.params + self.rms_weights, values):
            p.set_value(v)

    def save_values(self, folder_name):
        pickle.dump([p.get_value() for p in self.params + self.rms_weights],
                    open(folder_name + "/tmp_model.pkl", "wb"))
        os.system("mv " + folder_name + "/tmp_model.pkl " + folder_name +
                  "/model.pkl")
        #try: # server creates too many core files
        #  os.system("rm ./core*")
        #except:
        #  pass

    def get_param_vals(self):
        return [m.get_value() for m in self.params + self.rms_weights]

    def set_rms_shared_weights(self, shared_arr):
        if shared_arr is not None:
            self.shared_arr = [
                np.frombuffer(s, dtype="float32").reshape(p.get_value().shape)
                for s, p in zip(shared_arr, self.params)
            ]
            self.rms_shared_arr = shared_arr[len(self.params):]
            if self.args.init_num_moves > 0:
                for s, p in zip(shared_arr, self.params):
                    p.set_value(
                        np.frombuffer(s, dtype="float32").reshape(
                            p.get_value().shape))
                print "LOADED VALUES"

    def share_rms(self, shared_arr):
        # Ties rms params between threads with borrow=True flag
        if self.args.rms_shared and shared_arr is not None:
            assert (len(self.rms_weights) == len(self.rms_shared_arr))
            for rms_w, s_rms_w in zip(self.rms_weights, self.rms_shared_arr):
                rms_w.set_value(np.frombuffer(
                    s_rms_w, dtype="float32").reshape(rms_w.get_value().shape),
                                borrow=True)

    def get_action(self, x):
        p = self.get_policy([self.current_s], [self.current_o])
        return self.rng.choice(range(self.num_actions), p=p[-1])

    def get_policy_over_options(self, s):
        return self.get_q(s)[0].argmax(
        ) if self.rng.rand() > self.args.option_epsilon else self.rng.randint(
            self.args.num_options)

    def update_internal_state(self, x):
        self.current_s = self.get_state([x])[0]
        self.delib = self.args.delib_cost

        if self.terminated:
            self.current_o = self.get_policy_over_options([self.current_s])
            self.o_tracker_chosen[self.current_o] += 1

        self.o_tracker_steps[self.current_o] += 1

    def init_tracker(self):
        csv_things = ["moves", "reward", "term_prob"]
        csv_things += [
            "opt_chosen" + str(ccc) for ccc in range(self.args.num_options)
        ]
        csv_things += [
            "opt_steps" + str(ccc) for ccc in range(self.args.num_options)
        ]
        with open(self.args.folder_name + "/data.csv", "a") as myfile:
            myfile.write(",".join([str(cc) for cc in csv_things]) + "\n")

    def tracker(self):
        term_prob = float(self.termination_counter) / self.frame_counter * 100
        csv_things = [
            self.num_moves.value, self.total_reward,
            round(term_prob, 1)
        ] + list(self.o_tracker_chosen) + list(self.o_tracker_steps)
        with open(self.args.folder_name + "/data.csv", "a") as myfile:
            myfile.write(",".join([str(cc) for cc in csv_things]) + "\n")

    def reset_tracker(self):
        self.termination_counter = 0
        self.frame_counter = 0
        self.o_tracker_chosen = np.zeros(self.args.num_options, )
        self.o_tracker_steps = np.zeros(self.args.num_options, )

    def reset(self, x):
        if not self.args.testing and self.initialized: self.tracker()
        self.total_reward = 0
        self.terminated = True
        self.reset_tracker()
        self.update_internal_state(x)
        self.initialized = True

    def reset_storing(self):
        self.a_seq = np.zeros((self.args.max_update_freq, ), dtype="int32")
        self.o_seq = np.zeros((self.args.max_update_freq, ), dtype="int32")
        self.r_seq = np.zeros((self.args.max_update_freq, ), dtype="float32")
        self.x_seq = np.zeros(
            (self.args.max_update_freq, self.args.concat_frames *
             (1 if self.args.grayscale else 3), 84, 84),
            dtype="float32")
        self.t_counter = 0

    def store(self, x, new_x, action, raw_reward, done, death):
        end_ep = done or (death and self.args.death_ends_episode)
        self.frame_counter += 1

        self.total_reward += raw_reward
        reward = np.clip(raw_reward, -1, 1)

        self.x_seq[self.t_counter] = np.copy(x)
        self.o_seq[self.t_counter] = np.copy(self.current_o)
        self.a_seq[self.t_counter] = np.copy(action)
        self.r_seq[self.t_counter] = np.copy(float(reward)) - (float(
            self.terminated) * self.delib * float(self.frame_counter > 1))

        self.terminated = self.get_termination(
            [self.current_s])[0][self.current_o] > self.rng.rand()
        self.termination_counter += self.terminated

        self.t_counter += 1

        # do n-step return to option termination.
        # cut off at self.args.max_update_freq
        # min steps: self.args.update_freq (usually 5 like a3c)
        # this doesn't make option length a minimum of 5 (they can still terminate). only batch size
        option_term = (self.terminated
                       and self.t_counter >= self.args.update_freq)
        if self.t_counter == self.args.max_update_freq or end_ep or option_term:  # Time to update
            if not self.args.testing:
                d = (self.delib * float(self.frame_counter > 1)
                     )  # add delib if termination because it isn't part of V
                V = self.get_V([self.current_s
                                ])[0] - d if self.terminated else self.get_q(
                                    [self.current_s])[0][self.current_o]
                R = 0 if end_ep else V
                V = []
                for j in range(self.t_counter - 1, -1,
                               -1):  # Easy way to reset to 0
                    R = np.float32(self.r_seq[j] +
                                   self.args.gamma * R)  # discount
                    V.append(R)
                self.update_weights(self.x_seq[:self.t_counter],
                                    self.a_seq[:self.t_counter], V[::-1],
                                    self.o_seq[:self.t_counter],
                                    self.t_counter,
                                    self.delib + self.args.margin_cost)
            self.reset_storing()
        if not end_ep:
            self.update_internal_state(new_x)
Example #13
    def __init__(self,
                 model_network=None,
                 gamma=0.99,
                 learning_method="rmsprop",
                 batch_size=32,
                 input_size=None,
                 learning_params=None,
                 dnn_type=True,
                 clip_delta=0,
                 scale=255.,
                 double_q=False,
                 prioritized_exp_replay=False,
                 heads_num=1,
                 action_num=0):

        x = T.ftensor4()
        next_x = T.ftensor4()
        a = T.ivector()
        r = T.fvector()
        terminal = T.ivector()

        self.heads_num = heads_num
        self.action_num = action_num

        self.x_shared = theano.shared(
            np.zeros(tuple([batch_size] + input_size[1:]), dtype='float32'))
        self.next_x_shared = theano.shared(
            np.zeros(tuple([batch_size] + input_size[1:]), dtype='float32'))
        self.a_shared = theano.shared(np.zeros((batch_size), dtype='int32'))
        self.terminal_shared = theano.shared(
            np.zeros((batch_size), dtype='int32'))
        self.r_shared = theano.shared(np.zeros((batch_size), dtype='float32'))

        self.Q_model = Model(model_network,
                             input_size=input_size,
                             dnn_type=dnn_type)
        self.Q_prime_model = Model(model_network,
                                   input_size=input_size,
                                   dnn_type=dnn_type)

        if double_q:
            alt_actions = T.argmax(self.Q_model.apply(next_x / scale), axis=1)
            alt_actions = theano.gradient.disconnected_grad(alt_actions)
            y = r + (T.ones_like(terminal)-terminal)*gamma*\
            self.Q_prime_model.apply(next_x/scale)[T.arange(alt_actions.shape[0]), alt_actions]
        else:
            q_stack = self.Q_prime_model.apply(next_x / scale)
            q_list = [
                q_stack[T.arange(a.shape[0]),
                        k * self.action_num:(k + 1) * self.action_num]
                for k in range(self.heads_num)
            ]
            y_list = [
                r + (T.ones_like(terminal) - terminal) * gamma *
                T.max(q_list[k], axis=1) for k in range(self.heads_num)
            ]

            y_concat = theano.tensor.concatenate(y_list, axis=0)

            y = r + (T.ones_like(terminal) - terminal) * gamma * T.max(
                self.Q_prime_model.apply(next_x / scale), axis=1)

        all_q_vals = self.Q_model.apply(x / scale)
        q_vals = all_q_vals[T.arange(a.shape[0]), a]

        q_vals_list = [
            all_q_vals[T.arange(a.shape[0]), a + k * self.heads_num]
            for k in range(self.heads_num)
        ]
        q_vals_concat = theano.tensor.concatenate(q_vals_list, axis=0)

        # td_errors = y-q_vals

        td_errors = y_concat - q_vals_concat
        """
    if clip_delta > 0:
      td_errors = td_errors.clip(-clip_delta, clip_delta)
    cost = 0.5*td_errors**2
    """
        if clip_delta > 0:
            #TOOK THIS FROM GITHUB CODE

            # If we simply take the squared clipped diff as our loss,
            # then the gradient will be zero whenever the diff exceeds
            # the clip bounds. To avoid this, we extend the loss
            # linearly past the clip point to keep the gradient constant
            # in that regime.
            #
            # This is equivalent to declaring d loss/d q_vals to be
            # equal to the clipped diff, then backpropagating from
            # there, which is what the DeepMind implementation does.
            quadratic_part = T.minimum(abs(td_errors), clip_delta)
            linear_part = abs(td_errors) - quadratic_part
            cost = 0.5 * quadratic_part**2 + clip_delta * linear_part
        else:
            cost = 0.5 * td_errors**2
        #"""

        cost = T.sum(cost)

        print self.Q_model.params
        self.learning_method = self.Q_model.get_learning_method(
            learning_method, **learning_params)
        grads = T.grad(cost, self.Q_model.params)
        param_updates = self.learning_method.apply(self.Q_model.params, grads)

        target_updates = OrderedDict()
        for t, b in zip(self.Q_prime_model.params, self.Q_model.params):
            target_updates[t] = b

        givens = {
            x: self.x_shared,
            a: self.a_shared,
            r: self.r_shared,
            terminal: self.terminal_shared,
            next_x: self.next_x_shared
        }

        # print 'fast compile'
        # theano.config.mode = 'FAST_COMPILE'
        print "building"
        self.train_model = theano.function([],
                                           td_errors,
                                           updates=param_updates,
                                           givens=givens)
        print "compiled train_model (1/3)"
        self.pred_score = theano.function([],
                                          all_q_vals,
                                          givens={x: self.x_shared})
        print "compiled pred_score (2/3)"
        self.update_target_params = theano.function([], [],
                                                    updates=target_updates)
        print "compiled update_target_params (3/3)"
        self.update_target_params()
        print "updated target params"
Example #14
    def onBtnSolveClick(self):
        """
        Solve the IVP and plot the solution (if the plot option is checked).
        """
        condition = eval(self.ui.editInitialCondition.text())

        if self.data is None:
            try:
                self.interval = np.array(
                    eval(self.ui.editSolutionInterval.text()))
                self.interval = self.interval.reshape(len(self.interval), 1)
            except:
                message(self.ui.statusBar, 'Invalid solution interval format')
                return
        else:
            self.interval = self.data

        try:
            ivp = IVP(self.ui.editEquation.text(), condition[0], condition[1])
        except:
            message(self.ui.statusBar, 'Invalid equation type')
            return

        try:
            model = Model(self.ui.cbModel.currentText(),
                          int(self.ui.editNeurons.text()),
                          self.ui.cbActivationFunction.currentText())
            self.network = NNet(model.get(), self.ui.cbOptimizer.currentText(),
                                int(self.ui.editIterations.text()),
                                float(self.ui.editAccuracy.text()),
                                self.ui.cbTensorBoard.isChecked())
        except:
            message(self.ui.statusBar,
                    'An error has occurred creating the model')
            return

        self.network.set_updatable_widgets(self.progress_bar, self.lb_loss)

        enable_widgets(self.widgets, False)

        # Calculates the time of the training session
        start_t = time.time()
        update_status_bar(self.ui.statusBar, self.progress_bar, self.lb_loss)
        self.h = self.network.solve_ivp(ivp, self.interval)
        hide([self.progress_bar])
        end_t = time.time()

        # Shows the calculated time
        self.lb_loss.setText(self.lb_loss.text() + '. The training took ' +
                             '{:.2f}'.format(end_t - start_t) + ' secs.')

        try:
            self.values = self.network(self.interval)
            np.savetxt(self.path + "result.txt", self.values, fmt='%10.4f')
        except:
            message(self.ui.statusBar, 'An error occurred trying to save data!')
            return

        self.check_fo_saving_graph()

        enable_widgets(self.widgets, True)
        self.onCbModelSelectionChange(self.ui.cbModel.currentIndex())
Example #15
mnist_data = np.load("mnist.npz")
train_images, train_labels = mnist_data["train_images"], mnist_data[
    "train_labels"]
test_images, test_labels = mnist_data["test_images"], mnist_data["test_labels"]

print "Loaded %d mnis images: %s %s" % (len(train_images), train_images.shape,
                                        train_labels.shape)

inp = Input((28, 28), "images")
top = Flatten()(inp)
top = Tanh()(Linear(28 * 28 * 10, name="l1")(top))
top = Tanh()(Linear(1000, name="l2")(top))
top = Linear(10, name="out")(top)
loss = CrossEntropyLoss("LogLoss")(top)

model = Model(inp, loss)
model.compile(trainer=SGD(0.01, momentum=0.9))


def print_image(img):
    for row in img:
        line = ''.join([
            '#' if x > 0.7 else ('+' if x > 0.3 else ('.' if x > 0 else ' '))
            for x in row
        ])
        print line


class cb(Callback):
    def onEpochEnd(self, epoch, samples, total_samples, losses, metrics):
        print "epoch end:", epoch, samples, total_samples, losses, metrics
Example #16
mnist_data = np.load("mnist.npz")
train_images, train_labels = mnist_data["train_images"], mnist_data[
    "train_labels"]
test_images, test_labels = mnist_data["test_images"], mnist_data["test_labels"]

print "Loaded %d MNIST images: %s %s" % (len(train_images), train_images.shape,
                                         train_labels.shape)

inp = Input((28, 28), "images")
top = Flatten()(inp)
top = Tanh()(Linear(28 * 28 * 10, name="l1")(top))
top = Tanh()(Linear(1000, name="l2")(top))
top = Linear(10, name="out")(top)
loss = CrossEntropyLoss("LogLoss")(top)

model = Model(inp, loss)
model.compile(trainer=SGD(0.01, momentum=0.9))


def calc_grads(model, mbsize, x, y_):

    assert len(x) == len(y_)

    sumgrads = [np.zeros_like(p) for p in model.get_params()]
    sumloss = 0.0
    N = len(x)
    bar = Bar(
        "Calculating gradients...",
        suffix=
        "%(index)d/%(max)d - %(percent)d%% - loss:%(loss)f - acc:%(acc).1f%%",
        max=N)