Example #1
    def __init__(self, sess, state_len, actions_no, max_depth, weights,
                 workers_no):
        from ac_net import ACNet
        from net_evaluator import NetEvaluator
        self.processes = workers_no + 1

        self.state_len = state_len
        self.actions_no = actions_no
        self.eps = EPS
        self.evaluator = NetEvaluator(trainable=True)
        self.state = np.zeros(state_len)
        self.state[0] = 1

        self.max_depth = max_depth
        self.t = 1
        self.prev_acc = 0
        self.model = None

        self.current_max_depth = max_depth
        self.old_weights = weights
        self.grads = []
        self.samples = []
        self.memory = dict()
        self.ac_net = ACNet(sess, self.state_len, self.actions_no, 'worker')
        self.ac_net.set_weights(self.old_weights)
Example #2
    def __init__(self, sess, state_len, actions_no, actions_bounds, max_depth,
                 weights, workers_no, dataset, trainable):
        from ac_net import ACNet
        from net_evaluator import NetEvaluator

        self.processes = workers_no + 1
        self.actions_no = actions_no
        self.eps = EPS
        self.evaluator = NetEvaluator(ACTIONS_NO,
                                      trainable=trainable,
                                      dataset=dataset)
        self.state = STARTING_STATE.copy()
        self.state_len = state_len
        self.max_depth = max_depth
        self.t = 1
        self.prev_acc = self.evaluator.baseline
        self.model = None
        self.current_max_depth = self.max_depth
        self.old_weights = weights
        self.grads = []
        self.samples = []
        self.best_samples = []
        self.best_reward = -1000
        self.memory = dict()
        self.ac_net = ACNet(sess, self.state_len, self.actions_no,
                            actions_bounds, 'worker')
        self.ac_net.set_weights(self.old_weights)
Example #3
    def __init__(self, sess, state_len, actions_no, action_bounds, workers_no):
        from ac_net import ACNet
        self.T = 0
        self.processes = workers_no + 1
        # Build the shared master network.
        self.ac_net = ACNet(sess, state_len, actions_no, action_bounds,
                            'Master')
Example #4
    def __init__(self, sess, state_len, actions_no, workers_no):
        from ac_net import ACNet
        self.T = 0
        self.processes = workers_no + 1
        self.ac_net = ACNet(sess, state_len, actions_no, 'Master')
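
The master objects above only hold the shared ACNet. A hypothetical round of parameter exchange with one of the Worker classes shown below might look like this sketch; only update_ac_weights, run, get_grads, get_weights and set_weights are taken from these examples, while the apply step and the 0.01 factor are assumptions.

# Hypothetical sketch of one A3C-style sync round between a master (as above)
# and a worker (see the Worker examples below).
def sync_round(master, worker, lr=0.01):
    # push the current shared parameters to the worker
    worker.update_ac_weights(master.ac_net.get_weights())

    # let the worker gather experience and accumulate gradients
    worker.run()
    grads = worker.get_grads()

    # naive apply step; a real setup would feed the grads to an optimizer
    new_weights = [w + lr * g
                   for w, g in zip(master.ac_net.get_weights(), grads)]
    master.ac_net.set_weights(new_weights)
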
Example #5
class Worker(object):
    def __init__(self, sess, state_len, actions_no, max_depth, weights,
                 workers_no):
        from ac_net import ACNet
        from net_evaluator import NetEvaluator
        self.processes = workers_no + 1

        self.state_len = state_len
        self.actions_no = actions_no
        self.eps = EPS
        self.evaluator = NetEvaluator(trainable=True)
        self.state = np.zeros(state_len)
        self.state[0] = 1

        self.max_depth = max_depth
        self.t = 1
        self.prev_acc = 0
        self.model = None

        self.current_max_depth = max_depth
        self.old_weights = weights
        self.grads = []
        self.samples = []
        self.memory = dict()
        self.ac_net = ACNet(sess, self.state_len, self.actions_no, 'worker')
        self.ac_net.set_weights(self.old_weights)

    def update_ac_weights(self, weights):
        self.old_weights = weights
        if self.ac_net is not None:
            self.ac_net.set_weights(weights)

    def get_grads(self):
        return self.grads

    def calculate_gradients(self):
        grads = []
        weights = self.ac_net.get_weights()
        for i in range(len(weights)):
            grads.append(weights[i] - self.old_weights[i])
        return grads

    def fetch_from_memory(self, state):
        if state in self.memory:
            return self.memory[state]
        else:
            return None

    def add_to_memory(self, state, acc):
        self.memory[state] = acc

    def play(self):
        self.state = np.zeros(len(self.state))
        self.state[0] = 1
        self.prev_acc = self.evaluator.baseline
        del self.model
        self.model = None
        t_start = self.t

        episode_flag = True
        while episode_flag:

            policy, value = self.ac_net.predict(
                self.state.reshape(1, self.state_len))
            policy = policy[0]

            action = np.argmax(policy)
            reward, new_state = self.perform_action(action)
            self.state = new_state
            self.t += 1
            if self.t - t_start >= self.current_max_depth:
                episode_flag = False

        return self.prev_acc, self.state

    def run(self):

        self.grads = []
        self.t = 1
        self.episodes = 0
        self.samples = []
        self.eps = self.eps * EPS_RED_FACTOR

        while self.t <= BATCH_SIZE:

            self.state = np.zeros(len(self.state))
            self.state[0] = 1
            self.prev_acc = self.evaluator.baseline
            del self.model

            self.model = None

            t_start = self.t
            s_buffer = []
            r_buffer = []
            a_buffer = []

            episode_flag = True
            while episode_flag:

                policy, value = self.ac_net.predict(
                    self.state.reshape(1, self.state_len))

                policy = policy[0]
                value = value[0]
                action = np.random.choice(self.actions_no, p=policy)
                if np.random.uniform() < self.eps:
                    action = np.random.choice(self.actions_no)

                reward, new_state = self.perform_action(action)

                s_buffer.append(self.state)
                r_buffer.append(reward)
                a_buffer.append(action)

                self.state = new_state
                self.t += 1
                self.print_episode(policy, action, value, reward)
                if self.t - t_start >= self.current_max_depth:
                    episode_flag = False

            self.episodes += 1

            R = 0.0
            rev_rewards = []
            counter = 0
            for r in reversed(r_buffer):
                if counter == self.current_max_depth:
                    counter = 0
                    R = 0
                R = R * gamma + r
                rev_rewards.append(R)
                counter += 1

            for reward, state, action in zip(rev_rewards, reversed(s_buffer),
                                             reversed(a_buffer)):
                self.samples.append((state, action, reward))

            np.random.shuffle(self.samples)

            # Transform to column vectors
            state, action, reward = list(map(np.array, zip(*self.samples)))
            v_l, p_l, e, g_n, v_n, grads = self.ac_net.fit(
                state, action, reward)
            self.samples = []

            for i in range(len(grads)):
                if len(self.grads) == i:
                    self.grads.append(grads[i])
                else:
                    self.grads[i] = self.grads[i] + grads[i]

        if self.current_max_depth < self.max_depth:
            self.current_max_depth += 1

        return self.prev_acc, self.state
        # return self.play()

    def print_episode(self, policy, action, value, reward):
        if DEBUG:
            print('Policy :\n', np.array2string(policy, precision=3))
            print('Action :\n', action)
            print('Value  :\n', np.array2string(value, precision=3))
            print('State :', self.state)
            print('Reward : %.3f' % reward, 'Accuracy : %.3f' % self.prev_acc)

    def perform_action(self, action):
        # Get new state
        new_state = self.update_state(action)
        # Expand model and evaluate
        acc = self.fetch_from_memory(str(new_state))
        if acc is None:
            acc = self.evaluator.evaluate_model(new_state,
                                                epochs=TRAINING_EPOCHS)
            self.add_to_memory(str(new_state), acc)
        # Get the reward
        reward = acc - self.prev_acc

        self.prev_acc = acc
        return reward, new_state

    def update_state(self, action, old_state=None):
        '''
            Update the state, based on the action taken
        '''

        if old_state is None:
            old_state = np.copy(self.state)

        new_state = np.copy(old_state)

        # If we added a layer
        if action != 0:
            onehot_action = np.zeros(self.actions_no - 1)
            onehot_action[action - 1] = 1
            index = 1
            for depth in range(self.max_depth):
                start = depth * (self.actions_no - 1) + 1
                actives = 0
                for i in range(self.actions_no - 1):
                    actives += old_state[start + i]
                if actives == 0:
                    index = start
                    break
            for i in range(self.actions_no - 1):
                new_state[index + i] = onehot_action[i]
        return new_state
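
The inner loop of run() above accumulates discounted returns by walking the episode's rewards in reverse (R = R * gamma + r). A standalone NumPy sketch of that computation, with made-up rewards and discount factor purely for illustration:

import numpy as np

# Illustrative n-step return computation mirroring the reversed loop in run();
# the reward values and gamma are made up.
gamma = 0.9
r_buffer = [0.02, -0.01, 0.05, 0.10]   # per-step rewards of one episode

R = 0.0
rev_rewards = []
for r in reversed(r_buffer):
    R = R * gamma + r
    rev_rewards.append(R)
returns = list(reversed(rev_rewards))  # discounted return from each step onward

print(np.round(returns, 3))
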
Example #6
import numpy as np

from ac_net import ACNet

data = np.load('3_training_data_2_balanced_400k.npy', allow_pickle=True)

WIDTH = 64
HEIGHT = 48

VERSION = 3
EPOCHS = 50
SAMPLES = 400
MODEL = 'acnet'

MODEL_NAME = '{}-v{}-{}-epochs-{}k-samples.model'.format(
    MODEL, VERSION, EPOCHS, SAMPLES)

# initialize a new model with the given width and height

model = ACNet(WIDTH, HEIGHT)

# train-test split = 75-5-20 (TRAIN, VAL, TEST) %
# 400k samples balanced data- FROM 87939, TRAIN = 65954, VAL = 4397, TEST = 17588 (65954 + 4397 = 70351)
# 400k samples balanced data- FROM 92820, TRAIN = 69615, VAL = 4641, TEST = 18564 (69615 + 4641 = 74256)
train = data[:-18564]  # removed testing data from total
val = train[-4641:]  # validation slice taken from the tail of the training data
test = data[-18564:]  # reserved test data from total

# train data
X = np.array([i[0] for i in train]).reshape(-1, WIDTH, HEIGHT, 1)
Y = [i[1] for i in train]

# val data
val_x = np.array([i[0] for i in val]).reshape(-1, WIDTH, HEIGHT, 1)
val_y = [i[1] for i in val]
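
A quick sanity check of the split sizes quoted in the comments above (92820 total samples); dummy stands in for the loaded data array, so this is only an illustrative sketch:

import numpy as np

# Illustrative check of the 75-5-20 style split used above.
TOTAL, TEST_N, VAL_N = 92820, 18564, 4641

dummy = np.arange(TOTAL)
train = dummy[:-TEST_N]   # 74256 samples = effective train (69615) + val (4641)
val = train[-VAL_N:]      # validation slice taken from the tail of train
test = dummy[-TEST_N:]    # 18564 held-out test samples

assert len(train) == 74256 and len(val) == VAL_N and len(test) == TEST_N
assert len(train) - len(val) == 69615  # matches the TRAIN count in the comments
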
Example #7
class Worker(object):
    def __init__(self, sess, state_len, actions_no, actions_bounds, max_depth,
                 weights, workers_no, dataset, trainable):
        from ac_net import ACNet
        from net_evaluator import NetEvaluator

        self.processes = workers_no + 1
        self.actions_no = actions_no
        self.eps = EPS
        self.evaluator = NetEvaluator(ACTIONS_NO,
                                      trainable=trainable,
                                      dataset=dataset)
        self.state = STARTING_STATE.copy()
        self.state_len = state_len
        self.max_depth = max_depth
        self.t = 1
        self.prev_acc = self.evaluator.baseline
        self.model = None
        self.current_max_depth = self.max_depth
        self.old_weights = weights
        self.grads = []
        self.samples = []
        self.best_samples = []
        self.best_reward = -1000
        self.memory = dict()
        self.ac_net = ACNet(sess, self.state_len, self.actions_no,
                            actions_bounds, 'worker')
        self.ac_net.set_weights(self.old_weights)

    def update_ac_weights(self, weights):
        self.old_weights = weights
        if self.ac_net is not None:
            self.ac_net.set_weights(weights)

    def get_grads(self):
        return self.grads

    def fetch_from_memory(self, state):
        state_repr = state.copy()
        for i in range(len(state)):
            if i % ACTIONS_NO == ACTIONS_NO - 1:
                state_repr[i] = np.round(state_repr[i], 1)
            else:
                state_repr[i] = np.round(state_repr[i], 0)
        state_repr = str(state_repr)
        if state_repr in self.memory:
            return self.memory[state_repr]
        else:
            return None

    def add_to_memory(self, state, acc):
        state_repr = state.copy()
        for i in range(len(state)):
            if i % ACTIONS_NO == ACTIONS_NO - 1:
                state_repr[i] = np.round(state_repr[i], 1)
            else:
                state_repr[i] = np.round(state_repr[i], 0)
        state_repr = str(state_repr)
        self.memory[state_repr] = acc

    def play(self):
        prev_trainable = self.evaluator.builder.trainable
        self.evaluator.builder.trainable = True
        self.state = STARTING_STATE.copy()
        self.prev_acc = 0
        t_start = self.t
        episode_flag = True
        self.current_layer = 0
        while episode_flag:

            action, policy_mean, policy_sigma, value = self.ac_net.predict(
                self.state.reshape(1, self.state_len // self.actions_no,
                                   self.actions_no))

            value = value[(self.current_layer)]
            reward, new_state = self.perform_action(action)

            self.state = new_state
            self.t += 1
            self.current_layer += 1
            if self.t - t_start >= self.current_max_depth:
                episode_flag = False

        self.evaluator.builder.trainable = prev_trainable
        return self.prev_acc, self.state

    def run(self, return_episode=True):

        self.grads = []
        self.samples = []
        t_start = self.t
        # Gather experiences
        self.eps = self.eps * EPS_RED_FACTOR

        while self.t - t_start < self.max_depth:
            self.current_layer = 0
            R = 0.0
            self.state = STARTING_STATE.copy()
            self.prev_acc = self.evaluator.baseline
            del self.model

            self.model = None
            self.d_theta = 0
            self.d_theta_v = 0
            self.alive = True

            s_buffer = []
            r_buffer = []
            a_buffer = []
            v_buffer = []

            episode_flag = True
            while episode_flag:

                action, policy_mean, policy_sigma, value = self.ac_net.predict(
                    self.state.reshape(1, self.state_len // self.actions_no,
                                       self.actions_no))

                action = action[(self.current_layer)]

                if np.random.uniform() < self.eps:
                    action = (np.random.uniform() *
                              (ACTIONS_BOUNDS[1] - ACTIONS_BOUNDS[0])) // 1

                value = value[(self.current_layer)]

                reward, new_state = self.perform_action(action)

                r_buffer.append(reward)
                a_buffer.append([action])
                v_buffer.append([value])
                self.state = new_state
                self.t += 1
                self.current_layer += 1

                self.print_episode(policy_mean, policy_sigma, action, value,
                                   reward)
                if self.current_layer >= self.current_max_depth:
                    episode_flag = False

            # Kill grads
            r_buffer.append(0)
            a_buffer.append([policy_mean[-1]])
            v_buffer.append([0])
            # Add state
            s_buffer.append(
                self.state.reshape(1, self.state_len // self.actions_no,
                                   self.actions_no))

            R = 0.0
            rev_rewards = []
            for r in reversed(r_buffer):
                R = R * gamma + r
                rev_rewards.append(R)

            rev_rewards.reverse()
            reward = np.array(rev_rewards).reshape((-1, 1))
            action = np.array(a_buffer).reshape((-1, self.actions_no))

            self.samples.append((self.state, action, reward))

        np.random.shuffle(self.samples)

        # Transform to column vectors
        state, action, reward = list(map(np.array, zip(*self.samples)))

        v_l, p_l, e, grads = self.ac_net.fit(state, action, reward)
        self.samples = []
        self.grads = grads

        if self.current_max_depth < self.max_depth and self.t > 100:
            self.current_max_depth += 1

        self.grads = self.ac_net.get_grads()
        if return_episode:
            return self.prev_acc, self.state
        else:
            return self.play()

    def perform_action(self, action, search_mem=True):
        def get_acc(new_state):

            return self.evaluator.evaluate_model(new_state,
                                                 epochs=TRAIN_EPOCHS)

        # Get new state
        new_state = self.update_state(action)

        # Build the model and evaluate
        acc = self.fetch_from_memory(new_state)
        if not search_mem:
            acc = get_acc(new_state)
        else:
            if acc is None:
                acc = get_acc(new_state)
                self.add_to_memory(new_state, acc)
        # Get the reward
        reward = (acc - self.prev_acc)
        self.prev_acc = acc
        return reward, new_state

    def update_state(self, action, old_state=None):
        '''
            Update the state, based on the action taken
        '''

        if old_state is None:
            old_state = np.copy(self.state)

        new_state = np.copy(old_state)
        index = (self.current_layer + 1) * ACTIONS_NO

        for i in range(self.actions_no):
            new_state[index + i] = action[i]

        return new_state

    def print_episode(self, policy_mean, policy_sigma, action, value, reward):
        if DEBUG:
            print('Policy_mean :\n', np.array2string(policy_mean, precision=3))
            print('Policy_sigma :\n', np.array2string(policy_sigma,
                                                      precision=3))
            print('Action :\n', action)
            print('Value  :\n', np.array2string(value, precision=3))
            print('Layer :', self.current_layer)
            print('State :', self.state)
            print('Reward : %.3f' % reward, 'Accuracy : %.3f' % self.prev_acc)
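
fetch_from_memory and add_to_memory above cache evaluated accuracies under a coarse string key built by rounding the continuous state. A small standalone illustration of that keying scheme, with a made-up ACTIONS_NO and state vector:

import numpy as np

# Illustrative version of the memo key used above: every ACTIONS_NO-th entry
# keeps one decimal, the rest are rounded to integers, and the string of the
# rounded vector is the dictionary key.
ACTIONS_NO = 3

def memo_key(state):
    state_repr = state.copy()
    for i in range(len(state_repr)):
        decimals = 1 if i % ACTIONS_NO == ACTIONS_NO - 1 else 0
        state_repr[i] = np.round(state_repr[i], decimals)
    return str(state_repr)

memory = {}
state = np.array([1.2, 0.8, 0.37, 2.9, 1.1, 0.42])
memory[memo_key(state)] = 0.91        # cache an evaluated accuracy
print(memo_key(state) in memory)      # a revisited state hits the cache
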
Example #8
from ac_net import ACNet
# from resnet import ResNet
# from dense_net import DenseNet

WIDTH = 64
HEIGHT = 48

VERSION = 4
EPOCHS = 50
SAMPLES = 400

MODEL = 'acnet'
MODEL_NAME = '{}-v{}-{}-epochs-{}k-samples.model'.format(MODEL, VERSION, EPOCHS, SAMPLES)

model = ACNet(WIDTH, HEIGHT)
# model = ResNet(WIDTH, HEIGHT)
# model = DenseNet(WIDTH, HEIGHT)

# reload previously trained weights (load_model is assumed to be provided
# elsewhere, e.g. a Keras-style load_model helper); this replaces the freshly
# built network above
model = load_model(MODEL_NAME)

t_time = 0.05

# defining possible movements (PressKey/ReleaseKey and the W/A/D key codes are
# assumed to come from the project's direct-input helper module)
def straight():
    PressKey(W)
    ReleaseKey(A)
    ReleaseKey(D)
    
def left():
    PressKey(W)
Example #9
def run(render=False):
    env = gym.make(GAME)
    s = env.reset()
    N_S, N_A = env.observation_space.shape[0], env.action_space.shape[0]
    A_BOUND = [env.action_space.low, env.action_space.high]
    env.close()

    sess = tf.InteractiveSession()

    OPT_A = tf.train.RMSPropOptimizer(LR_A, name='RMSPropA')
    # OPT_C = tf.train.RMSPropOptimizer(LR_C, name='RMSPropC')
    GLOBAL_AC = ACNet(sess,
                      GLOBAL_NET_SCOPE,
                      N_S,
                      N_A,
                      A_BOUND,
                      OPT_A,
                      entropy_beta=ENTROPY_BETA)  # we only need its params

    # Create train worker
    workers = []
    for i in range(N_WORKERS):
        i_name = 'W_%i' % i  # worker name
        env = gym.make(GAME)
        ac = ACNet(sess,
                   i_name,
                   N_S,
                   N_A,
                   A_BOUND,
                   OPT_A,
                   global_ac=GLOBAL_AC,
                   entropy_beta=ENTROPY_BETA)
        workers.append(ACWorker(ac, env, GAMMA, name=i_name))

    # create test worker
    env = gym.make(GAME)
    ac = ACNet(sess,
               'test',
               N_S,
               N_A,
               A_BOUND,
               OPT_A,
               global_ac=GLOBAL_AC,
               entropy_beta=ENTROPY_BETA)
    tester = ACWorker(ac, env, GAMMA, name="test")

    # create save worker
    saver = SaveWorker(sess)

    # init variables
    sess.run(tf.global_variables_initializer())
    '''
    if OUTPUT_GRAPH:
        if os.path.exists(LOG_DIR):
            shutil.rmtree(LOG_DIR)
        tf.summary.FileWriter(LOG_DIR, sess.graph)
    '''

    worker_threads = []

    # train workers
    for worker in workers:
        # bind the method directly so each thread keeps its own worker
        # (a bare lambda here would late-bind the loop variable)
        t = threading.Thread(target=worker.train)
        t.start()
        worker_threads.append(t)

    # test worker
    job = lambda: tester.test(render=render)
    t = threading.Thread(target=job)
    t.start()
    worker_threads.append(t)

    # save worker
    job = lambda: saver()
    t = threading.Thread(target=job)
    t.start()
    worker_threads.append(t)

    # wait
    COORD = tf.train.Coordinator()
    COORD.join(worker_threads)
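
run() builds the graph and launches the train, test and save threads; a minimal, assumed entry point for calling it directly:

# Hypothetical entry point; render=False mirrors run()'s default above.
if __name__ == '__main__':
    run(render=False)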