Пример #1
0
def expert_policy(idx, n_samples, args):
    """Collect (state, action) pairs chosen by MCTS and pickle them to disk.

    Episodes of at most 50 steps are played on a fresh ``SIMULATOR`` until at
    least ``n_samples`` pairs have been gathered. The action for every step
    comes from ``MCTS.search``; the subtree below the chosen action is passed
    back in as the root of the next search. The collected list is written to
    ``{args.dir}/{idx:02}.data``.

    Args:
        idx: Worker index; used for the progress-bar position and label and
            for the output file name.
        n_samples: Minimum number of pairs to collect. The last episode is
            not truncated, so more than ``n_samples`` pairs may be stored.
        args: Options object. ``args.dir`` is the output directory; the whole
            object is forwarded to ``MCTS.search``.
    """
    data = []

    my_simulator = SIMULATOR()

    progress = tqdm(range(n_samples),
                    position=idx,
                    desc='worker_{:02}'.format(idx))
    while len(data) < n_samples:
        state = my_simulator.reset()

        root = None
        for _ in range(50):
            action, root = MCTS.search(state, args, root=root)

            data.append((state, action))
            # Count the sample as soon as it is stored, so the step that ends
            # an episode shows up in the progress bar as well.
            progress.update(1)

            state, reward, terminal = my_simulator.step(action)

            # Re-use the subtree below the executed action for the next search.
            root = root.children[action]

            if terminal:
                break
    progress.close()

    # exist_ok makes this safe if the directory appears between check and
    # creation (e.g. another process writing to the same args.dir).
    os.makedirs(args.dir, exist_ok=True)

    # The context manager closes the file even if pickling fails.
    with open('{}/{:02}.data'.format(args.dir, idx), 'wb') as out_file:
        pickle.dump(data, out_file)
Пример #2
0
def collector(idx, shared_model, shared_dataset, hyperparameters, lock):
    """Roll out the learner forever, labelling visited states with expert actions.

    A local copy of ``shared_model`` is refreshed from the shared weights at
    the start of every episode. For each of at most 50 steps per episode the
    ``expert`` action for the current state is appended to ``shared_dataset``
    under ``lock``, while the simulator is advanced with the action proposed
    by the local model. The episode reward is logged and written to
    TensorBoard. The loop runs until interrupted with ``KeyboardInterrupt``.

    Args:
        idx: Collector index; used for log/run names, the progress-bar
            position and the GPU assignment (``idx % n_gpu``).
        shared_model: Model whose ``state_dict`` is copied into the local
            model before each episode.
        shared_dataset: Container with an ``append`` method receiving
            ``(state, expert_action)`` tuples.
        hyperparameters: Forwarded to ``expert`` and ``local_model.search``.
        lock: Lock guarding ``shared_dataset``; must support ``with``.
    """
    try:
        writer = SummaryWriter('runs/{}/collector:{:02}'.format(
            datetime.now().strftime("%d|%m_%H|%M"), idx))
        logging.basicConfig(filename='logs/collector:{:02}.log'.format(idx),
                            filemode='w',
                            format='%(message)s',
                            level=logging.DEBUG)

        # allocate a device
        n_gpu = t.cuda.device_count()
        if n_gpu > 0:
            Device.set_device(idx % n_gpu)

        local_model = deepcopy(shared_model)
        local_model.to(Device.get_device())
        local_model.eval()

        simulator = SIMULATOR()

        for itr in tqdm(count(),
                        position=idx,
                        desc='collector:{:02}'.format(idx)):
            # Pick up the latest shared weights before every episode.
            local_model.load_state_dict(shared_model.state_dict())

            state = simulator.reset()

            episode_reward = 0
            for _step in range(50):
                # Find the expert action for input belief
                expert_action, _ = expert(state, hyperparameters)

                # `with` guarantees the lock is released even if append raises.
                with lock:
                    shared_dataset.append((state, expert_action))

                # Simulate the learner's action
                action, _ = local_model.search(state, hyperparameters)
                state, reward, terminal = simulator.step(action)
                episode_reward += reward

                if terminal:
                    break

            logging.debug('Episode reward: {:.2f}'.format(episode_reward))
            writer.add_scalar('episode_reward', episode_reward, itr)
            # NOTE(review): the writer is closed after every episode and then
            # written to again on the next one - confirm this is intended
            # (rather than a flush) for the SummaryWriter in use.
            writer.close()

    except KeyboardInterrupt:
        print('exiting collector:{:02}'.format(idx))
Пример #3
0
    def search(state, args, root=None):
        """Grow a search tree from ``state`` and return the greedy action.

        Runs ``args.n_simulations`` simulations. Each one walks down the tree
        choosing the action that maximises ``Q + c * sqrt(log(N) / N_a)``,
        stops as soon as it either adds a new child or reaches a terminal
        transition, and then backs the rewards up along the visited path.

        Args:
            state: State used to create the root when ``root`` is ``None``.
            args: Options object providing ``n_simulations``.
            root: Optional existing node to continue searching from.

        Returns:
            ``(action, root)`` where ``action`` is the argmax of the root's Q
            values and ``root`` is the (possibly newly created) root node.
        """
        tree_root = Node(state) if root is None else root

        for _ in range(args.n_simulations):
            current = tree_root
            visited = []

            # Selection / expansion: descend until a new child is created or
            # the simulated transition is terminal.
            finished = False
            while not finished:
                exploration = np.sqrt(np.log(current.N) / current.N_a)
                scores = current.Q + MCTS.c * exploration
                chosen = int(np.argmax(scores))

                next_state, gain, finished = SIMULATOR.simulate(
                    current.state, chosen)
                visited.append((current, chosen, gain))

                child = current.children.get(chosen)
                if child is None:
                    # Unexplored branch: attach it and end this simulation.
                    current.children[chosen] = Node(next_state, finished)
                    break
                current = child

            # Backup: update counts and running-mean Q from leaf to root.
            for parent, taken, gain in visited[::-1]:
                parent.N += 1
                parent.N_a[taken] += 1
                target = gain + np.max(parent.children[taken].Q)
                parent.Q[taken] += (target - parent.Q[taken]) / parent.N_a[taken]

        return int(np.argmax(tree_root.Q)), tree_root
Пример #4
0
    def search(self, state, args):
        """Run a learned tree search from ``state`` and sample an action.

        A root node is created with ``self.new_node(state)``. For each of
        ``args.n_simulations`` simulations the tree is descended by sampling
        actions from ``self.f_policy``, a new child is added when an
        unexplored action is taken, and node memories are updated back up the
        path with ``self.f_backup``. ``self.f_readout`` is evaluated on the
        root memory once before the first simulation and once after every
        simulation.

        Args:
            state: State the root node is built from.
            args: Options object providing ``n_simulations`` and ``training``.

        Returns:
            ``(action, (predictions, logits, actions))``. ``action`` is
            sampled from a ``Categorical`` over the last readout.
            ``predictions`` holds ``n_simulations + 1`` readouts. ``logits``
            and ``actions`` hold one list per simulation; these inner lists
            are only filled when ``args.training`` is truthy.
        """

        root = self.new_node(state)

        # Readout of the root before any simulation has been run.
        predictions = [self.f_readout(root.tensors.memory)]
        logits = []
        actions = []

        for i in range(args.n_simulations):

            node = root

            path = []
            # Per-simulation policy logits and sampled actions.
            logits_m = []
            actions_m = []

            # Start simulation and add a new child
            terminal = False
            while not terminal:
                # Choose which branch to explore/exploit based on node memory
                p_actions = self.f_policy(node)
                action = Categorical(logits=p_actions).sample().item()

                # store embedding and action for policy gradient
                if args.training:
                    logits_m.append(p_actions)
                    actions_m.append(action)

                # simulate with action to get next state
                next_state, reward, terminal = SIMULATOR.simulate(
                    node.variables.state, action)
                path.append(Path(node, action, reward))

                # if no child exists for this action yet, create one from the
                # simulated next state and end this simulation
                if node.variables.children.get(action) is None:
                    node.variables.children[action] = self.new_node(next_state)
                    break
                # else, traverse to the existing child
                else:
                    node = node.variables.children[action]

            # backup values through the path to root
            for node, action, reward in reversed(path):
                node.tensors.memory = self.f_backup(
                    *prepare_input_for_f_backup(node, action, reward))

                # keep the parent's per-action tensor in sync with the child's
                # current memory
                node.tensors.children[action] = node.variables.children[
                    action].tensors.memory

            # store predictions after m_th step
            predictions.append(self.f_readout(root.tensors.memory))

            # store logits and action for the m_th step
            logits.append(logits_m)
            actions.append(actions_m)

        # The returned action is sampled (not argmax) from the final readout.
        return Categorical(
            logits=predictions[-1]).sample().item(), (predictions, logits,
                                                      actions)
Пример #5
0
 def __init__(self):
     """Build the ``memory`` network mapping a state tensor to a memory vector.

     The input channel count is the first element of
     ``SIMULATOR.tensor_shape()``; the output size is ``d_memory``.
     """
     super().__init__()
     in_channels, _, _ = SIMULATOR.tensor_shape()

     # 1x1 convolutions around two residual blocks, then global max pooling
     # and a linear projection to the memory size.
     layers = [
         nn.Conv2d(in_channels, 64, 1, 1),
         ResidualConv(64),
         ResidualConv(64),
         nn.Conv2d(64, 128, 1, 1),
         nn.AdaptiveMaxPool2d((1, 1)),
         nn.Flatten(),
         nn.Linear(128, d_memory),
     ]
     self.memory = nn.Sequential(*layers)
Пример #6
0
    def state_to_tensor(self, state):
        """Return the tensor for ``state``, memoised by ``str(state)``.

        On a cache miss the tensor is produced by
        ``SIMULATOR.state_to_tensor``, moved to ``Device.get_device()`` and
        stored in ``self.tensor_cache`` before being returned.
        """
        cache_key = str(state)

        cached = self.tensor_cache.get(cache_key)
        if cached is not None:
            return cached

        fresh = SIMULATOR.state_to_tensor(state).to(Device.get_device())
        self.tensor_cache[cache_key] = fresh
        return fresh
Пример #7
0
    def __init__(self, state, terminal=False):
        """Create a tree node for ``state``.

        Visit statistics start at one pseudo-visit per action (``N_a`` is all
        ones and ``N`` equals the number of actions). ``Q`` starts at zero
        and, for non-terminal nodes, is set per action from
        ``SIMULATOR.rollout``.

        Args:
            state: Simulator state held by this node.
            terminal: Whether the node is terminal; terminal nodes keep an
                all-zero ``Q`` and run no rollouts.
        """
        n_actions = SIMULATOR.n_actions

        self.state = state
        self.terminal = terminal
        self.children = {}

        self.N = n_actions
        self.N_a = np.ones(n_actions)
        self.Q = np.zeros(n_actions)

        if terminal:
            return

        # Seed each action value with a rollout estimate.
        for action in range(n_actions):
            self.Q[action] = SIMULATOR.rollout(state, action)
Пример #8
0
def performer(solver, args, render=False):
    """Play one episode with ``solver`` and return the summed reward.

    Args:
        solver: Object whose ``search(state, args)`` returns a pair with the
            action as its first element.
        args: Forwarded unchanged to ``solver.search``.
        render: When true, render the simulator initially and after each
            step, and print the action label and reward of every step.

    Returns:
        Sum of the rewards over the episode, which lasts at most
        ``MAX_EPISODE_LENGTH`` steps.
    """
    env = SIMULATOR()
    state = env.reset()
    if render:
        env.render()

    total_reward = 0
    for _ in range(MAX_EPISODE_LENGTH):
        action, _extras = solver.search(state, args)
        state, reward, terminal = env.step(action)

        if render:
            print(SIMULATOR.ACTIONS[action], reward)
            env.render()

        total_reward += reward
        if terminal:
            break

    return total_reward
def run_exper(model, steps, get_features, pre_proc_features):
    """Drive the simulator with ``model`` for ``steps`` steps, rendering each one.

    Every step builds a feature vector from the current observation, scales
    it with ``pre_proc_features.fit_transform``, takes the argmax of
    ``model.predict`` as the action, steps the simulator, renders it and
    sleeps for one second. Shape and timing information is printed per step.
    An episode ends when the simulator reports ``done`` or after 25 moves;
    the simulator is then reset.

    Args:
        model: Object with a ``predict`` method returning per-action scores.
        steps: Total number of simulator steps to run.
        get_features: Unused; kept for interface compatibility.
        pre_proc_features: Transformer exposing ``fit_transform``.
    """
    from environment import SIMULATOR

    # initializing our environment
    my_sim = SIMULATOR()

    # beginning of an episode
    state_temp = my_sim.reset()
    observation = my_sim.state_to_tensor(state_temp)
    r_tup, e_tup, rover_poss = [], [], []

    # main loop
    prev_input = None
    total_moves = 0
    MAX_MOVES = 25
    for _ in range(steps):
        total_moves += 1
        start = time.perf_counter()
        cur_input = observation

        # The very first step uses an all-zero input; afterwards elements
        # 10..79 of the flattened observation are used. The builtin `float`
        # replaces `np.float`, which was removed in NumPy 1.24.
        if prev_input is None:
            x = np.zeros(70)
        else:
            x = cur_input.astype(float).ravel()[10:80]

        # Drop every element whose index is a multiple of 10, then every
        # element at index 8, 17, 26, ... of what remains.
        x = x[np.arange(len(x)) % 10 != 0]
        x = x[(np.arange(len(x)) - 8) % 9 != 0]

        x, rov_pos = get_rover_pos(x, r_tup, e_tup, rover_poss)
        x = np.array(x)
        rover_poss.append(rov_pos)

        x_t = pre_proc_features.fit_transform(x.reshape(-1, 1))
        x_t = x_t.reshape(1, INPUT_SIZE)[0]
        print("Shape = ", x_t.shape)
        prev_input = cur_input

        # forward the policy network and take the highest-scoring action
        proba = model.predict(np.expand_dims(x_t, axis=1).T)
        end = time.perf_counter()
        action = proba[0].argmax()
        print("Time taken = ", end - start)

        # run one step
        state_temp, reward, done, r_tup, e_tup = my_sim.step(action)
        observation = my_sim.state_to_tensor(state_temp)
        my_sim.render()
        time.sleep(1)

        if total_moves == MAX_MOVES:
            done = True

        # if episode is over, reset to beginning
        if done:
            # Reset the per-episode counter on every episode end, not only
            # when the cap is hit; otherwise an episode following an early
            # termination would be cut short.
            total_moves = 0
            state_temp = my_sim.reset()
            observation = my_sim.state_to_tensor(state_temp)
            my_sim.render()
            rover_poss = []
def run_exper(model, steps, get_features, pre_proc_features):
    """Drive the simulator with ``model`` for ``steps`` steps and count outcomes.

    Every step builds a feature vector from the current observation, scales
    it with ``pre_proc_features.fit_transform``, takes the argmax of
    ``model.predict`` as the action and steps the simulator. An episode ends
    when the simulator reports ``done`` or after ``MAX_STEPS`` moves. A
    finished episode is counted as "good" when the value ``7.0`` occurs in
    the episode's initial observation but not in its final one.

    Args:
        model: Object with a ``predict`` method returning per-action scores.
        steps: Total number of simulator steps to run.
        get_features: Unused; kept for interface compatibility.
        pre_proc_features: Transformer exposing ``fit_transform``.

    Returns:
        Dict with the number of finished episodes under ``'total'`` and the
        number of good episodes under ``'good'``.
    """
    r_tup, e_tup = [], []
    rover_poss = []
    total_stats = {'total': 0, 'good': 0}

    from environment import SIMULATOR

    # initializing our environment
    my_sim = SIMULATOR()

    # beginning of an episode
    state_temp = my_sim.reset()
    observation = my_sim.state_to_tensor(state_temp)
    state_obs = observation  # observation at the start of the episode
    total_moves = 0

    # main loop
    prev_input = None
    for _ in range(steps):
        cur_input = observation

        # The very first step uses an all-zero input; afterwards elements
        # 10..79 of the flattened observation are used. The builtin `float`
        # replaces `np.float`, which was removed in NumPy 1.24.
        if prev_input is None:
            x = np.zeros(70)
        else:
            x = cur_input.astype(float).ravel()[10:80]

        # Drop every element whose index is a multiple of 10, then every
        # element at index 8, 17, 26, ... of what remains.
        x = x[np.arange(len(x)) % 10 != 0]
        x = x[(np.arange(len(x)) - 8) % 9 != 0]

        prev_input = cur_input

        x, rover_pos = get_rover_pos(x, r_tup, e_tup, rover_poss)
        rover_poss.append(rover_pos)
        x = np.array(x)

        x_t = pre_proc_features.fit_transform(x.reshape(-1, 1))
        x_t = x_t.reshape(1, INPUT_SIZE)[0]
        # forward the policy network and take the highest-scoring action
        proba = model.predict(np.expand_dims(x_t, axis=1).T)
        action = proba.argmax()

        # run one step
        state_temp, reward, done, r_tup, e_tup = my_sim.step(action)
        observation = my_sim.state_to_tensor(state_temp)
        total_moves += 1
        if total_moves == MAX_STEPS:
            done = True

        # if episode is over, record the outcome and reset to beginning
        if done:
            # Reset the per-episode counter on every episode end, not only
            # when the cap is hit; otherwise an episode following an early
            # termination would be cut short.
            total_moves = 0
            total_stats['total'] += 1

            start_values = np.asarray(state_obs).ravel().tolist()
            end_values = np.asarray(observation).ravel().tolist()
            if 7.0 in start_values and 7.0 not in end_values:
                total_stats['good'] += 1

            state_temp = my_sim.reset()
            observation = my_sim.state_to_tensor(state_temp)
            state_obs = observation
            rover_poss = []

    return total_stats
    # NOTE(review): as indented here these lines directly follow a `return`
    # statement and are therefore unreachable; presumably they belonged under
    # a script entry guard whose header is not visible - confirm.
    # Parse the --data_type command-line option (defaults to 'sparse').
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_type',
                        default='sparse',
                        type=str,
                        help='Choose between encoded or sparse')
    args = parser.parse_args()
    data_type = args.data_type

# Build the model for the selected data type.
# NOTE(review): `data_type` is only assigned in the indented lines above;
# confirm it is defined at module level when this statement runs.
model = get_model(data_type)

import numpy as np
import gym

# Environment initialization: uses the project's SIMULATOR (`gym` is imported
# but not referenced in the lines visible here).
from environment import SIMULATOR
my_sim = SIMULATOR()
state_temp = my_sim.reset()
observation = my_sim.state_to_tensor(state_temp)
prev_input = None

# Hyperparameters to calculate discount rewards
gamma = 0.99

# initialization of variables used in the main loop
# (all seven names start as separate empty lists)
x_train, y_train, y_pred, rewards, r_tup, e_tup, rover_poss = [], [], [], [], [], [], []
reward_sum = 0
episode_nb = 0
# presumably controls whether saved weights are reloaded - verify against
# the code that reads it
resume = True
running_reward = None
# presumably the number of epochs between checkpoints - verify against the
# training loop
EPOCHS_BEFORE_SAVING = 50
moves_count = 0