Exemplo n.º 1
0
def expert_policy(idx, n_samples, args):
    """Collect (state, expert action) pairs with MCTS and pickle them to disk.

    Episodes of at most 50 steps are simulated until at least ``n_samples``
    pairs have been gathered. An episode that is still running when the
    target is reached is played to its end, so the saved list may contain
    more than ``n_samples`` entries.

    Args:
        idx: Worker index; selects the progress-bar row and the output file
            name (``<args.dir>/<idx:02>.data``).
        n_samples: Minimum number of pairs to collect.
        args: Options object forwarded to ``MCTS.search``; its ``dir``
            attribute names the output directory.
    """
    data = []

    my_simulator = SIMULATOR()

    progress = tqdm(range(n_samples),
                    position=idx,
                    desc='worker_{:02}'.format(idx))
    while len(data) < n_samples:
        state = my_simulator.reset()

        # Every episode starts with a fresh search tree.
        root = None
        for _ in range(50):
            action, root = MCTS.search(state, args, root=root)

            data.append((state, action))
            # Advance the bar as soon as the sample is stored so that the
            # terminal step of an episode is counted as well.
            progress.update(1)

            state, _, terminal = my_simulator.step(action)

            # Hand the subtree below the chosen action to the next search.
            root = root.children[action]

            if terminal:
                break

    # exist_ok avoids a check-then-create race if several workers reach this
    # point at the same time.
    os.makedirs(args.dir, exist_ok=True)

    # The context manager closes the file even if pickling fails.
    with open('{}/{:02}.data'.format(args.dir, idx), 'wb') as file:
        pickle.dump(data, file)
Exemplo n.º 2
0
def collector(idx, shared_model, shared_dataset, hyperparameters, lock):
    """Endlessly roll out the learner and label visited states with expert actions.

    For every episode (at most 50 steps) the worker refreshes its local copy
    of ``shared_model``, queries ``expert`` for each visited state, appends
    the ``(state, expert_action)`` pair to ``shared_dataset`` under ``lock``,
    and then advances the simulator with the action chosen by the local
    model. The episode reward is written to a log file and to a
    ``SummaryWriter``. The loop runs until interrupted with Ctrl-C.

    Args:
        idx: Worker index; used for the log/run names, the progress-bar row
            and the GPU assignment (``idx % n_gpu``).
        shared_model: Model whose ``state_dict`` is copied into the local
            model at the start of every episode.
        shared_dataset: Container with an ``append`` method that receives the
            ``(state, expert_action)`` pairs.
        hyperparameters: Passed through to ``expert`` and to the model's
            ``search`` method.
        lock: Lock guarding ``shared_dataset``; must support the context
            manager protocol.
    """
    writer = None
    try:
        writer = SummaryWriter('runs/{}/collector:{:02}'.format(
            datetime.now().strftime("%d|%m_%H|%M"), idx))
        logging.basicConfig(filename='logs/collector:{:02}.log'.format(idx),
                            filemode='w',
                            format='%(message)s',
                            level=logging.DEBUG)

        # allocate a device
        n_gpu = t.cuda.device_count()
        if n_gpu > 0:
            Device.set_device(idx % n_gpu)

        local_model = deepcopy(shared_model)
        local_model.to(Device.get_device())
        local_model.eval()

        simulator = SIMULATOR()

        for itr in tqdm(count(),
                        position=idx,
                        desc='collector:{:02}'.format(idx)):
            # Pick up the latest shared weights before each episode.
            local_model.load_state_dict(shared_model.state_dict())

            state = simulator.reset()

            episode_reward = 0
            for _step in range(50):
                # Find the expert action for input belief
                expert_action, _ = expert(state, hyperparameters)

                # `with` releases the lock even if the append raises.
                with lock:
                    shared_dataset.append((state, expert_action))

                # Simulate the learner's action
                action, _ = local_model.search(state, hyperparameters)
                state, reward, terminal = simulator.step(action)
                episode_reward += reward

                if terminal:
                    break

            logging.debug('Episode reward: {:.2f}'.format(episode_reward))
            writer.add_scalar('episode_reward', episode_reward, itr)
            # Push the scalar to disk without closing the writer; the loop is
            # endless, so the writer is closed only on the way out.
            writer.flush()

    except KeyboardInterrupt:
        print('exiting collector:{:02}'.format(idx))
    finally:
        if writer is not None:
            writer.close()
Exemplo n.º 3
0
def performer(solver, args, render=False):
    """Play one episode with ``solver`` and return the accumulated reward.

    A new ``SIMULATOR`` is reset and stepped for at most
    ``MAX_EPISODE_LENGTH`` moves, stopping early when the simulator reports a
    terminal step. Each move is the first element returned by
    ``solver.search(state, args)``.

    Args:
        solver: Object exposing ``search(state, args)``.
        args: Passed unchanged to ``solver.search``.
        render: When true, render the initial state and, after each move,
            print the action label with its reward and render again.

    Returns:
        The sum of the rewards received during the episode.
    """
    env = SIMULATOR()
    current_state = env.reset()

    if render:
        env.render()

    total_reward = 0
    for _move in range(MAX_EPISODE_LENGTH):
        chosen_action, _ = solver.search(current_state, args)
        current_state, step_reward, finished = env.step(chosen_action)
        total_reward += step_reward

        if render:
            print(SIMULATOR.ACTIONS[chosen_action], step_reward)
            env.render()

        if finished:
            break

    return total_reward
def run_exper(model, steps, get_features, pre_proc_features):
    """Drive the simulator with ``model`` for ``steps`` steps, rendering each move.

    At every step the current observation is flattened, trimmed and passed
    through ``get_rover_pos`` and ``pre_proc_features.fit_transform`` to
    build the network input; the action with the highest predicted score is
    executed. Episodes are cut after 25 moves and the environment is reset
    whenever an episode ends. Timing and input-shape information is printed
    for each step.

    Args:
        model: Object exposing ``predict``; row 0 of its output is arg-maxed
            to pick the action.
        steps: Total number of simulator steps to run across all episodes.
        get_features: Unused in this function; kept for interface
            compatibility.
        pre_proc_features: Transformer exposing ``fit_transform``.
    """
    from environment import SIMULATOR

    # initializing our environment
    my_sim = SIMULATOR()

    # beginning of an episode
    state_temp = my_sim.reset()
    observation = my_sim.state_to_tensor(state_temp)
    r_tup, e_tup, rover_poss = [], [], []
    # main loop
    prev_input = None
    total_moves = 0
    MAX_MOVES = 25
    for _ in range(steps):
        total_moves += 1
        start = time.perf_counter()
        cur_input = observation
        if prev_input is None:
            # The very first step feeds an all-zero input.
            x = np.zeros(70)
        else:
            # np.float was removed in NumPy 1.24; float64 is the same dtype.
            x = cur_input.astype(np.float64).ravel()[10:80]
        # Drop every element whose index is a multiple of 10, then every
        # element whose (re-numbered) index is congruent to 8 modulo 9.
        x = x[np.arange(len(x)) % 10 != 0]
        x = x[(np.arange(len(x)) - 8) % 9 != 0]

        x, rov_pos = get_rover_pos(x, r_tup, e_tup, rover_poss)
        x = np.array(x)
        rover_poss.append(rov_pos)
        #print_map(x)
        # NOTE(review): the transformer is re-fitted on every single step;
        # confirm this is intended rather than a plain transform().
        x_t = pre_proc_features.fit_transform(x.reshape(-1, 1))
        x_t = x_t.reshape(1, INPUT_SIZE)[0]
        print("Shape = ", x_t.shape)
        prev_input = cur_input

        # forward the policy network and take the highest-scoring action
        #print_map(x)
        proba = model.predict(np.expand_dims(x_t, axis=1).T)
        end = time.perf_counter()
        action = proba[0].argmax()
        print("Time taken = ", end - start)

        #run one step
        state_temp, reward, done, r_tup, e_tup = my_sim.step(action)
        observation = my_sim.state_to_tensor(state_temp)
        my_sim.render()
        time.sleep(1)

        # Cap the episode length.
        if total_moves == MAX_MOVES:
            done = True
        # if episode is over, reset to beginning
        if done:
            # Reset the counter on every episode end, not only when the cap
            # was hit, so each episode gets the full MAX_MOVES budget.
            total_moves = 0
            state_temp = my_sim.reset()
            observation = my_sim.state_to_tensor(state_temp)
            my_sim.render()
            rover_poss = []
def run_exper(model, steps, get_features, pre_proc_features):
    """Run ``model`` in the simulator for ``steps`` steps and tally episode outcomes.

    At every step the current observation is flattened, trimmed and passed
    through ``get_rover_pos`` and ``pre_proc_features.fit_transform`` to
    build the network input; the action with the highest predicted score is
    executed. Episodes are cut after ``MAX_STEPS`` moves. When an episode
    ends it is counted as "good" if the value ``7.0`` was present in the
    episode's initial observation and is absent from its final one.

    Args:
        model: Object exposing ``predict``; its output is arg-maxed to pick
            the action.
        steps: Total number of simulator steps to run across all episodes.
        get_features: Unused in this function; kept for interface
            compatibility.
        pre_proc_features: Transformer exposing ``fit_transform``.

    Returns:
        dict with keys ``'total'`` (finished episodes) and ``'good'``
        (episodes meeting the criterion above).
    """
    r_tup, e_tup = [], []
    rover_poss = []
    total_stats = {'total': 0, 'good': 0}

    from environment import SIMULATOR

    # initializing our environment
    my_sim = SIMULATOR()

    # beginning of an episode
    state_temp = my_sim.reset()
    observation = my_sim.state_to_tensor(state_temp)
    state_obs = observation
    total_moves = 0

    # main loop
    prev_input = None
    for _ in range(steps):
        cur_input = observation

        if prev_input is None:
            # The very first step feeds an all-zero input.
            x = np.zeros(70)
        else:
            # np.float was removed in NumPy 1.24; float64 is the same dtype.
            x = cur_input.astype(np.float64).ravel()[10:80]
        # Drop every element whose index is a multiple of 10, then every
        # element whose (re-numbered) index is congruent to 8 modulo 9.
        x = x[np.arange(len(x)) % 10 != 0]
        x = x[(np.arange(len(x)) - 8) % 9 != 0]

        prev_input = cur_input

        x, rover_pos = get_rover_pos(x, r_tup, e_tup, rover_poss)
        rover_poss.append(rover_pos)
        x = np.array(x)

        # NOTE(review): the transformer is re-fitted on every single step;
        # confirm this is intended rather than a plain transform().
        x_t = pre_proc_features.fit_transform(x.reshape(-1, 1))
        x_t = x_t.reshape(1, INPUT_SIZE)[0]
        # forward the policy network and take the highest-scoring action
        proba = model.predict(np.expand_dims(x_t, axis=1).T)
        action = proba.argmax()

        #run one step
        state_temp, reward, done, r_tup, e_tup = my_sim.step(action)
        observation = my_sim.state_to_tensor(state_temp)
        #my_sim.render()
        total_moves += 1
        # Cap the episode length.
        if total_moves == MAX_STEPS:
            done = True

        # if episode is over, reset to beginning
        if done:
            # Reset the counter on every episode end, not only when the cap
            # was hit, so each episode gets the full MAX_STEPS budget.
            total_moves = 0
            total_stats['total'] += 1
            initial_cells = np.asarray(state_obs).ravel().tolist()
            final_cells = np.asarray(observation).ravel().tolist()

            # "Good" episode: the 7.0 marker was there at the start and is
            # gone at the end (presumably the target cell - confirm against
            # the environment's encoding).
            if 7.0 in initial_cells and 7.0 not in final_cells:
                total_stats['good'] += 1
            state_temp = my_sim.reset()
            observation = my_sim.state_to_tensor(state_temp)
            state_obs = observation
            rover_poss = []
            #my_sim.render()

    return total_stats
    parser.add_argument('--data_type',
                        default='sparse',
                        type=str,
                        help='Choose between encoded or sparse')
    args = parser.parse_args()
    data_type = args.data_type

# Obtain the model for the selected `data_type` (get_model is defined outside
# this excerpt).
model = get_model(data_type)

import numpy as np
import gym

# Environment initialization: the project's SIMULATOR is what is actually
# instantiated below; `gym` is imported above but not used in the lines shown.
from environment import SIMULATOR
my_sim = SIMULATOR()
# Reset the simulator and convert its initial state with state_to_tensor.
state_temp = my_sim.reset()
observation = my_sim.state_to_tensor(state_temp)
# No previous input exists before the first step.
prev_input = None

# Hyperparameters to calculate discount rewards
gamma = 0.99

# initialization of variables used in the main loop
# (seven independent empty lists, one per name on the left-hand side)
x_train, y_train, y_pred, rewards, r_tup, e_tup, rover_poss = [], [], [], [], [], [], []
reward_sum = 0
episode_nb = 0
# NOTE(review): presumably controls loading of a saved checkpoint; its use is
# not visible in this excerpt - confirm.
resume = True
running_reward = None
# NOTE(review): presumably the number of episodes between model saves - confirm
# against the main loop.
EPOCHS_BEFORE_SAVING = 50
moves_count = 0
# NOTE(review): looks like a reward floor for aborting an episode; its use is
# not visible here - confirm.
MAX_NEG_REWARD = -100