def expert_policy(idx, n_samples, args):
    # Worker that rolls out the MCTS expert and stores (state, action) pairs to disk.
    data = []
    my_simulator = SIMULATOR()
    progress = tqdm(range(n_samples), position=idx, desc='worker_{:02}'.format(idx))

    while len(data) < n_samples:
        state = my_simulator.reset()
        root = None

        for e in range(50):
            action, root = MCTS.search(state, args, root=root)
            data.append((state, action))
            state, reward, terminal = my_simulator.step(action)
            # reuse the subtree under the chosen action for the next search
            root = root.children[action]

            if terminal:
                break

            progress.update(1)

    if not os.path.exists(args.dir):
        os.makedirs(args.dir)

    with open('{}/{:02}.data'.format(args.dir, idx), 'wb') as file:
        pickle.dump(data, file)
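# A minimal sketch (not part of the original source) of how expert_policy could be
# run in parallel to build the expert dataset. The worker count, sample budget and
# helper name generate_expert_data are assumptions for illustration; `args` is
# expected to be an argparse namespace providing at least args.dir and the MCTS settings.
def generate_expert_data(args, n_workers=4, samples_per_worker=1000):
    import multiprocessing as mp

    workers = [mp.Process(target=expert_policy, args=(i, samples_per_worker, args))
               for i in range(n_workers)]
    for w in workers:
        w.start()
    for w in workers:
        w.join()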
def collector(idx, shared_model, shared_dataset, hyperparameters, lock):
    try:
        writer = SummaryWriter('runs/{}/collector:{:02}'.format(
            datetime.now().strftime("%d|%m_%H|%M"), idx))
        logging.basicConfig(filename='logs/collector:{:02}.log'.format(idx),
                            filemode='w', format='%(message)s', level=logging.DEBUG)

        # allocate a device
        n_gpu = t.cuda.device_count()
        if n_gpu > 0:
            Device.set_device(idx % n_gpu)

        local_model = deepcopy(shared_model)
        local_model.to(Device.get_device())
        local_model.eval()

        simulator = SIMULATOR()

        for itr in tqdm(count(), position=idx, desc='collector:{:02}'.format(idx)):
            # refresh the local copy with the latest shared weights
            local_model.load_state_dict(shared_model.state_dict())

            state = simulator.reset()
            episode_reward = 0

            for i in range(50):
                # find the expert action for the input belief
                expert_action, _ = expert(state, hyperparameters)

                # store the (state, expert action) pair in the shared dataset
                with lock:
                    shared_dataset.append((state, expert_action))

                # simulate the learner's action
                action, _ = local_model.search(state, hyperparameters)
                state, reward, terminal = simulator.step(action)
                episode_reward += reward

                if terminal:
                    break

            logging.debug('Episode reward: {:.2f}'.format(episode_reward))
            writer.add_scalar('episode_reward', episode_reward, itr)

        writer.close()

    except KeyboardInterrupt:
        print('exiting collector:{:02}'.format(idx))
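# A minimal sketch (not in the original source) of how collector workers might be
# spawned. The helper name launch_collectors, the use of torch.multiprocessing and
# the Manager-backed dataset/lock are assumptions; the trainer that consumes
# shared_dataset is not shown here.
def launch_collectors(shared_model, hyperparameters, n_collectors=2):
    import torch.multiprocessing as mp

    manager = mp.Manager()
    shared_dataset = manager.list()   # aggregated (state, expert_action) pairs
    lock = manager.Lock()             # guards appends from the collector processes

    shared_model.share_memory()       # let every process read the latest weights
    procs = [mp.Process(target=collector,
                        args=(i, shared_model, shared_dataset, hyperparameters, lock))
             for i in range(n_collectors)]
    for p in procs:
        p.start()
    return procs, shared_dataset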
def performer(solver, args, render=False):
    my_simulator = SIMULATOR()
    state = my_simulator.reset()
    episode_reward = 0

    if render:
        my_simulator.render()

    for i in range(MAX_EPISODE_LENGTH):
        action, _ = solver.search(state, args)
        state, reward, terminal = my_simulator.step(action)

        if render:
            print(SIMULATOR.ACTIONS[action], reward)
            my_simulator.render()

        episode_reward += reward

        if terminal:
            break

    return episode_reward
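# A small usage sketch (not in the original file): averaging the return of the
# solver over several evaluation episodes; the helper name and the episode count
# are assumptions for illustration.
def evaluate(solver, args, n_episodes=20):
    rewards = [performer(solver, args) for _ in range(n_episodes)]
    return sum(rewards) / len(rewards)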
def run_exper(model, steps, get_features, pre_proc_features):
    from environment import SIMULATOR

    # initializing our environment
    my_sim = SIMULATOR()

    # beginning of an episode
    state_temp = my_sim.reset()
    observation = my_sim.state_to_tensor(state_temp)

    r_tup, e_tup, rover_poss = [], [], []

    # main loop
    prev_input = None
    total_moves = 0
    MAX_MOVES = 25
    for i in range(steps):
        total_moves += 1
        start = time.perf_counter()

        # preprocess the observation into the flattened feature vector
        cur_input = observation
        x = cur_input.astype(float).ravel() if prev_input is not None else np.zeros(70)
        x = x[10:80] if prev_input is not None else x
        x = np.array([x[i] for i in range(len(x)) if not (i % 10 == 0)])
        x = np.array([x[i] for i in range(len(x)) if not ((i - 8) % 9 == 0)])

        x, rov_pos = get_rover_pos(x, r_tup, e_tup, rover_poss)
        x = np.array(x)
        rover_poss.append(rov_pos)

        """
        x = x[x != 0]
        if (len(x) == 1):
            x = np.zeros(4)
        x = x.tolist()
        x.append(-7.)
        x = np.array(x)
        """
        # print_map(x)

        x_t = pre_proc_features.fit_transform(x.reshape(-1, 1))
        x_t = x_t.reshape(1, INPUT_SIZE)[0]
        print("Shape = ", x_t.shape)
        prev_input = cur_input

        # forward the policy network and pick the most probable action
        proba = model.predict(np.expand_dims(x_t, axis=1).T)
        end = time.perf_counter()
        action = proba[0].argmax()
        print("Time taken = ", end - start)

        # run one step
        state_temp, reward, done, r_tup, e_tup = my_sim.step(action)
        observation = my_sim.state_to_tensor(state_temp)
        my_sim.render()
        time.sleep(1)

        if total_moves == MAX_MOVES:
            total_moves = 0
            done = True

        # if episode is over, reset to beginning
        if done:
            state_temp = my_sim.reset()
            observation = my_sim.state_to_tensor(state_temp)
            my_sim.render()
            rover_poss = []
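# A hedged sketch (an assumption, not taken from the original code) of a
# pre_proc_features object compatible with the fit_transform call in run_exper:
# any scikit-learn transformer with fit_transform works, e.g. a MinMaxScaler that
# rescales the flattened grid codes to [0, 1] before they reach the policy network.
from sklearn.preprocessing import MinMaxScaler

def make_pre_proc_features():
    return MinMaxScaler(feature_range=(0, 1))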
def run_exper(model, steps, get_features, pre_proc_features):
    r_tup, e_tup = [], []
    rover_poss = []
    total_stats = {'total': 0, 'good': 0}

    from environment import SIMULATOR

    # initializing our environment
    my_sim = SIMULATOR()

    # beginning of an episode
    state_temp = my_sim.reset()
    observation = my_sim.state_to_tensor(state_temp)
    state_obs = observation
    total_moves = 0

    # main loop
    prev_input = None
    for i in range(steps):
        # preprocess the observation into the flattened feature vector
        cur_input = observation
        x = cur_input.astype(float).ravel() if prev_input is not None else np.zeros(70)
        x = x[10:80] if prev_input is not None else x
        x = np.array([x[i] for i in range(len(x)) if not (i % 10 == 0)])
        x = np.array([x[i] for i in range(len(x)) if not ((i - 8) % 9 == 0)])
        prev_input = cur_input

        x, rover_pos = get_rover_pos(x, r_tup, e_tup, rover_poss)
        rover_poss.append(rover_pos)
        x = np.array(x)

        """
        x = x[x != 0]
        if (len(x) == 1):
            x = np.zeros(4)
        x = x.tolist()
        x.append(-7.)
        x = np.array(x)
        """

        x_t = pre_proc_features.fit_transform(x.reshape(-1, 1))
        x_t = x_t.reshape(1, INPUT_SIZE)[0]

        # forward the policy network and pick the most probable action
        proba = model.predict(np.expand_dims(x_t, axis=1).T)
        action = proba.argmax()

        # run one step
        state_temp, reward, done, r_tup, e_tup = my_sim.step(action)
        observation = my_sim.state_to_tensor(state_temp)
        # my_sim.render()

        total_moves += 1
        if total_moves == MAX_STEPS:
            done = True
            total_moves = 0

        # if the episode is over, record the outcome and reset to the beginning
        if done:
            total_stats['total'] += 1
            so = np.asarray(state_obs).ravel().tolist()
            o = np.asarray(observation).ravel().tolist()

            try:
                index_obs = so.index(7.0)
            except ValueError:
                index_obs = -1
            try:
                index_curr = o.index(7.0)
            except ValueError:
                index_curr = -1

            # the cell encoded as 7.0 was present at the start of the episode but
            # is gone now, so count the episode as successful
            if index_obs != -1 and index_curr == -1:
                total_stats['good'] += 1

            state_temp = my_sim.reset()
            observation = my_sim.state_to_tensor(state_temp)
            state_obs = observation
            rover_poss = []
            # my_sim.render()

    return total_stats
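# A small usage sketch (not in the original file): running the evaluation loop and
# reporting the success rate from the returned statistics. The helper name and the
# model, step budget and preprocessing objects passed in are assumptions for illustration.
def report_success_rate(model, get_features, pre_proc_features, steps=1000):
    stats = run_exper(model, steps, get_features, pre_proc_features)
    if stats['total'] > 0:
        print('solved {}/{} episodes ({:.1f}%)'.format(
            stats['good'], stats['total'],
            100.0 * stats['good'] / stats['total']))
    return stats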
parser.add_argument('--data_type', default='sparse', type=str,
                    help='Choose between encoded or sparse')
args = parser.parse_args()
data_type = args.data_type
model = get_model(data_type)

import numpy as np
import gym

# gym initialization
from environment import SIMULATOR
my_sim = SIMULATOR()
state_temp = my_sim.reset()
observation = my_sim.state_to_tensor(state_temp)
prev_input = None

# Hyperparameter used to calculate discounted rewards
gamma = 0.99

# initialization of variables used in the main loop
x_train, y_train, y_pred, rewards, r_tup, e_tup, rover_poss = [], [], [], [], [], [], []
reward_sum = 0
episode_nb = 0
resume = True
running_reward = None
EPOCHS_BEFORE_SAVING = 50
moves_count = 0
MAX_NEG_REWARD = -100
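# A hedged sketch (an assumption; the original training loop is not shown here) of
# the discounted-return computation that gamma = 0.99 is declared for: per-step
# rewards collected during an episode are folded backwards into returns and then
# normalised before being used as training targets.
def discount_rewards(episode_rewards, gamma=0.99):
    discounted = np.zeros(len(episode_rewards))
    running = 0.0
    for t_step in reversed(range(len(episode_rewards))):
        # return at step t = r_t + gamma * return at step t+1
        running = episode_rewards[t_step] + gamma * running
        discounted[t_step] = running

    # normalise so the returns have zero mean and unit variance
    discounted -= discounted.mean()
    std = discounted.std()
    if std > 0:
        discounted /= std
    return discounted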