def run_q_learning(env, num_episodes, gamma, alpha, epsilon):
    """Train a tabular Q-learning agent on *env*.

    Args:
        env: environment with the classic gym API (``reset``/``step`` returning
            a 4-tuple) and a discrete ``action_space``.
        num_episodes: number of training episodes to run.
        gamma, alpha, epsilon: discount factor, learning rate and
            exploration rate forwarded to ``QLearningAgent``.

    Returns:
        ``(agent, stats)`` where ``stats`` is a dict mapping
        ``'episode_lengths'`` and ``'episode_rewards'`` to per-episode
        numpy arrays.
    """
    agent = QLearningAgent(env.action_space.n, gamma=gamma, alpha=alpha,
                           epsilon=epsilon)
    stats = {
        'episode_lengths': np.zeros(num_episodes),
        'episode_rewards': np.zeros(num_episodes),
    }

    for episode in range(num_episodes):
        # Lightweight progress indicator, refreshed every 20 episodes.
        if (episode + 1) % 20 == 0:
            print("\rEpisode {}/{}.".format(episode + 1, num_episodes), end="")
            sys.stdout.flush()

        state = env.reset()
        step_count = 0
        done = False
        while not done:
            action = agent.step(state)
            next_state, reward, done, _ = env.step(action)

            # Accumulate per-episode statistics before the Q-update.
            stats['episode_rewards'][episode] += reward
            stats['episode_lengths'][episode] = step_count

            agent.update(state, action, next_state, reward)
            step_count += 1
            state = next_state

    print()
    return agent, stats
def run_q_learning(num_episodes, max_eps_length, env, dom_no, loop_no, run_no,
                   with_options=False, factored=False):
    """Run tabular Q-learning on *env*, optionally augmenting the primitive
    action space with previously generated options ("skills").

    When ``with_options`` is True, pickled skills are loaded from
    ``RESULTS_PATH`` (per-run factored skills if ``factored``) and appended
    to the primitive action space; choosing an action index beyond
    ``env.action_space.n`` executes the corresponding option via
    ``execute_option``. The (state, option-id) inputs and resulting states
    are logged and written as CSVs under ``./data/<loop_no>/``, merged with
    the previous task's CSVs when ``run_no > 0``.

    Args:
        num_episodes: number of training episodes.
        max_eps_length: step cap per episode.
        env: gym-style environment (classic 4-tuple ``step`` API).
        dom_no, loop_no, run_no: experiment bookkeeping indices used to
            locate skill pickles and CSV output paths.
        with_options: include learned options in the action space.
        factored: use factored skills / factored state projection.

    Returns:
        ``(agent, stats)`` with ``stats`` an ``EpisodeStats`` of per-episode
        lengths and rewards.
    """
    if with_options:
        print('with skills')
        if factored:
            skills_path = (RESULTS_PATH + 'generated_options/' + str(dom_no)
                           + '/' + str(loop_no) + '/' + str(run_no)
                           + '_factored_skills.pickle')
        else:
            skills_path = (RESULTS_PATH + 'generated_options/' + str(dom_no)
                           + '/generated_skills.pickle')
        # FIX: the file handle was previously leaked; close it deterministically.
        # NOTE(review): pickle.load is only safe on trusted, locally generated files.
        with open(skills_path, "rb") as skills_file:
            skills = pickle.load(skills_file)
        number_of_actions = env.action_space.n + len(skills)
    else:
        number_of_actions = env.action_space.n

    agent = QLearningAgent(number_of_actions, gamma=0.9, alpha=0.12, epsilon=0.1)
    stats = EpisodeStats(episode_lengths=np.zeros(num_episodes),
                         episode_rewards=np.zeros(num_episodes))
    input_list = []
    output_list = []

    for i_episode in range(num_episodes):
        if (i_episode + 1) % 20 == 0:
            print("\rEpisode {}/{}.".format(i_episode + 1, num_episodes), end="")
            sys.stdout.flush()

        state = env.reset()
        for t in range(max_eps_length):
            action = agent.step(state)
            if action >= env.action_space.n:
                # Option action: log the (state, option-id) pair for later
                # model fitting. (Renamed from `input`, which shadowed the builtin.)
                option_input = state + (action - env.action_space.n, )
                input_list.append(list(option_input))
                option = skills[action - env.action_space.n]
                if factored:
                    state_in = option.factor_state(state, global_set)
                else:
                    state_in = state
                if option.in_initialisation_set(state_in):
                    next_state, reward, done, _, total_steps = execute_option(
                        25, env, option, state_in, factored=factored,
                        state_uf=state)
                    # Episode length accounts for the option's internal steps.
                    stats.episode_lengths[i_episode] = t + total_steps - 1
                else:
                    # Option not applicable here: penalize and stay in place.
                    next_state = state
                    reward = -1
                    done = False
                output_list.append(list(next_state))
            else:
                next_state, reward, done, _ = env.step(action)
                stats.episode_lengths[i_episode] = t

            # Update statistics and the Q-table (SMDP-style one-step update).
            stats.episode_rewards[i_episode] += reward
            agent.update(state, action, next_state, reward)
            if done:
                break
            state = next_state

    # FIX: `cols = global_set` aliased the module-level list, so the later
    # `cols.append('Action')` permanently mutated global_set across calls.
    # Copy it instead.
    cols = list(global_set)
    df_out = pd.DataFrame.from_records(output_list, columns=cols)
    cols.append('Action')
    df_in = pd.DataFrame.from_records(input_list, columns=cols)

    data_path = './data/' + str(loop_no)
    # FIX: exist_ok avoids the check-then-create race of isdir + makedirs.
    os.makedirs(data_path, exist_ok=True)
    if run_no > 0:
        print("SHOULDNT B HERE")
        df_in2 = pd.read_csv(
            data_path + '/input_data_task_' + str(run_no - 1) + '.csv',
            index_col=False)
        df_out2 = pd.read_csv(
            data_path + '/output_data_task_' + str(run_no - 1) + '.csv',
            index_col=False)
        # FIX: DataFrame.append was removed in pandas >= 2.0; pd.concat is the
        # drop-in replacement with identical semantics.
        df_in = pd.concat([df_in, df_in2], ignore_index=True, sort=False)
        df_out = pd.concat([df_out, df_out2], ignore_index=True, sort=False)

    # Drop any stray index columns ("Unnamed: 0") picked up from prior CSVs.
    df_in.drop(df_in.filter(regex='Unname'), axis=1, inplace=True)
    df_out.drop(df_out.filter(regex='Unname'), axis=1, inplace=True)
    df_in.to_csv(data_path + '/input_data_task_' + str(run_no) + '.csv')
    df_out.to_csv(data_path + '/output_data_task_' + str(run_no) + '.csv')
    return agent, stats