import numpy as np
from catboost import CatBoostClassifier

import my_env


def train(deterministic=False):
    env = my_env.MyEnv(0, realtime_mode=True)

    # Load the pre-trained CatBoost policy.
    model = CatBoostClassifier()
    model.load_model("catboost_model.model")

    score = 0.0
    print_interval = 1

    for n_epi in range(10000):
        s = env.reset()
        done = False
        while not done:
            # Class probabilities over the playable actions.
            y_pred1 = model.predict(s, prediction_type="Probability")
            if deterministic:
                # action_mapping() turns the arg-max class index into an
                # environment action id (see the sketch below).
                y_pred_max = int(np.argmax(y_pred1))
                a = action_mapping(y_pred_max)
            else:
                # Sample an action according to the predicted probabilities.
                a = int(np.random.choice([0, 1, 3, 4, 5], p=y_pred1))
            s_prime, r, done, info = env.step(a)
            s = s_prime
            score += r
            if done:
                break

        if n_epi % print_interval == 0 and n_epi != 0:
            print("# of episode :{}, avg score : {:.5f}".format(
                n_epi, score / print_interval))
            score = 0.0

    env.close()
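# --- Hypothetical helper (not part of the original code) -------------------
# action_mapping() is defined elsewhere in the project. A minimal sketch of
# what it could look like, assuming the classifier's five classes correspond
# to the environment action ids 0, 1, 3, 4, 5 used by the sampling branch:

CLASS_TO_ACTION = [0, 1, 3, 4, 5]  # assumed class-index -> action-id ordering


def action_mapping(class_idx):
    """Map a CatBoost class index (0-4) to an environment action id."""
    return CLASS_TO_ACTION[class_idx]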
import torch
from torch.distributions import Categorical

import bc
import my_env


def train():
    env = my_env.MyEnv(0, realtime_mode=True)

    # Load the behaviour-cloning policy trained offline.
    model = bc.BC()
    model.load_state_dict(torch.load("imitation_model_1000.pt"))

    score = 0.0
    print_interval = 1

    for n_epi in range(10000):
        s = env.reset()
        done = False
        while not done:
            # The imitation policy outputs action probabilities; sample from them.
            prob = model(torch.from_numpy(s).float())
            m = Categorical(prob)
            a = m.sample().item()
            s_prime, r, done, info = env.step(a)
            s = s_prime
            score += r
            if done:
                break

        if n_epi % print_interval == 0 and n_epi != 0:
            print("# of episode :{}, avg score : {:.5f}".format(
                n_epi, score / print_interval))
            score = 0.0

    env.close()
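# --- Sketch only (assumption, not the original bc module) ------------------
# The loop above treats bc.BC() as a network whose forward pass returns a
# probability vector over actions. A minimal behaviour-cloning policy that is
# compatible with that usage could look like this; the default sizes are
# placeholders, not the trained model's real architecture.

import torch.nn as nn
import torch.nn.functional as F


class BC(nn.Module):
    def __init__(self, obs_dim=64, n_actions=6, hidden=128):
        super().__init__()
        self.fc1 = nn.Linear(obs_dim, hidden)
        self.fc2 = nn.Linear(hidden, n_actions)

    def forward(self, x):
        h = F.relu(self.fc1(x))
        return F.softmax(self.fc2(h), dim=-1)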
import numpy as np
import torch
from torch.distributions import Categorical

import my_env

# PPO and T_horizon come from the project's PPO module (not shown in this
# excerpt); a sketch of a compatible branched policy head follows below.


def train():
    env = my_env.MyEnv(0)
    model = PPO()
    score = 0.0
    print_interval = 1

    for n_epi in range(10000):
        s = env.reset()
        done = False
        while not done:
            for t in range(T_horizon):
                # The policy head returns a top-level action distribution and
                # one sub-action distribution per top-level action.
                a_prob, pi_b1, pi_b2, pi_b3 = model.pi(
                    torch.from_numpy(s).float())
                a_m = Categorical(a_prob)
                a = a_m.sample().item()

                if a == 0:
                    b = Categorical(pi_b1).sample().item()
                    a_b_prob = a_prob[a] * pi_b1[b]
                elif a == 1:
                    b = Categorical(pi_b2).sample().item()
                    a_b_prob = a_prob[a] * pi_b2[b]
                else:
                    b = Categorical(pi_b3).sample().item()
                    a_b_prob = a_prob[a] * pi_b3[b]

                # Flatten the (top-level, sub-action) pair into one action id.
                action = a * 3 + b
                s_prime, r, done, info = env.step(action)

                # Store the joint probability of the sampled pair and a one-hot
                # encoding of the top-level action for the PPO update.
                model.put_data(
                    (s, a, b, r, s_prime, a_b_prob, np.eye(3)[a], done))
                s = s_prime
                score += r
                if done:
                    break

            model.train_net()

        if n_epi % 50 == 0 and n_epi != 0:
            # Checkpoint the policy, then stop this training run.
            torch.save(model.state_dict(), f"ppo_model_{n_epi}.pt")
            print("saved!")
            break

        if n_epi % print_interval == 0 and n_epi != 0:
            print("# of episode :{}, avg score : {:.1f}".format(
                n_epi, score / print_interval))
            score = 0.0

    env.close()
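# --- Sketch only (assumption, not the original PPO class) ------------------
# The branched loop above expects model.pi() to return a top-level action
# distribution plus one sub-action distribution per top-level action, and it
# flattens the sampled pair into a single action id via a * 3 + b. One way
# such a policy head could be built (obs_dim and hidden size are placeholders):

import torch.nn as nn
import torch.nn.functional as F


class BranchedPolicyHead(nn.Module):
    def __init__(self, obs_dim=64, hidden=256):
        super().__init__()
        self.fc = nn.Linear(obs_dim, hidden)
        self.fc_a = nn.Linear(hidden, 3)   # top-level action: 3 choices
        self.fc_b1 = nn.Linear(hidden, 3)  # sub-actions used when a == 0
        self.fc_b2 = nn.Linear(hidden, 3)  # sub-actions used when a == 1
        self.fc_b3 = nn.Linear(hidden, 3)  # sub-actions used otherwise

    def forward(self, x):
        h = F.relu(self.fc(x))
        a_prob = F.softmax(self.fc_a(h), dim=-1)
        pi_b1 = F.softmax(self.fc_b1(h), dim=-1)
        pi_b2 = F.softmax(self.fc_b2(h), dim=-1)
        pi_b3 = F.softmax(self.fc_b3(h), dim=-1)
        return a_prob, pi_b1, pi_b2, pi_b3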
import numpy as np
import torch
from torch.distributions import Categorical

import my_env

# PPO and T_horizon come from the project's PPO module (not shown in this
# excerpt).


def train():
    env = my_env.MyEnv(0)
    # Two independent PPO policies, one per action dimension.
    model1 = PPO()
    model2 = PPO()
    score = 0.0
    print_interval = 20

    for n_epi in range(10000):
        s = env.reset()
        done = False
        while not done:
            for t in range(T_horizon):
                prob1 = model1.pi(torch.from_numpy(s).float())
                prob2 = model2.pi(torch.from_numpy(s).float())
                m1 = Categorical(prob1)
                m2 = Categorical(prob2)
                a1 = m1.sample().item()
                a2 = m2.sample().item()

                # The environment expects both sub-actions at once.
                s_prime, r, done, info = env.step(np.array([a1, a2]))

                # Both policies see the same observation and reward.
                model1.put_data((s, a1, r, s_prime, prob1[a1].item(), done))
                model2.put_data((s, a2, r, s_prime, prob2[a2].item(), done))
                s = s_prime
                score += r
                if done:
                    break

            model1.train_net()
            model2.train_net()

        # if n_epi % 50 == 0 and n_epi != 0:
        #     torch.save(model.state_dict(), f"ppo_model_{n_epi}.pt")
        #     print("saved!")
        #     break

        if n_epi % print_interval == 0 and n_epi != 0:
            print("# of episode :{}, avg score : {:.1f}".format(
                n_epi, score / print_interval))
            score = 0.0

    env.close()
import torch
from torch.distributions import Categorical

import my_env

# PPO and T_horizon come from the project's PPO module; a sketch of the
# interface this loop assumes follows below.


def train():
    env = my_env.MyEnv(0)
    model = PPO()
    score = 0.0
    print_interval = 20

    for n_epi in range(10000):
        s = env.reset()
        done = False
        while not done:
            # Collect up to T_horizon transitions before each PPO update.
            for t in range(T_horizon):
                prob = model.pi(torch.from_numpy(s).float())
                m = Categorical(prob)
                a = m.sample().item()
                s_prime, r, done, info = env.step(a)

                model.put_data((s, a, r, s_prime, prob[a].item(), done))
                s = s_prime
                score += r
                if done:
                    break

            model.train_net()

        if n_epi % 50 == 0 and n_epi != 0:
            # Checkpoint the policy, then stop this training run.
            torch.save(model.state_dict(), f"ppo_model_{n_epi}.pt")
            print("saved!")
            break

        if n_epi % print_interval == 0 and n_epi != 0:
            print("# of episode :{}, avg score : {:.1f}".format(
                n_epi, score / print_interval))
            score = 0.0

    env.close()
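# --- Sketch only (assumption, not the project's PPO implementation) --------
# The training loops above rely on a PPO object exposing pi(), put_data() and
# train_net(), in the style of the well-known minimalRL PPO example that this
# loop resembles. The sketch below shows a self-contained clipped-PPO class
# with that interface; the hyperparameters and network sizes are placeholders,
# and the branched and two-policy variants would need their own make_batch()
# and train_net(), since they store differently shaped transition tuples.

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

learning_rate = 0.0005  # assumed hyperparameters
gamma = 0.98
lmbda = 0.95
eps_clip = 0.1
K_epoch = 3
T_horizon = 20


class PPO(nn.Module):
    def __init__(self, obs_dim=64, n_actions=6, hidden=256):
        super().__init__()
        self.data = []
        self.fc1 = nn.Linear(obs_dim, hidden)
        self.fc_pi = nn.Linear(hidden, n_actions)
        self.fc_v = nn.Linear(hidden, 1)
        self.optimizer = optim.Adam(self.parameters(), lr=learning_rate)

    def pi(self, x, softmax_dim=0):
        h = F.relu(self.fc1(x))
        return F.softmax(self.fc_pi(h), dim=softmax_dim)

    def v(self, x):
        h = F.relu(self.fc1(x))
        return self.fc_v(h)

    def put_data(self, transition):
        self.data.append(transition)

    def make_batch(self):
        s_lst, a_lst, r_lst, s_prime_lst, prob_a_lst, done_lst = [], [], [], [], [], []
        for s, a, r, s_prime, prob_a, done in self.data:
            s_lst.append(s)
            a_lst.append([a])
            r_lst.append([r])
            s_prime_lst.append(s_prime)
            prob_a_lst.append([prob_a])
            done_lst.append([0.0 if done else 1.0])  # mask: 0 at episode end
        self.data = []
        return (torch.tensor(s_lst, dtype=torch.float),
                torch.tensor(a_lst),
                torch.tensor(r_lst, dtype=torch.float),
                torch.tensor(s_prime_lst, dtype=torch.float),
                torch.tensor(done_lst, dtype=torch.float),
                torch.tensor(prob_a_lst, dtype=torch.float))

    def train_net(self):
        s, a, r, s_prime, done_mask, prob_a = self.make_batch()
        for _ in range(K_epoch):
            # TD target and GAE-style advantage estimate.
            td_target = r + gamma * self.v(s_prime) * done_mask
            delta = (td_target - self.v(s)).detach().numpy()
            advantage_lst, advantage = [], 0.0
            for delta_t in delta[::-1]:
                advantage = gamma * lmbda * advantage + delta_t[0]
                advantage_lst.append([advantage])
            advantage_lst.reverse()
            advantage = torch.tensor(advantage_lst, dtype=torch.float)

            # Clipped surrogate objective plus a value-function loss.
            pi = self.pi(s, softmax_dim=1)
            pi_a = pi.gather(1, a)
            ratio = torch.exp(torch.log(pi_a) - torch.log(prob_a))
            surr1 = ratio * advantage
            surr2 = torch.clamp(ratio, 1 - eps_clip, 1 + eps_clip) * advantage
            loss = -torch.min(surr1, surr2) + \
                F.smooth_l1_loss(self.v(s), td_target.detach())

            self.optimizer.zero_grad()
            loss.mean().backward()
            self.optimizer.step()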
                # Episode ended in failure: apply the penalty, do a final
                # Q-update and record the episode length.
                print('LOSE')
                reward = FAIL_PENALTY
                agent.learn(cur_state, action, next_state, reward, done)
                print("Episode finished after {} timesteps".format(t + 1))
                print('#################################\n')
                history.append(t + 1)
                break
            agent.learn(cur_state, action, next_state, reward, done)
            cur_state = next_state
            if t == MAX_STEPS - 1:
                history.append(t + 1)
                print("Episode finished after {} timesteps".format(t + 1))
    return agent, history


env = my_env.MyEnv()  # gym.make('CartPole-v0')  # TODO myenv
# if RECORD:
#     env = wrappers.Monitor(env, '/home/vbalogh/git/reinforcement_learning-stormmax/my-experiment-1', force=True)


def get_actions(current_state):
    # Actions are the edge indices; if the state contains a 0, the action at
    # that position is excluded.
    all_actions = list(range(my_env.N_EDGES))
    if 0 in current_state:
        current_pos = current_state.index(0)
        del all_actions[current_pos]
    return all_actions


agent = qlearning.QLearningAgent(get_actions, epsilon=EPSILON, alpha=LEARNING_RATE,