Example No. 1
import numpy as np
from catboost import CatBoostClassifier

import my_env

# action_mapping (class index -> env action) is defined elsewhere in the
# original module.


def train(deterministic=False):
    env = my_env.MyEnv(0, realtime_mode=True)

    # Run a pretrained CatBoost classifier as the policy.
    model = CatBoostClassifier()
    model.load_model("catboost_model.model")

    score = 0.0
    print_interval = 1

    for n_epi in range(10000):
        s = env.reset()
        done = False
        while not done:
            # Class probabilities for the current state.
            y_pred1 = model.predict(s, prediction_type="Probability")

            if deterministic:
                # Greedy: most likely class, mapped to an env action.
                y_pred_max = int(np.argmax(y_pred1))
                a = action_mapping(y_pred_max)
            else:
                # Stochastic: sample an action with the predicted probabilities.
                a = int(np.random.choice([0, 1, 3, 4, 5], p=y_pred1))
            
            s_prime, r, done, info = env.step(a)

            s = s_prime

            score += r
            if done:
                break

        if n_epi%print_interval==0 and n_epi!=0:
            print("# of episode :{}, avg score : {:.5f}".format(n_epi, score/print_interval))
            score = 0.0

    env.close()
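The only model-dependent part of Example No. 1 is the action-selection step. Below is a minimal sketch of that step on a made-up 5-way probability vector, using the same action set [0, 1, 3, 4, 5] as the snippet; the numbers and variable names are illustrative only.

import numpy as np

# Made-up stand-in for the CatBoost probability output above.
y_pred = np.array([0.10, 0.50, 0.05, 0.30, 0.05])
actions = [0, 1, 3, 4, 5]

# Greedy choice: index of the most likely class (would go through
# action_mapping in the snippet above).
greedy_idx = int(np.argmax(y_pred))        # -> 1

# Stochastic choice: sample an action with the predicted probabilities.
sampled_action = int(np.random.choice(actions, p=y_pred))

print(greedy_idx, sampled_action)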
Example No. 2
import torch
from torch.distributions import Categorical

import bc
import my_env


def train():
    env = my_env.MyEnv(0, realtime_mode=True)
    # Load a behavioral-cloning policy trained offline (see the bc module).
    model = bc.BC()
    model.load_state_dict(torch.load("imitation_model_1000.pt"))
    score = 0.0
    print_interval = 1

    for n_epi in range(10000):
        s = env.reset()
        done = False
        while not done:
            # The policy outputs action probabilities; sample one action.
            prob = model(torch.from_numpy(s).float())
            m = Categorical(prob)
            a = m.sample().item()

            s_prime, r, done, info = env.step(a)

            s = s_prime

            score += r
            if done:
                break

        if n_epi % print_interval == 0 and n_epi != 0:
            print("# of episode :{}, avg score : {:.5f}".format(
                n_epi, score / print_interval))
            score = 0.0

    env.close()
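Example No. 2 assumes that bc.BC is a PyTorch module mapping a state vector to action probabilities. A minimal sketch of such a behavioral-cloning policy follows; the layer sizes and action count are assumptions, and the real bc.BC may differ.

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Categorical


class BC(nn.Module):
    # Tiny behavioral-cloning policy: state -> action probabilities.
    def __init__(self, state_dim=8, n_actions=6):  # sizes are assumptions
        super().__init__()
        self.fc1 = nn.Linear(state_dim, 64)
        self.fc2 = nn.Linear(64, n_actions)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        return F.softmax(self.fc2(x), dim=-1)


# Usage mirrors the loop above: probabilities in, sampled action out.
policy = BC()
s = torch.randn(8)
a = Categorical(policy(s)).sample().item()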
Example No. 3
import numpy as np
import torch
from torch.distributions import Categorical

import my_env

# PPO and T_horizon (the rollout length between updates) are defined elsewhere
# in the original module.


def train():
    env = my_env.MyEnv(0)
    model = PPO()
    score = 0.0
    print_interval = 1

    for n_epi in range(10000):
        s = env.reset()
        done = False
        while not done:
            for t in range(T_horizon):
                # The policy returns a distribution over the main action plus
                # one distribution per sub-action branch.
                a_prob, pi_b1, pi_b2, pi_b3 = model.pi(
                    torch.from_numpy(s).float())
                a_m = Categorical(a_prob)
                a = a_m.sample().item()
                # Sample the sub-action from the branch selected by the main
                # action; keep the joint probability P(a) * P(b | a) for PPO.
                if a == 0:
                    b1_m = Categorical(pi_b1)
                    b1 = b1_m.sample().item()
                    b = b1
                    a_b_prob = a_prob[a] * pi_b1[b1]
                elif a == 1:
                    b2_m = Categorical(pi_b2)
                    b2 = b2_m.sample().item()
                    b = b2
                    a_b_prob = a_prob[a] * pi_b2[b2]
                else:
                    b3_m = Categorical(pi_b3)
                    b3 = b3_m.sample().item()
                    b = b3
                    a_b_prob = a_prob[a] * pi_b3[b3]

                # Flatten (main action, sub-action) into one discrete env action.
                action = a * 3 + b

                s_prime, r, done, info = env.step(action)

                # Store the transition with the joint behavior probability and
                # a one-hot encoding of the main action.
                model.put_data(
                    (s, a, b, r, s_prime, a_b_prob, np.eye(3)[a], done))
                s = s_prime

                score += r
                if done:
                    break

            model.train_net()

        if n_epi % 50 == 0 and n_epi != 0:
            # Save a checkpoint; the break stops training after this first save.
            torch.save(model.state_dict(), f"ppo_model_{n_epi}.pt")
            print("saved!")
            break

        if n_epi % print_interval == 0 and n_epi != 0:
            print("# of episode :{}, avg score : {:.1f}".format(
                n_epi, score / print_interval))
            score = 0.0

    env.close()
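Example No. 3 factors each environment action into a main action a and a sub-action b, each with three options, and flattens the pair with action = a * 3 + b; the probability stored for the PPO update is the product of the two branch probabilities. A quick sketch of the flattening (the 3 x 3 layout is taken from the snippet):

# Flattened index for every (main action a, sub-action b) pair.
for a in range(3):
    for b in range(3):
        print(f"a={a}, b={b} -> env action {a * 3 + b}")
# a=0 covers actions 0-2, a=1 covers 3-5, a=2 covers 6-8.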
Example No. 4
import numpy as np
import torch
from torch.distributions import Categorical

import my_env

# PPO and T_horizon are defined elsewhere in the original module.


def train():
    env = my_env.MyEnv(0)
    # Two independent PPO policies, one per component of the 2-D action.
    model1 = PPO()
    model2 = PPO()
    score = 0.0
    print_interval = 20

    for n_epi in range(10000):
        s = env.reset()
        done = False
        while not done:
            for t in range(T_horizon):
                prob1 = model1.pi(torch.from_numpy(s).float())
                prob2 = model2.pi(torch.from_numpy(s).float())
                m1 = Categorical(prob1)
                m2 = Categorical(prob2)
                a1 = m1.sample().item()
                a2 = m2.sample().item()

                s_prime, r, done, info = env.step(np.array([a1, a2]))

                model1.put_data((s, a1, r, s_prime, prob1[a1].item(), done))
                model2.put_data((s, a2, r, s_prime, prob2[a2].item(), done))
                s = s_prime

                score += r
                if done:
                    break

            model1.train_net()
            model2.train_net()

        # if n_epi % 50 == 0 and n_epi != 0:
        #     torch.save(model.state_dict(), f"ppo_model_{n_epi}.pt")
        #     print("saved!")
        #     break

        if n_epi % print_interval == 0 and n_epi != 0:
            print("# of episode :{}, avg score : {:.1f}".format(
                n_epi, score / print_interval))
            score = 0.0

    env.close()
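Example No. 4 drives a 2-dimensional discrete action with two independent PPO policies: each head samples its own component, the pair is passed to the environment as one array, and each learner stores only its own behavior probability. A minimal sketch with made-up probability vectors (the three-way heads are an assumption):

import numpy as np
import torch
from torch.distributions import Categorical

# Made-up outputs of the two policy heads.
prob1 = torch.tensor([0.2, 0.5, 0.3])
prob2 = torch.tensor([0.6, 0.1, 0.3])
a1 = Categorical(prob1).sample().item()
a2 = Categorical(prob2).sample().item()

joint_action = np.array([a1, a2])          # what env.step(...) receives above
# Each learner later sees only its own action and behavior probability:
print(joint_action, prob1[a1].item(), prob2[a2].item())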
Example No. 5
import torch
from torch.distributions import Categorical

import my_env

# PPO and T_horizon are defined elsewhere in the original module.


def train():
    env = my_env.MyEnv(0)
    model = PPO()
    score = 0.0
    print_interval = 20

    for n_epi in range(10000):
        s = env.reset()
        done = False
        while not done:
            for t in range(T_horizon):
                # Sample an action from the current policy.
                prob = model.pi(torch.from_numpy(s).float())
                m = Categorical(prob)
                a = m.sample().item()

                s_prime, r, done, info = env.step(a)

                # Store the transition with the behavior probability of the
                # taken action, used later by the PPO update.
                model.put_data((s, a, r, s_prime, prob[a].item(), done))
                s = s_prime

                score += r
                if done:
                    break

            model.train_net()

        if n_epi % 50 == 0 and n_epi != 0:
            # Save a checkpoint; the break stops training after this first save.
            torch.save(model.state_dict(), f"ppo_model_{n_epi}.pt")
            print("saved!")
            break

        if n_epi % print_interval == 0 and n_epi != 0:
            print("# of episode :{}, avg score : {:.1f}".format(
                n_epi, score / print_interval))
            score = 0.0

    env.close()
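Examples No. 3 to No. 5 assume a PPO class exposing pi(), put_data() and train_net(). The per-step probability recorded with put_data (prob[a].item() above) is what the clipped-surrogate update needs. Below is a minimal sketch of that update on dummy tensors; the clipping range and numbers are assumptions, and a full train_net() would also compute advantages (e.g. with GAE) and a value loss.

import torch

eps_clip = 0.1                              # clipping range (assumed)

# Dummy batch: current-policy probabilities of the taken actions, the
# probabilities recorded at collection time, and advantage estimates.
pi_a      = torch.tensor([0.30, 0.60, 0.20])
prob_old  = torch.tensor([0.25, 0.70, 0.20])
advantage = torch.tensor([1.0, -0.5, 0.3])

# Clipped surrogate objective: ratio = pi_new / pi_old, clipped to
# [1 - eps, 1 + eps].
ratio = pi_a / prob_old
surr1 = ratio * advantage
surr2 = torch.clamp(ratio, 1 - eps_clip, 1 + eps_clip) * advantage
policy_loss = -torch.min(surr1, surr2).mean()
print(policy_loss)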
          print('LOSE')
          reward = FAIL_PENALTY
        agent.learn(cur_state, action, next_state, reward, done)
        print("Episode finished after {} timesteps".format(t + 1))
        print('#################################\n')
        history.append(t + 1)
        break
      agent.learn(cur_state, action, next_state, reward, done)
      cur_state = next_state
      if t == MAX_STEPS - 1:
        history.append(t + 1)
        print("Episode finished after {} timesteps".format(t + 1))
  return agent, history


env = my_env.MyEnv()  # gym.make('CartPole-v0')  # TODO myenv
# if RECORD:
#   env = wrappers.Monitor(env, '/home/vbalogh/git/reinforcement_learning-stormmax/my-experiment-1', force=True)

def get_actions(current_state):
  # Valid actions are every edge index except the one matching the position
  # of the value 0 in the state (current_state is assumed to be a list).
  all_actions = list(range(my_env.N_EDGES))
  if 0 in current_state:
    current_pos = current_state.index(0)
    del all_actions[current_pos]

  return all_actions
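
get_actions masks out the action whose index matches the position of the value 0 in the state list. A quick illustration with a hypothetical 4-edge environment (the state values are made up):

def get_actions_demo(current_state, n_edges=4):
  # Same logic as get_actions, with N_EDGES replaced by a local argument.
  all_actions = list(range(n_edges))
  if 0 in current_state:
    del all_actions[current_state.index(0)]
  return all_actions

print(get_actions_demo([5, 0, 7, 2]))  # -> [0, 2, 3]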


agent = qlearning.QLearningAgent(get_actions,
                                     epsilon=EPSILON,
                                     alpha=LEARNING_RATE,