Example #1
    def __init__(self, env):
        super(DDPG, self).__init__()

        pi_net = PiNet(self.ns, self.na)
        self.pi_net = pi_net.to(self.device)

        pi_target = PiNet(self.ns, self.na)
        self.pi_target = pi_target.to(self.device)
        self.load_state_dict(self.pi_target, self.pi_net.state_dict())

        q_net = QNet(self.ns, self.na)
        self.q_net = q_net.to(self.device)

        q_target = QNet(self.ns, self.na)
        self.q_target = q_target.to(self.device)
        self.load_state_dict(self.q_target, self.q_net.state_dict())

        self.optimizer_q = torch.optim.Adam(self.q_net.parameters(),
                                            lr=self.lr_q,
                                            betas=(0.9, 0.999),
                                            weight_decay=1e-2)

        self.optimizer_p = torch.optim.Adam(self.pi_net.parameters(),
                                            lr=self.lr_p,
                                            betas=(0.9, 0.999),
                                            weight_decay=0)

        self.noise = OrnsteinUhlenbeckActionNoise(
            torch.zeros(1, self.na).to(self.device),
            self.epsilon * torch.ones(1, self.na).to(self.device))
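
A minimal sketch of two pieces this constructor assumes but does not show: the Ornstein-Uhlenbeck exploration noise it instantiates and a Polyak-style soft update for the target networks. The class name matches the call above, but the default coefficients and the soft_update helper are assumptions, not code from this project.

import torch

class OrnsteinUhlenbeckActionNoise:
    """Temporally correlated exploration noise, as commonly paired with DDPG."""
    def __init__(self, mu, sigma, theta=0.15, dt=1e-2):
        self.mu, self.sigma, self.theta, self.dt = mu, sigma, theta, dt
        self.x = torch.zeros_like(mu)

    def __call__(self):
        # dx = theta * (mu - x) * dt + sigma * sqrt(dt) * N(0, 1)
        dx = self.theta * (self.mu - self.x) * self.dt \
             + self.sigma * (self.dt ** 0.5) * torch.randn_like(self.mu)
        self.x = self.x + dx
        return self.x

def soft_update(target_net, source_net, tau=0.005):
    """Polyak-average the online parameters into the target network."""
    with torch.no_grad():
        for t, s in zip(target_net.parameters(), source_net.parameters()):
            t.mul_(1.0 - tau).add_(tau * s)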
Example #2
def main():
    env = gym.make(env_name)
    env.seed(500)
    torch.manual_seed(500)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.n
    print('state size:', num_inputs)
    print('action size:', num_actions)

    net = QNet(num_inputs, num_actions)

    optimizer = optim.Adam(net.parameters(), lr=lr)
    writer = SummaryWriter('logs')

    net.to(device)
    net.train()
    running_score = 0
    steps = 0
    loss = 0

    for e in range(3000):
        done = False
        memory = Memory()

        score = 0
        state = env.reset()
        state = torch.Tensor(state).to(device)
        state = state.unsqueeze(0)

        while not done:
            steps += 1

            action = net.get_action(state)
            next_state, reward, done, _ = env.step(action)

            next_state = torch.Tensor(next_state)
            next_state = next_state.unsqueeze(0)

            mask = 0 if done else 1
            reward = reward if not done or score == 499 else -1

            action_one_hot = torch.zeros(2)
            action_one_hot[action] = 1
            memory.push(state, next_state, action_one_hot, reward, mask)

            score += reward
            state = next_state

        loss = QNet.train_model(net, memory.sample(), optimizer)

        score = score if score == 500.0 else score + 1
        running_score = 0.99 * running_score + 0.01 * score
        if e % log_interval == 0:
            print('{} episode | score: {:.2f}'.format(e, running_score))
            writer.add_scalar('log/score', float(running_score), e)
            writer.add_scalar('log/loss', float(loss), e)

        if running_score > goal_score:
            break
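
The loop above rebuilds Memory() at the start of every episode and hands memory.sample() directly to QNet.train_model, so the buffer only needs to store and return a single episode. A minimal sketch of such a buffer, assuming a namedtuple-of-tuples layout (the field names are assumptions):

from collections import namedtuple

Transition = namedtuple('Transition',
                        ('state', 'next_state', 'action', 'reward', 'mask'))

class Memory:
    """Collects one episode of transitions; sample() returns all of them."""
    def __init__(self):
        self.memory = []

    def push(self, state, next_state, action, reward, mask):
        self.memory.append(Transition(state, next_state, action, reward, mask))

    def sample(self):
        # Return the whole episode, grouped by field.
        return Transition(*zip(*self.memory))

    def __len__(self):
        return len(self.memory)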
Example #3
def main():

    if not (os.path.isdir("logs")):
        os.makedirs("logs")

    working_dir = "logs/" + args.dir
    if not (os.path.isdir(working_dir)):
        raise NameError(args.dir + " does not exist in dir logs")

    print(args)

    env = QubeSwingupEnv(use_simulator=args.sim, batch_size=2048 * 4)

    num_inputs = env.observation_space.shape[0]
    num_actions = NUMBER_OF_ACTIONS
    print('state size:', num_inputs)
    print('action size:', num_actions)

    net = QNet(num_inputs, num_actions) if not args.new_net else QNet_more_layers(num_inputs, num_actions)
    net.load_state_dict(torch.load(working_dir + "/best_model.pth", map_location=torch.device(device)))
    net.to(device)
    net.eval()
    running_score = 0
    epsilon = 1.0
    steps = 0
    beta = beta_start
    loss = 0

    best_running_score = -1000

    for e in range(1):
        done = False

        score = 0
        state = env.reset()
        state = torch.Tensor(state).to(device)
        state = state.unsqueeze(0)
        
        while not done:
            steps += 1
            action = get_continuous_action(get_action(state, net))
            if np.abs(state[0][1].item()) < deg2rad(25):
                action = pd_control_policy(state.cpu().numpy()[0])[0]
            next_state, reward, done, info = env.step(action)
            reward = give_me_reward(info["alpha"], info["theta"])
            if args.sim:
                env.render()
            if done:
                print(info)
                print("theta:" , info["theta"] * 180/np.pi)
            next_state = torch.Tensor(next_state).to(device)
            next_state = next_state.unsqueeze(0)

            score += reward
            state = next_state

        running_score = 0.99 * running_score + 0.01 * score
        print('{} episode | running_score: {:.2f} | score: {:.2f} | steps: {} '.format(e, running_score, score, steps))
    env.close()
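
This evaluation script drives the Qube with a discretized action set: get_action picks an index and get_continuous_action maps it back to a continuous command, with a PD controller taking over near the upright position. A sketch of one plausible mapping; the action count and voltage limit here are assumptions and must match whatever the network was trained with.

import numpy as np

NUMBER_OF_ACTIONS = 9      # assumed discretization
MAX_VOLTAGE = 3.0          # assumed actuator limit

def get_continuous_action(action_index,
                          n_actions=NUMBER_OF_ACTIONS,
                          max_u=MAX_VOLTAGE):
    """Spread the discrete indices evenly over a symmetric continuous range."""
    grid = np.linspace(-max_u, max_u, n_actions)
    return float(grid[action_index])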
Example #4
def main():
    env = gym.make(args.env_name)
    env.seed(500)
    torch.manual_seed(500)

    img_shape = env.observation_space.shape
    num_actions = 3
    print('image size:', img_shape)
    print('action size:', num_actions)

    net = QNet(num_actions)
    net.load_state_dict(torch.load(args.save_path + 'model.pth'))

    net.to(device)
    net.eval()

    epsilon = 0
    steps = 0

    for e in range(5):
        done = False

        score = 0
        state = env.reset()

        state = pre_process(state)
        state = torch.Tensor(state).to(device)
        history = torch.stack((state, state, state, state))

        for i in range(3):
            action = env.action_space.sample()
            state, reward, done, info = env.step(action)
            state = pre_process(state)
            state = torch.Tensor(state).to(device)
            state = state.unsqueeze(0)
            history = torch.cat((state, history[:-1]), dim=0)

        while not done:
            if args.render:
                env.render()

            steps += 1
            qvalue = net(history.unsqueeze(0))
            action = get_action(0, qvalue, num_actions)

            next_state, reward, done, info = env.step(action + 1)

            next_state = pre_process(next_state)
            next_state = torch.Tensor(next_state).to(device)
            next_state = next_state.unsqueeze(0)
            next_history = torch.cat((next_state, history[:-1]), dim=0)

            score += reward
            history = next_history

        print('{} episode | score: {:.2f}'.format(e, score))
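
The loop above stacks four pre_process-ed frames into a history before feeding the network. A sketch of the Atari-style preprocessing this usually implies (the 84x84 grayscale, normalized pipeline is an assumption about pre_process, not the project's code):

import cv2
import numpy as np

def pre_process(frame):
    """Convert a raw RGB frame to a normalized 84x84 grayscale image."""
    gray = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
    resized = cv2.resize(gray, (84, 84), interpolation=cv2.INTER_AREA)
    return resized.astype(np.float32) / 255.0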
Example #5
    def __init__(self, *largs, **kwargs):
        super(SSPG, self).__init__(*largs, **kwargs)

        pi_net = PiNet(self.ns, self.na, distribution='Normal')
        self.pi_net = pi_net.to(self.device)

        pi_target = PiNet(self.ns, self.na, distribution='Normal')
        self.pi_target = pi_target.to(self.device)
        self.load_state_dict(self.pi_target, self.pi_net.state_dict())

        q_net_1 = QNet(self.ns, self.na)
        self.q_net_1 = q_net_1.to(self.device)

        q_target_1 = QNet(self.ns, self.na)
        self.q_target_1 = q_target_1.to(self.device)
        self.load_state_dict(self.q_target_1, self.q_net_1.state_dict())

        q_net_2 = QNet(self.ns, self.na)
        self.q_net_2 = q_net_2.to(self.device)

        q_target_2 = QNet(self.ns, self.na)
        self.q_target_2 = q_target_2.to(self.device)
        self.load_state_dict(self.q_target_2, self.q_net_2.state_dict())

        self.optimizer_q_1 = torch.optim.Adam(self.q_net_1.parameters(),
                                              lr=self.lr_q,
                                              betas=(0.9, 0.999),
                                              weight_decay=self.weight_decay_q)

        self.optimizer_q_2 = torch.optim.Adam(self.q_net_2.parameters(),
                                              lr=self.lr_q,
                                              betas=(0.9, 0.999),
                                              weight_decay=self.weight_decay_q)

        self.optimizer_p = torch.optim.Adam(self.pi_net.parameters(),
                                            lr=self.lr_p,
                                            betas=(0.9, 0.999),
                                            weight_decay=self.weight_decay_p)

        if self.entropy_tunning:
            self.target_entropy = -torch.prod(
                torch.Tensor(self.na).to(self.device)).item()
            self.log_alpha = torch.tensor([0.],
                                          requires_grad=True,
                                          device=self.device)
            self.optimizer_alpha = torch.optim.Adam([self.log_alpha],
                                                    lr=self.lr_q)
            self.alpha = float(self.log_alpha.exp())
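
When entropy_tunning is enabled the constructor creates log_alpha, its optimizer, and a target_entropy, but the corresponding update step is not shown. A sketch of the usual temperature update built on those definitions; the function name, batch shapes, and the illustrative target_entropy value are assumptions.

import torch

def update_alpha(log_alpha, optimizer_alpha, log_pi, target_entropy):
    """One gradient step on the entropy temperature alpha."""
    alpha_loss = -(log_alpha * (log_pi + target_entropy).detach()).mean()
    optimizer_alpha.zero_grad()
    alpha_loss.backward()
    optimizer_alpha.step()
    return float(log_alpha.exp())

# Illustrative usage with a fake batch of log-probabilities:
log_alpha = torch.zeros(1, requires_grad=True)
optimizer_alpha = torch.optim.Adam([log_alpha], lr=3e-4)
log_pi = torch.randn(256, 1)          # log pi(a|s) for a sampled batch
alpha = update_alpha(log_alpha, optimizer_alpha, log_pi, target_entropy=-6.0)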
Example #6
    def __init__(self, env):
        super(SACV, self).__init__()

        self.env = env
        n_a = env.action_space.shape[0]
        n_s = env.observation_space.shape[0]

        pi_net = PiNet(n_s, n_a, distribution='Normal')
        self.pi_net = pi_net.to(self.device)

        q_net_1 = QNet(n_s, n_a)
        self.q_net_1 = q_net_1.to(self.device)

        q_net_2 = QNet(n_s, n_a)
        self.q_net_2 = q_net_2.to(self.device)

        v_net = QNet(n_s, 0)
        self.v_net = v_net.to(self.device)

        v_target = QNet(n_s, 0)
        self.v_target = v_target.to(self.device)
        self.load_state_dict(self.v_target, self.v_net.state_dict())

        self.optimizer_q_1 = torch.optim.Adam(self.q_net_1.parameters(),
                                              lr=self.lr_q,
                                              betas=(0.9, 0.999),
                                              weight_decay=1e-2)

        self.optimizer_q_2 = torch.optim.Adam(self.q_net_2.parameters(),
                                              lr=self.lr_q,
                                              betas=(0.9, 0.999),
                                              weight_decay=1e-2)

        self.optimizer_v = torch.optim.Adam(self.v_net.parameters(),
                                            lr=self.lr_q,
                                            betas=(0.9, 0.999),
                                            weight_decay=1e-2)

        # eps = 1e-04,
        self.optimizer_p = torch.optim.Adam(self.pi_net.parameters(),
                                            lr=self.lr_p,
                                            betas=(0.9, 0.999),
                                            weight_decay=0)

        self.sample = self.actor_rb
Example #7
def main():
    env = gym.make(env_name)
    env.seed(500)
    torch.manual_seed(500)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.n
    print('state size:', num_inputs)
    print('action size:', num_actions)

    online_net = QNet(num_inputs, num_actions)
    target_net = QNet(num_inputs, num_actions)
    target_net.load_state_dict(online_net.state_dict())
    online_net.share_memory()
    target_net.share_memory()

    optimizer = SharedAdam(online_net.parameters(), lr=lr)
    global_ep = mp.Value('i', 0)
    global_ep_r = mp.Value('d', 0.)
    res_queue = mp.Queue()

    writer = SummaryWriter('logs')

    online_net.to(device)
    target_net.to(device)
    online_net.train()
    target_net.train()

    workers = [
        Worker(online_net, target_net, optimizer, global_ep, global_ep_r,
               res_queue, i) for i in range(mp.cpu_count())
    ]
    [w.start() for w in workers]
    res = []
    while True:
        r = res_queue.get()
        if r is not None:
            res.append(r)
            [ep, ep_r, loss] = r
            writer.add_scalar('log/score', float(ep_r), ep)
            writer.add_scalar('log/loss', float(loss), ep)
        else:
            break
    [w.join() for w in workers]
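
main() only consumes the result queue: every Worker is expected to put [episode, episode_reward, loss] lists and finally a None sentinel so the loop can exit. A skeleton that honors this protocol; the actual rollout and update logic are omitted and the episode budget is an assumption.

import torch.multiprocessing as mp

class Worker(mp.Process):
    """Skeleton worker showing the reporting protocol main() expects."""
    def __init__(self, online_net, target_net, optimizer,
                 global_ep, global_ep_r, res_queue, rank):
        super().__init__()
        self.online_net, self.target_net = online_net, target_net
        self.optimizer = optimizer
        self.global_ep, self.global_ep_r = global_ep, global_ep_r
        self.res_queue, self.rank = res_queue, rank

    def run(self):
        for _ in range(10):                  # assumed episode budget
            ep_r, loss = 0.0, 0.0            # placeholder rollout result
            with self.global_ep.get_lock():
                self.global_ep.value += 1
            self.res_queue.put([self.global_ep.value, ep_r, loss])
        self.res_queue.put(None)             # sentinel: main() stops reading here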
Example #8
def main():
    env = gym.make(args.env_name)
    env.seed(500)
    torch.manual_seed(500)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.n
    print('state size:', num_inputs)
    print('action size:', num_actions)

    net = QNet(num_inputs, num_actions)
    net.load_state_dict(torch.load(args.save_path + 'model.pth'))

    net.to(device)
    net.eval()
    running_score = 0
    steps = 0

    for e in range(5):
        done = False

        score = 0
        state = env.reset()
        state = torch.Tensor(state).to(device)
        state = state.unsqueeze(0)

        while not done:
            env.render()

            steps += 1
            qvalue = net(state)
            action = get_action(qvalue)
            next_state, reward, done, _ = env.step(action)

            next_state = torch.Tensor(next_state).to(device)
            next_state = next_state.unsqueeze(0)

            score += reward
            state = next_state

        print('{} episode | score: {:.2f}'.format(e, score))
Example #9
File: rbi.py Project: eladsar/contact
    def __init__(self, *largs, **kwargs):
        super(RBI, self).__init__(*largs, **kwargs)

        pi_net = PiNet(self.ns, self.na, distribution='Normal')
        self.pi_net = pi_net.to(self.device)

        pi_target = PiNet(self.ns, self.na, distribution='Normal')
        self.pi_target = pi_target.to(self.device)
        self.load_state_dict(self.pi_target, self.pi_net.state_dict())

        q_net_1 = QNet(self.ns, self.na)
        self.q_net_1 = q_net_1.to(self.device)

        q_target_1 = QNet(self.ns, self.na)
        self.q_target_1 = q_target_1.to(self.device)
        self.load_state_dict(self.q_target_1, self.q_net_1.state_dict())

        q_net_2 = QNet(self.ns, self.na)
        self.q_net_2 = q_net_2.to(self.device)

        q_target_2 = QNet(self.ns, self.na)
        self.q_target_2 = q_target_2.to(self.device)
        self.load_state_dict(self.q_target_2, self.q_net_2.state_dict())

        self.optimizer_q_1 = torch.optim.Adam(self.q_net_1.parameters(),
                                              lr=self.lr_q,
                                              betas=(0.9, 0.999),
                                              weight_decay=self.weight_decay_q)

        self.optimizer_q_2 = torch.optim.Adam(self.q_net_2.parameters(),
                                              lr=self.lr_q,
                                              betas=(0.9, 0.999),
                                              weight_decay=self.weight_decay_q)

        self.optimizer_p = torch.optim.Adam(self.pi_net.parameters(),
                                            lr=self.lr_p,
                                            betas=(0.9, 0.999),
                                            weight_decay=self.weight_decay_p)

        self.alpha = self.rbi_alpha
        if self.entropy_tunning:
            # self.target_entropy = -float(self.na)
            std_target = 0.3 / math.sqrt(self.na)
            self.target_entropy = self.na * 0.5 * math.log(2 * math.pi * math.e * (std_target ** 2))
            print(f'target entropy: {self.target_entropy}')
            self.lr_alpha = 0.01
Example #10
File: td3.py Project: eladsar/contact
    def __init__(self, *largs, **kwargs):
        super(TD3, self).__init__(*largs, **kwargs)

        pi_net = PiNet(self.ns, self.na)
        self.pi_net = pi_net.to(self.device)

        pi_target = PiNet(self.ns, self.na)
        self.pi_target = pi_target.to(self.device)
        self.load_state_dict(self.pi_target, self.pi_net.state_dict())

        q_net_1 = QNet(self.ns, self.na)
        self.q_net_1 = q_net_1.to(self.device)

        q_target_1 = QNet(self.ns, self.na)
        self.q_target_1 = q_target_1.to(self.device)
        self.load_state_dict(self.q_target_1, self.q_net_1.state_dict())

        q_net_2 = QNet(self.ns, self.na)
        self.q_net_2 = q_net_2.to(self.device)

        q_target_2 = QNet(self.ns, self.na)
        self.q_target_2 = q_target_2.to(self.device)
        self.load_state_dict(self.q_target_2, self.q_net_2.state_dict())

        self.optimizer_q_1 = torch.optim.Adam(self.q_net_1.parameters(),
                                              lr=self.lr_q,
                                              betas=(0.9, 0.999))

        self.optimizer_q_2 = torch.optim.Adam(self.q_net_2.parameters(),
                                              lr=self.lr_q,
                                              betas=(0.9, 0.999))

        self.optimizer_p = torch.optim.Adam(self.pi_net.parameters(),
                                            lr=self.lr_p,
                                            betas=(0.9, 0.999))

        self.noise = RandomNoise(
            torch.zeros(1, self.na).to(self.device), self.epsilon)
Example #11
def test(level_list, render=True):
    online_net = QNet(h=84, w=84, outputs=36)
    online_net.load_state_dict(torch.load('saved/online_net.pt'))

    online_net.to(device)

    cnt = 0
    death = 0
    total_reward = 0.0

    str_level_list = [LEVEL_SET[idx - 1] for idx in level_list]
    for level in str_level_list:
        env = make_retro(game=env_name,
                         state=level,
                         use_restricted_actions=retro.Actions.DISCRETE)

        obs = env.reset()
        state = torch.Tensor(obs).to(device).permute(2, 0, 1)
        #state = state.view(state.size()[0], -1)
        state = state.unsqueeze(0)

        previous_lives = 3
        previous_level = level_list[cnt]
        cnt += 1
        if death >= 3:
            break

        for t in count():
            action = online_net.get_action(state.to(device))

            if render:
                env.render()
                time.sleep(0.02)

            next_state, reward, done, info = env.step(action)

            next_state = torch.Tensor(next_state).permute(2, 0, 1)
            #next_state = next_state.view(next_state.size()[0], -1)
            next_state = next_state.unsqueeze(0)

            total_reward += reward

            current_lives = info['lives']
            current_level = info['level']

            if current_lives != previous_lives:
                print('Dead')
                previous_lives = info['lives']
                death += 1
                #if death >= 3:
                #    print("Finished ", level, " Total reward: {}".format(total_reward))
                #    break

            if current_level != previous_level:
                print('Stage changed')
                print("Finished ", level,
                      " Total reward: {}".format(total_reward))
                break

            state = next_state

            if done:
                print('All lives gone')
                print("Finished ", level,
                      " Total reward: {}".format(total_reward))
                break

        env.close()
    return
Example #12
def main():

    if not (os.path.isdir("logs")):
        os.makedirs("logs")

    if (args.entropy and args.boltzmann):
        raise ValueError("Entropy as well as Boltzmann set.")

    print(args)

    working_dir = "logs/" + args.dir
    if not (os.path.isdir(working_dir)):
        os.mkdir(working_dir)

    env = QubeSwingupEnv(use_simulator=True)

    num_inputs = env.observation_space.shape[0]
    num_actions = NUMBER_OF_ACTIONS
    print('state size:', num_inputs)
    print('action size:', num_actions)

    online_net = QNet(num_inputs, num_actions)
    target_net = QNet(num_inputs, num_actions)

    update_target_model(online_net, target_net)

    optimizer = optim.Adam(online_net.parameters(), lr=lr)
    writer = SummaryWriter(working_dir)

    online_net.to(device)
    target_net.to(device)
    online_net.train()
    target_net.train()
    memory = Memory_With_TDError(replay_memory_capacity)
    running_score = 0
    epsilon = 1.0
    steps = 0
    beta = beta_start
    loss = 0
    training_started = False

    best_running_score = -1000

    for e in range(args.e):
        done = False

        score = 0
        state = env.reset()
        state = torch.Tensor(state).to(device)
        state = state.unsqueeze(0)
        start_time = time.time()

        while not done:
            steps += 1
            action = get_action(state,
                                target_net,
                                epsilon,
                                use_entropy=args.entropy,
                                use_boltzmann=args.boltzmann)
            next_state, reward, done, info = env.step(
                get_continuous_action(action))

            reward = give_me_reward(info["alpha"], info["theta"])

            next_state = torch.Tensor(next_state).to(device)
            next_state = next_state.unsqueeze(0)

            mask = 0 if done else 1
            action_one_hot = np.zeros(NUMBER_OF_ACTIONS)
            action_one_hot[action] = 1
            memory.push(state, next_state, action_one_hot, reward, mask)

            score += reward
            state = next_state

            if steps > initial_exploration:
                if not training_started:
                    print("---------------- training started ---------------")
                    training_started = True
                epsilon -= 0.000005
                epsilon = max(epsilon, 0.1)
                beta += 0.000005
                beta = min(1, beta)

                batch, weights = memory.sample(batch_size, online_net,
                                               target_net, beta)
                loss = QNet.train_model(online_net, target_net, optimizer,
                                        batch, weights, device)

                if steps % update_target == 0:
                    update_target_model(online_net, target_net)

        end_time = time.time()
        running_score = 0.99 * running_score + 0.01 * score
        if e % log_interval == 0:
            print(
                '{} episode | score: {:.2f} | epsilon: {:.2f} | beta: {:.2f}'.
                format(e, running_score, epsilon, beta))
            writer.add_scalar('log/score', float(running_score), e)
            writer.add_scalar('log/loss', float(loss), e)

        if running_score > best_running_score and args.save:
            torch.save(online_net.state_dict(),
                       working_dir + "/best_model.pth")
            best_running_score = running_score
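
Memory_With_TDError.sample returns both a batch and per-transition weights, and beta is annealed toward 1 during training; this is the importance-sampling correction from prioritized experience replay. A sketch of the arithmetic such a buffer typically performs (alpha and the max-normalization follow the PER paper and are assumptions about this implementation):

import numpy as np

def priorities_to_weights(td_errors, batch_idx, beta, alpha=0.6, eps=1e-6):
    """Turn absolute TD errors into sampling probabilities and IS weights."""
    priorities = (np.abs(td_errors) + eps) ** alpha
    probs = priorities / priorities.sum()
    weights = (len(td_errors) * probs[batch_idx]) ** (-beta)
    weights /= weights.max()      # normalize so the largest weight is 1
    return probs, weights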
Example #13
def main():
    # cartpole test
    if (cartpole_test):
        envs_fun = [lambda: gym.make('CartPole-v0')]
        envs_fun = np.tile(envs_fun, 3)
        envs = ShmemVecEnv(envs_fun)
        dummy_env = envs_fun[0]()
    else:
        INPUT_FILE = '../data/05f2a901.json'
        with open(INPUT_FILE, 'r') as f:
            puzzle = json.load(f)

        envs_fun = [
            lambda: gym.make('arc-v0',
                             input=task['input'],
                             output=task['output'],
                             need_ui=need_ui) for task in puzzle['train']
        ]
        #pdb.set_trace()
        envs_fun = envs_fun[0:1]
        envs = ShmemVecEnv(envs_fun)
        dummy_env = envs_fun[0]()

    env_num = len(envs_fun)
    torch.manual_seed(500)

    num_inputs = dummy_env.observation_space.shape[0]
    num_actions = dummy_env.action_space.n
    print('state size:', num_inputs)
    print('action size:', num_actions)

    online_net = QNet(num_inputs, num_actions, cartpole_test, evalution_mode)
    target_net = QNet(num_inputs, num_actions, cartpole_test, evalution_mode)

    if (evalution_mode):
        online_net = torch.load('../result/arc0.model')
        target_net = torch.load('../result/arc0.model')

    update_target_model(online_net, target_net)

    optimizer = optim.Adam(online_net.parameters(), lr=lr)
    writer = SummaryWriter('logs')

    online_net.to(device)
    target_net.to(device)
    online_net.train()
    target_net.train()
    memory = Memory(replay_memory_capacity)

    score = 0
    epsilon = 1.0
    steps = 0
    loss = 0

    states = envs.reset()

    try:
        while True:
            if (need_ui):
                envs.render()
            steps += 1

            global initial_exploration
            if (initial_exploration > 0):
                initial_exploration -= 1

            actions = []

            for state in states:
                state = torch.Tensor(state).to(device)
                state = state.unsqueeze(0)
                action = get_action(state, target_net,
                                    0 if evalution_mode else epsilon,
                                    dummy_env)
                if (evalution_mode):
                    print(action)
                actions.append(action)

            next_states, rewards, dones, info = envs.step(actions)
            #print(rewards)

            masks = np.zeros(envs.num_envs)
            for i in range(envs.num_envs):
                masks[i] = 0 if dones[i] else 1

            for i in range(envs.num_envs):
                #print(rewards[i])
                action_one_hot = np.zeros(dummy_env.action_space.n)
                action_one_hot[actions[i]] = 1
                memory.push(states[i], next_states[i], action_one_hot,
                            rewards[i], masks[i])

            #score += reward
            states = next_states

            if not evalution_mode and steps > initial_exploration:
                epsilon -= 0.00003
                epsilon = max(epsilon, 0.1)

                batch = memory.sample(batch_size)
                loss = QNet.train_model(online_net, target_net, optimizer,
                                        batch, device)

                if steps % update_target == 0:
                    update_target_model(online_net, target_net)

            if (steps > 1028):
                states = envs.reset()
                steps = 0
                print(
                    'new episode ------------------------------------------')

    except KeyboardInterrupt:
        print('save model')
        torch.save(target_net, '../result/arc.model')
        sys.exit(0)
Example #14
def main():
    ### Initialize the environment
    env = gym.make(env_name)
    env.seed(500)
    torch.manual_seed(500)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.n
    print('state size:', num_inputs)
    print('action size:', num_actions)

    ### Build the policy network
    net = QNet(num_inputs, num_actions)
    optimizer = optim.Adam(net.parameters(), lr=lr)

    net.to(device)
    net.train()

    ### Initialize miscellaneous variables
    running_score = 0
    steps = 0
    loss = 0
    steps_before = 0

    for e in range(10000):
        done = False
        ### Memory is emptied every episode (so there is effectively no experience replay)
        memory = Memory()

        ### Reset the environment to its initial state
        score = 0
        state = env.reset()
        state = torch.Tensor(state).to(device)
        state = state.unsqueeze(0)

        while not done:
            steps += 1

            ### No epsilon: each action's value is converted directly into a probability to choose the action
            action = net.get_action(state)
            next_state, reward, done, _ = env.step(action)

            next_state = torch.Tensor(next_state)
            next_state = next_state.unsqueeze(0)

            mask = 0 if done else 1
            reward = reward if not done or score == 499 else -1

            action_one_hot = torch.zeros(num_actions)
            action_one_hot[action] = 1
            memory.push(state, next_state, action_one_hot, reward, mask)

            score += reward
            state = next_state

        ### Train on one full episode at a time
        ### memory.sample returns the entire episode memory rather than a random selection
        loss = QNet.train_model(net, optimizer, memory.sample())

        print("Ep {0:04d}: {1} step".format(e, steps - steps_before))
        steps_before = steps

        score = score if score == 500.0 else score + 1
        running_score = 0.99 * running_score + 0.01 * score
        if e % log_interval == 0:
            print('{} episode | score: {:.2f}'.format(e, running_score))

        if running_score > goal_score:
            break
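
Because memory.sample() hands the whole episode to QNet.train_model, the update is a Monte-Carlo policy-gradient step, which needs the discounted return of every step. A sketch of that computation for the (state, next_state, action, reward, mask) layout used above; gamma is an assumption.

import torch

def discounted_returns(rewards, masks, gamma=0.99):
    """Compute discounted returns backwards over one episode."""
    returns = torch.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running * masks[t]
        returns[t] = running
    return returns

# Example: a three-step episode ending with the -1 penalty used above.
print(discounted_returns([1.0, 1.0, -1.0], [1, 1, 0]))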
Example #15
File: train.py Project: kaznyan/temp
def main():
    env = gym.make(env_name)
    env.seed(500)
    torch.manual_seed(500)

    ### The NN input/output sizes depend on the environment
    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.n
    print('state size:', num_inputs)
    print('action size:', num_actions)

    ### Create and initialize the two networks
    online_net = QNet(num_inputs, num_actions)
    target_net = QNet(num_inputs, num_actions)
    update_target_model(online_net, target_net)

    optimizer = optim.Adam(online_net.parameters(), lr=lr)

    ### Network setup: CPU / GPU
    online_net.to(device)
    target_net.to(device)
    ### Network setup: start in training mode
    online_net.train()
    target_net.train()

    ### Initial settings before training
    memory = Memory(replay_memory_capacity)
    running_score = 0
    epsilon = 1.0
    steps = 0
    loss = 0
    steps_before = 0

    for e in range(3000):
        done = False

        score = 0
        state = env.reset()
        state = torch.Tensor(state).to(device)
        state = state.unsqueeze(0)

        while not done:
            steps += 1

            ### Action selection is done with target_net
            action = get_action(state, target_net, epsilon, env)

            ### Observe the next state and receive the reward
            next_state, reward, done, _ = env.step(action)
            next_state = torch.Tensor(next_state)
            next_state = next_state.unsqueeze(0)
            if e % 10 == 0:
                print(next_state, action, reward)

            ### Rewritten because the original was hard to read
            if done:
                mask = 0
            else:
                mask = 1

            ### Record the transition in memory
            action_one_hot = np.zeros(num_actions)
            action_one_hot[action] = 1
            memory.push(state, next_state, action_one_hot, reward, mask)

            ### reward is basically -1
            score += reward  ### only used to record how many steps the episode lasted

            state = next_state

            if steps > initial_exploration:
                epsilon -= 0.00005
                epsilon = max(epsilon, 0.1)

                ### Train online_net
                batch = memory.sample(batch_size)
                loss = QNet.train_model(online_net, target_net, optimizer,
                                        batch)

                ### Occasionally overwrite target_net with online_net
                if steps % update_target == 0:
                    update_target_model(online_net, target_net)

        print("Ep {0:04d}: {1} step".format(e, steps - steps_before))
        steps_before = steps

        score = score if score == 200.0 else score + 1
        running_score = 0.99 * running_score + 0.01 * score
        if e % log_interval == 0:
            print('{} episode | score: {:.2f} | epsilon: {:.2f}'.format(
                e, running_score, epsilon))

        if running_score > goal_score:
            break
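
The loop above calls get_action(state, target_net, epsilon, env) but never defines it. A sketch of the epsilon-greedy rule it implies; the tie-breaking and the integer return type are assumptions.

import random
import torch

def get_action(state, target_net, epsilon, env):
    """Epsilon-greedy action selection against the target network."""
    if random.random() <= epsilon:
        return env.action_space.sample()
    with torch.no_grad():
        qvalue = target_net(state)
    return int(torch.argmax(qvalue, dim=1).item())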
Example #16
File: train.py Project: kaznyan/temp
def main():
    env = gym.make(env_name)
    env.seed(500)
    torch.manual_seed(500)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.n
    print('state size:', num_inputs)
    print('action size:', num_actions)

    ### Build the policy network
    ### For each input it outputs π(a|s) and Q(s, a)
    ### The two heads share the same dimensions and unit counts
    net = QNet(num_inputs, num_actions)
    optimizer = optim.Adam(net.parameters(), lr=lr)

    net.to(device)
    net.train()

    ### Initialize miscellaneous variables
    running_score = 0
    steps = 0
    loss = 0
    steps_before = 0

    df = pd.DataFrame(index=range(10000), columns=["steps", "loss_policy", "loss_value"])

    for e in range(10000):
        done = False
        ### Train one step at a time, without even keeping a per-episode memory

        ### Reset the environment to its initial state
        score = 0
        state = env.reset()
        state = torch.Tensor(state).to(device)
        state = state.unsqueeze(0)

        lp = []
        lv = []
        while not done:
            steps += 1

            ### No epsilon: each action's value is converted directly into a probability to choose the action
            action = net.get_action(state)
            next_state, reward, done, _ = env.step(action)

            next_state = torch.Tensor(next_state)
            next_state = next_state.unsqueeze(0)

            mask = 0 if done else 1
            reward = reward if not done or score == 499 else -1
            transition = [state, next_state, action, reward, mask]

            score += reward
            state = next_state

            ### At each step, train only on that step's result
            loss, loss_policy, loss_value = QNet.train_model(net, optimizer, transition)
            # loss = QNet.train_model(net, optimizer, transition)
            lp.append(loss_policy.item())
            lv.append(loss_value.item())

        lp = np.asarray(lp[:-1]).sum() / (len(lp) - 1)
        lv = np.asarray(lv[:-1]).sum() / (len(lv) - 1)
        print("Ep {0:04d}: {1} step, loss_policy: {2}, loss_value: {3}".format(e, steps - steps_before, lp, lv))
        # print("Ep {0:04d}: {1} step".format(e, steps - steps_before))
        df.loc[e, "steps"]       = steps - steps_before
        df.loc[e, "loss_policy"] = lp
        df.loc[e, "loss_value"]  = lv
        steps_before = steps

        score = score if score == 500.0 else score + 1
        running_score = 0.99 * running_score + 0.01 * score
        if e % log_interval == 0:
            print('{} episode | score: {:.2f}'.format(e, running_score))

        if running_score > goal_score:
            break
    df.to_csv("loss.csv")
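
QNet.train_model(net, optimizer, transition) is called once per step and returns a total loss plus separate policy and value losses. A sketch of what a one-step actor-critic update of that shape usually computes, assuming the network returns (policy logits, state value); this is an illustration, not the project's train_model.

import torch
import torch.nn.functional as F

def one_step_actor_critic_loss(net, transition, gamma=0.99):
    """Losses for a single (state, next_state, action, reward, mask) step."""
    state, next_state, action, reward, mask = transition
    logits, value = net(state)                  # assumed network interface
    with torch.no_grad():
        _, next_value = net(next_state)
        td_target = reward + gamma * mask * next_value
    advantage = td_target - value.detach()
    log_prob = F.log_softmax(logits, dim=-1)[0, action]
    loss_policy = -(log_prob * advantage).mean()
    loss_value = F.mse_loss(value, td_target)
    return loss_policy + loss_value, loss_policy, loss_value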
Example #17
def main():
    env = gym.make(args.env_name)
    env.seed(500)
    torch.manual_seed(500)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.n
    print('state size:', num_inputs)
    print('action size:', num_actions)

    net = QNet(num_inputs, num_actions)
    target_net = QNet(num_inputs, num_actions)
    update_target_model(net, target_net)

    optimizer = optim.Adam(net.parameters(), lr=0.001)
    writer = SummaryWriter('logs')

    if not os.path.isdir(args.save_path):
        os.makedirs(args.save_path)
    
    net.to(device)
    target_net.to(device)
    net.train()
    target_net.train()
    memory = Memory(10000)
    running_score = 0
    epsilon = 1.0
    steps = 0
    
    for e in range(3000):
        done = False
        
        score = 0
        state = env.reset()
        state = torch.Tensor(state).to(device)
        state = state.unsqueeze(0)

        while not done:
            if args.render:
                env.render()

            steps += 1
            qvalue = net(state)
            action = get_action(epsilon, qvalue, num_actions)
            next_state, reward, done, _ = env.step(action)
            
            next_state = torch.Tensor(next_state)
            next_state = next_state.unsqueeze(0)
            
            mask = 0 if done else 1
            reward = reward if not done or score == 499 else -1
            memory.push(state, next_state, action, reward, mask)

            score += reward
            state = next_state

            if steps > args.initial_exploration:
                epsilon -= 0.00005
                epsilon = max(epsilon, 0.1)

                batch = memory.sample(args.batch_size)
                train_model(net, target_net, optimizer, batch, args.batch_size)

                if steps % args.update_target == 0:
                    update_target_model(net, target_net)

        score = score if score == 500.0 else score + 1
        running_score = 0.99 * running_score + 0.01 * score
        if e % args.log_interval == 0:
            print('{} episode | score: {:.2f} | epsilon: {:.2f}'.format(
                e, running_score, epsilon))
            writer.add_scalar('log/score', float(score), e)

        if running_score > args.goal_score:
            ckpt_path = args.save_path + 'model.pth'
            torch.save(net.state_dict(), ckpt_path)
            print('running score exceeds the goal score so end')
            break   
Example #18
def main():
    env = gym.make(env_name)
    env.seed(500)
    torch.manual_seed(500)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.n
    print('state size:', num_inputs)
    print('action size:', num_actions)

    ### Build the policy network
    ### For each input it outputs π(a|s) and V(s)
    ### V has a single output, but the advantage function is computed at training time
    net = QNet(num_inputs, num_actions)
    optimizer = optim.Adam(net.parameters(), lr=lr)

    net.to(device)
    net.train()

    ### Initialize miscellaneous variables
    running_score = 0
    steps = 0
    loss = 0
    steps_before = 0

    df = pd.DataFrame(index=range(10000),
                      columns=["steps", "loss_policy", "loss_value"])

    memory = Memory()

    for e in range(10000):
        done = False
        ### Unlike the previous example, transitions are accumulated in a shared memory and trained in batches below

        ### Reset the environment to its initial state
        score = 0
        state = env.reset()
        state = torch.Tensor(state).to(device)
        state = state.unsqueeze(0)

        while not done:
            steps += 1

            ### No epsilon: each action's value is converted directly into a probability to choose the action
            action = net.get_action(state)
            next_state, reward, done, _ = env.step(action)

            next_state = torch.Tensor(next_state)
            next_state = next_state.unsqueeze(0)

            mask = 0 if done else 1
            reward = reward if not done or score == 499 else -1

            action_one_hot = torch.zeros(num_actions)
            action_one_hot[action] = 1

            transition = [state, next_state, action, reward, mask]

            memory.push(state, next_state, action_one_hot, reward, mask)

            score += reward
            state = next_state

        steps_before = steps

        score = score if score == 500.0 else score + 1
        running_score = 0.99 * running_score + 0.01 * score

        if e % 16 == 0:
            ### Every 16 episodes, train on the accumulated memory at once
            loss, loss_policy, loss_value = QNet.train_model(
                net, optimizer, memory.sample())
            ### Reset the memory
            memory = Memory()

            df.loc[e, "steps"] = running_score
            df.loc[e, "loss_policy"] = loss_policy
            df.loc[e, "loss_value"] = loss_value

            print(
                "Ep {0:04d}: score: {1:02d}, loss_policy: {2}, loss_value: {3}"
                .format(e, int(running_score), loss_policy, loss_value))

        if running_score > goal_score:
            break
    df.to_csv("loss.csv")
Example #19
def train(render):
    online_net = QNet(h=84, w=84, outputs=36)
    online_net.load_state_dict(torch.load('saved/online_net.pt'))
    target_net = QNet(h=84, w=84, outputs=36)
    update_target_model(online_net, target_net)

    optimizer = optim.Adam(online_net.parameters(), lr=lr)

    online_net.to(device)
    target_net.to(device)
    online_net.train()
    target_net.train()
    memory = Memory(replay_memory_capacity)
    memory = torch.load('saved/model_memory.pt')
    epsilon = 0.1
    steps = 0
    beta = beta_start
    loss = 0

    for e in range(100000):
        #level = random.choice(LEVEL_SET)
        level = 'Level01'
        env = make_retro(game=env_name,
                         state=level,
                         use_restricted_actions=retro.Actions.DISCRETE)

        done = False

        total_reward = 0.0
        state = env.reset()
        state = torch.Tensor(state).to(device).permute(2, 0, 1)
        #state = state.view(state.size()[0], -1)
        state = state.unsqueeze(0)

        while not done:
            steps += 1
            action = get_action(state.to(device), target_net, epsilon, env)

            if render:
                env.render()

            next_state, reward, done, info = env.step(action)

            next_state = torch.Tensor(next_state).permute(2, 0, 1)
            #next_state = next_state.view(next_state.size()[0], -1)
            next_state = next_state.unsqueeze(0)

            total_reward += reward

            mask = 0 if done else 1
            action_one_hot = torch.zeros(36)
            action_one_hot[action] = 1

            reward = torch.tensor([info['score']]).to(device)
            memory.push(state, next_state, action_one_hot, reward, mask)

            state = next_state

            if len(memory) > initial_exploration:
                epsilon -= 0.00005
                epsilon = max(epsilon, 0.02)
                beta += 0.00005
                beta = min(1, beta)

                batch, weights = memory.sample(batch_size, online_net,
                                               target_net, beta)
                loss = QNet.train_model(online_net, target_net, optimizer,
                                        batch, weights)

                if steps % update_target == 0:
                    update_target_model(online_net, target_net)

        if e % 1 == 0:
            print('{} episode | Total Reward: {}'.format(e, total_reward))
            torch.save(online_net.state_dict(), 'saved/online_net.pt')
            torch.save(memory, 'saved/model_memory.pt')
        env.close()
Example #20
def main():
    env = gym.make(env_name)
    env.seed(500)
    torch.manual_seed(500)

    num_inputs = 2
    num_actions = env.action_space.n
    print('state size:', num_inputs)
    print('action size:', num_actions)

    online_net = QNet(num_inputs, num_actions)
    target_net = QNet(num_inputs, num_actions)
    update_target_model(online_net, target_net)

    optimizer = optim.Adam(online_net.parameters(), lr=lr)
    writer = SummaryWriter('logs')

    online_net.to(device)
    target_net.to(device)
    online_net.train()
    target_net.train()
    memory = Memory(replay_memory_capacity)
    running_score = 0
    epsilon = 1.0
    steps = 0
    loss = 0

    for e in range(30000):
        done = False

        state_series = deque(maxlen=sequence_length)
        next_state_series = deque(maxlen=sequence_length)
        score = 0
        state = env.reset()

        state = state_to_partial_observability(state)
        state = torch.Tensor(state).to(device)

        next_state_series.append(state)
        while not done:
            steps += 1
            state_series.append(state)
            action = get_action(state_series, target_net, epsilon, env)
            next_state, reward, done, _ = env.step(action)

            next_state = state_to_partial_observability(next_state)
            next_state = torch.Tensor(next_state)

            mask = 0 if done else 1
            reward = reward if not done or score == 499 else -1
            action_one_hot = np.zeros(2)
            action_one_hot[action] = 1
            if len(state_series) >= sequence_length:
                memory.push(state_series, next_state_series, action_one_hot,
                            reward, mask)

            score += reward
            state = next_state

            if steps > initial_exploration:
                epsilon -= 0.000005
                epsilon = max(epsilon, 0.1)

                batch = memory.sample(batch_size)
                loss = QNet.train_model(online_net, target_net, optimizer,
                                        batch)

                if steps % update_target == 0:
                    update_target_model(online_net, target_net)

        score = score if score == 500.0 else score + 1
        if running_score == 0:
            running_score = score
        else:
            running_score = 0.99 * running_score + 0.01 * score
        if e % log_interval == 0:
            print('{} episode | score: {:.2f} | epsilon: {:.2f}'.format(
                e, running_score, epsilon))
            writer.add_scalar('log/score', float(running_score), e)
            writer.add_scalar('log/loss', float(loss), e)

        if running_score > goal_score:
            break
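
Here num_inputs is fixed to 2 and every observation passes through state_to_partial_observability, which is what makes the sequence-of-states input necessary (a DRQN-style setup). A sketch of one common choice, dropping the velocity terms of the CartPole state; the kept indices are an assumption.

import numpy as np

def state_to_partial_observability(state):
    """Keep only cart position and pole angle (assumed indices 0 and 2)."""
    state = np.asarray(state)
    return state[[0, 2]]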
Example #21
def main():
    env = gym.make(env_name)
    env.seed(500)
    torch.manual_seed(500)

    ### The NN input/output sizes depend on the environment
    # num_inputs = env.observation_space.shape[0]
    num_inputs = 1024
    num_actions = env.action_space.n
    print('state size:', num_inputs)
    print('action size:', num_actions)

    ### Create and initialize the two networks
    online_net = QNet(num_inputs, num_actions)
    target_net = QNet(num_inputs, num_actions)
    update_target_model(online_net, target_net)

    optimizer = optim.Adam(online_net.parameters(), lr=lr)

    ### Network setup: CPU / GPU
    online_net.to(device)
    target_net.to(device)
    ### Network setup: start in training mode
    online_net.train()
    target_net.train()

    ### Pretrained model for feature extraction
    # pre_model = models.resnet50(pretrained=True)
    # pre_model.fc = nn.Identity()
    pre_model = models.squeezenet1_0(pretrained=True)
    pre_model.classifier = nn.AdaptiveAvgPool2d((1, 1))
    pre_model.to(device)

    def state_to_feature(state):
        state_img = render_cv2img(state[0], state[2])
        state_img = cv2.resize(state_img, (224, 224))[:, :, 0]
        state_img = state_img.reshape((1, 224, 224))
        state_img_rgb = np.zeros((1, 3, 224, 224))
        state_img_rgb[:, 0] = state_img
        state_img_rgb[:, 1] = state_img
        state_img_rgb[:, 2] = state_img
        state_img_rgb_tensor = torch.Tensor(state_img_rgb).to(device)

        state_feature = pre_model(state_img_rgb_tensor)
        return state_feature

    ### Storage location for the memory (under rework)
    memory_dir = "memory/"
    memory = Memory(replay_memory_capacity, memory_dir)

    ### Initial settings before training
    running_score = 0
    epsilon = 1.0
    steps = 0
    loss = 0
    steps_before = 0

    for e in range(3000):
        done = False

        score = 0

        ### state = [position, velocity, angle, angular velocity]
        state = env.reset()  ### e.g. [-0.01517264  0.02423424  0.02480018 -0.04009749]
        ### state = [[2048-dimensional feature vector]]
        state = state_to_feature(state)

        ### Carries the previous timestep's information, which the agent otherwise lacks; starting it equal to state seems fine
        previous_state = state

        while not done:
            steps += 1

            ### Action selection is done with target_net
            previous_present_state = torch.cat((previous_state, state), 1)
            action = get_action(previous_present_state, target_net, epsilon,
                                env)

            ### Observe the next state and receive the reward
            next_state, reward, done, _ = env.step(action)
            next_state = state_to_feature(next_state)
            present_next_state = torch.cat((state, next_state), 1)

            ### Rewritten because the original was hard to read
            if done:
                mask = 0
            else:
                mask = 1
            if (done and (score != 499)):  ### if the episode ends before reaching step 499
                reward = -1
            else:
                pass  ### reward is basically 1

            ### Record the transition in memory
            action_one_hot = np.zeros(2)
            action_one_hot[action] = 1
            memory.push(previous_present_state, present_next_state,
                        action_one_hot, reward, mask)

            ### reward is basically 1
            score += reward  ### only used to record how many steps the episode lasted

            if steps > initial_exploration:
                epsilon -= 0.00005
                epsilon = max(epsilon, 0.1)

                ### Train online_net
                batch = memory.sample(batch_size)
                loss = QNet.train_model(online_net, target_net, optimizer,
                                        batch)

                ### Occasionally overwrite target_net with online_net
                if steps % update_target == 0:
                    update_target_model(online_net, target_net)

            ### Move on to the next step
            previous_state = state
            state = next_state

        print("Ep {0:04d}: {1} step".format(e, steps - steps_before))
        steps_before = steps

        score = score if score == 500.0 else score + 1
        running_score = 0.99 * running_score + 0.01 * score
        if e % log_interval == 0:
            print('{} episode | score: {:.2f} | epsilon: {:.2f}'.format(
                e, running_score, epsilon))

        if running_score > goal_score:
            break
Example #22
def main():
    env = gym.make(args.env_name)
    env.seed(500)
    torch.manual_seed(500)

    img_shape = env.observation_space.shape
    num_actions = 3
    print('image size:', img_shape)
    print('action size:', num_actions)

    net = QNet(num_actions)
    target_net = QNet(num_actions)
    update_target_model(net, target_net)

    optimizer = optim.RMSprop(net.parameters(), lr=0.00025, eps=0.01)
    writer = SummaryWriter('logs')

    if not os.path.isdir(args.save_path):
        os.makedirs(args.save_path)

    net.to(device)
    target_net.to(device)
    net.train()
    target_net.train()
    memory = Memory(100000)
    running_score = 0
    epsilon = 1.0
    steps = 0

    for e in range(10000):
        done = False
        dead = False

        score = 0
        avg_loss = []
        start_life = 5
        state = env.reset()

        state = pre_process(state)
        state = torch.Tensor(state).to(device)
        history = torch.stack((state, state, state, state))

        for i in range(3):
            action = env.action_space.sample()
            state, reward, done, info = env.step(action)
            state = pre_process(state)
            state = torch.Tensor(state).to(device)
            state = state.unsqueeze(0)
            history = torch.cat((state, history[:-1]), dim=0)

        while not done:
            if args.render:
                env.render()

            steps += 1
            qvalue = net(history.unsqueeze(0))
            action = get_action(epsilon, qvalue, num_actions)

            next_state, reward, done, info = env.step(action + 1)

            next_state = pre_process(next_state)
            next_state = torch.Tensor(next_state).to(device)
            next_state = next_state.unsqueeze(0)
            next_history = torch.cat((next_state, history[:-1]), dim=0)

            if start_life > info['ale.lives']:
                dead = True
                start_life = info['ale.lives']

            score += reward
            reward = np.clip(reward, -1, 1)

            mask = 0 if dead else 1
            memory.push(history.cpu(), next_history.cpu(), action, reward,
                        mask)

            if dead:
                dead = False

            if steps > args.initial_exploration:
                epsilon -= 1e-6
                epsilon = max(epsilon, 0.1)

                batch = memory.sample(args.batch_size)
                loss = train_model(net, target_net, optimizer, batch)

                if steps % args.update_target == 0:
                    update_target_model(net, target_net)
            else:
                loss = 0

            avg_loss.append(loss)
            history = next_history

        if e % args.log_interval == 0:
            print(
                '{} episode | score: {:.2f} | epsilon: {:.4f} | steps: {} | loss: {:.4f}'
                .format(e, score, epsilon, steps, np.mean(avg_loss)))
            writer.add_scalar('log/score', float(score), steps)
            writer.add_scalar('log/loss', np.mean(avg_loss), steps)

        if score > args.goal_score:
            ckpt_path = args.save_path + 'model.pth'
            torch.save(net.state_dict(), ckpt_path)
            print('score exceeds the goal score so end')
            break
Example #23
def main():
    env = gym.make(env_name)
    env.seed(500)
    torch.manual_seed(500)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.n
    print('state size:', num_inputs)
    print('action size:', num_actions)

    online_net = QNet(num_inputs, num_actions)
    target_net = QNet(num_inputs, num_actions)
    update_target_model(online_net, target_net)

    optimizer = optim.Adam(online_net.parameters(), lr=lr)
    writer = SummaryWriter('logs')

    online_net.to(device)
    target_net.to(device)
    online_net.train()
    target_net.train()
    memory = Memory_With_TDError(replay_memory_capacity)
    running_score = 0
    epsilon = 1.0
    steps = 0
    beta = beta_start
    loss = 0

    for e in range(3000):
        done = False

        score = 0
        state = env.reset()
        state = torch.Tensor(state).to(device)
        state = state.unsqueeze(0)

        while not done:
            steps += 1
            action = get_action(state, target_net, epsilon, env)
            next_state, reward, done, _ = env.step(action)

            next_state = torch.Tensor(next_state)
            next_state = next_state.unsqueeze(0)

            mask = 0 if done else 1
            reward = reward if not done or score == 499 else -1
            action_one_hot = np.zeros(2)
            action_one_hot[action] = 1
            memory.push(state, next_state, action_one_hot, reward, mask)

            score += reward
            state = next_state

            if steps > initial_exploration:
                epsilon -= 0.00005
                epsilon = max(epsilon, 0.1)
                beta += 0.00005
                beta = min(1, beta)

                batch, weights = memory.sample(batch_size, online_net,
                                               target_net, beta)
                loss = QNet.train_model(online_net, target_net, optimizer,
                                        batch, weights)

                if steps % update_target == 0:
                    update_target_model(online_net, target_net)

        score = score if score == 500.0 else score + 1
        running_score = 0.99 * running_score + 0.01 * score
        if e % log_interval == 0:
            print(
                '{} episode | score: {:.2f} | epsilon: {:.2f} | beta: {:.2f}'.
                format(e, running_score, epsilon, beta))
            writer.add_scalar('log/score', float(running_score), e)
            writer.add_scalar('log/loss', float(loss), e)

        if running_score > goal_score:
            break
Example #24
def main(L, mouse_initial_indices, rewardlist, actions_list):
    if mouse_initial_indices is None:
        all_possible_starting_positions = np.array([*np.where(L == 1)]).T
    scores = [0]
    best_scores = [0]
    env = deepcopy(L)
    torch.manual_seed(2020)

    num_inputs = 2 + 1
    num_actions = 4
    print('state size:', num_inputs)
    print('action size:', num_actions)

    online_net = QNet(num_inputs, num_actions)
    target_net = QNet(num_inputs, num_actions)
    update_target_model(online_net, target_net)

    optimizer = optim.Adam(online_net.parameters(), lr=lr)
    # writer = SummaryWriter('logs')

    online_net.to(device)
    target_net.to(device)
    online_net.train()
    target_net.train()
    memory = Memory(replay_memory_capacity)
    running_score = 0
    epsilon = 1.0
    steps = 0
    loss = 0
    inint = mouse_initial_indices
    best_score = 0
    number_episode = 1000
    for e in range(number_episode):
        if inint is None:
            mouse_initial_indices = all_possible_starting_positions[
                np.random.choice(range(len(all_possible_starting_positions)))]

        done = False
        env = deepcopy(L)
        eaubue = 0.
        score = 0
        state = np.array(mouse_initial_indices)
        state = torch.Tensor(state).to(device)
        state = state.unsqueeze(0)

        while not done:
            steps += 1

            action = get_action(state, target_net, epsilon, env, eaubue=eaubue)
            newstate = state + torch.Tensor(np.array(
                actions_list[action])).to(device)
            if env[int(newstate[0][0].tolist()),
                   int(newstate[0][1].tolist())] != 0:
                next_state = newstate
                new_eaubue = eaubue
                reward = rewardlist[env[int(newstate[0][0].tolist()),
                                        int(newstate[0][1].tolist())]]
                if env[int(newstate[0][0].tolist()),
                       int(newstate[0][1].tolist())] == 2:
                    done = True
                if env[int(newstate[0][0].tolist()),
                       int(newstate[0][1].tolist()
                           )] == 4:  #if the mouse is in the water
                    env[int(newstate[0][0].tolist()),
                        int(newstate[0][1].tolist()
                            )] = 5  #there is no more water
                    new_eaubue = 1.
            else:
                next_state = state
                reward = rewardlist[0]
                new_eaubue = eaubue

            mask = 0 if done else 1
            action_one_hot = np.zeros(4)
            action_one_hot[action] = 1
            memory.push(
                torch.cat((
                    state,
                    torch.tensor(eaubue).unsqueeze(0).unsqueeze(0).to(device)),
                          1),
                torch.cat((next_state, torch.tensor(new_eaubue).unsqueeze(
                    0).unsqueeze(0).to(device)), 1), action_one_hot, reward,
                mask)

            score += reward
            state = next_state
            eaubue = new_eaubue

            if steps > initial_exploration:
                epsilon -= 0.00005
                epsilon = max(epsilon, 0.1)

                batch = memory.sample(batch_size)
                loss = QNet.train_model(online_net, target_net, optimizer,
                                        batch)

                if steps % update_target == 0:
                    update_target_model(online_net, target_net)

        # print("OK")
        if score > 35:
            print(score)
        running_score = 0.99 * running_score + 0.01 * score
        # running_score=score
        scores.append(running_score)
        best_scores.append(
            score if score > best_scores[-1] else best_scores[-1])
        if e % log_interval == 0:
            print(
                '{} episode | score: {:.2f} | best score: {:.2f} | epsilon: {:.2f}'
                .format(e, running_score, best_score, epsilon))
            # writer.add_scalar('log/score', float(running_score), e)
            # writer.add_scalar('log/loss', float(loss), e)
            if score > best_score:
                best_score = score
            torch.save(online_net.state_dict(), "./qlearning_model")

        if running_score > goal_score:
            break

    return number_episode, scores, best_scores