def main(test=False):
    if test:
        dqn = DQN()
        dqn.test(test_case_count=10000, load_dir='models/dqn.pkl')
    else:
        dqn = DQN()
        env = Env()
        # dqn.load("models/pretrained.pkl")
        print('\nCollecting experience...')
        for i_episode in range(60000):
            s = env.reset()
            ep_r = 0
            for _count in range(4):
                root_action, leaf_action = dqn.choose_action(s)
                # take action
                s_, r, done = env.step(root_action, leaf_action)
                dqn.store_transition(s, (root_action, leaf_action), r, s_)
                ep_r += r
                if dqn.memory_counter > MEMORY_CAPACITY:
                    dqn.learn()
                if done:
                    break
                s = s_
            # print('ep_r:', ep_r)
            if i_episode % 1000 == 1:
                dqn.test()
        dqn.save('models/dqn_final_no_pretrain.pkl')
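# Hypothetical entry point, not part of the original snippet: a minimal
# argparse wrapper (the --test flag name is an assumption) that switches
# main() above between training and evaluating the saved model.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='Train or evaluate the DQN agent')
    parser.add_argument('--test', action='store_true',
                        help='evaluate models/dqn.pkl instead of training')
    cli_args = parser.parse_args()
    main(test=cli_args.test)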
def test_attack():
    agent = Agent(args.img_stack, device)
    agent.load_param()
    env = Env(args.seed, args.img_stack, args.action_repeat)

    # load adversarial input; by default the general attack perturbation
    delta_s = np.load('param/adv_general.npy')
    if args.attack_type != 'general':
        file_path = 'param/adv_' + args.attack_type
        if args.attack_type == 'patch':
            file_path += '_' + args.patch_type
        file_path += '.npy'
        delta_s = np.load(file_path)

    # show the adversarial perturbation
    fig = plt.figure(figsize=(8, 8))
    plt.title('Stack of ' + str(args.img_stack) + ' adversarial signals seen by Agent')
    plt.axis('off')
    columns, rows = args.img_stack // 2, args.img_stack // 2
    for i in range(1, columns * rows + 1):
        # denormalize while showing the image
        img = (delta_s[i - 1] + 1) * 128
        fig.add_subplot(rows, columns, i)
        plt.imshow(img, cmap='gray')
    plt.show()

    for i_ep in range(10):
        score = 0
        state = env.reset()
        for t in range(1000):
            # step range, within the 1000-step episode, during which the attack is applied
            attack_render = [30, 40]
            if t in np.arange(attack_render[0], attack_render[1] + 1):
                if t in attack_render:
                    s_with_ds = state + delta_s
                    # clip to the image limits and denormalize for display
                    s_with_ds = np.clip(s_with_ds, -1, 0.9921875)
                    s_with_ds = (s_with_ds + 1) * 128
                    title = 'Attack started' if t == attack_render[0] else 'Attack ended'
                    title += ' (showing first frame of 4 frames visible to policy)'
                    plt.imshow(s_with_ds[0], cmap='gray')
                    plt.axis('off')
                    plt.title(title)
                    plt.show()
                state += delta_s
            action = agent.select_action(state)
            state_, reward, done, die = env.step(
                action * np.array([2., 1., 1.]) + np.array([-1., 0., 0.]))
            if args.render:
                env.render()
            score += reward
            state = state_
            if done:
                break
        print('Ep {}\tScore: {:.2f}\t'.format(i_ep, score))
def get_children_state(self):
    new_states = []
    env = Env(mid=True, mid_state=self.cur_state)
    actions = np.argwhere(env.feasible_actions)
    for action in actions:
        new_state = env.get_new_state(action)
        new_states.append([new_state[:, :, 0]])
    return new_states
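# Minimal sketch of the Node wrapper that get_children_state() above belongs
# to; this is an assumption, not the original class. The constructor order and
# attribute names are inferred from the calls Node(state, parent, ctg, cost)
# and the accesses .cur_state / .parent / .cost in the search snippet below.
class Node:
    def __init__(self, cur_state, parent, ctg, cost):
        self.cur_state = cur_state  # 7x7 board occupancy array
        self.parent = parent        # parent Node, or None for the root
        self.ctg = ctg              # learned cost-to-go estimate for this state
        self.cost = cost            # path cost accumulated from the root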
def dfs_search(target=None):
    env = Env()
    dfs = DFS()
    initial_and_target_state = env.get_current_state()
    # time.clock() was removed in Python 3.8; perf_counter() gives the same wall-clock timing
    start_time = time.perf_counter() * 1000
    success = dfs.dfs(env)
    print(dfs.action)
    end_time = time.perf_counter() * 1000
    print('time: {} ms'.format(end_time - start_time))
    return success, end_time - start_time, initial_and_target_state, dfs.action
def Astart(heuristic, lambda_):
    # Best-first search over board states: nodes are expanded in order of
    # heuristic cost-to-go plus lambda_-weighted path cost.
    open_list = []
    close_list = []
    ctg_list = []
    new_goal_states, new_goal_path = generate_goal_states()
    play_env = Env()
    init_states_flatten = DCAnet.state_to_nnet_input([play_env.state[:, :, 0]])
    open_list.append(Node(play_env.state[:, :, 0], None,
                          heuristic([init_states_flatten]), 0))
    cur_expand_node = open_list.pop(0)
    while cur_expand_node.cur_state.tolist() not in new_goal_states:
        close_list.append(cur_expand_node)
        child_states = cur_expand_node.get_children_state()
        if child_states != []:
            child_states_flatten = DCAnet.state_to_nnet_input(child_states)
            child_ctg = heuristic(child_states_flatten)
            for child, ctg in zip(child_states, child_ctg):
                if np.count_nonzero(child[0] == 1) >= 10:
                    child_node = Node(child[0], cur_expand_node, ctg,
                                      1 + cur_expand_node.cost)
                    open_list.append(child_node)
                    ctg_list.append(ctg + (1 + cur_expand_node.cost) * lambda_)
        # expand the node with the lowest weighted cost next
        min_ctg = np.argmin(ctg_list)
        ctg_list.pop(min_ctg)
        cur_expand_node = open_list.pop(min_ctg)
        play_env = Env(mid=True, mid_state=cur_expand_node.cur_state)
        if len(close_list) % 10000 == 0:
            print('Progress log:')
            print('Length of close = %s' % (len(close_list)))
        # if cur_expand_node.get_children_state() == []:
        #     print('DEAD END ENCOUNTER, Num of pegs left is %s' % play_env.n_pegs)
        #     print(cur_expand_node.cur_state)
        #     print('\n')
    previous_path_idx = new_goal_states.index(cur_expand_node.cur_state.tolist())
    previous_path = new_goal_path[previous_path_idx]
    path = []
    path.append(cur_expand_node.cur_state)
    while cur_expand_node.parent is not None:
        path.append(cur_expand_node.parent.cur_state)
        cur_expand_node = cur_expand_node.parent
    return previous_path, path, len(close_list) + len(open_list)
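# Hypothetical usage, not in the original source: running the search above
# with a learned cost-to-go heuristic. The loader name DCAnet.load_model,
# the predict() call, and the checkpoint path are assumptions; the only
# contract relied on is that heuristic(batch_of_flattened_states) returns one
# cost-to-go estimate per state, as Astart() expects.
def run_astar_example():
    net = DCAnet.load_model('models/dca_net.pkl')  # hypothetical loader
    heuristic = lambda flat_states: net.predict(flat_states)
    previous_path, path, n_expanded = Astart(heuristic, lambda_=0.2)
    print('nodes expanded:', n_expanded)
    print('solution length:', len(path) + len(previous_path))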
def populate_buffer(agent, n_workers, buffer):
    env = Env()
    agents = [agent for _ in range(n_workers)]
    pool = ThreadPool(n_workers)
    while len(buffer.buffer) < buffer.capacity:
        results = pool.map(collect_random_data, agents)
        for data in results:
            shuffle(data)
            buffer.add_list(data)
    pool.close()
    pool.join()
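# Minimal sketch, assuming a plain list-backed replay buffer; the real Buffer
# class used by populate_buffer() and main() lives elsewhere in the repo.
# Only the interface relied on above is modeled: .buffer, .capacity and
# .add_list().
class Buffer:
    def __init__(self, capacity):
        self.capacity = capacity
        self.buffer = []

    def add_list(self, data):
        # append a list of transition dicts, dropping the oldest entries once
        # the capacity is exceeded
        self.buffer.extend(data)
        if len(self.buffer) > self.capacity:
            self.buffer = self.buffer[-self.capacity:]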
def generate_dataset_from_raw(self, raw_data_file_path):
    # each item holds a state array and a list of actions, so the file is an
    # object array and needs allow_pickle on recent NumPy versions
    raw_data = np.load(raw_data_file_path, allow_pickle=True)
    dataset = []
    for item in raw_data:
        state = item[0]
        # the second half of the state vector encodes the target state
        init_target = state[int(len(state) / 2):]
        actions = item[1]
        env = Env(target_state=init_target)
        for action in actions:
            dataset.append({"state": state, "action": action})
            state, reward, done = env.step(action[0], action[1])
            if done:
                break
    return dataset
def generate_dataset(raw_data_file_path):
    # object array (state + action list per item), so allow_pickle is required
    raw_data = np.load(raw_data_file_path, allow_pickle=True)
    dataset = []
    for item in raw_data:
        state = item[0]
        init_target = state[int(len(state) / 2):]
        actions = item[1]
        env = Env(target_state=init_target)
        for action in actions:
            dataset.append((state, action))
            state, reward, done = env.step(action[0], action[1])
            if done:
                break
    np.save('dataset/dataset_2.npy', dataset)
    print(dataset)
def test(self, test_case_count=200, load_dir=None):
    self.target_net = self.target_net.eval()
    if load_dir is not None:
        self.target_net.load_state_dict(torch.load(load_dir))
    count = 0
    total_length = 0
    for _ in tqdm(range(test_case_count)):
        env = Env()
        s = env.get_current_state()
        ep_r = 0
        for i in range(4):
            x = torch.unsqueeze(torch.FloatTensor(s), 0)  # input only one sample
            root_result, leaf_result = self.target_net(x)
            root_action = torch.argmax(root_result).item()
            if root_action != 3:
                leaf_action = torch.argmax(leaf_result[root_action]).item()
                # step
                s_, r, done = env.step(root_action, leaf_action)
            else:
                find_path_result = leaf_result[3]
                find_path_source = torch.argmax(
                    find_path_result[:, :int(find_path_result.shape[1] / 2)]).item()
                find_path_target = torch.argmax(
                    find_path_result[:, int(find_path_result.shape[1] / 2):]).item()
                # step
                s_, r, done = env.step(
                    root_action, (find_path_source, find_path_target))
            ep_r += r
            s = s_
            if done:
                if ep_r > 0:
                    total_length += i
                break
        if ep_r > 0:
            count += 1
    acc = float(count) / test_case_count
    if acc > self.max_acc and load_dir is None:
        torch.save(self.target_net.state_dict(), 'models/dqn.pkl')
        self.max_acc = acc
    print("acc is: ", acc)
    if count > 0:
        # the loop index undercounts the episode length by one, so add it back here
        print("length is: ", float(total_length) / count + 1)
def play2(self, arg):
    step_for = 16
    step_bck = 15
    match = False
    for_inter_state = []
    bck_inter_state = []
    _, _, heuristic = arg
    while not match:
        env_for = Env()
        env_bck = DCAEnv()
        non_ter = False
        for _ in range(step_for):
            action = self.naive_policy(env_for, heuristic, env_for.feasible_actions)
            _, _, end = env_for.step(action)
            if end:
                non_ter = True
                break
        if non_ter:
            # rollout terminated early; store an all-ones dummy board as a placeholder
            for_inter_state.append(np.ones((7, 7)).tolist())
        else:
            for_inter_state.append(env_for.state[:, :, 0].tolist())
        non_ter = False
        for _ in range(step_bck):
            action = self.naive_policy(env_bck, heuristic, env_bck.feasible_actions, False)
            _, _, end = env_bck.step(action)
            if end:
                non_ter = True
                break
        if non_ter:
            bck_inter_state.append(np.ones((7, 7)).tolist())
        else:
            bck_inter_state.append(env_bck.state.tolist())
        if env_for.state[:, :, 0].tolist() in bck_inter_state:
            match = True
        if env_bck.state.tolist() in for_inter_state:
            match = True
        if len(for_inter_state) == 1000:
            return 1000, False
    return len(for_inter_state), True
def run_agent():
    agent = Agent(args.img_stack, device)
    agent.load_param()
    env = Env(args.seed, args.img_stack, args.action_repeat)
    state = env.reset()

    # Prepare attack
    attack = AdvAttack(args.attack_type)
    attack.initialize_perturbation(state.shape)
    attack.load_networks()

    for i_ep in range(50):
        score = 0
        state = env.reset()
        for t in range(1000):
            action = agent.select_action(state)
            # update buffer for training the attack
            attack.update_buffer(state)
            # write to tensorboard
            input_imgs_to_net = torch.tensor(
                (attack.buffer['s'] + attack.buffer['d_s']))
            input_imgs_grid = make_grid(input_imgs_to_net[0].reshape(4, 1, 96, 96))
            writer.add_image('Four stack of input state with adversarial',
                             input_imgs_grid)
            writer.add_graph(attack.net, input_imgs_to_net)
            writer.close()
            # train attack
            attack.train()
            state_, reward, done, die = env.step(
                action * np.array([2., 1., 1.]) + np.array([-1., 0., 0.]))
            if args.render:
                env.render()
            score += reward
            state = state_
            if done or die:
                break
        print('Ep {}\tScore: {:.2f}\t'.format(i_ep, score))
def main():
    env = Env(enable_draw=True, base_fix=False)
    agent = Agent(env)

    time_horizon = 10
    com_pos = np.array([0.0, 0, 0.1])
    rpy = np.zeros(3)
    com_vel = np.zeros(3)
    base_ang_vel = np.zeros(3)
    target_x = np.concatenate([com_pos, rpy, com_vel, base_ang_vel])
    target_x = target_x.reshape((-1, 1))
    target_u = np.array([0, 0, env.model.mass * 0.25 * 9.8] * 4).reshape((12, 1))
    init_u_list = np.array([target_u for i in range(time_horizon)])

    state = env.reset()
    t = 0
    while t < 10:
        com_pos = env.model.com_pos
        rpy = env.model.base_rpy
        com_vel = env.model.base_vel
        base_ang_vel = np.matmul(env.model.base_rot.T, env.model.base_ang_vel)
        init_x = np.concatenate([com_pos, rpy, com_vel, base_ang_vel])
        init_x = init_x.reshape((-1, 1))

        delta_time_list = np.array([0.01] * time_horizon)
        foot_pos_list = np.array(
            [env.model.foot_pos_list for i in range(time_horizon + 1)])
        contact_phi_list = np.array([[1, 1, 1, 1] for i in range(time_horizon + 1)])
        target_x_list = np.array([target_x for i in range(time_horizon + 1)])
        target_u_list = np.array([target_u for i in range(time_horizon)])

        action, u_list = agent.get_action(init_x, init_u_list, delta_time_list,
                                          foot_pos_list, contact_phi_list,
                                          target_x_list, target_u_list)
        init_u_list = deepcopy(u_list)

        state = env.step(action)
        # time.sleep(env.time_step)
        t += env.time_step
def beam_search(net, beam_size=3):
    states = []
    probs = []
    trajectories = []
    env = Env()
    for i in range(beam_size):
        states.append(deepcopy(env))
        probs.append(1.0)
        trajectories.append([])
    for _ in range(4):
        candidate_states = []
        for k in range(beam_size):
            s = states[k].get_current_state()
            x = torch.unsqueeze(torch.FloatTensor(s), 0)  # input only one sample
            actions_value = net(x)
            candidates = topk_actions(actions_value, beam_size)
            for i in range(beam_size):
                # step
                env = deepcopy(states[k])
                action = candidates[i][1]
                temp_traj = copy.copy(trajectories[k])
                temp_traj.append((states[k].get_current_state(), action))
                s_, r, done = env.step(action[0], action[1])
                new_state = env
                if r > 0:
                    return True, temp_traj
                # if (new_state, candidates[i][0] * probs[k], temp_traj) not in candidate_states:
                #     candidate_states.append((new_state, candidates[i][0] * probs[k], temp_traj))
                candidate_states.append(
                    (new_state, candidates[i][0] * probs[k], temp_traj))
        candidate_states = sorted(candidate_states, key=lambda x: x[1], reverse=True)
        for i in range(beam_size):
            states[i] = candidate_states[i][0]
            probs[i] = candidate_states[i][1]
            trajectories[i] = candidate_states[i][2]
    return False, None
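# Hypothetical usage, not part of the original snippet: running beam search
# with the trained network used by DQN.test() elsewhere in this file. Reusing
# dqn.target_net and the 'models/dqn.pkl' checkpoint is an assumption; any
# network with the same (root, leaf) output structure would work.
def run_beam_search_example():
    dqn = DQN()
    dqn.target_net.load_state_dict(torch.load('models/dqn.pkl'))
    dqn.target_net.eval()
    success, trajectory = beam_search(dqn.target_net, beam_size=3)
    if success:
        for state, action in trajectory:
            print('action taken:', action)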
def play_greedy_game(verbose=True):
    """
    Plays a Tichu game with four "greedy" players (greedyAgent).

    greedyAgent is an agent with very simple heuristic play moves: it always
    tries to win a stack except when the opponent is leading.

    Raises an exception if 10 consecutive false moves are made. (This should
    not happen when the environment and greedyAgent are bug-free.)
    """
    agent = greedyAgent()
    env = Env(train_mode=not verbose)
    state, rewards, done, active_player = env.reset()
    conseq_active_counter = 0
    cumulative_reward = [0, 0, 0, 0]
    while True:
        my_state = state[active_player]
        action = agent.act(my_state)
        last_active = active_player
        if not env.game.players[active_player].finished:
            cumulative_reward[active_player] += rewards[active_player]
        state, rewards, done, active_player = env.step(active_player, action)
        new_active = active_player
        if last_active == new_active:
            conseq_active_counter += 1
        else:
            conseq_active_counter = 0
        if done:
            if verbose:
                print('-----')
            for i in range(4):
                cumulative_reward[i] += rewards[i]
                if verbose:
                    print('Cumulative reward of player {}: {}'.format(
                        i, cumulative_reward[i]))
            return
        if conseq_active_counter > 10:
            raise Exception(
                "Active counter exceeded. Possible infinite loop detected.")
def _env_init(self):
    self.env = Env()
def _env_init(self):
    self.env = Env()
    self.env.check()
from logging import getLogger, StreamHandler, DEBUG

import tensorflow as tf

from net import Net
from history import History
from env.env import Env
import settings

logger = getLogger('train')
logger.setLevel(DEBUG)
logger.addHandler(StreamHandler())

if __name__ == '__main__':
    servers_count = 3
    containers_count = 5

    tf.reset_default_graph()  # clear the TensorFlow graph
    env = Env(servers_count=servers_count, containers_count=containers_count)
    net = Net(learning_rate=settings.LEARNING_RATE,
              input_count=env.state_size,
              output_count=env.actions_count,
              hidden_count=settings.NN_HIDDEN_COUNT)
    history = History()

    init = tf.global_variables_initializer()
    with tf.Session() as sess:
        sess.run(init)
        gradBuffer = sess.run(tf.trainable_variables())
        for i, grad in enumerate(gradBuffer):
import torch

from network.FullConnected import Net
import env.action as action_def
from env.env import Env

env = Env()
N_STATES = env.n_state


class GrammarNet(torch.nn.Module):
    def forward(self, state):
        root_result = self.root_grammar(state)
        delete_node_result = self.delete_node_grammar(state)
        delete_edge_result = self.delete_edge_grammar(state)
        filter_result = self.filter_grammar(state)
        find_path_result = self.find_path_grammar(state)
        return root_result, [
            delete_node_result, delete_edge_result, filter_result,
            find_path_result
        ]

    def __init__(self):
        super().__init__()
        self.root_grammar = Net(n_state=N_STATES,
                                n_action=len(action_def.action))
        self.delete_node_grammar = Net(n_state=N_STATES,
                                       n_action=len(action_def.delete_node_action))
        self.delete_edge_grammar = Net(n_state=N_STATES,
                                       n_action=len(action_def.delete_edge_action))
def test_env_state():
    env = Env()
    state, _, _, _ = env.reset()
    assert np.shape(state) == (4, 4, 3)
    for i in range(4):
        assert sum(state[i][0][2]) == 14
def main():
    env = Env(enable_draw=True, base_fix=False)
    agent = Agent(env)

    delta_time = 0.025
    time_horizon = 10
    com_pos = np.array([0.0, 0, 0.25])
    rpy = np.zeros(3)
    com_vel = np.zeros(3)
    base_ang_vel = np.zeros(3)
    target_x = np.concatenate([com_pos, rpy, com_vel, base_ang_vel])
    target_x = target_x.reshape((-1, 1))
    target_u = np.array([0, 0, env.model.mass * 0.25 * 9.8] * 4).reshape((12, 1))
    init_u_list = np.array([target_u for i in range(time_horizon)])

    # periodic contact schedule: the two leg pairs alternate swing phases,
    # separated by full-support phases
    temp_length = int(0.3 / delta_time)
    temp_contact_phi_list = ([[0, 1, 1, 0]] * temp_length +
                             [[1, 1, 1, 1]] * temp_length +
                             [[1, 0, 0, 1]] * temp_length +
                             [[1, 1, 1, 1]] * temp_length)
    total_contact_phi_list = np.array([[1, 1, 1, 1]] * temp_length +
                                      temp_contact_phi_list * 1000)

    state = env.reset()
    t = 0
    last_t = 0
    while t < 100:
        if last_t == 0 or t - last_t >= delta_time:
            last_t = t
            com_pos = env.model.com_pos
            print(com_pos)
            rpy = env.model.base_rpy
            com_vel = env.model.base_vel
            base_ang_vel = np.matmul(env.model.base_rot.T, env.model.base_ang_vel)
            init_x = np.concatenate([com_pos, rpy, com_vel, base_ang_vel])
            init_x = init_x.reshape((-1, 1))

            delta_time_list = np.array([delta_time] * time_horizon)
            foot_pos_list = np.array(
                [env.model.foot_pos_list for i in range(time_horizon + 1)])
            contact_phi_list = total_contact_phi_list[:time_horizon + 1]
            total_contact_phi_list = total_contact_phi_list[1:]
            target_x_list = np.array(
                [target_x for i in range(time_horizon + 1)])
            target_u_list = np.array([target_u for i in range(time_horizon)])

            action, u_list = agent.get_action(init_x, init_u_list,
                                              delta_time_list, foot_pos_list,
                                              contact_phi_list, target_x_list,
                                              target_u_list)
            init_u_list = deepcopy(u_list)
            for leg_idx in range(4):
                if contact_phi_list[0, leg_idx] == 0.0:
                    # legs out of contact get a fixed command instead of the MPC output
                    action[leg_idx * 3:(leg_idx + 1) * 3] = [0, 0, -3.0]

        state = env.step(action, contact_phi_list[0, :])
        t += env.time_step
def main():
    config = read_config("config.yaml")
    agent_config = config['Agent']
    network_config = agent_config['Network']
    training_config = config['Training']
    files_config = config['Files']
    eval_config = config['Evaluation']

    print('\t\t --------------------------------------------')
    print('\t\t ------ Parameters of the experiment ------')
    print('\t\t --------------------------------------------\n')
    print('## Agent params')
    print('Agent : ' + agent_config['name'])
    print('Gamma : ', agent_config['gamma'])
    print('')
    print('## Network Params')
    print('Network used : ' + network_config['name'])
    print('Number of filters : ', network_config['n_filters'])
    print('activation function : ' + network_config['activation'])
    print('state embedding size : ', network_config['state_embedding_size'])
    print('')
    print('## Training params')
    print('Number of iteration : ', training_config['n_iter'])
    print('Learning rate : ', network_config['lr'])
    print('Number of games per iteration : ', training_config['n_games'])
    print('Number of workers : ', training_config['n_workers'])
    print('Batch size : ', training_config['batch_size'])
    print('Buffer size : ', training_config['buffer_size'])
    print('')
    print('## Evaluation params')
    print('Number of games per iteration : ', eval_config['n_games'])
    print('Number of workers : ', eval_config['n_workers'])
    print('')
    sleep(2.0)

    # Init files and tensorboard
    model_name = agent_config['name']
    checkpoints_dir = os.path.join(model_name, files_config['checkpoints_dir'])
    tensorboard_log_dir = os.path.join(model_name, files_config['tensorboard_log_dir'])
    results_log_path = os.path.join(model_name, files_config['results_log_path'])

    # fix random seed
    if config['Seed'] is None:
        np.random.seed(seed=42)
    else:
        np.random.seed(int(config['Seed']))

    print('\n\n')
    env = Env()

    # if training from scratch
    if training_config["init_checkpoint"] == 0:
        # initialize dir for tensorboard
        flush_or_create(tensorboard_log_dir)
        # initialize dir for checkpoints
        flush_or_create(checkpoints_dir)
        # init agent and network from scratch
        agent = ActorCriticAgent(agent_config, network_config, checkpoints_dir,
                                 tensorboard_log_dir)
        # initialize iteration number
        start = 0
    # else restart training from the last checkpoint
    else:
        latest_checkpoint = training_config["init_checkpoint"]
        agent = ActorCriticAgent(agent_config, network_config, checkpoints_dir,
                                 tensorboard_log_dir, restore=True)
        print('\nnetwork restored from checkpoint # ', latest_checkpoint)
        print('')
        start = latest_checkpoint

    # initialize the summary writer and results log file
    log_file = open(results_log_path, "wb+")  # log file written to during evaluation

    display_every = training_config["display_every"]
    n_games_train = training_config["n_games"]
    n_workers_train = training_config["n_workers"]
    T_update_net = training_config["T_update_net"]
    T_update_target_net = training_config["T_update_target_net"]
    n_games_eval = eval_config["n_games"]
    n_workers_eval = eval_config["n_workers"]
    prefill_buffer = training_config["prefill_buffer"]
    # gamma = agent_config['gamma']
    summary_dict = dict({})

    data_buffer = Buffer(capacity=training_config['buffer_size'])
    logger = logging.getLogger(__name__)

    if prefill_buffer:
        # populate buffer with initial data from random games
        print('\nPopulating Buffer ...\n')
        populate_buffer(agent, n_workers_train, data_buffer)

    print('\n\n')
    print('Starting training\n\n')
    batch_size = training_config['batch_size']
    for it in tqdm(np.arange(start, training_config["n_iter"]),
                   desc="parallel gameplay iterations"):
        # play games to generate data and train the network
        env.reset()
        try:
            agent.train(env, n_games_train, data_buffer, batch_size,
                        n_workers_train, display_every, T_update_net)
        except Exception as error:
            print('\n\n#### AN ERROR OCCURRED WHILE TRAINING ####\n\n')
            agent.net.summary_writer.close()
            agent.net.sess.close()
            log_file.close()
            logger.error(error)
            raise
        agent.net.save_checkpoint(checkpoints_dir, it=it + 1)

        # play games with the latest checkpoint and track the average final reward
        results = agent.evaluate(env, n_games_eval, n_workers_eval)
        # save results
        pickle.dump(results, log_file)
        print('')

    agent.net.summary_writer.close()
    agent.net.sess.close()
    log_file.close()
    print('End of training')
def collect_random_data(agent):
    env = Env()
    random_agent = RandomAgent()
    end = False
    states = []
    actions = []
    rewards = []
    data = []
    discount_G = 1.0
    G = 0.
    t = 0
    while not end:
        states.append(env.state)
        action = random_agent.select_action(env.feasible_actions)
        action_index = 4 * action[0] + action[1]
        actions.append(action_index)
        reward, _, end = env.step(action)
        rewards.append(reward)
        # discount = gamma
        # for s in range(t):
        #     values[t-s-1] += discount * reward
        #     discount = discount * gamma
        t += 1
        G += discount_G * reward
        discount_G = discount_G * agent.gamma

    R = 0.
    # evaluate state values of all states encountered in a batch to save time
    state_values = agent.net.get_value(
        np.array(states).reshape(-1, 7, 7, agent.state_channels)).reshape(-1)
    for s in range(t):
        R = rewards[t - s - 1] + agent.gamma * R
        advantage = R - state_values[t - s - 1]
        data = [
            dict({
                "state": states[t - s - 1],
                "advantage": advantage,
                "action": actions[t - s - 1],
                "critic_target": R
            })
        ] + data
    assert (G == R)
    assert (len(state_values) == len(states) == len(actions) == len(rewards) == t)
    # data = []
    # for s in range(len(states)-1):
    #     advantage = rewards[s] + values[s+1] - values[s]
    #     data.append(dict({"state": states[s],
    #                       "advantage": advantage,
    #                       "critic_target": values[s],
    #                       "action": actions[s]}))
    # T = len(states)-1
    # advantage = rewards[T] - values[T]  # next state value is 0 because it is terminal
    # data.append(dict({"state": states[T],
    #                   "advantage": advantage,
    #                   "critic_target": values[T],
    #                   "action": actions[T]}))
    return data