Example #1
    def __init__(self,
                 env_name,
                 info=False,
                 gamma=0.9,
                 entropy_beta=0.01,
                 global_update_step=20):
        self.env = gym.make(env_name)
        num_inputs = self.env.observation_space.shape[0]
        num_actions = self.env.action_space.shape[0]

        # init some other parameters....
        self.gamma = gamma
        self.global_update_step = global_update_step
        self.info = info

        # build up the worker's local networks...
        self.value_network_local = models.Value(num_inputs)
        self.policy_network_local = models.Policy(num_inputs, num_actions)
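
All of the RL examples on this page construct models.Value(num_inputs) (and usually models.Policy(num_inputs, num_actions)) from a project-local models module that is not shown here. For orientation only, a minimal sketch of what such a critic/actor pair typically looks like, assuming small tanh MLPs and a Gaussian policy head; the layer sizes and the policy parameterisation are assumptions, not these repositories' actual definitions:

import torch
import torch.nn as nn


class Value(nn.Module):
    """State-value critic: maps an observation to a scalar V(s)."""

    def __init__(self, num_inputs):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(num_inputs, 64), nn.Tanh(),
            nn.Linear(64, 64), nn.Tanh(),
            nn.Linear(64, 1),
        )

    def forward(self, x):
        return self.net(x)


class Policy(nn.Module):
    """Gaussian policy: outputs the action mean and a learned log-std."""

    def __init__(self, num_inputs, num_actions):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(num_inputs, 64), nn.Tanh(),
            nn.Linear(64, 64), nn.Tanh(),
            nn.Linear(64, num_actions),
        )
        self.log_std = nn.Parameter(torch.zeros(num_actions))

    def forward(self, x):
        mean = self.net(x)
        return mean, self.log_std.expand_as(mean)
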
Example #2

    def __init__(self, args, env):
        # store the arguments and the environment...
        self.args = args
        self.env = env
        # the number of observations and the number of actions
        num_inputs = self.env.observation_space.shape[0]
        num_actions = self.env.action_space.shape[0]
        # define the directory where models are saved...
        self.saved_path = self.args.save_dir + self.args.env_name + '/'
        # make sure the path exists
        if not os.path.exists(self.args.save_dir):
            os.mkdir(self.args.save_dir)
        if not os.path.exists(self.saved_path):
            os.mkdir(self.saved_path)
        # define the networks...
        self.policy_network = models.Policy(num_inputs, num_actions)
        self.value_network = models.Value(num_inputs)
        # define the optimizer for the value network
        self.optimizer_value = torch.optim.Adam(
            self.value_network.parameters(),
            lr=self.args.value_lr,
            weight_decay=self.args.l2_reg)
        # init the observation filter...
        self.running_state = ZFilter((num_inputs,), clip=5)
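
The ZFilter used here (and again in Example #3) is also not defined on this page; it is an online observation normaliser. A minimal sketch, assuming Welford-style running statistics and standardisation clipped to ±clip:

import numpy as np


class RunningStat:
    """Tracks a running mean and sample variance with Welford's algorithm."""

    def __init__(self, shape):
        self.n = 0
        self.mean = np.zeros(shape)
        self.m2 = np.zeros(shape)  # running sum of squared deviations from the mean

    def push(self, x):
        x = np.asarray(x)
        self.n += 1
        if self.n == 1:
            self.mean[...] = x
        else:
            old_mean = self.mean.copy()
            self.mean += (x - old_mean) / self.n
            self.m2 += (x - old_mean) * (x - self.mean)

    @property
    def std(self):
        if self.n > 1:
            return np.sqrt(self.m2 / (self.n - 1))
        return np.ones_like(self.mean)


class ZFilter:
    """Returns clip((x - mean) / std, -clip, clip), updating the statistics online."""

    def __init__(self, shape, clip=5.0):
        self.rs = RunningStat(shape)
        self.clip = clip

    def __call__(self, x, update=True):
        if update:
            self.rs.push(x)
        x = (x - self.rs.mean) / (self.rs.std + 1e-8)
        return np.clip(x, -self.clip, self.clip)

With this, running_state(obs) returns the normalised observation while updating the running mean and std in place.
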
Example #3

    def __init__(self, env, args):
        # define the parameters...
        self.env = env
        # get the environment's input size and output size
        num_inputs = self.env.observation_space.shape[0]
        num_actions = self.env.action_space.shape[0]
        # get the parameters
        self.args = args
        self.saved_path = 'saved_models/' + str(self.args.env_name) + '/'
        # check the path
        if not os.path.exists(self.saved_path):
            os.mkdir(self.saved_path)

        # check whether CUDA is available...
        self.use_cuda = torch.cuda.is_available() and self.args.cuda
        print('CUDA is available: ' + str(torch.cuda.is_available()))
        print('Use CUDA: ' + str(self.args.cuda))

        # define the network...
        self.policy_network = models.Policy(num_inputs, num_actions)
        self.value_network = models.Value(num_inputs)

        if self.use_cuda:
            self.policy_network.cuda()
            self.value_network.cuda()

        # define the optimizer
        self.optimizer_value = torch.optim.Adam(
            self.value_network.parameters(),
            lr=self.args.value_lr,
            weight_decay=self.args.l2_reg)
        self.optimizer_policy = torch.optim.Adam(
            self.policy_network.parameters(),
            lr=self.args.policy_lr,
            weight_decay=self.args.l2_reg)

        # init the Filter...
        self.running_state = ZFilter((num_inputs, ), clip=5)
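
Examples #2 and #3 expect an argparse-style args namespace. A sketch covering only the fields these constructors actually read; the default values below are placeholders, not the projects' settings:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--env_name', default='Pendulum-v0')       # gym environment id
parser.add_argument('--save_dir', default='saved_models/')     # used in Example #2
parser.add_argument('--value_lr', type=float, default=3e-4)    # critic learning rate
parser.add_argument('--policy_lr', type=float, default=3e-4)   # actor learning rate
parser.add_argument('--l2_reg', type=float, default=1e-3)      # Adam weight decay
parser.add_argument('--cuda', action='store_true')             # enable GPU if available
args = parser.parse_args()
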
Example #4

def post_evaluate(policies_dict, add_noise_post=False, n_test_episodes=5):
    '''Test each of the n best policies for n_test_episodes episodes.

    policies_dict  - structure with two lists: paths to policy parameters and
                     the reward each policy obtained during training
    add_noise_post - if True, adds parameter noise with the same sigma (std)
                     that was used during training
    '''

    # after collecting the n best policies, post-evaluate each of them to pick the best one
    post_evaluation = {}

    for policy in policies_dict.paths:
        value_path = policy + "_value"
        policy_path = policy + "_policy"

        env = gym.make(args.env)
        num_inputs = env.observation_space.shape[0]
        num_actions = env.action_space.shape[0]

        # loading pretrained networks
        if 'Nonoise' in policy:
            policy_layer = Policy(num_inputs, num_actions)
        else:
            policy_layer = PolicyLayerNorm(num_inputs, num_actions)

        value_layer = models.Value(num_inputs)

        policy_layer.load_state_dict(torch.load(policy_path))
        value_layer.load_state_dict(torch.load(value_path))

        if add_noise_post and 'Nonoise' not in policy:

            right_part = policy.split('seed')[1]
            current_seed = right_part.split('/')[0]

            current_setting = policy.split('/')[-1]

            sigma_path = args.src + "/seed" + current_seed + "/sigma_behaviour/" + current_setting

            sigma_episode = int(policy.split('episode_')[1][0])
            with open(sigma_path, 'rb') as f:
                sigmas = pickle.load(f)

            current_sigma = sigmas[sigma_episode]

        rewards_batch = []

        for i_episode in range(n_test_episodes):

            # seed to give every policy the same initial conditions
            env.seed(i_episode)
            torch.manual_seed(i_episode)

            state = env.reset()

            reward_sum = 0
            for t in range(1000):
                state = torch.FloatTensor(state)

                if add_noise_post and 'Nonoise' not in policy:
                    action = select_action(policy_layer,
                                           state,
                                           current_sigma,
                                           noise=True)
                else:
                    action = select_action(policy_layer, state, noise=False)

                action = action.data[0].numpy()

                next_state, reward, done, _ = env.step(action)

                reward_sum += reward

                # next_state = running_state(next_state)

                if done:
                    break

                state = next_state

            rewards_batch.append(reward_sum)

        post_evaluation[policy] = sum(rewards_batch) / len(rewards_batch)

    return post_evaluation
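
A hedged usage sketch for post_evaluate: the function returns a dict mapping each policy path to its mean test reward, and it also relies on the module-level args (args.env, args.src). The policies_dict below is a hypothetical container; only its .paths attribute is used above, and the paths themselves are illustrative:

from types import SimpleNamespace

# hypothetical checkpoint container matching the docstring: paths + training rewards
policies_dict = SimpleNamespace(
    paths=['checkpoints/seed1/episode_3_Nonoise',
           'checkpoints/seed2/episode_7'],
    rewards=[1520.4, 1774.9],
)

post_evaluation = post_evaluate(policies_dict, add_noise_post=False)
best_policy = max(post_evaluation, key=post_evaluation.get)
print('best policy:', best_policy, 'mean reward:', post_evaluation[best_policy])
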
Example #5

import gym
import ppo_agent
import models
import mujoco_py

env = gym.make('Humanoid-v1')

num_inputs = env.observation_space.shape[0]
num_actions = env.action_space.shape[0]

print('The number of states is ' + str(num_inputs))
print('The number of actions is ' + str(num_actions))

policy_network = models.Policy(num_inputs, num_actions)
value_network = models.Value(num_inputs)

ppo_man = ppo_agent.ppo_brain(env, policy_network, value_network, use_cuda=False)
ppo_man.test_network('saved_models/Humanoid-v1/policy_net_model_400.pt')


Example #6
import gym
import torch
import multiprocessing

import models
from a3c_agent_continues import A3C_Workers

torch.set_default_tensor_type('torch.DoubleTensor')

if __name__ == '__main__':
    env_name = 'Pendulum-v0'
    save_path = 'saved_models/Pendulum-v0/'
    # the number of cpu...
    num_of_workers = multiprocessing.cpu_count()

    env = gym.make(env_name)
    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]

    # build up the center network....
    value_network_global = models.Value(num_inputs)
    policy_network_global = models.Policy(num_inputs, num_actions)

    value_network_global.share_memory()
    policy_network_global.share_memory()

    # build up the workers...
    workers = []
    processor = []

    #worker_test = A3C_Workers(env_name)
    #worker_test.test_the_network(path='saved_models/policy_model_3700.pt')
    for idx in range(num_of_workers):
        if idx == 0:
            workers.append(A3C_Workers(env_name, True))
        else:
            workers.append(A3C_Workers(env_name))
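
The snippet is cut off before the worker processes are started. A hedged sketch of the usual launch pattern, reusing the processor list defined above; the train entry point and its arguments are placeholders, not necessarily the real A3C_Workers API:

    import torch.multiprocessing as mp

    # launch one process per worker; `train` is a placeholder entry point
    for worker in workers:
        p = mp.Process(target=worker.train,
                       args=(policy_network_global, value_network_global))
        processor.append(p)
        p.start()

    for p in processor:
        p.join()
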
Example #7
from sqlalchemy.orm import Session

import models
import schemas


def create_value(db: Session, value: schemas.ValueCreate):
    """Insert a new Value row, commit it, and return the refreshed ORM object."""
    db_value = models.Value(title=value.title, description=value.description)
    db.add(db_value)
    db.commit()
    db.refresh(db_value)
    return db_value
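
For context, a rough sketch of the models.Value ORM model and schemas.ValueCreate schema this CRUD helper implies. Only the title and description fields are grounded in the snippet; the table name, column types, and id column are assumptions:

from sqlalchemy import Column, Integer, String
from sqlalchemy.orm import declarative_base
from pydantic import BaseModel

Base = declarative_base()


class Value(Base):                       # would live in models.py
    __tablename__ = 'values'             # table name is an assumption
    id = Column(Integer, primary_key=True, index=True)
    title = Column(String, index=True)
    description = Column(String)


class ValueCreate(BaseModel):            # would live in schemas.py
    title: str
    description: str
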