def __init__(self, env_name, params):
        self.env = envs.make(env_name)
        self.params = params
        self.action_bound = self.env.action_bound[1]
        
        self.iterations = params["iterations"]
        self.mem_len = params["mem_len"]
        self.seed = params["seed"]
        self.render = params["render"]
        self.log_interval = params["log_interval"]
        self.warmup = params["warmup"]
        self.batch_size = params["batch_size"]
        self.save = params["save"]

        hidden_dim = params["hidden_dim"]
        state_dim = self.env.observation_space
        action_dim = self.env.action_space
        cuda = params["cuda"]
        network_settings = params["network_settings"]

        actor = utils.Actor(state_dim, hidden_dim, action_dim)
        target_actor = utils.Actor(state_dim, hidden_dim, action_dim)
        critic = utils.Critic(state_dim+action_dim, hidden_dim, 1)
        target_critic = utils.Critic(state_dim+action_dim, hidden_dim, 1)
        self.agent = sw.Sleepwalk(actor,
                                  critic,
                                  target_actor,
                                  target_critic,
                                  network_settings,
                                  GPU=cuda)

        self.noise = utils.OUNoise(action_dim)
        self.noise.set_seed(self.seed)
        self.memory = utils.ReplayMemory(self.mem_len)

        self.pol_opt = torch.optim.Adam(actor.parameters())
        self.crit_opt = torch.optim.Adam(critic.parameters())

        if cuda:
            self.Tensor = torch.cuda.FloatTensor
        else:
            self.Tensor = torch.Tensor
        
        if self.render:
            self.env.init_rendering()
        
        self.best = None

        # initialize experiment logging
        self.logging = params["logging"]
        if self.logging:
            self.directory = os.getcwd()
            filename = self.directory + "/data/qprop.csv"
            with open(filename, "w") as csvfile:
                self.writer = csv.writer(csvfile)
                self.writer.writerow(["episode", "reward"])
                self.train()
        else:
            self.train()
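All of the snippets on this page pull their exploration noise from a project-local utils.OUNoise whose definition is not shown, and whose constructor varies from project to project. A minimal sketch compatible with the DDPG-style call sites above (OUNoise(action_dim, scale=..., mu=..., sigma=...), plus set_seed() and noise()) might look as follows; the default theta/sigma values are assumptions, not taken from any of the examples:

import numpy as np

class OUNoise:
    """Ornstein-Uhlenbeck exploration noise (illustrative sketch)."""

    def __init__(self, action_dim, scale=1.0, mu=0.0, theta=0.15, sigma=0.2):
        self.action_dim = action_dim
        self.scale = scale
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def set_seed(self, seed):
        np.random.seed(seed)

    def reset(self):
        # start the process at its long-run mean
        self.state = np.ones(self.action_dim) * self.mu

    def noise(self):
        # Euler step of the OU SDE: dx = theta * (mu - x) dt + sigma * dW
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.standard_normal(self.action_dim)
        self.state = x + dx
        return self.state * self.scale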
Example #2
    def __init__(self, agent_dict={}, actor_dict={}, critic_dict={}):
        """ Initialize Agent object

        Params
        ======
            agent_dict(dict): dictionary containing parameters for the agent
            actor_dict(dict): dictionary containing parameters for the agent's actor model
            critic_dict(dict): dictionary containing parameters for the agent's critic model
        """
        enable_cuda = agent_dict.get("enable_cuda", False)
        if enable_cuda:
            self.device = torch.device(
                "cuda:0" if torch.cuda.is_available() else "cpu")
        else:
            self.device = torch.device("cpu")

        self.num_agents = agent_dict.get("num_agents", 20)

        self.num_episodes = agent_dict.get("num_episodes", 10000)
        self.save_after = agent_dict.get("save_after", -1)
        self.name = agent_dict.get("name", "reacher")

        self.gamma = agent_dict.get("gamma", 0.9)

        self.tau = agent_dict.get("tau", 0.001)

        self.noise = utils.OUNoise((self.num_agents, 4), 0)

        self.num_replays = agent_dict.get("num_replays", 1)

        self.learning_rate_actor = agent_dict.get("learning_rate_actor", 1E-3)
        self.learning_rate_critic = agent_dict.get("learning_rate_critic",
                                                   1E-3)

        self.criterion = nn.MSELoss()

        memory_size = agent_dict.get("memory_size", 2**14)
        batchsize = agent_dict.get("batchsize", 2**10)
        replay_reg = agent_dict.get("replay_reg", 0.0)

        self.replay_buffer = utils.ReplayBuffer(memory_size, batchsize)

        self.actor = model.ActorModel(actor_dict).to(self.device)
        self.actor_target = model.ActorModel(actor_dict).to(self.device)

        self.critic = model.CriticModel(critic_dict).to(self.device)
        self.critic_target = model.CriticModel(critic_dict).to(self.device)

        self.actor_optimizer = optim.Adam(self.actor.parameters(),
                                          lr=self.learning_rate_actor)
        self.critic_optimizer = optim.Adam(self.critic.parameters(),
                                           lr=self.learning_rate_critic)

        utils.copy_model(self.actor, self.actor_target, tau=1.0)
        utils.copy_model(self.critic, self.critic_target, tau=1.0)

        seed = agent_dict.get("seed", 0)

        torch.manual_seed(seed)
        np.random.seed(seed)
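utils.copy_model is called here with tau=1.0 for the initial hard copy and presumably with self.tau for soft target updates during training. A plausible implementation of that helper, assuming the usual Polyak averaging (the real utils module may differ):

import torch

def copy_model(source, target, tau=1.0):
    # theta_target <- tau * theta_source + (1 - tau) * theta_target
    # tau=1.0 reproduces the hard copy used at initialization above.
    with torch.no_grad():
        for src, tgt in zip(source.parameters(), target.parameters()):
            tgt.data.copy_(tau * src.data + (1.0 - tau) * tgt.data)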
Example #3
    def __init__(self, obs_size, act_size, seed=0, params=None, logger=None):
        """
        Initialize a Deep Deterministic Policy Gradient (DDPG) agent.

        Parameters
        ----------
        obs_size : number
            Number of observation elements.
        act_size : number
            Number of action elements.
        seed : number, optional
            Random seed. The default is 0.
        params : dict, optional
            Hyperparameter data structure. Defaults to ddpg_params() when None.
        logger : object, optional
            Logger for storing training data. The default is None.

        """

        if params is None:
            params = ddpg_params()

        # logger for storing training data
        self.logger = logger

        # parameters
        self.params = params
        self.step_count = 0

        if not torch.cuda.is_available() and self.params['device'] != 'cpu':
            print("GPU is not available. Selecting CPU...")
            self.params['device'] = 'cpu'

        # initialize actor
        self.actor = models.DeterministicActor(obs_size, act_size,
                                               seed).to(self.params['device'])
        self.target_actor = models.DeterministicActor(
            obs_size, act_size, seed).to(self.params['device'])
        self.target_actor.load_state_dict(self.actor.state_dict())

        # initialize critic
        self.critic = models.QCritic(obs_size, act_size,
                                     seed).to(self.params['device'])
        self.target_critic = models.QCritic(
            obs_size, act_size, seed).to(self.params['device'])
        self.target_critic.load_state_dict(self.critic.state_dict())

        # create optimizers
        self.actor_optimizer = optim.Adam(self.actor.parameters(),
                                          lr=self.params['actor_lr'])
        self.critic_optimizer = optim.Adam(self.critic.parameters(),
                                           lr=self.params['critic_lr'])

        # Experience replay
        self.buffer = utils.ExperienceBuffer(obs_size, act_size,
                                             params['buffer_length'])

        # Noise model
        self.noise_model = utils.OUNoise(size=act_size,
                                         mean=self.params['noise_mean'],
                                         mac=self.params['noise_mac'],
                                         var=self.params['noise_var'],
                                         varmin=self.params['noise_var_min'],
                                         decay=self.params['noise_decay'])
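utils.ExperienceBuffer(obs_size, act_size, buffer_length) is also external to this snippet. A preallocated-array version matching that constructor could look like the following; the add/sample method names are assumptions:

import numpy as np

class ExperienceBuffer:
    """Fixed-size ring buffer of (s, a, r, s', done) transitions (sketch)."""

    def __init__(self, obs_size, act_size, maxlen):
        self.obs = np.zeros((maxlen, obs_size), dtype=np.float32)
        self.act = np.zeros((maxlen, act_size), dtype=np.float32)
        self.rew = np.zeros((maxlen, 1), dtype=np.float32)
        self.next_obs = np.zeros((maxlen, obs_size), dtype=np.float32)
        self.done = np.zeros((maxlen, 1), dtype=np.float32)
        self.maxlen, self.ptr, self.size = maxlen, 0, 0

    def add(self, s, a, r, s2, d):
        i = self.ptr
        self.obs[i], self.act[i], self.rew[i] = s, a, r
        self.next_obs[i], self.done[i] = s2, d
        self.ptr = (self.ptr + 1) % self.maxlen
        self.size = min(self.size + 1, self.maxlen)

    def sample(self, batch_size):
        idx = np.random.randint(0, self.size, size=batch_size)
        return (self.obs[idx], self.act[idx], self.rew[idx],
                self.next_obs[idx], self.done[idx])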
Example #4
def main():
    env = envs.make("model_training")
    max_rpm = env.action_bound[1]
    action_dim = env.action_space
    state_dim = env.observation_space
    epochs = 250000
    hidden_dim = 64
    dyn = model.Transition(state_dim, action_dim, hidden_dim, GPU)
    if GPU:
        dyn = dyn.cuda()
        Tensor = torch.cuda.FloatTensor
    else:
        Tensor = torch.Tensor

    counter = 0
    running = True
    H = 100
    noise = utils.OUNoise(action_dim, mu=10)
    env.init_rendering()
    while running:
        # set random state
        state = Tensor(env.reset())
        state_actions = []
        next_states = []
        
        # run trajectory
        loss = 0
        for i in range(1, H+1):
            action = np.array([noise.noise()], dtype="float32") * max_rpm
            action_tensor = torch.from_numpy(action)
            if GPU:
                action_tensor = action_tensor.cuda()
            state_action = torch.cat([state, action_tensor], dim=1)
            next_state, _, _, _ = env.step(action.reshape(action_dim,))
            #env.render()
            next_state = Tensor(next_state)
            state_actions.append(state_action)
            next_states.append(next_state)
            state = next_state
        state_actions = torch.stack(state_actions).squeeze(1)
        next_states = torch.stack(next_states).squeeze(1)
        #print(state_actions.size())
        #print(next_states.size())
        traj = {"state_actions": state_actions,
                "next_states": next_states}
        loss = dyn.batch_update(traj)
        print("---Model loss: {:.8f}---".format(loss))

        counter += 1

        if counter > epochs:
            running = False
            print("Saving figures")
            directory = os.getcwd()
            """
            fig1.savefig(directory+"/figures/one_step_loss.pdf", bbox_inches="tight")
            """
            print("Saving model")
            torch.save(dyn, directory+"/saved_models/one_step.pth.tar")
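model.Transition and its batch_update are defined elsewhere; assuming it is a plain one-step dynamics regressor, the update reduces to an MSE step on (state, action) -> next_state pairs, roughly:

import torch
import torch.nn as nn

class Transition(nn.Module):
    """One-step dynamics model f(s, a) -> s' trained by MSE regression (sketch)."""

    def __init__(self, state_dim, action_dim, hidden_dim, gpu=False):
        super().__init__()
        # gpu flag kept only for signature compatibility; the caller moves the module with .cuda()
        self.net = nn.Sequential(nn.Linear(state_dim + action_dim, hidden_dim),
                                 nn.ReLU(),
                                 nn.Linear(hidden_dim, state_dim))
        self.opt = torch.optim.Adam(self.parameters())
        self.loss_fn = nn.MSELoss()

    def forward(self, state_action):
        return self.net(state_action)

    def batch_update(self, traj):
        pred = self.forward(traj["state_actions"])
        loss = self.loss_fn(pred, traj["next_states"])
        self.opt.zero_grad()
        loss.backward()
        self.opt.step()
        return loss.item()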
Example #5
    def __init__(self, env_name, params):
        self.env = envs.make(env_name)
        self.params = params
        self.action_bound = self.env.action_bound[1]

        self.iterations = params["iterations"]
        self.seed = params["seed"]
        self.render = params["render"]
        self.log_interval = params["log_interval"]
        self.save = params["save"]

        self.cuda = params["cuda"]
        state_dim = self.env.observation_space
        action_dim = self.env.action_space
        hidden_dim = params["hidden_dim"]
        network_settings = params["network_settings"]

        pi = utils.Actor(state_dim, hidden_dim, action_dim)
        beta = utils.Actor(state_dim, hidden_dim, action_dim)
        critic = utils.Critic(state_dim, hidden_dim, 1)
        self.agent = fmis.FMIS(pi,
                               beta,
                               critic,
                               self.env,
                               network_settings,
                               GPU=self.cuda)

        self.pi_optim = torch.optim.Adam(self.agent.parameters())

        self.memory = fmis.ReplayMemory(1000000)

        if self.cuda:
            self.Tensor = torch.cuda.FloatTensor
        else:
            self.Tensor = torch.Tensor

        if self.render:
            self.env.init_rendering()

        self.best = None

        # use OU noise to explore and learn the model for n warmup episodes
        self.noise = utils.OUNoise(action_dim, mu=10)
        self.warmup = 5

        # initialize experiment logging
        self.logging = params["logging"]
        if self.logging:
            self.directory = os.getcwd()
            filename = self.directory + "/data/fmis.csv"
            with open(filename, "w") as csvfile:
                self.writer = csv.writer(csvfile)
                self.writer.writerow(["episode", "reward"])
                self.train()
        else:
            self.train()
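The warmup phase noted above (explore with OU noise for a few episodes to learn the model before any policy updates) happens inside train(), which is not shown. A rough sketch of what that phase typically looks like, using only the attributes set up in this constructor; the memory.push signature is an assumption:

import numpy as np

def run_warmup(self):
    # Hypothetical helper, not the actual fmis training code.
    for _ in range(self.warmup):
        state = self.Tensor(self.env.reset())
        done = False
        while not done:
            action = np.array([self.noise.noise()], dtype="float32")
            next_state, reward, done, _ = self.env.step(action.reshape(-1,))
            next_state = self.Tensor(next_state)
            self.memory.push(state, self.Tensor(action), self.Tensor([reward]), next_state)
            state = next_state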
Example #6
def run_hiro(args):
    if not os.path.exists("./results"):
        os.makedirs("./results")
    if args.save_models and not os.path.exists("./pytorch_models"):
        os.makedirs("./pytorch_models")
    if not os.path.exists(args.log_dir):
        os.makedirs(args.log_dir)
    if not os.path.exists(os.path.join(args.log_dir, args.log_file)):
        os.makedirs(os.path.join(args.log_dir, args.log_file))

    env = gym.make(args.env_name)
    obs = env.reset()

    goal = obs['desired_goal']
    state = obs['observation']

    # # Write Hyperparameters to file
    # print("---------------------------------------")
    # print("Current Arguments:")
    # with open(os.path.join(args.log_dir, args.log_file, "hps.txt"), 'w') as f:
    #     for arg in vars(args):
    #         print("{}: {}".format(arg, getattr(args, arg)))
    #         f.write("{}: {}\n".format(arg, getattr(args, arg)))
    # print("---------------------------------------\n")

    writer = SummaryWriter(log_dir=os.path.join(args.log_dir, args.log_file))
    # torch.cuda.set_device(0)

    env_name = type(env).__name__
    file_name = 'hiro_{}'.format(env_name)

    # Set seeds
    env.seed(args.seed)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    state_dim = state.shape[0]
    goal_dim = goal.shape[0]
    action_dim = env.action_space.shape[0]

    max_action = int(env.action_space.high[0])

    # Initialize policy, replay buffers
    controller_policy = hiro.Controller(state_dim=state_dim,
                                        goal_dim=state_dim,
                                        action_dim=action_dim,
                                        max_action=max_action,
                                        actor_lr=args.ctrl_act_lr,
                                        critic_lr=args.ctrl_crit_lr,
                                        ctrl_rew_type=args.ctrl_rew_type)

    manager_policy = hiro.Manager(state_dim=state_dim,
                                  goal_dim=goal_dim,
                                  action_dim=state_dim,
                                  actor_lr=args.man_act_lr,
                                  critic_lr=args.man_crit_lr,
                                  candidate_goals=args.candidate_goals)

    calculate_controller_reward = hiro_controller_reward

    if args.noise_type == "ou":
        man_noise = utils.OUNoise(state_dim, sigma=args.man_noise_sigma)
        ctrl_noise = utils.OUNoise(action_dim, sigma=args.ctrl_noise_sigma)

    elif args.noise_type == "normal":
        man_noise = utils.NormalNoise(sigma=args.man_noise_sigma)
        ctrl_noise = utils.NormalNoise(sigma=args.ctrl_noise_sigma)

    manager_buffer = utils.ReplayBuffer(maxsize=args.man_buffer_size)
    controller_buffer = utils.ReplayBuffer(maxsize=args.ctrl_buffer_size)

    # Logging Parameters
    total_timesteps = 0
    timesteps_since_eval = 0
    timesteps_since_manager = 0
    timesteps_since_subgoal = 0
    episode_num = 0
    done = True
    evaluations = []

    while total_timesteps < args.max_timesteps:
        if done:
            if total_timesteps != 0:
                print('Training Controller...')
                ctrl_act_loss, ctrl_crit_loss = controller_policy.train(
                    controller_buffer, episode_timesteps, args.ctrl_batch_size,
                    args.discount, args.ctrl_tau)

                writer.add_scalar('data/controller_actor_loss', ctrl_act_loss,
                                  total_timesteps)
                writer.add_scalar('data/controller_critic_loss',
                                  ctrl_crit_loss, total_timesteps)

                writer.add_scalar('data/controller_ep_rew', episode_reward,
                                  total_timesteps)
                writer.add_scalar('data/manager_ep_rew', manager_transition[4],
                                  total_timesteps)

                # Train Manager
                if timesteps_since_manager >= args.train_manager_freq:
                    print('Training Manager...')

                    timesteps_since_manager = 0
                    man_act_loss, man_crit_loss = manager_policy.train(
                        controller_policy, manager_buffer,
                        ceil(episode_timesteps / args.train_manager_freq),
                        args.man_batch_size, args.discount, args.man_tau)

                    writer.add_scalar('data/manager_actor_loss', man_act_loss,
                                      total_timesteps)
                    writer.add_scalar('data/manager_critic_loss',
                                      man_crit_loss, total_timesteps)

                # Evaluate episode
                if timesteps_since_eval >= args.eval_freq:
                    timesteps_since_eval = 0
                    avg_ep_rew, avg_controller_rew, avg_steps, avg_env_finish = evaluate_policy(
                        env, writer, manager_policy, controller_policy,
                        calculate_controller_reward, args.ctrl_rew_scale,
                        args.manager_propose_freq, len(evaluations))

                    writer.add_scalar('eval/avg_ep_rew', avg_ep_rew,
                                      total_timesteps)
                    writer.add_scalar('eval/avg_controller_rew',
                                      avg_controller_rew, total_timesteps)
                    writer.add_scalar('eval/avg_steps_to_finish', avg_steps,
                                      total_timesteps)
                    writer.add_scalar('eval/perc_env_goal_achieved',
                                      avg_env_finish, total_timesteps)

                    evaluations.append(
                        [avg_ep_rew, avg_controller_rew, avg_steps])

                    if args.save_models:
                        controller_policy.save(file_name + '_controller',
                                               directory="./pytorch_models")
                        manager_policy.save(file_name + '_manager',
                                            directory="./pytorch_models")

                    np.save("./results/%s" % (file_name), evaluations)

                # Process final state/obs, store manager transition, if it was not just created
                if len(manager_transition[-2]) != 1:
                    manager_transition[1] = state
                    manager_transition[5] = float(True)

                    # Every manager transition should have the same sequence length
                    while len(manager_transition[-2]) <= args.manager_propose_freq:
                        manager_transition[-1].append(np.inf)
                        manager_transition[-2].append(state)

                    manager_buffer.add(manager_transition)

            # Reset environment
            obs = env.reset()
            goal = obs['desired_goal']
            state = obs['observation']
            """
            obs = env.reset()  
            => {"observation", "achieved_goal", "desired_goal"}
                    (10, )        (3, )            (3, )
            goal = obs['desired_goal']  => (3, )
            state = obs['observation']  => (10, )
            """

            done = False
            episode_reward = 0
            episode_timesteps = 0
            episode_num += 1

            # Create new manager transition
            subgoal = manager_policy.sample_goal(state, goal)

            timesteps_since_subgoal = 0

            # Create a high level transition
            manager_transition = [
                state, None, goal, subgoal, 0, False, [state], []
            ]

        # TODO: Scale action to environment
        action = controller_policy.select_action(state, subgoal)
        action = ctrl_noise.perturb_action(action, max_action)

        # Perform action, get (next_obs, reward, done)
        next_tup, manager_reward, env_done, _ = env.step(action)

        # Update cumulative reward (env. reward) for manager
        manager_transition[4] += manager_reward * args.man_rew_scale

        # Process the observation returned by the step
        next_goal = next_tup['desired_goal']
        next_state = next_tup['observation']

        # Append low level sequence for off policy correction
        manager_transition[-1].append(action)
        manager_transition[-2].append(next_state)

        # Calculate reward, transition subgoal
        controller_reward = calculate_controller_reward(
            state, subgoal, next_state, args.ctrl_rew_scale)
        subgoal = controller_policy.subgoal_transition(state, subgoal,
                                                       next_state)

        # Is the episode over?
        if env_done:
            done = True

        episode_reward += controller_reward

        # Store low level transition
        controller_buffer.add(
            (state, next_state, subgoal,
             action, controller_reward, float(done),
             [], []))

        # Update state parameters
        state = next_state
        goal = next_goal

        # Update counters
        episode_timesteps += 1
        total_timesteps += 1
        timesteps_since_eval += 1
        timesteps_since_manager += 1
        timesteps_since_subgoal += 1

        if timesteps_since_subgoal % args.manager_propose_freq == 0:
            # Finish, add transition
            manager_transition[1] = state
            manager_transition[5] = float(True)

            manager_buffer.add(manager_transition)

            subgoal = manager_policy.sample_goal(state, goal)
            subgoal = man_noise.perturb_action(subgoal, max_action=np.inf)

            # Reset number of timesteps since we sampled a subgoal
            timesteps_since_subgoal = 0

            # Create a high level transition
            manager_transition = [
                state, None, goal, subgoal, 0, False, [state], []
            ]

    # Final evaluation
    evaluations.append([
        evaluate_policy(env, writer, manager_policy, controller_policy,
                        calculate_controller_reward, args.ctrl_rew_scale,
                        args.manager_propose_freq, len(evaluations))
    ])

    if args.save_models:
        controller_policy.save(file_name + '_controller',
                               directory="./pytorch_models")
        manager_policy.save(file_name + '_manager',
                            directory="./pytorch_models")

    np.save("./results/%s" % (file_name), evaluations)
Example #7
batch_size = 64
DDPG = ut.DDPG(dim_state, device)

print("NN initalised")

# initalise noise adding

# Training process
EPISODES = trange(N_episodes, desc='Episode: ', leave=True)
best_loss = 0
q = 0

for i in EPISODES:

    # initialise sampler
    ou_noise = ut.OUNoise()

    # Reset environment data
    done = False

    state = env.reset()

    # initialise variables
    total_episode_reward = 0.
    t = 0

    temp_loss = []

    while not done:

        # Create state tensor, remember to use single precision (torch.float32)
Example #8
    def __init__(self, env_name, params):
        # initialize environment
        self.env = envs.make(env_name)
        self.env_name = env_name

        # save important experiment parameters for the training loop
        self.iterations = params["iterations"]
        self.mem_len = params["mem_len"]
        self.seed = params["seed"]
        self.render = params["render"]
        self.log_interval = params["log_interval"]
        self.warmup = params["warmup"]
        self.batch_size = params["batch_size"]
        self.save = params["save"]

        # initialize DDPG agent using experiment parameters from config file
        self.action_bound = self.env.action_bound[1]
        state_dim = self.env.observation_space
        action_dim = self.env.action_space
        hidden_dim = params["hidden_dim"]
        cuda = params["cuda"]
        network_settings = params["network_settings"]
        actor = ddpg.Actor(state_dim, hidden_dim, action_dim)
        target_actor = ddpg.Actor(state_dim, hidden_dim, action_dim)
        critic = utils.Critic(state_dim + action_dim, hidden_dim, 1)
        target_critic = utils.Critic(state_dim + action_dim, hidden_dim, 1)
        self.agent = ddpg.DDPG(actor,
                               target_actor,
                               critic,
                               target_critic,
                               network_settings,
                               GPU=cuda)

        # initialize Ornstein-Uhlenbeck noise for random action exploration
        ou_scale = params["ou_scale"]
        ou_mu = params["ou_mu"]
        ou_sigma = params["ou_sigma"]
        self.noise = utils.OUNoise(action_dim,
                                   scale=ou_scale,
                                   mu=ou_mu,
                                   sigma=ou_sigma)
        self.noise.set_seed(self.seed)
        self.memory = utils.ReplayMemory(self.mem_len)

        self.pol_opt = torch.optim.Adam(actor.parameters())
        self.crit_opt = torch.optim.Adam(critic.parameters())

        # want to save the best policy
        self.best = None

        # send to GPU if flagged in experiment config file
        if cuda:
            self.Tensor = torch.cuda.FloatTensor
            self.agent = self.agent.cuda()
        else:
            self.Tensor = torch.Tensor

        if self.render:
            self.env.init_rendering()

        # initialize experiment logging. This wipes any previous file with the same name
        self.logging = params["logging"]
        if self.logging:
            self.directory = os.getcwd()
            filename = self.directory + "/data/ddpg.csv"
            with open(filename, "w") as csvfile:
                self.writer = csv.writer(csvfile)
                self.writer.writerow(["episode", "reward"])
                self.train()
        else:
            self.train()
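utils.ReplayMemory(self.mem_len) appears throughout these examples without its definition. A common deque-based version with the usual push/sample interface (the method names are assumptions):

import random
from collections import deque, namedtuple

Transition = namedtuple("Transition", ("state", "action", "reward", "next_state"))

class ReplayMemory:
    """Bounded FIFO buffer of transitions (sketch)."""

    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def push(self, *args):
        self.buffer.append(Transition(*args))

    def sample(self, batch_size):
        return random.sample(self.buffer, batch_size)

    def __len__(self):
        return len(self.buffer)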
Example #9
    def __init__(self, env_name, params):
        # initialize environment
        self.__env = gym.make(env_name)
        self.__env_name = env_name

        # save important experiment parameters for the training loop
        self.__iterations = params["iterations"]
        self.__mem_len = params["mem_len"]
        self.__seed = params["seed"]
        self.__render = params["render"]
        self.__log_interval = params["log_interval"]
        self.__warmup = params["warmup"]
        self.__batch_size = params["batch_size"]
        self.__learning_updates = params["learning_updates"]
        self.__save = params["save"]

        # initialize DDPG agent using experiment parameters from config file
        state_dim = self.__env.observation_space.shape[0]
        action_dim = self.__env.action_space.shape[0]
        hidden_dim = params["hidden_dim"]
        cuda = params["cuda"]
        network_settings = params["network_settings"]
        actor = Actor(state_dim, hidden_dim, action_dim)
        target_actor = Actor(state_dim, hidden_dim, action_dim)
        critic = utils.Critic(state_dim + action_dim, hidden_dim, 1)
        target_critic = utils.Critic(state_dim + action_dim, hidden_dim, 1)
        self.__agent = DDPG(actor,
                            target_actor,
                            critic,
                            target_critic,
                            network_settings,
                            GPU=cuda)

        # initialize Ornstein-Uhlenbeck noise for random action exploration
        ou_scale = params["ou_scale"]
        ou_mu = params["ou_mu"]
        ou_sigma = params["ou_sigma"]
        self.__noise = utils.OUNoise(action_dim,
                                     scale=ou_scale,
                                     mu=ou_mu,
                                     sigma=ou_sigma)
        self.__noise.set_seed(self.__seed)
        self.__memory = ReplayMemory(self.__mem_len)
        self.__pol_opt = torch.optim.Adam(actor.parameters(),
                                          params["actor_lr"])
        self.__crit_opt = torch.optim.Adam(critic.parameters(),
                                           params["critic_lr"])

        # want to save the best policy
        self.__best = None

        # send to GPU if flagged in experiment config file
        if cuda:
            self.__Tensor = torch.cuda.FloatTensor
            self.__agent = self.__agent.cuda()
        else:
            self.__Tensor = torch.Tensor

        # initialize experiment logging. This wipes any previous file with the same name
        self.__logging = params["logging"]
        self.__directory = os.getcwd()
        if self.__logging:
            filename = self.__directory + "/data/ddpg-" + self.__env_name + ".csv"
            with open(filename, "w") as csvfile:
                self.__writer = csv.writer(csvfile)
                self.__writer.writerow(["episode", "reward"])
                self._run_algo()
        else:
            self._run_algo()
Example #10
SIGMA_INIT = float(conf.get('actor', 'sigma_init'))
ADJUST_STEP = int(conf.get('actor', 'adjust_step'))
P_LEARNING_RATE = float(conf.get('actor', 'learning_rate'))
Q_HIDDEN_SIZE = int(conf.get('critic', 'hidden_size'))
Q_LEARNING_RATE = float(conf.get('critic', 'learning_rate'))
BATCH_SIZE = int(conf.get('main', 'batch_size'))
NUM_PARAL = int(conf.get('main', 'num_paral'))
AUDIO_SEGMENT = int(conf.get('main', 'audio_segment'))
frameRate_Hz = int(conf.get('main', 'frameRate_Hz'))

### Condition Setting
device = 'cuda' if torch.cuda.is_available() else 'cpu'
policy = models.stacked_BLSTM(IN_SIZE, OUT_SIZE, P_HIDDEN_SIZE, P_NUM_LAYERS,
                              SIGMA_INIT).to(device)
q_func = models.Qfunction(IN_SIZE, OUT_SIZE, Q_HIDDEN_SIZE).to(device)
ou_noise = utils.OUNoise(BATCH_SIZE, OUT_SIZE)
loss_fun = nn.MSELoss(reduction='none')
p_optim = torch.optim.SGD(policy.parameters(), lr=P_LEARNING_RATE)
q_optim = torch.optim.Adam(q_func.parameters(), lr=Q_LEARNING_RATE)
train_loader = utils.Batch_generator('training', BATCH_SIZE)

policy.load_state_dict(torch.load('exp/pretrain.model'))
#p_optim.load_state_dict(torch.load('exp/p_optim.state'))
#q_func.load_state_dict(torch.load('exp/q1000.model'))
#q_optim.load_state_dict(torch.load('exp/q_optim.state'))

for iteration in range(10000000):
    policy.train()
    q_func.train()
    start = time.time()