Example #1
import os
import numpy as np
# DataLog, make_train_plots_ars, explore and _EXPLORE are assumed to be defined
# elsewhere in the surrounding project; they are not standard-library names.


def train(env, policy, normalizer, hp, parentPipes, args):
    logger = DataLog()
    total_steps = 0
    best_return = -99999999
    if not os.path.isdir(args.logdir):
        os.mkdir(args.logdir)
    previous_dir = os.getcwd()
    os.chdir(args.logdir)
    if not os.path.isdir('iterations'):
        os.mkdir('iterations')
    if not os.path.isdir('logs'):
        os.mkdir('logs')
    hp.to_text('hyperparameters')

    for step in range(hp.nb_steps):

        # Initialize the perturbation deltas and the positive/negative rewards
        deltas = policy.sample_deltas()
        positive_rewards = [0] * hp.nb_directions
        negative_rewards = [0] * hp.nb_directions

        if parentPipes:
            # Evaluate the perturbed directions in parallel, batching the
            # rollouts over the available worker processes
            process_count = len(parentPipes)
            p = 0
            while p < hp.nb_directions:
                temp_p = p
                n_left = hp.nb_directions - p  # Number of directions still to be evaluated
                # Dispatch one positive rollout per worker for this batch
                for k in range(min(process_count, n_left)):
                    parentPipe = parentPipes[k]
                    parentPipe.send([
                        _EXPLORE,
                        [normalizer, policy, hp, "positive", deltas[temp_p]]
                    ])
                    temp_p += 1
                temp_p = p

                # Collect the positive rewards and rollout lengths
                for k in range(min(process_count, n_left)):
                    positive_rewards[temp_p], step_count = parentPipes[k].recv()
                    total_steps += step_count
                    temp_p += 1
                temp_p = p

                # Dispatch the corresponding negative (opposite-direction) rollouts
                for k in range(min(process_count, n_left)):
                    parentPipe = parentPipes[k]
                    parentPipe.send([
                        _EXPLORE,
                        [normalizer, policy, hp, "negative", deltas[temp_p]]
                    ])
                    temp_p += 1
                temp_p = p

                # Collect the negative rewards and rollout lengths
                for k in range(min(process_count, n_left)):
                    negative_rewards[temp_p], step_count = parentPipes[k].recv()
                    total_steps += step_count
                    temp_p += 1
                p += process_count

                print('total steps till now:', total_steps,
                      'directions done:', p)

        else:
            # Getting the positive rewards in the positive directions
            for k in range(hp.nb_directions):
                positive_rewards[k] = explore(env, policy, "positive",
                                              deltas[k], hp)

            # Getting the negative rewards in the negative/opposite directions
            for k in range(hp.nb_directions):
                negative_rewards[k] = explore(env, policy, "negative",
                                              deltas[k], hp)

        # Sorting the rollouts by the max(r_pos, r_neg) and selecting the best directions
        scores = {
            k: max(r_pos, r_neg)
            for k, (
                r_pos,
                r_neg) in enumerate(zip(positive_rewards, negative_rewards))
        }
        order = sorted(scores.keys(),
                       key=lambda x: -scores[x])[:int(hp.nb_best_directions)]
        rollouts = [(positive_rewards[k], negative_rewards[k], deltas[k])
                    for k in order]

        # Gather the positive/negative rewards of the selected directions to
        # compute their standard deviation, which scales the update step
        all_rewards = np.array([x[0] for x in rollouts] + [x[1] for x in rollouts])
        sigma_r = all_rewards.std()  # Standard deviation of the rewards from the best directions
        # Updating our policy
        policy.update(rollouts, sigma_r, args)

        # Evaluate the updated policy (no perturbation) and log the result
        reward_evaluation = explore(env, policy, None, None, hp)
        logger.log_kv('steps', total_steps)
        logger.log_kv('return', reward_evaluation)
        if reward_evaluation > best_return:
            best_policy = policy.theta
            best_return = reward_evaluation
            np.save("iterations/best_policy.npy", best_policy)
        print('Step:', step, 'Reward:', reward_evaluation)
        policy_path = "iterations/" + "policy_" + str(step)
        np.save(policy_path, policy.theta)
        logger.save_log('logs/')
        make_train_plots_ars(log=logger.log,
                             keys=['steps', 'return'],
                             save_loc='logs/')
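Example #1 shows only the parent side of the pipe protocol: each worker is sent a list of the form [_EXPLORE, [normalizer, policy, hp, direction, delta]] and is expected to reply with a (reward, step_count) pair. A minimal sketch of the worker loop on the other end of each pipe is given below; the function name, the _CLOSE tag, the multiprocessing wiring and the explore() signature are assumptions, only the message layout comes from the code above.

# Hypothetical worker-side counterpart of the pipe protocol used in Example #1.
# Only the message layout is taken from the parent code; the names, the _CLOSE
# tag and the explore() signature are illustrative assumptions.
import multiprocessing as mp

_EXPLORE = 1  # assumed tag; must match the parent's _EXPLORE
_CLOSE = 2    # assumed shutdown tag

def explore_worker(childPipe, env):
    while True:
        message, payload = childPipe.recv()
        if message == _EXPLORE:
            normalizer, policy, hp, direction, delta = payload
            # Signature assumed to mirror the explore() used in Example #2
            reward, steps = explore(env, normalizer, policy, direction, delta, hp)
            childPipe.send([reward, steps])
        elif message == _CLOSE:
            childPipe.close()
            return

# Possible parent-side wiring: one pipe and one process per worker
# pipes = [mp.Pipe() for _ in range(num_workers)]
# parentPipes = [parent for parent, _ in pipes]
# workers = [mp.Process(target=explore_worker, args=(child, env))
#            for _, child in pipes]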
Example #2
import os
import numpy as np
# DataLog, make_train_plots_ars and explore are assumed to be defined elsewhere
# in the surrounding project.


def train(env, policy, normalizer, hp, job_name="default_exp"):
    """
    Training using Augmented Random Search
    :param env : OpenAI gym environment
    :param policy : Object of class Policy
    :param normalizer : Object of class normalizer
    :param hp : Object of class hp
    :param job_name : Name of the directory where you want to save data
    :returns : Nothing, trains the agent  
    """
    logger = DataLog()
    total_steps = 0
    best_return = -99999999
    if not os.path.isdir(job_name):
        os.mkdir(job_name)
    previous_dir = os.getcwd()
    os.chdir(job_name)
    if not os.path.isdir('iterations'):
        os.mkdir('iterations')
    if not os.path.isdir('logs'):
        os.mkdir('logs')
    hp.to_text('hyperparameters')
    for step in range(hp.nb_steps):
        # Initialize the perturbation deltas and the positive/negative rewards
        deltas = policy.sample_deltas(hp)
        positive_rewards = [0] * hp.nb_directions
        negative_rewards = [0] * hp.nb_directions

        # Getting the positive rewards in the positive directions
        for k in range(hp.nb_directions):
            positive_rewards[k], step_count_positive = explore(
                env, normalizer, policy, "positive", deltas[k], hp)
            total_steps += step_count_positive  # count every rollout's steps

        # Getting the negative rewards in the negative/opposite directions
        for k in range(hp.nb_directions):
            negative_rewards[k], step_count_negative = explore(
                env, normalizer, policy, "negative", deltas[k], hp)
            total_steps += step_count_negative

        # Sorting the rollouts by the max(r_pos, r_neg) and selecting the best directions
        scores = {
            k: max(r_pos, r_neg)
            for k, (
                r_pos,
                r_neg) in enumerate(zip(positive_rewards, negative_rewards))
        }
        order = sorted(scores.keys(),
                       key=lambda x: -scores[x])[:hp.nb_best_directions]
        rollouts = [(positive_rewards[k], negative_rewards[k], deltas[k])
                    for k in order]

        # Gather the positive/negative rewards of the selected directions to
        # compute their standard deviation, which scales the update step
        all_rewards = np.array([x[0] for x in rollouts] + [x[1] for x in rollouts])
        sigma_r = all_rewards.std()  # Standard deviation of the rewards from the best directions

        # Updating our policy
        # NOTE: 'args' is not a parameter of this function; it is assumed to be
        # available at module scope (e.g. parsed command-line arguments).
        policy.update(rollouts, sigma_r, args)
        # Evaluate the updated policy (no perturbation) and log the result
        reward_evaluation, _ = explore(env, normalizer, policy, None, None, hp)
        logger.log_kv('steps', total_steps)
        logger.log_kv('return', reward_evaluation)
        if reward_evaluation > best_return:
            best_policy = policy.theta
            best_return = reward_evaluation
            np.save("iterations/best_policy.npy", best_policy)
        print('Step:', step, 'Reward:', reward_evaluation)
        policy_path = "iterations/" + "policy_" + str(step)
        np.save(policy_path, policy.theta)
        logger.save_log('logs/')
        make_train_plots_ars(log=logger.log,
                             keys=['steps', 'return'],
                             save_loc='logs/')
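Neither example shows policy.update itself. For reference, the update that Augmented Random Search prescribes, and that these snippets appear to rely on, is a finite-difference step scaled by the reward standard deviation. The sketch below assumes the (r_pos, r_neg, delta) rollout layout used above; the learning_rate and nb_best_directions parameters stand in for whatever the hp object actually carries.

# Sketch of the standard ARS update that policy.update(rollouts, sigma_r, ...)
# is assumed to implement; parameter names are placeholders, only the
# (r_pos, r_neg, delta) rollout layout comes from the snippets above.
import numpy as np

def ars_update(theta, rollouts, sigma_r, learning_rate, nb_best_directions):
    step = np.zeros_like(theta)
    for r_pos, r_neg, delta in rollouts:  # rollouts already hold only the best directions
        step += (r_pos - r_neg) * delta   # finite-difference estimate along delta
    # theta <- theta + alpha / (b * sigma_R) * sum_k (r_pos_k - r_neg_k) * delta_k
    return theta + learning_rate / (nb_best_directions * sigma_r) * step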