import os
import random

import numpy as np

# DataLog, make_train_plots_ars, explore, policyevaluation and the _EXPLORE
# message tag are assumed to be defined or imported elsewhere in this project.


def train(env, policy, hp, parentPipes, args):
    """ARS training loop with incline curriculum, domain randomization and periodic evaluation."""
    args.logdir = "experiments/" + args.logdir
    logger = DataLog()
    total_steps = 0
    best_return = -99999999

    # Set up the experiment directory structure and remember where the logs live.
    working_dir = os.getcwd()
    if not os.path.isdir(args.logdir):
        os.mkdir(args.logdir)
    previous_dir = os.getcwd()
    os.chdir(args.logdir)
    if not os.path.isdir('iterations'):
        os.mkdir('iterations')
    if not os.path.isdir('logs'):
        os.mkdir('logs')
    hp.to_text('hyperparameters')
    log_dir = os.getcwd()
    os.chdir(working_dir)

    for step in range(hp.nb_steps):
        if hp.domain_Rand:
            env.Set_Randomization(default=False)
        else:
            env.randomize_only_inclines()

        # Curriculum learning: steeper inclines become available after hp.curilearn steps.
        if step > hp.curilearn:
            avail_deg = [7, 9, 11, 11]
            env.incline_deg = avail_deg[random.randint(0, 3)]
        else:
            avail_deg = [5, 7, 9]
            env.incline_deg = avail_deg[random.randint(0, 2)]

        # Initializing the perturbation deltas and the positive/negative rewards
        deltas = policy.sample_deltas()
        positive_rewards = [0] * hp.nb_directions
        negative_rewards = [0] * hp.nb_directions

        if parentPipes:
            # Parallel rollouts: dispatch directions to the worker processes in batches.
            process_count = len(parentPipes)
            p = 0
            while p < hp.nb_directions:
                temp_p = p
                n_left = hp.nb_directions - p  # directions still to be evaluated

                # Positive-direction rollouts
                for k in range(min([process_count, n_left])):
                    parentPipe = parentPipes[k]
                    parentPipe.send([_EXPLORE, [policy, hp, "positive", deltas[temp_p]]])
                    temp_p = temp_p + 1
                temp_p = p
                for k in range(min([process_count, n_left])):
                    positive_rewards[temp_p], step_count = parentPipes[k].recv()
                    total_steps = total_steps + step_count
                    temp_p = temp_p + 1

                # Negative-direction rollouts
                temp_p = p
                for k in range(min([process_count, n_left])):
                    parentPipe = parentPipes[k]
                    parentPipe.send([_EXPLORE, [policy, hp, "negative", deltas[temp_p]]])
                    temp_p = temp_p + 1
                temp_p = p
                for k in range(min([process_count, n_left])):
                    negative_rewards[temp_p], step_count = parentPipes[k].recv()
                    total_steps = total_steps + step_count
                    temp_p = temp_p + 1

                p = p + process_count
                print('total steps till now: ', total_steps, 'Processes done: ', p)
        else:
            # Getting the positive rewards in the positive directions
            for k in range(hp.nb_directions):
                positive_rewards[k] = explore(env, policy, "positive", deltas[k], hp)
            # Getting the negative rewards in the negative/opposite directions
            for k in range(hp.nb_directions):
                negative_rewards[k] = explore(env, policy, "negative", deltas[k], hp)

        # Sorting the rollouts by max(r_pos, r_neg) and selecting the best directions
        scores = {
            k: max(r_pos, r_neg)
            for k, (r_pos, r_neg) in enumerate(zip(positive_rewards, negative_rewards))
        }
        order = sorted(scores.keys(), key=lambda x: -scores[x])[:int(hp.nb_best_directions)]
        rollouts = [(positive_rewards[k], negative_rewards[k], deltas[k]) for k in order]

        # Standard deviation of the rewards from the best directions only
        all_rewards = np.array([x[0] for x in rollouts] + [x[1] for x in rollouts])
        sigma_r = all_rewards.std()

        # Updating our policy
        policy.update(rollouts, sigma_r, args)

        # Start evaluating only after the second curriculum stage
        if step >= hp.curilearn:
            # Policy evaluation after the specified number of iterations
            if step % hp.evalstep == 0:
                reward_evaluation = policyevaluation(env, policy, hp)
                logger.log_kv('steps', step)
                logger.log_kv('return', reward_evaluation)
                if reward_evaluation > best_return:
                    best_policy = policy.theta
                    best_return = reward_evaluation
                    np.save(log_dir + "/iterations/best_policy.npy", best_policy)
                print('Step:', step, 'Reward:', reward_evaluation)
                policy_path = log_dir + "/iterations/" + "policy_" + str(step)
                np.save(policy_path, policy.theta)
                logger.save_log(log_dir + "/logs/")
                make_train_plots_ars(log=logger.log, keys=['steps', 'return'],
                                     save_loc=log_dir + "/logs/")
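# The parallel branch of train() above assumes each entry of parentPipes is the
# parent end of a multiprocessing.Pipe, serviced by a worker process that answers
# [_EXPLORE, [policy, hp, direction, delta]] messages with a [reward, step_count]
# reply. The original worker is not part of this section; the function below is a
# minimal hypothetical sketch of such a worker. The name explore_worker and the
# use of hp.episode_length as the reported step count are assumptions.
def explore_worker(childPipe, env, hp):
    while True:
        message, payload = childPipe.recv()
        if message != _EXPLORE:
            # Any unrecognised message shuts the worker down.
            childPipe.close()
            break
        policy, hp_msg, direction, delta = payload
        # explore() (defined elsewhere in this project) returns the episode
        # return for one perturbed rollout; train() also expects a step count.
        reward = explore(env, policy, direction, delta, hp_msg)
        childPipe.send([reward, hp_msg.episode_length])
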
def train(env, policy, normalizer, hp, parentPipes, args):
    """ARS training loop variant that also passes a state normalizer to the rollout workers."""
    logger = DataLog()
    total_steps = 0
    best_return = -99999999

    # Set up the experiment directory structure; all paths below are relative to args.logdir.
    if not os.path.isdir(args.logdir):
        os.mkdir(args.logdir)
    previous_dir = os.getcwd()
    os.chdir(args.logdir)
    if not os.path.isdir('iterations'):
        os.mkdir('iterations')
    if not os.path.isdir('logs'):
        os.mkdir('logs')
    hp.to_text('hyperparameters')

    for step in range(hp.nb_steps):
        # Initializing the perturbation deltas and the positive/negative rewards
        deltas = policy.sample_deltas()
        positive_rewards = [0] * hp.nb_directions
        negative_rewards = [0] * hp.nb_directions

        if parentPipes:
            # Parallel rollouts: dispatch directions to the worker processes in batches.
            process_count = len(parentPipes)
            p = 0
            while p < hp.nb_directions:
                temp_p = p
                n_left = hp.nb_directions - p  # directions still to be evaluated

                # Positive-direction rollouts
                for k in range(min([process_count, n_left])):
                    parentPipe = parentPipes[k]
                    parentPipe.send([_EXPLORE, [normalizer, policy, hp, "positive", deltas[temp_p]]])
                    temp_p = temp_p + 1
                temp_p = p
                for k in range(min([process_count, n_left])):
                    positive_rewards[temp_p], step_count = parentPipes[k].recv()
                    total_steps = total_steps + step_count
                    temp_p = temp_p + 1

                # Negative-direction rollouts
                temp_p = p
                for k in range(min([process_count, n_left])):
                    parentPipe = parentPipes[k]
                    parentPipe.send([_EXPLORE, [normalizer, policy, hp, "negative", deltas[temp_p]]])
                    temp_p = temp_p + 1
                temp_p = p
                for k in range(min([process_count, n_left])):
                    negative_rewards[temp_p], step_count = parentPipes[k].recv()
                    total_steps = total_steps + step_count
                    temp_p = temp_p + 1

                p = p + process_count
                print('total steps till now: ', total_steps, 'Processes done: ', p)
        else:
            # Getting the positive rewards in the positive directions
            for k in range(hp.nb_directions):
                positive_rewards[k] = explore(env, policy, "positive", deltas[k], hp)
            # Getting the negative rewards in the negative/opposite directions
            for k in range(hp.nb_directions):
                negative_rewards[k] = explore(env, policy, "negative", deltas[k], hp)

        # Sorting the rollouts by max(r_pos, r_neg) and selecting the best directions
        scores = {
            k: max(r_pos, r_neg)
            for k, (r_pos, r_neg) in enumerate(zip(positive_rewards, negative_rewards))
        }
        order = sorted(scores.keys(), key=lambda x: -scores[x])[:int(hp.nb_best_directions)]
        rollouts = [(positive_rewards[k], negative_rewards[k], deltas[k]) for k in order]

        # Standard deviation of the rewards from the best directions only
        all_rewards = np.array([x[0] for x in rollouts] + [x[1] for x in rollouts])
        sigma_r = all_rewards.std()

        # Updating our policy
        policy.update(rollouts, sigma_r, args)

        # Evaluating, logging and saving the policy after the update
        reward_evaluation = explore(env, policy, None, None, hp)
        logger.log_kv('steps', step)
        logger.log_kv('return', reward_evaluation)
        if reward_evaluation > best_return:
            best_policy = policy.theta
            best_return = reward_evaluation
            np.save("iterations/best_policy.npy", best_policy)
        print('Step:', step, 'Reward:', reward_evaluation)
        policy_path = "iterations/" + "policy_" + str(step)
        np.save(policy_path, policy.theta)
        logger.save_log('logs/')
        make_train_plots_ars(log=logger.log, keys=['steps', 'return'], save_loc='logs/')
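# A minimal launcher sketch for the parallel path of the train() variants above.
# Everything here is illustrative: launch_parallel_training, env_factory,
# worker_fn and num_workers are hypothetical names. worker_fn is assumed to
# answer _EXPLORE messages with a [reward, step_count] reply; for this second
# train() variant the message payload additionally carries the normalizer.
import multiprocessing as mp


def launch_parallel_training(env_factory, policy, normalizer, hp, args,
                             worker_fn, num_workers=4):
    parentPipes, processes = [], []
    for _ in range(num_workers):
        parent_conn, child_conn = mp.Pipe()
        # Each worker owns its own environment instance and the child pipe end.
        proc = mp.Process(target=worker_fn, args=(child_conn, env_factory(), hp))
        proc.start()
        parentPipes.append(parent_conn)
        processes.append(proc)

    try:
        # Uses the normalizer-aware train() defined directly above.
        train(env_factory(), policy, normalizer, hp, parentPipes, args)
    finally:
        for proc in processes:
            proc.terminate()
            proc.join()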