def main_loop():
    colors = []
    num_episodes = args.num_ensembles
    for i in range(num_episodes):
        colors.append('#%06X' % randint(0, 0xFFFFFF))
    improved_context_list_np = sample_initial_context_normal(
        args.num_ensembles)
    if initial_training:
        train_on_initial(improved_context_list_np)
    for i_iter in range(args.max_iter_num):

        # generate multiple trajectories that reach the minimum batch_size
        policy_np.training = False
        if len(replay_memory) == 0 or not args.rm_as_context:
            context_list_np = improved_context_list_np
        else:
            context_list_np = replay_memory.data
        batch_np, log_np = agent_np.collect_episodes(context_list_np,
                                                     args.num_req_steps,
                                                     args.num_ensembles)

        disc_rew_np = discounted_rewards(batch_np.memory, args.gamma)
        iter_dataset_np = BaseDataset(batch_np.memory,
                                      disc_rew_np,
                                      args.device_np,
                                      args.dtype,
                                      max_len=max_episode_len)
        print('np avg actions: ', log_np['action_mean'])
        advantages_np = estimate_v_a(iter_dataset_np, disc_rew_np,
                                     value_replay_memory, value_np, args)

        improved_context_list_np = improvement_step_all(
            iter_dataset_np, advantages_np, args.max_kl_np, args)
        # training
        value_replay_memory.add(iter_dataset_np)
        train_value_np(value_replay_memory)

        tn0 = time.time()
        replay_memory.add(iter_dataset_np)
        train_np(replay_memory)
        tn1 = time.time()
        tot_steps_np.append(tot_steps_np[-1] + log_np['num_steps'])
        avg_rewards_np.append(log_np['avg_reward'])

        if i_iter % args.log_interval == 0:
            print(i_iter)
            print('np: \tR_min {:.2f} \tR_max {:.2f} \tR_avg {:.2f}'.format(
                log_np['min_reward'], log_np['max_reward'],
                log_np['avg_reward']))
        print('new sigma', args.fixed_sigma)
        plot_rewards_history(tot_steps_np, avg_rewards_np)
        store_avg_rewards(
            tot_steps_np[-1], avg_rewards_np[-1],
            np_file.replace(
                str(args.seed) + '.csv', 'avg' + str(args.seed) + '.csv'))
        if tot_steps_np[-1] > args.tot_steps:
            break
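
# A minimal sketch of the kind of computation a discounted_rewards helper like the one
# called above typically performs (an assumption about its behaviour, not the project's
# actual implementation): per-episode discounted returns G_t = r_t + gamma * G_{t+1}.
import torch

def discounted_rewards_sketch(episode_rewards, gamma):
    """episode_rewards: list of 1-D reward tensors, one per episode."""
    all_returns = []
    for rewards in episode_rewards:
        returns = torch.zeros_like(rewards)
        running = 0.0
        for t in reversed(range(len(rewards))):
            running = rewards[t] + gamma * running
            returns[t] = running
        all_returns.append(returns)
    return all_returns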
Example #2
def main_loop():
    improved_context_list_np = sample_initial_context_normal(env,  init_sigma=0.05)
    train_on_initial(improved_context_list_np)
    #policy_np.apply(InitFunc.init_zero)
    for i_iter in range(args.max_iter_num):

        # define context set
        policy_np.training = False
        if len(replay_memory) == 0 or not args.rm_as_context:
            context_list_np = improved_context_list_np
        else:
            context_list_np = replay_memory.data

        # collect samples
        batch_np, log_np = agent_np.collect_episodes(context_list_np, args.num_req_steps, args.num_ensembles)

        # compute discounted rewards
        disc_rew_np = discounted_rewards(batch_np.memory, args.gamma)
        iter_dataset_np = BaseDataset(batch_np.memory, disc_rew_np, args.device_np, args.dtype,  max_len=max_episode_len)

        # estimate advantages
        if args.learn_baseline:
            if args.value_net:
                state_list = [ep['states'][:ep['real_len']] for ep in iter_dataset_np]
                advantages_np, returns = critic_estimate(value_net, state_list, disc_rew_np, args)
                update_critic(value_net, torch.cat(state_list, dim=0), torch.cat(returns, dim=0))
            else:
                advantages_np = estimate_v_a(iter_dataset_np, disc_rew_np, value_replay_memory, value_np, args)
                value_replay_memory.add(iter_dataset_np)
                train_value_np(value_replay_memory)
        else:
            advantages_np = disc_rew_np

        # update step
        improved_context_list_np = improvement_step_all(iter_dataset_np, advantages_np, args.max_kl_np, args)

        # training
        replay_memory.add(iter_dataset_np)
        train_np(replay_memory)

        # prints & plots
        tot_steps_np.append(tot_steps_np[-1] + log_np['num_steps'])
        avg_rewards_np.append(log_np['avg_reward'])
        if i_iter % args.log_interval == 0:
            print(i_iter)
            print('np avg actions: ', log_np['action_mean'])
            print('np: \tR_min {:.2f} \tR_max {:.2f} \tR_avg {:.2f}'.format(log_np['min_reward'], log_np['max_reward'], log_np['avg_reward']))
            # plot_pca_proj(iter_dataset_np, advantages_np, policy_np)
        print('new sigma', args.fixed_sigma)
        plot_rewards_history(tot_steps_np, avg_rewards_np)
        store_avg_rewards(tot_steps_np[-1], avg_rewards_np[-1], np_file.replace(str(args.seed)+'.csv', 'avg'+str(args.seed)+'.csv'))

        if tot_steps_np[-1] > args.tot_steps or log_np['avg_reward'] < -5000:
            break
        """clean up gpu memory"""
        torch.cuda.empty_cache()
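
# A minimal sketch of the value-baseline branch above, using hypothetical helpers (not
# the repo's critic_estimate / update_critic): the critic predicts V(s), advantages are
# the discounted returns minus that prediction, and the critic is regressed onto the
# returns with an MSE loss.
import torch
import torch.nn.functional as F

def critic_estimate_sketch(value_net, state_list, disc_rew_list):
    advantages, returns = [], []
    with torch.no_grad():
        for states, rew in zip(state_list, disc_rew_list):
            values = value_net(states).squeeze(-1)
            returns.append(rew)
            advantages.append(rew - values)
    return advantages, returns

def update_critic_sketch(value_net, states, targets, lr=1e-3, epochs=5):
    optimizer = torch.optim.Adam(value_net.parameters(), lr=lr)
    for _ in range(epochs):
        optimizer.zero_grad()
        loss = F.mse_loss(value_net(states).squeeze(-1), targets)
        loss.backward()
        optimizer.step()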
Example #3
def main_loop(improved_context_list):
    colors = []
    num_episodes = args.num_ensembles
    for i in range(num_episodes):
        colors.append('#%06X' % randint(0, 0xFFFFFF))

    for i_iter in range(args.max_iter_num):
        print('sampling episodes')  # (1)
        # generate multiple trajectories that reach the minimum batch_size
        batch, log = agent.collect_episodes(improved_context_list,
                                            render=(i_iter % 10 == 0))

        disc_rew = discounted_rewards(batch.memory, args.gamma)
        complete_dataset = BaseDataset(batch.memory,
                                       disc_rew,
                                       args.device_np,
                                       args.dtype,
                                       max_len=max_episode_len)
        advantages = estimate_v_a(complete_dataset, disc_rew)

        t0 = time.time()
        improved_context_list = improvement_step_all(complete_dataset,
                                                     advantages)
        t1 = time.time()

        # create training set
        tn0 = time.time()
        replay_memory.add(complete_dataset)
        train_np(replay_memory)
        tn1 = time.time()

        tv0 = time.time()
        if i_iter % args.plot_every == 0:
            # plot_initial_context(improved_context_list, colors, env, args, i_iter)
            # plot_training_set(i_iter, replay_memory, env, args)
            plot_policy(model, improved_context_list, replay_memory, i_iter,
                        log['avg_reward'], env, args, colors)
            plot_improvements(complete_dataset, disc_rew, env, i_iter, args,
                              colors)
        tv1 = time.time()
        tot_steps.append(tot_steps[-1] + log['num_steps'])
        avg_rewards.append(log['avg_reward'])
        if i_iter % args.log_interval == 0:
            print(
                '{}\tT_sample {:.4f} \tT_update {:.4f} \tR_min {:.2f} \tR_max {:.2f} \tR_avg {:.2f}'
                .format(i_iter, log['sample_time'], t1 - t0, log['min_reward'],
                        log['max_reward'], log['avg_reward']))
            print('Training:  \tT_policy {:.2f}  \nT_plots {:.2f}'.format(
                tn1 - tn0, tv1 - tv0))

        if i_iter % args.plot_every == 0:
            plot_rewards_history(avg_rewards, tot_steps, args)
    plot_rewards_history(avg_rewards, tot_steps, args)
    """clean up gpu memory"""
    torch.cuda.empty_cache()
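
# A minimal sketch of a plot_rewards_history-style helper (hypothetical signature,
# consistent with the matplotlib usage in the later examples): plot average reward
# against total environment steps and save the figure.
import matplotlib.pyplot as plt

def plot_rewards_history_sketch(tot_steps, avg_rewards, out_path='avg_rewards.png'):
    fig, ax = plt.subplots(1, 1)
    ax.plot(tot_steps, avg_rewards)
    ax.set_xlabel('total steps')
    ax.set_ylabel('average reward')
    fig.savefig(out_path)
    plt.close(fig)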
Example #4
def main_loop():
    improved_context = sample_initial_context()
    avg_rewards = []
    for i_iter in range(args.max_iter_num):
        print('sampling episodes')
        # (1)
        # generate multiple trajectories that reach the minimum batch_size
        # introduce param context=None when np is policy, these will be the context points used to predict
        policy_np.training = False
        batch, log, memory = agent.collect_samples(args.min_batch_size, context=improved_context)  # batch of batch_size transitions from multiple
        print(log['num_steps'], log['num_episodes'])                     # episodes (separated by mask=0). Stored in Memory

        disc_rew = discounted_rewards(batch, args.gamma)
        complete_dataset = BaseDataset(batch, disc_rew, args.device_np, args.dtype)
        value_replay_memory.add(complete_dataset)
        train_value_np(value_replay_memory)

        estimated_disc_rew, values_stdevs = estimate_disc_rew(complete_dataset, i_iter)
        memory.set_disc_rew(estimated_disc_rew)

        t0 = time.time()
        all_improved_context_0 = improvement_step(batch)
        all_improved_context = imporove_mean_stdv(complete_dataset, estimated_disc_rew, values_stdevs)
        t1 = time.time()
        key = 'means' if args.improve_mean else 'actions'
        improved_context = [all_improved_context['states'], all_improved_context[key]]

        # plot improved context and actions' discounted rewards
        plot_improvements(batch, improved_context, env, i_iter, args)

        # create training set
        training_set = all_improved_context['means']
        frac_action_in_training = int(frac_replace_actions * training_set.shape[1])
        training_set[:, :frac_action_in_training, :] = all_improved_context['actions'][:, :frac_action_in_training, :]

        dataset = MemoryDatasetNP(batch, training_set, args.device_np, args.dtype, max_len=999)
        replay_memory.add(dataset)

        plot_training_set(i_iter, replay_memory, env, args)

        print('replay memory size:', len(replay_memory))
        train_np(replay_memory)

        plot_NP_policy(policy_np, improved_context, i_iter, log['avg_reward'], env, args, num_samples=1)

        avg_rewards.append(log['avg_reward'])
        if i_iter % args.log_interval == 0:
            print('{}\tT_sample {:.4f}\tT_update {:.4f}\tR_min {:.2f}\tR_max {:.2f}\tR_avg {:.2f}'.format(
                  i_iter, log['sample_time'], t1 - t0, log['min_reward'], log['max_reward'], log['avg_reward']))

    plot_rewards_history(avg_rewards, args)

    """clean up gpu memory"""
    torch.cuda.empty_cache()
Example #5
def main_loop():

    for i_iter in range(args.max_iter_num):
        # collect samples
        batch, log = agent.collect_episodes(args.num_req_steps)

        # compute discounted rewards
        disc_rew_mlp = discounted_rewards(batch.memory, args.gamma)
        iter_dataset = BaseDataset(batch.memory,
                                   disc_rew_mlp,
                                   args.device_np,
                                   args.dtype,
                                   max_len=max_episode_len)

        # estimate advantages
        if args.value_net:
            state_list = [ep['states'][:ep['real_len']] for ep in iter_dataset]
            advantages, returns = critic_estimate(value_net, state_list,
                                                  disc_rew_mlp, args)
            update_critic(value_net, torch.cat(state_list, dim=0),
                          torch.cat(returns, dim=0))
        else:
            advantages = estimate_v_a(iter_dataset, disc_rew_mlp,
                                      value_replay_memory, value_net, args)
            value_replay_memory.add(iter_dataset)
            train_value(value_replay_memory)

        # returned context not used but added to iter_dataset inside the function
        improved_context_list = improvement_step_all(iter_dataset, advantages,
                                                     args.max_kl_mlp, args)

        # training
        replay_memory.add(iter_dataset)
        train_policy(replay_memory)

        # prints & plots
        tot_steps.append(tot_steps[-1] + log['num_steps'])
        avg_rewards.append(log['avg_reward'])

        if i_iter % args.log_interval == 0:
            print(i_iter)
            print('mlp: \tR_min {:.2f} \tR_max {:.2f} \tR_avg {:.2f}'.format(
                log['min_reward'], log['max_reward'], log['avg_reward']))
        print('new sigma', args.fixed_sigma)
        store_avg_rewards(
            tot_steps[-1], avg_rewards[-1],
            mlp_file.replace(
                str(args.seed) + '.csv', 'avg' + str(args.seed) + '.csv'))
        if i_iter % args.plot_every == 0:
            plot_rewards_history(tot_steps, avg_rewards)
        if tot_steps[-1] > args.tot_steps:
            break
    """clean up gpu memory"""
    torch.cuda.empty_cache()
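
# A minimal sketch of a store_avg_rewards-style helper (hypothetical, matching how it is
# called above): append one (total steps, average reward) row to the given CSV file.
import csv

def store_avg_rewards_sketch(tot_steps, avg_reward, csv_path):
    with open(csv_path, 'a', newline='') as f:
        csv.writer(f).writerow([int(tot_steps), float(avg_reward)])
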
def main_loop():
    colors = []
    num_episodes = args.num_ensembles
    for i in range(num_episodes):
        colors.append('#%06X' % randint(0, 0xFFFFFF))
    #print('sampling initial context')
    if args.init_normal:
        improved_context_list = sample_initial_context_normal(args.num_ensembles)
    else:
        improved_context_list = sample_initial_context_uniform(args.num_ensembles)
    plot_initial_context(improved_context_list, colors, env, args, '00')
    if initial_training:
        train_on_initial(improved_context_list)
    for i_iter in range(args.max_iter_num):
        print('sampling episodes')
        # (1)
        # generate multiple trajectories that reach the minimum batch_size
        # introduce param context=None when np is policy, these will be the context points used to predict
        policy_np.training = False
        batch, log = agent.collect_episodes(improved_context_list)  # batch of batch_size transitions from multiple
        #print(log['num_steps'], log['num_episodes'])                # episodes (separated by mask=0). Stored in Memory

        disc_rew = discounted_rewards(batch.memory, args.gamma)
        complete_dataset = BaseDataset(batch.memory, disc_rew, args.device_np, args.dtype,  max_len=max_episode_len)

        value_replay_memory.add(complete_dataset)
        train_value_np(value_replay_memory)
        estimated_disc_rew, values_stdevs = estimate_disc_rew(complete_dataset, i_iter, episode_specific_value=args.episode_specific_value)

        t0 = time.time()
        improved_context_list = improvement_step(complete_dataset, estimated_disc_rew, values_stdevs)
        t1 = time.time()
        #plot_initial_context(improved_context_list, colors, env, args, i_iter)
        # plot improved context and actions' discounted rewards
        if i_iter % args.plot_every == 0:
            plot_improvements(complete_dataset, estimated_disc_rew, env, i_iter, args, colors)

        # create training set
        replay_memory.add(complete_dataset)
        train_np(replay_memory)
        #plot_training_set(i_iter, replay_memory, env, args)
        if i_iter % args.plot_every == 0:
            plot_NP_policy(policy_np, improved_context_list, i_iter, log['avg_reward'], env, args, colors)

        avg_rewards.append(log['avg_reward'])
        if i_iter % args.log_interval == 0 and False:  # logging disabled: 'and False' makes this branch unreachable
            print('{}\tT_sample {:.4f}\tT_update {:.4f}\tR_min {:.2f}\tR_max {:.2f}\tR_avg {:.2f}'.format(
                  i_iter, log['sample_time'], t1 - t0, log['min_reward'], log['max_reward'], log['avg_reward']))

    plot_rewards_history(avg_rewards, args)

    """clean up gpu memory"""
    torch.cuda.empty_cache()
Example #7
def main_loop():
    for i_iter in range(args.max_iter_num):
        # (1)
        # generate multiple trajectories that reach the minimum batch_size
        # introduce param context=None when np is policy, these will be the context points used to predict
        batch, log, memory = agent.collect_samples(
            args.min_batch_size
        )  # batch of batch_size transitions from multiple
        print(log['num_steps'], log['num_episodes']
              )  # episodes (separated by mask=0). Stored in Memory
        disc_rew = discounted_rewards(batch, args.gamma)
        memory.set_disc_rew(disc_rew)
        complete_dataset = BaseDataset(batch,
                                       disc_rew,
                                       args.device,
                                       dtype,
                                       max_len=max_episode_len)

        t0 = time.time()
        update_params_trpo(
            batch, i_iter
        )  # estimate advantages from samples and update policy by TRPO step
        t1 = time.time()
        plot_policy(policy_net, (i_iter, log['avg_reward'], 'policies'))

        if i_iter % args.log_interval == 0:
            print(
                '{}\tT_sample {:.4f}\tT_update {:.4f}\tR_min {:.2f}\tR_max {:.2f}\tR_avg {:.2f}'
                .format(i_iter, log['sample_time'], t1 - t0, log['min_reward'],
                        log['max_reward'], log['avg_reward']))

        if not args.episode_specific_value:
            iter_dataset = {}
            iter_states, iter_q = merge_padded_lists(
                [episode['states'] for episode in complete_dataset], [
                    episode['discounted_rewards']
                    for episode in complete_dataset
                ],
                max_lens=[episode['real_len'] for episode in complete_dataset])
            iter_dataset['states'] = iter_states
            iter_dataset['discounted_rewards'] = iter_q
            iter_dataset['real_len'] = iter_states.shape[-2]
            complete_dataset = [iter_dataset]
        value_replay_memory.add(complete_dataset)
        train_value_np(value_replay_memory)
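
# A minimal sketch of a merge_padded_lists-style helper as it is used above (an
# assumption: each per-episode tensor is padded to max_len along the time axis):
# truncate every episode to its real length and concatenate into a single sequence with
# a leading batch dimension, so one value dataset covers the whole iteration.
import torch

def merge_padded_lists_sketch(state_list, reward_list, max_lens):
    states = torch.cat([s[:l] for s, l in zip(state_list, max_lens)], dim=0)
    rewards = torch.cat([r[:l] for r, l in zip(reward_list, max_lens)], dim=0)
    return states.unsqueeze(0), rewards.unsqueeze(0)
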
def main_loop():
    colors = []
    num_episodes = args.num_ensembles
    for i in range(num_episodes):
        colors.append('#%06X' % randint(0, 0xFFFFFF))

    improved_context_list = sample_initial_context_normal(args.num_ensembles)
    if initial_training:
        train_on_initial(improved_context_list)
    for i_iter in range(args.max_iter_num):
        if tot_steps_trpo[-1] - tot_steps_np[-1] < 1000:
            batch_trpo, log_trpo, memory_trpo = agent_trpo.collect_samples(args.min_batch_size)  # batch of batch_size transitions from multiple
            update_params_trpo(batch_trpo)  # generate multiple trajectories that reach the minimum batch_size
            tot_steps_trpo.append(tot_steps_trpo[-1] + log_trpo['num_steps'])
            avg_rewards_trpo.append(log_trpo['avg_reward'])

        # generate multiple trajectories that reach the minimum batch_size
        policy_np.training = False
        batch, log = agent.collect_episodes(improved_context_list)  # batch of batch_size transitions from multiple
        #print(log['num_steps'], log['num_episodes'])                # episodes (separated by mask=0). Stored in Memory

        disc_rew = discounted_rewards(batch.memory, args.gamma)
        complete_dataset = BaseDataset(batch.memory, disc_rew, args.device_np, args.dtype,  max_len=max_episode_len)

        value_replay_memory.add(complete_dataset)

        advantages = estimate_v_a(complete_dataset, disc_rew)

        tv0 = time.time()
        train_value_np(value_replay_memory)
        tv1 = time.time()

        t0 = time.time()
        improved_context_list = improvement_step_all(complete_dataset, advantages)
        t1 = time.time()
        #plot_initial_context(improved_context_list, colors, env, args, i_iter)

        # create training set
        tn0 = time.time()
        replay_memory.add(complete_dataset)
        train_np(replay_memory)
        tn1 = time.time()
        tot_steps_np.append(tot_steps_np[-1] + log['num_steps'])

        #plot_training_set(i_iter, replay_memory, env, args)
        if i_iter % args.plot_every == 0 and False:  # plotting disabled by the 'and False'
            plot_NP_policy(policy_np, improved_context_list, replay_memory, i_iter, log['avg_reward'], env, args, colors)
            plot_improvements(complete_dataset, advantages, env, i_iter, args, colors)

        avg_rewards_np.append(log['avg_reward'])
        if i_iter % args.log_interval == 0:
            print('{}\tT_sample {:.4f} \tT_update {:.4f} \tR_min {:.2f} \tR_max {:.2f} \tR_avg {:.2f}'.format(
                  i_iter, log['sample_time'], t1 - t0, log['min_reward'], log['max_reward'], log['avg_reward']))
            print('Training:  \tT_policy {:.2f}  \tT_value {:.2f}'.format(tn1-tn0, tv1-tv0))

            if i_iter % args.plot_every == 0:
                plot_rewards_history(trpo=[tot_steps_trpo, avg_rewards_trpo], mi=[tot_steps_np, avg_rewards_np],
                                     args=args)
        plot_rewards_history(trpo=[tot_steps_trpo, avg_rewards_trpo], mi=[tot_steps_np, avg_rewards_np], args=args)
        # anneal the trust-region size and exploration noise each iteration
        args.max_kl = args.max_kl*0.99
        args.fixed_sigma = args.fixed_sigma*0.96
    """clean up gpu memory"""
    torch.cuda.empty_cache()
Example #9
def main_loop():
    improved_context_list_mi = sample_initial_context_normal(env)
    for i_iter in range(args.max_iter_num):

        # define context set
        if len(replay_memory_mi) == 0 or not args.rm_as_context:
            context_list_np = improved_context_list_mi
        else:
            context_list_np = replay_memory_mi.data

        # collect samples
        batch_mi, log_mi = agent_mi.collect_episodes(context_list_np,
                                                     args.num_req_steps,
                                                     args.num_ensembles)

        # compute discounted rewards
        disc_rew_mi = discounted_rewards(batch_mi.memory, args.gamma)
        iter_dataset_mi = BaseDataset(batch_mi.memory,
                                      disc_rew_mi,
                                      args.device_np,
                                      args.dtype,
                                      max_len=max_episode_len)

        # estimate advantages
        if args.value_net:
            state_list = [
                ep['states'][:ep['real_len']] for ep in iter_dataset_mi
            ]
            advantages_mi, returns = critic_estimate(value_net, state_list,
                                                     disc_rew_mi, args)
            update_critic(value_net, torch.cat(state_list, dim=0),
                          torch.cat(returns, dim=0))
        else:
            advantages_mi = estimate_v_a(iter_dataset_mi, disc_rew_mi,
                                         value_replay_memory, model, args)
            value_replay_memory.add(iter_dataset_mi)

        # update step
        improved_context_list_mi = improvement_step_all(
            iter_dataset_mi, advantages_mi, args.max_kl_mi, args)

        # training
        replay_memory_mi.add(iter_dataset_mi)
        train_mi(replay_memory_mi)

        # prints & plots
        tot_steps_mi.append(tot_steps_mi[-1] + log_mi['num_steps'])
        avg_rewards_mi.append(log_mi['avg_reward'])
        if i_iter % 1 == 0:
            plot_pca_proj(iter_dataset_mi, advantages_mi, model)
        if i_iter % args.log_interval == 0:
            print(i_iter)
            print('mi: \tR_min {:.2f} \tR_max {:.2f} \tR_avg {:.2f}'.format(
                log_mi['min_reward'], log_mi['max_reward'],
                log_mi['avg_reward']))
        print('new sigma', args.fixed_sigma)
        store_avg_rewards(
            tot_steps_mi[-1], log_mi['avg_reward'],
            mi_file.replace(
                str(args.seed) + '.csv', 'avg' + str(args.seed) + '.csv'))
        if i_iter % args.plot_every == 0:
            plot_rewards_history(tot_steps_mi, avg_rewards_mi)
        if tot_steps_mi[-1] > args.tot_steps:
            break
    """clean up gpu memory"""
    torch.cuda.empty_cache()
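
# A minimal sketch of a plot_pca_proj-style diagnostic (hypothetical signature, not the
# repo's function): project the visited states onto their first two principal components
# and colour the points by advantage.
import matplotlib.pyplot as plt
import torch

def plot_pca_proj_sketch(state_list, advantage_list, out_path='pca_proj.png'):
    states = torch.cat(state_list, dim=0).detach()            # [N, state_dim]
    adv = torch.cat(advantage_list, dim=0).detach().flatten()  # [N]
    centered = states - states.mean(dim=0, keepdim=True)
    _, _, v = torch.pca_lowrank(centered, q=2)
    proj = (centered @ v).cpu().numpy()                        # [N, 2]
    fig, ax = plt.subplots(1, 1)
    sc = ax.scatter(proj[:, 0], proj[:, 1], c=adv.cpu().numpy(), s=5)
    fig.colorbar(sc, ax=ax, label='advantage')
    fig.savefig(out_path)
    plt.close(fig)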
Example #10
def main_loop(improved_context_list):
    colors = []
    num_episodes = args.num_ensembles
    for i in range(num_episodes):
        colors.append('#%06X' % randint(0, 0xFFFFFF))

    for i_iter in range(args.max_iter_num):
        if tot_steps_trpo[-1] - tot_steps_mi[-1] < 1000:
            batch_trpo, log_trpo, memory_trpo = agent_trpo.collect_samples(
                args.min_batch_size
            )  # batch of batch_size transitions from multiple
            update_params_trpo(
                batch_trpo
            )  # generate multiple trajectories that reach the minimum batch_size
            tot_steps_trpo.append(tot_steps_trpo[-1] + log_trpo['num_steps'])
            avg_rewards_trpo.append(log_trpo['avg_reward'])

        batch_mi, log_mi = agent_mi.collect_episodes(improved_context_list,
                                                     render=(i_iter % 10 == 0))
        disc_rew = discounted_rewards(batch_mi.memory, args.gamma)
        complete_dataset = BaseDataset(batch_mi.memory,
                                       disc_rew,
                                       args.device_np,
                                       args.dtype,
                                       max_len=max_episode_len)
        advantages = estimate_v_a(complete_dataset, disc_rew)

        t0 = time.time()
        improved_context_list = improvement_step_all(complete_dataset,
                                                     advantages)
        t1 = time.time()

        # create training set
        tn0 = time.time()
        replay_memory.add(complete_dataset)
        train_np(replay_memory)
        tn1 = time.time()

        tv0 = time.time()
        if False and i_iter % args.plot_every == 0:  # plotting disabled by the leading False
            # plot_initial_context(improved_context_list, colors, env, args, i_iter)
            # plot_training_set(i_iter, replay_memory, env, args)
            # plot_policy(model, improved_context_list, replay_memory, i_iter, log['avg_reward'], env, args, colors)
            plot_improvements(complete_dataset, disc_rew, env, i_iter, args,
                              colors)
        tv1 = time.time()
        tot_steps_mi.append(tot_steps_mi[-1] + log_mi['num_steps'])
        avg_rewards_mi.append(log_mi['avg_reward'])
        if i_iter % args.log_interval == 0:
            print(
                '{}\n R_min_trpo {:.2f} \tR_max_trpo {:.2f} \tR_avg_trpo {:.2f}\nR_min_mi {:.2f} \tR_max_mi {:.2f} \tR_avg_mi {:.2f} '
                .format(i_iter, log_trpo['min_reward'], log_trpo['max_reward'],
                        log_trpo['avg_reward'], log_mi['min_reward'],
                        log_mi['max_reward'], log_mi['avg_reward']))

        if i_iter % args.plot_every == 0:
            plot_rewards_history(trpo=[tot_steps_trpo, avg_rewards_trpo],
                                 mi=[tot_steps_mi, avg_rewards_mi],
                                 args=args)
    plot_rewards_history(trpo=[tot_steps_trpo, avg_rewards_trpo],
                         mi=[tot_steps_mi, avg_rewards_mi],
                         args=args)
    """clean up gpu memory"""
    torch.cuda.empty_cache()
def main_loop():
    improved_context = sample_initial_context(args.min_batch_size, dtype=dtype)
    avg_rewards = []
    for i_iter in range(args.max_iter_num):
        print('sampling episodes')
        # (1)
        # generate multiple trajectories that reach the minimum batch_size
        # introduce param context=None when np is policy, these will be the context points used to predict
        policy_np.training = False
        batch, log, memory = agent.collect_samples(
            args.min_batch_size, context=improved_context
        )  # batch of batch_size transitions from multiple
        print(log['num_steps'], log['num_episodes']
              )  # episodes (separated by mask=0). Stored in Memory

        disc_rew = discounted_rewards(batch, args.gamma)

        memory.set_disc_rew(disc_rew)
        t0 = time.time()
        all_improved_context = improvement_step(batch)
        t1 = time.time()
        key = 'means' if improve_mean else 'actions'
        improved_context = [
            all_improved_context['states'], all_improved_context[key]
        ]

        # plot improved context and actions' discounted rewards
        plot_improvements(
            batch,
            [all_improved_context['states'], all_improved_context['means']],
            i_iter)

        # create training set
        training_set = all_improved_context['means']
        num_action_in_training = int(frac_replace_actions *
                                     training_set.shape[1])
        print('replacing {} means with actions'.format(num_action_in_training))
        training_set[:, :num_action_in_training, :] = all_improved_context[
            'actions'][:, :num_action_in_training, :]

        dataset = MemoryDatasetNP(batch,
                                  training_set,
                                  device_np,
                                  dtype,
                                  max_len=999)
        replay_memory.add(dataset)

        plot_training_set(i_iter)

        print('replay memory size:', len(replay_memory))
        train_np(replay_memory)

        plot_NP_policy(improved_context, i_iter, log['avg_reward'])

        avg_rewards.append(log['avg_reward'])
        if i_iter % args.log_interval == 0:
            print(
                '{}\tT_sample {:.4f}\tT_update {:.4f}\tR_min {:.2f}\tR_max {:.2f}\tR_avg {:.2f}'
                .format(i_iter, log['sample_time'], t1 - t0, log['min_reward'],
                        log['max_reward'], log['avg_reward']))

    fig_rew, ax_rew = plt.subplots(1, 1)
    ax_rew.plot(np.arange(len(avg_rewards)), avg_rewards)
    ax_rew.set_xlabel('iterations')
    ax_rew.set_ylabel('average reward')
    fig_rew.savefig(directory_path + '/average reward')
    plt.close(fig_rew)
    """clean up gpu memory"""
    torch.cuda.empty_cache()
Example #12
def main_loop():
    colors = []
    num_episodes = args.num_ensembles
    for i in range(num_episodes):
        colors.append('#%06X' % randint(0, 0xFFFFFF))

    improved_context_list = sample_initial_context_normal(args.num_ensembles)
    if initial_training:
        train_on_initial(improved_context_list)
    for i_iter in range(args.max_iter_num):
        print('sampling episodes')
        # (1)
        # generate multiple trajectories that reach the minimum batch_size
        policy_np.training = False
        batch, log = agent.collect_episodes(improved_context_list)  # batch of batch_size transitions from multiple
        #print(log['num_steps'], log['num_episodes'])                # episodes (separated by mask=0). Stored in Memory

        disc_rew = discounted_rewards(batch.memory, args.gamma)
        complete_dataset = BaseDataset(batch.memory, disc_rew, args.device_np, args.dtype,  max_len=max_episode_len)
        if not args.episode_specific_value:
            iter_dataset = {}
            iter_states, iter_q = merge_padded_lists([episode['states'] for episode in complete_dataset],
                                                     [episode['discounted_rewards'] for episode in complete_dataset],
                                                     max_lens=[episode['real_len'] for episode in complete_dataset])
            iter_dataset['states'] = iter_states
            iter_dataset['discounted_rewards'] = iter_q
            iter_dataset['real_len'] = iter_states.shape[-2]
            value_replay_memory.add([iter_dataset])
        else:
            value_replay_memory.add(complete_dataset)

        estimated_disc_rew, values_stdevs = estimate_disc_rew(complete_dataset, i_iter, episode_specific_value=args.episode_specific_value)

        tv0 = time.time()
        train_value_np(value_replay_memory)
        tv1 = time.time()

        t0 = time.time()
        improved_context_list = improvement_step_all(complete_dataset, estimated_disc_rew)
        t1 = time.time()
        #plot_initial_context(improved_context_list, colors, env, args, i_iter)
        # plot improved context and actions' discounted rewards
        #if i_iter % args.plot_every == 0:
        plot_improvements(complete_dataset, estimated_disc_rew, env, i_iter, args, colors)

        # create training set
        tn0 = time.time()
        replay_memory.add(complete_dataset)
        train_np(replay_memory)
        tn1 = time.time()

        #if i_iter % args.plot_every == 0:
        #   plot_NP_policy(policy_np, improved_context_list, replay_memory, i_iter, log['avg_reward'], env, args, colors)

        avg_rewards.append(log['avg_reward'])
        if i_iter % args.log_interval == 0:
            print('{}\tT_sample {:.4f} \tT_update {:.4f} \tR_min {:.2f} \tR_max {:.2f} \tR_avg {:.2f}'.format(
                  i_iter, log['sample_time'], t1 - t0, log['min_reward'], log['max_reward'], log['avg_reward']))
            print('Training:  \tT_policy {:.2f}  \tT_value {:.2f}'.format(tn1-tn0, tv1-tv0))
        if log['avg_reward'] > 195:
            print('converged')
            plot_rewards_history(avg_rewards, args)
        #if i_iter % args.plot_every == 0:
        plot_rewards_history(avg_rewards, args)
        #args.fixed_sigma = args.fixed_sigma * args.gamma
    plot_rewards_history(avg_rewards, args)

    """clean up gpu memory"""
    torch.cuda.empty_cache()
Example #13
def main_loop():
    colors = []
    num_episodes = args.num_ensembles
    for i in range(num_episodes):
        colors.append('#%06X' % randint(0, 0xFFFFFF))

    improved_context_list = sample_initial_context_normal(args.num_ensembles)
    plot_initial_context(improved_context_list, colors, env, args, '00')
    if initial_training:
        train_on_initial(improved_context_list)
    for i_iter in range(args.max_iter_num):
        print('sampling episodes')
        # (1)
        # generate multiple trajectories that reach the minimum batch_size
        policy_np.training = False
        batch, log = agent.collect_episodes(
            improved_context_list, args.num_req_steps, args.num_ensembles
        )  # batch of batch_size transitions from multiple
        #print(log['num_steps'], log['num_episodes'])                # episodes (separated by mask=0). Stored in Memory

        estimated_disc_rew = discounted_rewards(batch.memory, args.gamma)
        complete_dataset = BaseDataset(batch.memory,
                                       estimated_disc_rew,
                                       args.device_np,
                                       args.dtype,
                                       max_len=max_episode_len)

        t0 = time.time()
        improved_context_list = improvement_step_all(complete_dataset,
                                                     estimated_disc_rew)
        t1 = time.time()
        #plot_initial_context(improved_context_list, colors, env, args, i_iter)
        # plot improved context and actions' discounted rewards
        if i_iter % args.plot_every == 0:
            plot_improvements(complete_dataset, estimated_disc_rew, env,
                              i_iter, args, colors)

        # create training set
        tn0 = time.time()
        replay_memory.add(complete_dataset)
        train_np(replay_memory)
        tn1 = time.time()

        #plot_training_set(i_iter, replay_memory, env, args)
        if i_iter % args.plot_every == 0:
            plot_NP_policy(policy_np, improved_context_list, replay_memory,
                           i_iter, log['avg_reward'], env, args, colors)

        avg_rewards.append(log['avg_reward'])
        tot_steps.append(tot_steps[-1] + log['num_steps'])
        if i_iter % args.log_interval == 0:
            print(
                '{}\tT_sample {:.4f} \tT_update {:.4f} \tR_min {:.2f} \tR_max {:.2f} \tR_avg {:.2f}'
                .format(i_iter, log['sample_time'], t1 - t0, log['min_reward'],
                        log['max_reward'], log['avg_reward']))
            print('Training:  \tT_policy {:.2f}'.format(tn1 - tn0))
        if log['avg_reward'] > 95:
            print('converged')
            plot_rewards_history(avg_rewards, args)
        if i_iter % args.plot_every == 0:
            plot_rewards_history(avg_rewards, tot_steps, args)
            #if args.pick_context:
            plot_chosen_context(improved_context_list, args.num_context,
                                i_iter, args, env)
            plot_all_training_set(i_iter, replay_memory, env, args)
        if args.fixed_sigma is not None:
            args.fixed_sigma = args.fixed_sigma * args.gamma
    plot_rewards_history(avg_rewards, args)
    """clean up gpu memory"""
    torch.cuda.empty_cache()
Example #14
    model = MultiLayerPerceptron(state_dim, action_dim,
                                 512).to(device).double()

    agent = AgentMLP(env, model, num_epi, device, fixed_sigma=args.fixed_sigma)

    optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)

    model_trainer = MLPTrainer(device, model, optimizer, print_freq=50)
    replay_memory = ReplayMemoryDataset(20)
    tot_steps = [0]
    avg_rewards = [0]
    for i_iter in range(500):
        batch, log = agent.collect_episodes(
        )  # batch of batch_size transitions from multiple

        disc_rew = discounted_rewards(batch.memory, 0.999)
        complete_dataset = BaseDataset(batch.memory,
                                       disc_rew,
                                       device,
                                       torch.float64,
                                       max_len=max_episode_len)
        print('average reward at', i_iter, log['avg_reward'].item())
        t0 = time.time()
        improved_context_list_mi = improvement_step_all(
            complete_dataset, disc_rew, 0.01, args)
        t1 = time.time()

        # create training set
        tn0 = time.time()
        replay_memory.add(complete_dataset)
        data_loader = DataLoader(replay_memory, batch_size=1, shuffle=True)
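
# A minimal sketch of a bounded replay memory like the ReplayMemoryDataset used above
# (an assumption, not the repo's class): keep the episodes of the most recent `size`
# iteration datasets and expose them as a flat torch Dataset the DataLoader can sample.
from collections import deque
from torch.utils.data import Dataset

class ReplayMemorySketch(Dataset):
    def __init__(self, size):
        self.buffer = deque(maxlen=size)   # one entry per training iteration

    def add(self, iter_dataset):
        self.buffer.append(list(iter_dataset))

    @property
    def data(self):
        return [episode for iteration in self.buffer for episode in iteration]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]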
Example #15
def main_loop():
    colors = []
    num_episodes = args.num_ensembles
    for i in range(num_episodes):
        colors.append('#%06X' % randint(0, 0xFFFFFF))
    improved_context_list_np = sample_initial_context_normal(
        args.num_ensembles)
    for i_iter in range(args.max_iter_num):

        # generate multiple trajectories that reach the minimum batch_size
        policy_np.training = False
        if len(replay_memory) == 0 or not args.rm_as_context:
            context_list_np = improved_context_list_np
        else:
            context_list_np = replay_memory.data
        ts0 = time.time()
        batch_np, log_np = agent_np.collect_episodes(context_list_np,
                                                     args.num_req_steps,
                                                     args.num_ensembles)
        print('sampling:', time.time() - ts0)
        disc_rew_np = discounted_rewards(batch_np.memory, args.gamma)
        iter_dataset_np = BaseDataset(batch_np.memory,
                                      disc_rew_np,
                                      args.device_np,
                                      args.dtype,
                                      max_len=max_episode_len)
        print('np avg actions: ', log_np['action_mean'])
        if args.value_net:
            state_list = [
                ep['states'][:ep['real_len']] for ep in iter_dataset_np
            ]
            advantages_np, returns = critic_estimate(state_list, disc_rew_np,
                                                     args)
            update_critic(torch.cat(state_list, dim=0),
                          torch.cat(returns, dim=0))
        else:
            advantages_np = estimate_v_a(iter_dataset_np, disc_rew_np,
                                         value_replay_memory, value_np, args)
            value_replay_memory.add(iter_dataset_np)
            train_value_np(value_replay_memory)

        improved_context_list_np = improvement_step_all(
            iter_dataset_np, advantages_np, args.max_kl_np, args)
        # training
        tn0 = time.time()
        replay_memory.add(iter_dataset_np)
        train_np(replay_memory)
        tn1 = time.time()
        tot_steps_np.append(tot_steps_np[-1] + log_np['num_steps'])
        avg_rewards_np.append(log_np['avg_reward'])
        if i_iter % args.plot_every in [0, 1]:
            if 'CartPole' in args.env_name:
                plot_NP_policy_CP(policy_np,
                                  replay_memory,
                                  i_iter,
                                  env,
                                  args,
                                  use_np_sigma=args.plot_np_sigma)
                plot_rm(replay_memory, i_iter, args)
                plot_improvements_CP(iter_dataset_np, advantages_np, env,
                                     i_iter, args, colors)
            elif 'MountainCar' in args.env_name:
                plot_NP_policy_MC(policy_np,
                                  replay_memory,
                                  i_iter,
                                  env,
                                  args,
                                  use_np_sigma=args.plot_np_sigma)
                plot_improvements_MC(iter_dataset_np, advantages_np, env,
                                     i_iter, args, colors)
                plot_improvements_MC_all(iter_dataset_np, advantages_np, env,
                                         i_iter, args, colors)

        if i_iter % args.log_interval == 0:
            print(i_iter)
            print('np: \tR_min {:.2f} \tR_max {:.2f} \tR_avg {:.2f}'.format(
                log_np['min_reward'], log_np['max_reward'],
                log_np['avg_reward']))
        print('new sigma', args.fixed_sigma)
        plot_rewards_history(tot_steps_np, avg_rewards_np)
        store_avg_rewards(
            tot_steps_np[-1], avg_rewards_np[-1],
            np_file.replace(
                str(args.seed) + '.csv', 'avg' + str(args.seed) + '.csv'))
        if args.fixed_sigma is not None:
            sigma_history.append(torch.tensor(args.fixed_sigma))
        else:
            sigma_history.append(
                torch.cat([ep['stddevs']
                           for ep in iter_dataset_np.data]).mean(dim=0))
        plot_sigma_history(sigma_history)
        if tot_steps_np[-1] > args.tot_steps:
            break
        """clean up gpu memory"""
        torch.cuda.empty_cache()
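
# A minimal sketch of a plot_sigma_history-style helper for the sigma_history list built
# above (hypothetical signature): stack the per-iteration sigma tensors and plot each
# action dimension over training iterations.
import matplotlib.pyplot as plt
import torch

def plot_sigma_history_sketch(sigma_history, out_path='sigma_history.png'):
    sigmas = torch.stack([s.detach().flatten() for s in sigma_history])  # [iters, dims]
    fig, ax = plt.subplots(1, 1)
    for d in range(sigmas.shape[1]):
        ax.plot(sigmas[:, d].cpu().numpy(), label='dim {}'.format(d))
    ax.set_xlabel('iteration')
    ax.set_ylabel('sigma')
    ax.legend()
    fig.savefig(out_path)
    plt.close(fig)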
Example #16
def main_loop():
    colors = []
    num_episodes = args.num_ensembles
    for i in range(num_episodes):
        colors.append('#%06X' % randint(0, 0xFFFFFF))
    improved_context_list_np = sample_initial_context_normal(
        args.num_ensembles)
    improved_context_list_mi = improved_context_list_np
    if args.use_np:
        if initial_training:
            train_on_initial(improved_context_list_np)
    for i_iter in range(args.max_iter_num):
        if args.use_trpo and tot_steps_trpo[
                -1] < args.tot_steps:  #  and tot_steps_trpo[-1] - max(tot_steps_mi[-1], tot_steps_np[-1]) < 1000:
            batch_trpo, log, memory_trpo = agent_trpo.collect_samples(
                args.min_batch_size
            )  # batch of batch_size transitions from multiple
            store_rewards_trpo(memory_trpo.memory, trpo_file)
            update_params_trpo(
                batch_trpo
            )  # generate multiple trajectories that reach the minimum batch_size
            tot_steps_trpo.append(tot_steps_trpo[-1] + log['num_steps'])
            avg_rewards_trpo.append(log['avg_reward'])
            print('trpo avg actions: ', log['action_mean'])
        if args.use_np and tot_steps_np[-1] < args.tot_steps:
            # generate multiple trajectories that reach the minimum batch_size
            policy_np.training = False
            batch_np, log_np = agent_np.collect_episodes(
                improved_context_list_np
            )  # batch of batch_size transitions from multiple
            store_rewards(batch_np.memory, np_file)
            disc_rew_np = discounted_rewards(batch_np.memory, args.gamma)
            complete_dataset_np = BaseDataset(batch_np.memory,
                                              disc_rew_np,
                                              args.device_np,
                                              args.dtype,
                                              max_len=max_episode_len)
            print('np avg actions: ', log_np['action_mean'])
            advantages_np = estimate_v_a(complete_dataset_np, disc_rew_np)

            improved_context_list_np = improvement_step_all(
                complete_dataset_np, advantages_np, args.max_kl_np)
            # training
            value_replay_memory.add(complete_dataset_np)
            train_value_np(value_replay_memory)

            tn0 = time.time()
            replay_memory.add(complete_dataset_np)
            train_np(replay_memory)
            tn1 = time.time()
            tot_steps_np.append(tot_steps_np[-1] + log_np['num_steps'])
            avg_rewards_np.append(log_np['avg_reward'])

        if args.use_mi and tot_steps_mi[-1] < args.tot_steps:
            # generate multiple trajectories that reach the minimum batch_size
            batch_mi, log_mi = agent_mi.collect_episodes(
                improved_context_list_mi
            )  # batch of batch_size transitions from multiple
            store_rewards(batch_mi.memory, mi_file)
            #print(log['num_steps'], log['num_episodes'])                # episodes (separated by mask=0). Stored in Memory
            print('mi avg actions: ', log_mi['action_mean'])

            disc_rew_mi = discounted_rewards(batch_mi.memory, args.gamma)
            complete_dataset_mi = BaseDataset(batch_mi.memory,
                                              disc_rew_mi,
                                              args.device_np,
                                              args.dtype,
                                              max_len=max_episode_len)
            advantages_mi = estimate_v_a_mi(complete_dataset_mi, disc_rew_mi)

            t0 = time.time()
            improved_context_list_mi = improvement_step_all(
                complete_dataset_mi, advantages_mi, args.max_kl_mi)
            t1 = time.time()

            # create training set
            tn0 = time.time()
            replay_memory_mi.add(complete_dataset_mi)
            train_mi(replay_memory_mi)
            tn1 = time.time()
            tot_steps_mi.append(tot_steps_mi[-1] + log_mi['num_steps'])

            avg_rewards_mi.append(log_mi['avg_reward'].item())
        if i_iter % args.log_interval == 0:
            print(i_iter)
            if args.use_trpo:
                print('trpo: \tR_min {:.2f} \tR_max {:.2f} \tR_avg {:.2f}'.
                      format(log['min_reward'], log['max_reward'],
                             log['avg_reward']))
            if args.use_np:
                print(
                    'np: \tR_min {:.2f} \tR_max {:.2f} \tR_avg {:.2f}'.format(
                        log_np['min_reward'], log_np['max_reward'],
                        log_np['avg_reward']))
            if args.use_mi:
                print(
                    'mi: \tR_min {:.2f} \tR_max {:.2f} \tR_avg {:.2f}'.format(
                        log_mi['min_reward'], log_mi['max_reward'],
                        log_mi['avg_reward']))
        print(args.fixed_sigma)
        if i_iter % args.plot_every == 0:
            plot_rewards_history(
                [tot_steps_trpo, tot_steps_np, tot_steps_mi],
                [avg_rewards_trpo, avg_rewards_np, avg_rewards_mi])
    """clean up gpu memory"""
    torch.cuda.empty_cache()