# --- inverse for the accac agent/environment ---
import time
import policy_torch
from stable_baselines import TD3
from FireflyEnv import firefly_accac          # module path assumed, matching the other envs in this repo
from InverseFuncs import single_theta_inverse  # assumed, matching the other inverse scripts

arg.lr_gamma = 0.95
arg.PI_STD = 1
arg.goal_radius_range = [0.15, 0.3]
arg.std_range = [0.02, 0.3, 0.02, 0.3]
arg.TERMINAL_VEL = 0.025  # terminal velocity: norm(action) below this is treated as the stop signal
arg.DELTA_T = 0.2
arg.EPISODE_LEN = 35
number_updates = 100

# convert the stable-baselines agent to a torch model
baselines_mlp_model = TD3.load('trained_agent/accac_final_1000000_9_11_20_25.zip')
agent = policy_torch.copy_mlp_weights(baselines_mlp_model, layers=[512, 512], n_inputs=32)

# load the environment, same as in training
env = firefly_accac.FireflyAccAc(arg)

# --- set up the env for the inverse ---
# TODO: move this into a method of env
env.agent_knows_phi = False

for i in range(10):
    filename = (str(time.localtime().tm_mday) + '_'
                + str(time.localtime().tm_hour) + '_'
                + str(time.localtime().tm_min))
    single_theta_inverse(arg, env, agent, filename,
                         number_updates=number_updates,
                         true_theta=None, phi=None, init_theta=None)
# --- inverse for the acc agent/environment ---
import time
import policy_torch
from stable_baselines import TD3
from FireflyEnv import firefly_acc             # module path assumed, matching the other envs in this repo
from InverseFuncs import single_theta_inverse  # assumed, matching the other inverse scripts

arg.NUM_thetas = 1
arg.ADAM_LR = 0.1
arg.LR_STEP = 2
arg.LR_STOP = 0.003
arg.lr_gamma = 0.95
arg.PI_STD = 1
arg.goal_radius_range = [0.1, 0.3]
arg.TERMINAL_VEL = 0.025
number_updates = 100

# convert the stable-baselines agent to a torch model
baselines_mlp_model = TD3.load('trained_agent//acc_retrain_1000000_2_18_21_4.zip')
agent = policy_torch.copy_mlp_weights(baselines_mlp_model, layers=[128, 128], n_inputs=30)

# load the environment, same as in training
env = firefly_acc.FireflyAcc(arg)

# --- set up the env for the inverse ---
# TODO: move this into a method of env
env.agent_knows_phi = False

for i in range(10):
    filename = ("test_acc_EP" + str(arg.NUM_EP)
                + "updates" + str(number_updates)
                + "lr" + str(arg.ADAM_LR)
                + 'step' + str(arg.LR_STEP)
                + str(time.localtime().tm_mday) + '_'
                + str(time.localtime().tm_hour) + '_'
                + str(time.localtime().tm_min))
    # remaining arguments assumed to mirror the accac script above
    single_theta_inverse(arg, env, agent, filename,
                         number_updates=number_updates,
                         true_theta=None, phi=None, init_theta=None)
import numpy as np
from numpy import pi
import matplotlib.pyplot as plt
import torch.nn as nn
import policy_torch
from stable_baselines import DDPG
from FireflyEnv import ffenv      # module path assumed, matching the other envs in this repo
from Config import Config

agent_name = "DDPG_selu_skip_96reward1000000_9 26 16 43"
num_episode = 20

arg = Config()
# arg.gains_range[0:2]=[0.9,0.91]
# arg.std_range=[0.02,0.03,0.02,0.03]
# arg.std_range=[0.0001,0.001,0.0001,0.001]
# arg.gains_range=[0.99,1.,0.99,1.]

env = ffenv.FireflyEnv(arg)

baselines_selu = DDPG.load(agent_name)
torch_model_selu = policy_torch.copy_mlp_weights(baselines_selu,
                                                 layers=[256, 256, 64, 32],
                                                 act_fn=nn.functional.selu)
torch_model_selu.name = 'selu'
# baselines_relu = DDPG.load("DDPG_theta")
# torch_model_relu = policy_torch.copy_mlp_weights(baselines_relu, layers=[32, 64])
# torch_model_relu.name = 'relu'
agent = torch_model_selu

# create saving vars
all_ep = []  # one entry per episode
for i in range(num_episode):
    ep_data = {}
    # (per-episode rollout continues here; see the collect_episode sketch below)
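
# A minimal rollout sketch (not the original loop body), assuming FireflyEnv
# follows the standard gym reset()/step() interface (it was trained with
# stable-baselines) and that the converted torch policy maps an observation
# tensor to an action tensor. Shapes and return types are assumptions here.
import torch

def collect_episode(agent, env):
    """Run one episode and return the observations, actions and rewards."""
    ep = {'obs': [], 'actions': [], 'rewards': []}
    obs = env.reset()
    done = False
    while not done:
        with torch.no_grad():
            action = agent(torch.as_tensor(obs, dtype=torch.float32))
        obs, reward, done, _ = env.step(action.numpy())
        ep['obs'].append(obs)
        ep['actions'].append(action.numpy())
        ep['rewards'].append(reward)
    return ep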
def load_policy(self):
    '''Load the pretrained stable-baselines DDPG policy and convert it to a
    torch model for use in the torch-based inverse code.'''
    sbpolicy = DDPG.load("DDPG_theta")  # trained for 100k steps, with std action noise
    # convert to a torch policy
    return policy_torch.copy_mlp_weights(sbpolicy)
def run_inverse(data=None, theta=None, filename=None):
    import os
    import warnings
    warnings.filterwarnings('ignore')
    from copy import copy
    import time
    import random
    # seed = time.time().as_integer_ratio()[0]
    seed = 0
    random.seed(seed)
    import torch
    torch.manual_seed(seed)
    import numpy as np
    np.random.seed(int(seed))
    from numpy import pi
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # ----------- inverse functions -------------
    from InverseFuncs import (trajectory, getLoss, reset_theta, theta_range,
                              reset_theta_log, single_inverse, single_theta_inverse)
    # --------- loading env and agent ----------
    from stable_baselines import DDPG, TD3
    from FireflyEnv import ffenv_new_cord
    from Config import Config
    arg = Config()

    DISCOUNT_FACTOR = 0.99
    arg.NUM_SAMPLES = 2
    arg.NUM_EP = 1000
    arg.NUM_IT = 2  # number of iterations for gradient descent
    arg.NUM_thetas = 1
    arg.ADAM_LR = 0.007
    arg.LR_STEP = 2
    arg.LR_STOP = 50
    arg.lr_gamma = 0.95
    arg.PI_STD = 1
    arg.goal_radius_range = [0.05, 0.2]

    # convert the stable-baselines agent to a torch model
    import policy_torch
    baselines_mlp_model = TD3.load('trained_agent//TD_95gamma_mc_smallgoal_500000_9_24_1_6.zip')
    agent = policy_torch.copy_mlp_weights(baselines_mlp_model, layers=[128, 128])

    # load the environment, same as in training
    env = ffenv_new_cord.FireflyAgentCenter(arg)
    env.agent_knows_phi = False

    true_theta_log = []
    true_loss_log = []
    true_loss_act_log = []
    true_loss_obs_log = []
    final_theta_log = []
    stderr_log = []
    result_log = []
    number_update = 100
    if data is None:
        save_dict = {'theta_estimations': []}
    else:
        save_dict = data

    # use several thetas for the inverse
    # (assumes `data` provides 'true_theta', 'phi' and an initial 'theta_estimations' entry)
    for num_thetas in range(arg.NUM_thetas):

        # make sure phi and the true theta stay the same
        true_theta = torch.Tensor(data['true_theta'])
        env.presist_phi = True
        env.reset(phi=true_theta, theta=true_theta)  # first test the teacher case where true_theta == phi
        theta = torch.Tensor(data['theta_estimations'][0])
        phi = torch.Tensor(data['phi'])

        save_dict['true_theta'] = true_theta.data.clone().tolist()
        save_dict['phi'] = true_theta.data.clone().tolist()
        save_dict['inital_theta'] = theta.data.clone().tolist()

        for num_update in range(number_update):
            states, actions, tasks = trajectory(
                agent, phi, true_theta, env, arg.NUM_EP)

            result = single_theta_inverse(true_theta, phi, arg, env, agent,
                                          states, actions, tasks,
                                          filename, num_thetas, initial_theta=theta)

            save_dict['theta_estimations'].append(result.tolist())
            if filename is None:
                savename = ('inverse_data/' + "EP" + str(arg.NUM_EP)
                            + "updates" + str(number_update)
                            + "sample" + str(arg.NUM_SAMPLES)
                            + "IT" + str(arg.NUM_IT) + '.pkl')
                torch.save(save_dict, savename)
            elif filename[-4:] == '.pkl':
                torch.save(save_dict, filename)
            else:
                torch.save(save_dict, (filename + '.pkl'))
            print(result)

    print('done')
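
# Example usage (a sketch): resume an inverse run from a previously saved
# result dictionary. The path below is a hypothetical placeholder; any dict
# saved by this function (keys 'true_theta', 'phi', 'theta_estimations')
# should work.
if __name__ == '__main__':
    import torch
    data = torch.load('inverse_data/previous_run.pkl')  # hypothetical file
    run_inverse(data=data, filename='resumed_run')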
from stable_baselines import DDPG
from FireflyEnv import ffenv_sigmoid  # module path assumed, matching the other envs in this repo

arg.DELTA_T = 0.1
arg.EPISODE_TIME = 1  # maximum duration of one episode; if the monkey cannot reach the firefly within this time, a new firefly appears
arg.EPISODE_LEN = int(arg.EPISODE_TIME / arg.DELTA_T)
arg.NUM_SAMPLES = 2
arg.NUM_EP = 50
arg.NUM_IT = 200  # number of iterations for gradient descent
arg.NUM_thetas = 10
arg.ADAM_LR = 0.2
arg.LR_STEP = 2
arg.LR_STOP = 50
arg.lr_gamma = 0.95

# convert the stable-baselines agent to a torch model
import policy_torch
baselines_mlp_model = DDPG.load("DDPG_theta")
agent = policy_torch.copy_mlp_weights(baselines_mlp_model)
# agent = baselines_mlp_model

env = ffenv_sigmoid.FireflyEnv(arg)
env.max_goal_radius = arg.goal_radius_range[1]  # use the largest goal radius in the range
env.box = arg.WORLD_SIZE
env.reset_theta = False

true_theta_log = []
true_loss_log = []
true_loss_act_log = []
true_loss_obs_log = []
final_theta_log = []
stderr_log = []
result_log = []