def run_job(args, save_dir=None):

    # Continue training from an existing iteration
    if args.continue_run > -1:
        save_dir = os.path.join(SCRIPT_DIR, args.continue_run_filepath)

    tf.reset_default_graph()
    with tf.Session(
            config=get_gpu_config(args.use_gpu, args.gpu_frac)) as sess:

        ##############################################
        ### initialize some commonly used parameters (from args)
        ##############################################

        env_name = args.env_name
        continue_run = args.continue_run
        K = args.K
        num_iters = args.num_iters
        num_trajectories_per_iter = args.num_trajectories_per_iter
        horizon = args.horizon

        ### set seeds
        npr.seed(args.seed)
        tf.set_random_seed(args.seed)

        #######################
        ### hardcoded args
        #######################

        ### data types
        args.tf_datatype = tf.float32
        args.np_datatype = np.float32

        ### supervised learning noise, added to the training dataset
        args.noiseToSignal = 0.01

        ### these are for *during* MPC rollouts,
        # they allow you to run the H-step candidate actions on the real dynamics
        # and compare the model's predicted outcomes vs. the true outcomes
        execute_sideRollouts = False
        plot_sideRollouts = True

        ########################################
        ### create loader, env, rand policy
        ########################################

        loader = Loader(save_dir)
        env, dt_from_xml = create_env(env_name)
        args.dt_from_xml = dt_from_xml
        random_policy = Policy_Random(env.env)

        #doing a render here somehow allows it to not produce a
        #seg fault error later when visualizing
        if args.visualize_MPC_rollout:
            render_env(env)
            render_stop(env)

        #################################################
        ### initialize or load in info
        #################################################

        #check for a variable which indicates that we should duplicate each data point
        #e.g., for baoding, since ballA/B are interchangeable, we store as 2 different points
        if 'duplicateData_switchObjs' in dir(env.unwrapped_env):
            duplicateData_switchObjs = True
            indices_for_switching = [
                env.unwrapped_env.objInfo_start1,
                env.unwrapped_env.objInfo_start2,
                env.unwrapped_env.targetInfo_start1,
                env.unwrapped_env.targetInfo_start2
            ]
        else:
            duplicateData_switchObjs = False
            indices_for_switching = []

        #initialize data processor
        data_processor = DataProcessor(args, duplicateData_switchObjs,
                                       indices_for_switching)

        #start a fresh run
        if continue_run == -1:

            #random training/validation data
            if args.load_existing_random_data:
                rollouts_trainRand, rollouts_valRand = loader.load_initialData()
            else:
                #training
                rollouts_trainRand = collect_random_rollouts(
                    env, random_policy, args.num_rand_rollouts_train,
                    args.rand_rollout_length, dt_from_xml, args)
                #validation
                rollouts_valRand = collect_random_rollouts(
                    env, random_policy, args.num_rand_rollouts_val,
                    args.rand_rollout_length, dt_from_xml, args)

            #convert (rollouts --> dataset)
            dataset_trainRand = data_processor.convertRolloutsToDatasets(
                rollouts_trainRand)
            dataset_valRand = data_processor.convertRolloutsToDatasets(
                rollouts_valRand)

            #onPol train/val data
            dataset_trainOnPol = Dataset()
            rollouts_trainOnPol = []
            rollouts_valOnPol = []

            #lists for saving
            trainingLoss_perIter = []
            rew_perIter = []
            scores_perIter = []
            trainingData_perIter = []

            #initialize counter
            counter = 0

        #continue from an existing run
        else:

            #load data
            iter_data = loader.load_iter(continue_run - 1)

            #random data
            rollouts_trainRand, rollouts_valRand = loader.load_initialData()

            #onPol data
            rollouts_trainOnPol = iter_data.train_rollouts_onPol
            rollouts_valOnPol = iter_data.val_rollouts_onPol

            #convert (rollouts --> dataset)
            dataset_trainRand = data_processor.convertRolloutsToDatasets(
                rollouts_trainRand)
            dataset_valRand = data_processor.convertRolloutsToDatasets(
                rollouts_valRand)

            #lists for saving
            trainingLoss_perIter = iter_data.training_losses
            rew_perIter = iter_data.rollouts_rewardsPerIter
            scores_perIter = iter_data.rollouts_scoresPerIter
            trainingData_perIter = iter_data.training_numData

            #initialize counter
            counter = continue_run

            #how many iters to train for
            num_iters += continue_run

        ### check data dims
        inputSize, outputSize, acSize = check_dims(dataset_trainRand, env)

        ### amount of data
        numData_train_rand = get_num_data(rollouts_trainRand)

        ##############################################
        ### dynamics model + controller
        ##############################################

        dyn_models = Dyn_Model(inputSize, outputSize, acSize, sess, params=args)

        mpc_rollout = MPCRollout(env, dyn_models, random_policy,
                                 execute_sideRollouts, plot_sideRollouts, args)

        ### init TF variables
        sess.run(tf.global_variables_initializer())

        ##############################################
        ### saver
        ##############################################

        saver = Saver(save_dir, sess)
        saver.save_initialData(args, rollouts_trainRand, rollouts_valRand)

        ##############################################
        ### THE MAIN LOOP
        ##############################################

        firstTime = True

        rollouts_info_prevIter, list_mpes, list_scores, list_rewards = None, None, None, None
        while counter < num_iters:

            #init vars for this iteration
            saver_data = DataPerIter()
            saver.iter_num = counter

            #onPolicy validation doesn't exist yet, so just make it same as rand validation
            if counter == 0:
                rollouts_valOnPol = rollouts_valRand

            #convert (rollouts --> dataset)
            dataset_trainOnPol = data_processor.convertRolloutsToDatasets(
                rollouts_trainOnPol)
            dataset_valOnPol = data_processor.convertRolloutsToDatasets(
                rollouts_valOnPol)

            # amount of data
            numData_train_onPol = get_num_data(rollouts_trainOnPol)

            # mean/std of all data
            data_processor.update_stats(dyn_models, dataset_trainRand,
                                        dataset_trainOnPol)

            #preprocess datasets to mean0/std1 + clip actions
            preprocessed_data_trainRand = data_processor.preprocess_data(
                dataset_trainRand)
            preprocessed_data_valRand = data_processor.preprocess_data(
                dataset_valRand)
            preprocessed_data_trainOnPol = data_processor.preprocess_data(
                dataset_trainOnPol)
            preprocessed_data_valOnPol = data_processor.preprocess_data(
                dataset_valOnPol)

            #convert datasets (x,y,z) --> training sets (inp, outp)
            inputs, outputs = data_processor.xyz_to_inpOutp(
                preprocessed_data_trainRand)
            inputs_val, outputs_val = data_processor.xyz_to_inpOutp(
                preprocessed_data_valRand)
            inputs_onPol, outputs_onPol = data_processor.xyz_to_inpOutp(
                preprocessed_data_trainOnPol)
            inputs_val_onPol, outputs_val_onPol = data_processor.xyz_to_inpOutp(
                preprocessed_data_valOnPol)

            #####################################
            ## Training the model
            #####################################

            if not args.print_minimal:
                print("\n#####################################")
                print("Training the dynamics model..... iteration ", counter)
                print("#####################################\n")
                print(" amount of random data: ", numData_train_rand)
                print(" amount of onPol data: ", numData_train_onPol)

            ### copy train_onPol until it's big enough
            if len(inputs_onPol) > 0:
                while inputs_onPol.shape[0] < inputs.shape[0]:
                    inputs_onPol = np.concatenate([inputs_onPol, inputs_onPol])
                    outputs_onPol = np.concatenate(
                        [outputs_onPol, outputs_onPol])

            ### copy val_onPol until it's big enough
            while inputs_val_onPol.shape[0] < args.batchsize:
                inputs_val_onPol = np.concatenate(
                    [inputs_val_onPol, inputs_val_onPol], 0)
                outputs_val_onPol = np.concatenate(
                    [outputs_val_onPol, outputs_val_onPol], 0)

            #re-initialize all vars (randomly) if training from scratch
            ##restore model if doing continue_run
            if args.warmstart_training:
                if firstTime:
                    if continue_run > 0:
                        restore_path = save_dir + '/models/model_aggIter' + str(
                            continue_run - 1) + '.ckpt'
                        saver.tf_saver.restore(sess, restore_path)
                        print("\n\nModel restored from ", restore_path, "\n\n")
            else:
                sess.run(tf.global_variables_initializer())

            #number of training epochs
            if counter == 0:
                nEpoch_use = args.nEpoch_init
            else:
                nEpoch_use = args.nEpoch

            #train model or restore model
            if args.always_use_savedModel:
                if continue_run > 0:
                    restore_path = save_dir + '/models/model_aggIter' + str(
                        continue_run - 1) + '.ckpt'
                else:
                    restore_path = save_dir + '/models/finalModel.ckpt'

                saver.tf_saver.restore(sess, restore_path)
                print("\n\nModel restored from ", restore_path, "\n\n")

                #empty vars, for saving
                training_loss = 0
                training_lists_to_save = dict(
                    training_loss_list=0,
                    val_loss_list_rand=0,
                    val_loss_list_onPol=0,
                    val_loss_list_xaxis=0,
                    rand_loss_list=0,
                    onPol_loss_list=0,
                )
            else:

                ## train model
                training_loss, training_lists_to_save = dyn_models.train(
                    inputs,
                    outputs,
                    inputs_onPol,
                    outputs_onPol,
                    nEpoch_use,
                    inputs_val=inputs_val,
                    outputs_val=outputs_val,
                    inputs_val_onPol=inputs_val_onPol,
                    outputs_val_onPol=outputs_val_onPol)

            #saving rollout info
            rollouts_info = []
            list_rewards = []
            list_scores = []
            list_mpes = []

            if not args.print_minimal:
                print("\n#####################################")
                print("performing on-policy MPC rollouts... iter ", counter)
                print("#####################################\n")

            for rollout_num in range(num_trajectories_per_iter):

                ###########################################
                ########## perform 1 MPC rollout
                ###########################################

                if not args.print_minimal:
                    print("\n####################### Performing MPC rollout #",
                          rollout_num)

                #reset env randomly
                starting_observation, starting_state = env.reset(
                    return_start_state=True)

                rollout_info = mpc_rollout.perform_rollout(
                    starting_state,
                    starting_observation,
                    controller_type=args.controller_type,
                    take_exploratory_actions=False)

                # Note: can sometimes set take_exploratory_actions=True
                # in order to use ensemble disagreement for exploration

                ###########################################
                ####### save rollout info (if long enough)
                ###########################################

                if len(rollout_info['observations']) > K:
                    list_rewards.append(rollout_info['rollout_rewardTotal'])
                    list_scores.append(rollout_info['rollout_meanFinalScore'])
                    list_mpes.append(np.mean(rollout_info['mpe_1step']))
                    rollouts_info.append(rollout_info)

            rollouts_info_prevIter = rollouts_info.copy()

            # visualize, if desired
            if args.visualize_MPC_rollout:
                print("\n\nPAUSED FOR VISUALIZATION. "
                      "Continue when ready to visualize.")
                import IPython
                IPython.embed()
                for vis_index in range(len(rollouts_info)):
                    visualize_rendering(rollouts_info[vis_index], env, args)

            #########################################################
            ### aggregate some random rollouts into training data
            #########################################################

            num_rand_rollouts = 5
            rollouts_rand = collect_random_rollouts(
                env, random_policy, num_rand_rollouts, args.rollout_length,
                dt_from_xml, args)

            #convert (rollouts --> dataset)
            dataset_rand_new = data_processor.convertRolloutsToDatasets(
                rollouts_rand)

            #concat this dataset with the existing dataset_trainRand
            dataset_trainRand = concat_datasets(dataset_trainRand,
                                                dataset_rand_new)

            #########################################################
            ### aggregate MPC rollouts into train/val
            #########################################################

            num_mpc_rollouts = len(rollouts_info)
            rollouts_train = []
            rollouts_val = []

            for i in range(num_mpc_rollouts):
                rollout = Rollout(rollouts_info[i]['observations'],
                                  rollouts_info[i]['actions'],
                                  rollouts_info[i]['rollout_rewardTotal'],
                                  rollouts_info[i]['starting_state'])

                if i < int(num_mpc_rollouts * 0.9):
                    rollouts_train.append(rollout)
                else:
                    rollouts_val.append(rollout)

            #aggregate into training data
            if counter == 0:
                rollouts_valOnPol = []
            rollouts_trainOnPol = rollouts_trainOnPol + rollouts_train
            rollouts_valOnPol = rollouts_valOnPol + rollouts_val

            #########################################################
            ### save everything about this iter of model training
            #########################################################

            trainingData_perIter.append(numData_train_rand +
                                        numData_train_onPol)
            trainingLoss_perIter.append(training_loss)

            ### stage relevant info for saving
            saver_data.training_numData = trainingData_perIter
            saver_data.training_losses = trainingLoss_perIter
            saver_data.training_lists_to_save = training_lists_to_save

            # Note: the on-policy rollouts include curr iter's rollouts
            # (so next iter can be directly trained on these)
            saver_data.train_rollouts_onPol = rollouts_trainOnPol
            saver_data.val_rollouts_onPol = rollouts_valOnPol
            saver_data.normalization_data = data_processor.get_normalization_data()
            saver_data.counter = counter

            ### save all info from this training iteration
            saver.save_model()
            saver.save_training_info(saver_data)

            #########################################################
            ### save everything about this iter of MPC rollouts
            #########################################################

            # append onto rewards/scores
            rew_perIter.append([np.mean(list_rewards), np.std(list_rewards)])
            scores_perIter.append([np.mean(list_scores), np.std(list_scores)])

            # save
            saver_data.rollouts_rewardsPerIter = rew_perIter
            saver_data.rollouts_scoresPerIter = scores_perIter
            saver_data.rollouts_info = rollouts_info
            saver.save_rollout_info(saver_data)
            counter = counter + 1

            firstTime = False
        return
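#############################################################################
# Illustrative sketch (not part of the original script): one way to assemble
# an `args` namespace for a quick local smoke test of run_job(). Only the
# attribute names that run_job() itself reads above are listed, and the
# values are hypothetical placeholders; Dyn_Model, MPCRollout, and
# DataProcessor read additional fields from this object that are not visible
# in this file, so a real run needs those as well.
#############################################################################
def example_run_job_args():
    """Build a hypothetical args namespace covering the fields run_job() reads."""
    from argparse import Namespace
    return Namespace(
        # run control
        continue_run=-1,             # -1 means start a fresh run
        continue_run_filepath='',    # only used when continue_run > -1
        num_iters=2,
        num_trajectories_per_iter=2,
        seed=0,
        print_minimal=False,
        # env / rollouts
        env_name='pddm_cube-v0',     # placeholder env id
        horizon=20,
        K=1,
        rollout_length=100,
        num_rand_rollouts_train=10,
        num_rand_rollouts_val=2,
        rand_rollout_length=100,
        load_existing_random_data=False,
        visualize_MPC_rollout=False,
        controller_type='mppi',      # placeholder controller name
        # model training
        batchsize=512,
        nEpoch_init=20,
        nEpoch=10,
        warmstart_training=False,
        always_use_savedModel=False,
        # hardware
        use_gpu=False,
        gpu_frac=0.3,
    )
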
def run_eval(args, save_dir):

    ##########################
    ## params
    ##########################

    ### read in params from saved config file
    paramfile = open(save_dir + '/params.pkl', 'rb')
    params = pickle.load(paramfile)

    ### can manually set some options here, for these eval runs (to override options from training)
    # params.kappa = 1
    # params.horizon = 20
    # params.mppi_beta = 0.6

    #overwrite config's value with the commandline arg value
    params.use_ground_truth_dynamics = args.use_ground_truth_dynamics

    #if run length wasn't specified in args, default to config file's value
    if args.eval_run_length == -1:
        args.eval_run_length = params.rollout_length

    ##########################
    ## other initializations
    ##########################

    ### set seeds
    npr.seed(args.seed)
    tf.set_random_seed(args.seed)

    #loader and data processor
    loader = Loader(save_dir)

    #env, rand policy
    env, dt_from_xml = create_env(params.env_name)
    random_policy = Policy_Random(env.env)

    #load data from the iteration (for plotting)
    iter_data = loader.load_iter(args.iter_num)
    trainingLoss_perIter = iter_data.training_losses
    rew_perIter = iter_data.rollouts_rewardsPerIter
    scores_perIter = iter_data.rollouts_scoresPerIter
    trainingData_perIter = iter_data.training_numData

    #mean/std info
    normalization_data = iter_data.normalization_data

    ### data dims
    outputSize = normalization_data.mean_z.shape[0]
    acSize = normalization_data.mean_y.shape[0]
    inputSize = normalization_data.mean_x.shape[0] + acSize

    with tf.Session(
            config=get_gpu_config(args.use_gpu, args.gpu_frac)) as sess:

        ##############################################
        ### dynamics model + controller
        ##############################################

        dyn_models = Dyn_Model(inputSize, outputSize, acSize, sess,
                               params=params)

        mpc_rollout = MPCRollout(
            env,
            dyn_models,
            random_policy,
            execute_sideRollouts=args.execute_sideRollouts,
            plot_sideRollouts=True,
            params=params)

        ##############################################
        ### restore the saved dynamics model
        ##############################################

        #restore model
        sess.run(tf.global_variables_initializer())
        restore_path = save_dir + '/models/model_aggIter' + str(
            args.iter_num) + '.ckpt'
        saver = tf.train.Saver(max_to_keep=0)
        saver.restore(sess, restore_path)
        print("\n\nModel restored from ", restore_path, "\n\n")

        #restore mean/std
        dyn_models.normalization_data = normalization_data

        ################################
        ########### RUN ROLLOUTS
        ################################

        list_rewards = []
        list_scores = []
        rollouts = []

        for rollout_num in range(args.num_eval_rollouts):

            # Note: if you want to evaluate a particular goal, call env.reset with a reset_state
            # where that reset_state dict has reset_pose, reset_vel, and reset_goal
            starting_observation, starting_state = env.reset(
                return_start_state=True)

            if not params.print_minimal:
                print("\n############# Performing MPC rollout #", rollout_num)

            mpc_rollout.rollout_length = args.eval_run_length
            rollout_info = mpc_rollout.perform_rollout(
                starting_state,
                starting_observation,
                controller_type=params.controller_type,
                take_exploratory_actions=False)

            #save info from MPC rollout
            list_rewards.append(rollout_info['rollout_rewardTotal'])
            list_scores.append(rollout_info['rollout_meanFinalScore'])
            rollouts.append(rollout_info)

        #save all eval rollouts
        pickle.dump(
            rollouts,
            open(save_dir + '/saved_rollouts/rollouts_eval.pickle', 'wb'),
            protocol=pickle.HIGHEST_PROTOCOL)
        print("REWARDS: ", list_rewards, " .... mean: ", np.mean(list_rewards),
              " std: ", np.std(list_rewards))
        print("SCORES: ", list_scores, " ... mean: ", np.mean(list_scores),
              " std: ", np.std(list_scores), "\n\n")