def _theano_flags(existing, settings):
    """Return the THEANO_FLAGS value for this run, appended to *existing* flags."""
    flags = ("mode=FAST_RUN,device=" + settings['training_processor_type'] +
             ",floatX=" + settings['float_type'])
    if not existing:
        return flags
    # THEANO_FLAGS is a comma separated list; without the separator the last
    # pre-existing flag and "mode=..." would be fused into one invalid entry.
    separator = "" if existing.endswith(",") else ","
    return existing + separator + flags


def _dotted_basename(name):
    """Return the part of *name* after its last '.', or all of it if there is none."""
    return name[name.rfind(".") + 1:]


def _copy_module_source(dotted_name, directory):
    """Copy the .py file of the module part of 'pkg.module.Class' into *directory*."""
    import os
    module_path = dotted_name[:dotted_name.rfind(".")]
    source_path = module_path.replace(".", "/") + ".py"
    print("model file name:", module_path)
    print("os.path.basename(file_name): ", os.path.basename(module_path))
    with open(source_path, 'r') as source_file:
        contents = source_file.read()
    with open(directory + module_path + ".py", 'w') as target_file:
        target_file.write(contents)


def _uses_override_sim_env(settings):
    """Return True when settings request a separate simulation env id for evaluation."""
    # "!= False" is kept on purpose: it is the original test, so an id of 0
    # still counts as "no override".
    return ('override_sim_env_id' in settings
            and (settings['override_sim_env_id'] != False))


def _update_policy_message(p, bounds_source, master_agent, settings):
    """Build a fresh 'Update_Policy' message carrying bounds and network parameters."""
    data = [
        'Update_Policy', p,
        bounds_source.getStateBounds(),
        bounds_source.getActionBounds(),
        bounds_source.getRewardBounds(),
        master_agent.getPolicy().getNetworkParameters()
    ]
    if settings['train_forward_dynamics']:
        data.append(master_agent.getForwardDynamics().getNetworkParameters())
    # A new dict per message: a queued object must not be mutated afterwards.
    return {'type': 'Update_Policy', 'data': tuple(data)}


def _put_nowait_all(message, queues):
    """Put *message* on every queue without blocking, reporting queues that refuse it."""
    for m_q in queues:
        try:
            m_q.put(message, False)
        except Exception:
            print("SimWorker model parameter message queue full: ", m_q.qsize())


def _drain_queue(work_queue):
    """Best-effort removal of everything currently sitting in *work_queue*."""
    while (not work_queue.empty()):
        try:
            work_queue.get(False)
        except Exception:
            # empty() can race with other consumers; nothing to do then.
            pass


def _dump_with_dill(obj, file_name):
    """Serialize *obj* to *file_name* with dill, always closing the file."""
    with open(file_name, 'wb') as f:
        dill.dump(obj, f)


def _write_train_data(trainData, file_name, **dump_kwargs):
    """Write *trainData* as JSON after coercing every value to a Python float."""
    # json does not serialize np.float32, so the lists are converted in place.
    for key in trainData:
        trainData[key] = [float(i) for i in trainData[key]]
    with open(file_name, 'w') as fp:
        json.dump(trainData, fp, **dump_kwargs)


def trainModelParallel(inputData):
    """Train an RL agent (and optionally a forward dynamics model) with worker processes.

    Args:
        inputData: indexable pair ``(settingsFileName, settings)`` where
            ``settingsFileName`` is the path of the settings file (only its
            basename is used, to name the saved copy) and ``settings`` is the
            settings dictionary driving the whole run.

    Returns:
        None. Results are written into the data directory returned by
        ``getDataDirectory(settings)``: a copy of the settings, copies of the
        model/agent source files, pickled policies and the training curves.

    Side effects:
        Sets ``THEANO_FLAGS`` in the environment, seeds numpy, starts the
        simulation/learning workers and publishes them and their queues through
        module-level globals so they can be terminated on ctrl+c.
        Calls ``sys.exit()`` when action/state/reward bounds are invalid or when
        a sampled bellman error is not finite.
    """
    ## Give global access to processes so they can be terminated when ctrl+c is pressed
    global sim_processes, learning_processes
    global _input_anchor_queue, _output_experience_queue
    global _eval_episode_data_queue, _sim_work_queues

    settingsFileName = inputData[0]
    settings = inputData[1]
    np.random.seed(int(settings['random_seed']))

    import os
    # Must be set before keras (and through it the backend) is imported.
    os.environ['THEANO_FLAGS'] = _theano_flags(
        os.environ.get('THEANO_FLAGS'), settings)

    import keras.backend
    keras.backend.set_floatx(settings['float_type'])
    print("K.floatx()", keras.backend.floatx())

    from ModelEvaluation import SimWorker, evalModelParrallel, collectExperience, simEpoch, evalModel, simModelParrallel
    from model.ModelUtil import validBounds, fixBounds, anneal_value
    from model.LearningAgent import LearningAgent, LearningWorker
    from util.SimulationUtil import validateSettings
    from util.SimulationUtil import createEnvironment
    from util.SimulationUtil import createRLAgent
    from util.SimulationUtil import createActor, getAgentName
    from util.SimulationUtil import getDataDirectory, createForwardDynamicsModel, createSampler
    from util.ExperienceMemory import ExperienceMemory
    from RLVisualize import RLVisualize
    from NNVisualize import NNVisualize

    settings = validateSettings(settings)
    use_eval_override = _uses_override_sim_env(settings)
    directory = getDataDirectory(settings)
    if not os.path.exists(directory):
        os.makedirs(directory)

    # Keep a copy of the settings next to the results.
    out_file_name = directory + os.path.basename(settingsFileName)
    print("Saving settings file with data: ", out_file_name)
    with open(out_file_name, 'w') as out_file:
        out_file.write(json.dumps(settings, indent=4))

    ### Try and save algorithm and model files for reference
    if "." in settings['model_type']:
        _copy_module_source(settings['model_type'], directory)
    if "." in settings['agent_name']:
        _copy_module_source(settings['agent_name'], directory)
    if (settings['train_forward_dynamics']):
        if "." in settings['forward_dynamics_model_type']:
            _copy_module_source(settings['forward_dynamics_model_type'],
                                directory)

    rounds = settings["rounds"]
    epochs = settings["epochs"]
    reward_bounds = np.array(settings["reward_bounds"])
    batch_size = settings["batch_size"]
    state_bounds = np.array(settings['state_bounds'])
    discrete_actions = np.array(settings['discrete_actions'])
    print("Sim config file name: " + str(settings["sim_config_file"]))

    if (settings['num_available_threads'] == 1):
        input_anchor_queue = multiprocessing.Queue(
            settings['queue_size_limit'])
        input_anchor_queue_eval = multiprocessing.Queue(
            settings['queue_size_limit'])
        output_experience_queue = multiprocessing.Queue(
            settings['queue_size_limit'])
        eval_episode_data_queue = multiprocessing.Queue(
            settings['queue_size_limit'])
    else:
        input_anchor_queue = multiprocessing.Queue(settings['epochs'])
        input_anchor_queue_eval = multiprocessing.Queue(settings['epochs'])
        output_experience_queue = multiprocessing.Queue(
            settings['queue_size_limit'])
        eval_episode_data_queue = multiprocessing.Queue(
            settings['eval_epochs'])

    if (settings['on_policy']):
        ## So that off policy agent does not learn
        output_experience_queue = None

    # NOTE(review): action_bounds is only bound for continuous action spaces,
    # yet it is used unconditionally below -- confirm discrete runs are not
    # expected to reach this function.
    if settings['action_space_continuous']:
        action_bounds = np.array(settings["action_bounds"], dtype=float)

    ### Using a wrapper for the type of actor now
    actor = createActor(settings['environment_type'], settings, None)
    exp_val = None

    if (not validBounds(action_bounds)):
        # Check that the action bounds are specified correctly
        print("Action bounds invalid: ", action_bounds)
        sys.exit()
    if (not validBounds(state_bounds)):
        # Probably did not collect enough bootstrapping samples to get good state bounds.
        print("State bounds invalid: ", state_bounds)
        state_bounds = fixBounds(np.array(state_bounds))
        bound_fixed = validBounds(state_bounds)
        print("State bounds fixed: ", bound_fixed)
        sys.exit()
    if (not validBounds(reward_bounds)):
        print("Reward bounds invalid: ", reward_bounds)
        sys.exit()

    if settings['action_space_continuous']:
        experience = ExperienceMemory(len(state_bounds[0]),
                                      len(action_bounds[0]),
                                      settings['expereince_length'],
                                      continuous_actions=True,
                                      settings=settings)
    else:
        experience = ExperienceMemory(len(state_bounds[0]), 1,
                                      settings['expereince_length'])
    experience.setSettings(settings)

    if settings['visualize_learning']:
        title = _dotted_basename(settings['agent_name'])
        rlv = RLVisualize(title=title + " agent on " +
                          str(settings['environment_type']),
                          settings=settings)
        rlv.setInteractive()
        rlv.init()
    if (settings['train_forward_dynamics']):
        if settings['visualize_learning']:
            title = _dotted_basename(settings['forward_dynamics_model_type'])
            nlv = NNVisualize(title=str("Dynamics Model") + " with " + title,
                              settings=settings)
            nlv.setInteractive()
            nlv.init()
    if (settings['train_reward_predictor']):
        if settings['visualize_learning']:
            title = _dotted_basename(settings['forward_dynamics_model_type'])
            rewardlv = NNVisualize(title=str("Reward Model") + " with " +
                                   title,
                                   settings=settings)
            rewardlv.setInteractive()
            rewardlv.init()
    if (settings['debug_critic']):
        criticLosses = []
        criticRegularizationCosts = []
        if (settings['visualize_learning']):
            title = _dotted_basename(settings['agent_name'])
            critic_loss_viz = NNVisualize(title=str("Critic Loss") +
                                          " with " + title)
            critic_loss_viz.setInteractive()
            critic_loss_viz.init()
            critic_regularization_viz = NNVisualize(
                title=str("Critic Reg Cost") + " with " + title)
            critic_regularization_viz.setInteractive()
            critic_regularization_viz.init()
    if (settings['debug_actor']):
        actorLosses = []
        actorRegularizationCosts = []
        if (settings['visualize_learning']):
            title = _dotted_basename(settings['agent_name'])
            actor_loss_viz = NNVisualize(title=str("Actor Loss") + " with " +
                                         title)
            actor_loss_viz.setInteractive()
            actor_loss_viz.init()
            actor_regularization_viz = NNVisualize(
                title=str("Actor Reg Cost") + " with " + title)
            actor_regularization_viz.setInteractive()
            actor_regularization_viz.init()

    model = createRLAgent(settings['agent_name'], state_bounds,
                          discrete_actions, reward_bounds, settings)
    forwardDynamicsModel = None
    if (settings['train_forward_dynamics']):
        if (settings['forward_dynamics_model_type'] == "SingleNet"):
            print(
                "Creating forward dynamics network: Using single network model"
            )
            forwardDynamicsModel = createForwardDynamicsModel(
                settings, state_bounds, action_bounds, None, None,
                agentModel=model)
        else:
            print("Creating forward dynamics network")
            forwardDynamicsModel = createForwardDynamicsModel(
                settings, state_bounds, action_bounds, None, None,
                agentModel=None)
        forwardDynamicsModel.setActor(actor)
        forwardDynamicsModel.init(len(state_bounds[0]), len(action_bounds[0]),
                                  state_bounds, action_bounds, actor, None,
                                  settings)

    (agent, learning_workers) = createLearningAgent(settings,
                                                    output_experience_queue,
                                                    state_bounds,
                                                    action_bounds,
                                                    reward_bounds)
    masterAgent = agent

    ### These are the workers for training
    (sim_workers, sim_work_queues) = createSimWorkers(
        settings, input_anchor_queue, output_experience_queue,
        eval_episode_data_queue, model, forwardDynamicsModel, exp_val,
        state_bounds, action_bounds, reward_bounds)

    # Evaluation reuses the training workers unless a separate sim id is requested.
    eval_sim_workers = sim_workers
    eval_sim_work_queues = sim_work_queues
    if use_eval_override:
        (eval_sim_workers, eval_sim_work_queues) = createSimWorkers(
            settings, input_anchor_queue_eval, output_experience_queue,
            eval_episode_data_queue, model, forwardDynamicsModel, exp_val,
            state_bounds, action_bounds, reward_bounds,
            default_sim_id=settings['override_sim_env_id'])
    else:
        input_anchor_queue_eval = input_anchor_queue

    best_eval = -100000000.0
    best_dynamicsLosses = best_eval * -1.0
    dynamicsLosses = []
    dynamicsRewardLosses = []

    for lw in learning_workers:
        print("Learning worker")
        print(lw)

    if (int(settings["num_available_threads"]) > 1):
        for sw in sim_workers:
            print("Sim worker")
            print(sw)
            sw.start()
        if use_eval_override:
            for sw in eval_sim_workers:
                print("Sim worker")
                print(sw)
                sw.start()

    ## This needs to be done after the simulation worker processes are created
    exp_val = createEnvironment(settings["forwardDynamics_config_file"],
                                settings['environment_type'],
                                settings,
                                render=settings['shouldRender'],
                                index=0)
    exp_val.setActor(actor)
    exp_val.getActor().init()
    exp_val.init()

    ### This is for a single-threaded Synchronous sim only.
    if (int(settings["num_available_threads"]) == 1):
        # Sharing the environment is okay if there is one thread only...
        sim_workers[0].setEnvironment(exp_val)
        sim_workers[0].start()
        if use_eval_override:
            eval_sim_workers[0].setEnvironment(exp_val)
            eval_sim_workers[0].start()

    masterAgent.setPolicy(model)
    if (settings['train_forward_dynamics']):
        masterAgent.setForwardDynamics(forwardDynamicsModel)

    tmp_p = 1.0
    if (settings['load_saved_model']):
        tmp_p = settings['min_epsilon']
    message = _update_policy_message(tmp_p, model, masterAgent, settings)
    for m_q in sim_work_queues:
        print("trainModel: Sending current network parameters: ", m_q)
        m_q.put(message)

    # Bootstrap the experience memory; this also returns the bounds to use.
    if (int(settings["num_available_threads"]) == 1):
        experience, state_bounds, reward_bounds, action_bounds = collectExperience(
            actor, exp_val, model, settings,
            sim_work_queues=None,
            eval_episode_data_queue=None)
    else:
        if (settings['on_policy']):
            experience, state_bounds, reward_bounds, action_bounds = collectExperience(
                actor, None, model, settings,
                sim_work_queues=sim_work_queues,
                eval_episode_data_queue=eval_episode_data_queue)
        else:
            experience, state_bounds, reward_bounds, action_bounds = collectExperience(
                actor, None, model, settings,
                sim_work_queues=input_anchor_queue,
                eval_episode_data_queue=eval_episode_data_queue)
    masterAgent.setExperience(experience)
    if ('keep_seperate_fd_exp_buffer' in settings
            and (settings['keep_seperate_fd_exp_buffer'])):
        masterAgent.setFDExperience(copy.deepcopy(experience))

    if (not validBounds(action_bounds)):
        # Check that the action bounds are specified correctly
        print("Action bounds invalid: ", action_bounds)
        sys.exit()
    if (not validBounds(state_bounds)):
        # Probably did not collect enough bootstrapping samples to get good state bounds.
        # Unlike the check before bootstrapping, the run continues with the fixed bounds.
        print("State bounds invalid: ", state_bounds)
        state_bounds = fixBounds(np.array(state_bounds))
        bound_fixed = validBounds(state_bounds)
        print("State bounds fixed: ", bound_fixed)
    if (not validBounds(reward_bounds)):
        print("Reward bounds invalid: ", reward_bounds)
        sys.exit()

    print("Reward History: ", experience._reward_history)
    print("Action History: ", experience._action_history)
    print("Action Mean: ", np.mean(experience._action_history))
    print("Experience Samples: ", (experience.samples()))

    if (settings["save_experience_memory"]):
        print("Saving initial experience memory")
        file_name = directory + getAgentName() + "_expBufferInit.hdf5"
        experience.saveToFile(file_name)

    if (settings['load_saved_model']
            or (settings['load_saved_model'] == 'network_and_scales')):
        ## Transfer learning: keep the bounds stored in the loaded model
        experience.setStateBounds(copy.deepcopy(model.getStateBounds()))
        experience.setRewardBounds(copy.deepcopy(model.getRewardBounds()))
        experience.setActionBounds(copy.deepcopy(model.getActionBounds()))
        model.setSettings(settings)
    else:
        ## Normal: push the collected bounds into the model first
        model.setStateBounds(state_bounds)
        model.setActionBounds(action_bounds)
        model.setRewardBounds(reward_bounds)
        experience.setStateBounds(copy.deepcopy(model.getStateBounds()))
        experience.setRewardBounds(copy.deepcopy(model.getRewardBounds()))
        experience.setActionBounds(copy.deepcopy(model.getActionBounds()))

    masterAgent_message_queue = multiprocessing.Queue(settings['epochs'])

    if (settings['train_forward_dynamics']):
        if (not settings['load_saved_model']):
            forwardDynamicsModel.setStateBounds(state_bounds)
            forwardDynamicsModel.setActionBounds(action_bounds)
            forwardDynamicsModel.setRewardBounds(reward_bounds)
        masterAgent.setForwardDynamics(forwardDynamicsModel)

    ## Now everything related to the exp memory needs to be updated
    bellman_errors = []
    masterAgent.setPolicy(model)
    print("Master agent state bounds: ",
          repr(masterAgent.getPolicy().getStateBounds()))
    for sw in sim_workers:
        print("exp: ", sw._exp)
        print("sw modle: ", sw._model.getPolicy())

    ## If not on policy
    if (not settings['on_policy']):
        for lw in learning_workers:
            lw._agent.setPolicy(model)
            lw.setMasterAgentMessageQueue(masterAgent_message_queue)
            lw.updateExperience(experience)
            print("ls policy: ", lw._agent.getPolicy())
            lw.start()

    # Send the parameters again now that the bounds may have changed.
    tmp_p = 1.0
    if (settings['load_saved_model']):
        tmp_p = settings['min_epsilon']
    message = _update_policy_message(tmp_p, model, masterAgent, settings)
    for m_q in sim_work_queues:
        print("trainModel: Sending current network parameters: ", m_q)
        m_q.put(message)

    del model

    sim_processes = sim_workers
    learning_processes = learning_workers
    _input_anchor_queue = input_anchor_queue
    _output_experience_queue = output_experience_queue
    _eval_episode_data_queue = eval_episode_data_queue
    _sim_work_queues = sim_work_queues

    # One "mean_*" and one "std_*" series per tracked metric, plus the annealing value.
    trainData = {}
    for metric in ("reward", "bellman_error", "discount_error",
                   "forward_dynamics_loss", "forward_dynamics_reward_loss",
                   "eval", "critic_loss", "critic_regularization_cost",
                   "actor_loss", "actor_regularization_cost"):
        trainData["mean_" + metric] = []
        trainData["std_" + metric] = []
    trainData["anneal_p"] = []

    print("Starting first round")
    for round_ in range(0, rounds):
        # p is the exploration parameter annealed over the rounds
        if ('annealing_schedule' in settings
                and (settings['annealing_schedule'] != False)):
            p = anneal_value(float(round_ / rounds), settings_=settings)
        else:
            p = ((settings['initial_temperature'] / math.log(round_ + 2)))
        # Clamp p to [min_epsilon, epsilon]
        p = max(settings['min_epsilon'], min(settings['epsilon'], p))
        if (settings['load_saved_model']):
            p = settings['min_epsilon']

        for epoch in range(epochs):
            if (settings['on_policy']):
                out = simModelParrallel(
                    sw_message_queues=sim_work_queues,
                    model=masterAgent,
                    settings=settings,
                    eval_episode_data_queue=eval_episode_data_queue,
                    anchors=settings['num_on_policy_rollouts'])
                (tuples, _discounted_sum, _q_value, _evalData) = out
                # tuples = states, actions, result_states, rewards, falls, G_ts, advantage, exp_actions
                (__states, __actions, __result_states, __rewards, __falls,
                 __G_ts, advantage__, exp_actions__) = tuples
                masterAgent.train(_states=__states,
                                  _actions=__actions,
                                  _rewards=__rewards,
                                  _result_states=__result_states,
                                  _falls=__falls,
                                  _advantage=advantage__,
                                  _exp_actions=exp_actions__)

                if (('anneal_on_policy' in settings)
                        and settings['anneal_on_policy']):
                    p_tmp_ = p
                else:
                    p_tmp_ = 1.0
                message = _update_policy_message(p_tmp_, masterAgent,
                                                 masterAgent, settings)
                for m_q in sim_work_queues:
                    ## block on full queue
                    m_q.put(message)
                if use_eval_override:
                    for m_q in eval_sim_work_queues:
                        ## block on full queue
                        m_q.put(message)
            else:
                # Off policy: only request another simulated episode.
                episodeData = {}
                episodeData['data'] = epoch
                episodeData['type'] = 'sim'
                input_anchor_queue.put(episodeData)

            if masterAgent.getExperience().samples() >= batch_size:
                # Sample a batch to monitor the current errors/losses.
                states, actions, result_states, rewards, falls, _G_ts, _exp_actions = masterAgent.getExperience(
                ).get_batch(batch_size)
                error = masterAgent.bellman_error(states, actions, rewards,
                                                  result_states, falls)
                bellman_errors.append(error)
                if (settings['debug_critic']):
                    # uses previous call batch data
                    loss__ = masterAgent.getPolicy()._get_critic_loss()
                    criticLosses.append(loss__)
                    regularizationCost__ = masterAgent.getPolicy(
                    )._get_critic_regularization()
                    criticRegularizationCosts.append(regularizationCost__)
                if (settings['debug_actor']):
                    # uses previous call batch data
                    loss__ = masterAgent.getPolicy()._get_actor_loss()
                    actorLosses.append(loss__)
                    regularizationCost__ = masterAgent.getPolicy(
                    )._get_actor_regularization()
                    actorRegularizationCosts.append(regularizationCost__)

                if not all(np.isfinite(error)):
                    print(
                        "States: " + str(states) + " ResultsStates: " +
                        str(result_states) + " Rewards: " + str(rewards) +
                        " Actions: " + str(actions) + " Falls: ", str(falls))
                    print("Bellman Error is Nan: " + str(error) +
                          str(np.isfinite(error)))
                    sys.exit()

                error = np.mean(np.fabs(error))
                if error > 10000:
                    print("Error to big: ")
                    print(states, actions, rewards, result_states)

                if (settings['train_forward_dynamics']):
                    dynamicsLoss = masterAgent.getForwardDynamics(
                    ).bellman_error(states, actions, result_states, rewards)
                    # fabs: element-wise absolute value
                    dynamicsLoss = np.mean(np.fabs(dynamicsLoss))
                    dynamicsLosses.append(dynamicsLoss)
                    if (settings['train_reward_predictor']):
                        dynamicsRewardLoss = masterAgent.getForwardDynamics(
                        ).reward_error(states, actions, result_states,
                                       rewards)
                        dynamicsRewardLoss = np.mean(
                            np.fabs(dynamicsRewardLoss))
                        dynamicsRewardLosses.append(dynamicsRewardLoss)

                summary = ("Round: " + str(round_) + " Epoch: " + str(epoch) +
                           " p: " + str(p) + " With mean reward: " +
                           str(np.mean(rewards)) + " bellman error: " +
                           str(error))
                if (settings['train_forward_dynamics']):
                    summary = (summary + " ForwardPredictionLoss: " +
                               str(dynamicsLoss))
                print(summary)

            if (settings["print_levels"][settings["print_level"]] >=
                    settings["print_levels"]['train']):
                print("Master agent experience size: " +
                      str(masterAgent.getExperience().samples()))

            if (not settings['on_policy']):
                ## There could be stale policy parameters in here, use the last set put in the queue
                data = None
                while (not masterAgent_message_queue.empty()):
                    ## Don't block
                    try:
                        data = masterAgent_message_queue.get(False)
                    except Exception:
                        # empty() raced with the producer; try again next epoch.
                        pass
                if data is not None:
                    masterAgent.setExperience(data[0])
                    masterAgent.getPolicy().setNetworkParameters(data[1])
                    masterAgent.setStateBounds(
                        masterAgent.getExperience().getStateBounds())
                    masterAgent.setActionBounds(
                        masterAgent.getExperience().getActionBounds())
                    masterAgent.setRewardBounds(
                        masterAgent.getExperience().getRewardBounds())
                    if (settings['train_forward_dynamics']):
                        masterAgent.getForwardDynamics().setNetworkParameters(
                            data[2])
                        if ('keep_seperate_fd_exp_buffer' in settings
                                and (settings['keep_seperate_fd_exp_buffer'])):
                            masterAgent.setFDExperience(data[3])

        ## This will let me know which part of learning is going slower training updates or simulation
        if (settings["print_levels"][settings["print_level"]] >=
                settings["print_levels"]['train']):
            # qsize() reports the number of queued items
            print("sim queue size: ", input_anchor_queue.qsize())
            if output_experience_queue is not None:
                print("exp tuple queue size: ",
                      output_experience_queue.qsize())

        if (not settings['on_policy']):
            message = _update_policy_message(p, masterAgent, masterAgent,
                                             settings)
            ## Don't block on full queue
            _put_nowait_all(message, sim_work_queues)
            if use_eval_override:
                _put_nowait_all(message, eval_sim_work_queues)

        if (round_ % settings['plotting_update_freq_num_rounds']) == 0:
            # Running less often helps speed learning up.
            # Sync up sim actors
            if (settings['on_policy']):
                mean_reward, std_reward, mean_bellman_error, std_bellman_error, mean_discount_error, std_discount_error, mean_eval, std_eval = evalModelParrallel(
                    input_anchor_queue=eval_sim_work_queues,
                    model=masterAgent,
                    settings=settings,
                    eval_episode_data_queue=eval_episode_data_queue,
                    anchors=settings['eval_epochs'])
            else:
                mean_reward, std_reward, mean_bellman_error, std_bellman_error, mean_discount_error, std_discount_error, mean_eval, std_eval = evalModelParrallel(
                    input_anchor_queue=input_anchor_queue_eval,
                    model=masterAgent,
                    settings=settings,
                    eval_episode_data_queue=eval_episode_data_queue,
                    anchors=settings['eval_epochs'])
            print(mean_reward, std_reward, mean_bellman_error,
                  std_bellman_error, mean_discount_error, std_discount_error)

            if mean_bellman_error > 10000:
                print("Error to big: ")
            else:
                if (settings['train_forward_dynamics']):
                    mean_dynamicsLosses = np.mean(dynamicsLosses)
                    std_dynamicsLosses = np.std(dynamicsLosses)
                    dynamicsLosses = []
                if (settings['train_reward_predictor']):
                    mean_dynamicsRewardLosses = np.mean(dynamicsRewardLosses)
                    std_dynamicsRewardLosses = np.std(dynamicsRewardLosses)
                    dynamicsRewardLosses = []

                trainData["mean_reward"].append(mean_reward)
                trainData["std_reward"].append(std_reward)
                trainData["anneal_p"].append(p)
                trainData["mean_bellman_error"].append(
                    np.mean(np.fabs(bellman_errors)))
                trainData["std_bellman_error"].append(np.std(bellman_errors))
                bellman_errors = []
                trainData["mean_discount_error"].append(mean_discount_error)
                trainData["std_discount_error"].append(std_discount_error)
                trainData["mean_eval"].append(mean_eval)
                trainData["std_eval"].append(std_eval)
                if (settings['train_forward_dynamics']):
                    trainData["mean_forward_dynamics_loss"].append(
                        mean_dynamicsLosses)
                    trainData["std_forward_dynamics_loss"].append(
                        std_dynamicsLosses)
                if (settings['train_reward_predictor']):
                    trainData["mean_forward_dynamics_reward_loss"].append(
                        mean_dynamicsRewardLosses)
                    trainData["std_forward_dynamics_reward_loss"].append(
                        std_dynamicsRewardLosses)

        if (round_ % settings['saving_update_freq_num_rounds']) == 0:
            if (settings['train_forward_dynamics']):
                _dump_with_dill(masterAgent.getForwardDynamics(),
                                directory + "forward_dynamics_" + ".pkl")
                # NOTE(review): mean_dynamicsLosses only exists once an
                # evaluation with an acceptable bellman error has run.
                if mean_dynamicsLosses < best_dynamicsLosses:
                    best_dynamicsLosses = mean_dynamicsLosses
                    print("Saving BEST current forward dynamics agent: " +
                          str(best_dynamicsLosses))
                    _dump_with_dill(
                        masterAgent.getForwardDynamics(),
                        directory + "forward_dynamics_" + "_Best.pkl")

            if (mean_eval > best_eval):
                best_eval = mean_eval
                print("Saving BEST current agent: " + str(best_eval))
                _dump_with_dill(masterAgent.getPolicy(),
                                directory + getAgentName() + "_Best.pkl")

            if settings['save_trainData']:
                _write_train_data(
                    trainData, directory + "trainingData_" +
                    str(settings['agent_name']) + ".json")

            print("Saving current masterAgent")
            _dump_with_dill(masterAgent.getPolicy(),
                            directory + getAgentName() + ".pkl")

        gc.collect()

    print("Terminating Workers")
    if (settings['on_policy']):
        # A None message asks a worker to stop.
        for m_q in sim_work_queues:
            ## block on full queue
            m_q.put(None)
        if use_eval_override:
            for m_q in eval_sim_work_queues:
                ## block on full queue
                m_q.put(None)
        for sw in sim_workers:
            sw.join()
        if use_eval_override:
            for sw in eval_sim_workers:
                sw.join()

    for work_queue in sim_work_queues:
        print("sim_work_queues size: ", work_queue.qsize())
        _drain_queue(work_queue)
        print("sim_work_queues size: ", work_queue.qsize())
    for work_queue in eval_sim_work_queues:
        print("eval_sim_work_queues size: ", work_queue.qsize())
        _drain_queue(work_queue)
        print("eval_sim_work_queues size: ", work_queue.qsize())

    print("Finish sim")
    exp_val.finish()

    print("Save last versions of files.")
    _dump_with_dill(masterAgent.getPolicy(),
                    directory + getAgentName() + ".pkl")
    _write_train_data(trainData,
                      directory + "trainingData_" +
                      str(settings['agent_name']) + ".json",
                      sort_keys=True,
                      indent=4)
    if (settings['train_forward_dynamics']):
        _dump_with_dill(masterAgent.getForwardDynamics(),
                        directory + "forward_dynamics_" + ".pkl")

    print("Delete any plots being used")
    # Release memory right away.
    gc.collect()
class GAN(AlgorithmInterface):
    """Adversarially trained forward-dynamics model.

    The model's forward-dynamics network acts as the generator (it produces
    a result state from a state/action pair) and the model's critic network
    acts as the discriminator.  A separate reward network is trained with a
    plain squared-error loss.

    Discriminator target convention:
        0 is a generated sample
        1 is a true sample
    maximize D while minimizing G
    """

    def __init__(self, model, state_length, action_length, state_bounds,
                 action_bounds, settings_):
        """Build every symbolic expression and compile the Theano functions.

        Args:
            model: network container exposing the critic, forward-dynamics
                and reward networks plus their symbolic/shared variables.
            state_length: forwarded to the base class and to the internal
                ExperienceMemory.
            action_length: forwarded to the base class and to the internal
                ExperienceMemory.
            state_bounds: forwarded to the base class.
            action_bounds: forwarded to the base class.
            settings_: settings dictionary, forwarded to the base class.
        """
        print("Building GAN Model")
        super(GAN, self).__init__(model, state_length, action_length,
                                  state_bounds, action_bounds, 0, settings_)
        settings = self.getSettings()
        # Read the flag once so every graph below agrees on it.
        use_noise = bool("train_gan_with_gaussian_noise" in settings
                         and settings["train_gan_with_gaussian_noise"])

        self._noise_mean = 0.0
        self._noise_std = 1.0
        self._noise_shared = theano.shared(
            np.zeros((self._batch_size, 1), dtype=settings['float_type']),
            broadcastable=(False, True))

        # Memory that stores generated samples for discriminator training.
        memory_kwargs = {}
        if 'size_of_result_state' in settings:
            memory_kwargs['result_state_length'] = settings[
                'size_of_result_state']
        self._experience = ExperienceMemory(
            state_length, action_length, settings['expereince_length'],
            continuous_actions=True, settings=settings, **memory_kwargs)
        self._experience.setStateBounds(copy.deepcopy(self.getStateBounds()))
        self._experience.setRewardBounds(copy.deepcopy(self.getRewardBounds()))
        self._experience.setActionBounds(copy.deepcopy(self.getActionBounds()))

        self._modelTarget = copy.deepcopy(model)

        self._learning_rate = settings["fd_learning_rate"]
        self._regularization_weight = 1e-5
        self._discount_factor = settings['discount_factor']
        self._rho = settings['rho']
        self._rms_epsilon = settings['rms_epsilon']
        self._weight_update_steps = settings[
            'steps_until_target_network_update']
        self._updates = 0
        self._decay_weight = settings['regularization_weight']
        self._critic_regularization_weight = settings[
            "critic_regularization_weight"]

        critic_net = self._model.getCriticNetwork()
        generator_net = self._model.getForwardDynamicsNetwork()
        reward_net = self._model.getRewardNetwork()

        ## Generator (forward dynamics) outputs
        generator_inputs = self._buildGivens(result_states=True,
                                             noise=use_noise)
        self._generator = lasagne.layers.get_output(
            generator_net, generator_inputs, deterministic=True)
        # NOTE(review): when gaussian noise is enabled the original code
        # also builds the "drop" output with deterministic=True; preserved.
        self._generator_drop = lasagne.layers.get_output(
            generator_net, generator_inputs, deterministic=use_noise)

        ## Discriminator (critic) outputs
        inputs_ = self._buildGivens(result_states=True)
        self._discriminator = lasagne.layers.get_output(
            critic_net, inputs_, deterministic=True)
        self._discriminator_drop = lasagne.layers.get_output(
            critic_net, inputs_, deterministic=False)

        ## Squared-error losses for the discriminator and the generator
        self._diff = (self._model.getRewardSymbolicVariable()
                      - self._discriminator_drop)
        self._loss = T.mean(T.pow(self._diff, 2))

        self._diff_g = (self._model.getResultStateSymbolicVariable()
                        - self._generator_drop)
        self._loss_g = T.mean(T.pow(self._diff_g, 2))

        self._params = lasagne.layers.helper.get_all_params(critic_net)
        print("******Number of Layers is: " + str(len(self._params)))
        self._actionParams = lasagne.layers.helper.get_all_params(
            generator_net)
        print("******Number of Action Layers is: "
              + str(len(self._actionParams)))

        self._givens_ = self._buildGivens(result_states=True, rewards=True)
        self._critic_regularization = (
            self._critic_regularization_weight
            * lasagne.regularization.regularize_network_params(
                critic_net, lasagne.regularization.l2))

        ## MSE update
        self._value_grad = T.grad(self._loss + self._critic_regularization,
                                  self._params)
        print("Optimizing Value Function with ", settings['optimizer'],
              " method")
        self._updates_ = lasagne.updates.adam(self._value_grad,
                                              self._params,
                                              self._learning_rate,
                                              beta1=0.9,
                                              beta2=0.9,
                                              epsilon=self._rms_epsilon)

        self._actGivens = self._buildGivens(noise=use_noise)
        # NOTE(review): without gaussian noise the original leaves the
        # result states out of the MSE givens although _loss_g refers to
        # the result-state variable; preserved as-is, confirm intent.
        self._actGivens_MSE = self._buildGivens(result_states=use_noise,
                                                noise=use_noise)

        self._actor_regularization = (
            self._regularization_weight
            * lasagne.regularization.regularize_network_params(
                generator_net, lasagne.regularization.l2))
        ## MSE update
        self._gen_grad = T.grad(self._loss_g + self._actor_regularization,
                                self._actionParams)
        print("Optimizing Value Function with ", settings['optimizer'],
              " method")
        self._updates_generator = lasagne.updates.adam(
            self._gen_grad,
            self._actionParams,
            self._learning_rate,
            beta1=0.9,
            beta2=0.9,
            epsilon=self._rms_epsilon)

        ## Some cool stuff to backprop action gradients
        self._result_state_grad = T.matrix("Action_Grad")
        self._result_state_grad.tag.test_value = np.zeros(
            (self._batch_size, self._state_length),
            dtype=np.dtype(settings['float_type']))
        self._result_state_grad_shared = theano.shared(
            np.zeros((self._batch_size, self._state_length),
                     dtype=settings['float_type']))

        ### Maximize wrt q function
        result_state_grads = T.grad(
            cost=None,
            wrt=self._actionParams,
            known_grads={self._generator: self._result_state_grad_shared})
        print("Action grads: ", result_state_grads)
        ## When passing in gradients it needs to be a proper list of
        ## gradient expressions
        self._result_state_mean_grads = list(result_state_grads)
        self._generatorGRADUpdates = lasagne.updates.adam(
            self._result_state_mean_grads,
            self._actionParams,
            self._learning_rate * 0.1,
            beta1=0.9,
            beta2=0.9,
            epsilon=self._rms_epsilon)

        self._givens_grad = self._buildGivens(result_states=True)

        ### Some other stuff to learn a reward function
        self._inputs_reward_ = self._buildGivens()
        self._reward = lasagne.layers.get_output(
            reward_net, self._inputs_reward_, deterministic=True)
        self._reward_drop = lasagne.layers.get_output(
            reward_net, self._inputs_reward_, deterministic=False)

        ## because rewards are normalized then scaled by the discount
        ## factor so the value stays between -1,1.
        scaled_reward = (self._model.getRewardSymbolicVariable()
                         * (1.0 / (1.0 - settings['discount_factor'])))
        self._reward_diff = scaled_reward - self._reward_drop
        self.__Reward = self._model.getRewardSymbolicVariable()
        print("self.__Reward", self.__Reward)
        self._reward_loss_ = T.mean(T.pow(self._reward_diff, 2), axis=1)
        self._reward_loss = T.mean(self._reward_loss_)

        self._reward_diff_NoDrop = scaled_reward - self._reward
        self._reward_loss_NoDrop_ = T.mean(
            T.pow(self._reward_diff_NoDrop, 2), axis=1)
        self._reward_loss_NoDrop = T.mean(self._reward_loss_NoDrop_)
        self._reward_params = lasagne.layers.helper.get_all_params(reward_net)
        self._reward_givens_ = self._buildGivens(rewards=True)
        self._reward_updates_ = lasagne.updates.adam(
            self._reward_loss
            + (self._regularization_weight
               * lasagne.regularization.regularize_network_params(
                   reward_net, lasagne.regularization.l2)),
            self._reward_params,
            self._learning_rate,
            beta1=0.9,
            beta2=0.999,
            epsilon=self._rms_epsilon)

        # Explicit class call: always run this class's compile step.
        GAN.compile(self)

    def _buildGivens(self, result_states=False, rewards=False, noise=False):
        """Return a fresh {symbolic variable: shared variable} mapping.

        State and action entries are always present; the remaining entries
        are added on request, in the order result states, rewards, noise.
        """
        model = self._model
        givens = {
            model.getStateSymbolicVariable(): model.getStates(),
            model.getActionSymbolicVariable(): model.getActions(),
        }
        if result_states:
            givens[model.getResultStateSymbolicVariable()] = (
                model.getResultStates())
        if rewards:
            givens[model.getRewardSymbolicVariable()] = model.getRewards()
        if noise:
            givens[model._Noise] = self._noise_shared
        return givens

    def _allNetworks(self):
        """Return the six networks in the order used for (de)serialization."""
        return [
            self._model.getCriticNetwork(),
            self._model.getForwardDynamicsNetwork(),
            self._model.getRewardNetwork(),
            self._modelTarget.getCriticNetwork(),
            self._modelTarget.getForwardDynamicsNetwork(),
            self._modelTarget.getRewardNetwork(),
        ]

    def compile(self):
        """Compile the Theano functions used for training and prediction."""
        self._train = theano.function([], [self._loss, self._discriminator],
                                      updates=self._updates_,
                                      givens=self._givens_)
        self._trainGenerator = theano.function(
            [], [],
            updates=self._generatorGRADUpdates,
            givens=self._actGivens)
        self._trainGenerator_MSE = theano.function(
            [], [],
            updates=self._updates_generator,
            givens=self._actGivens_MSE)
        self._discriminate = theano.function(
            [],
            self._discriminator,
            givens=self._buildGivens(result_states=True))
        # The original built this function in two identical branches; the
        # noise variable is supplied whether or not noise training is on.
        self._generate = theano.function(
            [], self._generator, givens=self._buildGivens(noise=True))
        self._bellman_error = theano.function(
            inputs=[],
            outputs=self._loss_g,
            allow_input_downcast=True,
            givens=self._buildGivens(result_states=True, noise=True))
        self._get_state_grad = theano.function(
            [],
            outputs=lasagne.updates.get_or_compute_grads(
                T.mean(self._discriminator),
                [self._model._stateInputVar] + self._params),
            allow_input_downcast=True,
            givens=self._givens_grad)
        self._get_result_state_grad = theano.function(
            [],
            outputs=lasagne.updates.get_or_compute_grads(
                T.mean(self._discriminator),
                [self._model._resultStateInputVar] + self._params),
            allow_input_downcast=True,
            givens=self._givens_grad)
        self._get_action_grad = theano.function(
            [],
            outputs=T.grad(
                cost=None,
                wrt=[self._model._actionInputVar] + self._actionParams,
                known_grads={self._generator: self._result_state_grad_shared}),
            allow_input_downcast=True,
            givens=self._actGivens)
        self._get_grad_reward = theano.function(
            [],
            outputs=lasagne.updates.get_or_compute_grads(
                T.mean(self._reward),
                [self._model._actionInputVar] + self._reward_params),
            allow_input_downcast=True,
            givens=self._inputs_reward_)
        self._train_reward = theano.function([], [self._reward_loss],
                                             updates=self._reward_updates_,
                                             givens=self._reward_givens_)
        self._predict_reward = theano.function([],
                                               self._reward,
                                               givens=self._inputs_reward_)
        self._reward_error = theano.function(inputs=[],
                                             outputs=self._reward_diff,
                                             allow_input_downcast=True,
                                             givens=self._reward_givens_)
        self._reward_values = theano.function(
            inputs=[],
            outputs=self.__Reward,
            allow_input_downcast=True,
            givens={
                self._model.getRewardSymbolicVariable():
                self._model.getRewards(),
            })

    def getStateGrads(self, states, actions=None, alreadyNormed=False):
        """Return gradients of the mean discriminator output.

        The gradients are taken with respect to the state input followed by
        the discriminator parameters.  ``states`` is normalized with the
        state bounds unless ``alreadyNormed`` is true; ``actions`` is unused.
        """
        if not alreadyNormed:
            states = norm_state(states, self._state_bounds)
        states = np.array(states, dtype=theano.config.floatX)
        self._model.setStates(states)
        return self._get_state_grad()

    def getResultStateGrads(self, result_states, actions=None,
                            alreadyNormed=False):
        """Return gradients of the mean discriminator output.

        The gradients are taken with respect to the result-state input
        followed by the discriminator parameters.  ``result_states`` is
        normalized with the state bounds unless ``alreadyNormed`` is true;
        ``actions`` is unused.
        """
        if not alreadyNormed:
            result_states = norm_state(result_states, self._state_bounds)
        result_states = np.array(result_states, dtype=theano.config.floatX)
        self._model.setResultStates(result_states)
        return self._get_result_state_grad()

    def setGradTarget(self, grad):
        """Store ``grad`` as the known gradient of the generator output."""
        self._result_state_grad_shared.set_value(grad)

    def getGrads(self, states, actions, result_states, v_grad=None,
                 alreadyNormed=False):
        """Back-propagate ``v_grad`` through the generator.

        Returns the gradients with respect to the action input followed by
        the generator parameters.  Inputs are normalized unless
        ``alreadyNormed`` is true.  When ``v_grad`` is None the previously
        stored gradient target is kept instead of passing None on to the
        shared variable.
        """
        if not alreadyNormed:
            float_type = self.getSettings()['float_type']
            states = np.array(norm_state(states, self._state_bounds),
                              dtype=float_type)
            actions = np.array(norm_action(actions, self._action_bounds),
                               dtype=float_type)
            result_states = np.array(norm_state(result_states,
                                                self._state_bounds),
                                     dtype=float_type)
        self.setData(states, actions, result_states)
        if v_grad is not None:
            self.setGradTarget(v_grad)
        return self._get_action_grad()

    def getRewardGrads(self, states, actions, alreadyNormed=False):
        """Return gradients of the mean predicted reward.

        The gradients are taken with respect to the action input followed
        by the reward-network parameters.  Inputs are normalized unless
        ``alreadyNormed`` is true.
        """
        if not alreadyNormed:
            float_type = self.getSettings()['float_type']
            states = np.array(norm_state(states, self._state_bounds),
                              dtype=float_type)
            actions = np.array(norm_action(actions, self._action_bounds),
                               dtype=float_type)
        self.setData(states, actions)
        return self._get_grad_reward()

    def getNetworkParameters(self):
        """Return the parameter values of all six networks as a list.

        Order: critic, forward dynamics, reward of the model, followed by
        the same three networks of the target model.
        """
        return [
            lasagne.layers.helper.get_all_param_values(network)
            for network in self._allNetworks()
        ]

    def setNetworkParameters(self, params):
        """Load parameter values in the order of getNetworkParameters."""
        for index, network in enumerate(self._allNetworks()):
            lasagne.layers.helper.set_all_param_values(network, params[index])

    def setData(self, states, actions, result_states=None, rewards=None):
        """Copy a batch into the model's shared variables.

        Result states and rewards are only written when given.  A fresh
        noise sample with one row per state is drawn on every call.
        """
        self._model.setStates(states)
        self._model.setActions(actions)
        if result_states is not None:
            self._model.setResultStates(result_states)
        if rewards is not None:
            self._model.setRewards(rewards)
        noise = np.random.normal(self._noise_mean,
                                 self._noise_std,
                                 size=(states.shape[0], 1))
        self._noise_shared.set_value(noise)

    def trainCritic(self, states, actions, result_states, rewards):
        """Run one discriminator update and return its loss.

        Generated samples are stored in the internal experience memory; a
        batch drawn from that memory then replaces the first half of the
        true result states, with the matching targets set to 0.
        """
        self.setData(states, actions, result_states, rewards)
        # The second draw is redundant with setData but is kept so the
        # numpy random stream matches the original implementation.
        noise = np.random.normal(self._noise_mean,
                                 self._noise_std,
                                 size=(states.shape[0], 1))
        self._noise_shared.set_value(noise)
        self._updates += 1
        generated_samples = self._generate()
        ### Put generated samples in memory
        for i in range(generated_samples.shape[0]):
            next_state__ = scale_state(generated_samples[i],
                                       self._state_bounds)
            tup = ([states[i]], [actions[i]], [next_state__], [rewards[i]],
                   [0], [0], [0])
            self._experience.insertTuple(tup)
        tmp_result_states = copy.deepcopy(result_states)
        tmp_rewards = copy.deepcopy(rewards)

        ## Pull out a batch of generated samples
        batch = self._experience.get_batch(
            min(states.shape[0], self._experience.samples()))
        (_, _, generated_samples, _, _, _, _) = batch
        ## replace half of the samples with generated ones...
        for i in range(int(states.shape[0] / 2)):
            tmp_result_states[i] = generated_samples[i]
            tmp_rewards[i] = [0]

        self.setData(states, actions, tmp_result_states, tmp_rewards)
        loss, _ = self._train()
        return loss

    def trainActor(self, states, actions, result_states, rewards):
        """Run one generator update.

        Returns a tuple (mean discriminator value, generator MSE).  The
        inputs are expected to be normalized already, since they are
        passed on without normalization.
        """
        settings = self.getSettings()
        self.setData(states, actions, result_states, rewards)

        ## Add MSE term (skipped only when explicitly disabled)
        mse_disabled = ('train_gan_mse' in settings
                        and settings['train_gan_mse'] == False)
        if not mse_disabled:
            self._trainGenerator_MSE()

        generated_samples = self.predict_batch(states, actions)
        result_state_grads = self.getResultStateGrads(generated_samples,
                                                      actions,
                                                      alreadyNormed=True)[0]
        discriminator_value = self._discriminate()

        # Gradient inversion, from "Deep Reinforcement Learning in
        # Parameterized Action Space" (Hausknecht and Stone): scale each
        # gradient by the remaining distance to the bound it points at.
        for i in range(result_state_grads.shape[0]):
            for j in range(result_state_grads.shape[1]):
                if result_state_grads[i, j] > 0:
                    inversion = (1.0 - generated_samples[i, j]) / 2.0
                else:
                    inversion = (generated_samples[i, j] - (-1.0)) / 2.0
                result_state_grads[i, j] = (result_state_grads[i, j]
                                            * inversion)

        if (settings["print_levels"][settings["print_level"]] >=
                settings["print_levels"]['debug']):
            print("Policy mean: ", np.mean(self._generate(), axis=0))
            print("Mean action grad: ", np.mean(result_state_grads, axis=0),
                  " std ", np.std(result_state_grads, axis=0))

        ## Set data for gradient
        self._model.setResultStates(result_states)
        self._modelTarget.setResultStates(result_states)
        # NOTE(review): this value is overwritten below.  The call is kept
        # because a compiled function may advance random state of
        # stochastic layers, so dropping it could change later results.
        error_MSE = self._bellman_error()
        ## Why the -1.0??
        ## Because the SGD method is always performing MINIMIZATION!!
        self._result_state_grad_shared.set_value(-1.0 * result_state_grads)
        self._trainGenerator()
        error_MSE = self._bellman_error()
        return (np.mean(discriminator_value), error_MSE)

    def train(self, states, actions, result_states, rewards):
        """Train discriminator, generator and, if enabled, the reward model.

        Returns a tuple (discriminator loss, trainActor result).
        """
        loss = self.trainCritic(states, actions, result_states, rewards)
        lossActor = self.trainActor(states, actions, result_states, rewards)
        settings = self.getSettings()
        if settings['train_reward_predictor']:
            self.setData(states, actions, result_states, rewards)
            lossReward = self._train_reward()
            if (settings["print_levels"][settings["print_level"]] >=
                    settings["print_levels"]['train']):
                print("Loss Reward: ", lossReward)
        return (loss, lossActor)

    def predict(self, state, action):
        """Predict the next state for an un-normalized state/action pair.

        The generator output is scaled back with the state bounds.
        """
        float_type = self.getSettings()['float_type']
        state = np.array(norm_state(state, self._state_bounds),
                         dtype=float_type)
        action = np.array(norm_action(action, self._action_bounds),
                          dtype=float_type)
        self.setData(state, action)
        return scale_state(self._generate(), self._state_bounds)

    def predict_batch(self, states, actions):
        """Return the raw generator output for a batch.

        The inputs should already be normalized; the output is not scaled.
        """
        self.setData(states, actions)
        return self._generate()

    def q_value(self, state, action, next_state):
        """Return the discriminator output for one transition.

        The state, action and next state should NOT be normalized.
        """
        state = norm_state(state, self._state_bounds)
        state = np.array(state, dtype=theano.config.floatX)
        self._model.setStates(state)
        self._modelTarget.setStates(state)
        # NOTE(review): the action is normalized with norm_state rather
        # than norm_action; preserved from the original, confirm intent.
        action = norm_state(action, self.getActionBounds())
        self._model.setActions(action)
        self._modelTarget.setActions(action)
        nextState = norm_state(next_state, self.getStateBounds())
        self._model.setResultStates(nextState)
        self._modelTarget.setResultStates(nextState)
        return self._discriminate()

    def q_values(self, state):
        """Return scaled q values; the state should NOT be normalized.

        NOTE(review): relies on ``self._q_action`` and ``self._q_val``,
        which this class never defines; presumably provided elsewhere,
        otherwise this method raises AttributeError.
        """
        state = norm_state(state, self._state_bounds)
        state = np.array(state, dtype=theano.config.floatX)
        self._model.setStates(state)
        self._modelTarget.setStates(state)
        action = self._q_action()
        self._model.setActions(action)
        self._modelTarget.setActions(action)
        return scale_reward(self._q_val(), self.getRewardBounds()) * (
            1.0 / (1.0 - self.getSettings()['discount_factor']))

    def predict_std(self, state, deterministic_=True):
        """Return an array of zeros; this does nothing for a GAN.

        The array has one entry per element of the action bounds.
        """
        return np.array([0] * len(self._action_bounds))

    def predict_reward(self, state, action):
        """Predict the reward for an un-normalized state/action pair.

        The network output is scaled with the reward bounds.
        """
        float_type = self.getSettings()['float_type']
        state = np.array(norm_state(state, self._state_bounds),
                         dtype=float_type)
        action = np.array(norm_action(action, self._action_bounds),
                          dtype=float_type)
        self._model.setStates(state)
        self._model.setActions(action)
        predicted_reward = self._predict_reward()
        return scale_reward(predicted_reward, self.getRewardBounds())

    def predict_reward_batch(self, states, actions):
        """Return the raw reward-network output for a batch.

        Inputs are used as given (no normalization is applied here) and
        the output is not scaled.
        """
        self._model.setStates(states)
        self._model.setActions(actions)
        return self._predict_reward()

    def bellman_error(self, states, actions, result_states, rewards):
        """Return the generator's mean squared error on the batch."""
        self.setData(states, actions, result_states, rewards)
        return self._bellman_error()

    def reward_error(self, states, actions, result_states, rewards):
        """Return scaled reward target minus predicted reward."""
        self.setData(states, actions, result_states, rewards)
        return self._reward_error()

    def setStateBounds(self, state_bounds):
        """Set the state bounds here and on the experience memory."""
        super(GAN, self).setStateBounds(state_bounds)
        self._experience.setStateBounds(copy.deepcopy(self.getStateBounds()))

    def setActionBounds(self, action_bounds):
        """Set the action bounds here and on the experience memory."""
        super(GAN, self).setActionBounds(action_bounds)
        self._experience.setActionBounds(copy.deepcopy(self.getActionBounds()))

    def setRewardBounds(self, reward_bounds):
        """Set the reward bounds here and on the experience memory."""
        super(GAN, self).setRewardBounds(reward_bounds)
        self._experience.setRewardBounds(copy.deepcopy(self.getRewardBounds()))
def _copyModuleSource(dotted_name, directory):
    """Archive the source file of the module named by ``dotted_name``.

    ``dotted_name`` is a dotted path whose last component is dropped (for
    example ``package.module.ClassName`` -> ``package.module``). The remaining
    dots are turned into ``/`` to find the ``.py`` file, relative to the
    current working directory, and its text is written to
    ``directory + "<package.module>.py"``.
    """
    import os

    module_name = dotted_name[:dotted_name.rfind(".")]
    source_path = module_name.replace(".", "/") + ".py"
    print("model file name:", module_name)
    print("os.path.basename(file_name): ", os.path.basename(module_name))
    with open(source_path, 'r') as source_file:
        source = source_file.read()
    with open(directory + module_name + ".py", 'w') as out_file:
        out_file.write(source)


def trainForwardDynamics(settingsFileName):
    """Pre-train a forward dynamics model from a saved experience buffer.

    Reads the JSON settings in ``settingsFileName``, loads the experience
    buffer ``<agent name>expBufferInit.hdf5`` from the run's data directory,
    creates the forward dynamics model and trains it for ``settings["rounds"]``
    rounds of ``settings["epochs"]`` mini-batches each.

    Every ``plotting_update_freq_num_rounds`` rounds the losses on the last
    batch are evaluated, recorded and (optionally) plotted. Every
    ``saving_update_freq_num_rounds`` rounds the model is pickled with ``dill``
    if its mean loss improved, and the recorded metrics are written as JSON
    when ``save_trainData`` is set.

    Relies on ``np``, ``json``, ``dill`` and ``datetime`` being imported at
    module level, as the original code did.

    Args:
        settingsFileName: path of the JSON settings file.

    Raises:
        ValueError: if ``settings['action_space_continuous']`` is false. No
            action bounds are defined in that case, and the model factory
            needs them (this used to surface as a ``NameError``).
    """
    np.random.seed(23)
    with open(settingsFileName) as settings_file:
        settings = json.load(settings_file)
    print("Settings: ", str(json.dumps(settings)))

    import os
    # NOTE(review): presumably this has to be in the environment before the
    # project imports below load Theano -- keep it ahead of them.
    os.environ['THEANO_FLAGS'] = "mode=FAST_RUN,device=" + settings[
        'training_processor_type'] + ",floatX=" + settings['float_type']

    from util.SimulationUtil import validateSettings
    from util.SimulationUtil import getDataDirectory, getAgentName
    from util.SimulationUtil import createForwardDynamicsModel, createRLAgent
    # Unused below, but kept in case importing them has side effects.
    from model.NeuralNetwork import NeuralNetwork
    from util.ExperienceMemory import ExperienceMemory
    import matplotlib.pyplot as plt
    import time

    settings = validateSettings(settings)
    visualize = settings['visualize_learning']
    if visualize:
        # Imported once here because both visualizers below need it.
        from NNVisualize import NNVisualize

    directory = getDataDirectory(settings)
    if not os.path.exists(directory):
        os.makedirs(directory)

    # Keep a copy of the model source next to the results for reference.
    if settings['train_forward_dynamics']:
        if "." in settings['forward_dynamics_model_type']:
            _copyModuleSource(settings['forward_dynamics_model_type'], directory)

    discrete_actions = np.array(settings['discrete_actions'])
    rounds = settings["rounds"]
    epochs = settings["epochs"]
    reward_bounds = np.array([[-10.1], [0.0]])
    batch_size = settings["batch_size"]
    state_bounds = np.array(settings['state_bounds'])
    print("Sim config file name: ", str(settings["sim_config_file"]))

    action_bounds = None
    if settings['action_space_continuous']:
        action_bounds = np.array(settings["action_bounds"], dtype=float)
        experience = ExperienceMemory(len(state_bounds[0]), len(action_bounds[0]),
                                      settings['expereince_length'],
                                      continuous_actions=True, settings=settings)
    else:
        experience = ExperienceMemory(len(state_bounds[0]), 1,
                                      settings['expereince_length'])
    experience.setSettings(settings)
    experience.loadFromFile(directory + getAgentName() + "expBufferInit.hdf5")
    # From here on use the bounds stored with the loaded experience.
    state_bounds = experience._state_bounds
    print("Samples in experience: ", experience.samples())

    if settings['train_forward_dynamics']:
        if action_bounds is None:
            raise ValueError(
                "trainForwardDynamics needs settings['action_space_continuous'] "
                "to be true: no action bounds are defined for discrete actions")
        if settings['forward_dynamics_model_type'] == "SingleNet":
            print("Creating forward dynamics network: Using single network model")
            model = createRLAgent(settings['agent_name'], state_bounds,
                                  discrete_actions, reward_bounds, settings)
            forwardDynamicsModel = createForwardDynamicsModel(
                settings, state_bounds, action_bounds, None, None, agentModel=model)
        else:
            print("Creating forward dynamics network")
            forwardDynamicsModel = createForwardDynamicsModel(
                settings, state_bounds, action_bounds, None, None, agentModel=None)
        if visualize:
            model_name = settings['forward_dynamics_model_type']
            # Text after the last "." (the whole name when there is no ".").
            model_name = model_name[model_name.rfind(".") + 1:]
            nlv = NNVisualize(title=str("Forward Dynamics Model") + " with " + str(model_name))
            nlv.setInteractive()
            nlv.init()
    if settings['train_reward_predictor'] and visualize:
        rewardlv = NNVisualize(title=str("Reward Model") + " with " + str(settings["model_type"]),
                               settings=settings)
        rewardlv.setInteractive()
        rewardlv.init()

    trainData = {}
    for metric in ("reward", "bellman_error", "discount_error",
                   "forward_dynamics_loss", "forward_dynamics_reward_loss", "eval"):
        trainData["mean_" + metric] = []
        trainData["std_" + metric] = []

    best_dynamicsLosses = 1000000
    _states, _actions, _result_states, _rewards, _falls, _G_ts, exp_actions__ = experience.get_batch(
        batch_size)
    forwardDynamicsModel.setData(_states, _actions, _result_states)

    for round_ in range(rounds):
        t0 = time.time()
        for epoch in range(epochs):
            _states, _actions, _result_states, _rewards, _falls, _G_ts, exp_actions__ = experience.get_batch(
                batch_size)
            dynamicsLoss = forwardDynamicsModel.train(_states, _actions, _result_states, _rewards)
        t1 = time.time()

        # Round 0 always satisfies this check, so mean_dynamicsLosses is
        # defined before the saving check below reads it.
        if (round_ % settings['plotting_update_freq_num_rounds']) == 0:
            # Losses are evaluated on the last training batch of this round.
            dynamicsLoss_ = forwardDynamicsModel.bellman_error(
                _states, _actions, _result_states, _rewards)
            if settings['use_stochastic_forward_dynamics']:
                dynamicsLoss = np.mean(dynamicsLoss_)
            else:
                dynamicsLoss = np.mean(np.fabs(dynamicsLoss_))

            if settings['train_forward_dynamics']:
                mean_dynamicsLosses = dynamicsLoss
                std_dynamicsLosses = np.std(dynamicsLoss_)
                trainData["mean_forward_dynamics_loss"].append(mean_dynamicsLosses)
                trainData["std_forward_dynamics_loss"].append(std_dynamicsLosses)
                print("Round: " + str(round_) + " Epoch: " + str(epoch) +
                      " ForwardPredictionLoss: " + str(dynamicsLoss) + " in " +
                      str(datetime.timedelta(seconds=(t1 - t0))) + " seconds")
                if visualize:
                    nlv.updateLoss(
                        np.array(trainData["mean_forward_dynamics_loss"]),
                        np.array(trainData["std_forward_dynamics_loss"]))
                    nlv.redraw()
                    nlv.setInteractiveOff()
                    nlv.saveVisual(directory + "trainingGraphNN")
                    nlv.setInteractive()

            if settings['train_reward_predictor']:
                dynamicsRewardLoss_ = forwardDynamicsModel.reward_error(
                    _states, _actions, _result_states, _rewards)
                trainData["mean_forward_dynamics_reward_loss"].append(
                    np.mean(np.fabs(dynamicsRewardLoss_)))
                trainData["std_forward_dynamics_reward_loss"].append(
                    np.std(dynamicsRewardLoss_))
                if visualize:
                    rewardlv.updateLoss(
                        np.array(trainData["mean_forward_dynamics_reward_loss"]),
                        np.array(trainData["std_forward_dynamics_reward_loss"]))
                    rewardlv.redraw()
                    rewardlv.setInteractiveOff()
                    rewardlv.saveVisual(directory + "rewardTrainingGraph")
                    rewardlv.setInteractive()

        if (round_ % settings['saving_update_freq_num_rounds']) == 0:
            if mean_dynamicsLosses < best_dynamicsLosses:
                best_dynamicsLosses = mean_dynamicsLosses
                print("Saving BEST current forward dynamics model: " + str(best_dynamicsLosses))
                file_name_dynamics = directory + "forward_dynamics_" + "_Best_pretrain.pkl"
                with open(file_name_dynamics, 'wb') as model_file:
                    dill.dump(forwardDynamicsModel, model_file)
            if settings['save_trainData']:
                # json cannot serialize np.float32, so convert to plain floats.
                serializable = {key: [float(value) for value in values]
                                for key, values in trainData.items()}
                with open(directory + "FD_trainingData_" + str(settings['agent_name']) + ".json",
                          'w') as fp:
                    json.dump(serializable, fp)