def sendData(settings):
    """Email the current training graph for the simulation in ``settings``.

    The attachment path is built from the data directory and agent name
    returned by ``util.SimulationUtil`` for ``settings``.  Sending is best
    effort: any failure is printed and swallowed so that a mail problem never
    interrupts a running simulation.

    Args:
        settings: simulation settings, passed to ``getDataDirectory`` and
            ``getAgentName``.
    """
    from util.SimulationUtil import getDataDirectory, getAgentName

    from_email = "*****@*****.**"
    email_subject = "Simulation Data"
    email_text = """
    This email includes some data on the current state of the simulation.
    \n
    Take care.\n
    """
    directory = getDataDirectory(settings)
    agentName = getAgentName(settings)
    print("Data folder: ", directory)
    trainingGraph = directory + agentName + '_' + ".png"
    try:
        send_mail(send_from=from_email,
                  send_to=['*****@*****.**'],
                  subject=email_subject,
                  text=email_text,
                  files=[trainingGraph])
    except Exception as e:
        # Single-argument print(...) produces the same output on Python 2
        # and Python 3, unlike the print statement it replaces.
        print('Error email data: %s' % e)
    except:  # noqa: E722 - deliberately best effort, as in the original
        # Only reached for exceptions that do not derive from Exception
        # (e.g. KeyboardInterrupt); they are reported and swallowed.
        print("Emailling of simulation data failed ")
        print("Unexpected error: %s" % (sys.exc_info()[0],))
def combineNetworkModels(settings_file_name):
    """Inject a previously trained agent into a freshly created network.

    A new agent is created from the settings file (with
    ``load_saved_model`` forced to ``False``), the pickled agent
    ``<data dir>/<agent name>.pkl`` is loaded, and its agent, combined and
    merge-layer parameters are copied into the new agent.  The trailing
    columns of the new state bounds (from ``settings['num_terrain_features']``
    onwards) are overwritten with the old agent's state bounds, action and
    reward bounds are taken from the old agent, and the result is written to
    ``<data dir>/<agent name>_Injected.pkl``.

    Args:
        settings_file_name: path of the settings file, read via
            ``model.ModelUtil.getSettings``.
    """
    from model.ModelUtil import getSettings
    settings = getSettings(settings_file_name)

    import os
    os.environ['THEANO_FLAGS'] = ("mode=FAST_RUN,device=" +
                                  settings['training_processor_type'] +
                                  ",floatX=" + settings['float_type'])

    ## Theano needs to be imported after the flags are set.
    ## The imports below are kept exactly as they were (even the names that
    ## are not referenced here) so module import order is unchanged.
    from ModelEvaluation import SimWorker, evalModelParrallel, collectExperience
    from model.LearningAgent import LearningAgent, LearningWorker
    from util.SimulationUtil import validateSettings, createEnvironment, createRLAgent, createActor
    from util.SimulationUtil import getDataDirectory, createForwardDynamicsModel, createSampler, getAgentName
    from util.ExperienceMemory import ExperienceMemory
    from RLVisualize import RLVisualize
    from NNVisualize import NNVisualize

    directory = getDataDirectory(settings)
    reward_bounds = np.array(settings["reward_bounds"])
    state_bounds = np.array(settings['state_bounds'])
    discrete_actions = np.array(settings['discrete_actions'])
    print("Sim config file name: " + str(settings["sim_config_file"]))

    # Always build the new network from scratch; the old weights are
    # injected explicitly below.
    settings['load_saved_model'] = False
    new_model = createRLAgent(settings['agent_name'], state_bounds,
                              discrete_actions, reward_bounds, settings)

    print("New Network Critic shape")
    for critic_param in new_model.getNetworkParameters()[0]:
        print(critic_param.shape)
    print("New Network Critic shape, done")

    # The most recent agent is used; the "_Best.pkl" variant was a disabled
    # alternative in the original code.
    file_name = directory + getAgentName() + ".pkl"
    print("Loading model: ", file_name)
    # NOTE(review): dill.load executes arbitrary code from the file; only
    # load model files from a trusted location.
    with open(file_name, 'rb') as model_file:
        old_model = dill.load(model_file)
    print("State Length: ", len(old_model.getStateBounds()[0]))

    new_model.setAgentNetworkParamters(old_model)
    new_model.setCombinedNetworkParamters(old_model)
    new_model.setMergeLayerNetworkParamters(old_model,
                                            zeroInjectedMergeLayer=True)

    params = new_model.getNetworkParameters()
    print("New Network Critic shape")
    for param in params[0]:
        print(param.shape)
    for param in params[1]:
        print(param.shape)

    ### Modify state bounds: keep the new leading (terrain) columns and reuse
    ### the old agent's bounds for the remaining columns.
    first_old_column = settings['num_terrain_features']
    state_bounds[:, first_old_column:len(state_bounds[0])] = (
        old_model.getStateBounds())
    print("State bounds: ", state_bounds.shape)
    print(state_bounds)
    new_model.setStateBounds(state_bounds)
    new_model.setActionBounds(old_model.getActionBounds())
    new_model.setRewardBounds(old_model.getRewardBounds())

    file_name = directory + getAgentName() + "_Injected.pkl"
    with open(file_name, 'wb') as model_file:
        dill.dump(new_model, model_file)
def _copy_module_source(dotted_name, directory):
    """Copy the ``.py`` file of the module part of ``dotted_name`` into ``directory``."""
    import os
    # "pkg.Module.Class" -> module "pkg.Module" -> file "pkg/Module.py"
    module_name = dotted_name[:dotted_name.rfind(".")]
    source_path = module_name.replace(".", "/") + ".py"
    print("model file name:", module_name)
    print("os.path.basename(file_name): ", os.path.basename(module_name))
    with open(source_path, 'r') as source_file:
        source_text = source_file.read()
    with open(directory + module_name + ".py", 'w') as out_file:
        out_file.write(source_text)


def _short_title(dotted_name):
    """Return the text after the last '.' of ``dotted_name`` (all of it if there is none)."""
    return dotted_name[dotted_name.rfind(".") + 1:]


def _start_visualizer(visualizer):
    """Switch ``visualizer`` to interactive mode, initialise it and return it."""
    visualizer.setInteractive()
    visualizer.init()
    return visualizer


def _update_policy_message(p, bounds_source, masterAgent, settings):
    """Build a fresh 'Update_Policy' message carrying bounds and network parameters."""
    data = ('Update_Policy', p,
            bounds_source.getStateBounds(),
            bounds_source.getActionBounds(),
            bounds_source.getRewardBounds(),
            masterAgent.getPolicy().getNetworkParameters())
    if (settings['train_forward_dynamics']):
        data = data + (
            masterAgent.getForwardDynamics().getNetworkParameters(),)
    # A new dict per message: multiprocessing queues pickle queued objects
    # later in a feeder thread, so a shared dict must not be mutated.
    return {'type': 'Update_Policy', 'data': data}


def _put_without_blocking(queues, message):
    """Offer ``message`` to every queue, reporting (not waiting on) full queues."""
    for m_q in queues:
        try:
            m_q.put(message, False)
        except Exception:
            print("SimWorker model parameter message queue full: ",
                  m_q.qsize())


def _drain_queue(label, queue):
    """Discard everything currently in ``queue``, printing its size before and after."""
    print(label, queue.qsize())
    while (not queue.empty()):
        ## Don't block
        try:
            queue.get(False)
        except Exception:
            pass
    print(label, queue.qsize())


def _dump_pickle(obj, file_name):
    """Serialise ``obj`` to ``file_name`` with dill, always closing the file."""
    with open(file_name, 'wb') as out_file:
        dill.dump(obj, out_file)


def _write_train_data(trainData, file_name, **dump_kwargs):
    """Write ``trainData`` to ``file_name`` as JSON after converting values to float."""
    with open(file_name, 'w') as out_file:
        ## because json does not serialize np.float32
        for key in trainData:
            trainData[key] = [float(i) for i in trainData[key]]
        json.dump(trainData, out_file, **dump_kwargs)


def trainModelParallel(inputData):
    """Train one agent with parallel simulation workers.

    The function configures Theano/Keras from the settings, stores a copy of
    the settings (and of the model/agent source files) in the data directory,
    creates the agent, the optional forward dynamics model and the simulation
    and learning workers, bootstraps the experience memory and then runs
    ``settings['rounds']`` rounds of ``settings['epochs']`` epochs.  The
    agent is evaluated every ``plotting_update_freq_num_rounds`` rounds and
    saved every ``saving_update_freq_num_rounds`` rounds; final versions of
    the policy, the training statistics and (if trained) the forward dynamics
    model are written when training ends.

    Args:
        inputData: sequence whose first item is the settings file name and
            whose second item is the settings dict.

    Side effects:
        Sets ``THEANO_FLAGS``, seeds ``np.random``, starts worker processes,
        writes files below the data directory and publishes the workers and
        queues through module-level globals so that an external handler can
        terminate them.

    NOTE(review): ``action_bounds`` is only assigned when
    ``settings['action_space_continuous']`` is true but is used
    unconditionally afterwards, exactly as in the original code.
    """
    global sim_processes, learning_processes
    global _input_anchor_queue, _output_experience_queue
    global _eval_episode_data_queue, _sim_work_queues

    settingsFileName = inputData[0]
    settings = inputData[1]
    np.random.seed(int(settings['random_seed']))

    import os
    theano_flags = ("mode=FAST_RUN,device=" +
                    settings['training_processor_type'] + ",floatX=" +
                    settings['float_type'])
    existing_flags = os.environ.get('THEANO_FLAGS', '')
    if existing_flags:
        # Flags are comma separated; without the separator the first new
        # flag would be glued onto the last existing one.
        theano_flags = existing_flags.rstrip(",") + "," + theano_flags
    os.environ['THEANO_FLAGS'] = theano_flags

    import keras.backend
    keras.backend.set_floatx(settings['float_type'])
    print("K.floatx()", keras.backend.floatx())

    ## These imports stay below the flag setup, as in the original code.
    from ModelEvaluation import SimWorker, evalModelParrallel, collectExperience, simEpoch, evalModel, simModelParrallel
    from model.ModelUtil import validBounds, fixBounds, anneal_value
    from model.LearningAgent import LearningAgent, LearningWorker
    from util.SimulationUtil import validateSettings
    from util.SimulationUtil import createEnvironment
    from util.SimulationUtil import createRLAgent
    from util.SimulationUtil import createActor, getAgentName
    from util.SimulationUtil import getDataDirectory, createForwardDynamicsModel, createSampler
    from util.ExperienceMemory import ExperienceMemory
    from RLVisualize import RLVisualize
    from NNVisualize import NNVisualize

    settings = validateSettings(settings)
    directory = getDataDirectory(settings)
    if not os.path.exists(directory):
        os.makedirs(directory)

    # copy settings file
    out_file_name = directory + os.path.basename(settingsFileName)
    print("Saving settings file with data: ", out_file_name)
    with open(out_file_name, 'w') as out_file:
        out_file.write(json.dumps(settings, indent=4))

    ### Try and save algorithm and model files for reference
    if "." in settings['model_type']:
        _copy_module_source(settings['model_type'], directory)
    if "." in settings['agent_name']:
        _copy_module_source(settings['agent_name'], directory)
    if (settings['train_forward_dynamics']):
        if "." in settings['forward_dynamics_model_type']:
            _copy_module_source(settings['forward_dynamics_model_type'],
                                directory)

    rounds = settings["rounds"]
    epochs = settings["epochs"]
    reward_bounds = np.array(settings["reward_bounds"])
    batch_size = settings["batch_size"]
    state_bounds = np.array(settings['state_bounds'])
    discrete_actions = np.array(settings['discrete_actions'])
    print("Sim config file name: " + str(settings["sim_config_file"]))

    use_eval_workers = ('override_sim_env_id' in settings and
                        (settings['override_sim_env_id'] != False))

    if (settings['num_available_threads'] == 1):
        input_anchor_queue = multiprocessing.Queue(
            settings['queue_size_limit'])
        input_anchor_queue_eval = multiprocessing.Queue(
            settings['queue_size_limit'])
        output_experience_queue = multiprocessing.Queue(
            settings['queue_size_limit'])
        eval_episode_data_queue = multiprocessing.Queue(
            settings['queue_size_limit'])
    else:
        input_anchor_queue = multiprocessing.Queue(settings['epochs'])
        input_anchor_queue_eval = multiprocessing.Queue(settings['epochs'])
        output_experience_queue = multiprocessing.Queue(
            settings['queue_size_limit'])
        eval_episode_data_queue = multiprocessing.Queue(
            settings['eval_epochs'])

    if (settings['on_policy']):
        ## So that off policy agent does not learn
        output_experience_queue = None

    sim_work_queues = []

    if settings['action_space_continuous']:
        action_bounds = np.array(settings["action_bounds"], dtype=float)

    ### Using a wrapper for the type of actor now
    actor = createActor(settings['environment_type'], settings, None)
    exp_val = None

    if (not validBounds(action_bounds)):
        # Check that the action bounds are specified correctly
        print("Action bounds invalid: ", action_bounds)
        sys.exit()
    if (not validBounds(state_bounds)):
        # Probably did not collect enough bootstrapping samples to get good
        # state bounds.
        print("State bounds invalid: ", state_bounds)
        state_bounds = fixBounds(np.array(state_bounds))
        bound_fixed = validBounds(state_bounds)
        print("State bounds fixed: ", bound_fixed)
        sys.exit()
    if (not validBounds(reward_bounds)):
        print("Reward bounds invalid: ", reward_bounds)
        sys.exit()

    # This memory is replaced by the one returned from collectExperience
    # below; it is still created as in the original code.
    if settings['action_space_continuous']:
        experience = ExperienceMemory(len(state_bounds[0]),
                                      len(action_bounds[0]),
                                      settings['expereince_length'],
                                      continuous_actions=True,
                                      settings=settings)
    else:
        experience = ExperienceMemory(len(state_bounds[0]), 1,
                                      settings['expereince_length'])
    experience.setSettings(settings)

    ### Visualizers are kept in local names so they stay alive while training.
    if settings['visualize_learning']:
        title = _short_title(settings['agent_name'])
        rlv = _start_visualizer(
            RLVisualize(title=title + " agent on " +
                        str(settings['environment_type']),
                        settings=settings))

    if (settings['train_forward_dynamics']):
        if settings['visualize_learning']:
            title = _short_title(settings['forward_dynamics_model_type'])
            nlv = _start_visualizer(
                NNVisualize(title=str("Dynamics Model") + " with " + title,
                            settings=settings))

    if (settings['train_reward_predictor']):
        if settings['visualize_learning']:
            title = _short_title(settings['forward_dynamics_model_type'])
            rewardlv = _start_visualizer(
                NNVisualize(title=str("Reward Model") + " with " + title,
                            settings=settings))

    if (settings['debug_critic']):
        criticLosses = []
        criticRegularizationCosts = []
        if (settings['visualize_learning']):
            title = _short_title(settings['agent_name'])
            critic_loss_viz = _start_visualizer(
                NNVisualize(title=str("Critic Loss") + " with " + title))
            critic_regularization_viz = _start_visualizer(
                NNVisualize(title=str("Critic Reg Cost") + " with " + title))

    if (settings['debug_actor']):
        actorLosses = []
        actorRegularizationCosts = []
        if (settings['visualize_learning']):
            title = _short_title(settings['agent_name'])
            actor_loss_viz = _start_visualizer(
                NNVisualize(title=str("Actor Loss") + " with " + title))
            actor_regularization_viz = _start_visualizer(
                NNVisualize(title=str("Actor Reg Cost") + " with " + title))

    model = createRLAgent(settings['agent_name'], state_bounds,
                          discrete_actions, reward_bounds, settings)

    forwardDynamicsModel = None
    if (settings['train_forward_dynamics']):
        if (settings['forward_dynamics_model_type'] == "SingleNet"):
            print(
                "Creating forward dynamics network: Using single network model"
            )
            forwardDynamicsModel = createForwardDynamicsModel(
                settings, state_bounds, action_bounds, None, None,
                agentModel=model)
        else:
            print("Creating forward dynamics network")
            forwardDynamicsModel = createForwardDynamicsModel(
                settings, state_bounds, action_bounds, None, None,
                agentModel=None)
        forwardDynamicsModel.setActor(actor)
        forwardDynamicsModel.init(len(state_bounds[0]),
                                  len(action_bounds[0]), state_bounds,
                                  action_bounds, actor, None, settings)

    (agent, learning_workers) = createLearningAgent(
        settings, output_experience_queue, state_bounds, action_bounds,
        reward_bounds)
    masterAgent = agent

    ### These are the workers for training
    (sim_workers, sim_work_queues) = createSimWorkers(
        settings, input_anchor_queue, output_experience_queue,
        eval_episode_data_queue, model, forwardDynamicsModel, exp_val,
        state_bounds, action_bounds, reward_bounds)

    eval_sim_workers = sim_workers
    eval_sim_work_queues = sim_work_queues
    if use_eval_workers:
        (eval_sim_workers, eval_sim_work_queues) = createSimWorkers(
            settings, input_anchor_queue_eval, output_experience_queue,
            eval_episode_data_queue, model, forwardDynamicsModel, exp_val,
            state_bounds, action_bounds, reward_bounds,
            default_sim_id=settings['override_sim_env_id'])
    else:
        input_anchor_queue_eval = input_anchor_queue

    best_eval = -100000000.0
    best_dynamicsLosses = best_eval * -1.0
    dynamicsLosses = []
    dynamicsRewardLosses = []
    # Stays None until an evaluation round has produced a dynamics loss.
    mean_dynamicsLosses = None

    for lw in learning_workers:
        print("Learning worker")
        print(lw)

    if (int(settings["num_available_threads"]) > 1):
        for sw in sim_workers:
            print("Sim worker")
            print(sw)
            sw.start()
        if use_eval_workers:
            for sw in eval_sim_workers:
                print("Sim worker")
                print(sw)
                sw.start()

    ## This needs to be done after the simulation worker processes are created
    exp_val = createEnvironment(settings["forwardDynamics_config_file"],
                                settings['environment_type'], settings,
                                render=settings['shouldRender'], index=0)
    exp_val.setActor(actor)
    exp_val.getActor().init()
    exp_val.init()

    ### This is for a single-threaded Synchronous sim only.
    if (int(settings["num_available_threads"]) == 1):
        # This is okay if there is one thread only...
        sim_workers[0].setEnvironment(exp_val)
        sim_workers[0].start()
        if use_eval_workers:
            eval_sim_workers[0].setEnvironment(exp_val)
            eval_sim_workers[0].start()

    masterAgent.setPolicy(model)
    if (settings['train_forward_dynamics']):
        masterAgent.setForwardDynamics(forwardDynamicsModel)

    tmp_p = 1.0
    if (settings['load_saved_model']):
        tmp_p = settings['min_epsilon']
    message = _update_policy_message(tmp_p, model, masterAgent, settings)
    for m_q in sim_work_queues:
        print("trainModel: Sending current network parameters: ", m_q)
        m_q.put(message)

    ### Bootstrap the experience memory; this also returns measured bounds.
    if (int(settings["num_available_threads"]) == 1):
        experience, state_bounds, reward_bounds, action_bounds = collectExperience(
            actor, exp_val, model, settings, sim_work_queues=None,
            eval_episode_data_queue=None)
    else:
        if (settings['on_policy']):
            experience, state_bounds, reward_bounds, action_bounds = collectExperience(
                actor, None, model, settings,
                sim_work_queues=sim_work_queues,
                eval_episode_data_queue=eval_episode_data_queue)
        else:
            experience, state_bounds, reward_bounds, action_bounds = collectExperience(
                actor, None, model, settings,
                sim_work_queues=input_anchor_queue,
                eval_episode_data_queue=eval_episode_data_queue)
    masterAgent.setExperience(experience)
    if ('keep_seperate_fd_exp_buffer' in settings and
            (settings['keep_seperate_fd_exp_buffer'])):
        masterAgent.setFDExperience(copy.deepcopy(experience))

    if (not validBounds(action_bounds)):
        # Check that the action bounds are specified correctly
        print("Action bounds invalid: ", action_bounds)
        sys.exit()
    if (not validBounds(state_bounds)):
        # Probably did not collect enough bootstrapping samples to get good
        # state bounds.
        print("State bounds invalid: ", state_bounds)
        state_bounds = fixBounds(np.array(state_bounds))
        bound_fixed = validBounds(state_bounds)
        print("State bounds fixed: ", bound_fixed)
    if (not validBounds(reward_bounds)):
        print("Reward bounds invalid: ", reward_bounds)
        sys.exit()

    print("Reward History: ", experience._reward_history)
    print("Action History: ", experience._action_history)
    print("Action Mean: ", np.mean(experience._action_history))
    print("Experience Samples: ", (experience.samples()))

    if (settings["save_experience_memory"]):
        print("Saving initial experience memory")
        file_name = directory + getAgentName() + "_expBufferInit.hdf5"
        experience.saveToFile(file_name)

    if (settings['load_saved_model'] or
            (settings['load_saved_model'] == 'network_and_scales')):
        ## Transfer learning
        experience.setStateBounds(copy.deepcopy(model.getStateBounds()))
        experience.setRewardBounds(copy.deepcopy(model.getRewardBounds()))
        experience.setActionBounds(copy.deepcopy(model.getActionBounds()))
        model.setSettings(settings)
    else:
        ## Normal
        model.setStateBounds(state_bounds)
        model.setActionBounds(action_bounds)
        model.setRewardBounds(reward_bounds)
        experience.setStateBounds(copy.deepcopy(model.getStateBounds()))
        experience.setRewardBounds(copy.deepcopy(model.getRewardBounds()))
        experience.setActionBounds(copy.deepcopy(model.getActionBounds()))

    masterAgent_message_queue = multiprocessing.Queue(settings['epochs'])

    if (settings['train_forward_dynamics']):
        if (not settings['load_saved_model']):
            forwardDynamicsModel.setStateBounds(state_bounds)
            forwardDynamicsModel.setActionBounds(action_bounds)
            forwardDynamicsModel.setRewardBounds(reward_bounds)
        masterAgent.setForwardDynamics(forwardDynamicsModel)

    ## Now everything related to the exp memory needs to be updated
    bellman_errors = []
    masterAgent.setPolicy(model)
    print("Master agent state bounds: ",
          repr(masterAgent.getPolicy().getStateBounds()))
    for sw in sim_workers:
        # Need to update parameter bounds for models
        print("exp: ", sw._exp)
        print("sw modle: ", sw._model.getPolicy())

    ## If not on policy
    if (not settings['on_policy']):
        for lw in learning_workers:
            lw._agent.setPolicy(model)
            lw.setMasterAgentMessageQueue(masterAgent_message_queue)
            lw.updateExperience(experience)
            print("ls policy: ", lw._agent.getPolicy())
            lw.start()

    tmp_p = 1.0
    if (settings['load_saved_model']):
        tmp_p = settings['min_epsilon']
    message = _update_policy_message(tmp_p, model, masterAgent, settings)
    for m_q in sim_work_queues:
        print("trainModel: Sending current network parameters: ", m_q)
        m_q.put(message)

    del model

    ## Give global access to processes so they can be terminated when
    ## ctrl+c is pressed
    sim_processes = sim_workers
    learning_processes = learning_workers
    _input_anchor_queue = input_anchor_queue
    _output_experience_queue = output_experience_queue
    _eval_episode_data_queue = eval_episode_data_queue
    _sim_work_queues = sim_work_queues

    trainData = {}
    for stat_name in ("reward", "bellman_error", "discount_error",
                      "forward_dynamics_loss",
                      "forward_dynamics_reward_loss", "eval", "critic_loss",
                      "critic_regularization_cost", "actor_loss",
                      "actor_regularization_cost"):
        trainData["mean_" + stat_name] = []
        trainData["std_" + stat_name] = []
    trainData["anneal_p"] = []

    print("Starting first round")
    for round_ in range(0, rounds):
        ### Anneal the exploration parameter p for this round
        if ('annealing_schedule' in settings and
                (settings['annealing_schedule'] != False)):
            # True division on purpose: round_ / rounds floors to 0 for
            # integers under Python 2, which would freeze the schedule.
            p = anneal_value(float(round_) / rounds, settings_=settings)
        else:
            p = ((settings['initial_temperature'] / math.log(round_ + 2)))
        # Keep p between min_epsilon and epsilon
        p = max(settings['min_epsilon'], min(settings['epsilon'], p))
        if (settings['load_saved_model']):
            p = settings['min_epsilon']

        for epoch in range(epochs):
            if (settings['on_policy']):
                out = simModelParrallel(
                    sw_message_queues=sim_work_queues,
                    model=masterAgent,
                    settings=settings,
                    eval_episode_data_queue=eval_episode_data_queue,
                    anchors=settings['num_on_policy_rollouts'])
                (tuples, discounted_sum, q_value, evalData) = out
                # tuples = states, actions, result_states, rewards, falls,
                #          G_ts, advantage, exp_actions
                (sim_states, sim_actions, sim_result_states, sim_rewards,
                 sim_falls, sim_G_ts, sim_advantage,
                 sim_exp_actions) = tuples
                masterAgent.train(_states=sim_states,
                                  _actions=sim_actions,
                                  _rewards=sim_rewards,
                                  _result_states=sim_result_states,
                                  _falls=sim_falls,
                                  _advantage=sim_advantage,
                                  _exp_actions=sim_exp_actions)

                if (('anneal_on_policy' in settings) and
                        settings['anneal_on_policy']):
                    p_tmp_ = p
                else:
                    p_tmp_ = 1.0
                message = _update_policy_message(p_tmp_, masterAgent,
                                                 masterAgent, settings)
                for m_q in sim_work_queues:
                    ## block on full queue
                    m_q.put(message)
                if use_eval_workers:
                    for m_q in eval_sim_work_queues:
                        ## block on full queue
                        m_q.put(message)
            else:
                episodeData = {}
                episodeData['data'] = epoch
                episodeData['type'] = 'sim'
                input_anchor_queue.put(episodeData)

            if masterAgent.getExperience().samples() >= batch_size:
                # Measure the current errors on one batch of experience
                states, actions, result_states, rewards, falls, G_ts, exp_actions = masterAgent.getExperience(
                ).get_batch(batch_size)
                error = masterAgent.bellman_error(states, actions, rewards,
                                                  result_states, falls)
                bellman_errors.append(error)
                if (settings['debug_critic']):
                    # uses previous call batch data
                    loss__ = masterAgent.getPolicy()._get_critic_loss()
                    criticLosses.append(loss__)
                    regularizationCost__ = masterAgent.getPolicy(
                    )._get_critic_regularization()
                    criticRegularizationCosts.append(regularizationCost__)
                if (settings['debug_actor']):
                    # uses previous call batch data
                    loss__ = masterAgent.getPolicy()._get_actor_loss()
                    actorLosses.append(loss__)
                    regularizationCost__ = masterAgent.getPolicy(
                    )._get_actor_regularization()
                    actorRegularizationCosts.append(regularizationCost__)

                if not all(np.isfinite(error)):
                    print(
                        "States: " + str(states) + " ResultsStates: " +
                        str(result_states) + " Rewards: " + str(rewards) +
                        " Actions: " + str(actions) + " Falls: ",
                        str(falls))
                    print("Bellman Error is Nan: " + str(error) +
                          str(np.isfinite(error)))
                    sys.exit()

                error = np.mean(np.fabs(error))
                if error > 10000:
                    print("Error to big: ")
                    print(states, actions, rewards, result_states)

                if (settings['train_forward_dynamics']):
                    dynamicsLoss = masterAgent.getForwardDynamics(
                    ).bellman_error(states, actions, result_states, rewards)
                    # mean absolute value
                    dynamicsLoss = np.mean(np.fabs(dynamicsLoss))
                    dynamicsLosses.append(dynamicsLoss)
                    if (settings['train_reward_predictor']):
                        dynamicsRewardLoss = masterAgent.getForwardDynamics(
                        ).reward_error(states, actions, result_states,
                                       rewards)
                        dynamicsRewardLoss = np.mean(
                            np.fabs(dynamicsRewardLoss))
                        dynamicsRewardLosses.append(dynamicsRewardLoss)

                if (settings['train_forward_dynamics']):
                    print("Round: " + str(round_) + " Epoch: " + str(epoch) +
                          " p: " + str(p) + " With mean reward: " +
                          str(np.mean(rewards)) + " bellman error: " +
                          str(error) + " ForwardPredictionLoss: " +
                          str(dynamicsLoss))
                else:
                    print("Round: " + str(round_) + " Epoch: " + str(epoch) +
                          " p: " + str(p) + " With mean reward: " +
                          str(np.mean(rewards)) + " bellman error: " +
                          str(error))

            if (settings["print_levels"][settings["print_level"]] >=
                    settings["print_levels"]['train']):
                print("Master agent experience size: " +
                      str(masterAgent.getExperience().samples()))

            if (not settings['on_policy']):
                ## There could be stale policy parameters in here, use the
                ## last set put in the queue
                data = None
                while (not masterAgent_message_queue.empty()):
                    ## Don't block
                    try:
                        data = masterAgent_message_queue.get(False)
                    except Exception:
                        pass
                if data is not None:
                    masterAgent.setExperience(data[0])
                    masterAgent.getPolicy().setNetworkParameters(data[1])
                    masterAgent.setStateBounds(
                        masterAgent.getExperience().getStateBounds())
                    masterAgent.setActionBounds(
                        masterAgent.getExperience().getActionBounds())
                    masterAgent.setRewardBounds(
                        masterAgent.getExperience().getRewardBounds())
                    if (settings['train_forward_dynamics']):
                        masterAgent.getForwardDynamics(
                        ).setNetworkParameters(data[2])
                        if ('keep_seperate_fd_exp_buffer' in settings and
                                (settings['keep_seperate_fd_exp_buffer'])):
                            masterAgent.setFDExperience(data[3])

        ## This will let me know which part of learning is going slower
        ## training updates or simulation
        if (settings["print_levels"][settings["print_level"]] >=
                settings["print_levels"]['train']):
            # qsize() returns the number of queued items
            print("sim queue size: ", input_anchor_queue.qsize())
            if output_experience_queue is not None:
                print("exp tuple queue size: ",
                      output_experience_queue.qsize())

        if (not settings['on_policy']):
            message = _update_policy_message(p, masterAgent, masterAgent,
                                             settings)
            ## Don't block on full queue
            _put_without_blocking(sim_work_queues, message)
            if use_eval_workers:
                _put_without_blocking(eval_sim_work_queues, message)

        if (round_ % settings['plotting_update_freq_num_rounds']) == 0:
            # Running less often helps speed learning up.
            # Sync up sim actors
            if (settings['on_policy']):
                mean_reward, std_reward, mean_bellman_error, std_bellman_error, mean_discount_error, std_discount_error, mean_eval, std_eval = evalModelParrallel(
                    input_anchor_queue=eval_sim_work_queues,
                    model=masterAgent,
                    settings=settings,
                    eval_episode_data_queue=eval_episode_data_queue,
                    anchors=settings['eval_epochs'])
            else:
                mean_reward, std_reward, mean_bellman_error, std_bellman_error, mean_discount_error, std_discount_error, mean_eval, std_eval = evalModelParrallel(
                    input_anchor_queue=input_anchor_queue_eval,
                    model=masterAgent,
                    settings=settings,
                    eval_episode_data_queue=eval_episode_data_queue,
                    anchors=settings['eval_epochs'])
            print(mean_reward, std_reward, mean_bellman_error,
                  std_bellman_error, mean_discount_error, std_discount_error)

            if mean_bellman_error > 10000:
                print("Error to big: ")
            else:
                if (settings['train_forward_dynamics']):
                    mean_dynamicsLosses = np.mean(dynamicsLosses)
                    std_dynamicsLosses = np.std(dynamicsLosses)
                    dynamicsLosses = []
                if (settings['train_reward_predictor']):
                    mean_dynamicsRewardLosses = np.mean(dynamicsRewardLosses)
                    std_dynamicsRewardLosses = np.std(dynamicsRewardLosses)
                    dynamicsRewardLosses = []

                trainData["mean_reward"].append(mean_reward)
                trainData["std_reward"].append(std_reward)
                trainData["anneal_p"].append(p)
                trainData["mean_bellman_error"].append(
                    np.mean(np.fabs(bellman_errors)))
                trainData["std_bellman_error"].append(np.std(bellman_errors))
                bellman_errors = []
                trainData["mean_discount_error"].append(mean_discount_error)
                trainData["std_discount_error"].append(std_discount_error)
                trainData["mean_eval"].append(mean_eval)
                trainData["std_eval"].append(std_eval)
                if (settings['train_forward_dynamics']):
                    trainData["mean_forward_dynamics_loss"].append(
                        mean_dynamicsLosses)
                    trainData["std_forward_dynamics_loss"].append(
                        std_dynamicsLosses)
                if (settings['train_reward_predictor']):
                    trainData["mean_forward_dynamics_reward_loss"].append(
                        mean_dynamicsRewardLosses)
                    trainData["std_forward_dynamics_reward_loss"].append(
                        std_dynamicsRewardLosses)

        if (round_ % settings['saving_update_freq_num_rounds']) == 0:
            if (settings['train_forward_dynamics']):
                _dump_pickle(masterAgent.getForwardDynamics(),
                             directory + "forward_dynamics_" + ".pkl")
                # No dynamics loss exists yet if every evaluation so far was
                # rejected for a too large bellman error.
                if (mean_dynamicsLosses is not None and
                        mean_dynamicsLosses < best_dynamicsLosses):
                    best_dynamicsLosses = mean_dynamicsLosses
                    print("Saving BEST current forward dynamics agent: " +
                          str(best_dynamicsLosses))
                    _dump_pickle(
                        masterAgent.getForwardDynamics(),
                        directory + "forward_dynamics_" + "_Best.pkl")

            if (mean_eval > best_eval):
                best_eval = mean_eval
                print("Saving BEST current agent: " + str(best_eval))
                _dump_pickle(masterAgent.getPolicy(),
                             directory + getAgentName() + "_Best.pkl")

            if settings['save_trainData']:
                _write_train_data(
                    trainData, directory + "trainingData_" +
                    str(settings['agent_name']) + ".json")

            print("Saving current masterAgent")
            _dump_pickle(masterAgent.getPolicy(),
                         directory + getAgentName() + ".pkl")

        gc.collect()

    print("Terminating Workers")
    if (settings['on_policy']):
        for m_q in sim_work_queues:
            ## block on full queue
            m_q.put(None)
        if use_eval_workers:
            for m_q in eval_sim_work_queues:
                ## block on full queue
                m_q.put(None)
        for sw in sim_workers:
            sw.join()
        if use_eval_workers:
            for sw in eval_sim_workers:
                sw.join()

    ### Empty the queues
    for work_queue in sim_work_queues:
        _drain_queue("sim_work_queues size: ", work_queue)
    for work_queue in eval_sim_work_queues:
        _drain_queue("eval_sim_work_queues size: ", work_queue)

    print("Finish sim")
    exp_val.finish()

    print("Save last versions of files.")
    _dump_pickle(masterAgent.getPolicy(),
                 directory + getAgentName() + ".pkl")
    _write_train_data(
        trainData,
        directory + "trainingData_" + str(settings['agent_name']) + ".json",
        sort_keys=True, indent=4)
    if (settings['train_forward_dynamics']):
        _dump_pickle(masterAgent.getForwardDynamics(),
                     directory + "forward_dynamics_" + ".pkl")

    print("Delete any plots being used")
    gc.collect()  # release memory right away
def trainMetaModel(settingsFileName, samples=10, settings=None, numThreads=1, hyperSettings=None):
    """Run ``samples`` training simulations in a process pool and report timing.

    For every sample index ``i`` a deep copy of the settings is made with its
    own ``data_folder`` suffix (``_<i>``), a shifted ``random_seed``, rendering
    and learning visualisation switched off, and the save frequency multiplied
    by ten. Each ``(settingsFileName, settings_copy)`` pair is then handed to
    ``trainModelParallel`` through ``ProcessingPool.map``.

    Args:
        settingsFileName: Path of the JSON settings file. It is only read when
            ``settings`` is None, but is always passed on to each worker.
        samples: Number of simulations to launch.
        settings: Already-loaded settings dict. NOTE: when supplied, this dict
            is modified in place by the per-sample overrides below.
        numThreads: Size of the process pool.
        hyperSettings: Optional mapping. ``saved_model_path`` /
            ``saved_model_folder`` cause a model file to be copied into each
            sample's data directory; a truthy ``testing`` entry skips the
            actual training.

    Returns:
        dict with the keys ``settings_files`` (one settings copy per sample),
        ``sim_time``, ``raw_sim_time_in_seconds``,
        ``Number_of_simulations_sampled`` and ``Number_of_threads_used``.
    """
    import os
    import shutil

    result_data = {}
    result_data['settings_files'] = []

    if settings is None:
        # The context manager closes the handle even if json.load raises.
        with open(settingsFileName) as settings_file:
            settings = json.load(settings_file)

    print("Running ", samples, " simulation(s) over ", numThreads, " Thread(s)")

    # Pristine copy: the per-sample overrides are derived from these values.
    settings_original = copy.deepcopy(settings)

    directory_ = getBaseDataDirectory(settings_original)
    if not os.path.exists(directory_):
        os.makedirs(directory_)
    out_file_name = directory_ + "settings.json"
    print("Saving settings file with data to: ", out_file_name)
    with open(out_file_name, 'w') as out_file:
        out_file.write(json.dumps(settings_original, indent=4))

    sim_data = []
    for i in range(samples):
        settings['data_folder'] = settings_original['data_folder'] + "_" + str(i)
        # NOTE(review): the seed is derived from the value written on the
        # previous iteration (not from settings_original), so the offsets
        # accumulate across samples. Seeds remain distinct; behaviour is kept.
        settings['random_seed'] = int(settings['random_seed']) + (
            (int(settings['num_available_threads']) + 1) * i)
        ## Change some other settings to reduce memory usage and train faster
        settings['print_level'] = "hyper_train"
        settings['shouldRender'] = False
        settings['visualize_learning'] = False
        settings['saving_update_freq_num_rounds'] = (
            settings_original['saving_update_freq_num_rounds'] * 10)
        if 'expert_policy_files' in settings:
            for j in range(len(settings['expert_policy_files'])):
                settings['expert_policy_files'][j] = (
                    settings_original['expert_policy_files'][j] + "/_" + str(i))

        # Independent snapshots: `settings` keeps being mutated by later
        # iterations, so neither the result nor the worker may share it.
        result_data['settings_files'].append(copy.deepcopy(settings))
        sim_data.append((settingsFileName, copy.deepcopy(settings)))

        ## Create data directory and copy any desired files to these folders.
        if hyperSettings is not None:
            directory = getDataDirectory(settings)
            if not os.path.exists(directory):
                os.makedirs(directory)
            if 'saved_model_path' in hyperSettings:
                print("Copying fd model: ", hyperSettings['saved_model_path'])
                shutil.copy2(hyperSettings['saved_model_path'],
                             directory + getAgentName() + "_Best.pkl")
            if 'saved_model_folder' in hyperSettings:
                ### Copy models from other metamodel simulation
                ### Purposefully not copying the "Best" model but the last instead
                shutil.copy2(
                    hyperSettings['saved_model_folder'] + "/_" + str(i) + '/' +
                    settings['model_type'] + '/' + getAgentName() + ".pkl",
                    directory + getAgentName() + "_Best.pkl")

    p = ProcessingPool(numThreads)
    t0 = time.time()
    # Test `hyperSettings` directly: the former loop-local alias was unbound
    # when `samples` was 0, which raised a NameError here.
    if ((hyperSettings is not None) and
            ('testing' in hyperSettings and (hyperSettings['testing']))):
        print("Not simulating, this is a testing run:")
    else:
        p.map(trainModelParallel, sim_data)
    t1 = time.time()

    elapsed_message = ("Meta model training complete in " +
                       str(datetime.timedelta(seconds=(t1 - t0))) + " seconds")
    print(elapsed_message)
    result_data['sim_time'] = elapsed_message
    result_data['raw_sim_time_in_seconds'] = t1 - t0
    result_data['Number_of_simulations_sampled'] = samples
    result_data['Number_of_threads_used'] = numThreads
    return result_data
def evaluateModelRender(settings_file_name, runLastModel=False):
    """Load a saved policy and run it in a rendered simulation via the GLUT loop.

    The settings are read from ``settings_file_name``; the pickled policy
    (and, when ``train_forward_dynamics`` is set, the pickled forward dynamics
    model) is loaded from the data directory, attached to a ``LearningAgent``,
    and a ``SimContainer`` is registered as the GLUT keyboard/timer callback
    before ``glutMainLoop()`` is entered.

    Args:
        settings_file_name: Path handed to ``getSettings``.
        runLastModel: When equal to True the most recently saved
            ``<agent>.pkl`` is loaded; otherwise ``<agent>_Best.pkl``.
    """
    settings = getSettings(settings_file_name)

    import os
    ## Theano needs to be imported after the flags are set, so the
    ## project imports below must stay after this assignment.
    os.environ['THEANO_FLAGS'] = ("mode=FAST_RUN,device=" +
                                  settings['training_processor_type'] +
                                  ",floatX=" + settings['float_type'])

    from util.SimulationUtil import validateSettings, createEnvironment, createRLAgent, createActor, getAgentName
    from util.SimulationUtil import getDataDirectory, createForwardDynamicsModel
    from util.ExperienceMemory import ExperienceMemory
    from model.LearningAgent import LearningAgent, LearningWorker
    from RLVisualize import RLVisualize
    from NNVisualize import NNVisualize

    directory = getDataDirectory(settings)
    state_bounds = np.array(settings['state_bounds'])
    reward_bounds = np.array(settings["reward_bounds"])
    action_space_continuous = settings['action_space_continuous']
    if action_space_continuous:
        action_bounds = np.array(settings["action_bounds"], dtype=float)

    print("Sim config file name: " + str(settings["sim_config_file"]))

    ### Using a wrapper for the type of actor now
    if action_space_continuous:
        experience = ExperienceMemory(len(state_bounds[0]),
                                      len(action_bounds[0]),
                                      settings['expereince_length'],
                                      continuous_actions=True,
                                      settings=settings)
    else:
        experience = ExperienceMemory(len(state_bounds[0]), 1,
                                      settings['expereince_length'])

    actor = createActor(str(settings['environment_type']), settings, experience)
    # NOTE(review): `action_bounds` is only bound when
    # `action_space_continuous` is truthy, so a discrete-action configuration
    # raises NameError on the next statement. Left unchanged because the
    # intended `n_out` for discrete actions cannot be determined from here.
    masterAgent = LearningAgent(n_in=len(state_bounds[0]),
                                n_out=len(action_bounds[0]),
                                state_bounds=state_bounds,
                                action_bounds=action_bounds,
                                reward_bound=reward_bounds,
                                settings_=settings)

    if (runLastModel == True):
        file_name = directory + getAgentName() + ".pkl"
    else:
        file_name = directory + getAgentName() + "_Best.pkl"
    # Context managers close the pickle files even when loading fails.
    with open(file_name, 'rb') as f:
        model = dill.load(f)

    if (settings['train_forward_dynamics']):
        file_name_dynamics = directory + "forward_dynamics_" + "_Best.pkl"
        with open(file_name_dynamics, 'rb') as f:
            forwardDynamicsModel = dill.load(f)

    if (settings["use_transfer_task_network"]):
        # NOTE(review): `task_directory` is computed but the task model is
        # read from `directory`, i.e. the same file as `model` when
        # runLastModel is False. Confirm whether `task_directory` was meant.
        task_directory = getTaskDataDirectory(settings)
        file_name = directory + getAgentName() + "_Best.pkl"
        with open(file_name, 'rb') as f:
            taskModel = dill.load(f)
        # copy the task part from taskModel to model
        print("Transferring task portion of model.")
        model.setTaskNetworkParameters(taskModel)

    # Pick which simulation environment to create.
    sim_index = 0
    if ('override_sim_env_id' in settings and
            (settings['override_sim_env_id'] != False)):
        sim_index = settings['override_sim_env_id']
    exp = createEnvironment(settings["sim_config_file"],
                            settings['environment_type'],
                            settings,
                            render=True,
                            index=sim_index)

    if (settings['train_forward_dynamics']):
        forwardDynamicsModel.setActor(actor)
        masterAgent.setForwardDynamics(forwardDynamicsModel)

    exp.setActor(actor)
    exp.getActor().init()
    exp.init()
    exp.generateValidationEnvironmentSample(0)

    expected_value_viz = None
    if (settings['visualize_expected_value']):
        expected_value_viz = NNVisualize(title=str("Expected Value") +
                                         " with " + str(settings["model_type"]),
                                         settings=settings)
        expected_value_viz.setInteractive()
        expected_value_viz.init()

    masterAgent.setSettings(settings)
    masterAgent.setExperience(experience)
    masterAgent.setPolicy(model)

    exp.getActor().initEpoch()
    exp.initEpoch()

    fps = 30
    # Apply one action before the event loop starts driving the simulation.
    state_ = exp.getState()
    action_ = np.array(masterAgent.predict(state_, evaluation_=True),
                       dtype='float64')
    exp.updateAction(action_)

    sim = SimContainer(exp, masterAgent, settings, expected_value_viz)
    sim._grad_sum = np.zeros_like(state_)

    # set GLUT callbacks
    ## This works because GLUT in C++ uses the same global context (singleton) as the one in python
    glutKeyboardFunc(sim.onKey)
    glutTimerFunc(int(1000.0 / fps), sim.animate, 0)  # 30 fps
    # enter the GLUT event loop
    glutMainLoop()
def __init__(self, model, n_in, n_out, state_bounds, action_bounds,
             reward_bound, settings_):
    """Load the expert policies and build the critic/actor training graph.

    After delegating to the parent constructor this method:
      * unpickles one expert policy per entry of
        ``settings['expert_policy_files']`` (consecutive identical paths
        share a single loaded object),
      * creates the actor replay buffers and the symbolic/shared helper
        variables,
      * deep-copies ``model`` as the target network,
      * defines the critic loss, the actor loss, their regularisation terms
        and the optimiser update rules selected by ``settings['optimizer']``,
      * finally calls ``Distillation.compile(self)``.

    All parameters are forwarded unchanged to the parent constructor.
    ``model`` is additionally deep-copied here to create the target network.
    """
    super(Distillation, self).__init__(model, n_in, n_out, state_bounds,
                                       action_bounds, reward_bound, settings_)

    settings = self.getSettings()

    ### Load expert policy files
    self._expert_policies = []
    previous_file_name = ""
    expert_model = None
    for expert_path in settings['expert_policy_files']:
        file_name = (expert_path + '/' + settings['model_type'] + '/' +
                     getAgentName() + '.pkl')
        if file_name != previous_file_name:
            print("Loading pre compiled network: ", file_name)
            # Context manager closes the file even if unpickling fails.
            with open(file_name, 'rb') as f:
                expert_model = dill.load(f)
            previous_file_name = file_name
        ## To help save memory when experts are the same, consecutive
        ## identical paths reuse the object that was just loaded.
        self._expert_policies.append(expert_model)

    self._actor_buffer_states = []
    self._actor_buffer_result_states = []
    self._actor_buffer_actions = []
    self._actor_buffer_rewards = []
    self._actor_buffer_falls = []
    self._actor_buffer_diff = []

    self._NotFallen = T.bcol("Not_Fallen")
    ## because float64 <= float32 * int32, need to use int16 or int8
    self._NotFallen.tag.test_value = np.zeros((self._batch_size, 1),
                                              dtype=np.dtype('int8'))
    self._NotFallen_shared = theano.shared(np.zeros((self._batch_size, 1),
                                                    dtype='int8'),
                                           broadcastable=(False, True))

    # Per-sample weight column that multiplies the actor loss below.
    self._tmp_diff = T.col("Tmp_Diff")
    self._tmp_diff.tag.test_value = np.zeros(
        (self._batch_size, 1), dtype=np.dtype(settings['float_type']))
    self._tmp_diff_shared = theano.shared(np.zeros(
        (self._batch_size, 1), dtype=settings['float_type']),
                                          broadcastable=(False, True))

    self._critic_regularization_weight = settings[
        "critic_regularization_weight"]
    self._critic_learning_rate = settings["critic_learning_rate"]

    ## Target network
    self._modelTarget = copy.deepcopy(model)

    # Critic outputs; `deterministic=False` is the variant named `_drop`.
    self._q_valsA = lasagne.layers.get_output(
        self._model.getCriticNetwork(),
        self._model.getStateSymbolicVariable(),
        deterministic=True)
    self._q_valsA_drop = lasagne.layers.get_output(
        self._model.getCriticNetwork(),
        self._model.getStateSymbolicVariable(),
        deterministic=False)
    self._q_valsNextState = lasagne.layers.get_output(
        self._model.getCriticNetwork(),
        self._model.getResultStateSymbolicVariable(),
        deterministic=True)
    self._q_valsTargetNextState = lasagne.layers.get_output(
        self._modelTarget.getCriticNetwork(),
        self._model.getResultStateSymbolicVariable(),
        deterministic=True)
    self._q_valsTarget = lasagne.layers.get_output(
        self._modelTarget.getCriticNetwork(),
        self._model.getStateSymbolicVariable(),
        deterministic=True)
    self._q_valsTarget_drop = lasagne.layers.get_output(
        self._modelTarget.getCriticNetwork(),
        self._model.getStateSymbolicVariable(),
        deterministic=False)

    # Actor outputs for the current state.
    self._q_valsActA = lasagne.layers.get_output(
        self._model.getActorNetwork(),
        self._model.getStateSymbolicVariable(),
        deterministic=True)
    self._q_valsActTarget = lasagne.layers.get_output(
        self._modelTarget.getActorNetwork(),
        self._model.getStateSymbolicVariable(),
        deterministic=True)
    self._q_valsActA_drop = lasagne.layers.get_output(
        self._model.getActorNetwork(),
        self._model.getStateSymbolicVariable(),
        deterministic=False)

    self._q_func = self._q_valsA
    self._q_funcTarget = self._q_valsTarget
    self._q_func_drop = self._q_valsA_drop
    self._q_funcTarget_drop = self._q_valsTarget_drop
    self._q_funcAct = self._q_valsActA
    self._q_funcAct_drop = self._q_valsActA_drop

    # Critic target: reward + discount * target-network value of next state.
    # `_NotFallen` is not part of this expression.
    self._target = self._model.getRewardSymbolicVariable() + (
        self._discount_factor * self._q_valsTargetNextState)
    self._diff = self._target - self._q_func
    self._diff_drop = self._target - self._q_func_drop
    loss = T.pow(self._diff, 2)
    self._loss = T.mean(loss)
    self._loss_drop = T.mean(0.5 * self._diff_drop**2)

    self._params = lasagne.layers.helper.get_all_params(
        self._model.getCriticNetwork())
    self._actionParams = lasagne.layers.helper.get_all_params(
        self._model.getActorNetwork())

    self._givens_ = {
        self._model.getStateSymbolicVariable():
        self._model.getStates(),
        self._model.getResultStateSymbolicVariable():
        self._model.getResultStates(),
        self._model.getRewardSymbolicVariable():
        self._model.getRewards(),
    }
    self._actGivens = {
        self._model.getStateSymbolicVariable(): self._model.getStates(),
        self._model.getActionSymbolicVariable(): self._model.getActions(),
        self._tmp_diff: self._tmp_diff_shared,
    }

    self._critic_regularization = (
        self._critic_regularization_weight *
        lasagne.regularization.regularize_network_params(
            self._model.getCriticNetwork(), lasagne.regularization.l2))
    self._actor_regularization = (
        (self._regularization_weight *
         lasagne.regularization.regularize_network_params(
             self._model.getActorNetwork(), lasagne.regularization.l2)))
    if (settings['use_previous_value_regularization']):
        # Adds a penalty on the change relative to the target actor.
        self._actor_regularization = self._actor_regularization + (
            (settings['previous_value_regularization_weight']) *
            change_penalty(self._model.getActorNetwork(),
                           self._modelTarget.getActorNetwork()))
    elif ('regularization_type' in settings
          and (settings['regularization_type'] == 'KL_Divergence')):
        self._kl_firstfixed = T.mean(
            kl(
                self._q_valsActTarget,
                T.ones_like(self._q_valsActTarget) *
                settings['exploration_rate'], self._q_valsActA,
                T.ones_like(self._q_valsActA) * settings['exploration_rate'],
                self._action_length))
        # This REPLACES the L2 term; the KL value is scaled by
        # `kl_divergence_threshold`.
        self._actor_regularization = (self._kl_firstfixed) * (
            settings['kl_divergence_threshold'])
        print("Using regularization type : ", settings['regularization_type'])

    # SGD update for the critic
    self._value_grad = T.grad(self._loss + self._critic_regularization,
                              self._params)
    optimizer = settings['optimizer']
    if (optimizer == 'rmsprop'):
        print("Optimizing Value Function with ", optimizer, " method")
        # NOTE(review): this branch uses `_learning_rate`, the other critic
        # branches use `_critic_learning_rate` - confirm this is intended.
        self._updates_ = lasagne.updates.rmsprop(self._value_grad,
                                                 self._params,
                                                 self._learning_rate,
                                                 self._rho, self._rms_epsilon)
    elif (optimizer == 'momentum'):
        print("Optimizing Value Function with ", optimizer, " method")
        self._updates_ = lasagne.updates.momentum(
            self._value_grad,
            self._params,
            self._critic_learning_rate,
            momentum=self._rho)
    elif (optimizer == 'adam'):
        print("Optimizing Value Function with ", optimizer, " method")
        # NOTE(review): beta2 is 0.9 here but 0.999 for the actor below.
        self._updates_ = lasagne.updates.adam(self._value_grad,
                                              self._params,
                                              self._critic_learning_rate,
                                              beta1=0.9,
                                              beta2=0.9,
                                              epsilon=self._rms_epsilon)
    elif (optimizer == 'adagrad'):
        print("Optimizing Value Function with ", optimizer, " method")
        self._updates_ = lasagne.updates.adagrad(
            self._value_grad,
            self._params,
            self._critic_learning_rate,
            epsilon=self._rms_epsilon)
    else:
        print("Unknown optimization method: ", optimizer)
        sys.exit(-1)

    ## Actor loss: per-sample mean squared difference between the supplied
    ## action and the non-deterministic actor output, multiplied
    ## element-wise by `_tmp_diff`.
    self._actDiff = (self._model.getActionSymbolicVariable() -
                     self._q_valsActA_drop)
    self._actLoss_ = theano.tensor.elemwise.Elemwise(theano.scalar.mul)(
        (T.mean(T.pow(self._actDiff, 2), axis=1)), (self._tmp_diff))
    self._actLoss = T.mean(self._actLoss_)
    self._policy_grad = T.grad(self._actLoss + self._actor_regularization,
                               self._actionParams)

    if (optimizer == 'rmsprop'):
        self._actionUpdates = lasagne.updates.rmsprop(
            self._policy_grad, self._actionParams, self._learning_rate,
            self._rho, self._rms_epsilon)
    elif (optimizer == 'momentum'):
        self._actionUpdates = lasagne.updates.momentum(self._policy_grad,
                                                       self._actionParams,
                                                       self._learning_rate,
                                                       momentum=self._rho)
    elif (optimizer == 'adam'):
        self._actionUpdates = lasagne.updates.adam(
            self._policy_grad,
            self._actionParams,
            self._learning_rate,
            beta1=0.9,
            beta2=0.999,
            epsilon=self._rms_epsilon)
    elif (optimizer == 'adagrad'):
        self._actionUpdates = lasagne.updates.adagrad(
            self._policy_grad,
            self._actionParams,
            self._learning_rate,
            epsilon=self._rms_epsilon)
    else:
        # Not reached in practice: the critic branch above already exits
        # for an unknown optimizer.
        print("Unknown optimization method: ", optimizer)

    self._givens_grad = {
        self._model.getStateSymbolicVariable(): self._model.getStates(),
    }

    ## Bellman error
    self._bellman = self._target - self._q_funcTarget

    ### Give v(s') the next state and v(s) (target) the current state
    self._diff_adv = (self._discount_factor *
                      self._q_func) - (self._q_valsTargetNextState)
    # The state and result-state inputs are deliberately swapped here.
    self._diff_adv_givens = {
        self._model.getStateSymbolicVariable():
        self._model.getResultStates(),
        self._model.getResultStateSymbolicVariable():
        self._model.getStates(),
    }

    Distillation.compile(self)
def __init__(self, model, n_in, n_out, state_bounds, action_bounds,
             reward_bound, settings_):
    """Set up expert policies, symbolic variables, losses and update rules.

    Steps performed, in order:
      1. Call the parent constructor with every argument unchanged.
      2. Unpickle the expert policies listed in
         ``settings['expert_policy_files']``; a path equal to the previous
         one reuses the already-loaded object.
      3. Create the actor buffers and the ``_NotFallen`` / ``_tmp_diff``
         symbolic and shared variables.
      4. Deep-copy ``model`` into ``_modelTarget`` and build the critic and
         actor output expressions for both networks.
      5. Define the critic loss, actor loss, regularisation terms and the
         optimiser updates chosen by ``settings['optimizer']``
         (the process exits with -1 for an unknown critic optimiser).
      6. Call ``Distillation.compile(self)``.
    """
    super(Distillation, self).__init__(model, n_in, n_out, state_bounds,
                                       action_bounds, reward_bound, settings_)

    settings = self.getSettings()

    ### Load expert policy files
    self._expert_policies = []
    last_loaded_path = ""
    expert_model = None
    for expert_dir in settings['expert_policy_files']:
        file_name = (expert_dir + '/' + settings['model_type'] + '/' +
                     getAgentName() + '.pkl')
        if file_name != last_loaded_path:
            print("Loading pre compiled network: ", file_name)
            # `with` guarantees the handle is released if dill.load raises.
            with open(file_name, 'rb') as f:
                expert_model = dill.load(f)
            last_loaded_path = file_name
        ## To help save memory when experts are the same: a repeated path
        ## appends the same object again instead of loading a second copy.
        self._expert_policies.append(expert_model)

    self._actor_buffer_states = []
    self._actor_buffer_result_states = []
    self._actor_buffer_actions = []
    self._actor_buffer_rewards = []
    self._actor_buffer_falls = []
    self._actor_buffer_diff = []

    self._NotFallen = T.bcol("Not_Fallen")
    ## because float64 <= float32 * int32, need to use int16 or int8
    self._NotFallen.tag.test_value = np.zeros((self._batch_size, 1),
                                              dtype=np.dtype('int8'))
    self._NotFallen_shared = theano.shared(np.zeros((self._batch_size, 1),
                                                    dtype='int8'),
                                           broadcastable=(False, True))

    self._tmp_diff = T.col("Tmp_Diff")
    self._tmp_diff.tag.test_value = np.zeros(
        (self._batch_size, 1), dtype=np.dtype(settings['float_type']))
    # Shared variable initialised to zeros.
    self._tmp_diff_shared = theano.shared(
        np.zeros((self._batch_size, 1), dtype=settings['float_type']),
        broadcastable=(False, True))

    self._critic_regularization_weight = settings[
        "critic_regularization_weight"]
    self._critic_learning_rate = settings["critic_learning_rate"]

    ## Target network: an independent deep copy of the incoming model.
    self._modelTarget = copy.deepcopy(model)

    # Deterministic critic output of the main model for the current state.
    self._q_valsA = lasagne.layers.get_output(
        self._model.getCriticNetwork(),
        self._model.getStateSymbolicVariable(),
        deterministic=True)
    # Non-deterministic critic output for the current state.
    self._q_valsA_drop = lasagne.layers.get_output(
        self._model.getCriticNetwork(),
        self._model.getStateSymbolicVariable(),
        deterministic=False)
    # Main-model critic output for the result (next) state.
    self._q_valsNextState = lasagne.layers.get_output(
        self._model.getCriticNetwork(),
        self._model.getResultStateSymbolicVariable(),
        deterministic=True)
    # Target-model critic output for the result (next) state.
    self._q_valsTargetNextState = lasagne.layers.get_output(
        self._modelTarget.getCriticNetwork(),
        self._model.getResultStateSymbolicVariable(),
        deterministic=True)
    # Target-model critic output for the current state.
    self._q_valsTarget = lasagne.layers.get_output(
        self._modelTarget.getCriticNetwork(),
        self._model.getStateSymbolicVariable(),
        deterministic=True)
    # Non-deterministic target-model critic output for the current state.
    self._q_valsTarget_drop = lasagne.layers.get_output(
        self._modelTarget.getCriticNetwork(),
        self._model.getStateSymbolicVariable(),
        deterministic=False)

    self._q_valsActA = lasagne.layers.get_output(
        self._model.getActorNetwork(),
        self._model.getStateSymbolicVariable(),
        deterministic=True)
    # Deterministic target-model actor output.
    self._q_valsActTarget = lasagne.layers.get_output(
        self._modelTarget.getActorNetwork(),
        self._model.getStateSymbolicVariable(),
        deterministic=True)
    # Non-deterministic actor output of the main model.
    self._q_valsActA_drop = lasagne.layers.get_output(
        self._model.getActorNetwork(),
        self._model.getStateSymbolicVariable(),
        deterministic=False)

    self._q_func = self._q_valsA
    self._q_funcTarget = self._q_valsTarget
    self._q_func_drop = self._q_valsA_drop
    self._q_funcTarget_drop = self._q_valsTarget_drop
    self._q_funcAct = self._q_valsActA
    self._q_funcAct_drop = self._q_valsActA_drop

    # Target value: reward variable + discount * target critic at next state.
    # NOTE(review): the reward variable is presumably fed from the model's
    # shared rewards buffer (see `_givens_` below) - confirm in the model.
    self._target = self._model.getRewardSymbolicVariable() + (
        self._discount_factor * self._q_valsTargetNextState)
    # Target value minus the main critic's output for the current state.
    self._diff = self._target - self._q_func
    self._diff_drop = self._target - self._q_func_drop
    loss = T.pow(self._diff, 2)
    # Critic loss: mean of the squared difference above.
    self._loss = T.mean(loss)
    self._loss_drop = T.mean(0.5 * self._diff_drop**2)

    self._params = lasagne.layers.helper.get_all_params(
        self._model.getCriticNetwork())
    self._actionParams = lasagne.layers.helper.get_all_params(
        self._model.getActorNetwork())

    self._givens_ = {
        self._model.getStateSymbolicVariable():
        self._model.getStates(),
        self._model.getResultStateSymbolicVariable():
        self._model.getResultStates(),
        self._model.getRewardSymbolicVariable():
        self._model.getRewards()
    }
    self._actGivens = {
        self._model.getStateSymbolicVariable(): self._model.getStates(),
        self._model.getActionSymbolicVariable(): self._model.getActions(),
        self._tmp_diff: self._tmp_diff_shared
    }

    self._critic_regularization = (
        self._critic_regularization_weight *
        lasagne.regularization.regularize_network_params(
            self._model.getCriticNetwork(), lasagne.regularization.l2))
    self._actor_regularization = (
        (self._regularization_weight *
         lasagne.regularization.regularize_network_params(
             self._model.getActorNetwork(), lasagne.regularization.l2)))
    if (settings['use_previous_value_regularization']):
        self._actor_regularization = self._actor_regularization + (
            (settings['previous_value_regularization_weight']) *
            change_penalty(self._model.getActorNetwork(),
                           self._modelTarget.getActorNetwork()))
    elif ('regularization_type' in settings
          and (settings['regularization_type'] == 'KL_Divergence')):
        self._kl_firstfixed = T.mean(
            kl(
                self._q_valsActTarget,
                T.ones_like(self._q_valsActTarget) *
                settings['exploration_rate'], self._q_valsActA,
                T.ones_like(self._q_valsActA) * settings['exploration_rate'],
                self._action_length))
        # Overrides the L2 actor regularisation defined just above.
        self._actor_regularization = (self._kl_firstfixed) * (
            settings['kl_divergence_threshold'])
        print("Using regularization type : ", settings['regularization_type'])

    # SGD update
    self._value_grad = T.grad(self._loss + self._critic_regularization,
                              self._params)
    optimizer = settings['optimizer']
    if (optimizer == 'rmsprop'):
        print("Optimizing Value Function with ", optimizer, " method")
        # NOTE(review): uses `_learning_rate` while every other critic branch
        # uses `_critic_learning_rate` - verify which one is wanted.
        self._updates_ = lasagne.updates.rmsprop(self._value_grad,
                                                 self._params,
                                                 self._learning_rate,
                                                 self._rho, self._rms_epsilon)
    elif (optimizer == 'momentum'):
        print("Optimizing Value Function with ", optimizer, " method")
        self._updates_ = lasagne.updates.momentum(
            self._value_grad,
            self._params,
            self._critic_learning_rate,
            momentum=self._rho)
    elif (optimizer == 'adam'):
        print("Optimizing Value Function with ", optimizer, " method")
        self._updates_ = lasagne.updates.adam(self._value_grad,
                                              self._params,
                                              self._critic_learning_rate,
                                              beta1=0.9,
                                              beta2=0.9,
                                              epsilon=self._rms_epsilon)
    elif (optimizer == 'adagrad'):
        print("Optimizing Value Function with ", optimizer, " method")
        self._updates_ = lasagne.updates.adagrad(
            self._value_grad,
            self._params,
            self._critic_learning_rate,
            epsilon=self._rms_epsilon)
    else:
        print("Unknown optimization method: ", optimizer)
        sys.exit(-1)

    ## Need to perform an element wise operation or replicate _diff for this to work properly.
    # Supplied action minus the main actor's non-deterministic output.
    self._actDiff = (self._model.getActionSymbolicVariable() -
                     self._q_valsActA_drop)
    self._actLoss_ = theano.tensor.elemwise.Elemwise(theano.scalar.mul)(
        (T.mean(T.pow(self._actDiff, 2), axis=1)), (self._tmp_diff))
    self._actLoss = T.mean(self._actLoss_)
    self._policy_grad = T.grad(self._actLoss + self._actor_regularization,
                               self._actionParams)

    if (optimizer == 'rmsprop'):
        self._actionUpdates = lasagne.updates.rmsprop(
            self._policy_grad, self._actionParams, self._learning_rate,
            self._rho, self._rms_epsilon)
    elif (optimizer == 'momentum'):
        self._actionUpdates = lasagne.updates.momentum(self._policy_grad,
                                                       self._actionParams,
                                                       self._learning_rate,
                                                       momentum=self._rho)
    elif (optimizer == 'adam'):
        self._actionUpdates = lasagne.updates.adam(
            self._policy_grad,
            self._actionParams,
            self._learning_rate,
            beta1=0.9,
            beta2=0.999,
            epsilon=self._rms_epsilon)
    elif (optimizer == 'adagrad'):
        self._actionUpdates = lasagne.updates.adagrad(
            self._policy_grad,
            self._actionParams,
            self._learning_rate,
            epsilon=self._rms_epsilon)
    else:
        # Only prints; the critic branch above has already exited for an
        # unknown optimizer, so this is effectively unreachable.
        print("Unknown optimization method: ", optimizer)

    self._givens_grad = {
        self._model.getStateSymbolicVariable(): self._model.getStates()
    }

    ## Bellman error
    self._bellman = self._target - self._q_funcTarget

    ### Give v(s') the next state and v(s) (target) the current state
    # discount * main critic output - target critic output at the result
    # state; the givens below swap the state and result-state inputs.
    self._diff_adv = (self._discount_factor * self._q_func) - (
        self._q_valsTargetNextState)
    self._diff_adv_givens = {
        self._model.getStateSymbolicVariable():
        self._model.getResultStates(),
        self._model.getResultStateSymbolicVariable():
        self._model.getStates(),
    }

    Distillation.compile(self)