def learn_from_batch(self, batch):
    if not self.main_network.online_network.output_heads[0].DND.has_enough_entries(self.tp.agent.number_of_knn):
        return 0
    else:
        if not self.training_started:
            self.training_started = True
            screen.log_title("Finished collecting initial entries in DND. Starting to train network...")

        current_states, next_states, actions, rewards, game_overs, total_return = self.extract_batch(batch)

        TD_targets = self.main_network.online_network.predict(current_states)

        # only update the action that we have actually done in this transition
        for i in range(self.tp.batch_size):
            TD_targets[i, actions[i]] = total_return[i]

        # train the neural network
        result = self.main_network.train_and_sync_networks(current_states, TD_targets)
        total_loss = result[0]

        return total_loss
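To see why only the taken action's entry is overwritten: the network's own predictions serve as the targets for every other action, so those entries contribute zero error and zero gradient. A minimal, self-contained NumPy sketch of the same masking idea (the names and values here are illustrative, not part of Coach):

import numpy as np

# hypothetical toy batch: 3 transitions, 4 discrete actions
batch_size, num_actions = 3, 4
q_predictions = np.random.rand(batch_size, num_actions)  # stand-in for online_network.predict()
actions = np.array([2, 0, 3])                            # actions actually taken
total_return = np.array([1.5, -0.2, 0.7])                # returns extracted from the batch

# copy the predictions, then overwrite only the taken actions; every other
# entry matches the prediction exactly, so only the taken actions get gradient
td_targets = q_predictions.copy()
td_targets[np.arange(batch_size), actions] = total_return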
def evaluate(self, num_episodes, keep_networks_synced=False):
    """
    Run in an evaluation mode for several episodes. Actions will be chosen greedily.
    :param keep_networks_synced: keep the online network in sync with the global network after every episode
    :param num_episodes: The number of episodes to evaluate on
    :return: None
    """
    max_reward_achieved = -float('inf')
    average_evaluation_reward = 0
    screen.log_title("Running evaluation")
    self.env.change_phase(RunPhase.TEST)
    for i in range(num_episodes):
        # keep the online network in sync with the global network
        if keep_networks_synced:
            for network in self.networks:
                network.sync()

        episode_ended = False
        while not episode_ended:
            episode_ended = self.act(phase=RunPhase.TEST)

        if self.tp.visualization.dump_gifs and self.total_reward_in_current_episode > max_reward_achieved:
            max_reward_achieved = self.total_reward_in_current_episode
            frame_skipping = int(5 / self.tp.env.frame_skip)
            logger.create_gif(self.last_episode_images[::frame_skipping],
                              name='score-{}'.format(max_reward_achieved), fps=10)

        average_evaluation_reward += self.total_reward_in_current_episode
        self.reset_game()

    average_evaluation_reward /= float(num_episodes)

    self.env.change_phase(RunPhase.TRAIN)
    screen.log_title("Evaluation done. Average reward = {}.".format(average_evaluation_reward))
def improve(self):
    """
    Training algorithm wrapper. Heatup >> [ Evaluate >> Play >> Train >> Save checkpoint ]
    :return: None
    """

    # synchronize the online network weights with the global network
    for network in self.networks:
        network.sync()

    # heatup phase
    if self.tp.num_heatup_steps != 0:
        self.in_heatup = True
        screen.log_title("Starting heatup {}".format(self.task_id))
        num_steps_required_for_one_training_batch = self.tp.batch_size * self.tp.env.observation_stack_size
        for step in range(max(self.tp.num_heatup_steps, num_steps_required_for_one_training_batch)):
            self.act()

    # training phase
    self.in_heatup = False
    screen.log_title("Starting training {}".format(self.task_id))
    self.exploration_policy.change_phase(RunPhase.TRAIN)
    training_start_time = time.time()
    model_snapshots_periods_passed = -1

    while self.training_iteration < self.tp.num_training_iterations:
        # evaluate
        evaluate_agent = (self.last_episode_evaluation_ran != self.current_episode) and \
                         (self.current_episode % self.tp.evaluate_every_x_episodes == 0)
        if evaluate_agent:
            self.last_episode_evaluation_ran = self.current_episode
            self.evaluate(self.tp.evaluation_episodes)

        # snapshot model
        if self.tp.save_model_sec and self.tp.save_model_sec > 0 and not self.tp.distributed:
            total_training_time = time.time() - training_start_time
            current_snapshot_period = int(total_training_time) // self.tp.save_model_sec
            if current_snapshot_period > model_snapshots_periods_passed:
                model_snapshots_periods_passed = current_snapshot_period
                self.main_network.save_model(model_snapshots_periods_passed)

        # play and record in replay buffer
        if self.tp.agent.step_until_collecting_full_episodes:
            step = 0
            while step < self.tp.agent.num_consecutive_playing_steps or self.memory.get_episode(-1).length() != 0:
                self.act()
                step += 1
        else:
            for step in range(self.tp.agent.num_consecutive_playing_steps):
                self.act()

        # train
        if self.tp.train:
            for step in range(self.tp.agent.num_consecutive_training_steps):
                loss = self.train()
                self.loss.add_sample(loss)
                self.training_iteration += 1
            self.post_training_commands()
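For orientation, a minimal driving sketch under stated assumptions: env and tuning_parameters are built elsewhere (e.g. from a preset), and Agent is the class whose __init__ appears later in this section.

# hypothetical driver; env and tuning_parameters come from a preset
agent = Agent(env, tuning_parameters)
agent.improve()  # Heatup >> [ Evaluate >> Play >> Train >> Save checkpoint ]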
def set_framework(framework_type):
    # choosing neural network framework
    framework = Frameworks().get(framework_type)
    sess = None
    if framework == Frameworks.TensorFlow:
        import tensorflow as tf
        config = tf.ConfigProto()
        config.allow_soft_placement = True
        config.gpu_options.allow_growth = True
        config.gpu_options.per_process_gpu_memory_fraction = 0.2
        sess = tf.Session(config=config)
    elif framework == Frameworks.Neon:
        import ngraph as ng
        sess = ng.transformers.make_transformer()
    screen.log_title("Using {} framework".format(Frameworks().to_string(framework)))
    return sess
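A brief usage sketch; the 'tensorflow' key is an assumption inferred from the Frameworks.TensorFlow branch above, and storing the result on the tuning parameters mirrors how Agent.__init__ below reads tuning_parameters.sess.

# assumed usage; 'tensorflow' is taken to map to Frameworks.TensorFlow
sess = set_framework('tensorflow')
tuning_parameters.sess = sess  # handed to the agents via the tuning parameters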
def learn_from_batch(self, batch):
    if not self.main_network.online_network.output_heads[0].DND.has_enough_entries(self.tp.agent.number_of_knn):
        return 0
    else:
        if not self.training_started:
            self.training_started = True
            screen.log_title("Finished collecting initial entries in DND. Starting to train network...")

        current_states, next_states, actions, rewards, game_overs, total_return = self.extract_batch(batch)

        result = self.main_network.train_and_sync_networks(current_states, total_return)
        total_loss = result[0]

        return total_loss
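Note the contrast with the first learn_from_batch variant above: instead of constructing per-action TD targets from the network's own predictions, this variant trains the network to regress directly onto the observed total return for the batch.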
def check_input_and_fill_run_dict(parser):
    args = parser.parse_args()

    # if no arg is given
    if len(sys.argv) == 1:
        parser.print_help()
        exit(0)

    # list available presets
    if args.list:
        presets_lists = list_all_classes_in_module(presets)
        screen.log_title("Available Presets:")
        for preset in presets_lists:
            print(preset)
        sys.exit(0)

    # check inputs
    try:
        # re.match returns None (and .group raises AttributeError) for non-numeric input
        num_workers = int(re.match(r"^\d+$", args.num_workers).group(0))
    except (ValueError, AttributeError):
        screen.error("Parameter num_workers should be an integer.")

    preset_names = list_all_classes_in_module(presets)
    if args.preset is not None and args.preset not in preset_names:
        screen.error("A non-existing preset was selected.")

    if args.checkpoint_restore_dir is not None and not os.path.exists(args.checkpoint_restore_dir):
        screen.error("The requested checkpoint folder to load from does not exist.")

    if args.save_model_sec is not None:
        try:
            args.save_model_sec = int(args.save_model_sec)
        except ValueError:
            screen.error("Parameter save_model_sec should be an integer.")

    if args.preset is None and (args.agent_type is None or args.environment_type is None
                                or args.exploration_policy_type is None) and not args.play:
        screen.error('When no preset is given for Coach to run, the user is expected to input the desired agent_type,'
                     ' environment_type and exploration_policy_type to assemble a preset. '
                     '\nAt least one of these parameters was not given.')
    elif args.preset is None and args.play and args.environment_type is None:
        screen.error('When no preset is given for Coach to run, and the user requests human control over the'
                     ' environment, the user is expected to input the desired environment_type and level.'
                     '\nAt least one of these parameters was not given.')
    elif args.preset is None and args.play and args.environment_type:
        args.agent_type = 'Human'
        args.exploration_policy_type = 'ExplorationParameters'

    # get experiment name and path
    experiment_name = logger.get_experiment_name(args.experiment_name)
    experiment_path = logger.get_experiment_path(experiment_name)

    if args.play and num_workers > 1:
        screen.warning("Playing the game as a human is only available with a single worker. "
                       "The number of workers will be reduced to 1.")
        num_workers = 1

    # fill run_dict
    run_dict = dict()
    run_dict['agent_type'] = args.agent_type
    run_dict['environment_type'] = args.environment_type
    run_dict['exploration_policy_type'] = args.exploration_policy_type
    run_dict['level'] = args.level
    run_dict['preset'] = args.preset
    run_dict['custom_parameter'] = args.custom_parameter
    run_dict['experiment_path'] = experiment_path
    run_dict['framework'] = Frameworks().get(args.framework)
    run_dict['play'] = args.play
    run_dict['evaluate'] = args.evaluate  # or args.play

    # multi-threading parameters
    run_dict['num_threads'] = num_workers

    # checkpoints
    run_dict['save_model_sec'] = args.save_model_sec
    run_dict['save_model_dir'] = experiment_path if args.save_model_sec is not None else None
    run_dict['checkpoint_restore_dir'] = args.checkpoint_restore_dir

    # visualization
    run_dict['visualization.dump_gifs'] = args.dump_gifs
    run_dict['visualization.render'] = args.render
    run_dict['visualization.tensorboard'] = args.tensorboard

    return args, run_dict
worker_hosts = ",".join(["localhost:{}".format(get_open_port()) for i in range(run_dict['num_threads'] + 1)])

# make sure to disable the GPU so that all the workers will use the CPU
set_cpu()

# create a parameter server
cmd = [
    "python3",
    "./parallel_actor.py",
    "--ps_hosts={}".format(ps_hosts),
    "--worker_hosts={}".format(worker_hosts),
    "--job_name=ps",
]
parameter_server = Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, bufsize=1)

screen.log_title("*** Distributed Training ***")
time.sleep(1)

# create N training workers and 1 evaluating worker
workers = []

for i in range(run_dict['num_threads'] + 1):
    run_dict['task_id'] = i
    # the last worker is the evaluation worker
    if i == run_dict['num_threads']:
        run_dict['evaluate_only'] = True
        run_dict['visualization.render'] = args.render
    else:
        run_dict['evaluate_only'] = False
        # in a parallel setting, only the evaluation agent renders
        run_dict['visualization.render'] = False
agent_params.exploration = AdditiveNoiseParameters()
agent_params.exploration.noise_percentage_schedule = ConstantSchedule(0)
agent_params.exploration.evaluation_noise_percentage = 0

# no playing during the training phase
agent_params.algorithm.num_consecutive_playing_steps = EnvironmentSteps(0)

# use the following command line to download and extract the CARLA dataset:
# python rl_coach/utilities/carla_dataset_to_replay_buffer.py
agent_params.memory.load_memory_from_file_path = "./datasets/carla_train_set_replay_buffer.p"
agent_params.memory.state_key_with_the_class_index = 'high_level_command'
agent_params.memory.num_classes = 4

# download the dataset if it doesn't exist
if not os.path.exists(agent_params.memory.load_memory_from_file_path):
    screen.log_title("The CARLA dataset is not present in the following path: {}"
                     .format(agent_params.memory.load_memory_from_file_path))
    result = screen.ask_yes_no("Do you want to download it now?")
    if result:
        create_dataset(None, "./datasets/carla_train_set_replay_buffer.p")
    else:
        screen.error("Please update the path to the CARLA dataset in the CARLA_CIL preset", crash=True)

###############
# Environment #
###############
env_params = CarlaEnvironmentParameters()
env_params.level = 'town1'
env_params.cameras = ['CameraRGB']
env_params.camera_height = 600
env_params.camera_width = 800
def __init__(self, env, tuning_parameters, replicated_device=None, task_id=0):
    """
    :param env: An environment instance
    :type env: EnvironmentWrapper
    :param tuning_parameters: A Preset class instance with all the running parameters
    :type tuning_parameters: Preset
    :param replicated_device: A tensorflow device for distributed training (optional)
    :type replicated_device: instancemethod
    :param task_id: The current task id
    :type task_id: int
    """
    screen.log_title("Creating agent {}".format(task_id))
    self.task_id = task_id
    self.sess = tuning_parameters.sess
    self.env = tuning_parameters.env_instance = env
    self.imitation = False

    # i/o dimensions
    if not tuning_parameters.env.desired_observation_width or not tuning_parameters.env.desired_observation_height:
        tuning_parameters.env.desired_observation_width = self.env.width
        tuning_parameters.env.desired_observation_height = self.env.height
    self.action_space_size = tuning_parameters.env.action_space_size = self.env.action_space_size
    self.measurements_size = tuning_parameters.env.measurements_size = self.env.measurements_size
    if tuning_parameters.agent.use_accumulated_reward_as_measurement:
        self.measurements_size = tuning_parameters.env.measurements_size = (self.measurements_size[0] + 1,)

    # modules
    if tuning_parameters.agent.load_memory_from_file_path:
        screen.log_title("Loading replay buffer from pickle. Pickle path: {}"
                         .format(tuning_parameters.agent.load_memory_from_file_path))
        self.memory = read_pickle(tuning_parameters.agent.load_memory_from_file_path)
    else:
        self.memory = eval(tuning_parameters.memory + '(tuning_parameters)')
    # self.architecture = eval(tuning_parameters.architecture)

    self.has_global = replicated_device is not None
    self.replicated_device = replicated_device
    self.worker_device = "/job:worker/task:{}/cpu:0".format(task_id) if replicated_device is not None else "/gpu:0"

    self.exploration_policy = eval(tuning_parameters.exploration.policy + '(tuning_parameters)')
    self.evaluation_exploration_policy = eval(tuning_parameters.exploration.evaluation_policy +
                                              '(tuning_parameters)')
    self.evaluation_exploration_policy.change_phase(RunPhase.TEST)

    # initialize all internal variables
    self.tp = tuning_parameters
    self.in_heatup = False
    self.total_reward_in_current_episode = 0
    self.total_steps_counter = 0
    self.running_reward = None
    self.training_iteration = 0
    self.current_episode = self.tp.current_episode = 0
    self.curr_state = {}
    self.current_episode_steps_counter = 0
    self.episode_running_info = {}
    self.last_episode_evaluation_ran = 0
    self.running_observations = []
    logger.set_current_time(self.current_episode)
    self.main_network = None
    self.networks = []
    self.last_episode_images = []
    self.renderer = Renderer()

    # signals
    self.signals = []
    self.loss = Signal('Loss')
    self.signals.append(self.loss)
    self.curr_learning_rate = Signal('Learning Rate')
    self.signals.append(self.curr_learning_rate)

    if self.tp.env.normalize_observation and not self.env.is_state_type_image:
        if not self.tp.distributed or not self.tp.agent.share_statistics_between_workers:
            self.running_observation_stats = RunningStat((self.tp.env.desired_observation_width,))
            self.running_reward_stats = RunningStat(())
        else:
            self.running_observation_stats = SharedRunningStats(self.tp, replicated_device,
                                                                shape=(self.tp.env.desired_observation_width,),
                                                                name='observation_stats')
            self.running_reward_stats = SharedRunningStats(self.tp, replicated_device,
                                                           shape=(),
                                                           name='reward_stats')

    # env is already reset at this point. Otherwise we're getting an error
    # where you cannot reset an env which is not done
    self.reset_game(do_not_reset_env=True)

    # use seed
    if self.tp.seed is not None:
        random.seed(self.tp.seed)
        np.random.seed(self.tp.seed)
def check_input_and_fill_run_dict(parser):
    args = parser.parse_args()

    # if no arg is given
    if len(sys.argv) == 1:
        parser.print_help()
        exit(0)

    # list available presets
    if args.list:
        presets_lists = list_all_classes_in_module(presets)
        screen.log_title("Available Presets:")
        for preset in presets_lists:
            print(preset)
        sys.exit(0)

    # check inputs
    try:
        # re.match returns None (and .group raises AttributeError) for non-numeric input
        num_workers = int(re.match(r"^\d+$", args.num_workers).group(0))
    except (ValueError, AttributeError):
        screen.error("Parameter num_workers should be an integer.")
        exit(1)

    preset_names = list_all_classes_in_module(presets)
    if args.preset is not None and args.preset not in preset_names:
        screen.error("A non-existing preset was selected.")
        exit(1)

    if args.checkpoint_restore_dir is not None and not os.path.exists(args.checkpoint_restore_dir):
        screen.error("The requested checkpoint folder to load from does not exist.")
        exit(1)

    if args.save_model_sec is not None:
        try:
            args.save_model_sec = int(args.save_model_sec)
        except ValueError:
            screen.error("Parameter save_model_sec should be an integer.")
            exit(1)

    if args.preset is None and (args.agent_type is None or args.environment_type is None
                                or args.exploration_policy_type is None):
        screen.error('When no preset is given for Coach to run, the user is expected to input the desired agent_type,'
                     ' environment_type and exploration_policy_type to assemble a preset. '
                     '\nAt least one of these parameters was not given.')
        exit(1)

    experiment_name = args.experiment_name
    if args.experiment_name == '':
        experiment_name = screen.ask_input("Please enter an experiment name: ")
    experiment_name = experiment_name.replace(" ", "_")

    match = re.match(r"^$|^\w{1,100}$", experiment_name)
    if match is None:
        screen.error('Experiment name must be composed only of alphanumeric letters and underscores and should not be '
                     'longer than 100 characters.')
        exit(1)
    experiment_path = os.path.join('./experiments/', match.group(0))
    experiment_path = get_experiment_path(experiment_path)

    # fill run_dict
    run_dict = dict()
    run_dict['agent_type'] = args.agent_type
    run_dict['environment_type'] = args.environment_type
    run_dict['exploration_policy_type'] = args.exploration_policy_type
    run_dict['preset'] = args.preset
    run_dict['custom_parameter'] = args.custom_parameter
    run_dict['experiment_path'] = experiment_path
    run_dict['framework'] = Frameworks().get(args.framework)

    # multi-threading parameters
    run_dict['num_threads'] = num_workers

    # checkpoints
    run_dict['save_model_sec'] = args.save_model_sec
    run_dict['save_model_dir'] = experiment_path if args.save_model_sec is not None else None
    run_dict['checkpoint_restore_dir'] = args.checkpoint_restore_dir

    # visualization
    run_dict['visualization.dump_gifs'] = args.dump_gifs
    run_dict['visualization.render'] = args.render

    return args, run_dict
preset = eval('presets.{}()'.format(preset_name))
if preset.test and preset_name not in presets_to_ignore:
    frameworks = []
    if preset.agent.tensorflow_support and not args.ignore_tensorflow:
        frameworks.append('tensorflow')
    if preset.agent.neon_support and not args.ignore_neon:
        frameworks.append('neon')

    for framework in frameworks:
        if args.stop_after_first_failure and fail_count > 0:
            break

        test_count += 1

        # run the experiment in a separate thread
        screen.log_title("Running test {} - {}".format(preset_name, framework))
        log_file_name = 'test_log_{preset_name}_{framework}.txt'.format(
            preset_name=preset_name,
            framework=framework,
        )
        cmd = (
            'CUDA_VISIBLE_DEVICES='
            ' python3 coach.py '
            '-p {preset_name} '
            '-f {framework} '
            '-e {test_name} '
            '-n {num_workers} '
            '-cp "seed=0" '
            '&> {log_file_name} '
        ).format(
            preset_name=preset_name,
            framework=framework,
            test_name=test_name,