def _gather_last_observation(env, actions, step, homing_policies, selection_weights): start_obs, meta = env.reset() if step > 1: if selection_weights is None: # Select a homing policy for the previous time step randomly uniformly policy = random.choice(homing_policies[step - 1]) else: # Select a homing policy for the previous time step using the given weights # policy = random.choices(homing_policies[step - 1], weights=selection_weights, k=1)[0] ix = gp.sample_action_from_prob(selection_weights) policy = homing_policies[step - 1][ix] obs = start_obs for step_ in range(1, step): obs_var = cuda_var(torch.from_numpy(obs)).float().view(1, -1) action = policy[step_].sample_action(obs_var) obs, reward, done, meta = env.step(action) action = random.choice(actions) new_obs, reward, done, meta = env.step(action) return new_obs, meta
def take_action(self, log_probabilities, new_model_state, image_emb_seq, factor_entropy):
    """Sample the next action from the policy output and send it to the server.

    The client must be in the ``Client.WAITING_FOR_ACTION`` state.  The model
    state, log-probabilities, image embedding sequence and factor entropy are
    stored on the client, an action is sampled with
    ``gp.sample_action_from_prob`` and the action counter is incremented.  A
    sampled stop action halts the server; any other action is forwarded to
    it.  Both calls are non-blocking, and the client ends in the
    ``Client.WAITING_TO_RECEIVE`` state.
    """
    assert self.status == Client.WAITING_FOR_ACTION

    # First row of the exponentiated log-probabilities.
    action_probs = list(torch.exp(log_probabilities.data))[0]

    # Remember what produced this decision.
    self.model_state, self.last_log_prob = new_model_state, log_probabilities
    self.image_emb_seq, self.factor_entropy = image_emb_seq, factor_entropy

    # Use test policy to get the action
    self.last_action = gp.sample_action_from_prob(action_probs)
    self.num_action += 1

    # NOTE: a disabled experiment used to add a forced-stop replay item here
    # (weighted by probability[3]) whenever metadata["goal_dist"] < 5.

    stop_ix = self.agent.action_space.get_stop_action_index()
    if self.last_action != stop_ix:
        self.server.send_action_nonblocking(self.last_action)
    else:
        self.server.halt_nonblocking()

    self.status = Client.WAITING_TO_RECEIVE
def _gather_sample(env, actions, step, homing_policies, selection_weights=None):
    """Gather one transition sample using the ALL_RANDOM style.

    The environment is reset and, when ``step > 1``, rolled in for
    ``step - 1`` transitions with one policy from ``homing_policies[step - 1]``
    (index drawn uniformly if ``selection_weights`` is None, otherwise sampled
    from ``selection_weights``).  A uniformly random action from ``actions`` is
    then executed and the resulting transition is returned.

    Returns:
        A ``TransitionDatapoint`` with ``y=1``, the observation before and
        after the random action, the ``"state"`` metadata entries when present
        (else None), the uniform action probability, the index of the roll-in
        policy (None when no roll-in happened), ``step`` and the reward.
    """

    def _state_of(info):
        # Metadata may be None or may lack the "state" entry.
        return info["state"] if info is not None and "state" in info else None

    current_obs, meta = env.reset()
    ix = None  # stays None when no homing policy is needed

    if step > 1:
        if selection_weights is not None:
            # policy = random.choices(homing_policies[step - 1], weights=selection_weights, k=1)[0]
            ix = gp.sample_action_from_prob(selection_weights)
        else:
            ix = random.randint(0, len(homing_policies[step - 1]) - 1)
        rollin_policy = homing_policies[step - 1][ix]

        for t in range(1, step):
            features = cuda_var(torch.from_numpy(current_obs)).float().view(1, -1)
            rollin_action = rollin_policy[t].sample_action(features)
            current_obs, _, _, meta = env.step(rollin_action)

    curr_state = _state_of(meta)

    # Deviate with a uniformly random action.
    deviation_action = random.choice(actions)
    action_prob = 1.0 / float(max(1, len(actions)))
    next_obs, reward, _, new_meta = env.step(deviation_action)
    next_state = _state_of(new_meta)

    return TransitionDatapoint(curr_obs=current_obs,
                               action=deviation_action,
                               next_obs=next_obs,
                               y=1,
                               curr_state=curr_state,
                               next_state=next_state,
                               action_prob=action_prob,
                               policy_index=ix,
                               step=step,
                               reward=reward)
def _explore_and_set_tracking(self, server, data_point):
    """Sample a goal cell from the panorama attention and start tracking it.

    A panorama is requested from ``server``, an attention distribution is
    computed by ``self.local_predictor_model`` and an index is sampled from
    all but its last entry.  The flat index is decomposed on a grid with
    ``num_manipulation_row`` rows and ``6 * num_manipulation_col`` columns;
    the column selects one of six regions (mapped to a camera index) and a
    column inside that region.  The centre of the chosen cell, as a fraction
    of the region's rows/columns, is handed to ``server.set_tracking``.

    Returns:
        The attention entry at the sampled index, or None when the
        out-of-sight branch is taken.
    """
    # Get the panoramic image
    panorama, _ = server.explore()

    # Get the panorama and predict the goal location
    state = AgentObservedState(instruction=data_point.instruction,
                               config=self.config,
                               constants=self.constants,
                               start_image=panorama,
                               previous_action=None,
                               pose=None,
                               position_orientation=None,
                               data_point=data_point)
    volatile = self.local_predictor_model.get_attention_prob(state, model_state=None)

    # The last attention entry is left out of the sampling distribution.
    flat_probs = volatile["attention_probs"].view(-1)[:-1]
    attention_prob = list(flat_probs.data.cpu().numpy())
    inferred_ix = gp.sample_action_from_prob(attention_prob)
    # NOTE(review): indexes the un-flattened tensor; assumes it is 1-D - confirm.
    sampled_prob = volatile["attention_probs"][inferred_ix]

    num_rows = self.config["num_manipulation_row"]
    num_cols = self.config["num_manipulation_col"]
    num_cells = 6 * num_rows * num_cols

    # NOTE(review): with the last entry sliced off above, this branch looks
    # reachable only if the sampler can return an index past the list - confirm.
    if inferred_ix == num_cells:
        print("Predicting Out-of-sight")
        return

    assert 0 <= inferred_ix < num_cells

    row = int(inferred_ix / (6 * num_cols))
    panorama_col = inferred_ix % (6 * num_cols)
    region_ix = int(panorama_col / num_cols)

    region_to_camera = {0: 3, 1: 4, 2: 5, 3: 0, 4: 1, 5: 2}
    if region_ix not in region_to_camera:
        raise AssertionError("region ix should be in {0, 1, 2, 3, 4, 5}. Found ", region_ix)
    camera_ix = region_to_camera[region_ix]

    col = panorama_col % num_cols

    # Set tracking
    row_value = min(1.0, (row + 0.5) / float(num_rows))
    col_value = min(1.0, (col + 0.5) / float(num_cols))
    server.set_tracking(camera_ix, row_value, col_value)

    return sampled_prob
def num_oracle_rollin_segments(self, num_segments):
    """Sample a value in terms of ``num_segments`` geometric weights.

    Entry ``i`` of the weight list is ``self.p`` multiplied by itself
    ``num_segments - i`` times, so the last entry is ``self.p`` and earlier
    entries shrink (for ``p < 1``).  The weights are normalised, the call
    counter is incremented, ``self._decay()`` is invoked, and the result of
    ``gp.sample_action_from_prob`` on the normalised weights is returned
    (presumably a sampled index - confirm against ``gp``).
    """
    weights = [0] * num_segments
    total = 0.00001  # small value for numerical stability

    # Fill from the back so each entry can reuse its successor.
    last_ix = num_segments - 1
    for i in reversed(range(num_segments)):
        weights[i] = self.p if i == last_ix else self.p * weights[i + 1]
        total += weights[i]

    normalised = [weight / total for weight in weights]

    self.num_call += 1
    self._decay()

    return gp.sample_action_from_prob(normalised)
def do_train(self, agent, train_dataset, tune_dataset, experiment_name):
    """Train the policy with one parameter update per training episode.

    For every epoch in ``1..self.max_epoch`` the agent is first evaluated on
    ``tune_dataset``.  Then, for each training data point, the current policy
    is rolled out by sampling actions from ``self.model.get_probs`` until the
    stop action is sampled or the action budget (trajectory length plus
    ``self.constants["max_extra_horizon"]``) is used up.  Every executed
    action is stored as a ``ReplayMemoryItem``; after the episode is halted
    ``self.do_update`` is called on the collected items.  The model is saved
    at the end of each epoch.

    Args:
        agent: Object exposing ``server``, ``action_space`` and ``test``.
        train_dataset: Sized iterable of training data points.
        tune_dataset: Data passed to ``agent.test`` at the start of each epoch.
        experiment_name: Prefix of the path the model is saved under.
    """
    dataset_size = len(train_dataset)

    for epoch in range(1, self.max_epoch + 1):

        logging.info("Starting epoch %d", epoch)
        action_counts = [0] * self.action_space.num_actions()

        # Test on tuning data
        agent.test(tune_dataset, tensorboard=self.tensorboard)

        batch_replay_items = []
        total_reward = 0
        episodes_in_batch = 0

        for data_point_ix, data_point in enumerate(train_dataset):

            if (data_point_ix + 1) % 100 == 0:
                logging.info("Done %d out of %d", data_point_ix, dataset_size)
                logging.info("Training data action counts %r", action_counts)

            instruction = data_point.get_paragraph_instruction()

            # Budget: length of the reference trajectory plus some slack.
            num_actions = 0
            max_num_actions = len(data_point.get_trajectory())
            max_num_actions += self.constants["max_extra_horizon"]

            image, metadata = agent.server.reset_receive_feedback(data_point)

            # The pose is y_angle divided by 15 and truncated to an int.
            pose = int(metadata["y_angle"] / 15.0)
            position_orientation = (metadata["x_pos"], metadata["z_pos"],
                                    metadata["y_angle"])
            state = AgentObservedState(
                instruction=instruction,
                config=self.config,
                constants=self.constants,
                start_image=image,
                previous_action=None,
                pose=pose,
                position_orientation=position_orientation,
                data_point=data_point)
            state.start_read_pointer, state.end_read_pointer = \
                data_point.get_instruction_indices()

            # Stays True unless the policy itself samples the stop action.
            forced_stop = True

            while num_actions < max_num_actions:

                # Sample action using the policy
                # Generate probabilities over actions
                probabilities = list(
                    torch.exp(self.model.get_probs(state).data))

                # Use test policy to get the action
                action = gp.sample_action_from_prob(probabilities)
                action_counts[action] += 1

                if action == agent.action_space.get_stop_action_index():
                    forced_stop = False
                    break

                # Send the action and get feedback
                image, reward, metadata = \
                    agent.server.send_action_receive_feedback(action)

                # Store it in the replay memory list
                replay_item = ReplayMemoryItem(state, action, reward)
                batch_replay_items.append(replay_item)

                # Update the agent state
                pose = int(metadata["y_angle"] / 15.0)
                position_orientation = (metadata["x_pos"],
                                        metadata["z_pos"],
                                        metadata["y_angle"])
                state = state.update(
                    image,
                    action,
                    pose=pose,
                    position_orientation=position_orientation,
                    data_point=data_point)

                num_actions += 1
                total_reward += reward

            # Send final STOP action and get feedback
            image, reward, metadata = agent.server.halt_and_receive_feedback()
            total_reward += reward

            # Store it in the replay memory list; the stop action is only
            # credited to the policy when the policy actually chose it.
            if not forced_stop:
                replay_item = ReplayMemoryItem(
                    state, agent.action_space.get_stop_action_index(), reward)
                batch_replay_items.append(replay_item)

            # Perform update (batch size is one episode)
            episodes_in_batch += 1
            if episodes_in_batch == 1:
                loss_val = self.do_update(batch_replay_items)
                batch_replay_items = []
                # The tensorboard is optional (see the guard below), so the
                # per-update log must not assume it exists.
                if self.tensorboard is not None:
                    cross_entropy = float(self.cross_entropy.data[0])
                    self.tensorboard.log(cross_entropy, loss_val, total_reward)
                total_reward = 0
                episodes_in_batch = 0

            if self.tensorboard is not None:
                self.tensorboard.log_all_train_errors(
                    metadata["edit_dist_error"],
                    metadata["closest_dist_error"],
                    metadata["stop_dist_error"])

        # Save the model
        self.model.save_model(experiment_name +
                              "/contextual_bandit_resnet_epoch_" + str(epoch))

        logging.info("Training data action counts %r", action_counts)
def do_train_forced_reading(self, agent, train_dataset, tune_dataset,
                            experiment_name):
    """Train the act policy while reading is forced to follow oracle segments.

    For every epoch in ``1..self.max_epoch`` the agent is evaluated with
    ``agent.test_forced_reading`` and then each training data point is rolled
    out.  The rollout alternates between two modes:

    * READ mode reads exactly as many tokens as the next oracle instruction
      segment contains and switches to ACT mode.
    * ACT mode samples an action from ``self.model.get_probs(state, mode)``.
      A sampled stop action, or exceeding the per-segment action budget,
      ends the segment: if tokens are left the agent goes back to READ mode,
      otherwise the episode ends.  Any other action is sent to the server.

    ``self.do_update`` runs once per episode and the model is saved once per
    epoch.

    Args:
        agent: Must be a ``ReadPointerAgent``.
        train_dataset: Sized iterable of training data points.
        tune_dataset: Data passed to ``agent.test_forced_reading``.
        experiment_name: Prefix of the path the model is saved under.
    """

    assert isinstance(
        agent, ReadPointerAgent
    ), "This learning algorithm works only with READPointerAgent"

    dataset_size = len(train_dataset)

    for epoch in range(1, self.max_epoch + 1):

        logging.info("Starting epoch %d", epoch)
        # Per-mode histogram of sampled actions (read mode has two actions).
        action_counts = dict()
        action_counts[ReadPointerAgent.READ_MODE] = [0] * 2
        action_counts[ReadPointerAgent.
                      ACT_MODE] = [0] * self.action_space.num_actions()

        # Test on tuning data
        agent.test_forced_reading(tune_dataset, tensorboard=self.tensorboard)

        batch_replay_items = []
        total_reward = 0
        episodes_in_batch = 0

        for data_point_ix, data_point in enumerate(train_dataset):

            if (data_point_ix + 1) % 100 == 0:
                logging.info("Done %d out of %d", data_point_ix, dataset_size)
                logging.info("Training data action counts %r", action_counts)

            # Total budget: reference trajectory length plus some slack.
            num_actions = 0
            max_num_actions = len(data_point.get_trajectory())
            max_num_actions += self.constants["max_extra_horizon"]

            image, metadata = agent.server.reset_receive_feedback(
                data_point)
            oracle_segments = data_point.get_instruction_oracle_segmented()

            # The pose is y_angle divided by 15 and truncated to an int.
            pose = int(metadata["y_angle"] / 15.0)
            state = AgentObservedState(instruction=data_point.instruction,
                                       config=self.config,
                                       constants=self.constants,
                                       start_image=image,
                                       previous_action=None,
                                       pose=pose)

            # The total budget is split evenly across the oracle segments.
            per_segment_budget = int(max_num_actions / len(oracle_segments))
            num_segment_actions = 0

            mode = ReadPointerAgent.READ_MODE
            current_segment_ix = 0

            while True:

                if mode == ReadPointerAgent.READ_MODE:
                    # Find the number of tokens to read for the gold segment
                    num_segment_size = len(
                        oracle_segments[current_segment_ix])
                    current_segment_ix += 1
                    for i in range(0, num_segment_size):
                        state = state.update_on_read()
                    mode = ReadPointerAgent.ACT_MODE

                elif mode == ReadPointerAgent.ACT_MODE:

                    # Sample action using the policy
                    # Generate probabilities over actions
                    probabilities = list(
                        torch.exp(self.model.get_probs(state, mode).data))

                    # Use test policy to get the action
                    action = gp.sample_action_from_prob(probabilities)
                    action_counts[mode][action] += 1

                    # deal with act mode boundary conditions
                    if num_actions >= max_num_actions:
                        # Overall budget exhausted: the stop is not the
                        # policy's own decision.
                        forced_stop = True
                        break

                    elif action == agent.action_space.get_stop_action_index(
                    ) or num_segment_actions > per_segment_budget:

                        if state.are_tokens_left_to_be_read():
                            # reward = self._calc_reward_act_halt(state)
                            # Reward for ending the segment depends on the
                            # "error" value of the most recent metadata.
                            if metadata["error"] < 5.0:
                                reward = 1.0
                            else:
                                reward = -1.0

                            # Add to replay memory
                            replay_item = ReplayMemoryItem(
                                state,
                                agent.action_space.get_stop_action_index(),
                                reward, mode)
                            # Only a stop the policy sampled itself is
                            # stored; a budget-forced stop is not.
                            if action == agent.action_space.get_stop_action_index(
                            ):
                                batch_replay_items.append(replay_item)

                            mode = ReadPointerAgent.READ_MODE
                            # presumably moves the simulator on to the next
                            # segment's goal - confirm against the server
                            agent.server.force_goal_update()
                            state = state.update_on_act_halt()
                            num_segment_actions = 0
                        else:
                            # Nothing left to read: the episode ends here.
                            if action == agent.action_space.get_stop_action_index(
                            ):
                                forced_stop = False
                            else:  # stopping due to per segment budget exhaustion
                                forced_stop = True
                            break

                    else:
                        image, reward, metadata = agent.server.send_action_receive_feedback(
                            action)

                        # Store it in the replay memory list
                        replay_item = ReplayMemoryItem(state,
                                                       action,
                                                       reward,
                                                       mode=mode)
                        batch_replay_items.append(replay_item)

                        # Update the agent state
                        pose = int(metadata["y_angle"] / 15.0)
                        state = state.update(image, action, pose=pose)

                        num_actions += 1
                        num_segment_actions += 1
                        total_reward += reward

                else:
                    raise AssertionError(
                        "Mode should be either read or act. Unhandled mode: " + str(mode))

            assert mode == ReadPointerAgent.ACT_MODE, "Agent should end on Act Mode"

            # Send final STOP action and get feedback
            image, reward, metadata = agent.server.halt_and_receive_feedback(
            )
            total_reward += reward

            # Store it in the replay memory list
            if not forced_stop:
                replay_item = ReplayMemoryItem(
                    state, agent.action_space.get_stop_action_index(),
                    reward, mode)
                batch_replay_items.append(replay_item)

            # Update the scores based on meta_data
            # self.meta_data_util.log_results(metadata)

            # Perform update (batch size is one episode)
            episodes_in_batch += 1
            if episodes_in_batch == 1:
                loss_val = self.do_update(batch_replay_items)
                batch_replay_items = []
                # NOTE(review): self.tensorboard is used unguarded here and
                # below; confirm it can never be None for this learner.
                entropy_val = float(self.entropy.data[0])
                self.tensorboard.log(entropy_val, loss_val, total_reward)
                total_reward = 0
                episodes_in_batch = 0

            self.tensorboard.log_train_error(metadata["error"])

        # Save the model
        self.model.save_model(
            experiment_name +
            "/read_pointer_forced_reading_contextual_bandit_resnet_epoch_"
            + str(epoch))

        logging.info("Training data action counts %r", action_counts)
def do_train(self, agent, train_dataset, tune_dataset, experiment_name):
    """Train the read-pointer policy, learning both reading and acting.

    For every epoch in ``1..self.max_epoch`` the agent is evaluated with
    ``agent.test`` and then each training data point is rolled out.  An
    action is sampled from ``self.model.get_probs(state, mode)`` at every
    step and interpreted according to the current mode:

    * READ mode has two actions, read (0) and halt (1).  Halt is forced when
      no tokens are left; read is forced when the action budget is used up or
      the previous action was a halt.  Forced actions are not stored in the
      replay memory.  Halt switches to ACT mode.
    * ACT mode ends the episode when the action budget is used up, or when
      the stop action is sampled and no tokens are left.  A stop with tokens
      left switches back to READ mode.  Any other action is sent to the
      server.

    ``self.do_update`` runs once per episode and the model is saved once per
    epoch.

    Args:
        agent: Must be a ``ReadPointerAgent``.
        train_dataset: Sized iterable of training data points.
        tune_dataset: Data passed to ``agent.test`` each epoch.
        experiment_name: Prefix of the path the model is saved under.
    """

    assert isinstance(
        agent, ReadPointerAgent
    ), "This learning algorithm works only with READPointerAgent"

    dataset_size = len(train_dataset)

    for epoch in range(1, self.max_epoch + 1):

        logging.info("Starting epoch %d", epoch)
        # Per-mode histogram of sampled actions (read mode has two actions).
        action_counts = dict()
        action_counts[ReadPointerAgent.READ_MODE] = [0] * 2
        action_counts[ReadPointerAgent.ACT_MODE] = \
            [0] * self.action_space.num_actions()

        # Test on tuning data
        agent.test(tune_dataset, tensorboard=self.tensorboard)

        batch_replay_items = []
        total_reward = 0
        episodes_in_batch = 0

        for data_point_ix, data_point in enumerate(train_dataset):

            if (data_point_ix + 1) % 100 == 0:
                logging.info("Done %d out of %d", data_point_ix, dataset_size)
                logging.info("Training data action counts %r", action_counts)

            # Budget: length of the reference trajectory plus some slack.
            num_actions = 0
            max_num_actions = len(data_point.get_trajectory())
            max_num_actions += self.constants["max_extra_horizon"]

            image, metadata = agent.server.reset_receive_feedback(data_point)
            state = AgentObservedState(instruction=data_point.instruction,
                                       config=self.config,
                                       constants=self.constants,
                                       start_image=image,
                                       previous_action=None)

            mode = ReadPointerAgent.READ_MODE
            last_action_was_halt = False

            instruction = instruction_to_string(
                data_point.get_instruction(), self.config)
            # Call form of print: a single parenthesised argument gives the
            # same output on Python 2 and is valid syntax on Python 3.
            print("TRAIN INSTRUCTION: %r" % instruction)
            print("")

            while True:

                # Sample action using the policy
                # Generate probabilities over actions
                probabilities = list(
                    torch.exp(self.model.get_probs(state, mode).data))

                # Use test policy to get the action.  The count records the
                # sampled action, even if it is overridden below.
                action = gp.sample_action_from_prob(probabilities)
                action_counts[mode][action] += 1

                if mode == ReadPointerAgent.READ_MODE:
                    # read mode boundary conditions
                    forced_action = False
                    if not state.are_tokens_left_to_be_read():
                        # force halt
                        action = 1
                        forced_action = True
                    elif num_actions >= max_num_actions or last_action_was_halt:
                        # force read
                        action = 0
                        forced_action = True

                    if not forced_action:
                        # Store reward in the replay memory list; forced
                        # actions were not the policy's choice and are skipped.
                        reward = self._calc_reward_read_mode(state, action)
                        replay_item = ReplayMemoryItem(state,
                                                       action,
                                                       reward,
                                                       mode=mode)
                        batch_replay_items.append(replay_item)

                    if action == 0:
                        last_action_was_halt = False
                        state = state.update_on_read()
                    elif action == 1:
                        last_action_was_halt = True
                        mode = ReadPointerAgent.ACT_MODE
                    else:
                        raise AssertionError(
                            "Read mode only supports two actions: read(0) and halt(1). "
                            + "Found " + str(action))

                elif mode == ReadPointerAgent.ACT_MODE:
                    # deal with act mode boundary conditions
                    if num_actions >= max_num_actions:
                        # Budget exhausted: the stop is not the policy's own.
                        forced_stop = True
                        break

                    elif action == agent.action_space.get_stop_action_index():
                        if state.are_tokens_left_to_be_read():
                            reward = self._calc_reward_act_halt(state)

                            # Add to replay memory
                            replay_item = ReplayMemoryItem(
                                state,
                                agent.action_space.get_stop_action_index(),
                                reward, mode)
                            batch_replay_items.append(replay_item)

                            # More instruction to read: go back to READ mode.
                            mode = ReadPointerAgent.READ_MODE
                            last_action_was_halt = True
                            state = state.update_on_act_halt()
                        else:
                            forced_stop = False
                            break

                    else:
                        image, reward, metadata = \
                            agent.server.send_action_receive_feedback(action)

                        # Store it in the replay memory list
                        replay_item = ReplayMemoryItem(state,
                                                       action,
                                                       reward,
                                                       mode=mode)
                        batch_replay_items.append(replay_item)

                        # Update the agent state
                        state = state.update(image, action)

                        num_actions += 1
                        total_reward += reward
                        last_action_was_halt = False

                else:
                    raise AssertionError(
                        "Mode should be either read or act. Unhandled mode: " + str(mode))

            assert mode == ReadPointerAgent.ACT_MODE, "Agent should end on Act Mode"

            # Send final STOP action and get feedback
            image, reward, metadata = agent.server.halt_and_receive_feedback()
            total_reward += reward

            # Store it in the replay memory list; the stop action is only
            # credited to the policy when the policy actually chose it.
            if not forced_stop:
                replay_item = ReplayMemoryItem(
                    state, agent.action_space.get_stop_action_index(),
                    reward, mode)
                batch_replay_items.append(replay_item)

            # Perform update (batch size is one episode)
            episodes_in_batch += 1
            if episodes_in_batch == 1:
                loss_val = self.do_update(batch_replay_items)
                batch_replay_items = []
                entropy_val = float(self.entropy.data[0])
                self.tensorboard.log(entropy_val, loss_val, total_reward)
                total_reward = 0
                episodes_in_batch = 0

            self.tensorboard.log_train_error(metadata["error"])

        # Save the model
        self.model.save_model(
            experiment_name +
            "/read_pointer_contextual_bandit_resnet_epoch_" + str(epoch))

        logging.info("Training data action counts %r", action_counts)
def do_train_(house_id, shared_model, config, action_space, meta_data_util, constants, train_dataset, tune_dataset, experiment, experiment_name, rank, server, logger, model_type, vocab, use_pushover=False):
    """Run one asynchronous contextual-bandit training client for a house.

    Launches the simulator build for ``house_id``, initialises ``server``,
    creates a local model of ``model_type`` and then trains: before every
    episode the local model is synchronised from ``shared_model``, the policy
    is rolled out by sampling actions until the stop action is sampled or
    ``constants["horizon"] + constants["max_extra_horizon"]`` actions were
    taken, and ``learner.do_update`` is called on the collected replay items.
    Only the client with ``rank == 0`` creates a tensorboard and logs to it.
    The local model is saved and, if the tune set is non-empty, tested after
    every epoch.

    NOTE(review): ``use_pushover`` currently has no effect - both branches set
    ``pushover_logger`` to None.
    """

    logger.log("In Training...")
    launch_k_unity_builds([config["port"]], "./house_" + str(house_id) + "_elmer.x86_64",
                          arg_str="--config ./AssetsHouse/config" + str(house_id) + ".json",
                          cwd="./simulators/house/")
    logger.log("Launched Builds.")
    server.initialize_server()
    logger.log("Server Initialized.")

    # Test policy
    test_policy = gp.get_argmax_action

    if rank == 0:  # client 0 creates a tensorboard server
        tensorboard = Tensorboard(experiment_name)
        logger.log('Created Tensorboard Server.')
    else:
        tensorboard = None

    if use_pushover:
        pushover_logger = None
    else:
        pushover_logger = None

    # Create a local model for rollouts
    local_model = model_type(config, constants)
    # local_model.train()

    # Create the Agent
    tmp_agent = TmpHouseAgent(server=server,
                              model=local_model,
                              test_policy=test_policy,
                              action_space=action_space,
                              meta_data_util=meta_data_util,
                              config=config,
                              constants=constants)
    logger.log("Created Agent.")

    # Histogram of sampled actions, accumulated over all epochs.
    action_counts = [0] * action_space.num_actions()
    # Hard-coded epoch count; the configured value is not used.
    max_epochs = 100000  # constants["max_epochs"]
    dataset_size = len(train_dataset)
    tune_dataset_size = len(tune_dataset)

    if tune_dataset_size > 0:
        # Test on tuning data
        tmp_agent.test(tune_dataset, vocab, tensorboard=tensorboard,
                       logger=logger, pushover_logger=pushover_logger)

    # Create the learner to compute the loss
    learner = TmpAsynchronousContextualBandit(shared_model, local_model, action_space, meta_data_util,
                                              config, constants, tensorboard)
    # TODO change 2 --- unity launch moved up
    learner.logger = logger

    for epoch in range(1, max_epochs + 1):

        for data_point_ix, data_point in enumerate(train_dataset):

            # Sync with the shared model
            # local_model.load_state_dict(shared_model.state_dict())
            local_model.load_from_state_dict(shared_model.get_state_dict())

            if (data_point_ix + 1) % 100 == 0:
                logger.log("Done %d out of %d" %(data_point_ix, dataset_size))
                logger.log("Training data action counts %r" % action_counts)

            # Action budget for this episode.
            num_actions = 0
            max_num_actions = constants["horizon"]
            max_num_actions += constants["max_extra_horizon"]

            image, metadata = tmp_agent.server.reset_receive_feedback(data_point)
            instruction = data_point.get_instruction()
            # instruction_str = TmpAsynchronousContextualBandit.convert_indices_to_text(instruction, vocab)
            # print("Instruction str is ", instruction_str)

            # Pose and Orientation gone TODO change 3
            state = AgentObservedState(instruction=instruction,
                                       config=config,
                                       constants=constants,
                                       start_image=image,
                                       previous_action=None,
                                       data_point=data_point)
            state.goal = learner.get_goal(metadata)

            model_state = None
            batch_replay_items = []
            total_reward = 0
            # Stays True unless the policy itself samples the stop action.
            forced_stop = True

            while num_actions < max_num_actions:

                # logger.log("Training: Meta Data %r " % metadata)

                # Sample action using the policy
                log_probabilities, model_state, image_emb_seq, state_feature = \
                    local_model.get_probs(state, model_state)
                # First row of the exponentiated log-probabilities.
                probabilities = list(torch.exp(log_probabilities.data))[0]

                # Sample action from the probability
                action = gp.sample_action_from_prob(probabilities)
                action_counts[action] += 1

                if action == action_space.get_stop_action_index():
                    forced_stop = False
                    break

                # Send the action and get feedback
                image, reward, metadata = tmp_agent.server.send_action_receive_feedback(action)
                # logger.log("Action is %r, Reward is %r Probability is %r " % (action, reward, probabilities))

                # Store it in the replay memory list
                replay_item = ReplayMemoryItem(state, action, reward, log_prob=log_probabilities)
                batch_replay_items.append(replay_item)

                # Update the agent state
                # Pose and orientation gone, TODO change 4
                state = state.update(image, action, data_point=data_point)
                state.goal = learner.get_goal(metadata)

                num_actions += 1
                total_reward += reward

            # Send final STOP action and get feedback
            image, reward, metadata = tmp_agent.server.halt_and_receive_feedback()
            total_reward += reward

            # Store it in the replay memory list
            if not forced_stop:
                # logger.log("Action is Stop, Reward is %r Probability is %r " % (reward, probabilities))
                replay_item = ReplayMemoryItem(state, action_space.get_stop_action_index(),
                                               reward, log_prob=log_probabilities)
                batch_replay_items.append(replay_item)

            # Update the scores based on meta_data
            # self.meta_data_util.log_results(metadata)

            # Perform update; skipped when the episode produced no items.
            if len(batch_replay_items) > 0:  # 32
                loss_val = learner.do_update(batch_replay_items)

                if tensorboard is not None:
                    # cross_entropy = float(learner.cross_entropy.data[0])
                    # tensorboard.log(cross_entropy, loss_val, 0)
                    tensorboard.log_scalar("loss", loss_val)
                    # Entropy divided by the number of actions plus one.
                    entropy = float(learner.entropy.data[0])/float(num_actions + 1)
                    tensorboard.log_scalar("entropy", entropy)
                    ratio = float(learner.ratio.data[0])
                    tensorboard.log_scalar("Abs_objective_to_entropy_ratio", ratio)
                    tensorboard.log_scalar("total_reward", total_reward)
                    tensorboard.log_scalar("mean navigation error", metadata['mean-navigation-error'])

                    # Auxiliary losses are logged only when the learner set them.
                    if learner.action_prediction_loss is not None:
                        action_prediction_loss = float(learner.action_prediction_loss.data[0])
                        # NOTE(review): goes through learner.tensorboard while the
                        # sibling calls use the local tensorboard - presumably
                        # the same object; confirm.
                        learner.tensorboard.log_action_prediction_loss(action_prediction_loss)
                    if learner.temporal_autoencoder_loss is not None:
                        temporal_autoencoder_loss = float(learner.temporal_autoencoder_loss.data[0])
                        tensorboard.log_temporal_autoencoder_loss(temporal_autoencoder_loss)
                    if learner.object_detection_loss is not None:
                        object_detection_loss = float(learner.object_detection_loss.data[0])
                        tensorboard.log_object_detection_loss(object_detection_loss)
                    if learner.symbolic_language_prediction_loss is not None:
                        symbolic_language_prediction_loss = float(learner.symbolic_language_prediction_loss.data[0])
                        tensorboard.log_scalar("sym_language_prediction_loss", symbolic_language_prediction_loss)
                    if learner.goal_prediction_loss is not None:
                        goal_prediction_loss = float(learner.goal_prediction_loss.data[0])
                        tensorboard.log_scalar("goal_prediction_loss", goal_prediction_loss)

        # Save the model
        local_model.save_model(experiment + "/contextual_bandit_" + str(rank) + "_epoch_" + str(epoch))
        logger.log("Training data action counts %r" % action_counts)

        if tune_dataset_size > 0:
            # Test on tuning data
            tmp_agent.test(tune_dataset, vocab, tensorboard=tensorboard,
                           logger=logger, pushover_logger=pushover_logger)
def do_train_forced_reading(self, agent, train_dataset, tune_dataset,
                            experiment_name):
    """Train with forced reading and a supervised-to-bandit curriculum.

    For every epoch in ``1..self.max_epoch`` the agent is evaluated with
    ``agent.test_forced_reading`` and then each training data point is rolled
    out.  Reading always follows the oracle instruction segments.  For each
    data point ``self.rollin_policy.num_oracle_rollin_segments`` decides how
    many leading segments are executed with the oracle sub-trajectory (stored
    with reward 1); the remaining segments are executed by sampling from
    ``self.model.get_probs(state, mode)``.  A sampled segment ends when the
    stop action is sampled or the per-segment budget is exceeded; the rollout
    continues to the next segment only if ``metadata["goal_dist"] < 5.0`` at
    that point.

    ``self.do_update`` runs once per episode and the model is saved once per
    epoch.

    Args:
        agent: Must be a ``ReadPointerAgent``.
        train_dataset: Sized iterable of training data points.
        tune_dataset: Data passed to ``agent.test_forced_reading``.
        experiment_name: Prefix of the path the model is saved under.
    """

    assert isinstance(
        agent, ReadPointerAgent
    ), "This learning algorithm works only with READPointerAgent"

    dataset_size = len(train_dataset)

    for epoch in range(1, self.max_epoch + 1):

        logging.info("Starting epoch %d", epoch)

        # Per-epoch statistics reported in the progress log.
        total_cb_segments = 0
        num_reached_acceptable_circle = 0
        total_segments = 0
        total_supervised_segments = 0

        # Per-mode histogram of sampled actions (read mode has two actions).
        action_counts = dict()
        action_counts[ReadPointerAgent.READ_MODE] = [0] * 2
        action_counts[ReadPointerAgent.ACT_MODE] = \
            [0] * self.action_space.num_actions()

        # Test on tuning data
        agent.test_forced_reading(tune_dataset, tensorboard=self.tensorboard)

        batch_replay_items = []
        total_reward = 0
        episodes_in_batch = 0

        for data_point_ix, data_point in enumerate(train_dataset):

            if (data_point_ix + 1) % 100 == 0:
                logging.info("Done %d out of %d", data_point_ix, dataset_size)
                logging.info(
                    "Contextual bandit segments %r, success %r per.",
                    total_cb_segments,
                    (num_reached_acceptable_circle * 100) /
                    float(max(1, total_cb_segments)))
                # NOTE(review): total_supervised_segments is never incremented
                # in this function, so this percentage is always 0 - confirm.
                logging.info("Num segments %r, Percent supervised %r",
                             total_segments,
                             (total_supervised_segments * 100) /
                             float(max(1, total_segments)))
                logging.info("Training data action counts %r", action_counts)

            # Total budget: reference trajectory length plus some slack.
            num_actions = 0
            max_num_actions = len(data_point.get_trajectory())
            max_num_actions += self.constants["max_extra_horizon"]

            image, metadata = agent.server.reset_receive_feedback(data_point)
            oracle_segments = data_point.get_instruction_oracle_segmented()

            # The pose is y_angle divided by 15 and truncated to an int.
            pose = int(metadata["y_angle"] / 15.0)
            state = AgentObservedState(instruction=data_point.instruction,
                                       config=self.config,
                                       constants=self.constants,
                                       start_image=image,
                                       previous_action=None,
                                       pose=pose)

            # The total budget is split evenly across the oracle segments.
            per_segment_budget = int(max_num_actions / len(oracle_segments))
            num_segment_actions = 0
            trajectory_segments = data_point.get_sub_trajectory_list()

            mode = ReadPointerAgent.READ_MODE
            current_segment_ix = 0
            num_supervised_rollout = \
                self.rollin_policy.num_oracle_rollin_segments(
                    len(trajectory_segments))
            # NOTE(review): total_segments is also incremented once per READ
            # step below, so each segment may be counted twice - confirm.
            total_segments += len(trajectory_segments)

            while True:

                if mode == ReadPointerAgent.READ_MODE:
                    # Find the number of tokens to read for the gold segment
                    num_segment_size = len(
                        oracle_segments[current_segment_ix])
                    current_segment_ix += 1
                    for i in range(0, num_segment_size):
                        state = state.update_on_read()
                    mode = ReadPointerAgent.ACT_MODE
                    total_segments += 1

                elif mode == ReadPointerAgent.ACT_MODE:

                    if current_segment_ix <= num_supervised_rollout:
                        # Do supervised learning for this segment
                        for action in trajectory_segments[
                                current_segment_ix - 1]:
                            image, reward, metadata = \
                                agent.server.send_action_receive_feedback(
                                    action)

                            # Store it in the replay memory list. Use reward
                            # of 1 as it is supervised learning
                            all_rewards = self._get_all_rewards(metadata)
                            replay_item = ReplayMemoryItem(
                                state,
                                action,
                                reward=1,
                                mode=mode,
                                all_rewards=all_rewards)
                            batch_replay_items.append(replay_item)

                            # Update the agent state
                            pose = int(metadata["y_angle"] / 15.0)
                            state = state.update(image, action, pose=pose)

                            num_actions += 1
                            total_reward += reward

                        # Change the segment
                        assert metadata[
                            "goal_dist"] < 5.0, "oracle segments out of acceptable circle"
                        if state.are_tokens_left_to_be_read():
                            mode = ReadPointerAgent.READ_MODE
                            # Jump to the next goal
                            agent.server.force_goal_update()
                            state = state.update_on_act_halt()
                            num_segment_actions = 0
                        else:
                            forced_stop = True
                            break
                    else:
                        # Do contextual bandit for this segment and future

                        # Generate probabilities over actions
                        probabilities = list(
                            torch.exp(
                                self.model.get_probs(state, mode).data))

                        # Sample an action from the distribution
                        action = gp.sample_action_from_prob(probabilities)
                        action_counts[mode][action] += 1

                        # deal with act mode boundary conditions
                        if num_actions >= max_num_actions:
                            break

                        elif action == agent.action_space.get_stop_action_index(
                        ) or num_segment_actions > per_segment_budget:

                            within_acceptable_circle = metadata[
                                "goal_dist"] < 5.0
                            if within_acceptable_circle:
                                num_reached_acceptable_circle += 1
                            total_cb_segments += 1

                            if state.are_tokens_left_to_be_read():
                                if within_acceptable_circle:
                                    if metadata["error"] < 5.0:
                                        reward = 1.0
                                    else:
                                        reward = -1.0

                                    # Add to replay memory
                                    # NOTE(review): read straight from the
                                    # metadata here, unlike the
                                    # self._get_all_rewards(metadata) calls
                                    # elsewhere in this function - confirm.
                                    all_rewards = metadata["all_reward"]
                                    replay_item = ReplayMemoryItem(
                                        state,
                                        agent.action_space.
                                        get_stop_action_index(),
                                        reward,
                                        mode,
                                        all_rewards=all_rewards)
                                    batch_replay_items.append(replay_item)

                                    mode = ReadPointerAgent.READ_MODE
                                    # Jump to the next goal
                                    agent.server.force_goal_update()
                                    state = state.update_on_act_halt()
                                    num_segment_actions = 0
                                else:
                                    # No point going any further so break
                                    break
                            else:
                                break

                        else:
                            image, reward, metadata = \
                                agent.server.send_action_receive_feedback(
                                    action)

                            # Store it in the replay memory list
                            all_rewards = self._get_all_rewards(metadata)
                            replay_item = ReplayMemoryItem(
                                state,
                                action,
                                reward,
                                mode=mode,
                                all_rewards=all_rewards)
                            batch_replay_items.append(replay_item)

                            # Update the agent state
                            pose = int(metadata["y_angle"] / 15.0)
                            state = state.update(image, action, pose=pose)

                            num_actions += 1
                            num_segment_actions += 1
                            total_reward += reward

                else:
                    raise AssertionError(
                        "Mode should be either read or act. Unhandled mode: " + str(mode))

            assert mode == ReadPointerAgent.ACT_MODE, "Agent should end on Act Mode"

            # Send final STOP action and get feedback
            image, reward, metadata = agent.server.halt_and_receive_feedback()
            total_reward += reward

            # Perform update (batch size is one episode)
            episodes_in_batch += 1
            if episodes_in_batch == 1:
                loss_val = self.do_update(batch_replay_items)
                batch_replay_items = []
                # The tensorboard is optional (see the guard below), so the
                # per-update log must not assume it exists.
                if self.tensorboard is not None:
                    entropy_val = float(self.entropy.data[0])
                    self.tensorboard.log(entropy_val, loss_val, total_reward)
                total_reward = 0
                episodes_in_batch = 0

            if self.tensorboard is not None:
                self.tensorboard.log_all_train_errors(
                    metadata["edit_dist_error"],
                    metadata["closest_dist_error"],
                    metadata["stop_dist_error"])

        # Save the model
        self.model.save_model(
            experiment_name +
            "/read_pointer_forced_reading_curriculum_contextual_bandit_epoch_"
            + str(epoch))

        logging.info("Training data action counts %r", action_counts)
def do_train_(simulator_file, shared_model, config, action_space,
              meta_data_util, constants, train_dataset, tune_dataset,
              experiment, experiment_name, rank, server, logger, model_type,
              use_pushover=False):
    """Train a local rollout model with the asynchronous contextual-bandit learner.

    The simulator build is launched and the server initialized. Then, for every
    epoch, each training data point is rolled out: the local model is re-synced
    from ``shared_model``, actions are sampled from its probabilities until the
    stop action is drawn or the action budget is used up, and the collected
    replay items are passed to ``learner.do_update``. After each epoch the
    local model is saved and, if the tuning set is non-empty, the agent is
    tested on it.

    Args:
        simulator_file: build passed to ``launch_k_unity_builds``.
        shared_model: provides the state dict loaded into the local model.
        config: configuration; ``config["port"]`` is read here.
        action_space: provides ``num_actions()`` and ``get_stop_action_index()``.
        meta_data_util: applies server metadata to the state (and tensorboard).
        constants: ``max_epochs``, ``horizon`` and ``max_extra_horizon`` are read.
        train_dataset: sized iterable of training data points.
        tune_dataset: sized collection handed to ``agent.test``.
        experiment: prefix of the path the model is saved under.
        experiment_name: name given to the tensorboard / pushover loggers.
        rank: client index; only rank 0 creates a tensorboard, and the rank is
            part of the saved model's file name.
        server: simulator server, used through the agent.
        logger: object exposing ``log(message)``.
        model_type: callable building the local model from ``(config, constants)``.
        use_pushover: when true, a ``PushoverLogger`` is created and passed to
            ``agent.test``.
    """
    # Start the simulator build, then initialize the server.
    launch_k_unity_builds([config["port"]], simulator_file)
    server.initialize_server()

    # Argmax policy handed to the agent as its test policy.
    argmax_policy = gp.get_argmax_action

    # Only the rank-0 client creates a tensorboard.
    tensorboard = Tensorboard(experiment_name) if rank == 0 else None
    pushover_logger = PushoverLogger(experiment_name) if use_pushover else None

    # Local model used for rollouts.
    local_model = model_type(config, constants)

    logger.log("STARTING AGENT")
    agent = Agent(server=server,
                  model=local_model,
                  test_policy=argmax_policy,
                  action_space=action_space,
                  meta_data_util=meta_data_util,
                  config=config,
                  constants=constants)
    logger.log("Created Agent...")

    action_counts = [0] * action_space.num_actions()
    num_epochs = constants["max_epochs"]
    num_train = len(train_dataset)
    num_tune = len(tune_dataset)

    # Learner that computes the loss from the collected rollout.
    learner = AsynchronousContextualBandit(shared_model, local_model,
                                           action_space, meta_data_util,
                                           config, constants, tensorboard)

    for epoch in range(1, num_epochs + 1):
        for sample_ix, data_point in enumerate(train_dataset):
            # Pull the shared parameters into the local model.
            local_model.load_from_state_dict(shared_model.get_state_dict())

            if (sample_ix + 1) % 100 == 0:
                logger.log("Done %d out of %d" % (sample_ix, num_train))
                logger.log("Training data action counts %r" % action_counts)

            steps_taken = 0
            step_budget = constants["horizon"] + constants["max_extra_horizon"]

            image, metadata = agent.server.reset_receive_feedback(data_point)
            state = AgentObservedState(instruction=data_point.instruction,
                                       config=config,
                                       constants=constants,
                                       start_image=image,
                                       previous_action=None,
                                       data_point=data_point)
            meta_data_util.start_state_update_metadata(state, metadata)

            model_state = None
            rollout = []
            episode_reward = 0
            # Stays True unless the policy itself samples the stop action.
            forced_stop = True

            while steps_taken < step_budget:
                # Action distribution from the local model.
                log_probabilities, model_state, image_emb_seq, volatile = (
                    local_model.get_probs(state, model_state))
                probabilities = list(torch.exp(log_probabilities.data))[0]

                # Draw an action from that distribution.
                action = gp.sample_action_from_prob(probabilities)
                action_counts[action] += 1

                if action == action_space.get_stop_action_index():
                    forced_stop = False
                    break

                # Execute the action and read the feedback.
                image, reward, metadata = (
                    agent.server.send_action_receive_feedback(action))

                # Record the step for the update.
                rollout.append(ReplayMemoryItem(state, action, reward,
                                                log_prob=log_probabilities,
                                                volatile=volatile))

                # Advance the agent state.
                state = state.update(image, action, data_point=data_point)
                meta_data_util.state_update_metadata(state, metadata)

                steps_taken += 1
                episode_reward += reward

            # The episode always ends with a halt sent to the server.
            image, reward, metadata = agent.server.halt_and_receive_feedback()
            episode_reward += reward

            if tensorboard is not None:
                meta_data_util.state_update_metadata(tensorboard, metadata)

            # The stop step is recorded only when the policy chose it.
            if not forced_stop:
                rollout.append(ReplayMemoryItem(
                    state, action_space.get_stop_action_index(), reward,
                    log_prob=log_probabilities, volatile=volatile))

            # Update only when something was collected.
            if rollout:
                loss_val = learner.do_update(rollout)

                if tensorboard is not None:
                    entropy = (float(learner.entropy.data[0])
                               / float(steps_taken + 1))
                    tensorboard.log_scalar("loss", loss_val)
                    tensorboard.log_scalar("entropy", entropy)
                    tensorboard.log_scalar("total_reward", episode_reward)

        # End of epoch: save the local model.
        local_model.save_model(experiment + "/contextual_bandit_" + str(rank)
                               + "_epoch_" + str(epoch))
        logger.log("Training data action counts %r" % action_counts)

        if num_tune > 0:
            # Evaluate on the tuning data.
            agent.test(tune_dataset,
                       tensorboard=tensorboard,
                       logger=logger,
                       pushover_logger=pushover_logger)
def do_train_(shared_model, config, action_space, meta_data_util, constants,
              train_dataset, tune_dataset, experiment, experiment_name, rank,
              server, logger, model_type, use_pushover=False):
    """Train a local rollout model with the asynchronous actor / GAE-critic learner.

    The server is initialized, the agent and learner are built, and the
    simulator build is launched. For every epoch, each training data point is
    rolled out: the local model is re-synced from ``shared_model``, actions are
    sampled until the stop action is drawn or the action budget
    (``horizon + max_extra_horizon``) is used up, and the collected replay
    items are passed to ``learner.do_update``. After each epoch the model is
    saved, training statistics are logged and, if the tuning set is non-empty,
    the agent is tested on it.

    The per-epoch mean stop distance and completion accuracy are divided by
    ``max(1, len(train_dataset))`` so that an empty training set logs zeros
    instead of raising ``ZeroDivisionError``.

    Args:
        shared_model: provides the state dict loaded into the local model.
        config: configuration; ``port`` and ``do_goal_prediction`` are read.
        action_space: provides ``num_actions()`` and ``get_stop_action_index()``.
        meta_data_util: forwarded to the agent and the learner.
        constants: ``max_epochs``, ``horizon`` and ``max_extra_horizon`` are read.
        train_dataset: sized iterable of training data points.
        tune_dataset: sized collection handed to ``agent.test``.
        experiment: prefix of the path the model is saved under.
        experiment_name: name given to the tensorboard / pushover loggers.
        rank: client index; only rank 0 creates a tensorboard, and the rank is
            part of the saved model's file name.
        server: simulator server, used through the agent.
        logger: object exposing ``log(message)``.
        model_type: callable building the local model from ``(config, constants)``.
        use_pushover: when true, a ``PushoverLogger`` is created and passed to
            ``agent.test``.
    """
    server.initialize_server()

    # Test policy
    test_policy = gp.get_argmax_action

    if rank == 0:  # client 0 creates a tensorboard server
        tensorboard = Tensorboard(experiment_name)
    else:
        tensorboard = None

    if use_pushover:
        pushover_logger = PushoverLogger(experiment_name)
    else:
        pushover_logger = None

    # Create a local model for rollouts
    local_model = model_type(config, constants)

    # Create the Agent
    logger.log("STARTING AGENT")
    agent = Agent(server=server,
                  model=local_model,
                  test_policy=test_policy,
                  action_space=action_space,
                  meta_data_util=meta_data_util,
                  config=config,
                  constants=constants)
    logger.log("Created Agent...")

    action_counts = [0] * action_space.num_actions()
    max_epochs = constants["max_epochs"]
    dataset_size = len(train_dataset)
    tune_dataset_size = len(tune_dataset)
    # Denominator of the per-epoch averages; never zero, so an empty training
    # set reports 0.0 rather than crashing at the end of the epoch.
    stats_denominator = float(max(1, dataset_size))

    # Create the learner to compute the loss
    learner = AsynchronousAdvantageActorGAECritic(shared_model, local_model,
                                                  action_space, meta_data_util,
                                                  config, constants,
                                                  tensorboard)

    # Launch unity
    launch_k_unity_builds([config["port"]],
                          "./simulators/NavDroneLinuxBuild.x86_64")

    for epoch in range(1, max_epochs + 1):
        learner.epoch = epoch
        # Per-epoch accumulators; turned into averages after the data loop.
        task_completion_accuracy = 0
        mean_stop_dist_error = 0
        stop_dist_errors = []

        for data_point_ix, data_point in enumerate(train_dataset):
            # Sync with the shared model
            local_model.load_from_state_dict(shared_model.get_state_dict())

            if (data_point_ix + 1) % 100 == 0:
                logger.log("Done %d out of %d" % (data_point_ix, dataset_size))
                logger.log("Training data action counts %r" % action_counts)

            num_actions = 0
            max_num_actions = (constants["horizon"]
                               + constants["max_extra_horizon"])

            image, metadata = agent.server.reset_receive_feedback(data_point)

            # Pose is y_angle divided by 15 and truncated to an int.
            pose = int(metadata["y_angle"] / 15.0)
            position_orientation = (metadata["x_pos"],
                                    metadata["z_pos"],
                                    metadata["y_angle"])
            state = AgentObservedState(
                instruction=data_point.instruction,
                config=config,
                constants=constants,
                start_image=image,
                previous_action=None,
                pose=pose,
                position_orientation=position_orientation,
                data_point=data_point)
            state.goal = GoalPrediction.get_goal_location(
                metadata, data_point,
                learner.image_height, learner.image_width)

            model_state = None
            batch_replay_items = []
            total_reward = 0
            # Stays True unless the policy itself samples the stop action.
            forced_stop = True

            while num_actions < max_num_actions:
                # Sample action using the policy
                log_probabilities, model_state, image_emb_seq, volatile = \
                    local_model.get_probs(state, model_state)
                probabilities = list(torch.exp(log_probabilities.data))[0]

                # Sample action from the probability
                action = gp.sample_action_from_prob(probabilities)
                action_counts[action] += 1

                # Generate goal. Computed before the stop check, so it is
                # always bound when the stop step is recorded below.
                if config["do_goal_prediction"]:
                    goal = learner.goal_prediction_calculator.get_goal_location(
                        metadata, data_point,
                        learner.image_height, learner.image_width)
                else:
                    goal = None

                if action == action_space.get_stop_action_index():
                    forced_stop = False
                    break

                # Send the action and get feedback
                image, reward, metadata = \
                    agent.server.send_action_receive_feedback(action)

                # Store it in the replay memory list
                replay_item = ReplayMemoryItem(state, action, reward,
                                               log_prob=log_probabilities,
                                               volatile=volatile,
                                               goal=goal)
                batch_replay_items.append(replay_item)

                # Update the agent state
                pose = int(metadata["y_angle"] / 15.0)
                position_orientation = (metadata["x_pos"],
                                        metadata["z_pos"],
                                        metadata["y_angle"])
                state = state.update(
                    image, action,
                    pose=pose,
                    position_orientation=position_orientation,
                    data_point=data_point)
                state.goal = GoalPrediction.get_goal_location(
                    metadata, data_point,
                    learner.image_height, learner.image_width)

                num_actions += 1
                total_reward += reward

            # Send final STOP action and get feedback
            image, reward, metadata = agent.server.halt_and_receive_feedback()
            total_reward += reward

            # A stop closer than 5.0 counts as a completed task.
            if metadata["stop_dist_error"] < 5.0:
                task_completion_accuracy += 1
            mean_stop_dist_error += metadata["stop_dist_error"]
            stop_dist_errors.append(metadata["stop_dist_error"])

            if tensorboard is not None:
                tensorboard.log_all_train_errors(
                    metadata["edit_dist_error"],
                    metadata["closest_dist_error"],
                    metadata["stop_dist_error"])

            # Store the stop step only when the policy chose to stop.
            if not forced_stop:
                replay_item = ReplayMemoryItem(
                    state, action_space.get_stop_action_index(), reward,
                    log_prob=log_probabilities, volatile=volatile, goal=goal)
                batch_replay_items.append(replay_item)

            # Perform update
            if len(batch_replay_items) > 0:
                loss_val = learner.do_update(batch_replay_items)

                if tensorboard is not None:
                    cross_entropy = float(learner.cross_entropy.data[0])
                    tensorboard.log(cross_entropy, loss_val, 0)
                    entropy = float(
                        learner.entropy.data[0]) / float(num_actions + 1)
                    v_value_loss_per_step = float(
                        learner.value_loss.data[0]) / float(num_actions + 1)
                    tensorboard.log_scalar("entropy", entropy)
                    tensorboard.log_scalar("total_reward", total_reward)
                    tensorboard.log_scalar("v_value_loss_per_step",
                                           v_value_loss_per_step)
                    ratio = float(learner.ratio.data[0])
                    tensorboard.log_scalar(
                        "Abs_objective_to_entropy_ratio", ratio)

                    # Auxiliary losses are logged only when the learner set them.
                    if learner.action_prediction_loss is not None:
                        action_prediction_loss = float(
                            learner.action_prediction_loss.data[0])
                        learner.tensorboard.log_action_prediction_loss(
                            action_prediction_loss)
                    if learner.temporal_autoencoder_loss is not None:
                        temporal_autoencoder_loss = float(
                            learner.temporal_autoencoder_loss.data[0])
                        tensorboard.log_temporal_autoencoder_loss(
                            temporal_autoencoder_loss)
                    if learner.object_detection_loss is not None:
                        object_detection_loss = float(
                            learner.object_detection_loss.data[0])
                        tensorboard.log_object_detection_loss(
                            object_detection_loss)
                    if learner.symbolic_language_prediction_loss is not None:
                        symbolic_language_prediction_loss = float(
                            learner.symbolic_language_prediction_loss.data[0])
                        tensorboard.log_scalar(
                            "sym_language_prediction_loss",
                            symbolic_language_prediction_loss)
                    if learner.goal_prediction_loss is not None:
                        goal_prediction_loss = float(
                            learner.goal_prediction_loss.data[0])
                        tensorboard.log_scalar("goal_prediction_loss",
                                               goal_prediction_loss)

        # Save the model
        local_model.save_model(experiment + "/contextual_bandit_" + str(rank)
                               + "_epoch_" + str(epoch))
        logger.log("Training data action counts %r" % action_counts)

        # Turn the accumulators into a mean and a percentage.
        mean_stop_dist_error = mean_stop_dist_error / stats_denominator
        task_completion_accuracy = (
            task_completion_accuracy * 100.0) / stats_denominator
        logger.log("Training: Mean stop distance error %r"
                   % mean_stop_dist_error)
        logger.log("Training: Task completion accuracy %r "
                   % task_completion_accuracy)
        bins = range(0, 80, 3)  # range of distance
        histogram, _ = np.histogram(stop_dist_errors, bins)
        logger.log("Histogram of train errors %r " % histogram)

        if tune_dataset_size > 0:
            # Test on tuning data
            agent.test(tune_dataset,
                       tensorboard=tensorboard,
                       logger=logger,
                       pushover_logger=pushover_logger)
def do_train_(shared_model, config, action_space, meta_data_util, constants,
              train_dataset, tune_dataset, experiment, experiment_name, rank,
              server, logger, model_type, vocab, use_pushover=False):
    """Train a local rollout model on the blocks simulator with a contextual bandit.

    The blocks simulator build is launched and the server initialized. For
    every epoch, each training data point is rolled out: the local model is
    re-synced from ``shared_model``, actions are sampled until the stop action
    is drawn or ``horizon + 5`` actions were taken, and the collected replay
    items are passed to ``learner.do_update``. After each epoch the model is
    saved and, if the tuning set is non-empty, the agent is tested on it.

    Args:
        shared_model: provides the state dict loaded into the local model.
        config: configuration; ``config["port"]`` is read here.
        action_space: provides ``num_actions()`` and ``get_stop_action_index()``.
        meta_data_util: forwarded to the agent and the learner.
        constants: ``max_epochs`` and ``horizon`` are read.
        train_dataset: sized iterable of training data points.
        tune_dataset: sized collection handed to the agent's ``test``.
        experiment: prefix of the path the model is saved under.
        experiment_name: name given to the tensorboard / pushover loggers.
        rank: client index; only rank 0 creates a tensorboard, and the rank is
            part of the saved model's file name.
        server: simulator server, used through the agent.
        logger: object exposing ``log(message)``.
        model_type: callable building the local model from ``(config, constants)``.
        vocab: passed to ``convert_text_to_indices`` and to the agent's ``test``.
        use_pushover: when true, a ``PushoverLogger`` is created and passed to
            the agent's ``test``.
    """
    # Start the simulator build, then initialize the server.
    launch_k_unity_builds([config["port"]],
                          "./simulators/blocks/retro_linux_build.x86_64")
    server.initialize_server()

    # Argmax policy handed to the agent as its test policy.
    argmax_policy = gp.get_argmax_action

    # Only the rank-0 client creates a tensorboard.
    tensorboard = Tensorboard(experiment_name) if rank == 0 else None
    pushover_logger = PushoverLogger(experiment_name) if use_pushover else None

    # Local model used for rollouts.
    local_model = model_type(config, constants)

    logger.log("STARTING AGENT")
    tmp_agent = TmpBlockAgent(server=server,
                              model=local_model,
                              test_policy=argmax_policy,
                              action_space=action_space,
                              meta_data_util=meta_data_util,
                              config=config,
                              constants=constants)
    logger.log("Created Agent...")

    action_counts = [0] * action_space.num_actions()
    num_epochs = constants["max_epochs"]
    num_train = len(train_dataset)
    num_tune = len(tune_dataset)

    # Learner that computes the loss from the collected rollout.
    learner = TmpAsynchronousContextualBandit(shared_model, local_model,
                                              action_space, meta_data_util,
                                              config, constants, tensorboard)

    for epoch in range(1, num_epochs + 1):
        for sample_ix, data_point in enumerate(train_dataset):
            # Pull the shared parameters into the local model.
            local_model.load_from_state_dict(shared_model.get_state_dict())

            if (sample_ix + 1) % 100 == 0:
                logger.log("Done %d out of %d" % (sample_ix, num_train))
                logger.log("Training data action counts %r" % action_counts)

            steps_taken = 0
            step_budget = constants["horizon"] + 5

            image, metadata = tmp_agent.server.reset_receive_feedback(
                data_point)
            # The instruction comes from the server metadata, not the data point.
            instruction = TmpAsynchronousContextualBandit.convert_text_to_indices(
                metadata["instruction"], vocab)

            state = AgentObservedState(instruction=instruction,
                                       config=config,
                                       constants=constants,
                                       start_image=image,
                                       previous_action=None,
                                       data_point=data_point)

            model_state = None
            rollout = []
            episode_reward = 0
            # Stays True unless the policy itself samples the stop action.
            forced_stop = True

            while steps_taken < step_budget:
                # Action distribution from the local model; the last two
                # returned values are not used in this loop.
                log_probabilities, model_state, _, _ = (
                    local_model.get_probs(state, model_state))
                probabilities = list(torch.exp(log_probabilities.data))[0]

                # Draw an action from that distribution.
                action = gp.sample_action_from_prob(probabilities)
                action_counts[action] += 1

                if action == action_space.get_stop_action_index():
                    forced_stop = False
                    break

                # Execute the action and read the feedback.
                image, reward, metadata = (
                    tmp_agent.server.send_action_receive_feedback(action))

                # Record the step for the update.
                rollout.append(ReplayMemoryItem(state, action, reward,
                                                log_prob=log_probabilities))

                # Advance the agent state.
                state = state.update(image, action, data_point=data_point)

                steps_taken += 1
                episode_reward += reward

            # The episode always ends with a halt sent to the server.
            image, reward, metadata = (
                tmp_agent.server.halt_and_receive_feedback())
            episode_reward += reward

            # The stop step is recorded only when the policy chose it.
            if not forced_stop:
                rollout.append(ReplayMemoryItem(
                    state, action_space.get_stop_action_index(), reward,
                    log_prob=log_probabilities))

            # Update only when something was collected.
            if rollout:
                loss_val = learner.do_update(rollout)

                if tensorboard is not None:
                    tensorboard.log_scalar("loss", loss_val)
                    entropy = (float(learner.entropy.data[0])
                               / float(steps_taken + 1))
                    tensorboard.log_scalar("entropy", entropy)
                    ratio = float(learner.ratio.data[0])
                    tensorboard.log_scalar(
                        "Abs_objective_to_entropy_ratio", ratio)
                    tensorboard.log_scalar("total_reward", episode_reward)

                    # Auxiliary losses are logged only when the learner set them.
                    if learner.action_prediction_loss is not None:
                        learner.tensorboard.log_action_prediction_loss(
                            float(learner.action_prediction_loss.data[0]))
                    if learner.temporal_autoencoder_loss is not None:
                        tensorboard.log_temporal_autoencoder_loss(
                            float(learner.temporal_autoencoder_loss.data[0]))
                    if learner.object_detection_loss is not None:
                        tensorboard.log_object_detection_loss(
                            float(learner.object_detection_loss.data[0]))
                    if learner.symbolic_language_prediction_loss is not None:
                        tensorboard.log_scalar(
                            "sym_language_prediction_loss",
                            float(learner.symbolic_language_prediction_loss
                                  .data[0]))
                    if learner.goal_prediction_loss is not None:
                        tensorboard.log_scalar(
                            "goal_prediction_loss",
                            float(learner.goal_prediction_loss.data[0]))
                    if learner.mean_factor_entropy is not None:
                        tensorboard.log_factor_entropy_loss(
                            float(learner.mean_factor_entropy.data[0]))

        # End of epoch: save the local model.
        local_model.save_model(experiment + "/contextual_bandit_" + str(rank)
                               + "_epoch_" + str(epoch))
        logger.log("Training data action counts %r" % action_counts)

        if num_tune > 0:
            # Evaluate on the tuning data.
            print("Going for testing")
            tmp_agent.test(tune_dataset, vocab,
                           tensorboard=tensorboard,
                           logger=logger,
                           pushover_logger=pushover_logger)
            print("Done testing")
def _sample_goal(self, exploration_image, data_point, panaroma=True):
    """Sample a goal pixel from the predictor's attention and map it to a position.

    An observed state is built from ``exploration_image``, the predictor model
    returns attention probabilities, and one index is sampled from all of them
    except the last entry. The index is decoded into a (row, col) screen
    position on a grid 192 columns wide and converted into an (x, z) position
    with ``get_inverse_object_position``.

    :param exploration_image: start image given to the observed state.
    :param data_point: provides the instruction, start position and
        destination list.
    :param panaroma: when true, the column is split into a region index
        (column / 32) and a column inside that region (column % 32), and the
        viewing angle is derived from the region index; otherwise the start
        position's own angle is used.
    :return: tuple ``(predicted_goal_pos, dist, screen_pos, sampled_prob)``
        where ``dist`` is the Euclidean distance between the predicted goal
        and the last entry of ``data_point.get_destination_list()``.
    """
    grid_width = 192    # number of columns of the attention grid
    region_width = 32   # number of columns per region in the panorama case

    state = AgentObservedState(
        instruction=data_point.instruction,
        config=self.config,
        constants=self.constants,
        start_image=exploration_image,
        previous_action=None,
        pose=None,
        position_orientation=data_point.get_start_pos(),
        data_point=data_point)

    volatile = self.local_predictor_model.get_attention_prob(
        state, model_state=None)
    all_probs = volatile["attention_probs"]

    # The final entry is excluded from sampling: the row/column decoding
    # below is not valid for it.
    candidate_probs = list(all_probs.view(-1)[:-1].data.cpu().numpy())
    sampled_ix = gp.sample_action_from_prob(candidate_probs)
    sampled_prob = all_probs[sampled_ix]

    # Decode the flat index into a row and a column of the grid.
    predicted_row = int(sampled_ix / float(grid_width))
    predicted_col = sampled_ix % grid_width
    screen_pos = (predicted_row, predicted_col)

    pos = data_point.get_start_pos()
    if not panaroma:
        y_angle = pos[2]
    else:
        # Region that contains the sampled column, and the column inside it.
        region_index = int(predicted_col / region_width)
        predicted_col = predicted_col % region_width
        y_angle = GoalPredictionSingle360ImageSupervisedLearningFromDisk.\
            get_new_pos_angle_from_region_index(region_index, pos)
    metadata = {"x_pos": pos[0], "z_pos": pos[1], "y_angle": y_angle}

    # Use the centre of the sampled cell.
    row = predicted_row + 0.5
    col = predicted_col + 0.5

    start_pos = current_pos_from_metadata(metadata)
    start_pose = current_pose_from_metadata(metadata)
    goal_pos = data_point.get_destination_list()[-1]

    height_drone = 2.5
    x_gen, z_gen = get_inverse_object_position(
        row, col, height_drone, 30, 32, 32,
        (start_pos[0], start_pos[1], start_pose))
    predicted_goal_pos = (x_gen, z_gen)

    # Euclidean distance between the predicted and the true goal.
    x_goal, z_goal = goal_pos
    dx = x_gen - x_goal
    dz = z_gen - z_goal
    dist = math.sqrt(dx * dx + dz * dz)

    return predicted_goal_pos, dist, screen_pos, sampled_prob
def do_train(self, agent, experiment_name, num_test=30,
             episodes_per_epoch=500):
    """ Perform training

    Every epoch first switches the environment to the 'test' instruction set,
    tests the agent, and switches back to 'train'. It then runs
    ``episodes_per_epoch`` rollouts; each rollout samples actions from the
    model until the server reports ``done`` and is followed by one call to
    ``self.do_update``. The model is saved at the end of each epoch.

    :param agent: agent whose ``server`` is used to reset the environment,
        send actions and switch instruction sets, and whose ``test`` is called
        once per epoch.
    :param experiment_name: prefix of the path the model is saved under.
    :param num_test: first argument forwarded to ``agent.test`` (presumably
        the number of test episodes; confirm against ``agent.test``).
        Defaults to the previously hard-coded 30.
    :param episodes_per_epoch: number of training rollouts per epoch.
        Defaults to the previously hard-coded 500.
    """
    print("in training")

    for epoch in range(1, self.max_epoch + 1):
        logging.info("Starting epoch %r", epoch)

        # Test on tuning data: switch the instruction set to 'test' for the
        # evaluation and back to 'train' afterwards.
        agent.server.env.switch_instructions_set('test')
        agent.test(num_test, tensorboard=self.tensorboard)
        agent.server.env.switch_instructions_set('train')

        for _ in range(episodes_per_epoch):
            batch_replay_items = []
            num_actions = 0
            total_reward = 0

            instruction, image, metadata = \
                agent.server.reset_receive_feedback()
            state = AgentObservedState(instruction=instruction,
                                       config=self.config,
                                       constants=self.constants,
                                       start_image=image,
                                       previous_action=None)
            model_state = None

            # The rollout ends only when the server reports done.
            while True:
                # Generate probabilities over actions
                log_probabilities, model_state, _, _ = self.model.get_probs(
                    state, model_state)
                probabilities = list(torch.exp(log_probabilities.data))

                # Sample the action from the probabilities
                action = gp.sample_action_from_prob(probabilities[0])

                # Send the action and get feedback
                image, reward, done, metadata = \
                    agent.server.send_action_receive_feedback(
                        action, num_actions)
                total_reward += reward

                # Store it in the replay memory list
                replay_item = ReplayMemoryItem(state, action, reward,
                                               log_prob=log_probabilities)
                batch_replay_items.append(replay_item)

                # Update the agent state
                state = state.update(image, action)
                num_actions += 1

                if done:
                    break

            # Perform update
            loss_val = self.do_update(batch_replay_items)
            entropy_val = float(self.entropy.data[0])
            self.tensorboard.log(entropy_val, loss_val, total_reward)

        # Save the model
        self.model.save_model(experiment_name + "/contextual_bandit_epoch_"
                              + str(epoch))
def _gather_sample(env, actions, step, homing_policies, selection_weights=None):
    """ Gather one real-or-imposter transition sample (ALL_RANDOM style).

    The environment is reset and, when ``step > 1``, a homing policy for the
    previous time step is chosen and followed for ``step - 1`` actions. A
    deviation action is then drawn uniformly from ``actions``. With equal
    chance the sample is either real (``y == 1``: the deviation action is
    executed and its observation and reward are recorded) or an imposter
    (``y == 0``: the next observation comes from a separate call to
    ``_gather_last_observation`` and the reward is ``None``).

    :param env: environment exposing ``reset()`` and ``step(action)``.
    :param actions: sequence of actions to draw the deviation action from.
    :param step: time step of the transition to sample.
    :param homing_policies: indexable by time step; each entry is a sequence
        of policies, each indexable by step and exposing ``sample_action``.
    :param selection_weights: optional weights used to pick the homing
        policy; when ``None`` the policy is picked uniformly at random.
    :return: a ``TransitionDatapoint`` describing the sampled transition.
    """
    start_obs, meta = env.reset()

    if step <= 1:
        # No roll-in needed: the transition starts at the reset observation.
        ix = None
        current_obs = start_obs
    else:
        if selection_weights is not None:
            # Pick the homing policy according to the given weights.
            ix = gp.sample_action_from_prob(selection_weights)
        else:
            # Pick the homing policy uniformly at random.
            ix = random.randint(0, len(homing_policies[step - 1]) - 1)
        policy = homing_policies[step - 1][ix]

        # Follow the homing policy up to the requested time step.
        obs = start_obs
        for t in range(1, step):
            obs_var = cuda_var(torch.from_numpy(obs)).float().view(1, -1)
            action = policy[t].sample_action(obs_var)
            obs, reward, done, meta = env.step(action)
        current_obs = obs

    has_state = meta is not None and "state" in meta
    curr_state = meta["state"] if has_state else None

    deviation_action = random.choice(actions)
    action_prob = 1.0 / float(max(1, len(actions)))

    label = random.randint(0, 1)
    if label == 1:
        # Real transition: take the action.
        next_obs, reward, done, meta = env.step(deviation_action)
        new_meta = meta
    elif label == 0:
        # Imposter: the next observation comes from an independent rollout.
        next_obs, new_meta = EncoderSamplerAllRandom._gather_last_observation(
            env, actions, step, homing_policies, selection_weights)
        reward = None  # Reward for imposter transition makes little sense
    else:
        raise AssertionError("y can only be either 0 or 1")

    has_next_state = new_meta is not None and "state" in new_meta
    next_state = new_meta["state"] if has_next_state else None

    return TransitionDatapoint(curr_obs=current_obs,
                               action=deviation_action,
                               next_obs=next_obs,
                               y=label,
                               curr_state=curr_state,
                               next_state=next_state,
                               action_prob=action_prob,
                               policy_index=ix,
                               step=step,
                               reward=reward)
def do_train(self, agent, train_dataset, tune_dataset, experiment_name):
    """ Perform training

    Every epoch first tests the agent on ``tune_dataset`` and then rolls out
    each training data point: actions are sampled from the model until the
    stop action is drawn or the action budget (trajectory length plus
    ``max_extra_horizon``) is used up. Q-values are then set on the collected
    replay items, ``self.do_update`` is called, and the results are logged to
    the tensorboard. The model is saved at the end of each epoch.

    :param agent: agent providing ``test``, ``server`` and ``action_space``.
    :param train_dataset: iterable of training data points.
    :param tune_dataset: dataset handed to ``agent.test``.
    :param experiment_name: prefix of the path the model is saved under.
    """
    for epoch in range(1, self.max_epoch + 1):
        # Test on tuning data
        agent.test(tune_dataset, tensorboard=self.tensorboard)

        for data_point in train_dataset:
            batch_replay_items = []
            num_actions = 0
            total_reward = 0
            max_num_actions = len(data_point.get_trajectory())
            max_num_actions += self.constants["max_extra_horizon"]

            image, metadata = agent.server.reset_receive_feedback(data_point)
            state = AgentObservedState(instruction=data_point.instruction,
                                       config=self.config,
                                       constants=self.constants,
                                       start_image=image,
                                       previous_action=None)

            # Stays True unless the policy itself samples the stop action.
            forced_stop = True

            instruction = instruction_to_string(
                data_point.get_instruction(), self.config)
            # print() with one argument prints the same text on Python 2 and
            # Python 3; the statement form is a SyntaxError on Python 3.
            print("TRAIN INSTRUCTION: %r" % instruction)
            print("")

            while num_actions < max_num_actions:
                # Generate probabilities over actions
                probabilities = list(
                    torch.exp(self.model.get_probs(state).data))

                # Sample the action from the probabilities
                action = gp.sample_action_from_prob(probabilities)

                if action == agent.action_space.get_stop_action_index():
                    forced_stop = False
                    break

                # Send the action and get feedback
                image, reward, metadata = \
                    agent.server.send_action_receive_feedback(action)
                total_reward += reward

                # Store it in the replay memory list
                replay_item = ReplayMemoryItem(state, action, reward)
                batch_replay_items.append(replay_item)

                # Update the agent state
                state = state.update(image, action)
                num_actions += 1

            # Send final STOP action and get feedback
            image, reward, metadata = agent.server.halt_and_receive_feedback()
            total_reward += reward

            # Store the stop step only when the policy chose to stop.
            if not forced_stop:
                replay_item = ReplayMemoryItem(
                    state, agent.action_space.get_stop_action_index(), reward)
                batch_replay_items.append(replay_item)

            # Compute Q-values using sampled rollout
            ReinforceLearning._set_q_val(batch_replay_items)

            # Perform update
            # NOTE(review): the update also runs when no replay item was
            # collected; confirm do_update accepts an empty list.
            loss_val = self.do_update(batch_replay_items)
            entropy_val = float(self.entropy.data[0])
            self.tensorboard.log(entropy_val, loss_val, total_reward)
            self.tensorboard.log_train_error(metadata["error"])

        # Save the model
        self.model.save_model(experiment_name + "/reinforce_epoch_"
                              + str(epoch))