def __init__(self, config, constants):
    """Build the image, text, previous-action and previous-block encoders
    and wire them into an ``IncrementalMultimodalEmnlp`` final module.

    Args:
        config: mapping read for the embedding sizes, image size,
            vocabulary size and action/block counts used below.
        constants: mapping read for ``"horizon"``.

    All five modules are moved to the GPU when CUDA is available.
    """
    AbstractIncrementalModel.__init__(self, config, constants)
    self.none_action = config["num_actions"]
    self.config = config
    self.constants = constants

    # CNN over images (SimpleImage-style, used for testing for now).
    # 3 channels per image, 5 images in the history.
    channels_per_image, images_in_history = 3, 5
    self.image_module = ImageCnnEmnlp(
        image_emb_size=config["image_emb_dim"],
        input_num_channels=channels_per_image * images_in_history,
        image_height=config["image_height"],
        image_width=config["image_width"],
    )

    # Somewhat counter-intuitive naming: emb_dim is the word embedding
    # size, hidden_dim is the output size.
    self.text_module = TextSimpleModule(
        emb_dim=config["word_emb_dim"],
        hidden_dim=config["lstm_emb_dim"],
        vocab_size=config["vocab_size"],
    )

    # The same embedding class is used for the previous action and the
    # previous block.
    self.previous_action_module = ActionSimpleModule(
        num_actions=config["no_actions"],
        action_emb_size=config["previous_action_embedding_dim"],
    )
    self.previous_block_module = ActionSimpleModule(
        num_actions=config["no_blocks"],
        action_emb_size=config["previous_block_embedding_dim"],
    )

    # Input size of the final module: sum of the four embedding sizes.
    fused_keys = (
        "lstm_emb_dim",
        "image_emb_dim",
        "previous_action_embedding_dim",
        "previous_block_embedding_dim",
    )
    fused_input_size = sum(config[key] for key in fused_keys)

    self.final_module = IncrementalMultimodalEmnlp(
        image_module=self.image_module,
        text_module=self.text_module,
        previous_action_module=self.previous_action_module,
        previous_block_module=self.previous_block_module,
        input_embedding_size=fused_input_size,
        output_hidden_size=config["h1_hidden_dim"],
        blocks_hidden_size=config["no_blocks"],
        directions_hidden_size=config["no_actions"],
        max_episode_length=(constants["horizon"] + 5),
    )

    if torch.cuda.is_available():
        gpu_modules = (
            self.image_module,
            self.text_module,
            self.previous_action_module,
            self.previous_block_module,
            self.final_module,
        )
        for module in gpu_modules:
            module.cuda()
def __init__(self, config, constants):
    """Build the image, image-recurrence and action encoders, the optional
    auxiliary heads, and the goal-image final module.

    Args:
        config: mapping read for ``"num_actions"``, the image size and the
            ``"do_action_prediction"``, ``"do_temporal_autoencoding"`` and
            ``"do_object_detection"`` flags.
        constants: mapping read for ``"image_emb_dim"`` and
            ``"action_emb_dim"``.

    Each auxiliary head attribute is ``None`` when its flag is falsy.
    ``self.landmark_names`` is only set when object detection is enabled.
    All created modules are moved to the GPU when CUDA is available.
    """
    AbstractIncrementalModel.__init__(self, config, constants)
    num_actions = config["num_actions"]
    self.none_action = num_actions

    image_dim = constants["image_emb_dim"]
    self.image_module = ImageResnetModule(
        image_emb_size=image_dim,
        input_num_channels=3,
        image_height=config["image_height"],
        image_width=config["image_width"],
        using_recurrence=True,
    )
    self.image_recurrence_module = IncrementalRecurrenceSimpleModule(
        input_emb_dim=image_dim,
        output_emb_dim=image_dim,
    )

    action_dim = constants["action_emb_dim"]
    self.action_module = ActionSimpleModule(
        num_actions=num_actions,
        action_emb_size=action_dim,
    )

    total_emb_size = 2 * image_dim + action_dim

    # Optional auxiliary heads, each controlled by its own config flag.
    self.action_prediction_module = (
        ActionPredictionModule(2 * image_dim, image_dim, num_actions)
        if config["do_action_prediction"]
        else None
    )
    self.temporal_autoencoder_module = (
        TemporalAutoencoderModule(
            self.action_module, image_dim, action_dim, image_dim)
        if config["do_temporal_autoencoding"]
        else None
    )
    if not config["do_object_detection"]:
        self.object_detection_module = None
    else:
        self.landmark_names = get_all_landmark_names()
        self.object_detection_module = ObjectDetectionModule(
            image_module=self.image_module,
            image_emb_size=image_dim,
            num_objects=63,
        )

    self.final_module = IncrementalMultimodalRecurrentSimpleGoalImageModule(
        image_module=self.image_module,
        image_recurrence_module=self.image_recurrence_module,
        action_module=self.action_module,
        total_emb_size=total_emb_size,
        num_actions=num_actions,
    )

    if torch.cuda.is_available():
        always_present = (
            self.image_module,
            self.image_recurrence_module,
            self.action_module,
            self.final_module,
        )
        for module in always_present:
            module.cuda()
        maybe_present = (
            self.action_prediction_module,
            self.temporal_autoencoder_module,
            self.object_detection_module,
        )
        for module in maybe_present:
            if module is not None:
                module.cuda()
def __init__(self, config, constants):
    """Build the image, image-recurrence, factorized-text and action
    encoders and combine them in an
    ``IncrementalMultimodalRecurrentSimpleModule``.

    Args:
        config: mapping read for ``"num_actions"``, the image size,
            ``"vocab_size"`` and ``"use_pointer_model"``.
        constants: mapping read for ``"image_emb_dim"``,
            ``"word_emb_dim"``, ``"lstm_emb_dim"`` and ``"action_emb_dim"``.

    Raises:
        AssertionError: if ``config["use_pointer_model"]`` is truthy; the
            pointer text module is not implemented for this model.

    All modules are moved to the GPU when CUDA is available.
    """
    AbstractIncrementalModel.__init__(self, config, constants)
    self.none_action = config["num_actions"]

    self.image_module = ImageResnetModule(
        image_emb_size=constants["image_emb_dim"],
        input_num_channels=3,
        image_height=config["image_height"],
        image_width=config["image_width"],
        using_recurrence=True)
    self.image_recurrence_module = IncrementalRecurrenceSimpleModule(
        input_emb_dim=constants["image_emb_dim"],
        output_emb_dim=constants["image_emb_dim"])

    if config["use_pointer_model"]:
        # Raised at the same point as before, so everything after this
        # line only ever runs for the non-pointer configuration.
        raise AssertionError("Not implemented")

    # Named once so the text module and total_emb_size cannot drift apart.
    num_factors = 2
    factors_embedding_size = 250
    self.text_module = TextImplicitFactorizationModule(
        emb_dim=constants["word_emb_dim"],
        hidden_dim=constants["lstm_emb_dim"],
        vocab_size=config["vocab_size"],
        num_factors=num_factors,
        factors_vocabulary_size=60,
        factors_embedding_size=factors_embedding_size)

    self.action_module = ActionSimpleModule(
        num_actions=config["num_actions"],
        action_emb_size=constants["action_emb_dim"])

    # The original hard-coded 2 * 250 here; presumably that is
    # num_factors * factors_embedding_size -- TODO confirm against
    # TextImplicitFactorizationModule. The value is unchanged.
    total_emb_size = (constants["image_emb_dim"]
                      + num_factors * factors_embedding_size
                      + constants["action_emb_dim"])

    final_module = IncrementalMultimodalRecurrentSimpleModule(
        image_module=self.image_module,
        image_recurrence_module=self.image_recurrence_module,
        text_module=self.text_module,
        action_module=self.action_module,
        total_emb_size=total_emb_size,
        num_actions=config["num_actions"])
    self.final_module = final_module

    if torch.cuda.is_available():
        self.image_module.cuda()
        self.image_recurrence_module.cuda()
        self.text_module.cuda()
        self.action_module.cuda()
        self.final_module.cuda()
def __init__(self, config, constants):
    """Build the text-conditioned image encoder, image recurrence, text and
    action encoders and combine them in a
    ``MultimodalTextKernelRecurrentSimpleModule``.

    Args:
        config: mapping read for ``"num_actions"``, the image size,
            ``"vocab_size"`` and ``"use_pointer_model"``.
        constants: mapping read for ``"image_emb_dim"``,
            ``"word_emb_dim"``, ``"lstm_emb_dim"`` and ``"action_emb_dim"``.

    ``config["use_pointer_model"]`` selects ``TextPointerModule`` over
    ``TextSimpleModule`` and changes the text share of the total embedding
    size from ``lstm_emb_dim`` to ``4 * lstm_emb_dim``. All modules are
    moved to the GPU when CUDA is available.
    """
    AbstractModel.__init__(self, config, constants)
    num_actions = config["num_actions"]
    self.none_action = num_actions

    image_dim = constants["image_emb_dim"]
    self.image_module = ImageTextKernelResnetModule(
        image_emb_size=image_dim,
        input_num_channels=3,
        image_height=config["image_height"],
        image_width=config["image_width"],
        text_emb_size=constants["lstm_emb_dim"],
        using_recurrence=True,
    )
    self.image_recurrence_module = RecurrenceSimpleModule(
        input_emb_dim=image_dim,
        output_emb_dim=image_dim,
    )

    use_pointer = config["use_pointer_model"]
    lstm_dim = constants["lstm_emb_dim"]
    text_kwargs = dict(
        emb_dim=constants["word_emb_dim"],
        hidden_dim=lstm_dim,
        vocab_size=config["vocab_size"],
    )
    if use_pointer:
        self.text_module = TextPointerModule(**text_kwargs)
    else:
        self.text_module = TextSimpleModule(**text_kwargs)

    action_dim = constants["action_emb_dim"]
    self.action_module = ActionSimpleModule(
        num_actions=num_actions,
        action_emb_size=action_dim,
    )

    # The pointer model contributes four LSTM-sized chunks, the simple
    # text model one.
    text_chunks = 4 if use_pointer else 1
    total_emb_size = image_dim + text_chunks * lstm_dim + action_dim

    self.final_module = MultimodalTextKernelRecurrentSimpleModule(
        image_module=self.image_module,
        image_recurrence_module=self.image_recurrence_module,
        text_module=self.text_module,
        action_module=self.action_module,
        total_emb_size=total_emb_size,
        num_actions=num_actions,
    )

    if torch.cuda.is_available():
        for module in (self.image_module,
                       self.image_recurrence_module,
                       self.text_module,
                       self.action_module,
                       self.final_module):
            module.cuda()
def __init__(self, config, constants):
    """Build the ResNet image encoder, image recurrence, symbolic
    instruction encoder and action encoder and combine them in a
    ``MultimodalRecurrentSimpleModule``.

    Args:
        config: mapping read for ``"num_actions"`` and the image size.
        constants: mapping read for ``"image_emb_dim"`` and
            ``"action_emb_dim"``.

    All modules, including the image recurrence module, are moved to the
    GPU when CUDA is available.
    """
    AbstractModel.__init__(self, config, constants)
    self.none_action = config["num_actions"]

    # Embeddings handed to the symbolic instruction module below.
    self.radius_module = RadiusModule(15)
    self.angle_module = AngleModule(48)
    self.landmark_module = LandmarkModule(63)

    self.image_module = ImageResnetModule(
        image_emb_size=constants["image_emb_dim"],
        input_num_channels=3,
        image_height=config["image_height"],
        image_width=config["image_width"],
        using_recurrence=True)
    self.image_recurrence_module = RecurrenceSimpleModule(
        input_emb_dim=constants["image_emb_dim"],
        output_emb_dim=constants["image_emb_dim"])
    self.text_module = SymbolicInstructionModule(
        radius_embedding=self.radius_module,
        theta_embedding=self.angle_module,
        landmark_embedding=self.landmark_module)
    self.action_module = ActionSimpleModule(
        num_actions=config["num_actions"],
        action_emb_size=constants["action_emb_dim"])

    # NOTE(review): 32 * 4 is presumably the size of the symbolic
    # instruction embedding -- confirm against SymbolicInstructionModule.
    total_emb_size = (constants["image_emb_dim"]
                      + 32 * 4
                      + constants["action_emb_dim"])

    final_module = MultimodalRecurrentSimpleModule(
        image_module=self.image_module,
        image_recurrence_module=self.image_recurrence_module,
        text_module=self.text_module,
        action_module=self.action_module,
        total_emb_size=total_emb_size,
        num_actions=config["num_actions"])
    self.final_module = final_module

    if torch.cuda.is_available():
        self.image_module.cuda()
        # Moved explicitly like every other sub-module, so its device
        # does not depend on how the final module registers it.
        self.image_recurrence_module.cuda()
        self.text_module.cuda()
        self.action_module.cuda()
        self.final_module.cuda()
        self.radius_module.cuda()
        self.angle_module.cuda()
        self.landmark_module.cuda()
def __init__(self, config, constants):
    """Build the symbolic image encoder, the text encoder and the action
    encoder and combine them in a ``MultimodalSimpleModule``.

    Args:
        config: mapping read for ``"num_actions"``, ``"vocab_size"`` and
            ``"use_pointer_model"``.
        constants: mapping read for ``"word_emb_dim"``,
            ``"lstm_emb_dim"`` and ``"action_emb_dim"``.

    All modules are moved to the GPU when CUDA is available.
    """
    AbstractModel.__init__(self, config, constants)
    num_actions = config["num_actions"]
    self.none_action = num_actions

    all_landmarks = get_all_landmark_names()

    # Embedding modules consumed by the symbolic image module.
    self.radius_module = RadiusModule(15)
    self.angle_module = AngleModule(48)
    self.landmark_module = LandmarkModule(63)
    self.image_module = SymbolicImageModule(
        landmark_names=all_landmarks,
        radius_module=self.radius_module,
        angle_module=self.angle_module,
        landmark_module=self.landmark_module,
    )

    use_pointer = config["use_pointer_model"]
    lstm_dim = constants["lstm_emb_dim"]
    text_kwargs = dict(
        emb_dim=constants["word_emb_dim"],
        hidden_dim=lstm_dim,
        vocab_size=config["vocab_size"],
    )
    if use_pointer:
        self.text_module = TextPointerModule(**text_kwargs)
    else:
        self.text_module = TextSimpleModule(**text_kwargs)

    action_dim = constants["action_emb_dim"]
    self.action_module = ActionSimpleModule(
        num_actions=num_actions,
        action_emb_size=action_dim,
    )

    # NOTE(review): the total size is the same for both text modules --
    # confirm TextPointerModule emits lstm_emb_dim features here.
    symbolic_image_size = 32 * 3 * 63
    total_emb_size = symbolic_image_size + lstm_dim + action_dim

    self.final_module = MultimodalSimpleModule(
        image_module=self.image_module,
        text_module=self.text_module,
        action_module=self.action_module,
        total_emb_size=total_emb_size,
        num_actions=num_actions,
    )

    if torch.cuda.is_available():
        for module in (self.image_module,
                       self.text_module,
                       self.action_module,
                       self.final_module,
                       self.radius_module,
                       self.angle_module,
                       self.landmark_module):
            module.cuda()
def __init__(self, config, constants):
    """Build the image, text and action encoders and wire them into an
    ``IncrementalMultimodalEmnlp`` final module.

    Args:
        config: mapping read for ``"num_actions"``, the image size,
            ``"vocab_size"`` and the three hidden-layer sizes.
        constants: mapping read for ``"image_emb_dim"``,
            ``"word_emb_dim"``, ``"lstm_emb_dim"``, ``"action_emb_dim"``
            and ``"horizon"``.

    All four modules are moved to the GPU when CUDA is available.
    """
    AbstractIncrementalModel.__init__(self, config, constants)
    self.none_action = config["num_actions"]
    self.config = config
    self.constants = constants

    # Image CNN (essentially SimpleImage for now).
    # 3 channels per image, 5 images in the history.
    channels_per_image, images_in_history = 3, 5
    image_dim = constants["image_emb_dim"]
    self.image_module = ImageCnnEmnlp(
        image_emb_size=image_dim,
        input_num_channels=channels_per_image * images_in_history,
        image_height=config["image_height"],
        image_width=config["image_width"],
    )

    # LSTM that embeds the instruction text.
    lstm_dim = constants["lstm_emb_dim"]
    self.text_module = TextSimpleModule(
        emb_dim=constants["word_emb_dim"],
        hidden_dim=lstm_dim,
        vocab_size=config["vocab_size"],
    )

    # Embeds the previous action + block.
    action_dim = constants["action_emb_dim"]
    self.action_module = ActionSimpleModule(
        num_actions=config["num_actions"],
        action_emb_size=action_dim,
    )

    # Combine the three encoders.
    self.final_module = IncrementalMultimodalEmnlp(
        image_module=self.image_module,
        text_module=self.text_module,
        action_module=self.action_module,
        input_embedding_size=lstm_dim + image_dim + action_dim,
        output_hidden_size=config["h1_hidden_dim"],
        blocks_hidden_size=config["blocks_hidden_dim"],
        directions_hidden_size=config["action_hidden_dim"],
        max_episode_length=(constants["horizon"] + 5),
    )

    if torch.cuda.is_available():
        for module in (self.image_module,
                       self.text_module,
                       self.action_module,
                       self.final_module):
            module.cuda()
def __init__(self, config, constants):
    """Build the symbolic image encoder, symbolic instruction encoder and
    action encoder and combine them in a ``MultimodalSimpleModule``.

    Args:
        config: mapping read for ``"num_actions"``.
        constants: mapping read for ``"action_emb_dim"``.

    The radius, angle and landmark embedding modules are shared between
    the image and text modules. All modules are moved to the GPU when CUDA
    is available.
    """
    AbstractModel.__init__(self, config, constants)
    num_actions = config["num_actions"]
    self.none_action = num_actions

    all_landmarks = get_all_landmark_names()

    # One set of symbolic embeddings, passed to both encoders below.
    self.radius_module = RadiusModule(15)
    self.angle_module = AngleModule(48)
    self.landmark_module = LandmarkModule(63)

    self.image_module = SymbolicImageModule(
        landmark_names=all_landmarks,
        radius_module=self.radius_module,
        angle_module=self.angle_module,
        landmark_module=self.landmark_module,
    )
    self.text_module = SymbolicInstructionModule(
        radius_embedding=self.radius_module,
        theta_embedding=self.angle_module,
        landmark_embedding=self.landmark_module,
    )
    self.action_module = ActionSimpleModule(
        num_actions=num_actions,
        action_emb_size=constants["action_emb_dim"],
    )

    symbolic_image_size = 32 * 3 * 63
    symbolic_text_size = 32 * 4
    total_emb_size = (symbolic_image_size
                      + symbolic_text_size
                      + constants["action_emb_dim"])

    self.final_module = MultimodalSimpleModule(
        image_module=self.image_module,
        text_module=self.text_module,
        action_module=self.action_module,
        total_emb_size=total_emb_size,
        num_actions=num_actions,
    )

    if torch.cuda.is_available():
        for module in (self.image_module,
                       self.text_module,
                       self.action_module,
                       self.final_module,
                       self.radius_module,
                       self.angle_module,
                       self.landmark_module):
            module.cuda()
def __init__(self, config, constants):
    """Build the ResNet image encoder, the text encoder and the action
    encoder and combine them in a ``MultimodalSimpleWithStopModule``.

    Args:
        config: mapping read for ``"num_actions"``, the image size,
            ``"vocab_size"`` and ``"use_pointer_model"``.
        constants: mapping read for ``"image_emb_dim"``,
            ``"max_num_images"``, ``"word_emb_dim"``, ``"lstm_emb_dim"``
            and ``"action_emb_dim"``.

    All four modules are moved to the GPU when CUDA is available.
    """
    AbstractModel.__init__(self, config, constants)
    num_actions = config["num_actions"]
    self.none_action = num_actions

    # 3 input channels for each of constants["max_num_images"] images.
    image_dim = constants["image_emb_dim"]
    self.image_module = ImageResnetModule(
        image_emb_size=image_dim,
        input_num_channels=3 * constants["max_num_images"],
        image_height=config["image_height"],
        image_width=config["image_width"],
    )

    use_pointer = config["use_pointer_model"]
    lstm_dim = constants["lstm_emb_dim"]
    text_kwargs = dict(
        emb_dim=constants["word_emb_dim"],
        hidden_dim=lstm_dim,
        vocab_size=config["vocab_size"],
    )
    if use_pointer:
        self.text_module = TextPointerModule(**text_kwargs)
    else:
        self.text_module = TextSimpleModule(**text_kwargs)

    action_dim = constants["action_emb_dim"]
    self.action_module = ActionSimpleModule(
        num_actions=num_actions,
        action_emb_size=action_dim,
    )

    # NOTE(review): the total size is the same for both text modules --
    # confirm TextPointerModule emits lstm_emb_dim features here.
    total_emb_size = image_dim + lstm_dim + action_dim

    self.final_module = MultimodalSimpleWithStopModule(
        image_module=self.image_module,
        text_module=self.text_module,
        action_module=self.action_module,
        total_emb_size=total_emb_size,
        num_actions=num_actions,
    )

    if torch.cuda.is_available():
        for module in (self.image_module,
                       self.text_module,
                       self.action_module,
                       self.final_module):
            module.cuda()
def __init__(self, config, constants):
    """Build the image, image-recurrence, text and action encoders, the
    optional auxiliary heads, and the final module.

    Args:
        config: mapping read for ``"num_actions"``, the image size,
            ``"vocab_size"``, ``"use_pointer_model"`` and the five
            ``"do_*"`` flags that enable the auxiliary heads.
        constants: mapping read for ``"image_emb_dim"``,
            ``"action_emb_dim"``, ``"word_emb_dim"`` and
            ``"lstm_emb_dim"``.

    Each auxiliary head attribute is ``None`` when its flag is falsy.
    ``self.landmark_names`` is only set when object detection is enabled.
    All created modules are moved to the GPU when CUDA is available.
    """
    AbstractIncrementalModel.__init__(self, config, constants)
    num_actions = config["num_actions"]
    self.none_action = num_actions

    image_dim = constants["image_emb_dim"]
    self.image_module = ImageResnetModule(
        image_emb_size=image_dim,
        input_num_channels=3,
        image_height=config["image_height"],
        image_width=config["image_width"],
        using_recurrence=True,
    )

    num_cameras = 1
    self.num_cameras = num_cameras
    camera_dim = num_cameras * image_dim

    # The recurrence consumes the per-camera image embeddings plus the
    # action embedding.
    action_dim = constants["action_emb_dim"]
    self.image_recurrence_module = IncrementalRecurrenceSimpleModule(
        input_emb_dim=(image_dim * num_cameras + action_dim),
        output_emb_dim=image_dim,
    )

    use_pointer = config["use_pointer_model"]
    lstm_dim = constants["lstm_emb_dim"]
    text_kwargs = dict(
        emb_dim=constants["word_emb_dim"],
        hidden_dim=lstm_dim,
        vocab_size=config["vocab_size"],
    )
    if use_pointer:
        self.text_module = TextPointerModule(**text_kwargs)
    else:
        self.text_module = TextBiLSTMModule(**text_kwargs)

    self.action_module = ActionSimpleModule(
        num_actions=num_actions,
        action_emb_size=action_dim,
    )

    if use_pointer:
        total_emb_size = image_dim + 4 * lstm_dim + action_dim
    else:
        total_emb_size = ((num_cameras + 1) * image_dim
                          + 2 * lstm_dim
                          + action_dim)

    # Optional auxiliary heads, each controlled by its own config flag.
    self.action_prediction_module = (
        ActionPredictionModule(2 * camera_dim, image_dim, num_actions)
        if config["do_action_prediction"]
        else None
    )
    self.temporal_autoencoder_module = (
        TemporalAutoencoderModule(
            self.action_module, camera_dim, action_dim, image_dim)
        if config["do_temporal_autoencoding"]
        else None
    )
    if not config["do_object_detection"]:
        self.object_detection_module = None
    else:
        self.landmark_names = get_all_landmark_names()
        self.object_detection_module = ObjectDetectionModule(
            image_module=self.image_module,
            image_emb_size=camera_dim,
            num_objects=67,
        )
    self.symbolic_language_prediction_module = (
        SymbolicLanguagePredictionModule(total_emb_size=2 * lstm_dim)
        if config["do_symbolic_language_prediction"]
        else None
    )
    self.goal_prediction_module = (
        GoalPredictionModule(total_emb_size=32)
        if config["do_goal_prediction"]
        else None
    )

    self.final_module = TmpIncrementalMultimodalDenseValtsRecurrentSimpleModule(
        image_module=self.image_module,
        image_recurrence_module=self.image_recurrence_module,
        text_module=self.text_module,
        action_module=self.action_module,
        total_emb_size=total_emb_size,
        num_actions=num_actions,
    )

    if torch.cuda.is_available():
        always_present = (
            self.image_module,
            self.image_recurrence_module,
            self.text_module,
            self.action_module,
            self.final_module,
        )
        for module in always_present:
            module.cuda()
        maybe_present = (
            self.action_prediction_module,
            self.temporal_autoencoder_module,
            self.object_detection_module,
            self.symbolic_language_prediction_module,
            self.goal_prediction_module,
        )
        for module in maybe_present:
            if module is not None:
                module.cuda()
def __init__(self, config, constants):
    """Build the image, image-recurrence, symbolic instruction and action
    encoders, the optional auxiliary heads, and the final
    ``IncrementalMultimodalRecurrentSimpleModule``.

    Args:
        config: mapping read for ``"num_actions"``, the image size and the
            ``"do_action_prediction"``, ``"do_temporal_autoencoding"`` and
            ``"do_object_detection"`` flags.
        constants: mapping read for ``"image_emb_dim"`` and
            ``"action_emb_dim"``.

    Each auxiliary head attribute is ``None`` when its flag is falsy.
    ``self.landmark_names`` is only set when object detection is enabled.
    All created modules are moved to the GPU when CUDA is available.
    """
    AbstractIncrementalModel.__init__(self, config, constants)
    self.none_action = config["num_actions"]

    # The landmark names are fetched only where they are stored (object
    # detection below); the unused local copy made here was removed.
    self.radius_module = RadiusModule(15)
    self.angle_module = AngleModule(12)  # the original comment noted (48)
    self.landmark_module = LandmarkModule(67)
    self.num_cameras = 1

    self.image_module = ImageRyanResnetModule(
        image_emb_size=constants["image_emb_dim"],
        input_num_channels=3,
        image_height=config["image_height"],
        image_width=config["image_width"],
        using_recurrence=True)
    # The action embedding is not added to the recurrence input here (it
    # was left commented out in the original).
    self.image_recurrence_module = IncrementalRecurrenceSimpleModule(
        input_emb_dim=constants["image_emb_dim"] * self.num_cameras,
        output_emb_dim=constants["image_emb_dim"])
    self.text_module = SymbolicInstructionModule(
        radius_embedding=self.radius_module,
        theta_embedding=self.angle_module,
        landmark_embedding=self.landmark_module)
    self.action_module = ActionSimpleModule(
        num_actions=config["num_actions"],
        action_emb_size=constants["action_emb_dim"])

    # NOTE(review): 32 * 2 is presumably the size of the symbolic
    # instruction embedding -- confirm against SymbolicInstructionModule.
    total_emb_size = ((self.num_cameras) * constants["image_emb_dim"]
                      + 32 * 2
                      + constants["action_emb_dim"])

    if config["do_action_prediction"]:
        self.action_prediction_module = ActionPredictionModule(
            2 * self.num_cameras * constants["image_emb_dim"],
            constants["image_emb_dim"],
            config["num_actions"])
    else:
        self.action_prediction_module = None

    if config["do_temporal_autoencoding"]:
        self.temporal_autoencoder_module = TemporalAutoencoderModule(
            self.action_module,
            self.num_cameras * constants["image_emb_dim"],
            constants["action_emb_dim"],
            constants["image_emb_dim"])
    else:
        self.temporal_autoencoder_module = None

    if config["do_object_detection"]:
        self.landmark_names = get_all_landmark_names()
        self.object_detection_module = ObjectDetectionModule(
            image_module=self.image_module,
            image_emb_size=self.num_cameras * constants["image_emb_dim"],
            num_objects=67)
    else:
        self.object_detection_module = None

    final_module = IncrementalMultimodalRecurrentSimpleModule(
        image_module=self.image_module,
        image_recurrence_module=self.image_recurrence_module,
        text_module=self.text_module,
        action_module=self.action_module,
        total_emb_size=total_emb_size,
        num_actions=config["num_actions"])
    self.final_module = final_module

    if torch.cuda.is_available():
        self.image_module.cuda()
        self.image_recurrence_module.cuda()
        self.text_module.cuda()
        self.action_module.cuda()
        self.final_module.cuda()
        if self.action_prediction_module is not None:
            self.action_prediction_module.cuda()
        if self.temporal_autoencoder_module is not None:
            self.temporal_autoencoder_module.cuda()
        if self.object_detection_module is not None:
            self.object_detection_module.cuda()