def __init__(
    self,
    config: Config,
    dataset: Dataset,
    configuration_key=None,
    init_for_load_only=False,
):
    """Construct the reciprocal-relations wrapper around a base model.

    The base model is created against a shallow copy of the dataset whose
    relation count is doubled, so that each relation gets a second
    ("reciprocal") embedding.
    """
    self._init_configuration(config, configuration_key)

    # Build the base model on a dataset copy reporting twice as many
    # relations (one extra slot per relation for its reciprocal).
    doubled_dataset = dataset.shallow_copy()
    doubled_dataset._num_relations = dataset.num_relations() * 2
    base_model = KgeModel.create(
        config=config,
        dataset=doubled_dataset,
        configuration_key=self.configuration_key + ".base_model",
        init_for_load_only=init_for_load_only,
    )

    # Initialize this wrapper itself; embedders are taken over from the
    # base model rather than created here.
    super().__init__(
        config=config,
        dataset=dataset,
        scorer=base_model.get_scorer(),
        create_embedders=False,
        init_for_load_only=init_for_load_only,
    )
    self._base_model = base_model
    # TODO change entity_embedder assignment to sub and obj embedders when support
    # for that is added
    self._entity_embedder = self._base_model.get_s_embedder()
    self._relation_embedder = self._base_model.get_p_embedder()
def __init__(self, config: Config, configuration_key: str, dataset: Dataset):
    """Read the negative-sampling options and prepare filtering indexes."""
    super().__init__(config, configuration_key)

    # per-slot settings, indexed by slot (S/P/O)
    self.num_samples = torch.zeros(3, dtype=torch.int)
    self.filter_positives = torch.zeros(3, dtype=torch.bool)
    self.vocabulary_size = torch.zeros(3, dtype=torch.int)

    self.shared = self.get_option("shared")
    self.with_replacement = self.get_option("with_replacement")
    if not (self.with_replacement or self.shared):
        raise ValueError(
            "Without replacement sampling is only supported when "
            "shared negative sampling is enabled."
        )

    # split whose positives are used for filtering; empty means train split
    self.filtering_split = config.get("negative_sampling.filtering.split")
    if self.filtering_split == "":
        self.filtering_split = config.get("train.split")

    for slot in SLOTS:
        slot_name = SLOT_STR[slot]
        self.num_samples[slot] = self.get_option(f"num_samples.{slot_name}")
        self.filter_positives[slot] = self.get_option(f"filtering.{slot_name}")
        if slot == P:
            self.vocabulary_size[slot] = dataset.num_relations()
        else:
            self.vocabulary_size[slot] = dataset.num_entities()
        # create indices for filtering here already if needed and not existing
        # otherwise every worker would create every index again and again
        if self.filter_positives[slot]:
            pair = ["po", "so", "sp"][slot]
            dataset.index(f"{self.filtering_split}_{pair}_to_{slot_name}")

    if any(self.filter_positives):
        if self.shared:
            raise ValueError(
                "Filtering is not supported when shared negative sampling is enabled."
            )
        self.check_option(
            "filtering.implementation", ["standard", "fast", "fast_if_available"]
        )
    self.filter_implementation = self.get_option("filtering.implementation")
    self.dataset = dataset

    # auto config: a negative count < 0 copies the paired slot's count
    # (S copies O and vice versa) when positive, otherwise falls back to 0
    for slot, twin in [(S, O), (P, None), (O, S)]:
        if self.num_samples[slot] >= 0:
            continue
        if twin is not None and self.num_samples[twin] > 0:
            self.num_samples[slot] = self.num_samples[twin]
        else:
            self.num_samples[slot] = 0
def __init__(
    self,
    config: Config,
    dataset: Dataset,
    scorer: Union[RelationalScorer, type],
    initialize_embedders=True,
    configuration_key=None,
):
    """Initialize a KGE model: embedders (optional) and relational scorer.

    Args:
        config: global configuration object
        dataset: dataset providing entity/relation counts
        scorer: either a RelationalScorer instance or the scorer class to
            instantiate with this model's configuration
        initialize_embedders: when False, the caller is responsible for
            setting the entity/relation embedders itself
        configuration_key: configuration sub-key of this model
    """
    super().__init__(config, dataset, configuration_key)

    # TODO support different embedders for subjects and objects

    #: Embedder used for entities (both subject and objects)
    self._entity_embedder: KgeEmbedder

    #: Embedder used for relations
    self._relation_embedder: KgeEmbedder

    if initialize_embedders:
        self._entity_embedder = KgeEmbedder.create(
            config,
            dataset,
            self.configuration_key + ".entity_embedder",
            dataset.num_entities(),
        )

        #: Embedder used for relations
        num_relations = dataset.num_relations()
        self._relation_embedder = KgeEmbedder.create(
            config,
            dataset,
            self.configuration_key + ".relation_embedder",
            num_relations,
        )

    #: Scorer
    self._scorer: RelationalScorer
    # FIX: use isinstance instead of `type(scorer) == type`; the latter
    # misclassifies scorer classes with a custom metaclass (e.g. ABCMeta)
    # and would store the class itself instead of an instance.
    if isinstance(scorer, type):
        # scorer is type of the scorer to use; call its constructor
        self._scorer = scorer(
            config=config, dataset=dataset, configuration_key=self.configuration_key
        )
    else:
        self._scorer = scorer
def __init__(
    self,
    config: Config,
    dataset: Dataset,
    scorer: Union[RelationalScorer, type],
    create_embedders=True,
    configuration_key=None,
    init_for_load_only=False,
):
    """Initialize a KGE model: embedders (optional), pretrained weights, scorer.

    Args:
        config: global configuration object
        dataset: dataset providing entity/relation counts
        scorer: either a RelationalScorer instance or the scorer class to
            instantiate with this model's configuration
        create_embedders: when False, skip embedder creation (and pretrained
            initialization); the caller sets the embedders itself
        configuration_key: configuration sub-key of this model
        init_for_load_only: when True, skip pretrained initialization because
            a checkpoint will be loaded into the model afterwards
    """
    super().__init__(config, dataset, configuration_key)

    # TODO support different embedders for subjects and objects

    #: Embedder used for entities (both subject and objects)
    self._entity_embedder: KgeEmbedder

    #: Embedder used for relations
    self._relation_embedder: KgeEmbedder

    if create_embedders:
        self._entity_embedder = KgeEmbedder.create(
            config,
            dataset,
            self.configuration_key + ".entity_embedder",
            dataset.num_entities(),
            init_for_load_only=init_for_load_only,
        )

        #: Embedder used for relations
        num_relations = dataset.num_relations()
        self._relation_embedder = KgeEmbedder.create(
            config,
            dataset,
            self.configuration_key + ".relation_embedder",
            num_relations,
            init_for_load_only=init_for_load_only,
        )

        if not init_for_load_only:
            # load pretrained embeddings when configured
            pretrained_entities_filename = ""
            pretrained_relations_filename = ""
            if self.has_option("entity_embedder.pretrain.model_filename"):
                pretrained_entities_filename = self.get_option(
                    "entity_embedder.pretrain.model_filename"
                )
            if self.has_option("relation_embedder.pretrain.model_filename"):
                pretrained_relations_filename = self.get_option(
                    "relation_embedder.pretrain.model_filename"
                )

            def load_pretrained_model(
                pretrained_filename: str,
            ) -> Optional[KgeModel]:
                # empty filename means "no pretrained model configured"
                if pretrained_filename != "":
                    self.config.log(
                        f"Initializing with embeddings stored in "
                        f"{pretrained_filename}"
                    )
                    checkpoint = load_checkpoint(pretrained_filename)
                    return KgeModel.create_from(checkpoint)
                return None

            pretrained_entities_model = load_pretrained_model(
                pretrained_entities_filename
            )
            # avoid loading the same checkpoint twice when both embedders
            # are initialized from the same file
            if pretrained_entities_filename == pretrained_relations_filename:
                pretrained_relations_model = pretrained_entities_model
            else:
                pretrained_relations_model = load_pretrained_model(
                    pretrained_relations_filename
                )
            if pretrained_entities_model is not None:
                # FIX: identity check (`is not`) instead of `!=`; the intent
                # is "same embedder object for subjects and objects", and a
                # custom __ne__ on an embedder must not change that test.
                if (
                    pretrained_entities_model.get_s_embedder()
                    is not pretrained_entities_model.get_o_embedder()
                ):
                    raise ValueError(
                        "Can only initialize with pre-trained models having "
                        "identical subject and object embeddings."
                    )
                self._entity_embedder.init_pretrained(
                    pretrained_entities_model.get_s_embedder()
                )
            if pretrained_relations_model is not None:
                self._relation_embedder.init_pretrained(
                    pretrained_relations_model.get_p_embedder()
                )

    #: Scorer
    self._scorer: RelationalScorer
    # FIX: use isinstance instead of `type(scorer) == type`; the latter
    # misclassifies scorer classes with a custom metaclass (e.g. ABCMeta)
    # and would store the class itself instead of an instance.
    if isinstance(scorer, type):
        # scorer is type of the scorer to use; call its constructor
        self._scorer = scorer(
            config=config, dataset=dataset, configuration_key=self.configuration_key
        )
    else:
        self._scorer = scorer
def __init__(
    self,
    config: Config,
    dataset: Dataset,
    scorer: Union[RelationalScorer, type],
    create_embedders=True,
    configuration_key=None,
    init_for_load_only=False,
    parameter_client=None,
    max_partition_entities=0,
):
    """Initialize a distributed KGE model.

    Embedder creation is delegated to ``self._create_embedders`` (defined
    elsewhere in this class). The branch below it is hard-disabled via
    ``elif False`` and is therefore dead code kept for reference only.

    Args:
        config: global configuration object
        dataset: dataset providing entity/relation counts
        scorer: either a RelationalScorer instance or the scorer class to
            instantiate with this model's configuration
        create_embedders: when False, skip embedder creation entirely
        configuration_key: configuration sub-key of this model
        init_for_load_only: when True, skip pretrained initialization
        parameter_client: distributed parameter-server client; the disabled
            branch dereferences it unconditionally (``parameter_client.rank``)
            even though the default is None — NOTE(review): confirm callers of
            that path always pass a client
        max_partition_entities: when > 0 and entity sync level is
            "partition", caps the entity embedding layer size
    """
    super().__init__(config, dataset, configuration_key)

    # TODO support different embedders for subjects and objects

    #: Embedder used for entities (both subject and objects)
    self._entity_embedder: KgeEmbedder

    #: Embedder used for relations
    self._relation_embedder: KgeEmbedder

    if create_embedders:
        self._create_embedders(init_for_load_only)
    # NOTE(review): everything under this `elif False` is unreachable; it
    # appears to be an older inline version of embedder creation, superseded
    # by _create_embedders. Consider deleting it.
    elif False: #if self.get_option("create_complete"):
        # embedding_layer_size = dataset.num_entities()
        # partition-level sync with a known max partition size lets the
        # embedding layer hold only that many entities instead of all
        if config.get("job.distributed.entity_sync_level") == "partition" and max_partition_entities != 0:
            embedding_layer_size =max_partition_entities
        else:
            embedding_layer_size = self._calc_embedding_layer_size(config, dataset)
        config.log(f"creating entity_embedder with {embedding_layer_size} keys")
        self._entity_embedder = KgeEmbedder.create(
            config=config,
            dataset=dataset,
            configuration_key=self.configuration_key + ".entity_embedder",
            #dataset.num_entities(),
            vocab_size=embedding_layer_size,
            init_for_load_only=init_for_load_only,
            parameter_client=parameter_client,
            lapse_offset=0,
            complete_vocab_size=dataset.num_entities()
        )

        #: Embedder used for relations
        num_relations = dataset.num_relations()
        self._relation_embedder = KgeEmbedder.create(
            config,
            dataset,
            self.configuration_key + ".relation_embedder",
            num_relations,
            init_for_load_only=init_for_load_only,
            parameter_client=parameter_client,
            # relations are stored after all entities in the parameter server
            lapse_offset=dataset.num_entities(),
            complete_vocab_size=dataset.num_relations(),
        )

        # only the minimum-rank worker loads pretrained embeddings
        if not init_for_load_only and parameter_client.rank == get_min_rank(config):
            # load pretrained embeddings
            pretrained_entities_filename = ""
            pretrained_relations_filename = ""
            if self.has_option("entity_embedder.pretrain.model_filename"):
                pretrained_entities_filename = self.get_option(
                    "entity_embedder.pretrain.model_filename"
                )
            if self.has_option("relation_embedder.pretrain.model_filename"):
                pretrained_relations_filename = self.get_option(
                    "relation_embedder.pretrain.model_filename"
                )

            def load_pretrained_model(
                pretrained_filename: str,
            ) -> Optional[KgeModel]:
                # empty filename means "no pretrained model configured"
                if pretrained_filename != "":
                    self.config.log(
                        f"Initializing with embeddings stored in "
                        f"{pretrained_filename}"
                    )
                    checkpoint = load_checkpoint(pretrained_filename)
                    return KgeModel.create_from(checkpoint, parameter_client=parameter_client)
                return None

            pretrained_entities_model = load_pretrained_model(
                pretrained_entities_filename
            )
            # reuse the already-loaded model when both files are identical
            if pretrained_entities_filename == pretrained_relations_filename:
                pretrained_relations_model = pretrained_entities_model
            else:
                pretrained_relations_model = load_pretrained_model(
                    pretrained_relations_filename
                )
            if pretrained_entities_model is not None:
                if (
                    pretrained_entities_model.get_s_embedder()
                    != pretrained_entities_model.get_o_embedder()
                ):
                    raise ValueError(
                        "Can only initialize with pre-trained models having "
                        "identical subject and object embeddings."
                    )
                self._entity_embedder.init_pretrained(
                    pretrained_entities_model.get_s_embedder()
                )
            if pretrained_relations_model is not None:
                self._relation_embedder.init_pretrained(
                    pretrained_relations_model.get_p_embedder()
                )

    #: Scorer
    self._scorer: RelationalScorer
    if type(scorer) == type:
        # scorer is type of the scorer to use; call its constructor
        self._scorer = scorer(
            config=config, dataset=dataset, configuration_key=self.configuration_key
        )
    else:
        self._scorer = scorer