Example #1
File: predict.py Project: tobby2002/PyKEEN
def start_predictions_pipeline(
        model_directory: str,
        data_directory: str,
        path_to_blacklisted_triples: Optional[str] = None,
        export_predictions: bool = True) -> np.ndarray:
    """
    Performs inference based on a trained KGE model. The predictions are saved as predictions.tsv in the
    provided data directory.

    :param model_directory: Directory containing the experimental artifacts: configuration.json,
        entity_to_id.json, relation_to_id.json and trained_model.pkl.
    :param data_directory: Directory containing the candidate entities as an entities.tsv file and the
        candidate relations as a relations.tsv file. Both files consist of a single column containing the
        entities/relations, and all combinations of possible triples are built from them.
    :param path_to_blacklisted_triples: Optional path to a TSV file of triples to exclude from prediction.
    :param export_predictions: If True, the ranked triples are also written to predictions.tsv in the data directory.
    :return: The ranked candidate triples.
    """
    # Load configuration file
    with open(os.path.join(model_directory, 'configuration.json')) as f:
        config = json.load(f)

    # Load entity to id mapping
    with open(os.path.join(model_directory, 'entity_to_id.json')) as f:
        entity_to_id = json.load(f)

    # Load relation to id mapping
    with open(os.path.join(model_directory, 'relation_to_id.json')) as f:
        relation_to_id = json.load(f)

    # Instantiate the KGE model and load the trained parameters
    trained_kge_model: Module = get_kge_model(config=config)
    path_to_model = os.path.join(model_directory, 'trained_model.pkl')
    trained_kge_model.load_state_dict(torch.load(path_to_model))

    # Load candidate entities and relations
    entities = np.loadtxt(fname=os.path.join(data_directory, 'entities.tsv'),
                          dtype=str,
                          delimiter='\t')
    relations = np.loadtxt(fname=os.path.join(data_directory, 'relations.tsv'),
                           dtype=str,
                           delimiter='\t')

    # Select the inference device based on the configuration and CUDA availability
    device_name = 'cuda:0' if torch.cuda.is_available() and config[PREFERRED_DEVICE] == GPU else CPU
    device = torch.device(device_name)

    # Score all candidate triples and rank them with the trained model
    ranked_triples = make_predictions(
        kge_model=trained_kge_model,
        entities=entities,
        relations=relations,
        entity_to_id=entity_to_id,
        rel_to_id=relation_to_id,
        device=device,
        blacklist_path=path_to_blacklisted_triples,
    )

    if export_predictions:
        np.savetxt(os.path.join(data_directory, 'predictions.tsv'),
                   ranked_triples,
                   fmt='%s')

    return ranked_triples
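
A minimal usage sketch for Example #1. The import path and the directory paths are assumptions for this fork; the model directory must contain the artifacts listed in the docstring, and the data directory must contain entities.tsv and relations.tsv:

from pykeen.predict import start_predictions_pipeline  # import path is an assumption

# Hypothetical directories; replace with real paths
ranked_triples = start_predictions_pipeline(
    model_directory='experiments/trained_model_dir',
    data_directory='experiments/candidate_dir',
    path_to_blacklisted_triples=None,  # optionally a TSV of triples to exclude
    export_predictions=True,           # also writes predictions.tsv into data_directory
)
print(ranked_triples[:10])  # best-ranked candidate triples first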
Example #2
    def run(self) -> Mapping:
        """Run this pipeline."""
        metric_results = None

        if self._use_hpo(self.config):  # Hyper-parameter optimization mode
            mapped_pos_train_triples, mapped_pos_test_triples = self._get_train_and_test_triples()

            (trained_model,
             loss_per_epoch,
             entity_label_to_embedding,
             relation_label_to_embedding,
             metric_results,
             params) = RandomSearch.run(
                mapped_train_triples=mapped_pos_train_triples,
                mapped_test_triples=mapped_pos_test_triples,
                entity_to_id=self.entity_label_to_id,
                rel_to_id=self.relation_label_to_id,
                config=self.config,
                device=str(self.device),
                seed=self.seed,
            )
        else:  # Training Mode
            if self.is_evaluation_required:
                mapped_pos_train_triples, mapped_pos_test_triples = self._get_train_and_test_triples()
            else:
                mapped_pos_train_triples, mapped_pos_test_triples = self._get_train_triples(), None

            all_entities = np.array(list(self.entity_label_to_id.values()))

            # Initialize KG embedding model
            self.config[pkc.PREFERRED_DEVICE] = pkc.CPU if self.device_name == pkc.CPU else pkc.GPU

            if self.seed is not None:
                torch.manual_seed(self.seed)

            kge_model: Module = get_kge_model(config=self.config)

            kge_model.entity_label_to_id = self.entity_label_to_id
            kge_model.relation_label_to_id = self.relation_label_to_id
            kge_model.num_entities = len(self.entity_label_to_id)
            kge_model.num_relations = len(self.relation_label_to_id)

            batch_size = self.config[pkc.BATCH_SIZE]
            num_epochs = self.config[pkc.NUM_EPOCHS]
            learning_rate = self.config[pkc.LEARNING_RATE]

            log.info("-------------Train KG Embeddings-------------")
            loss_per_epoch = kge_model.fit(
                pos_triples=mapped_pos_train_triples,
                learning_rate=learning_rate,
                num_epochs=num_epochs,
                batch_size=batch_size,
            )
            trained_model = kge_model

            params = self.config

            if self.is_evaluation_required:
                log.info("-------------Start Evaluation-------------")
                metric_results = compute_metric_results(
                    kg_embedding_model=kge_model,
                    mapped_train_triples=mapped_pos_train_triples,
                    mapped_test_triples=mapped_pos_test_triples,
                    device=str(self.device),
                    filter_neg_triples=self.config[pkc.FILTER_NEG_TRIPLES],
                )

        # Prepare Output
        entity_id_to_label = {
            value: key
            for key, value in self.entity_label_to_id.items()
        }
        relation_id_to_label = {
            value: key
            for key, value in self.relation_label_to_id.items()
        }
        entity_label_to_embedding = {
            entity_id_to_label[entity_id]: embedding.detach().cpu().numpy()
            for entity_id, embedding in enumerate(trained_model.entity_embeddings.weight)
        }

        if self.config[pkc.KG_EMBEDDING_MODEL_NAME] in (pkc.SE_NAME, pkc.UM_NAME):
            relation_label_to_embedding = None
        else:
            relation_label_to_embedding = {
                relation_id_to_label[relation_id]: embedding.detach().cpu().numpy()
                for relation_id, embedding in enumerate(trained_model.relation_embeddings.weight)
            }

        return _make_results(
            trained_model=trained_model,
            loss_per_epoch=loss_per_epoch,
            entity_to_embedding=entity_label_to_embedding,
            relation_to_embedding=relation_label_to_embedding,
            metric_results=metric_results,
            entity_to_id=self.entity_label_to_id,
            rel_to_id=self.relation_label_to_id,
            params=params,
        )
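
The training branch of run() above reads its hyper-parameters from self.config. Below is a hedged sketch of a config dict covering the keys referenced in the example, using the constants module imported as pkc; the import path, the model name, and all values are illustrative, and concrete models typically need further model-specific keys (e.g. embedding dimension) that are omitted here:

from pykeen import constants as pkc  # import path is an assumption

config = {
    pkc.KG_EMBEDDING_MODEL_NAME: 'TransE',  # illustrative; any model name known to get_kge_model
    pkc.PREFERRED_DEVICE: pkc.GPU,          # overwritten inside run() from self.device_name
    pkc.BATCH_SIZE: 128,
    pkc.NUM_EPOCHS: 100,
    pkc.LEARNING_RATE: 0.01,
    pkc.FILTER_NEG_TRIPLES: True,           # only used when evaluation is required
}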
Example #3
    def optimize_hyperparams(
            self,
            mapped_train_triples,
            mapped_test_triples,
            entity_to_id,
            rel_to_id,
            config,
            device,
            seed: Optional[int] = None,
            k_evaluation: int = 10,
    ) -> HPOptimizerResult:
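        """Run a random hyper-parameter search.

        For each of config[pkc.NUM_OF_HPO_ITERS] iterations, a hyper-parameter configuration is
        sampled, a KGE model is trained on mapped_train_triples and evaluated on
        mapped_test_triples; the artifacts of the iteration with the highest hits@k
        (k = k_evaluation) are returned.
        """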
        if seed is not None:
            torch.manual_seed(seed)

        trained_kge_models: List[Module] = []
        epoch_losses: List[List[float]] = []
        hits_at_k_evaluations: List[float] = []
        entity_to_ids: List[Dict[int, str]] = []
        rel_to_ids: List[Dict[int, str]] = []
        models_params: List[Dict] = []
        eval_summaries: List = []

        config = config.copy()
        max_iters = config[pkc.NUM_OF_HPO_ITERS]

        sample_fct = (
            self._sample_conv_e_params
            if config[pkc.KG_EMBEDDING_MODEL_NAME] == pkc.CONV_E_NAME else
            self._sample_parameter_value
        )

        for _ in trange(max_iters, desc='HPO Iteration'):
            # Sample hyper-params
            kge_model_config: Dict[str, Any] = sample_fct(config)
            kge_model_config[pkc.SEED]: int = seed
            kge_model_config[pkc.PREFERRED_DEVICE]: str = pkc.CPU if device == pkc.CPU else pkc.GPU

            # Configure defined model
            kge_model: Module = get_kge_model(config=kge_model_config)

            # Load class params
            kge_model.entity_label_to_id: Dict[str, int] = entity_to_id
            kge_model.relation_label_to_id: Dict[str, int] = rel_to_id
            kge_model.num_entities: int = len(entity_to_id)
            kge_model.num_relations: int = len(rel_to_id)

            models_params.append(kge_model_config)
            entity_to_ids.append(entity_to_id)
            rel_to_ids.append(rel_to_id)

            all_entities = np.array(list(entity_to_id.values()))

            batch_size = kge_model_config[pkc.BATCH_SIZE]
            num_epochs = kge_model_config[pkc.NUM_EPOCHS]
            learning_rate = kge_model_config[pkc.LEARNING_RATE]

            epoch_loss = kge_model.fit(
                pos_triples=mapped_train_triples,
                learning_rate=learning_rate,
                num_epochs=num_epochs,
                batch_size=batch_size,
                tqdm_kwargs=dict(leave=False),
            )

            trained_kge_model = kge_model

            # Evaluate trained model
            metric_results = compute_metric_results(
                kg_embedding_model=trained_kge_model,
                mapped_train_triples=mapped_train_triples,
                mapped_test_triples=mapped_test_triples,
                device=device,
            )

            # TODO: Define HPO metric
            eval_summaries.append(metric_results)

            trained_kge_models.append(trained_kge_model)
            epoch_losses.append(epoch_loss)

            hits_at_k_evaluation = metric_results.hits_at_k[k_evaluation]
            hits_at_k_evaluations.append(hits_at_k_evaluation)

        index_of_max = int(np.argmax(a=hits_at_k_evaluations))

        return (
            trained_kge_models[index_of_max],
            epoch_losses[index_of_max],
            entity_to_ids[index_of_max],
            rel_to_ids[index_of_max],
            eval_summaries[index_of_max],
            models_params[index_of_max],
        )
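
A hedged sketch of consuming the result of optimize_hyperparams; the unpacking order mirrors the return statement above, and with the default k_evaluation=10 the best iteration is selected by hits@10. The optimizer instance, the pre-mapped triple arrays, and the config are assumptions:

# optimizer is assumed to be an instance of the random-search optimizer (cf. RandomSearch in Example #2)
(best_model,
 losses_per_epoch,
 best_entity_to_id,
 best_rel_to_id,
 best_metrics,
 best_params) = optimizer.optimize_hyperparams(
    mapped_train_triples=mapped_train_triples,  # numpy array of (head_id, relation_id, tail_id) rows
    mapped_test_triples=mapped_test_triples,
    entity_to_id=entity_to_id,
    rel_to_id=rel_to_id,
    config=config,  # must contain pkc.NUM_OF_HPO_ITERS and pkc.KG_EMBEDDING_MODEL_NAME
    device='cpu',
    seed=42,
)
print(best_metrics.hits_at_k[10], best_params)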