예제 #1
0
    def __init__(
        self,
        dataset_dir: str,
        num_workers: int,
        train_batch_size: int,
        val_batch_size: int,
        negative_sample_size: int,
        *args,
        **kwargs,
    ):
        """Set up a Knowledge Graph dataset from the files in a directory.

        Any extra positional/keyword arguments are forwarded unchanged to the
        parent class initializer.

        Args:
            dataset_dir: str
                directory containing ``entities.dict``, ``relations.dict``,
                ``train.txt``, ``valid.txt`` and ``test.txt``
            num_workers: int
                number of workers to use
            train_batch_size: int
                batch size to use for training
            val_batch_size: int
                batch size to use for validation and test
            negative_sample_size: int
                size of the negative samples
        """
        super().__init__(*args, **kwargs)

        # Keep the configuration around for later use by the instance.
        self.dataset_dir = dataset_dir
        self.num_workers = num_workers
        self.train_batch_size = train_batch_size
        self.val_batch_size = val_batch_size
        self.negative_sample_size = negative_sample_size

        def in_dataset(filename):
            # Resolve a file name relative to the dataset directory.
            return os.path.join(self.dataset_dir, filename)

        # Entity/relation name -> ID lookup tables.
        self.entity2id = load_entities(in_dataset("entities.dict"))
        self.relation2id = load_relations(in_dataset("relations.dict"))

        def triples_from(filename):
            # Read one split, translated through the lookup tables above.
            return read_triple(
                in_dataset(filename), self.entity2id, self.relation2id)

        # Training, validation and test splits.
        self.train_triples = triples_from("train.txt")
        self.val_triples = triples_from("valid.txt")
        self.test_triples = triples_from("test.txt")
예제 #2
0
def run_rgcn(
    dataset_path: str,
    ckpt_file: str,
    cfg: dict,
):
    """Runs experiment with the R-GCN model.

    If ``ckpt_file`` does not exist yet, the model is first trained for
    ``cfg["rgcn"]["training"]["n_epochs"]`` epochs, and the weights reaching
    the best validation MRR are saved to ``ckpt_file``. In every case the
    checkpoint is then loaded into the model, the model is evaluated on the
    test triples and the resulting metrics are printed.

    Args:
        dataset_path: str
            path to the dataset; must contain ``entities.dict``,
            ``relations.dict``, ``train.txt``, ``valid.txt`` and ``test.txt``
        ckpt_file: str
            checkpoint file to the pretrained model; written by this function
            when it does not exist yet
        cfg: dict
            configuration dictionary to use. Reads ``cfg["rgcn"]["model"]``
            (``n_bases``, ``dropout``, ``reg_ratio``), ``cfg["rgcn"]["data"]``
            (passed as-is to ``train_epoch``) and ``cfg["rgcn"]["training"]``
            (``learning_rate``, ``n_epochs``, ``grad_norm``).
    """
    cfg_model = cfg["rgcn"]["model"]
    cfg_data = cfg["rgcn"]["data"]
    cfg_training = cfg["rgcn"]["training"]

    ## Load the dataset
    # Build dictionaries to translate entities/relations to their ID
    entity2id = load_entities(os.path.join(dataset_path, "entities.dict"))
    relation2id = load_relations(os.path.join(dataset_path, "relations.dict"))
    n_entities = len(entity2id)
    n_relations = len(relation2id)

    # Load training, validation and test triples
    train_triples = read_triple(
        os.path.join(dataset_path, "train.txt"), entity2id, relation2id
    )
    val_triples = read_triple(
        os.path.join(dataset_path, "valid.txt"), entity2id, relation2id
    )
    test_triples = read_triple(
        os.path.join(dataset_path, "test.txt"), entity2id, relation2id
    )

    # Build the data objects used by the model. `all_triples` must be built
    # before the individual splits are converted, while they can still be
    # concatenated with `+`.
    all_triples = torch.LongTensor(train_triples + val_triples + test_triples)
    train_triples = np.array(train_triples, dtype=int)
    val_triples = torch.LongTensor(val_triples)
    test_triples = torch.LongTensor(test_triples)

    # Build the test graph (from the training triples only)
    test_graph = build_test_graph(n_entities, n_relations, train_triples)

    # Create a model instance
    model = RGCN(
        n_entities,
        n_relations,
        cfg_model["n_bases"],
        cfg_model["dropout"],
        cfg_model["reg_ratio"],
    )

    # Train the model only when no pretrained checkpoint is available
    if not os.path.exists(ckpt_file):
        optimizer = torch.optim.Adam(
            model.parameters(), lr=cfg_training["learning_rate"]
        )
        # Start below any reachable MRR so that the first evaluated epoch
        # always writes a checkpoint; starting at 0.0 left no file behind
        # when the validation MRR never rose above zero, and the load below
        # then failed.
        best_mrr = float("-inf")

        for _ in tqdm(range(cfg_training["n_epochs"]), desc="Training epochs"):
            model.train()
            optimizer.zero_grad()
            loss = train_epoch(
                train_triples,
                model,
                cfg_data,
                cfg_model["reg_ratio"],
                n_entities,
                n_relations,
            )
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), cfg_training["grad_norm"])
            optimizer.step()

            # Evaluate the model
            val_metrics = validate(val_triples, test_graph, model, all_triples)

            # Save model checkpoint if best
            if val_metrics["mrr"] > best_mrr:
                best_mrr = val_metrics["mrr"]
                torch.save(model.state_dict(), ckpt_file)

    # Test the pretrained model. `load_state_dict` updates `model` in place;
    # its return value only reports missing/unexpected keys, so it is not kept.
    # NOTE: with `n_epochs == 0` and no existing checkpoint nothing has been
    # saved, and `torch.load` fails here exactly as it did before.
    model.load_state_dict(torch.load(ckpt_file))
    # A freshly built module is in training mode; switch to evaluation mode so
    # that dropout is disabled during testing (harmless if `test` already does
    # this itself).
    model.eval()
    metrics = test(test_triples, model, test_graph, all_triples)
    print("Test metrics: ", metrics)