Пример #1
0
 def run(self, config, trial):
     """Execute one early-stopping HPO trial and summarise its outcome.

     :param config: mapping with the keys ``dataset``, ``model``, ``frequency``,
         ``patience`` and ``relative_delta``
     :param trial: value used as the pipeline's random seed
     :return: tuple of the number of recorded losses, the
         ``both.avg.adjusted_mean_rank`` metric and the ``hits@10`` metric
     """
     dataset, model = config['dataset'], config['model']
     stopper_options = {
         'metric': 'adjusted_mean_rank',
         'frequency': config['frequency'],
         'patience': config['patience'],
         'relative_delta': config['relative_delta'],
     }
     training_options = {
         'num_epochs': 1000,
         'tqdm_kwargs': {'leave': False},
     }
     outcome = pipeline(
         dataset=dataset,
         model=model,
         random_seed=trial,
         device='cpu',
         stopper='early',
         stopper_kwargs=stopper_options,
         training_kwargs=training_options,
         evaluation_kwargs={'use_tqdm': False},
         # memory optimization is not needed when running on the CPU
         automatic_memory_optimization=False,
     )
     num_losses = len(outcome.losses)
     metrics = outcome.metric_results
     return (
         num_losses,
         metrics.get_metric('both.avg.adjusted_mean_rank'),
         metrics.get_metric('hits@10'),
     )
Пример #2
0
    def test_custom_training_loop(self):
        """Check that a training loop class handed to the pipeline is the one that gets used."""
        recorded_losses = []

        class ModifiedTrainingLoop(SLCWATrainingLoop):
            """SLCWA training loop that records the value returned by every forward pass."""

            def _forward_pass(self, *args, **kwargs):  # noqa: D102
                batch_loss = super()._forward_pass(*args, **kwargs)
                recorded_losses.append(batch_loss)
                return batch_loss

        silent = {'use_tqdm': False}
        pipeline(
            training=self.training,
            testing=self.testing,
            validation=self.validation,
            training_loop=ModifiedTrainingLoop,
            model='TransE',
            training_kwargs={'num_epochs': 1, **silent},
            evaluation_kwargs=dict(silent),
            random_seed=0,
        )

        # a non-empty list proves the overridden forward pass ran at least once
        self.assertTrue(recorded_losses)
Пример #3
0
    def _test_pipeline_x_resumption(self, training_loop_type: str):
        """Check that a pipeline resumed from a checkpoint yields the same losses as a one-shot run."""
        # Resumption is a capability of the training loop and is tested in more depth in the
        # test_training.py unit tests; here only the pipeline's handling of training loop
        # checkpoints is checked.
        shared = dict(
            model=self.model,
            dataset=self.dataset,
            training_loop=training_loop_type,
        )
        silent = dict(use_tqdm=False, use_tqdm_batch=False)
        checkpointing = dict(
            checkpoint_name=self.checkpoint_name,
            checkpoint_directory=self.temporary_directory.name,
            checkpoint_frequency=0,
        )

        # Reference run: ten epochs in one go, without checkpoint settings.
        result_standard = pipeline(
            **shared,
            training_kwargs=dict(num_epochs=10, **silent),
            random_seed=self.random_seed,
        )

        # First part of the split run: five epochs with the checkpoint settings.
        pipeline(
            **shared,
            training_kwargs=dict(num_epochs=5, **silent, **checkpointing),
            random_seed=self.random_seed,
        )

        # Second part: resume with the same checkpoint settings up to ten epochs.
        # No random seed is passed to this call.
        result_split = pipeline(
            **shared,
            training_kwargs=dict(num_epochs=10, **silent, **checkpointing),
        )
        self.assertEqual(result_standard.losses, result_split.losses)
Пример #4
0
 def _help(self, model):
     """Run a short, silent LCWA pipeline for ``model`` on NationsLiteral and return its result."""
     silent = {'use_tqdm': False}
     outcome = pipeline(
         dataset=NationsLiteral,
         model=model,
         training_kwargs={'num_epochs': 5, **silent},
         evaluation_kwargs=dict(silent),
         training_loop='lcwa',
     )
     return outcome
Пример #5
0
 def test_pipeline(self):
     """Run TransE on nations with default settings and check the types in the result."""
     outcome = pipeline(model='TransE', dataset='nations')
     self.assertIsInstance(outcome, PipelineResult)
     trained_model = outcome.model
     self.assertIsInstance(trained_model, Model)
     # without an explicit regularizer the model is expected to carry the no-op one
     self.assertIsInstance(trained_model.regularizer, NoRegularizer)
Пример #6
0
    def train_embedding(self, g, model="SimplE"):
        """Train a knowledge-graph embedding for the concepts of ``g`` and install it on ``self``.

        Every concept contributes ``'semantic'`` triples towards the ids returned by
        ``g.get_semantic_ids`` and, when the respective value is non-zero, one ``'pleasent'``
        and one ``'sensitiv'`` triple pointing at a pseudo-node that encodes the sign of
        ``c.pleasentness`` / ``c.sensitivity``.  A random 90% of the triples is used for
        training and the remainder for testing.

        :param g: object exposing ``concepts`` (items with ``index``, ``text``, ``pleasentness``
            and ``sensitivity``) and ``get_semantic_ids(concept)``
        :param model: model handed to the pykeen pipeline
        :return: the pipeline result

        Side effects: replaces ``self.word2id`` and ``self.embedding``; prints the split sizes.
        """
        # pykeen
        from pykeen.pipeline import pipeline
        from pykeen.triples import TriplesFactory

        train_fraction = 0.9
        num_concepts = len(g.concepts)
        # create pseudo-nodes to encode node attributes
        pleasent, not_pleasent = num_concepts, num_concepts + 1
        sensitiv, not_sensitive = num_concepts + 2, num_concepts + 3
        # build triples
        triples = []
        for c in g.concepts:
            # actual connections
            triples.extend([c.index, 'semantic', j] for j in g.get_semantic_ids(c))
            # encode attributes by binning on the sign; zero-valued attributes add no triple
            if c.pleasentness != 0:
                triples.append([c.index, 'pleasent', pleasent if c.pleasentness > 0 else not_pleasent])
            if c.sensitivity != 0:
                triples.append([c.index, 'sensitiv', sensitiv if c.sensitivity > 0 else not_sensitive])
        triples, n = np.asarray(triples), len(triples)
        # one split size drives both the report and the mask, so the printed number is the real one
        n_train = int(n * train_fraction)
        print("Number of Triples (Train/Total): %i/%i" % (n_train, n))
        # create a randomly shuffled mask separating training from testing triples
        train_mask = np.full(n, False)
        train_mask[:n_train] = True
        np.random.shuffle(train_mask)
        # separate into training and testing
        train_triples = triples[train_mask]
        test_triples = triples[~train_mask]
        # create triples factories
        # NOTE(review): the two factories are built independently, so their label-to-id mappings
        # may differ - confirm whether the test factory should reuse the training mappings.
        train_factory = TriplesFactory(triples=train_triples)
        test_factory = TriplesFactory(triples=test_triples)
        # create and run pipeline
        results = pipeline(
            # data
            training_triples_factory=train_factory,
            testing_triples_factory=test_factory,
            # model
            model=model,
            model_kwargs={
                "embedding_dim": self.embedd_dim,
                "automatic_memory_optimization": True,
            },
        )
        # get embedding tensor - remove pseudo nodes
        # NOTE(review): this assumes the first len(g.concepts) embedding rows belong to the
        # concepts in order - confirm against the entity ids assigned by the factory.
        weight = results.model.entity_embeddings.weight[:num_concepts, ...].cpu()

        # update word2id
        words = [c.text for c in g.concepts]
        self.word2id = OrderedDict(zip(words, range(1, len(words) + 1)))  # 0th element is padding
        # update embeddings - add an all-zero padding embedding at position 0
        self.embedding = nn.Embedding(
            num_embeddings=len(words) + 1,
            embedding_dim=self.embedd_dim,
            _weight=torch.cat((torch.zeros((1, self.embedd_dim)), weight), dim=0),
        )
        # return results
        return results
Пример #7
0
 def setUpClass(cls):
     """Train TransE on nations for five epochs and store result, model and test triples on the class."""
     training_options = {'num_epochs': 5}
     cls.result = pipeline(
         model='TransE',
         dataset='nations',
         training_kwargs=training_options,
     )
     cls.model = cls.result.model
     # keep the testing triples on the same device as the trained model
     testing_triples = Nations().testing.mapped_triples
     cls.testing_mapped_triples = testing_triples.to(cls.model.device)
Пример #8
0
 def test_unlabeled_triples(self):
     """Check that the pipeline runs when the unlabeled triples factories are passed directly."""
     factories = {
         'training': self.training,
         'testing': self.testing,
         'validation': self.validation,
     }
     silent = {'use_tqdm': False}
     pipeline(
         **factories,
         model='TransE',
         training_kwargs={'num_epochs': 1, **silent},
         evaluation_kwargs=dict(silent),
     )
Пример #9
0
 def test_specify_regularizer(self):
     """Check that requesting the 'powersum' regularizer ends up on the trained model."""
     outcome = pipeline(model=TransE, dataset='nations', regularizer='powersum')
     self.assertIsInstance(outcome, PipelineResult)
     trained_model = outcome.model
     self.assertIsInstance(trained_model, Model)
     self.assertIsInstance(trained_model.regularizer, PowerSumRegularizer)
Пример #10
0
 def _help(self, model):
     """Run a short LCWA pipeline for ``model`` on NationsLiteral and check the result can be saved."""
     silent = {'use_tqdm': False}
     outcome = pipeline(
         dataset=NationsLiteral,
         model=model,
         training_kwargs={'num_epochs': 5, **silent},
         evaluation_kwargs=dict(silent),
         training_loop='lcwa',
     )
     self.assertIsNotNone(outcome)
     # saving goes to a throw-away directory that is removed again afterwards
     with tempfile.TemporaryDirectory() as directory:
         outcome.save_to_directory(directory)
Пример #11
0
 def test_eager_unlabeled_dataset(self):
     """Check that the pipeline runs on unlabeled triples factories wrapped in an EagerDataset."""
     wrapped = EagerDataset(
         training=self.training,
         testing=self.testing,
         validation=self.validation,
     )
     silent = {'use_tqdm': False}
     pipeline(
         dataset=wrapped,
         model='TransE',
         training_kwargs={'num_epochs': 1, **silent},
         evaluation_kwargs=dict(silent),
     )
Пример #12
0
 def _help_test_interaction_resolver(self, model_cls):
     """Check ``model_cls`` is an ERModel with a TransE interaction (p=2) and that it trains."""
     self.assertTrue(issubclass(model_cls, ERModel))
     interaction = model_cls._interaction
     self.assertIsInstance(interaction, TransEInteraction)
     self.assertEqual(2, interaction.p)
     silent = {'use_tqdm': False}
     pipeline(
         training=self.training,
         testing=self.testing,
         validation=self.validation,
         model=model_cls,
         training_kwargs={'num_epochs': 1, **silent},
         evaluation_kwargs=dict(silent),
         random_seed=0,
     )
Пример #13
0
 def setUpClass(cls):
     """Train TransE on nations on the resolved 'cuda' device and store the shared fixtures."""
     cls.device = resolve_device('cuda')
     silent = {'use_tqdm': False}
     cls.result = pipeline(
         model='TransE',
         dataset='nations',
         training_kwargs={'num_epochs': 5, **silent},
         evaluation_kwargs=dict(silent),
         device=cls.device,
         random_seed=42,
     )
     cls.model = cls.result.model
     # keep the testing triples on the same device as the trained model
     testing_triples = Nations().testing.mapped_triples
     cls.testing_mapped_triples = testing_triples.to(cls.model.device)
Пример #14
0
 def test_pipeline(self):
     """Run RotatE on nations with the NSSA loss and check the loss settings on the model."""
     loss_cls = NSSALoss
     expected_margin = expected_temperature = 1.
     outcome = pipeline(
         model='RotatE',
         dataset='nations',
         loss=loss_cls,
         loss_kwargs={
             "margin": expected_margin,
             "adversarial_temperature": expected_temperature,
         },
     )
     self.assertIsInstance(outcome, PipelineResult)
     trained_loss = outcome.model.loss
     self.assertIsInstance(trained_loss, loss_cls)
     self.assertEqual(trained_loss.margin, expected_margin)
     self.assertEqual(trained_loss.adversarial_temperature, expected_temperature)
Пример #15
0
 def test_pipeline_evaluation_filtering_with_validation_triples(self):
     """Check via the pipeline that filtered evaluation with validation triples gives a mean rank of 1."""
     no_memory_optimization = {'automatic_memory_optimization': False}
     silent = {'use_tqdm': False}
     results = pipeline(
         model=self.model,
         dataset=self.dataset,
         training_loop_kwargs=dict(no_memory_optimization),
         # zero epochs: the model is evaluated without any training
         training_kwargs={'num_epochs': 0, **silent},
         evaluator_kwargs={'filtered': True, **no_memory_optimization},
         evaluation_kwargs=dict(silent),
         device=self.device,
         random_seed=42,
         filter_validation_when_testing=True,
     )
     realistic_mean_rank = results.metric_results.arithmetic_mean_rank['both']['realistic']
     assert realistic_mean_rank == 1, 'The rank should equal 1'
Пример #16
0
 def test_pipeline(self):
     """Run RotatE on nations with the NSSA loss and check the loss settings on the model."""
     loss_cls = NSSALoss
     expected_margin = expected_temperature = 1.0
     outcome = pipeline(
         model="RotatE",
         dataset="nations",
         loss=loss_cls,
         loss_kwargs={
             "margin": expected_margin,
             "adversarial_temperature": expected_temperature,
         },
         training_kwargs={"use_tqdm": False},
     )
     self.assertIsInstance(outcome, PipelineResult)
     trained_loss = outcome.model.loss
     self.assertIsInstance(trained_loss, loss_cls)
     self.assertEqual(trained_loss.margin, expected_margin)
     # the loss is configured via ``adversarial_temperature`` but read back under this name
     self.assertEqual(trained_loss.inverse_softmax_temperature, expected_temperature)
Пример #17
0
def train(outfolder, training, epochs=10):
    """Train TransE on ``training`` and save the result and its id-to-label maps to ``outfolder``.

    :param outfolder: directory receiving the saved pipeline result plus
        ``entity_id_to_label.json`` and ``relation_id_to_label.json``
    :param training: training triples handed to the pipeline; also used as the testing set
    :param epochs: number of training epochs
    :return: the pipeline result
    """
    from pykeen.pipeline import pipeline

    # evaluation runs on the training triples themselves; no separate test split is supplied
    result = pipeline(
        training=training,
        testing=training,
        model='TransE',
        training_kwargs=dict(num_epochs=epochs),
    )
    result.save_to_directory(outfolder)

    # write the id -> label lookups next to the saved result
    label_maps = {
        "entity_id_to_label": result.training.entity_id_to_label,
        "relation_id_to_label": result.training.relation_id_to_label,
    }
    for name, mapping in label_maps.items():
        with open("{}/{}.json".format(outfolder, name), 'w') as outfile:
            json.dump(mapping, outfile, indent=2)
    return result
Пример #18
0
 def test_specify_regularizer(self):
     """Check that each way of specifying a regularizer yields the expected regularizer class."""
     # (value passed to the pipeline, class expected on the trained model)
     cases = (
         (None, pykeen.regularizers.NoRegularizer),
         ('no', pykeen.regularizers.NoRegularizer),
         (NoRegularizer, pykeen.regularizers.NoRegularizer),
         ('powersum', pykeen.regularizers.PowerSumRegularizer),
         ('lp', pykeen.regularizers.LpRegularizer),
     )
     for regularizer, expected_cls in cases:
         with self.subTest(regularizer=regularizer):
             outcome = pipeline(
                 model='TransE',
                 dataset='Nations',
                 regularizer=regularizer,
                 training_kwargs={'num_epochs': 1},
             )
             self.assertIsInstance(outcome, PipelineResult)
             trained_model = outcome.model
             self.assertIsInstance(trained_model, Model)
             self.assertIsInstance(trained_model.regularizer, expected_cls)
Пример #19
0
 def test_interaction_instance_builder(self):
     """Build a model instance from an interaction class and check it works in the pipeline."""
     built_model = make_model(
         dimensions={"d": 3},
         interaction=TransEInteraction,
         interaction_kwargs={'p': 2},
         triples_factory=self.training,
     )
     self.assertIsInstance(built_model, ERModel)
     interaction = built_model.interaction
     self.assertIsInstance(interaction, TransEInteraction)
     self.assertEqual(2, interaction.p)
     silent = {'use_tqdm': False}
     # the already constructed instance, not a class or name, is handed to the pipeline
     pipeline(
         training=self.training,
         testing=self.testing,
         validation=self.validation,
         model=built_model,
         training_kwargs={'num_epochs': 1, **silent},
         evaluation_kwargs=dict(silent),
         random_seed=0,
     )
Пример #20
0
 def __fit(
     self,
     model_text,
     num_epochs,
     train_batch_size,
     eval_batch_size,
     model_location,
 ):
     """
     Train the model through the pipeline, save the result and keep result and model on ``self``.

     :param model_text: model handed to the pipeline
     :param num_epochs: number of training epochs
     :param train_batch_size: batch size used for training
     :param eval_batch_size: batch size used for evaluation
     :param model_location: directory the pipeline result is saved to

     This can be extended for better optimization; for scalability the training/model kwargs
     could be read from a JSON config.
     """
     training_options = {'num_epochs': num_epochs, 'batch_size': train_batch_size}
     evaluation_options = {'batch_size': eval_batch_size}
     outcome = pipeline(
         training=self.__training,
         validation=self.__valid,
         testing=self.__testing,
         model=model_text,
         training_kwargs=training_options,
         evaluation_kwargs=evaluation_options,
     )
     self.__result = outcome

     outcome.save_to_directory(model_location)
     self.__model = outcome.model
Пример #21
0
from pykeen.triples import TriplesFactory
from pykeen.pipeline import pipeline

# Paths of the three dataset splits.
training_path: str = "kg/train.hrt.txt"
validation_path: str = "kg/valid.hrt.txt"
testing_path: str = "kg/test.hrt.txt"

training = TriplesFactory(path=training_path)
# Validation and testing reuse the entity/relation id mappings of the training factory.
valid, testing = (
    TriplesFactory(
        path=split_path,
        entity_to_id=training.entity_to_id,
        relation_to_id=training.relation_to_id,
    )
    for split_path in (validation_path, testing_path)
)

result = pipeline(
    training=training,
    validation=valid,
    testing=testing,
    model='TransE',
    training_kwargs={'num_epochs': 2, 'batch_size': 512},
    evaluation_kwargs={'batch_size': 128},
)
result.save_to_directory('saved-model')

# Load the model back from disk (presumably the file written by save_to_directory above)
# and query head predictions for one relation/tail pair.
import torch
model = torch.load('saved-model/trained_model.pkl')
print(model.predict_heads('VARIANT_DISEASE_associated', 'Leigh_syndrome'))
Пример #22
0
from pykeen.triples import TriplesFactory
from pykeen.evaluation import RankBasedEvaluator
from pykeen.pipeline import pipeline
import json

# Both values select the input file via its name.
n_tokeep = 300
minimum = 500

tf = TriplesFactory.from_path(f'data/rare/rare_{minimum}_{n_tokeep}.csv')
# 80% of the triples for training, 20% for testing
training, testing = tf.split([.8, .2])

result_pipeline = pipeline(
    training=training,
    testing=testing,
    model='RESCAL',
    model_kwargs={'embedding_dim': 300},
    # Only the epoch count is set; sampler, checkpoint name/frequency and batch size
    # are not passed.
    training_kwargs={'num_epochs': 200},
    evaluator=RankBasedEvaluator,
    evaluator_kwargs={'ks': [50]},
)
result_pipeline.plot_losses()

result_pipeline.plot()
Пример #23
0
def run_inverse_stability_workflow(dataset: str, model: str, training_loop: str, random_seed=0, device='cpu'):
    """Run an inverse stability experiment.

    The model is trained with early stopping on the dataset (loaded with inverse triples), the
    testing triples are scored with both ``score_hrt`` and ``score_hrt_inverse``, and the scores
    are written as a TSV file and two histogram plots into ``INVERSE_STABILITY / <dataset name>``.

    :param dataset: dataset specification passed to ``get_dataset``
    :param model: model specification passed to ``get_model_cls`` and to the pipeline
    :param training_loop: training loop passed to the pipeline; also used in the output file names
    :param random_seed: random seed passed to the pipeline
    :param device: device passed to the pipeline
    :return: data frame with the columns ``training_loop``, ``dataset``, ``model``, ``forward``
        and ``inverse``
    """
    dataset_instance: Dataset = get_dataset(
        dataset=dataset,
        dataset_kwargs=dict(
            create_inverse_triples=True,
        ),
    )
    dataset_name = dataset_instance.get_normalized_name()
    model_cls: Type[Model] = get_model_cls(model)
    model_name = model_cls.__name__.lower()

    dataset_dir = INVERSE_STABILITY / dataset_name
    dataset_dir.mkdir(exist_ok=True, parents=True)

    pipeline_result = pipeline(
        dataset=dataset_instance,
        model=model,
        training_loop=training_loop,
        training_kwargs=dict(
            num_epochs=1000,
            use_tqdm_batch=False,
        ),
        stopper='early',
        stopper_kwargs=dict(patience=5, frequency=5),
        random_seed=random_seed,
        device=device,
    )
    trained_model = pipeline_result.model
    # Score on whatever device the model lives on, then bring the scores back to the CPU:
    # ``Tensor.numpy()`` only works for CPU tensors, so this keeps non-CPU devices working.
    test_triples = dataset_instance.testing.mapped_triples.to(trained_model.device)

    # Score with original triples
    scores_forward = trained_model.score_hrt(test_triples)
    scores_forward_np = scores_forward.detach().cpu().numpy()[:, 0]

    # Score with inverse triples
    scores_inverse = trained_model.score_hrt_inverse(test_triples)
    scores_inverse_np = scores_inverse.detach().cpu().numpy()[:, 0]

    scores_path = dataset_dir / f'{model_name}_{training_loop}_scores.tsv'
    df = pd.DataFrame(
        list(zip(
            itt.repeat(training_loop),
            itt.repeat(dataset_name),
            itt.repeat(model_name),
            scores_forward_np,
            scores_inverse_np,
        )),
        columns=['training_loop', 'dataset', 'model', 'forward', 'inverse'],
    )
    df.to_csv(scores_path, sep='\t', index=False)

    # Overlay of the forward and inverse score distributions
    fig, ax = plt.subplots(1, 1)
    sns.histplot(data=df, x='forward', label='Forward', ax=ax, color='blue', stat="density")
    sns.histplot(data=df, x='inverse', label='Inverse', ax=ax, color='orange', stat="density")
    ax.set_title(f'{dataset_name} - {model_name} - {training_loop}')
    ax.set_xlabel('Score')
    plt.legend()
    plt.savefig(dataset_dir / f'{model_name}_{training_loop}_overlay.png', dpi=300)
    plt.close(fig)

    # Distribution of the per-triple difference between forward and inverse score
    fig, ax = plt.subplots(1, 1)
    sns.histplot(scores_forward_np - scores_inverse_np, ax=ax, stat="density")
    ax.set_title(f'{dataset_name} - {model_name} - {training_loop}')
    ax.set_xlabel('Forward - Inverse Score Difference')
    plt.savefig(dataset_dir / f'{model_name}_{training_loop}_residuals.png', dpi=300)
    plt.close(fig)

    return df
Пример #24
0
def run_inverse_stability_workflow(dataset: str,
                                   model: str,
                                   training_loop: str,
                                   random_seed=0,
                                   device="cpu"):
    """Run an inverse stability experiment.

    The model is trained with early stopping on the dataset (loaded with inverse triples), the
    testing triples are scored with both ``score_hrt`` and ``score_hrt_inverse``, and the scores
    are written as a TSV file and two histogram plots into ``INVERSE_STABILITY / <dataset name>``.

    :param dataset: dataset specification passed to ``get_dataset``
    :param model: model specification passed to ``model_resolver.lookup`` and to the pipeline
    :param training_loop: training loop passed to the pipeline; also used in the output file names
    :param random_seed: random seed passed to the pipeline
    :param device: device passed to the pipeline
    :return: data frame with the columns ``training_loop``, ``dataset``, ``model``, ``forward``
        and ``inverse``
    """
    dataset_instance: Dataset = get_dataset(
        dataset=dataset,
        dataset_kwargs=dict(create_inverse_triples=True, ),
    )
    dataset_name = dataset_instance.get_normalized_name()
    model_cls: Type[Model] = model_resolver.lookup(model)
    model_name = model_cls.__name__.lower()

    dataset_dir = INVERSE_STABILITY / dataset_name
    dataset_dir.mkdir(exist_ok=True, parents=True)

    pipeline_result = pipeline(
        dataset=dataset_instance,
        model=model,
        training_loop=training_loop,
        training_kwargs=dict(
            num_epochs=1000,
            use_tqdm_batch=False,
        ),
        stopper="early",
        stopper_kwargs=dict(patience=5, frequency=5),
        random_seed=random_seed,
        device=device,
    )
    trained_model = pipeline_result.model
    # Score on whatever device the model lives on, then bring the scores back to the CPU:
    # ``Tensor.numpy()`` only works for CPU tensors, so this keeps non-CPU devices working.
    test_triples = dataset_instance.testing.mapped_triples.to(
        trained_model.device)

    # Score with original triples
    scores_forward = trained_model.score_hrt(test_triples)
    scores_forward_np = scores_forward.detach().cpu().numpy()[:, 0]

    # Score with inverse triples
    scores_inverse = trained_model.score_hrt_inverse(test_triples)
    scores_inverse_np = scores_inverse.detach().cpu().numpy()[:, 0]

    scores_path = dataset_dir / f"{model_name}_{training_loop}_scores.tsv"
    df = pd.DataFrame(
        list(
            zip(
                itt.repeat(training_loop),
                itt.repeat(dataset_name),
                itt.repeat(model_name),
                scores_forward_np,
                scores_inverse_np,
            )),
        columns=["training_loop", "dataset", "model", "forward", "inverse"],
    )
    df.to_csv(scores_path, sep="\t", index=False)

    # Overlay of the forward and inverse score distributions
    fig, ax = plt.subplots(1, 1)
    sns.histplot(data=df,
                 x="forward",
                 label="Forward",
                 ax=ax,
                 color="blue",
                 stat="density")
    sns.histplot(data=df,
                 x="inverse",
                 label="Inverse",
                 ax=ax,
                 color="orange",
                 stat="density")
    ax.set_title(f"{dataset_name} - {model_name} - {training_loop}")
    ax.set_xlabel("Score")
    plt.legend()
    plt.savefig(dataset_dir / f"{model_name}_{training_loop}_overlay.png",
                dpi=300)
    plt.close(fig)

    # Distribution of the per-triple difference between forward and inverse score
    fig, ax = plt.subplots(1, 1)
    sns.histplot(scores_forward_np - scores_inverse_np, ax=ax, stat="density")
    ax.set_title(f"{dataset_name} - {model_name} - {training_loop}")
    ax.set_xlabel("Forward - Inverse Score Difference")
    plt.savefig(dataset_dir / f"{model_name}_{training_loop}_residuals.png",
                dpi=300)
    plt.close(fig)

    return df
Пример #25
0
valid._num_relations = _num_relations
test = TriplesFactory(
    path=test_path,
    entity_to_id=entity_to_id,
    relation_to_id=relation_to_id,
)
# Overwrite the factory's private counts with `_num_entities` / `_num_relations`,
# as done for `valid._num_relations` above.
test._num_entities, test._num_relations = _num_entities, _num_relations

model = 'TransE'
# NOTE(review): 'stopped' looks like stopper state rather than a setting - confirm it is
# meant to be passed as a keyword argument here.
early_stopping_options = {
    'frequency': 10,
    'stopped': True,
    'patience': 1,
}
result = pipeline(
    model=model,
    training_triples_factory=train,
    validation_triples_factory=valid,
    testing_triples_factory=test,
    training_kwargs={'num_epochs': 300},  # 30
    model_kwargs={'embedding_dim': 300},
    stopper='early',
    stopper_kwargs=early_stopping_options,
    evaluation_kwargs={'batch_size': 32},
    optimizer_kwargs={'lr': 0.1},
)

# =============================================================================
# print(result.metric_results.hits_at_k['avg'])
# print(result.metric_results.hits_at_k['pred'])
# np.save(f'{dataset}_{model}_pred.npy', result.metric_results.hits_at_k['pred'])
# =============================================================================
print(result)
# the output directory is named after the dataset and the model
result.save_to_directory(f'{dataset}_{model}')