Пример #1
0
def test_logger():
    """Check level filtering and instance sharing of ``Logger``.

    A stream handler writing into an in-memory buffer is attached to the
    logger named "TEST". After the level is set to WARNING through the
    ``Logger`` object, one message is emitted at every standard level and the
    buffer is inspected: DEBUG and INFO must be absent, WARNING and above must
    be present. The test also asserts that ``Logger()`` returns the very same
    object again and that a logger requested after the level change reports
    the same level as the ``Logger`` object.
    """
    logger_name = "TEST"
    log = Logger()
    logger = log.get_logger(logger_name)
    captured = io.StringIO()
    test_handler = logging.StreamHandler(captured)
    test_handler.setFormatter(logging.Formatter(Logger._FORMAT))
    logger.addHandler(test_handler)
    message = "This is a log message"

    try:
        log.level = logging.WARNING
        logger.debug(message)
        logger.info(message)
        logger.warning(message)
        logger.error(message)
        logger.critical(message)
        another_logger = log.get_logger("ANOTHER")

        result = captured.getvalue()
    finally:
        # Detach the handler before closing its stream; otherwise the logger
        # keeps a handler that writes to a closed buffer for the rest of the
        # test session.
        logger.removeHandler(test_handler)
        captured.close()

    assert "%s - DEBUG - %s" % (logger_name, message) not in result
    assert "%s - INFO - %s" % (logger_name, message) not in result
    assert "%s - WARNING - %s" % (logger_name, message) in result
    assert "%s - ERROR - %s" % (logger_name, message) in result
    assert "%s - CRITICAL - %s" % (logger_name, message) in result
    assert Logger() is log
    assert another_logger.level == log.level
Пример #2
0
class EarlyStopper:
    """Early-stopping helper used by the trainer of KGE algorithms.

    The stopper compares each new set of metrics with the set it saw on the
    previous call and keeps a countdown of how many consecutive degradations
    are still tolerated.

        Args:
            patience (int): Number of consecutive degradations tolerated before
            the training is stopped. With a negative value neither the
            countdown branch nor the stop branch can trigger, so training is
            never stopped.
            monitor (Monitor): The metric to watch. Its ``value`` is used as the
            key into the metrics mapping and its ``name`` is used in log
            messages.

    """

    _logger = Logger().get_logger(__name__)

    def __init__(self, patience, monitor):
        self.patience = patience
        self.monitor = monitor

        # State carried from one should_stop() call to the next.
        self.patience_left = patience
        self.previous_metrics = None

    def should_stop(self, curr_metrics):
        """Return True when training should stop, and remember ``curr_metrics``.

        Args:
            curr_metrics: Mapping indexed by ``monitor.value`` holding the
            latest evaluation results.

        Returns:
            bool: True only when the monitored metric got worse and no patience
            is left; False otherwise (including on the very first call).
        """
        metric_key, metric_name = self.monitor.value, self.monitor.name
        stop_now = False

        if self.previous_metrics is not None:
            # For the rank metrics a larger number is a degradation; for every
            # other monitor a smaller number is.
            lower_is_better = self.monitor in (Monitor.MEAN_RANK, Monitor.FILTERED_MEAN_RANK)
            if lower_is_better:
                got_worse = self.previous_metrics[metric_key] < curr_metrics[metric_key]
            else:
                got_worse = self.previous_metrics[metric_key] > curr_metrics[metric_key]

            if got_worse and self.patience_left > 0:
                self.patience_left -= 1
                report = (
                    '%s more chances before the trainer stops the training. (prev_%s, curr_%s): (%.4f, %.4f)'
                    % (self.patience_left, metric_name, metric_name,
                       self.previous_metrics[metric_key], curr_metrics[metric_key]))
                self._logger.info(report)
            elif got_worse and self.patience_left == 0:
                self._logger.info('Stop the training.')
                stop_now = True
            else:
                # Reached on improvement, on an unchanged metric, and whenever
                # the remaining patience is negative.
                self._logger.info('Reset the patience count to %d' % (self.patience))
                self.patience_left = self.patience

        self.previous_metrics = curr_metrics
        return stop_now
Пример #3
0
class EarlyStopper:
    """Decide, from successive metric snapshots, when training should stop.

    Each call to :meth:`should_stop` compares the new metrics with those of
    the previous call. A degradation consumes one unit of patience; anything
    else restores the full patience. Training is stopped when a degradation
    is seen while no patience is left.
    """

    _logger = Logger().get_logger(__name__)

    def __init__(self, patience, monitor):
        # Configuration.
        self.monitor = monitor
        self.patience = patience

        # Running state.
        self.previous_metrics = None
        self.patience_left = patience

    def _is_worse(self, key, curr_metrics):
        """Tell whether ``curr_metrics[key]`` degraded versus the previous call."""
        if self.monitor == Monitor.MEAN_RANK or self.monitor == Monitor.FILTERED_MEAN_RANK:
            # Rank metrics: a growing value is a degradation.
            return self.previous_metrics[key] < curr_metrics[key]
        return self.previous_metrics[key] > curr_metrics[key]

    def should_stop(self, curr_metrics):
        """Record ``curr_metrics`` and report whether the trainer should stop."""
        key, label = self.monitor.value, self.monitor.name

        # Nothing to compare against on the first call.
        if self.previous_metrics is None:
            self.previous_metrics = curr_metrics
            return False

        degraded = self._is_worse(key, curr_metrics)
        decision = False

        if degraded and self.patience_left > 0:
            self.patience_left -= 1
            self._logger.info(
                '%s more chances before the trainer stops the training. (prev_%s, curr_%s): (%.4f, %.4f)' % (
                    self.patience_left, label, label,
                    self.previous_metrics[key], curr_metrics[key]))
        elif degraded and self.patience_left == 0:
            self._logger.info('Stop the training.')
            decision = True
        else:
            self._logger.info('Reset the patience count to %d' % (self.patience))
            self.patience_left = self.patience

        self.previous_metrics = curr_metrics
        return decision
Пример #4
0
class Config:
    """ The class defines the basic configuration for the pykg2vec.

        Config consists of the necessary parameter description used by all the
        modules including the algorithms and utility functions. Every attribute
        of ``args`` is copied onto the instance, followed by a few fixed
        settings, the metadata of the knowledge graph and, when ``args.exp`` is
        True, the hyperparameters returned by the hyperparameter loader.

        Args:
            args: Parsed arguments object. The constructor reads
                ``dataset_name``, ``dataset_path``, ``exp`` and ``model_name``
                from it and copies every attribute found in ``vars(args)``.

        Attributes:
            NOTE(review): the list below is carried over from the original
            docstring; most entries are expected to come from ``args`` --
            confirm against the argument parser.

            test_step (int): Testing is carried out every test_step.
            test_num (int): Number of triples that will be tested during evaluation.
            triple_num (int): Number of triples that will be used for plotting the embedding.
            tmp (Path Object): Path where temporary model information is stored.
            result (Path Object): Gives the path where the result will be saved.
            figures (Path Object): Gives the path where the figures will be saved.
            load_from_data (bool): If True, loads the model parameters if available from memory.
            save_model (bool): If True, store the trained model parameters.
            disp_summary (bool): If True, display the summary before and after training the algorithm.
            disp_result (bool): If True, displays result while training.
            plot_embedding (bool): If True, will plot the embedding after performing t-SNE based dimensionality reduction.
            log_training_placement (bool): If True, allows us to find out which devices the operations and tensors are assigned to.
            plot_training_result (bool): If True, plots the loss values stored during training.
            plot_testing_result (bool): If True, it will plot all the testing result such as mean rank, hit ratio, etc.
            plot_entity_only (bool): If True, plots the t-SNE reduced embedding of the entities in a figure.
            hits (List): Gives the list of integer for calculating hits.
            knowledge_graph (Object): It prepares and holds the instance of the knowledge graph dataset.
            kg_meta (object): Stores the statistics metadata of the knowledge graph.

    """
    _logger = Logger().get_logger(__name__)

    def __init__(self, args):

        for arg_name in vars(args):
            self.__dict__[arg_name] = getattr(args, arg_name)

        # Training and evaluating related variables
        self.hits = [1, 3, 5, 10]
        self.disp_result = False
        self.patience = 3  # should make this configurable as well.

        # Visualization related,
        # p.s. the visualizer is disabled for most of the KGE methods for now.
        self.disp_triple_num = 20
        self.plot_training_result = True
        self.plot_testing_result = True

        # Knowledge Graph Information
        self.knowledge_graph = KnowledgeGraph(
            dataset=args.dataset_name, custom_dataset_path=args.dataset_path)
        # Flatten the knowledge graph metadata into this config object.
        for key, value in self.knowledge_graph.kg_meta.__dict__.items():
            self.__dict__[key] = value

        # The results of training will be stored in the following folders
        # which are relative to the parent folder (the path of the dataset).
        dataset_path = self.knowledge_graph.dataset.dataset_path
        self.path_tmp = dataset_path / 'intermediate'
        self.path_tmp.mkdir(parents=True, exist_ok=True)
        self.path_result = dataset_path / 'results'
        self.path_result.mkdir(parents=True, exist_ok=True)
        self.path_figures = dataset_path / 'figures'
        self.path_figures.mkdir(parents=True, exist_ok=True)
        self.path_embeddings = dataset_path / 'embeddings'
        self.path_embeddings.mkdir(parents=True, exist_ok=True)

        if args.exp is True:
            # NOTE(review): the loader name is spelled exactly as in the
            # original code; confirm it matches the class defined elsewhere.
            paper_params = HyperparamterLoader().load_hyperparameter(
                args.dataset_name, args.model_name)
            # Copy all the settings from the paper; they override anything
            # set above under the same name.
            for key, value in paper_params.items():
                self.__dict__[key] = value

    def summary(self):
        """Log every setting held by this object as one right-aligned table.

        Values that are ``KGMetaData`` or ``KnowledgeGraph`` instances are
        left out of the listing.
        """
        summary = []
        summary.append("")
        summary.append("------------------Global Setting--------------------")
        # Column width: the length of the longest attribute name plus 20
        # spaces. (Taking max() of the names themselves would pick the
        # alphabetically last name instead of the longest one.)
        maxspace = max((len(key) for key in self.__dict__), default=0) + 20
        for key, val in self.__dict__.items():
            if isinstance(val, (KGMetaData, KnowledgeGraph)):
                continue

            summary.append("%s : %s" % (key.rjust(maxspace), val))
        summary.append("---------------------------------------------------")
        summary.append("")
        self._logger.info("\n".join(summary))
Пример #5
0
class Importer:
    """The class defines methods for importing pykg2vec modules.

    Importer is used to define the map from algorithm names to model classes
    and provides a method for loading the configuration class and a model
    class.

    Attributes:
        model_path (str): Dotted path of the package where the models are defined.
        config_path (str): Dotted path of the module where the ``Config`` class is defined.
        modelMap (dict): Maps a lowercase model name to ``"<submodule>.<ClassName>"``
            relative to ``model_path``.

    Examples:
        >>> from pykg2vec import Importer
        >>> config_def, model_def = Importer().import_model_config('transe')
        >>> config = config_def()
        >>> model = model_def(config)

    """
    _logger = Logger().get_logger(__name__)

    def __init__(self):
        self.model_path = "pykg2vec.models"
        self.config_path = "pykg2vec.config"
        self.modelMap = {"analogy": "pointwise.ANALOGY",
                         "complex": "pointwise.Complex",
                         "complexn3": "pointwise.ComplexN3",
                         "conve": "projection.ConvE",
                         "convkb": "pointwise.ConvKB",
                         "cp": "pointwise.CP",
                         "hole": "pairwise.HoLE",
                         "distmult": "pointwise.DistMult",
                         "kg2e": "pairwise.KG2E",
                         "ntn": "pairwise.NTN",
                         "proje_pointwise": "projection.ProjE_pointwise",
                         "rescal": "pairwise.Rescal",
                         "rotate": "pairwise.RotatE",
                         "simple": "pointwise.SimplE",
                         "simple_ignr": "pointwise.SimplE_ignr",
                         "slm": "pairwise.SLM",
                         "sme": "pairwise.SME",
                         "sme_bl": "pairwise.SME_BL",
                         "transd": "pairwise.TransD",
                         "transe": "pairwise.TransE",
                         "transh": "pairwise.TransH",
                         "transm": "pairwise.TransM",
                         "transr": "pairwise.TransR",
                         "tucker": "projection.TuckER"}

    def _not_implemented_message(self, name):
        """Build the error text listing the class names of all known models."""
        known_models = ' '.join(str(path).split(".")[1] for path in self.modelMap.values())
        return "%s model has not been implemented. please select from: %s" % (name, known_models)

    def import_model_config(self, name):
        """This function imports models and configuration.

        This function is used to dynamically import the modules within
        pykg2vec.

        Args:
          name (str): Name of the model. It is looked up in ``modelMap`` as
            given, without any case conversion, so it must match one of the
            (lowercase) keys exactly.

        Returns:
          tuple: ``(config_obj, model_obj)``.

          `config_obj` (object): The ``Config`` class found in the module named by ``config_path``.
          `model_obj` (object): The model class of the corresponding algorithm.

        Raises:
          ValueError: If ``name`` is not a key of ``modelMap``, or if the
            submodule that should contain the model cannot be imported.
        """
        config_obj = getattr(importlib.import_module(self.config_path), "Config")

        if name not in self.modelMap:
            raise ValueError(self._not_implemented_message(name))

        splited_path = self.modelMap[name].split('.')
        try:
            model_module = importlib.import_module(self.model_path + ".%s" % splited_path[0])
            model_obj = getattr(model_module, splited_path[1])
        except ModuleNotFoundError as err:
            # The name is registered but its module is missing: log it and
            # report it to the caller the same way as an unknown name.
            message = self._not_implemented_message(name)
            self._logger.error(message)
            raise ValueError(message) from err

        return config_obj, model_obj
Пример #6
0
class HyperparameterLoader:
    """Hyper parameters loading based on datasets and embedding algorithms.

    On construction, every ``*.yaml`` file below the ``searchspaces`` and
    ``hyperparams`` directories located next to this module is loaded. Files
    named by ``args.ss_abs_file`` / ``args.hp_abs_file`` are loaded afterwards,
    so their entries replace default entries stored under the same key.

    Attributes:
        hyperparams (dict): ``{dataset: {algorithm: parameters}}``; the
            algorithm key is the lowercased ``model_name`` of the YAML file.
        search_space (dict): ``{algorithm: search space}``; keys are the
            lowercased ``model_name`` of the YAML file.
    """

    _logger = Logger().get_logger(__name__)

    def __init__(self, args):
        self.hyperparams = {}
        self.search_space = {}

        package_dir = Path(__file__).resolve().parent

        # Load the defaults shipped in the directories next to this module.
        for config_file in (package_dir / "searchspaces").glob('**/*.yaml'):
            self.search_space = self._load_ss_yaml(config_file, self.search_space)
        for config_file in (package_dir / "hyperparams").glob('**/*.yaml'):
            self.hyperparams = self._load_hp_yaml(config_file, self.hyperparams)

        # Load the optional files given through the options; they are applied
        # last and therefore take precedence over the defaults.
        if getattr(args, "hp_abs_file", None) is not None:
            self.hyperparams = self._load_hp_yaml(args.hp_abs_file, self.hyperparams)
        if getattr(args, "ss_abs_file", None) is not None:
            self.search_space = self._load_ss_yaml(args.ss_abs_file, self.search_space)

    def load_hyperparameter(self, dataset_name, algorithm):
        """Return the parameters stored for ``(dataset_name, algorithm)``.

        Both names are lowercased before the lookup.

        Raises:
            Exception: If no parameters are stored for the pair.
        """
        # NOTE(review): dataset keys are stored exactly as written in the YAML
        # files, so this lookup presumably relies on them being lowercase there.
        d_name = dataset_name.lower()
        a_name = algorithm.lower()

        if d_name in self.hyperparams and a_name in self.hyperparams[d_name]:
            return self.hyperparams[d_name][a_name]

        raise Exception("This experimental setting for (%s, %s) has not been configured" % (dataset_name, algorithm))

    def load_search_space(self, algorithm):
        """Return the search space stored for ``algorithm``.

        The name is matched case-insensitively, in line with
        :meth:`load_hyperparameter`; the stored keys are always lowercase.

        Raises:
            ValueError: If no search space is stored for the algorithm.
        """
        lookup_key = algorithm.lower() if isinstance(algorithm, str) else algorithm
        if lookup_key in self.search_space:
            return self.search_space[lookup_key]
        raise ValueError("Hyperparameter search space is not configured for %s" % algorithm)

    @staticmethod
    def _load_hp_yaml(config_file, hyperparams):
        """Merge the hyperparameters of one YAML file into ``hyperparams``.

        Returns the merged mapping. Entries for a dataset that is already
        present are written into the existing inner dict; a new dataset
        produces a new outer dict.

        Raises:
            FileNotFoundError: If ``config_file`` is not an existing file.
            ValueError: If the file name does not end with "yaml" or "yml".
            yaml.YAMLError: Re-raised after logging if the file cannot be parsed.
        """
        if not os.path.isfile(config_file):
            raise FileNotFoundError("Cannot find configuration file %s" % config_file)
        if not str(config_file).endswith(("yaml", "yml")):
            raise ValueError("Configuration file must have .yaml or .yml extension: %s" % config_file)

        with open(os.path.abspath(config_file), "r") as file:
            try:
                config = yaml.safe_load(file)
                algorithm = config["model_name"].lower()
                for dataset in config["datasets"]:
                    dataset_key = dataset["dataset"]
                    if dataset_key in hyperparams:
                        hyperparams[dataset_key][algorithm] = dataset["parameters"]
                    else:
                        hyperparams = {**hyperparams, dataset_key: {algorithm: dataset["parameters"]}}
            except yaml.YAMLError:
                HyperparameterLoader._logger.error("Cannot load configuration: %s" % config_file)
                raise
        return hyperparams

    @staticmethod
    def _load_ss_yaml(config_file, search_space):
        """Merge the search space of one YAML file into ``search_space``.

        Returns a new mapping in which the file's lowercased ``model_name``
        maps to the search space built from its ``search_space`` section.

        Raises:
            FileNotFoundError: If ``config_file`` is not an existing file.
            ValueError: If the file name does not end with "yaml" or "yml".
            yaml.YAMLError: Re-raised after logging if the file cannot be parsed.
        """
        if not os.path.isfile(config_file):
            raise FileNotFoundError("Cannot find configuration file %s" % config_file)
        if not str(config_file).endswith(("yaml", "yml")):
            raise ValueError("Configuration file must have .yaml or .yml extension: %s" % config_file)

        with open(os.path.abspath(config_file), "r") as file:
            try:
                config = yaml.safe_load(file)
                algorithm = config["model_name"].lower()
                tuning_space = HyperparameterLoader._config_tuning_space(config["search_space"])
                search_space = {**search_space, algorithm: tuning_space}
            except yaml.YAMLError:
                HyperparameterLoader._logger.error("Cannot load configuration: %s" % config_file)
                raise
        return search_space

    @staticmethod
    def _config_tuning_space(tuning_space_raw):
        """Translate the raw ``search_space`` section into hyperopt expressions.

        Only the parameter names listed below are translated; any other key in
        ``tuning_space_raw`` is ignored. Range parameters read the ``min`` and
        ``max`` entries of their section; choice parameters pass their section
        to ``hp.choice`` unchanged.

        Returns:
            dict or None: None when ``tuning_space_raw`` is None, otherwise a
            dict from parameter name to hyperopt expression.
        """
        if tuning_space_raw is None:
            return None

        def log_uniform(label, spec):
            return hp.loguniform(label, np.log(spec['min']), np.log(spec['max']))

        def int_log_uniform(label, spec):
            # Quantised with step 1 and cast to int.
            return scope.int(hp.qloguniform(label, np.log(spec['min']), np.log(spec['max']), 1))

        def uniform(label, spec):
            return hp.uniform(label, spec["min"], spec["max"])

        def choice(label, options):
            return hp.choice(label, options)

        # (parameter name, builder), in the order the entries are emitted.
        builders = (
            ("learning_rate", log_uniform),
            ("hidden_size", int_log_uniform),
            ("ent_hidden_size", int_log_uniform),
            ("rel_hidden_size", int_log_uniform),
            ("batch_size", int_log_uniform),
            ("margin", uniform),
            ("lmbda", log_uniform),
            ("distance_measure", choice),
            ("cmax", log_uniform),
            ("cmin", log_uniform),
            ("optimizer", choice),
            ("bilinear", choice),
            ("epochs", choice),
        )

        return {
            label: build(label, tuning_space_raw[label])
            for label, build in builders
            if label in tuning_space_raw
        }
Пример #7
0
class KnownDataset:
    """The class consists of modules to handle the known datasets.

       There are various known knowledge graph datasets used by the research
       community. These datasets may be in different formats. This module
       helps in parsing those known datasets for training and testing
       the algorithms.

       The constructor downloads and extracts the dataset itself whenever the
       dataset directory does not exist yet.

       Args:
          name (str): Name of the datasets
          url (str): The full url where the dataset resides.
          prefix (str): The prefix of the dataset given the website.

       Attributes:
           dataset_home_path (object): Path object where the data will be downloaded
           root_path (object): Path object for the specific dataset.

       Examples:
           >>> from pykg2vec.data.kgcontroller import KnownDataset
           >>> name = "dL50a"
           >>> url = "https://github.com/louisccc/KGppler/raw/master/datasets/dL50a.tgz"
           >>> prefix = 'deeplearning_dataset_50arch-'
           >>> kgdata =  KnownDataset(name, url, prefix)
           >>> kgdata.dump()

    """
    _logger = Logger().get_logger(__name__)

    def __init__(self, name, url, prefix):

        self.name = name
        self.url = url
        self.prefix = prefix

        # All datasets live in "<cwd>/../dataset/<name>".
        self.dataset_home_path = Path('..') / 'dataset'
        self.dataset_home_path.mkdir(parents=True, exist_ok=True)
        self.dataset_home_path = self.dataset_home_path.resolve()
        self.root_path = self.dataset_home_path / self.name
        self.tar = self.root_path / ('%s.tgz' % self.name)
        self.zip = self.root_path / ('%s.zip' % self.name)

        # A missing dataset directory is the only trigger for fetching.
        if not self.root_path.exists():
            self.download()
            self.extract()

        # Datasets whose files sit directly in the root directory.
        path_eq_root = (
            'YAGO3_10', 'WN18RR', 'FB15K_237', 'Kinship', 'Nations', 'UMLS',
            'NELL_995'
        )
        if self.name == 'WN18':
            self.dataset_path = self.root_path / 'wordnet-mlj12'
        elif self.name in path_eq_root:
            self.dataset_path = self.root_path
        else:
            self.dataset_path = self.root_path / self.name

        self.data_paths = {
            'train': self.dataset_path / ('%strain.txt' % self.prefix),
            'test': self.dataset_path / ('%stest.txt' % self.prefix),
            'valid': self.dataset_path / ('%svalid.txt' % self.prefix)
        }

        self.cache_triplet_paths = {
            'train': self.dataset_path / 'triplets_train.pkl',
            'test': self.dataset_path / 'triplets_test.pkl',
            'valid': self.dataset_path / 'triplets_valid.pkl'
        }

        self.cache_metadata_path = self.dataset_path / 'metadata.pkl'
        self.cache_hr_t_path = self.dataset_path / 'hr_t.pkl'
        self.cache_tr_h_path = self.dataset_path / 'tr_h.pkl'
        self.cache_hr_t_train_path = self.dataset_path / 'hr_t_train.pkl'
        self.cache_tr_h_train_path = self.dataset_path / 'tr_h_train.pkl'
        self.cache_idx2entity_path = self.dataset_path / 'idx2entity.pkl'
        self.cache_idx2relation_path = self.dataset_path / 'idx2relation.pkl'
        self.cache_entity2idx_path = self.dataset_path / 'entity2idx.pkl'
        self.cache_relation2idx_path = self.dataset_path / 'relation2idx.pkl'
        self.cache_relationproperty_path = self.dataset_path / 'relationproperty.pkl'

    def download(self):
        ''' Downloads the given dataset from url.

            The archive type is derived from the end of the url and checked
            before anything is created on disk. If the transfer fails, the
            freshly created dataset directory is removed again so that a later
            construction attempts the download once more.

            Raises:
                NotImplementedError: If the url does not end with ``.tar.gz``,
                    ``.tgz`` or ``.zip``.
                FileExistsError: If the dataset directory already exists.
        '''
        self._logger.info("Downloading the dataset %s" % self.name)

        if self.url.endswith(('.tar.gz', '.tgz')):
            archive_path = self.tar
        elif self.url.endswith('.zip'):
            archive_path = self.zip
        else:
            raise NotImplementedError("Unknown compression format")

        # mkdir() fails if the directory exists, so from here on the directory
        # is known to have been created by this call.
        self.root_path.mkdir()
        try:
            with urllib.request.urlopen(self.url) as response, open(
                    str(archive_path), 'wb') as out_file:
                shutil.copyfileobj(response, out_file)
        except BaseException:
            # Do not leave an empty or partial dataset directory behind: its
            # mere existence makes the constructor skip the download.
            shutil.rmtree(str(self.root_path), ignore_errors=True)
            raise

    def extract(self):
        ''' Extract the downloaded file under the folder with the given dataset name.

            The tar archive is preferred when both archive files exist. Errors
            raised during extraction are logged and not propagated. Nothing
            happens when neither archive file exists.
        '''
        if self.tar.exists():
            archive_path, unpack = self.tar, extract_tar
        elif self.zip.exists():
            archive_path, unpack = self.zip, extract_zip
        else:
            return

        try:
            self._logger.info(
                "Extracting the downloaded dataset from %s to %s" %
                (archive_path, self.root_path))
            unpack(str(archive_path), str(self.root_path))
        except Exception as e:
            # Best effort by design: report the problem and carry on.
            self._logger.error("Could not extract the target file!")
            self._logger.error("%s %s" % (type(e), e.args))

    def read_metadata(self):
        ''' Reads the metadata of the knowledge graph if available'''
        # NOTE(review): pickle executes arbitrary code while loading; the cache
        # file must only ever come from a trusted source.
        with open(str(self.cache_metadata_path), 'rb') as f:
            return pickle.load(f)

    def is_meta_cache_exists(self):
        ''' Checks if the metadata of the knowledge graph is available'''
        return self.cache_metadata_path.exists()

    def dump(self):
        ''' Logs every attribute of this object as "<name> <value>" '''
        for key, value in self.__dict__.items():
            self._logger.info("%s %s" % (key, value))
Пример #8
0
class UserDefinedDataset(object):
    """The class consists of modules to handle the user defined datasets.

      User may define their own datasets to be processed with the
      pykg2vec library. The dataset directory must contain the three files
      ``<name>-train.txt``, ``<name>-test.txt`` and ``<name>-valid.txt``.

      Args:
         name (str): Name of the datasets
         custom_dataset_path (str): Directory holding the dataset files.

      Attributes:
          dataset_path (object): Resolved Path object of ``custom_dataset_path``.
          root_path (object): Same Path object as ``dataset_path``.

      Raises:
          NotImplementedError: If the directory or one of the three dataset
              files does not exist.

    """
    _logger = Logger().get_logger(__name__)

    def __init__(self, name, custom_dataset_path):
        self.name = name

        self.dataset_path = Path(custom_dataset_path).resolve()
        self.root_path = self.dataset_path

        if not self.root_path.exists():
            raise NotImplementedError("%s user defined dataset not found!" %
                                      self.root_path)

        train_file = self.root_path / (name + '-train.txt')
        test_file = self.root_path / (name + '-test.txt')
        valid_file = self.root_path / (name + '-valid.txt')

        if not train_file.exists():
            raise NotImplementedError("%s training file not found!" %
                                      train_file)
        if not test_file.exists():
            raise NotImplementedError("%s test file not found!" % test_file)
        # Check the validation file itself (the test file was checked above).
        if not valid_file.exists():
            raise NotImplementedError("%s validation file not found!" %
                                      valid_file)

        self.data_paths = {
            'train': train_file,
            'test': test_file,
            'valid': valid_file
        }

        self.cache_triplet_paths = {
            'train': self.root_path / 'triplets_train.pkl',
            'test': self.root_path / 'triplets_test.pkl',
            'valid': self.root_path / 'triplets_valid.pkl'
        }

        self.cache_metadata_path = self.root_path / 'metadata.pkl'
        self.cache_hr_t_path = self.root_path / 'hr_t.pkl'
        self.cache_tr_h_path = self.root_path / 'tr_h.pkl'
        self.cache_hr_t_train_path = self.root_path / 'hr_t_train.pkl'
        self.cache_tr_h_train_path = self.root_path / 'tr_h_train.pkl'
        self.cache_idx2entity_path = self.root_path / 'idx2entity.pkl'
        self.cache_idx2relation_path = self.root_path / 'idx2relation.pkl'
        self.cache_entity2idx_path = self.root_path / 'entity2idx.pkl'
        self.cache_relation2idx_path = self.root_path / 'relation2idx.pkl'
        self.cache_relationproperty_path = self.root_path / 'relationproperty.pkl'

    def is_meta_cache_exists(self):
        """ Checks if the metadata has been cached"""
        return self.cache_metadata_path.exists()

    def read_metadata(self):
        """ Reads the metadata of the user defined dataset"""
        # NOTE(review): pickle executes arbitrary code while loading; the cache
        # file must only ever come from a trusted source.
        with open(str(self.cache_metadata_path), 'rb') as f:
            return pickle.load(f)

    def dump(self):
        """ Logs every attribute of this object as "<name> <value>"."""
        for key, value in self.__dict__.items():
            self._logger.info("%s %s" % (key, value))
Пример #9
0
class KnowledgeGraph(object):
    """The class is the main module that handles the knowledge graph.

      KnowledgeGraph selects the dataset object that matches the given name,
      parses its train/test/valid triple files, builds the string <-> id
      mappings and the lookup tables derived from the triples, and pickles
      the results into the dataset's cache files.

      Args:
         dataset (str): Name of the dataset (matched case-insensitively
            against the built-in names; anything else is treated as a
            user-defined dataset).
         custom_dataset_path (str): Location handed to ``UserDefinedDataset``
            when ``dataset`` is not a built-in name.

      Attributes:
        dataset_name (str): The name of the dataset.
        custom_dataset_path (str): The custom path given at construction.
        dataset (object): The dataset object instance.
        triplets (dict): Dictionary with three lists of training, testing and validation triples.
        triple_store (dict): Alias of ``triplets`` (same object).
        relations (list): Sorted collection of all the relations.
        entities (list): Sorted collection of all the entities.
        entity2idx (dict): Dictionary for mapping string name of entities to unique numerical id.
        idx2entity (dict): Dictionary for mapping the id to string.
        relation2idx (dict): Dictionary for mapping string name of relations to unique numerical id.
        idx2relation (dict): Dictionary for mapping the id to string.
        hr_t (dict): Maps a (head, relation) pair to the set of tails seen in any split.
        tr_h (dict): Maps a (tail, relation) pair to the set of heads seen in any split.
        hr_t_train (dict): Same as ``hr_t`` restricted to the training split.
        tr_h_train (dict): Same as ``tr_h`` restricted to the training split.
        hr_t_valid (dict): Same as ``hr_t`` restricted to the validation split.
        tr_h_valid (dict): Same as ``tr_h`` restricted to the validation split.
        relation_property (dict): Per-relation ratio computed by
            ``read_relation_property`` (an empty list until it has run).
        kg_meta (object): Object storing the statistics metadata of the dataset.

      Examples:
          >>> from pykg2vec.config.global_config import KnowledgeGraph
          >>> knowledge_graph = KnowledgeGraph(dataset='Freebase15k')
          >>> knowledge_graph.prepare_data()
    """
    _logger = Logger().get_logger(__name__)

    # read_cache_data() key -> name of the dataset attribute holding the
    # path of the corresponding pickle file.
    _CACHE_PATH_ATTRS = {
        'hr_t': 'cache_hr_t_path',
        'tr_h': 'cache_tr_h_path',
        'hr_t_train': 'cache_hr_t_train_path',
        'tr_h_train': 'cache_tr_h_train_path',
        'idx2entity': 'cache_idx2entity_path',
        'idx2relation': 'cache_idx2relation_path',
        'entity2idx': 'cache_entity2idx_path',
        'relation2idx': 'cache_relation2idx_path',
        'relationproperty': 'cache_relationproperty_path',
    }

    # read_cache_data() keys that address one of the cached triple lists.
    _TRIPLET_CACHE_KEYS = {
        'triplets_train': 'train',
        'triplets_test': 'test',
        'triplets_valid': 'valid',
    }

    def __init__(self, dataset='Freebase15k', custom_dataset_path=None):
        """Select the dataset and load its metadata (preparing it if needed).

        When the dataset's metadata cache already exists only the metadata
        is read; otherwise ``prepare_data`` is run, which parses the raw
        files and writes the cache.
        """
        self.dataset_name = dataset
        # Kept so that force_prepare_data() can rebuild the same dataset.
        self.custom_dataset_path = custom_dataset_path

        name = dataset.lower()
        if name in ('freebase15k', 'fb15k'):
            self.dataset = FreebaseFB15k()
        elif name in ('deeplearning50a', 'dl50a'):
            self.dataset = DeepLearning50a()
        elif name in ('wordnet18', 'wn18'):
            self.dataset = WordNet18()
        elif name in ('wordnet18_rr', 'wn18_rr'):
            self.dataset = WordNet18_RR()
        elif name in ('yago3_10', 'yago'):
            self.dataset = YAGO3_10()
        elif name in ('freebase15k_237', 'fb15k_237'):
            self.dataset = FreebaseFB15k_237()
        elif name in ('kinship', 'ks'):
            self.dataset = Kinship()
        elif name == 'nations':
            self.dataset = Nations()
        elif name == 'umls':
            self.dataset = UMLS()
        elif name == 'nell_995':
            self.dataset = NELL_995()
        else:
            # if the dataset does not match with existing one, check if it exists in user's local space.
            # if it still can't find corresponding folder, raise exception in UserDefinedDataset.__init__()
            self.dataset = UserDefinedDataset(dataset, custom_dataset_path)

        # KG data structure stored in triplet format
        self.triplets = {'train': [], 'test': [], 'valid': []}
        self.triple_store = self.triplets

        # TODO: should also have graph-based data structure for a KG.
        self.relations = []
        self.entities = []

        self.entity2idx = {}
        self.idx2entity = {}
        self.relation2idx = {}
        self.idx2relation = {}

        self.hr_t = defaultdict(set)
        self.tr_h = defaultdict(set)

        self.hr_t_train = defaultdict(set)
        self.tr_h_train = defaultdict(set)

        self.hr_t_valid = defaultdict(set)
        self.tr_h_valid = defaultdict(set)

        self.relation_property = []

        if self.dataset.is_meta_cache_exists():
            self.kg_meta = self.dataset.read_metadata()
        else:
            self.kg_meta = KGMetaData()
            self.prepare_data()

    def force_prepare_data(self):
        """Delete the dataset folder and re-initialise this object.

        The whole ``self.dataset.root_path`` tree is removed (errors are
        ignored) and ``__init__`` is run again with the original dataset
        name and custom path.
        """
        # NOTE(review): this removes everything under root_path, not only the
        # pickled caches. If a user-defined dataset keeps its source files in
        # that folder they would be deleted too -- confirm before relying on
        # this for custom datasets.
        shutil.rmtree(str(self.dataset.root_path), ignore_errors=True)

        time.sleep(1)

        # Pass the custom path through; dropping it would re-create a
        # user-defined dataset without its location.
        self.__init__(dataset=self.dataset_name,
                      custom_dataset_path=self.custom_dataset_path)

    def prepare_data(self):
        """Function to prepare the dataset.

        Does nothing when the metadata cache already exists. Otherwise reads
        entities, relations and triples, converts the triples to ids, builds
        all lookup tables, fills ``kg_meta`` with the totals and writes the
        cache.
        """
        if self.dataset.is_meta_cache_exists():
            return

        self.read_entities()
        self.read_relations()
        self.read_mappings()
        # The lookup tables below are built after the triples carry ids.
        for set_type in ('train', 'test', 'valid'):
            self.read_triple_ids(set_type)
        self.read_hr_t()
        self.read_tr_h()
        self.read_hr_t_train()
        self.read_tr_h_train()
        self.read_hr_t_valid()
        self.read_tr_h_valid()
        self.read_relation_property()

        meta = self.kg_meta
        meta.tot_relation = len(self.relations)
        meta.tot_entity = len(self.entities)
        meta.tot_valid_triples = len(self.triplets['valid'])
        meta.tot_test_triples = len(self.triplets['test'])
        meta.tot_train_triples = len(self.triplets['train'])
        meta.tot_triple = (meta.tot_valid_triples +
                           meta.tot_test_triples +
                           meta.tot_train_triples)

        self.cache_data()

    def cache_data(self):
        """Function to cache the prepared dataset as pickle files on disk.

        The metadata file is written last: its presence is what
        ``is_meta_cache_exists`` checks, so an interrupted run must not leave
        it behind next to incomplete data files.
        """
        dataset = self.dataset
        # NOTE(review): hr_t_valid / tr_h_valid are not cached; no cache path
        # for them is referenced anywhere in this class.
        payloads = [
            (dataset.cache_triplet_paths['train'], self.triplets['train']),
            (dataset.cache_triplet_paths['test'], self.triplets['test']),
            (dataset.cache_triplet_paths['valid'], self.triplets['valid']),
            (dataset.cache_hr_t_path, self.hr_t),
            (dataset.cache_tr_h_path, self.tr_h),
            (dataset.cache_hr_t_train_path, self.hr_t_train),
            (dataset.cache_tr_h_train_path, self.tr_h_train),
            (dataset.cache_idx2entity_path, self.idx2entity),
            (dataset.cache_idx2relation_path, self.idx2relation),
            (dataset.cache_relation2idx_path, self.relation2idx),
            (dataset.cache_entity2idx_path, self.entity2idx),
            (dataset.cache_relationproperty_path, self.relation_property),
            (dataset.cache_metadata_path, self.kg_meta),
        ]
        for path, obj in payloads:
            with open(str(path), 'wb') as f:
                pickle.dump(obj, f)

    def read_cache_data(self, key):
        """Function to read one cached object back from its pickle file.

        Args:
            key (str): One of ``'triplets_train'``, ``'triplets_test'``,
                ``'triplets_valid'``, ``'hr_t'``, ``'tr_h'``,
                ``'hr_t_train'``, ``'tr_h_train'``, ``'idx2entity'``,
                ``'idx2relation'``, ``'entity2idx'``, ``'relation2idx'`` or
                ``'relationproperty'``.

        Returns:
            object: The unpickled object, or ``None`` for an unknown key.
        """
        if key in self._TRIPLET_CACHE_KEYS:
            set_type = self._TRIPLET_CACHE_KEYS[key]
            path = self.dataset.cache_triplet_paths[set_type]
        elif key in self._CACHE_PATH_ATTRS:
            path = getattr(self.dataset, self._CACHE_PATH_ATTRS[key])
        else:
            # Unknown keys have always yielded None; keep that contract.
            return None

        # NOTE: pickle can execute arbitrary code while loading; only safe
        # for cache files written by cache_data().
        with open(str(path), 'rb') as f:
            return pickle.load(f)

    def is_cache_exists(self):
        """Function to check if the dataset's metadata cache file exists."""
        return self.dataset.is_meta_cache_exists()

    def read_triplets(self, set_type):
        """Read triplets (in string format) from the txt file of one split.

        The file is only parsed when the in-memory list for ``set_type`` is
        still empty. Each non-blank line must hold three tab-separated
        fields: head, relation, tail.

        Args:
            set_type (str): ``'train'``, ``'test'`` or ``'valid'``.

        Returns:
            list: The list of ``Triple`` objects stored for that split.
        """
        triplets = self.triplets[set_type]

        if not triplets:
            with open(str(self.dataset.data_paths[set_type]),
                      'r',
                      encoding='utf-8') as file:
                for line in file:
                    # Blank lines (e.g. a trailing newline at the end of the
                    # file) carry no triple and would break the unpacking.
                    if not line.strip():
                        continue
                    s, p, o = line.split('\t')
                    triplets.append(Triple(s.strip(), p.strip(), o.strip()))

        return triplets

    def read_entities(self):
        """ Function to read the entities.

        Returns:
            The sorted distinct heads and tails of all three splits.
        """
        # len() rather than truthiness: after the first call this is a numpy
        # array, whose truth value is ambiguous.
        if len(self.entities) == 0:
            entities = set()

            for set_type in ('train', 'valid', 'test'):
                for triplet in self.read_triplets(set_type):
                    entities.add(triplet.h)
                    entities.add(triplet.t)

            self.entities = np.sort(list(entities))

        return self.entities

    def read_relations(self):
        """ Function to read the relations.

        Returns:
            The sorted distinct relations of all three splits.
        """
        # len() rather than truthiness: see read_entities().
        if len(self.relations) == 0:
            relations = set()

            for set_type in ('train', 'valid', 'test'):
                for triplet in self.read_triplets(set_type):
                    relations.add(triplet.r)

            self.relations = np.sort(list(relations))

        return self.relations

    def read_mappings(self):
        """ Function to generate the mapping from string name to integer ids. """
        # Ids are the positions in the sorted entity / relation collections.
        self.entity2idx = {
            entity: idx
            for idx, entity in enumerate(self.read_entities())
        }
        self.idx2entity = {idx: entity for entity, idx in self.entity2idx.items()}
        self.relation2idx = {
            relation: idx
            for idx, relation in enumerate(self.read_relations())
        }
        self.idx2relation = {
            idx: relation
            for relation, idx in self.relation2idx.items()
        }

    def read_triple_ids(self, set_type):
        """ Function to replace the names stored in each triple by their ids.

            Args:
                set_type (str): Type of data, either train, test or valid.

            Returns:
                list: The (updated in place) list of triples of that split.
        """
        # assert entities can not be none
        # assert relations can not be none
        triplets = self.triplets[set_type]

        entity2idx = self.entity2idx
        relation2idx = self.relation2idx

        for t in triplets:
            t.set_ids(entity2idx[t.h], relation2idx[t.r], entity2idx[t.t])

        return triplets

    def read_hr_t(self):
        """ Function to read the set of tails for the given head and relation pair. """
        for triplets in self.triplets.values():
            for t in triplets:
                self.hr_t[(t.h, t.r)].add(t.t)

        return self.hr_t

    def read_tr_h(self):
        """ Function to read the set of heads for the given tail and relation pair. """
        for triplets in self.triplets.values():
            for t in triplets:
                self.tr_h[(t.t, t.r)].add(t.h)

        return self.tr_h

    def read_hr_t_train(self):
        """ Function to read the set of tails for the given head and relation pair for the training set. """
        for t in self.triplets['train']:
            self.hr_t_train[(t.h, t.r)].add(t.t)

        return self.hr_t_train

    def read_tr_h_train(self):
        """ Function to read the set of heads for the given tail and relation pair for the training set. """
        for t in self.triplets['train']:
            self.tr_h_train[(t.t, t.r)].add(t.h)

        return self.tr_h_train

    def read_hr_t_valid(self):
        """ Function to read the set of tails for the given head and relation pair for the valid set. """
        for t in self.triplets['valid']:
            self.hr_t_valid[(t.h, t.r)].add(t.t)

        return self.hr_t_valid

    def read_tr_h_valid(self):
        """ Function to read the set of heads for the given tail and relation pair for the valid set. """
        for t in self.triplets['valid']:
            self.tr_h_valid[(t.t, t.r)].add(t.h)

        return self.tr_h_valid

    def read_relation_property(self):
        """ Function to compute the relation property from the training set.

         For every relation index the value is
         ``distinct tails / (distinct heads + distinct tails)``, or 0 when
         the relation has no training triple. The training triples are
         looked up by ``t.r``, so they must already carry integer relation
         ids in ``range(len(self.relations))``.

         Returns:
             dict: Mapping of relation index to its relation property.
         """
        num_relations = len(self.relations)
        heads_by_relation = {idx: set() for idx in range(num_relations)}
        tails_by_relation = {idx: set() for idx in range(num_relations)}

        for t in self.triplets['train']:
            heads_by_relation[t.r].add(t.h)
            tails_by_relation[t.r].add(t.t)

        self.relation_property = {}
        for idx in heads_by_relation:
            n_tails = len(tails_by_relation[idx])
            total = len(heads_by_relation[idx]) + n_tails

            if total == 0:
                value = 0
            else:
                value = n_tails / total

            self.relation_property[idx] = value

        return self.relation_property

    # reserved for debugging

    def dump(self):
        """ Function to log the statistic information of a dataset. """
        meta = self.kg_meta
        lines = [
            "",
            "----------Metadata Info for Dataset:%s----------------" %
            self.dataset_name,
            "Total Training Triples   :%s" % meta.tot_train_triples,
            "Total Testing Triples    :%s" % meta.tot_test_triples,
            "Total validation Triples :%s" % meta.tot_valid_triples,
            "Total Entities           :%s" % meta.tot_entity,
            "Total Relations          :%s" % meta.tot_relation,
            "---------------------------------------------",
            "",
        ]
        self._logger.info("\n".join(lines))
Пример #10
0
class HyperparamterLoader:
    """Hyper parameters loading based datasets and embedding algorithms.

    On construction the YAML files of the ``hyperparams`` folder next to this
    module are loaded, followed by the files of ``args.hp_abs_dir`` when that
    attribute exists and is not None (later files override earlier entries).

    Attributes:
        hyperparams (dict): ``{dataset: {algorithm: parameters}}``.
        search_space (dict): ``{algorithm: tuning space}``; algorithm names
            are the lower-cased config file names without extension.
    """

    _logger = Logger().get_logger(__name__)

    # (parameter name, distribution kind), in the order in which the entries
    # are added to the tuning space.
    _TUNING_SPACE_SPEC = (
        ("learning_rate", "loguniform"),
        ("hidden_size", "int_qloguniform"),
        ("ent_hidden_size", "int_qloguniform"),
        ("rel_hidden_size", "int_qloguniform"),
        ("batch_size", "int_qloguniform"),
        ("margin", "uniform"),
        ("lmbda", "loguniform"),
        ("distance_measure", "choice"),
        ("cmax", "loguniform"),
        ("cmin", "loguniform"),
        ("optimizer", "choice"),
        ("bilinear", "choice"),
        ("epochs", "choice"),
    )

    def __init__(self, args):
        """Load the default configuration plus ``args.hp_abs_dir`` if given."""
        self.hyperparams, self.search_space = self._load_parameter_config(
            getattr(args, "hp_abs_dir", None))

    def load_hyperparameter(self, dataset_name, algorithm):
        """Return the configured parameters for a dataset/algorithm pair.

        Both names are matched in lower case.

        Raises:
            Exception: If the pair has not been configured.
        """
        d_name = dataset_name.lower()
        a_name = algorithm.lower()

        if d_name in self.hyperparams and a_name in self.hyperparams[d_name]:
            return self.hyperparams[d_name][a_name]

        raise Exception(
            "This experimental setting for (%s, %s) has not been configured" %
            (dataset_name, algorithm))

    def load_search_space(self, algorithm):
        """Return the tuning search space configured for an algorithm.

        The name is matched in lower case, like in ``load_hyperparameter``;
        the keys of ``search_space`` are always stored lower-cased.

        Raises:
            ValueError: If no search space is configured for the algorithm.
        """
        a_name = algorithm.lower()
        if a_name in self.search_space:
            return self.search_space[a_name]
        raise ValueError(
            "Hyperparameter search space is not configured for %s" % algorithm)

    @staticmethod
    def _load_parameter_config(config_abs_dir):
        """Load the bundled config folder, then ``config_abs_dir`` on top.

        Returns:
            tuple: ``(hyperparams, search_space)``.
        """
        default_config_dir = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), "hyperparams")
        hyperparams, search_space = HyperparamterLoader._load_yaml_config(
            default_config_dir, {}, {})
        if config_abs_dir is not None:
            hyperparams, search_space = HyperparamterLoader._load_yaml_config(
                config_abs_dir, hyperparams, search_space)

        return hyperparams, search_space

    @staticmethod
    def _load_yaml_config(config_dir, hyperparams, search_space):
        """Merge every YAML file of ``config_dir`` into the given mappings.

        Each file must provide the keys ``dataset``, ``parameters`` and
        ``search_space``; the algorithm name is the lower-cased file name
        without extension. Files not ending in ``yaml``/``yml`` are skipped
        with a warning.

        Returns:
            tuple: The updated ``(hyperparams, search_space)``.

        Raises:
            yaml.YAMLError: Re-raised (after logging) when a file cannot be
                parsed.
        """
        # Work on shallow copies so the top-level input mappings are not
        # modified; per-dataset dicts are still shared and updated in place.
        hyperparams = dict(hyperparams)
        search_space = dict(search_space)

        for config_file in os.listdir(config_dir):
            if not config_file.endswith(("yaml", "yml")):
                HyperparamterLoader._logger.warning(
                    "Skipped non YAML file: %s" % config_file)
                continue

            config_path = os.path.abspath(os.path.join(config_dir, config_file))
            with open(config_path, "r") as file:
                try:
                    config = yaml.safe_load(file)
                except yaml.YAMLError:
                    HyperparamterLoader._logger.error(
                        "Cannot load configuration: %s" % config_file)
                    raise

            algorithm = os.path.splitext(config_file)[0].lower()
            dataset = config["dataset"]
            hyperparams.setdefault(dataset, {})[algorithm] = config["parameters"]
            search_space[algorithm] = HyperparamterLoader._config_tuning_space(
                config["search_space"])

        return hyperparams, search_space

    @staticmethod
    def _config_tuning_space(tuning_space_raw):
        """Translate a raw search-space mapping into hyperopt expressions.

        Only the parameter names listed in ``_TUNING_SPACE_SPEC`` are
        considered. Range parameters are read from their ``min``/``max``
        entries; choice parameters pass their configured value to
        ``hp.choice``.

        Args:
            tuning_space_raw (dict): The ``search_space`` section of a
                config file, or None.

        Returns:
            dict: Parameter name -> hyperopt expression, or None when
            ``tuning_space_raw`` is None.
        """
        if tuning_space_raw is None:
            return None

        hyper_obj = {}
        for name, kind in HyperparamterLoader._TUNING_SPACE_SPEC:
            if name not in tuning_space_raw:
                continue

            raw = tuning_space_raw[name]
            if kind == "choice":
                hyper_obj[name] = hp.choice(name, raw)
            elif kind == "uniform":
                hyper_obj[name] = hp.uniform(name, raw["min"], raw["max"])
            elif kind == "loguniform":
                hyper_obj[name] = hp.loguniform(name, np.log(raw["min"]),
                                                np.log(raw["max"]))
            else:
                # "int_qloguniform": log-uniform quantised with q=1, wrapped
                # in scope.int.
                hyper_obj[name] = scope.int(
                    hp.qloguniform(name, np.log(raw["min"]),
                                   np.log(raw["max"]), 1))

        return hyper_obj
Пример #11
0
class BaysOptimizer:
    """Bayesian optimizer class for tuning hyperparameter.

      This class implements the Bayesian Optimizer for tuning the
      hyper-parameter.

      Args:
        args (object): The Argument Parser object providing arguments. The
            attributes read here are ``model_name``, ``dataset_name``,
            ``dataset_path``, ``max_number_trials``, ``debug`` and
            ``device``.

      Examples:
        >>> from pykg2vec.common import KGEArgParser
        >>> from pykg2vec.utils.bayesian_optimizer import BaysOptimizer
        >>> args = KGEArgParser().get_args(sys.argv[1:])
        >>> bays_opt = BaysOptimizer(args=args)
        >>> bays_opt.optimize()
    """
    _logger = Logger().get_logger(__name__)

    def __init__(self, args):
        """Store the dataset, model configuration and search space.

        Raises:
            Exception: If ``args.model_name`` is one of the models for which
                tuning is not supported.
        """
        if args.model_name.lower() in [
                "conve", "convkb", "proje_pointwise", "interacte", "hyper",
                "acre"
        ]:
            # Report the same attribute that was tested above.
            raise Exception(
                "Model %s has not been supported in tuning hyperparameters!" %
                args.model_name)

        self.model_name = args.model_name
        self.knowledge_graph = KnowledgeGraph(
            dataset=args.dataset_name, custom_dataset_path=args.dataset_path)
        self.kge_args = args
        # In debug mode only three trials are run.
        self.max_evals = args.max_number_trials if not args.debug else 3

        self.config_obj, self.model_obj = Importer().import_model_config(
            self.model_name.lower())
        self.config_local = self.config_obj(self.kge_args)
        self.search_space = HyperparameterLoader(args).load_search_space(
            self.model_name.lower())
        self._best_result = None
        self.trainer = None

    def optimize(self):
        """Function that performs bayesian optimization.

        Runs ``fmin`` with ``tpe.suggest`` over the search space, then writes
        one row per trial (iteration, sampled values, loss) to
        ``<path_result>/<model_name>/trials.csv`` and logs the table together
        with the best setting found.
        """
        trials = Trials()

        self._best_result = fmin(fn=self._get_loss,
                                 space=self.search_space,
                                 trials=trials,
                                 algo=tpe.suggest,
                                 max_evals=self.max_evals)

        columns = list(self.search_space.keys())
        results = pd.DataFrame(columns=['iteration'] + columns + ['loss'])

        for idx, trial in enumerate(trials.trials):
            # Each entry of the trial's 'vals' is a list; its first element
            # is passed to space_eval to recover the actual parameter values.
            sampled = {k: v[0] for k, v in trial['misc']['vals'].items()}
            translated_eval = space_eval(self.search_space, sampled)

            row = [idx]
            row.extend(translated_eval[k] for k in columns)
            row.append(trial['result']['loss'])
            results.loc[idx] = row

        path = self.config_local.path_result / self.model_name
        path.mkdir(parents=True, exist_ok=True)
        results.to_csv(str(path / "trials.csv"), index=False)

        self._logger.info(results)
        self._logger.info('Found golden setting:')
        self._logger.info(space_eval(self.search_space, self._best_result))

    def return_best(self):
        """Function to return the best hyper-parameters.

        Raises:
            AssertionError: If ``optimize`` has not produced a result yet.
        """
        assert self._best_result is not None, 'Cannot find golden setting. Has optimize() been called?'
        return space_eval(self.search_space, self._best_result)

    def _get_loss(self, params):
        """Function that defines and acquires the loss for one trial.

        Args:
            params (dict): Hyperparameter values sampled for this trial.

        Returns:
            dict: ``{'loss': <value from tune_model()>, 'status': STATUS_OK}``.
        """
        # copy the hyperparameters to trainer config and hyperparameter set.
        for key, value in params.items():
            self.config_local.__dict__[key] = value
        self.config_local.__dict__['device'] = self.kge_args.device
        model = self.model_obj(**self.config_local.__dict__)

        self.trainer = Trainer(model, self.config_local)

        # configure common setting for a tuning training.
        self.config_local.disp_result = False
        self.config_local.disp_summary = False
        self.config_local.save_model = False

        # do not overwrite test numbers if set
        if self.config_local.test_num is None:
            self.config_local.test_num = 1000

        if self.kge_args.debug:
            self.config_local.epochs = 1

        # start the trial.
        self.trainer.build_model()
        loss = self.trainer.tune_model()

        return {'loss': loss, 'status': STATUS_OK}
Пример #12
0
class Trainer(TrainerMeta):
    """Class for handling the training of the algorithms.

        Args:
            model (object): Model object
            debug (bool): Flag to check if its debugging
            tuning (bool): Flag to denoting tuning if True
            patience (int): Number of epochs to wait before early stopping the training on no improvement.
            No early stopping if it is a negative number (default: {-1}).

        Examples:
            >>> from pykg2vec.utils.trainer import Trainer
            >>> from pykg2vec.core.TransE import TransE
            >>> trainer = Trainer(TransE())
            >>> trainer.build_model()
            >>> trainer.train_model()
    """
    _logger = Logger().get_logger(__name__)

    def __init__(self, model):
        """Bind the trainer to ``model`` and reuse the model's own config.

        Args:
            model (object): Model object; its ``config`` attribute becomes
                ``self.config``.
        """
        self.model, self.config = model, model.config

        # Both are created later, e.g. by train_model() or tune_model().
        self.generator = None
        self.evaluator = None

        # train_model_epoch() appends one [epoch_idx, loss] entry per epoch.
        self.training_results = []

    def build_model(self):
        """Create the optimizer, the model parameters and the early stopper.

        Raises:
            NotImplementedError: If ``self.config.optimizer`` is not one of
                'sgd', 'rms', 'adam', 'adagrad' or 'adadelta'.
        """
        optimizer_name = self.config.optimizer

        # Optimizer name -> factory taking the learning rate.
        factories = {
            'sgd': lambda lr: tf.keras.optimizers.SGD(learning_rate=lr),
            'rms': lambda lr: tf.keras.optimizers.RMSprop(learning_rate=lr),
            'adam': lambda lr: tf.keras.optimizers.Adam(learning_rate=lr),
            'adagrad': lambda lr: tf.keras.optimizers.Adagrad(
                learning_rate=lr, initial_accumulator_value=0.0, epsilon=1e-08),
            'adadelta': lambda lr: tf.keras.optimizers.Adadelta(learning_rate=lr),
        }
        make_optimizer = factories.get(optimizer_name)
        if make_optimizer is None:
            raise NotImplementedError("No support for %s optimizer" % optimizer_name)
        self.optimizer = make_optimizer(self.config.learning_rate)

        # For optimizers that do not support gpu computation in TF2 yet,
        # the parameters are placed on the cpu.
        if optimizer_name in ('rms', 'adagrad', 'adadelta'):
            with tf.device('cpu:0'):
                self.model.def_parameters()
        else:
            self.model.def_parameters()

        self.config.summary()
        self.config.summary_hyperparameter(self.model.model_name)

        self.early_stopper = EarlyStopper(self.config.patience, Monitor.FILTERED_MEAN_RANK)

    ''' Training related functions:'''
    @tf.function
    def train_step_pairwise(self, pos_h, pos_r, pos_t, neg_h, neg_r, neg_t):
        """Run one optimization step with a pairwise (positive vs. negative) loss.

        Args:
            pos_h, pos_r, pos_t: Head, relation and tail of the positive triples
                (train_model_epoch() passes int32 tensors).
            neg_h, neg_r, neg_t: Head, relation and tail of the negative triples.

        Returns:
            The loss tensor of this step, after the gradients have been applied
            to ``self.model.trainable_variables``.
        """
        with tf.GradientTape() as tape:
            pos_preds = self.model.forward(pos_h, pos_r, pos_t)
            neg_preds = self.model.forward(neg_h, neg_r, neg_t)

            if self.config.sampling == 'adversarial_negative_sampling':
                # RotatE: adversarial negative sampling; alpha is the temperature.
                pos_preds = -pos_preds
                neg_preds = -neg_preds
                pos_preds = tf.math.log_sigmoid(pos_preds)
                # One row per positive triple, config.neg_rate negatives per row.
                neg_preds = tf.reshape(neg_preds, [-1, self.config.neg_rate])
                # The softmax weights are treated as constants (no gradient flows through them).
                softmax = tf.stop_gradient(tf.nn.softmax(neg_preds*self.config.alpha, axis=1))
                neg_preds = tf.reduce_sum(softmax * (tf.math.log_sigmoid(-neg_preds)), axis=-1)
                loss = -tf.reduce_mean(neg_preds) - tf.reduce_mean(pos_preds)
            else:
                # others that use margin-based & pairwise loss function. (unif or bern)
                loss = tf.reduce_sum(tf.maximum(pos_preds + self.config.margin - neg_preds, 0))

            if hasattr(self.model, 'get_reg'):
                # now only NTN uses regularizer,
                # other pairwise based KGE methods use normalization to regularize parameters.
                loss += self.model.get_reg()

        gradients = tape.gradient(loss, self.model.trainable_variables)
        self.optimizer.apply_gradients(zip(gradients, self.model.trainable_variables))

        return loss

    @tf.function
    def train_step_projection(self, h, r, t, hr_t, tr_h):
        """Run one optimization step for projection-based models.

        Args:
            h, r, t: Head, relation and tail ids of the batch
                (train_model_epoch() passes int32 tensors).
            hr_t: Sparse tensor; densified and cast to float32 here, then used
                as the target of the (h, r) -> tail direction.
            tr_h: Sparse tensor; densified and cast to float32 here, then used
                as the target of the (t, r) -> head direction.

        Returns:
            The loss tensor of this step (tail loss plus head loss), after the
            gradients have been applied to ``self.model.trainable_variables``.
        """
        with tf.GradientTape() as tape:
            hr_t = tf.cast(tf.sparse.to_dense(tf.sparse.reorder(hr_t)), dtype=tf.float32)
            tr_h = tf.cast(tf.sparse.to_dense(tf.sparse.reorder(tr_h)), dtype=tf.float32)

            if self.model.model_name.lower() == "conve" or self.model.model_name.lower() == "tucker":
                # Label smoothing is applied only when the config defines it.
                if hasattr(self.config, 'label_smoothing'):
                    hr_t = hr_t * (1.0 - self.config.label_smoothing) + 1.0 / self.config.kg_meta.tot_entity
                    tr_h = tr_h * (1.0 - self.config.label_smoothing) + 1.0 / self.config.kg_meta.tot_entity

                pred_tails = self.model.forward(h, r, direction="tail") # (h, r) -> hr_t forward
                pred_heads = self.model.forward(t, r, direction="head") # (t, r) -> tr_h backward

                # For these two models the loss is computed here, as the mean binary cross-entropy.
                loss_tails = tf.reduce_mean(tf.keras.backend.binary_crossentropy(hr_t, pred_tails))
                loss_heads = tf.reduce_mean(tf.keras.backend.binary_crossentropy(tr_h, pred_heads))

                loss = loss_tails + loss_heads

            else:
                # Here forward() also receives the targets and its result is used directly as the loss.
                loss_tails = self.model.forward(h, r, hr_t, direction="tail") # (h, r) -> hr_t forward
                loss_heads = self.model.forward(t, r, tr_h, direction="head") # (t, r) -> tr_h backward

                loss = loss_tails + loss_heads

                if hasattr(self.model, 'get_reg'):
                    # now only complex distmult uses regularizer in algorithms,
                    loss += self.model.get_reg()


        gradients = tape.gradient(loss, self.model.trainable_variables)
        self.optimizer.apply_gradients(zip(gradients, self.model.trainable_variables))

        return loss

    @tf.function
    def train_step_pointwise(self, h, r, t, y):
        """Run one optimization step for models trained with a pointwise loss.

        Args:
            h, r, t: Head, relation and tail ids of the batch
                (train_model_epoch() passes int32 tensors).
            y: Per-triple values multiplied with the predictions
                (train_model_epoch() passes a float32 tensor).

        Returns:
            The loss tensor of this step, after the gradients have been applied
            to ``self.model.trainable_variables``.
        """
        with tf.GradientTape() as tape:
            preds = self.model.forward(h, r, t)

            # Mean softplus of y * prediction over the batch.
            loss = tf.reduce_mean(tf.nn.softplus(y*preds))

            if hasattr(self.model, 'get_reg'): # for complex & complex-N3 & DistMult & CP & ANALOGY
                loss += self.model.get_reg(h, r, t)

        gradients = tape.gradient(loss, self.model.trainable_variables)
        self.optimizer.apply_gradients(zip(gradients, self.model.trainable_variables))

        return loss

    def train_model(self, monitor=Monitor.FILTERED_MEAN_RANK):
        """Train the model, run the final test and persist the results.

        Args:
            monitor (Monitor): Accepted for interface compatibility; this
                method does not read it. The early stopper created in
                build_model() decides which metric is monitored.

        Returns:
            int: Index of the last epoch that was run (0 when
            ``self.config.epochs`` is 0).
        """
        self.generator = Generator(self.model)
        self.evaluator = Evaluator(self.model)

        if self.config.loadFromData:
            self.load_model()

        # Pre-bind the index so the code after the loop still works when
        # config.epochs is 0 (it used to fail with an unbound local variable).
        cur_epoch_idx = 0
        for cur_epoch_idx in range(self.config.epochs):
            self._logger.info("Epoch[%d/%d]", cur_epoch_idx, self.config.epochs)

            self.train_model_epoch(cur_epoch_idx)

            if cur_epoch_idx % self.config.test_step == 0:
                metrics = self.evaluator.mini_test(cur_epoch_idx)

                # Early stop mechanism: the metric is checked after each mini-test.
                # Example: if test_step == 5, the trainer checks the metrics every 5 epochs.
                if self.early_stopper.should_stop(metrics):
                    break

        self.evaluator.full_test(cur_epoch_idx)
        self.evaluator.metric_calculator.save_test_summary(self.model.model_name)

        self.generator.stop()
        self.save_training_result()

        if self.config.save_model:
            self.save_model()

        if self.config.disp_result:
            self.display()

        if self.config.disp_summary:
            self.config.summary()
            self.config.summary_hyperparameter(self.model.model_name)

        self.export_embeddings()

        return cur_epoch_idx  # index of the last epoch that was run.

    def tune_model(self):
        """Train the model for a tuning trial and return the last epoch's loss.

        Returns:
            The value returned by train_model_epoch() for the last epoch, or
            ``float("inf")`` when ``self.config.epochs`` is 0.
        """
        current_loss = float("inf")

        self.generator = Generator(self.model)
        self.evaluator = Evaluator(self.model, tuning=True)

        # Pre-bind the index so full_test() can still be called when
        # config.epochs is 0 (it used to fail with an unbound local variable).
        cur_epoch_idx = 0
        for cur_epoch_idx in range(self.config.epochs):
            current_loss = self.train_model_epoch(cur_epoch_idx, tuning=True)

        self.evaluator.full_test(cur_epoch_idx)

        self.generator.stop()

        return current_loss

    def train_model_epoch(self, epoch_idx, tuning=False):
        """Train the model for one epoch and return the accumulated loss.

        Args:
            epoch_idx (int): Index of the epoch; stored together with the loss
                in ``self.training_results``.
            tuning (bool): When True the progress bar is not updated.

        Returns:
            The sum of the per-batch losses converted with ``.numpy()``, or the
            plain int 0 when the epoch contains no batches.
        """
        def to_ids(values):
            """Convert one field of a batch to an int32 tensor."""
            return tf.convert_to_tensor(values, dtype=tf.int32)

        acc_loss = 0

        # Debug runs use 10 batches; otherwise as many full batches as the
        # training triples provide.
        if self.config.debug:
            num_batch = 10
        else:
            num_batch = self.model.config.kg_meta.tot_train_triples // self.config.batch_size

        metrics_names = ['acc_loss', 'loss']
        progress_bar = tf.keras.utils.Progbar(num_batch, stateful_metrics=metrics_names)

        self.generator.start_one_epoch(num_batch)

        strategy = self.model.training_strategy
        for _ in range(num_batch):
            data = list(next(self.generator))

            # In every strategy the first three fields are converted to int32 ids.
            h, r, t = to_ids(data[0]), to_ids(data[1]), to_ids(data[2])

            if strategy == TrainingStrategy.PROJECTION_BASED:
                # data[3] and data[4] are handed over unconverted.
                loss = self.train_step_projection(h, r, t, data[3], data[4])
            elif strategy == TrainingStrategy.POINTWISE_BASED:
                y = tf.convert_to_tensor(data[3], dtype=tf.float32)
                loss = self.train_step_pointwise(h, r, t, y)
            else:
                # Pairwise: the remaining three fields are the negative triple ids.
                nh, nr, nt = to_ids(data[3]), to_ids(data[4]), to_ids(data[5])
                loss = self.train_step_pairwise(h, r, t, nh, nr, nt)

            acc_loss += loss

            if not tuning:
                progress_bar.add(1, values=[('acc_loss', acc_loss), ('loss', loss)])

        # With zero batches acc_loss is still the int 0, which has no .numpy();
        # convert only when a tensor was actually accumulated.
        epoch_loss = acc_loss.numpy() if hasattr(acc_loss, 'numpy') else acc_loss
        self.training_results.append([epoch_idx, epoch_loss])

        return epoch_loss
   
    def enter_interactive_mode(self):
        """Build and load the model, then run one example of each inference call."""
        self.build_model()
        self.load_model()

        self.evaluator = Evaluator(self.model)

        intro_tails = """The training/loading of the model has finished!
                                    Now enter interactive mode :)
                                    -----
                                    Example 1: trainer.infer_tails(1,10,topk=5)"""
        intro_heads = """-----
                                    Example 2: trainer.infer_heads(10,20,topk=5)"""
        intro_rels = """-----
                                    Example 3: trainer.infer_rels(1,20,topk=5)"""

        # Each example is announced in the log and then executed.
        examples = (
            (intro_tails, self.infer_tails, (1, 10)),
            (intro_heads, self.infer_heads, (10, 20)),
            (intro_rels, self.infer_rels, (1, 20)),
        )
        for announcement, infer, arguments in examples:
            self._logger.info(announcement)
            infer(*arguments, topk=5)

    def exit_interactive_mode(self):
        """Log the goodbye message of the interactive inference mode."""
        farewell = "Thank you for trying out inference interactive script :)"
        self._logger.info(farewell)

    def infer_tails(self, h, r, topk=5):
        """Log and return the predicted tails for a (head, relation) pair.

        Args:
            h: Id of the head entity.
            r: Id of the relation.
            topk (int): Passed to the evaluator's ``test_tail_rank``.

        Returns:
            dict: Predicted tail id -> entity label.
        """
        tails = self.evaluator.test_tail_rank(h, r, topk).numpy()

        knowledge_graph = self.model.config.knowledge_graph
        idx2ent = knowledge_graph.read_cache_data('idx2entity')
        idx2rel = knowledge_graph.read_cache_data('idx2relation')

        tail_ids = ",".join(str(i) for i in tails)
        logs = [
            "",
            "(head, relation)->({},{}) :: Inferred tails->({})".format(h, r, tail_ids),
            "",
            "head: %s" % idx2ent[h],
            "relation: %s" % idx2rel[r],
        ]
        logs.extend("%dth predicted tail: %s" % (rank, idx2ent[tail])
                    for rank, tail in enumerate(tails))

        self._logger.info("\n".join(logs))
        return {tail: idx2ent[tail] for tail in tails}

    def infer_heads(self, r, t, topk=5):
        """Log and return the predicted heads for a (relation, tail) pair.

        Args:
            r: Id of the relation.
            t: Id of the tail entity.
            topk (int): Passed to the evaluator's ``test_head_rank``.

        Returns:
            dict: Predicted head id -> entity label.
        """
        heads = self.evaluator.test_head_rank(r, t, topk).numpy()
        logs = []
        logs.append("")
        # The header is labelled "(relation,tail)", so the values go in as
        # (r, t); they used to be printed swapped.
        logs.append("(relation,tail)->({},{}) :: Inferred heads->({})".format(r, t, ",".join([str(i) for i in heads])))
        logs.append("")
        idx2ent = self.model.config.knowledge_graph.read_cache_data('idx2entity')
        idx2rel = self.model.config.knowledge_graph.read_cache_data('idx2relation')
        logs.append("tail: %s" % idx2ent[t])
        logs.append("relation: %s" % idx2rel[r])

        for idx, head in enumerate(heads):
            logs.append("%dth predicted head: %s" % (idx, idx2ent[head]))

        self._logger.info("\n".join(logs))
        return {head: idx2ent[head] for head in heads}

    def infer_rels(self, h, t, topk=5):
        """Log and return the predicted relations for a (head, tail) pair.

        Args:
            h: Id of the head entity.
            t: Id of the tail entity.
            topk (int): Passed to the evaluator's ``test_rel_rank``.

        Returns:
            dict: Predicted relation id -> relation label, or ``None`` for the
            models listed below, for which nothing is inferred.
        """
        if self.model.model_name.lower() in ["proje_pointwise", "conve", "tucker"]:
            # Supply the model name for the %s placeholder, which used to be
            # logged unfilled.
            self._logger.info("%s model doesn't support relation inference in nature.", self.model.model_name)
            return

        rels = self.evaluator.test_rel_rank(h, t, topk).numpy()
        logs = []
        logs.append("")
        logs.append("(head,tail)->({},{}) :: Inferred rels->({})".format(h, t, ",".join([str(i) for i in rels])))
        logs.append("")
        idx2ent = self.model.config.knowledge_graph.read_cache_data('idx2entity')
        idx2rel = self.model.config.knowledge_graph.read_cache_data('idx2relation')
        logs.append("head: %s" % idx2ent[h])
        logs.append("tail: %s" % idx2ent[t])

        for idx, rel in enumerate(rels):
            logs.append("%dth predicted rel: %s" % (idx, idx2rel[rel]))

        self._logger.info("\n".join(logs))
        return {rel: idx2rel[rel] for rel in rels}
    
    ''' Procedural functions:'''

    def save_model(self):
        """Save the model weights under ``config.path_tmp / <model name>``."""
        target_dir = self.config.path_tmp / self.model.model_name
        # Create the directory (and any missing parents) before writing.
        target_dir.mkdir(parents=True, exist_ok=True)
        weights_file = target_dir / 'model.vec'
        self.model.save_weights(str(weights_file))

    def load_model(self):
        """Load the model weights from ``config.path_tmp / <model name>`` if that path exists."""
        source_dir = self.config.path_tmp / self.model.model_name
        if not source_dir.exists():
            # Nothing has been saved for this model; leave the weights untouched.
            return
        self.model.load_weights(str(source_dir / 'model.vec'))

    def display(self):
        """Draw the plots that are enabled in the config.

        ``plot_embedding``, ``plot_training_result`` and ``plot_testing_result``
        each switch on one kind of plot; a new Visualization is created for
        every enabled plot.
        """
        # The relation plots are disabled when only entities should be plotted.
        with_relations = not self.config.plot_entity_only
        options = {
            "ent_only_plot": True,
            "rel_only_plot": with_relations,
            "ent_and_rel_plot": with_relations,
        }

        if self.config.plot_embedding:
            Visualization(model=self.model, vis_opts=options).plot_embedding(
                resultpath=self.config.figures,
                algos=self.model.model_name,
                show_label=False)

        if self.config.plot_training_result:
            Visualization(model=self.model).plot_train_result()

        if self.config.plot_testing_result:
            Visualization(model=self.model).plot_test_result()
    
    def export_embeddings(self):
        """
            Export embeddings in tsv and pandas pickled format.
            With tsvs (both label, vector files), you can:
            1) Use those pretrained embeddings for your applications.
            2) Visualize the embeddings in this website to gain insights. (https://projector.tensorflow.org/)

            Pandas dataframes can be read with pd.read_pickle('desired_file.pickle')

            All files are written to ``config.path_embeddings / <model name>``.
            Only two-dimensional parameters of ``model.parameter_list`` are
            exported; every other parameter is skipped.
        """
        save_path = self.config.path_embeddings / self.model.model_name
        save_path.mkdir(parents=True, exist_ok=True)

        knowledge_graph = self.model.config.knowledge_graph
        idx2ent = knowledge_graph.read_cache_data('idx2entity')
        idx2rel = knowledge_graph.read_cache_data('idx2relation')

        pd.Series(idx2ent).to_pickle(save_path / "ent_labels.pickle")
        pd.Series(idx2rel).to_pickle(save_path / "rel_labels.pickle")

        # Written as UTF-8 so labels with non-ASCII characters do not depend
        # on the platform's default encoding.
        for file_name, labels in (("ent_labels.tsv", idx2ent), ("rel_labels.tsv", idx2rel)):
            with open(str(save_path / file_name), 'w', encoding='utf-8') as l_export_file:
                l_export_file.writelines(label + "\n" for label in labels.values())

        for parameter in self.model.parameter_list:
            # Check the rank before touching shape[0], so parameters that are
            # not two-dimensional (including scalars) are skipped cleanly.
            if len(parameter.shape) != 2:
                continue

            stored_name = parameter.name.split(':')[0]
            all_embs = parameter.numpy()

            # One tab-separated line per row of the parameter.
            with open(str(save_path / ("%s.tsv" % stored_name)), 'w', encoding='utf-8') as v_export_file:
                for row in all_embs:
                    v_export_file.write("\t".join([str(x) for x in row]) + "\n")

            pd.DataFrame(all_embs).to_pickle(save_path / ("%s.pickle" % stored_name))

    def save_training_result(self):
        """Write the per-epoch losses to a new numbered csv file in ``config.path_result``."""
        result_dir = self.model.config.path_result
        existing_files = os.listdir(str(result_dir))

        # The file number is the count of files that already carry this
        # model's name and the word 'Training'.
        run_number = sum(
            1 for name in existing_files
            if self.model.model_name in name and 'Training' in name)

        frame = pd.DataFrame(self.training_results, columns=['Epochs', 'Loss'])
        target = self.model.config.path_result / (
            self.model.model_name + '_Training_results_' + str(run_number) + '.csv')
        with open(str(target), 'w') as fh:
            frame.to_csv(fh)
Пример #13
0
class Visualization:
    """Class to aid in visualizing the results and embddings.

        Args:
            model (object): Model object
            vis_opts (list): Options for visualization.
            sess (object): TensorFlow session object, initialized by the trainer.

        Examples:
            >>> from pykg2vec.utils.visualization import Visualization
            >>> from pykg2vec.utils.trainer import Trainer
            >>> from pykg2vec.models.TransE import TransE
            >>> model = TransE()
            >>> trainer = Trainer(model=model)
            >>> trainer.build_model()
            >>> trainer.train_model()
            >>> viz = Visualization(model=model)
            >>> viz.plot_train_result()
    """
    _logger = Logger().get_logger(__name__)

    def __init__(self, model, config, vis_opts=None):
        """Store the model and config and, if a model is given, sample the triples to plot.

        Args:
            model (object): Model object, or ``None`` when only the result
                plots (which read the config only) are needed.
            config (object): Configuration holding the knowledge graph cache
                and the result/figure paths.
            vis_opts (dict): Optional flags under the keys "ent_only_plot",
                "rel_only_plot" and "ent_and_rel_plot"; all three plots are
                disabled when it is empty or ``None``.
        """
        if vis_opts:
            self.ent_only_plot = vis_opts["ent_only_plot"]
            self.rel_only_plot = vis_opts["rel_only_plot"]
            self.ent_and_rel_plot = vis_opts["ent_and_rel_plot"]
        else:
            self.ent_only_plot = False
            self.rel_only_plot = False
            self.ent_and_rel_plot = False

        self.model = model
        self.config = config

        self.algo_list = [
            'ANALOGY', 'Complex', 'ComplexN3', 'ConvE', 'CP', 'DistMult',
            'DistMult2', 'HoLE', 'KG2E', 'NTN', 'ProjE_pointwise', 'Rescal',
            'RotatE', 'SimplE_avg', 'SimplE_ignr', 'SLM', 'SME_Bilinear',
            'SME_Linear', 'TransD', 'TransE', 'TransH', 'TransM', 'TransR',
            'TuckER'
        ]

        self.h_name = []
        self.r_name = []
        self.t_name = []

        self.h_emb = []
        self.r_emb = []
        self.t_emb = []

        self.h_proj_emb = []
        self.r_proj_emb = []
        self.t_proj_emb = []

        if self.model is not None:
            self.validation_triples_ids = self.config.knowledge_graph.read_cache_data(
                'triplets_valid')
            self.idx2entity = self.config.knowledge_graph.read_cache_data(
                'idx2entity')
            self.idx2relation = self.config.knowledge_graph.read_cache_data(
                'idx2relation')

            # Sampling needs the caches loaded above and model.embed(), so it
            # only runs when a model was given; without one it used to fail
            # on the missing attributes.
            self.get_idx_n_emb()

    def get_idx_n_emb(self):
        """Sample validation triples and collect their names and embeddings.

        ``config.disp_triple_num`` positions are drawn with ``np.random.choice``.
        For every drawn triple the labels and the embeddings returned by
        ``model.embed`` are appended to the ``*_name`` and ``*_emb`` lists.
        When ``ent_and_rel_plot`` is set, the same embed call is repeated to
        fill the ``*_proj_emb`` lists.
        """
        def as_index(value):
            """Wrap one id into a LongTensor on the configured device."""
            return torch.LongTensor([value]).to(self.config.device)

        positions = np.random.choice(len(self.validation_triples_ids),
                                     self.config.disp_triple_num)
        triples = [self.validation_triples_ids[position] for position in positions]

        for triple in triples:
            self.h_name.append(self.idx2entity[triple.h])
            self.r_name.append(self.idx2relation[triple.r])
            self.t_name.append(self.idx2entity[triple.t])

            emb_h, emb_r, emb_t = self.model.embed(
                as_index(triple.h), as_index(triple.r), as_index(triple.t))

            self.h_emb.append(emb_h)
            self.r_emb.append(emb_r)
            self.t_emb.append(emb_t)

            if not self.ent_and_rel_plot:
                continue

            # A failure here is logged and the triple is skipped for the
            # combined plot instead of aborting the sampling.
            try:
                emb_h, emb_r, emb_t = self.model.embed(
                    as_index(triple.h), as_index(triple.r), as_index(triple.t))
                self.h_proj_emb.append(emb_h)
                self.r_proj_emb.append(emb_r)
                self.t_proj_emb.append(emb_t)
            except Exception as e:
                self._logger.exception(e)

    def plot_embedding(self,
                       resultpath=None,
                       algos=None,
                       show_label=False,
                       disp_num_r_n_e=20):
        """Reduce the collected embeddings to 2D with TSNE and draw the enabled plots.

            Args:
                resultpath (str): Path where the result will be saved.
                show_label (bool): If True, will display the labels.
                algos (str): Name of the algorithms that generated the embedding.
                disp_num_r_n_e (int): Total number of entities to display for head, tail and relation.

        """
        assert self.model is not None, 'Please provide a model!'

        def reduce_to_2d(tensor):
            """Log the reduction step and project ``tensor`` to two dimensions."""
            self._logger.info("\t Reducing dimension using TSNE to 2!")
            return TSNE(n_components=2).fit_transform(tensor.detach().cpu())

        if self.ent_only_plot:
            # Heads followed by tails, with their names in the same order.
            stacked = torch.cat(self.h_emb + self.t_emb, dim=0)
            ent_names = np.concatenate((self.h_name, self.t_name), axis=0)
            points = np.asarray(reduce_to_2d(stacked))
            ent_names = np.asarray(ent_names)

            self.draw_embedding(points, ent_names, resultpath,
                                algos + '_entity_plot', show_label)

        if self.rel_only_plot:
            points = reduce_to_2d(torch.cat(self.r_emb, dim=0))
            self.draw_embedding(points, self.r_name, resultpath,
                                algos + '_rel_plot', show_label)

        if self.ent_and_rel_plot:
            length = len(self.h_proj_emb)
            stacked = torch.cat(
                self.h_proj_emb + self.r_proj_emb + self.t_proj_emb, dim=0)
            points = reduce_to_2d(stacked)

            # The reduced points keep the stacking order: heads, relations, tails.
            h_embs = points[:length, :]
            r_embs = points[length:2 * length, :]
            t_embs = points[2 * length:3 * length, :]

            limit = disp_num_r_n_e
            self.draw_embedding_rel_space(
                h_embs[:limit], r_embs[:limit], t_embs[:limit],
                self.h_name[:limit], self.r_name[:limit], self.t_name[:limit],
                resultpath, algos + '_ent_n_rel_plot', show_label)

    def plot_train_result(self):
        """Plot the training loss curves of all algorithms with stored results.

        For every name in ``self.algo_list`` the training csv with the highest
        number is looked up in ``config.path_result``. The 'Epochs' and 'Loss'
        columns of the files found are combined and drawn as one line per
        algorithm; the figure is saved as a numbered pdf in
        ``config.path_figures``. When no training csv is found, a warning is
        logged and no figure is written.
        """
        path = self.config.path_result
        result = self.config.path_figures
        data = [self.config.dataset_name]

        files_lwcase = [f.lower() for f in os.listdir(str(path))]
        for d in data:
            frames = []
            for a in self.algo_list:
                algo_key = a.lower()
                # The latest file number is the count of matching files minus one.
                file_no = len([
                    c for c in files_lwcase
                    if algo_key in c and 'training' in c
                ])
                if file_no < 1:
                    continue
                file_path = str(path / (algo_key + '_Training_results_' +
                                        str(file_no - 1) + '.csv'))
                if not os.path.exists(file_path):
                    continue
                with open(file_path, 'r') as fh:
                    df_2 = pd.read_csv(fh)

                df_3 = pd.DataFrame()
                df_3['Epochs'] = df_2['Epochs']
                df_3['Loss'] = df_2['Loss']
                df_3['Algorithm'] = [a] * len(df_2)
                frames.append(df_3)

            # Without any result file there are no columns to plot; skip the
            # figure instead of failing inside the plotting call.
            if not frames:
                self._logger.warning(
                    "No training result files found in %s; skipping the training loss plot.",
                    path)
                continue

            df = pd.concat(frames)
            plt.figure()
            seaborn.lineplot(x="Epochs",
                             y="Loss",
                             hue="Algorithm",
                             markers=True,
                             dashes=False,
                             data=df)

            # Number the figure by the training plots that already exist for this dataset.
            figure_files = [f.lower() for f in os.listdir(str(result))]
            file_no = len([
                c for c in figure_files if d.lower() in c and 'training' in c
            ])
            plt.savefig(str(
                result / (d + '_training_loss_plot_' + str(file_no) + '.pdf')),
                        bbox_inches='tight',
                        dpi=300)

    def plot_test_result(self):
        """Summarise the stored test results as tables and bar charts.

        For every name in ``self.algo_list`` one testing csv is read from
        ``config.path_result``: the last file in ``os.listdir`` order whose
        name contains the algorithm name and 'testing' (case-insensitive).
        The rows with the largest epoch value are kept and written to
        ``config.path_figures`` as a LaTeX table, a csv table, a bar chart of
        the metrics without 'Hits' in their name and a bar chart of the
        metrics with 'Hits' in their name. Output files are numbered by the
        count of matching files that already exist, plus one.
        """
        algo = self.algo_list
        path = self.config.path_result
        result = self.config.path_figures
        data = [self.config.dataset_name]
        hits = self.config.hits
        assert path is not None and algo is not None and data is not None, 'Please provide valid path, algorithm and dataset!'
        files = os.listdir(str(path))
        # files_lwcase = [f.lower() for f in files if 'Testing' in f]
        # self._logger.info(files_lwcase)
        for d in data:
            # Collect the test results of all algorithms in one frame.
            df = pd.DataFrame()
            for a in algo:
                file_algo = [
                    c for c in files if a.lower() in c.lower()
                    if 'testing' in c.lower()
                ]
                if not file_algo:
                    continue
                # Only the last matching file is read.
                with open(str(path / file_algo[-1]), 'r') as fh:
                    df_2 = pd.read_csv(fh)

                if df.empty:
                    # First algorithm with results: fill the combined frame directly.
                    df['Algorithm'] = [a] * len(df_2)
                    df['Epochs'] = df_2['Epoch']
                    df['Mean Rank'] = df_2['Mean Rank']
                    df['Filt Mean Rank'] = df_2['Filtered Mean Rank']

                    for hit in hits:
                        df['Hits' + str(hit)] = df_2['Hit-%d Ratio' % hit]
                        df['Filt Hits' +
                           str(hit)] = df_2['Filtered Hit-%d Ratio' % hit]

                else:
                    # Later algorithms: build the same columns and append them.
                    df_3 = pd.DataFrame()
                    df_3['Algorithm'] = [a] * len(df_2)
                    df_3['Epochs'] = df_2['Epoch']
                    df_3['Mean Rank'] = df_2['Mean Rank']
                    df_3['Filt Mean Rank'] = df_2['Filtered Mean Rank']

                    for hit in hits:
                        df_3['Hits' + str(hit)] = df_2['Hit-%d Ratio' % hit]
                        df_3['Filt Hits' +
                             str(hit)] = df_2['Filtered Hit-%d Ratio' % hit]

                    frames = [df, df_3]
                    df = pd.concat(frames)

            # From here on `files` lists the figures directory, not the result directory.
            files = os.listdir(str(result))
            # Keep only the rows of the largest epoch and drop the epoch column.
            df_4 = df.loc[df['Epochs'] == max(df['Epochs'])]
            df_4 = df_4.loc[:, df_4.columns != 'Epochs']

            file_no = len([
                c for c in files if d.lower() in c.lower()
                if 'testing' in c.lower() if 'latex' in c.lower()
            ])
            with open(
                    str(result / (d + '_testing_latex_table_' +
                                  str(file_no + 1) + '.txt')), 'w') as fh:
                fh.write(df_4.to_latex(index=False))

            file_no = len([
                c for c in files if d.lower() in c.lower()
                if 'testing' in c.lower() if 'table' in c.lower()
                if 'csv' in c.lower()
            ])
            with open(
                    str(result /
                        (d + '_testing_table_' + str(file_no + 1) + '.csv')),
                    'w') as fh:
                df_4.to_csv(fh, index=False)

            # Reshape to long form: one (Metrics, Algorithm, Score) row per metric and algorithm.
            df_5 = pd.DataFrame(columns=['Metrics', 'Algorithm', 'Score'])
            metrics = [f for f in df_4.columns if f != 'Algorithm']
            for i in range(len(df_4)):
                # import pdb
                # pdb.set_trace()
                if df_5.empty:
                    df_5['Algorithm'] = [df_4.iloc[i]['Algorithm']
                                         ] * len(metrics)
                    df_5['Metrics'] = metrics
                    df_5['Score'] = df_4.iloc[i][metrics].values
                else:
                    df_t = pd.DataFrame()
                    df_t['Algorithm'] = [df_4.iloc[i]['Algorithm']
                                         ] * len(metrics)
                    df_t['Metrics'] = metrics
                    df_t['Score'] = df_4.iloc[i][metrics].values
                    frame = [df_5, df_t]
                    df_5 = pd.concat(frame)

            # First chart: every metric whose name does not contain 'Hits'.
            df_6 = df_5[~df_5['Metrics'].str.contains('Hits')]
            plt.figure()
            flatui = [
                "#d46a7e", "#d5b60a", "#9b59b6", "#3498db", "#95a5a6",
                "#34495e", "#2ecc71", "#e74c3c"
            ]
            g = seaborn.barplot(x="Metrics",
                                y='Score',
                                hue="Algorithm",
                                palette=flatui,
                                data=df_6)
            g.legend(loc='upper center', bbox_to_anchor=(0.5, 1.14), ncol=6)
            g.tick_params(labelsize=6)
            # ax = seaborn.lineplot(x="Metrics", y='Score', hue="Algorithm",
            #                       markers=True, dashes=False, data=df_5)

            files_lwcase = [f.lower() for f in files]
            file_no = len([
                c for c in files_lwcase if d.lower() in c if 'testing' in c
                if 'rank_plot' in c
            ])
            plt.savefig(
                str(result /
                    (d + '_testing_rank_plot_' + str(file_no + 1) + '.pdf')),
                bbox_inches='tight',
                dpi=300)
            # plt.show()

            # Second chart: every metric whose name contains 'Hits'.
            df_6 = df_5[df_5['Metrics'].str.contains('Hits')]
            plt.figure()
            flatui = [
                "#3498db", "#95a5a6", "#34495e", "#2ecc71", "#e74c3c",
                "#d46a7e", "#d5b60a", "#9b59b6"
            ]
            g = seaborn.barplot(x="Metrics",
                                y='Score',
                                hue="Algorithm",
                                palette=flatui,
                                data=df_6)
            g.legend(loc='upper center', bbox_to_anchor=(0.5, 1.14), ncol=6)
            g.tick_params(labelsize=6)

            files_lwcase = [f.lower() for f in files]
            file_no = len([
                c for c in files_lwcase if d.lower() in c if 'testing' in c
                if 'hits_plot' in c
            ])
            plt.savefig(
                str(result /
                    (d + '_testing_hits_plot_' + str(file_no + 1) + '.pdf')),
                bbox_inches='tight',
                dpi=300)
            # plt.show()

    @staticmethod
    def draw_embedding(embs, names, resultpath, algos, show_label):
        """Function to draw the embedding.

            Args:
                embs (matrix): Two dimensional embeddings.
                names (list): List of string names, one per row of ``embs``.
                resultpath (str or path-like): Directory where the result will be saved;
                    it is created (with missing parents) if it does not exist.
                algos (str): Name of the algorithm which generated the embedding.
                show_label (bool): If True, prints the string names of the entities and relations.

        """
        palette = list(dict(mcolors.BASE_COLORS, **mcolors.CSS4_COLORS).keys())

        # One colour per distinct name; the palette wraps around when there
        # are more distinct names than colours.
        node_color_mp = {
            name: palette[i % len(palette)]
            for i, name in enumerate(set(names))
        }

        # One node per row of `embs`, placed at its 2D coordinates.
        G = nx.Graph()
        pos = {}
        hm_ent = {}
        for i, ent in enumerate(names):
            hm_ent[i] = ent
            G.add_node(i)
            pos[i] = embs[i]

        colors = [node_color_mp[hm_ent[n]] for n in G.nodes]

        plt.figure()
        nodes_draw = nx.draw_networkx_nodes(G,
                                            pos,
                                            node_color=colors,
                                            node_size=50)
        nodes_draw.set_edgecolor('k')
        if show_label:
            # Pass the index -> name mapping so the names are drawn, not the
            # bare node indices.
            nx.draw_networkx_labels(G, pos, labels=hm_ent, font_size=8)

        # Accept both str and path-like values (the old `resultpath / filename`
        # required a Path) and create missing parent directories as well.
        out_dir = os.fspath(resultpath)
        os.makedirs(out_dir, exist_ok=True)

        files = os.listdir(out_dir)
        file_no = len([c for c in files if algos + '_embedding_plot' in c])
        filename = algos + '_embedding_plot_' + str(file_no) + '.png'
        plt.savefig(os.path.join(out_dir, filename), bbox_inches='tight', dpi=300)
        # plt.show()

    @staticmethod
    def draw_embedding_rel_space(h_emb, r_emb, t_emb, h_name, r_name, t_name,
                                 resultpath, algos, show_label):
        """Function to draw the embedding in relation space.

        Each triple contributes three nodes (head, relation, tail) joined by
        the directed edges head -> relation -> tail. Heads are drawn as
        circles, relations as diamonds and tails as stars. The figure is saved
        as ``<algos>_embedding_plot_<n>.png`` inside ``resultpath``.

            Args:
                h_emb (matrix): Two dimensional embeddings of head.
                r_emb (matrix): Two dimensional embeddings of relation.
                t_emb (matrix): Two dimensional embeddings of tail.
                h_name (list): List of string name of the head.
                r_name (list): List of string name of the relation.
                t_name (list): List of string name of the tail.
                resultpath (str or Path): Directory where the result will be saved;
                    it is created when it does not exist yet.
                algos (str): Name of the algorithm which generated the embeddings.
                show_label (bool): If True, prints the string names of the entities and relations.

        """
        palette = list(dict(mcolors.BASE_COLORS, **mcolors.CSS4_COLORS).keys())
        tot_col = len(palette)
        # Entities and relations get independent colour maps; both cycle
        # through the palette when it is exhausted.
        node_color_mp_ent = {
            e: palette[i % tot_col]
            for i, e in enumerate(set(h_name) | set(t_name))
        }
        node_color_mp_rel = {
            r: palette[i % tot_col]
            for i, r in enumerate(set(r_name))
        }

        G = nx.DiGraph()
        pos = {}
        labels = {}
        head_nodes, rel_nodes, tail_nodes = [], [], []
        head_colors, rel_colors, tail_colors = [], [], []

        for i, (head, rel, tail) in enumerate(zip(h_name, r_name, t_name)):
            # Three consecutive node ids per triple.
            h_idx, r_idx, t_idx = 3 * i, 3 * i + 1, 3 * i + 2
            G.add_edge(h_idx, r_idx)
            G.add_edge(r_idx, t_idx)

            head_nodes.append(h_idx)
            rel_nodes.append(r_idx)
            tail_nodes.append(t_idx)

            head_colors.append(node_color_mp_ent[head])
            rel_colors.append(node_color_mp_rel[rel])
            tail_colors.append(node_color_mp_ent[tail])

            pos[h_idx] = h_emb[i]
            pos[r_idx] = r_emb[i]
            pos[t_idx] = t_emb[i]

            labels[h_idx] = head
            labels[r_idx] = rel
            labels[t_idx] = tail

        plt.figure()
        # NOTE: draw_networkx_nodes has no ``with_labels`` parameter; labels
        # are drawn separately below, so the stray keyword is not passed.
        for nodelist, node_colors, shape in ((head_nodes, head_colors, 'o'),
                                             (rel_nodes, rel_colors, 'D'),
                                             (tail_nodes, tail_colors, '*')):
            nodes_draw = nx.draw_networkx_nodes(G,
                                                pos,
                                                nodelist=nodelist,
                                                node_color=node_colors,
                                                node_shape=shape,
                                                node_size=50)
            nodes_draw.set_edgecolor('k')

        if show_label:
            # Nodes are integer ids, so the names have to be passed explicitly.
            nx.draw_networkx_labels(G, pos, labels=labels, font_size=8)
        nx.draw_networkx_edges(G, pos, arrows=True, width=0.5, alpha=0.5)

        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs(str(resultpath), exist_ok=True)

        files = os.listdir(str(resultpath))
        file_no = len([c for c in files if algos + '_embedding_plot' in c])
        filename = algos + '_embedding_plot_' + str(file_no) + '.png'
        # os.path.join accepts both str and pathlib.Path result directories.
        plt.savefig(os.path.join(str(resultpath), filename),
                    bbox_inches='tight',
                    dpi=300)
Пример #14
0
class Trainer:
    """ Class for handling the training of the algorithms.

        Args:
            model (object): KGE model object

        Examples:
            >>> from pykg2vec.utils.trainer import Trainer
            >>> from pykg2vec.models.TransE import TransE
            >>> trainer = Trainer(TransE())
            >>> trainer.build_model()
            >>> trainer.train_model()
    """
    _logger = Logger().get_logger(__name__)

    def __init__(self, model, config):
        self.model = model
        self.config = config

        self.training_results = []

        self.evaluator = None
        self.generator = None

    def build_model(self):
        """Move the model to the configured device and create the optimizer.

        The optimizer is selected by ``config.optimizer`` ("adam", "sgd",
        "adagrad" or "rms") and uses ``config.learning_rate``. Afterwards the
        configuration summary is printed and the early stopper is created.

        Raises:
            NotImplementedError: If ``config.optimizer`` is not supported.
        """
        self.model.to(self.config.device)

        supported = (
            ("adam", optim.Adam),
            ("sgd", optim.SGD),
            ("adagrad", optim.Adagrad),
            ("rms", optim.RMSprop),
        )
        for key, optimizer_cls in supported:
            if self.config.optimizer == key:
                self.optimizer = optimizer_cls(
                    self.model.parameters(),
                    lr=self.config.learning_rate,
                )
                break
        else:
            # No entry matched the configured optimizer name.
            raise NotImplementedError("No support for %s optimizer" %
                                      self.config.optimizer)

        self.config.summary()

        self.early_stopper = EarlyStopper(self.config.patience,
                                          Monitor.FILTERED_MEAN_RANK)

    ''' Training related functions:'''

    def train_step_pairwise(self, pos_h, pos_r, pos_t, neg_h, neg_r, neg_t):
        pos_preds = self.model(pos_h, pos_r, pos_t)
        neg_preds = self.model(neg_h, neg_r, neg_t)

        if self.config.sampling == 'adversarial_negative_sampling':
            # RotatE: Adversarial Negative Sampling and alpha is the temperature.
            pos_preds = -pos_preds
            neg_preds = -neg_preds
            pos_preds = F.logsigmoid(pos_preds)
            neg_preds = neg_preds.view((-1, self.config.neg_rate))
            softmax = nn.Softmax(dim=1)(neg_preds * self.config.alpha).detach()
            neg_preds = torch.sum(softmax * (F.logsigmoid(-neg_preds)), dim=-1)
            loss = -neg_preds.mean() - pos_preds.mean()
        else:
            # others that use margin-based & pairwise loss function. (uniform or bern)
            loss = pos_preds + self.config.margin - neg_preds
            loss = torch.max(loss, torch.zeros_like(loss)).sum()

        if hasattr(self.model, 'get_reg'):
            # now only NTN uses regularizer,
            # other pairwise based KGE methods use normalization to regularize parameters.
            loss += self.model.get_reg()

        return loss

    def train_step_projection(self, h, r, t, hr_t, tr_h):
        if self.model.model_name.lower(
        ) == "conve" or self.model.model_name.lower() == "tucker":
            if hasattr(self.config, 'label_smoothing'):
                hr_t = hr_t * (1.0 - self.config.label_smoothing
                               ) + 1.0 / self.config.tot_entity
                tr_h = tr_h * (1.0 - self.config.label_smoothing
                               ) + 1.0 / self.config.tot_entity

            pred_tails = self.model(h, r,
                                    direction="tail")  # (h, r) -> hr_t forward
            pred_heads = self.model(
                t, r, direction="head")  # (t, r) -> tr_h backward

            loss_tails = torch.mean(F.binary_cross_entropy(pred_tails, hr_t))
            loss_heads = torch.mean(F.binary_cross_entropy(pred_heads, tr_h))

            loss = loss_tails + loss_heads

        else:
            loss_tails = self.model(h, r, hr_t,
                                    direction="tail")  # (h, r) -> hr_t forward
            loss_heads = self.model(
                t, r, tr_h, direction="head")  # (t, r) -> tr_h backward

            loss = loss_tails + loss_heads

            if hasattr(self.model, 'get_reg'):
                # now only complex distmult uses regularizer in algorithms,
                loss += self.model.get_reg()

        return loss

    def train_step_pointwise(self, h, r, t, y):
        preds = self.model(h, r, t)
        loss = F.softplus(y * preds).mean()

        if hasattr(self.model, 'get_reg'
                   ):  # for complex & complex-N3 & DistMult & CP & ANALOGY
            loss += self.model.get_reg(h, r, t)

        return loss

    def train_model(self, monitor=Monitor.FILTERED_MEAN_RANK):
        """Function to train the model.

        Runs up to ``config.epochs`` epochs. Every ``config.test_step`` epochs
        a mini test is executed and its metrics are handed to the early
        stopper, which may end training. Afterwards the full test is run and
        results, model (optional), plots (optional) and embeddings are saved.

        Args:
            monitor: Not read by this method.

        Returns:
            int: Index of the last epoch that was run.
        """
        self.generator = Generator(self.model, self.config)
        self.evaluator = Evaluator(self.model, self.config)

        if self.config.load_from_data:
            self.load_model()

        for cur_epoch_idx in range(self.config.epochs):
            self._logger.info("Epoch[%d/%d]" %
                              (cur_epoch_idx, self.config.epochs))

            self.train_model_epoch(cur_epoch_idx)

            # Only every test_step-th epoch is evaluated.
            if cur_epoch_idx % self.config.test_step != 0:
                continue

            self.model.eval()
            metrics = self.evaluator.mini_test(cur_epoch_idx)

            # Early Stop Mechanism: checks after each mini-test whether the
            # metric is still improving. Example, if test_step == 5, the
            # trainer will check metrics every 5 epoch.
            if self.early_stopper.should_stop(metrics):
                break

        self.evaluator.full_test(cur_epoch_idx)
        self.evaluator.metric_calculator.save_test_summary(
            self.model.model_name)

        self.generator.stop()
        self.save_training_result()

        if self.config.save_model:
            self.save_model()

        if self.config.disp_result:
            self.display()

        self.export_embeddings()

        # the runned epoches.
        return cur_epoch_idx

    def tune_model(self):
        """Function to tune the model.

        Trains for ``config.epochs`` epochs in tuning mode, runs the full test
        and stops the generator.

        Returns:
            The accumulated loss of the last epoch, or ``float("inf")`` when no
            epoch assigned a loss.
        """
        self.generator = Generator(self.model, self.config)
        self.evaluator = Evaluator(self.model, self.config, tuning=True)

        last_epoch_loss = float("inf")
        for cur_epoch_idx in range(self.config.epochs):
            last_epoch_loss = self.train_model_epoch(cur_epoch_idx,
                                                     tuning=True)

        self.evaluator.full_test(cur_epoch_idx)

        self.generator.stop()

        return last_epoch_loss

    def train_model_epoch(self, epoch_idx, tuning=False):
        """Function to train the model for one epoch.

        Pulls ``num_batch`` batches from the generator, dispatches each one to
        the train step matching ``model.training_strategy``, back-propagates
        and steps the optimizer.

        Args:
            epoch_idx (int): Index stored together with the epoch loss.
            tuning (bool): If True, the progress bar description is not updated.

        Returns:
            The sum of the batch losses of this epoch.

        Raises:
            NotImplementedError: If the training strategy is unknown.
        """

        def as_long(values):
            # Index data is converted to a LongTensor on the configured device.
            return torch.LongTensor(values).to(self.config.device)

        if self.config.debug:
            # Debug runs are capped at 10 batches.
            num_batch = 10
        else:
            num_batch = self.config.tot_train_triples // self.config.batch_size

        self.generator.start_one_epoch(num_batch)

        progress_bar = tqdm(range(num_batch))
        acc_loss = 0

        for _ in progress_bar:
            data = list(next(self.generator))

            self.model.train()
            self.optimizer.zero_grad()

            strategy = self.model.training_strategy
            if strategy == TrainingStrategy.PROJECTION_BASED:
                h, r, t = as_long(data[0]), as_long(data[1]), as_long(data[2])
                hr_t = data[3].to(self.config.device)
                tr_h = data[4].to(self.config.device)
                loss = self.train_step_projection(h, r, t, hr_t, tr_h)
            elif strategy == TrainingStrategy.POINTWISE_BASED:
                h, r, t = as_long(data[0]), as_long(data[1]), as_long(data[2])
                y = as_long(data[3])
                loss = self.train_step_pointwise(h, r, t, y)
            elif strategy == TrainingStrategy.PAIRWISE_BASED:
                pos_h, pos_r, pos_t = (as_long(data[0]), as_long(data[1]),
                                       as_long(data[2]))
                neg_h, neg_r, neg_t = (as_long(data[3]), as_long(data[4]),
                                       as_long(data[5]))
                loss = self.train_step_pairwise(pos_h, pos_r, pos_t, neg_h,
                                                neg_r, neg_t)
            else:
                raise NotImplementedError("Unknown training strategy: %s" %
                                          self.model.training_strategy)

            loss.backward()
            self.optimizer.step()
            acc_loss += loss.item()

            if not tuning:
                progress_bar.set_description('acc_loss: %f, cur_loss: %f' %
                                             (acc_loss, loss))

        self.training_results.append([epoch_idx, acc_loss])

        return acc_loss

    def enter_interactive_mode(self):
        """Build and load the model, then log and run three inference examples.

        The examples call ``infer_tails(1, 10)``, ``infer_heads(10, 20)`` and
        ``infer_rels(1, 20)``, each with ``topk=5``.
        """
        self.build_model()
        self.load_model()

        self.evaluator = Evaluator(self.model, self.config)

        tails_banner = """The training/loading of the model has finished!
                                    Now enter interactive mode :)
                                    -----
                                    Example 1: trainer.infer_tails(1,10,topk=5)"""
        self._logger.info(tails_banner)
        self.infer_tails(1, 10, topk=5)

        heads_banner = """-----
                                    Example 2: trainer.infer_heads(10,20,topk=5)"""
        self._logger.info(heads_banner)
        self.infer_heads(10, 20, topk=5)

        rels_banner = """-----
                                    Example 3: trainer.infer_rels(1,20,topk=5)"""
        self._logger.info(rels_banner)
        self.infer_rels(1, 20, topk=5)

    def exit_interactive_mode(self):
        self._logger.info(
            "Thank you for trying out inference interactive script :)")

    def infer_tails(self, h, r, topk=5):
        tails = self.evaluator.test_tail_rank(h, r, topk).cpu().numpy()
        logs = [""]
        logs.append("(head, relation)->({},{}) :: Inferred tails->({})".format(
            h, r, ",".join([str(i) for i in tails])))
        logs.append("")
        idx2ent = self.config.knowledge_graph.read_cache_data('idx2entity')
        idx2rel = self.config.knowledge_graph.read_cache_data('idx2relation')
        logs.append("head: %s" % idx2ent[h])
        logs.append("relation: %s" % idx2rel[r])

        for idx, tail in enumerate(tails):
            logs.append("%dth predicted tail: %s" % (idx, idx2ent[tail]))

        self._logger.info("\n".join(logs))
        return {tail: idx2ent[tail] for tail in tails}

    def infer_heads(self, r, t, topk=5):
        heads = self.evaluator.test_head_rank(r, t, topk).cpu().numpy()
        logs = [""]
        logs.append("(relation,tail)->({},{}) :: Inferred heads->({})".format(
            t, r, ",".join([str(i) for i in heads])))
        logs.append("")
        idx2ent = self.config.knowledge_graph.read_cache_data('idx2entity')
        idx2rel = self.config.knowledge_graph.read_cache_data('idx2relation')
        logs.append("tail: %s" % idx2ent[t])
        logs.append("relation: %s" % idx2rel[r])

        for idx, head in enumerate(heads):
            logs.append("%dth predicted head: %s" % (idx, idx2ent[head]))

        self._logger.info("\n".join(logs))
        return {head: idx2ent[head] for head in heads}

    def infer_rels(self, h, t, topk=5):
        if self.model.model_name.lower() in [
                "proje_pointwise", "conve", "tucker"
        ]:
            self._logger.info(
                "%s model doesn't support relation inference in nature.")
            return

        rels = self.evaluator.test_rel_rank(h, t, topk).cpu().numpy()
        logs = [""]
        logs.append("(head,tail)->({},{}) :: Inferred rels->({})".format(
            h, t, ",".join([str(i) for i in rels])))
        logs.append("")
        idx2ent = self.config.knowledge_graph.read_cache_data('idx2entity')
        idx2rel = self.config.knowledge_graph.read_cache_data('idx2relation')
        logs.append("head: %s" % idx2ent[h])
        logs.append("tail: %s" % idx2ent[t])

        for idx, rel in enumerate(rels):
            logs.append("%dth predicted rel: %s" % (idx, idx2rel[rel]))

        self._logger.info("\n".join(logs))
        return {rel: idx2rel[rel] for rel in rels}

    # ''' Procedural functions:'''
    def save_model(self):
        """Function to save the model."""
        saved_path = self.config.path_tmp / self.model.model_name
        saved_path.mkdir(parents=True, exist_ok=True)
        torch.save(self.model.state_dict(), str(saved_path / 'model.vec.pt'))

    def load_model(self):
        """Function to load the model."""
        saved_path = self.config.path_tmp / self.model.model_name
        if saved_path.exists():
            self.model.load_state_dict(
                torch.load(str(saved_path / 'model.vec.pt')))
            self.model.eval()

    def display(self):
        """Function to display embedding.

        Depending on the ``plot_embedding``, ``plot_training_result`` and
        ``plot_testing_result`` flags of the config, the embedding plot, the
        training result plot and the testing result plot are produced.
        """
        # Relation plots are produced unless only entities are requested.
        include_relations = not self.config.plot_entity_only
        options = {
            "ent_only_plot": True,
            "rel_only_plot": include_relations,
            "ent_and_rel_plot": include_relations
        }

        if self.config.plot_embedding:
            embedding_viz = Visualization(self.model,
                                          self.config,
                                          vis_opts=options)
            embedding_viz.plot_embedding(resultpath=self.config.path_figures,
                                         algos=self.model.model_name,
                                         show_label=False)

        if self.config.plot_training_result:
            Visualization(self.model, self.config).plot_train_result()

        if self.config.plot_testing_result:
            Visualization(self.model, self.config).plot_test_result()

    def export_embeddings(self):
        """
            Export embeddings in tsv and pandas pickled format.
            With tsvs (both label, vector files), you can:
            1) Use those pretained embeddings for your applications.
            2) Visualize the embeddings in this website to gain insights. (https://projector.tensorflow.org/)

            Pandas dataframes can be read with pd.read_pickle('desired_file.pickle')
        """
        save_path = self.config.path_embeddings / self.model.model_name
        save_path.mkdir(parents=True, exist_ok=True)

        idx2ent = self.config.knowledge_graph.read_cache_data('idx2entity')
        idx2rel = self.config.knowledge_graph.read_cache_data('idx2relation')

        with open(str(save_path / "ent_labels.tsv"), 'w') as l_export_file:
            for label in idx2ent.values():
                l_export_file.write(label + "\n")

        with open(str(save_path / "rel_labels.tsv"), 'w') as l_export_file:
            for label in idx2rel.values():
                l_export_file.write(label + "\n")

        for named_embedding in self.model.parameter_list:
            all_ids = list(range(0, int(named_embedding.weight.shape[0])))

            stored_name = named_embedding.name

            if len(named_embedding.shape) == 2:
                all_embs = named_embedding.weight.detach().cpu().numpy()
                with open(str(save_path / ("%s.tsv" % stored_name)),
                          'w') as v_export_file:
                    for idx in all_ids:
                        v_export_file.write(
                            "\t".join([str(x) for x in all_embs[idx]]) + "\n")

    def save_training_result(self):
        """Function that saves training result"""
        files = os.listdir(str(self.config.path_result))
        l = len([
            f for f in files if self.model.model_name in f if 'Training' in f
        ])
        df = pd.DataFrame(self.training_results, columns=['Epochs', 'Loss'])
        with open(
                str(self.config.path_result /
                    (self.model.model_name + '_Training_results_' + str(l) +
                     '.csv')), 'w') as fh:
            df.to_csv(fh)
Пример #15
0
class MetricCalculator:
    '''
        MetricCalculator aims to
        1) address all the statistic tasks.
        2) provide interfaces for querying results.

        MetricCalculator is expected to be used by "evaluation_process".
    '''
    _logger = Logger().get_logger(__name__)

    def __init__(self, config):
        self.config = config

        self.hr_t = config.knowledge_graph.read_cache_data('hr_t')
        self.tr_h = config.knowledge_graph.read_cache_data('tr_h')

        # (f)mr  : (filtered) mean rank
        # (f)mrr : (filtered) mean reciprocal rank
        # (f)hit : (filtered) hit-k ratio
        self.mr = {}
        self.fmr = {}
        self.mrr = {}
        self.fmrr = {}
        self.hit = {}
        self.fhit = {}

        self.epoch = None

        self.reset()

    def reset(self):
        # temporarily used buffers and indexes.
        self.rank_head = []
        self.rank_tail = []
        self.f_rank_head = []
        self.f_rank_tail = []
        self.epoch = None
        self.start_time = timeit.default_timer()

    def append_result(self, result):
        predict_tail = result[0]
        predict_head = result[1]

        h, r, t = result[2], result[3], result[4]

        self.epoch = result[5]

        t_rank, f_t_rank = self.get_tail_rank(predict_tail, h, r, t)
        h_rank, f_h_rank = self.get_head_rank(predict_head, h, r, t)

        self.rank_head.append(h_rank)
        self.rank_tail.append(t_rank)
        self.f_rank_head.append(f_h_rank)
        self.f_rank_tail.append(f_t_rank)

    def get_tail_rank(self, tail_candidate, h, r, t):
        """Function to evaluate the tail rank.

           Args:
               id_replace_tail (list): List of the predicted tails for the given head, relation pair
               h (int): head id
               r (int): relation id
               t (int): tail id
               hr_t (dict): list of tails for the given hwS and relation pari.

            Returns:
                Tensors: Returns tail rank and filetered tail rank
        """
        trank = 0
        ftrank = 0

        for j in range(len(tail_candidate)):
            val = tail_candidate[-j - 1]
            if val != t:
                trank += 1
                ftrank += 1
                if val in self.hr_t[(h, r)]:
                    ftrank -= 1
            else:
                break

        return trank, ftrank

    def get_head_rank(self, head_candidate, h, r, t):
        """Function to evaluate the head rank.

           Args:
               head_candidate (list): List of the predicted head for the given tail, relation pair
               h (int): head id
               r (int): relation id
               t (int): tail id

            Returns:
                Tensors: Returns head  rank and filetered head rank
        """
        hrank = 0
        fhrank = 0

        for j in range(len(head_candidate)):
            val = head_candidate[-j - 1]
            if val != h:
                hrank += 1
                fhrank += 1
                if val in self.tr_h[(t, r)]:
                    fhrank -= 1
            else:
                break

        return hrank, fhrank

    def settle(self):
        head_ranks = np.asarray(self.rank_head, dtype=np.float32) + 1
        tail_ranks = np.asarray(self.rank_tail, dtype=np.float32) + 1
        head_franks = np.asarray(self.f_rank_head, dtype=np.float32) + 1
        tail_franks = np.asarray(self.f_rank_tail, dtype=np.float32) + 1

        ranks = np.concatenate((head_ranks, tail_ranks))
        franks = np.concatenate((head_franks, tail_franks))

        self.mr[self.epoch] = np.mean(ranks)
        self.mrr[self.epoch] = np.mean(np.reciprocal(ranks))
        self.fmr[self.epoch] = np.mean(franks)
        self.fmrr[self.epoch] = np.mean(np.reciprocal(franks))

        for hit in self.config.hits:
            self.hit[(self.epoch, hit)] = np.mean(ranks <= hit,
                                                  dtype=np.float32)
            self.fhit[(self.epoch, hit)] = np.mean(franks <= hit,
                                                   dtype=np.float32)

    def get_curr_scores(self):
        scores = {
            'mr': self.mr[self.epoch],
            'fmr': self.fmr[self.epoch],
            'mrr': self.mrr[self.epoch],
            'fmrr': self.fmrr[self.epoch]
        }
        return scores

    def save_test_summary(self, model_name):
        """Function to save the test of the summary.

            Args:
                model_name (str): specify the name of the model.

        """
        files = os.listdir(str(self.config.path_result))
        l = len([f for f in files if model_name in f if 'Testing' in f])
        with open(
                str(self.config.path_result /
                    (model_name + '_summary_' + str(l) + '.txt')), 'w') as fh:
            fh.write('----------------SUMMARY----------------\n')
            for key, val in self.config.__dict__.items():
                if 'gpu' in key:
                    continue
                if 'knowledge_graph' in key:
                    continue
                if not isinstance(val, str):
                    if isinstance(val, list):
                        v_tmp = '['
                        for i, v in enumerate(val):
                            if i == 0:
                                v_tmp += str(v)
                            else:
                                v_tmp += ',' + str(v)
                        v_tmp += ']'
                        val = v_tmp
                    else:
                        val = str(val)
                fh.write(key + ':' + val + '\n')
            fh.write('-----------------------------------------\n')
            fh.write(
                "\n----------Metadata Info for Dataset:%s----------------" %
                self.config.knowledge_graph.dataset_name)
            fh.write("Total Training Triples   :%d\n" %
                     self.config.tot_train_triples)
            fh.write("Total Testing Triples    :%d\n" %
                     self.config.tot_test_triples)
            fh.write("Total validation Triples :%d\n" %
                     self.config.tot_valid_triples)
            fh.write("Total Entities           :%d\n" % self.config.tot_entity)
            fh.write("Total Relations          :%d\n" %
                     self.config.tot_relation)
            fh.write("---------------------------------------------")

        columns = [
            'Epoch', 'Mean Rank', 'Filtered Mean Rank', 'Mean Reciprocal Rank',
            'Filtered Mean Reciprocal Rank'
        ]
        for hit in self.config.hits:
            columns += ['Hit-%d Ratio' % hit, 'Filtered Hit-%d Ratio' % hit]

        results = []
        for epoch, _ in self.mr.items():
            res_tmp = [
                epoch, self.mr[epoch], self.fmr[epoch], self.mrr[epoch],
                self.fmrr[epoch]
            ]

            for hit in self.config.hits:
                res_tmp.append(self.hit[(epoch, hit)])
                res_tmp.append(self.fhit[(epoch, hit)])

            results.append(res_tmp)

        df = pd.DataFrame(results, columns=columns)

        with open(
                str(self.config.path_result /
                    (model_name + '_Testing_results_' + str(l) + '.csv')),
                'a') as fh:
            df.to_csv(fh)

    def display_summary(self):
        """Function to print the test summary."""
        stop_time = timeit.default_timer()
        test_results = []
        test_results.append('')
        test_results.append(
            "------Test Results for %s: Epoch: %d --- time: %.2f------------" %
            (self.config.dataset_name, self.epoch,
             stop_time - self.start_time))
        test_results.append('--# of entities, # of relations: %d, %d' %
                            (self.config.tot_entity, self.config.tot_relation))
        test_results.append('--mr,  filtered mr             : %.4f, %.4f' %
                            (self.mr[self.epoch], self.fmr[self.epoch]))
        test_results.append('--mrr, filtered mrr            : %.4f, %.4f' %
                            (self.mrr[self.epoch], self.fmrr[self.epoch]))
        for hit in self.config.hits:
            test_results.append('--hits%d                        : %.4f ' %
                                (hit, (self.hit[(self.epoch, hit)])))
            test_results.append('--filtered hits%d               : %.4f ' %
                                (hit, (self.fhit[(self.epoch, hit)])))
        test_results.append(
            "---------------------------------------------------------")
        test_results.append('')
        self._logger.info("\n".join(test_results))
Пример #16
0
class Trainer:
    """ Class for handling the training of the algorithms.

        Args:
            model (object): KGE model object

        Examples:
            >>> from pykg2vec.utils.trainer import Trainer
            >>> from pykg2vec.models.pairwise import TransE
            >>> trainer = Trainer(TransE())
            >>> trainer.build_model()
            >>> trainer.train_model()
    """
    TRAINED_MODEL_FILE_NAME = "model.vec.pt"
    TRAINED_MODEL_CONFIG_NAME = "config.npy"
    _logger = Logger().get_logger(__name__)

    def __init__(self, model, config):
        self.model = model
        self.config = config

        self.best_metric = None
        self.monitor = None

        self.training_results = []

        self.evaluator = None
        self.generator = None
        self.optimizer = None
        self.early_stopper = None

    def build_model(self, monitor=Monitor.FILTERED_MEAN_RANK):
        """Prepare everything needed before training starts.

        Optionally restores a saved model, creates the evaluator, moves the
        model to the configured device, instantiates the optimizer selected
        by ``config.optimizer``, prints the config summary and creates the
        early stopper.

        Args:
            monitor (Monitor): metric handed to the ``EarlyStopper``.

        Raises:
            NotImplementedError: if ``config.optimizer`` is not one of
                "adam", "sgd", "adagrad", "rms" or "riemannian".
        """
        restore_from = self.config.load_from_data
        if restore_from is not None:
            # load_model replaces both self.model and self.config, so the
            # config is re-read from self below instead of being cached.
            self.load_model(restore_from)

        self.evaluator = Evaluator(self.model, self.config)

        self.model.to(self.config.device)

        # Optimizers that only need the parameters and a learning rate.
        plain_optimizers = {
            "adam": optim.Adam,
            "sgd": optim.SGD,
            "adagrad": optim.Adagrad,
            "rms": optim.RMSprop,
        }
        optimizer_name = self.config.optimizer

        if optimizer_name in plain_optimizers:
            optimizer_cls = plain_optimizers[optimizer_name]
            self.optimizer = optimizer_cls(
                self.model.parameters(),
                lr=self.config.learning_rate,
            )
        elif optimizer_name == "riemannian":
            # This optimizer additionally receives the parameter names.
            names = [name for name, _ in self.model.named_parameters()]
            self.optimizer = RiemannianOptimizer(
                self.model.parameters(),
                lr=self.config.learning_rate,
                param_names=names)
        else:
            raise NotImplementedError("No support for %s optimizer" %
                                      self.config.optimizer)

        self.config.summary()

        self.early_stopper = EarlyStopper(self.config.patience, monitor)

    # Training related functions:
    def train_step_pairwise(self, pos_h, pos_r, pos_t, neg_h, neg_r, neg_t):
        """Compute the loss of one pairwise batch.

        The model is called once on the positive triples and once on the
        negative triples. A model whose name is "rotate" (case-insensitive)
        gets ``config.neg_rate`` and ``config.alpha`` passed to its loss;
        every other model gets ``config.margin``. The model's regularisation
        term is added in place before returning.

        Returns:
            The loss object produced by ``model.loss`` plus ``model.get_reg``.
        """
        model = self.model
        positive_scores = model(pos_h, pos_r, pos_t)
        negative_scores = model(neg_h, neg_r, neg_t)

        if model.model_name.lower() == "rotate":
            loss_args = (self.config.neg_rate, self.config.alpha)
        else:
            loss_args = (self.config.margin, )

        loss = model.loss(positive_scores, negative_scores, *loss_args)
        loss += model.get_reg(None, None, None)

        return loss

    def train_step_projection(self, h, r, t, hr_t, tr_h):
        """Compute the loss of one projection-based batch.

        Models named "conve", "tucker", "interacte", "hyper" or "acre"
        (case-insensitive) are called with only two index arguments and their
        loss receives the targets plus ``config.label_smoothing`` and
        ``config.tot_entity`` (or ``None, None`` when the config has no
        ``label_smoothing`` attribute). All other models receive the target
        as a third positional argument and their loss takes only the two
        predictions. The regularisation term is added in place.

        Returns:
            The loss object produced by ``model.loss`` plus ``model.get_reg``.
        """
        targets_in_loss = ("conve", "tucker", "interacte", "hyper", "acre")
        model = self.model

        if model.model_name.lower() not in targets_in_loss:
            # (h, r) -> hr_t forward, then (t, r) -> tr_h backward
            pred_tails = model(h, r, hr_t, direction="tail")
            pred_heads = model(t, r, tr_h, direction="head")
            loss = model.loss(pred_heads, pred_tails)
        else:
            # (h, r) -> hr_t forward, then (t, r) -> tr_h backward
            pred_tails = model(h, r, direction="tail")
            pred_heads = model(t, r, direction="head")

            if hasattr(self.config, 'label_smoothing'):
                smoothing = self.config.label_smoothing
                entity_count = self.config.tot_entity
            else:
                smoothing = entity_count = None

            loss = model.loss(pred_heads, pred_tails, tr_h, hr_t, smoothing,
                              entity_count)

        loss += model.get_reg(h, r, t)

        return loss

    def train_step_pointwise(self, h, r, t, target):
        """Compute the loss of one pointwise batch.

        The target is converted to the same tensor type as the model's
        predictions before being passed to ``model.loss``; the model's
        regularisation term is then added in place.

        Returns:
            The loss object produced by ``model.loss`` plus ``model.get_reg``.
        """
        model = self.model
        scores = model(h, r, t)
        labels = target.type(scores.type())

        loss = model.loss(scores, labels)
        loss += model.get_reg(h, r, t)
        return loss

    def train_model(self):
        """Train the model, validate periodically and run the final test.

        For every epoch the model is trained once; every ``config.test_step``
        epochs a mini-test is run on the validation data. The early stopper
        may end training at that point. When ``config.save_model`` is set,
        the weights are saved whenever the monitored metric (always
        ``Monitor.FILTERED_MEAN_RANK`` here) improves. After the loop a full
        test is run, the test summary and training losses are saved, results
        are optionally displayed and the embeddings are exported.

        Returns:
            int: index of the last epoch that was started, or 0 when
            ``config.epochs`` is not positive and no epoch ran at all.
        """
        self.generator = Generator(self.model, self.config)
        self.monitor = Monitor.FILTERED_MEAN_RANK

        # Bound up front: with config.epochs == 0 the loop body never runs and
        # the code after the loop would otherwise raise UnboundLocalError.
        cur_epoch_idx = 0
        for cur_epoch_idx in range(self.config.epochs):
            self._logger.info("Epoch[%d/%d]" %
                              (cur_epoch_idx, self.config.epochs))

            self.train_model_epoch(cur_epoch_idx)

            if cur_epoch_idx % self.config.test_step != 0:
                continue

            self.model.eval()
            with torch.no_grad():
                metrics = self.evaluator.mini_test(cur_epoch_idx)

                # Early stop mechanism: the metric is only checked after a
                # mini-test, e.g. every 5 epochs when test_step == 5.
                if self.early_stopper.should_stop(metrics):
                    break

                # Store the best model weights.
                if self.config.save_model:
                    key = self.monitor.value
                    if self.best_metric is None:
                        is_better = True
                    elif self.monitor in (Monitor.MEAN_RANK,
                                          Monitor.FILTERED_MEAN_RANK):
                        # Rank metrics improve when they get smaller.
                        is_better = self.best_metric[key] > metrics[key]
                    else:
                        is_better = self.best_metric[key] < metrics[key]

                    if is_better:
                        self.save_model()
                        self.best_metric = metrics

        self.model.eval()
        with torch.no_grad():
            self.evaluator.full_test(cur_epoch_idx)

        self.evaluator.metric_calculator.save_test_summary(
            self.model.model_name)

        self.generator.stop()
        self.save_training_result()

        if self.config.disp_result:
            self.display()

        self.export_embeddings()

        return cur_epoch_idx  # index of the last epoch that was run

    def tune_model(self):
        """Train the model for hyper-parameter tuning and return its loss.

        A fresh generator and a tuning-mode evaluator are created, the model
        is trained for ``config.epochs`` epochs, a full test is run and the
        generator is stopped.

        Returns:
            float: the accumulated loss of the last epoch, or ``inf`` when
            ``config.epochs`` is not positive and no epoch ran.
        """
        current_loss = float("inf")

        self.generator = Generator(self.model, self.config)
        self.evaluator = Evaluator(self.model, self.config, tuning=True)

        # Bound up front: with config.epochs == 0 the loop body never runs and
        # full_test below would otherwise raise UnboundLocalError.
        cur_epoch_idx = 0
        for cur_epoch_idx in range(self.config.epochs):
            current_loss = self.train_model_epoch(cur_epoch_idx, tuning=True)

        self.model.eval()
        with torch.no_grad():
            self.evaluator.full_test(cur_epoch_idx)

        self.generator.stop()

        return current_loss

    def train_model_epoch(self, epoch_idx, tuning=False):
        """Train the model for one epoch and return the accumulated loss.

        Args:
            epoch_idx (int): index stored next to the loss in
                ``self.training_results``.
            tuning (bool): when True the progress-bar description is not
                updated.

        Returns:
            The sum of ``loss.item()`` over all batches of the epoch.

        Raises:
            NotImplementedError: if the model's training strategy is not one
                of the three handled ``TrainingStrategy`` members.
        """
        cfg = self.config
        device = cfg.device

        # Debug runs are capped at 10 batches.
        if cfg.debug:
            num_batch = 10
        else:
            num_batch = cfg.tot_train_triples // cfg.batch_size

        def to_index_tensor(part):
            """Convert one batch component to a LongTensor on the device."""
            return torch.LongTensor(part).to(device)

        epoch_loss = 0
        self.generator.start_one_epoch(num_batch)
        batches = tqdm(range(num_batch))

        for _ in batches:
            data = list(next(self.generator))
            self.model.train()
            self.optimizer.zero_grad()

            strategy = self.model.training_strategy
            if strategy == TrainingStrategy.PROJECTION_BASED:
                # First three entries are indices; entries 3 and 4 are moved
                # to the device as they are.
                h, r, t = [to_index_tensor(part) for part in data[:3]]
                hr_t = data[3].to(device)
                tr_h = data[4].to(device)
                loss = self.train_step_projection(h, r, t, hr_t, tr_h)
            elif strategy == TrainingStrategy.POINTWISE_BASED:
                h, r, t, y = [to_index_tensor(part) for part in data[:4]]
                loss = self.train_step_pointwise(h, r, t, y)
            elif strategy == TrainingStrategy.PAIRWISE_BASED:
                triples = [to_index_tensor(part) for part in data[:6]]
                loss = self.train_step_pairwise(*triples)
            else:
                raise NotImplementedError("Unknown training strategy: %s" %
                                          self.model.training_strategy)

            loss.backward()
            self.optimizer.step()
            epoch_loss += loss.item()

            if not tuning:
                batches.set_description('acc_loss: %f, cur_loss: %f' %
                                        (epoch_loss, loss))

        self.training_results.append([epoch_idx, epoch_loss])

        return epoch_loss

    def enter_interactive_mode(self):
        """Build and load the model, then log and run three sample inferences.

        The three examples call ``infer_tails(1, 10)``, ``infer_heads(10, 20)``
        and ``infer_rels(1, 20)``, each with ``topk=5``.
        """
        self.build_model()
        self.load_model()

        intro = """The training/loading of the model has finished!
                                    Now enter interactive mode :)
                                    -----
                                    Example 1: trainer.infer_tails(1,10,topk=5)"""
        self._logger.info(intro)
        self.infer_tails(1, 10, topk=5)

        heads_example = """-----
                                    Example 2: trainer.infer_heads(10,20,topk=5)"""
        self._logger.info(heads_example)
        self.infer_heads(10, 20, topk=5)

        rels_example = """-----
                                    Example 3: trainer.infer_rels(1,20,topk=5)"""
        self._logger.info(rels_example)
        self.infer_rels(1, 20, topk=5)

    def exit_interactive_mode(self):
        """Log the closing message of the interactive inference session."""
        farewell = "Thank you for trying out inference interactive script :)"
        self._logger.info(farewell)

    def infer_tails(self, h, r, topk=5):
        """Predict and log the top-k tail entities for a (head, relation) pair.

        Args:
            h: index of the head entity.
            r: index of the relation.
            topk (int): number of predictions requested from the evaluator.

        Returns:
            dict: predicted tail index -> entity label looked up in the cached
            ``idx2entity`` mapping.
        """
        ranked = self.evaluator.test_tail_rank(h, r, topk)
        tails = ranked.detach().cpu().numpy()

        graph = self.config.knowledge_graph
        idx2ent = graph.read_cache_data('idx2entity')
        idx2rel = graph.read_cache_data('idx2relation')

        joined_ids = ",".join(str(i) for i in tails)
        logs = [
            "",
            "(head, relation)->({},{}) :: Inferred tails->({})".format(
                h, r, joined_ids),
            "",
            "head: %s" % idx2ent[h],
            "relation: %s" % idx2rel[r],
        ]
        logs.extend("%dth predicted tail: %s" % (position, idx2ent[tail])
                    for position, tail in enumerate(tails))

        self._logger.info("\n".join(logs))
        return {tail: idx2ent[tail] for tail in tails}

    def infer_heads(self, r, t, topk=5):
        """Predict and log the top-k head entities for a (relation, tail) pair.

        Args:
            r: index of the relation.
            t: index of the tail entity.
            topk (int): number of predictions requested from the evaluator.

        Returns:
            dict: predicted head index -> entity label looked up in the cached
            ``idx2entity`` mapping.
        """
        heads = self.evaluator.test_head_rank(r, t,
                                              topk).detach().cpu().numpy()
        idx2ent = self.config.knowledge_graph.read_cache_data('idx2entity')
        idx2rel = self.config.knowledge_graph.read_cache_data('idx2relation')
        logs = [
            "",
            # The label reads "(relation,tail)", so the ids must be given in
            # that order (they used to be printed swapped).
            "(relation,tail)->({},{}) :: Inferred heads->({})".format(
                r, t, ",".join(str(i) for i in heads)),
            "",
            "tail: %s" % idx2ent[t],
            "relation: %s" % idx2rel[r],
        ]

        for idx, head in enumerate(heads):
            logs.append("%dth predicted head: %s" % (idx, idx2ent[head]))

        self._logger.info("\n".join(logs))
        return {head: idx2ent[head] for head in heads}

    def infer_rels(self, h, t, topk=5):
        """Predict and log the top-k relations for a (head, tail) pair.

        Models named "proje_pointwise", "conve" or "tucker"
        (case-insensitive) are rejected: a message is logged and an empty
        dict is returned.

        Args:
            h: index of the head entity.
            t: index of the tail entity.
            topk (int): number of predictions requested from the evaluator.

        Returns:
            dict: predicted relation index -> relation label looked up in the
            cached ``idx2relation`` mapping, or ``{}`` for rejected models.
        """
        if self.model.model_name.lower() in [
                "proje_pointwise", "conve", "tucker"
        ]:
            # The placeholder used to be logged verbatim because no value was
            # ever substituted into the message.
            self._logger.info(
                "%s model doesn't support relation inference in nature." %
                self.model.model_name)
            return {}

        rels = self.evaluator.test_rel_rank(h, t, topk).detach().cpu().numpy()
        idx2ent = self.config.knowledge_graph.read_cache_data('idx2entity')
        idx2rel = self.config.knowledge_graph.read_cache_data('idx2relation')
        logs = [
            "",
            "(head,tail)->({},{}) :: Inferred rels->({})".format(
                h, t, ",".join(str(i) for i in rels)),
            "",
            "head: %s" % idx2ent[h],
            "tail: %s" % idx2ent[t],
        ]

        for idx, rel in enumerate(rels):
            logs.append("%dth predicted rel: %s" % (idx, idx2rel[rel]))

        self._logger.info("\n".join(logs))
        return {rel: idx2rel[rel] for rel in rels}

    # ''' Procedural functions:'''
    def save_model(self):
        """Save the model weights and the config under ``config.path_tmp``.

        Both files are written to ``<path_tmp>/<model_name>/`` (created if
        missing) using the class-level file names.
        """
        target_dir = self.config.path_tmp / self.model.model_name
        target_dir.mkdir(parents=True, exist_ok=True)

        weights_file = target_dir / self.TRAINED_MODEL_FILE_NAME
        torch.save(self.model.state_dict(), str(weights_file))

        # The config object (hyper-parameters included) is stored next to the
        # weights through numpy's save routine.
        config_file = target_dir / self.TRAINED_MODEL_CONFIG_NAME
        np.save(config_file, self.config)

    def load_model(self, model_path=None):
        """Restore the config and model weights written by ``save_model``.

        Args:
            model_path: directory holding the saved files. When None,
                ``<config.path_tmp>/<model_name>`` is used.

        Raises:
            ValueError: if the weights file or the config file is missing.
        """
        if model_path is None:
            model_dir = self.config.path_tmp / self.model.model_name
        else:
            model_dir = Path(model_path)

        model_path_file = model_dir / self.TRAINED_MODEL_FILE_NAME
        model_path_config = model_dir / self.TRAINED_MODEL_CONFIG_NAME

        if not (model_path_file.exists() and model_path_config.exists()):
            raise ValueError("Cannot load model from %s" % model_path_file)

        # NOTE(review): allow_pickle=True and torch.load both unpickle the
        # files' content, so only load models from trusted locations.
        restored_config = np.load(model_path_config, allow_pickle=True).item()
        # Keep the caller's load_from_data setting instead of the saved one.
        restored_config.__dict__['load_from_data'] = self.config.__dict__[
            'load_from_data']
        self.config = restored_config

        # Rebuild the model from the restored config before loading weights.
        _, model_def = Importer().import_model_config(
            self.config.model_name.lower())
        self.model = model_def(**self.config.__dict__)
        self.model.load_state_dict(torch.load(str(model_path_file)))
        self.model.eval()

    def display(self):
        """Plot embeddings and training/testing results as configured.

        Each of the three plots is produced only when its ``config.plot_*``
        flag is truthy. Relation plots are switched off in the embedding
        plot when ``config.plot_entity_only`` is set.
        """
        with_relations = not self.config.plot_entity_only
        options = {
            "ent_only_plot": True,
            "rel_only_plot": with_relations,
            "ent_and_rel_plot": with_relations,
        }

        if self.config.plot_embedding:
            embedding_viz = Visualization(self.model,
                                          self.config,
                                          vis_opts=options)
            embedding_viz.plot_embedding(resultpath=self.config.path_figures,
                                         algos=self.model.model_name,
                                         show_label=False)

        if self.config.plot_training_result:
            Visualization(self.model, self.config).plot_train_result()

        if self.config.plot_testing_result:
            Visualization(self.model, self.config).plot_test_result()

    def export_embeddings(self):
        """Export entity/relation labels and 2-D embeddings as tsv files.

        Files are written to ``<config.path_embeddings>/<model_name>/``:
        ``ent_labels.tsv`` and ``rel_labels.tsv`` hold one label per line, and
        every entry of ``model.parameter_list`` whose weight is 2-D gets a
        ``<name>.tsv`` file with one tab-separated vector per line. The tsv
        files can be used as pretrained embeddings or loaded into
        https://projector.tensorflow.org/ for visualisation.
        """
        out_dir = self.config.path_embeddings / self.model.model_name
        out_dir.mkdir(parents=True, exist_ok=True)

        graph = self.config.knowledge_graph
        idx2ent = graph.read_cache_data('idx2entity')
        idx2rel = graph.read_cache_data('idx2relation')

        with open(str(out_dir / "ent_labels.tsv"), 'w') as label_file:
            label_file.writelines(label + "\n" for label in idx2ent.values())

        with open(str(out_dir / "rel_labels.tsv"), 'w') as label_file:
            label_file.writelines(label + "\n" for label in idx2rel.values())

        for named_embedding in self.model.parameter_list:
            weight = named_embedding.weight
            row_count = int(weight.shape[0])
            stored_name = named_embedding.name

            # Only matrices are exported; other shapes are skipped.
            if len(weight.shape) != 2:
                continue

            vectors = weight.detach().cpu().numpy()
            vector_path = out_dir / ("%s.tsv" % stored_name)
            with open(str(vector_path), 'w') as vector_file:
                vector_file.writelines(
                    "\t".join(str(x) for x in vectors[row]) + "\n"
                    for row in range(row_count))
    def save_training_result(self):
        """Write the per-epoch losses to a new csv file in ``config.path_result``.

        The file is named ``<model_name>_Training_results_<k>.csv`` where
        ``k`` is the number of existing files in the directory whose name
        contains both the model name and "Training".
        """
        result_dir = self.config.path_result
        existing = os.listdir(str(result_dir))
        run_index = sum(1 for fname in existing
                        if self.model.model_name in fname
                        and 'Training' in fname)

        frame = pd.DataFrame(self.training_results, columns=['Epochs', 'Loss'])
        file_name = (self.model.model_name + '_Training_results_' +
                     str(run_index) + '.csv')
        with open(str(result_dir / file_name), 'w') as fh:
            frame.to_csv(fh)
Пример #17
0
class Evaluator:
    """Class to perform evaluation of the model.

        Args:
            model (object): Model object
            tuning (bool): Flag to denoting tuning if True

        Examples:
            >>> from pykg2vec.utils.evaluator import Evaluator
            >>> evaluator = Evaluator(model=model, tuning=True)
            >>> evaluator.test_batch(Session(), 0)
            >>> acc = evaluator.output_queue.get()
            >>> evaluator.stop()
    """
    _logger = Logger().get_logger(__name__)

    def __init__(self, model, config, tuning=False):
        """Store the model/config and load the cached test and validation triples.

        Args:
            model (object): model object to evaluate.
            config (object): configuration object; its ``knowledge_graph``
                supplies the cached triples.
            tuning (bool): stored on the instance as ``self.tuning``.
        """
        self.model, self.config, self.tuning = model, config, tuning

        graph = self.config.knowledge_graph
        self.test_data = graph.read_cache_data('triplets_test')
        self.eval_data = graph.read_cache_data('triplets_valid')

        self.metric_calculator = MetricCalculator(self.config)

    def test_tail_rank(self, h, r, topk=-1):
        if hasattr(self.model, 'predict_tail_rank'):
            rank = self.model.predict_tail_rank(
                torch.LongTensor([h]).to(self.config.device),
                torch.LongTensor([r]).to(self.config.device),
                topk=topk)
            return rank.squeeze(0)

        h_batch = torch.LongTensor([h]).repeat([self.config.tot_entity
                                                ]).to(self.config.device)
        r_batch = torch.LongTensor([r]).repeat([self.config.tot_entity
                                                ]).to(self.config.device)
        entity_array = torch.LongTensor(list(range(
            self.config.tot_entity))).to(self.config.device)

        preds = self.model.forward(h_batch, r_batch, entity_array)
        _, rank = torch.topk(preds, k=topk)
        return rank

    def test_head_rank(self, r, t, topk=-1):
        if hasattr(self.model, 'predict_head_rank'):
            rank = self.model.predict_head_rank(
                torch.LongTensor([t]).to(self.config.device),
                torch.LongTensor([r]).to(self.config.device),
                topk=topk)
            return rank.squeeze(0)

        entity_array = torch.LongTensor(list(range(
            self.config.tot_entity))).to(self.config.device)
        r_batch = torch.LongTensor([r]).repeat([self.config.tot_entity
                                                ]).to(self.config.device)
        t_batch = torch.LongTensor([t]).repeat([self.config.tot_entity
                                                ]).to(self.config.device)

        preds = self.model.forward(entity_array, r_batch, t_batch)
        _, rank = torch.topk(preds, k=topk)
        return rank

    def test_rel_rank(self, h, t, topk=-1):
        if hasattr(self.model, 'predict_rel_rank'):
            # TODO: This is not implemented for conve, convkb, proje_pointwise, tucker, interacte and hyper
            rank = self.model.predict_rel_rank(h.to(self.config.device),
                                               t.to(self.config.device),
                                               topk=topk)
            return rank.squeeze(0)

        h_batch = torch.LongTensor([h]).repeat([self.config.tot_relation
                                                ]).to(self.config.device)
        rel_array = torch.LongTensor(list(range(self.config.tot_relation))).to(
            self.config.device)
        t_batch = torch.LongTensor([t]).repeat([self.config.tot_relation
                                                ]).to(self.config.device)

        preds = self.model.forward(h_batch, rel_array, t_batch)
        _, rank = torch.topk(preds, k=topk)
        return rank

    def mini_test(self, epoch=None):
        """Evaluate on (a prefix of) the validation triples.

        ``config.test_num == 0`` means all validation triples; otherwise at
        most ``config.test_num`` are used. In debug mode the count is forced
        to 10.

        Returns:
            Whatever ``self.test`` returns for the selected triples.
        """
        available = len(self.eval_data)
        requested = self.config.test_num

        tot_valid_to_test = (available
                             if requested == 0 else min(requested, available))
        if self.config.debug:
            tot_valid_to_test = 10

        self._logger.info("Mini-Testing on [%d/%d] Triples in the valid set." %
                          (tot_valid_to_test, len(self.eval_data)))
        return self.test(self.eval_data, tot_valid_to_test, epoch=epoch)

    def full_test(self, epoch=None):
        """Evaluate on all test triples (only 10 in debug mode).

        Returns:
            Whatever ``self.test`` returns for the selected triples.
        """
        total = len(self.test_data)
        tot_valid_to_test = 10 if self.config.debug else total

        self._logger.info("Full-Testing on [%d/%d] Triples in the test set." %
                          (tot_valid_to_test, len(self.test_data)))
        return self.test(self.test_data, tot_valid_to_test, epoch=epoch)

    def test(self, data, num_of_test, epoch=None):
        """Rank heads and tails for the first ``num_of_test`` triples of ``data``.

        For every triple the full head and tail rankings (over
        ``config.tot_entity`` candidates) are handed to the metric calculator.
        Afterwards the metrics are settled and displayed, and the test summary
        is saved once the calculator's epoch has reached the last configured
        epoch.

        Args:
            data: indexable collection of triples exposing ``h``, ``r``, ``t``.
            num_of_test (int): number of leading triples to evaluate.
            epoch: value stored with each result for the metric calculator.

        Returns:
            The current scores reported by the metric calculator.
        """
        calculator = self.metric_calculator
        calculator.reset()

        for i in tqdm(range(num_of_test)):
            triple = data[i]
            h, r, t = triple.h, triple.r, triple.t

            # Wrap the indices so the rank helpers receive tensors.
            h_tensor = torch.LongTensor([h])
            r_tensor = torch.LongTensor([r])
            t_tensor = torch.LongTensor([t])

            # Asking for tot_entity results yields a full ranking.
            hrank = self.test_head_rank(r_tensor, t_tensor,
                                        self.config.tot_entity)
            trank = self.test_tail_rank(h_tensor, r_tensor,
                                        self.config.tot_entity)

            calculator.append_result([
                trank.detach().cpu().numpy(),
                hrank.detach().cpu().numpy(), h, r, t, epoch
            ])

        calculator.settle()
        calculator.display_summary()

        if calculator.epoch >= self.config.epochs - 1:
            calculator.save_test_summary(self.model.model_name)

        return calculator.get_curr_scores()
Пример #18
0
class BaysOptimizer(object):
    """Bayesian optimizer class for tuning hyperparameter.

      This class implements the Bayesian Optimizer for tuning the 
      hyper-parameter.

      Args:
        args (object): The Argument Parser object providing arguments.
        name_dataset (str): The name of the dataset.
        sampling (str): sampling to be used for generating negative triples


      Examples:
        >>> from pykg2vec.config.hyperparams import KGETuneArgParser
        >>> from pykg2vec.utils.bayesian_optimizer import BaysOptimizer
        >>> model = Complex()
        >>> args = KGETuneArgParser().get_args(sys.argv[1:])
        >>> bays_opt = BaysOptimizer(args=args)
        >>> bays_opt.optimize()
    """
    _logger = Logger().get_logger(__name__)

    def __init__(self, args=None):
        """Resolve the model, build its trainer and load the search space.

        Args:
            args (object): parsed tuning arguments; ``model``,
                ``dataset_name``, ``dataset_path``, ``debug`` and
                ``max_number_trials`` are read from it.

        Raises:
            Exception: if the requested model is one of the models that are
                not supported for hyper-parameter tuning.
            ModuleNotFoundError: if the model, config or hyper-parameter
                module cannot be imported.
        """
        model_name = args.model.lower()
        if model_name in ["tucker", "tucker_v2", "conve", "convkb", "proje_pointwise"]:
            raise Exception("Model %s has not been supported in tuning hyperparameters!" % args.model)

        self.args = args
        self.knowledge_graph = KnowledgeGraph(dataset=args.dataset_name, custom_dataset_path=args.dataset_path)
        try:
            self.model_obj = getattr(importlib.import_module(model_path + ".%s" % moduleMap[model_name]),
                                     modelMap[model_name])
            self.config_obj = getattr(importlib.import_module(config_path), configMap[model_name])
            hyper_params = getattr(importlib.import_module(hyper_param_path), hypMap[model_name])()
        except ModuleNotFoundError:
            self._logger.error("%s not implemented! Select from: %s" % \
                               (model_name, ' '.join(map(str, modelMap.values()))))
            # Nothing below can work without the imported classes, so fail
            # here with the real cause instead of a later AttributeError.
            raise

        from pykg2vec.config.config import KGEArgParser
        kge_args = KGEArgParser().get_args([])
        kge_args.dataset_name = args.dataset_name
        kge_args.debug = self.args.debug
        config = self.config_obj(kge_args)
        model = self.model_obj(config)

        self.trainer = Trainer(model)

        self.search_space = hyper_params.search_space
        # Debug runs evaluate a single trial only.
        self.max_evals = self.args.max_number_trials if not self.args.debug else 1
        
    def optimize(self):
        """Run the hyper-parameter search and record every trial.

        The best setting found by ``fmin`` is kept in ``self.best_result``.
        One row per trial (iteration, chosen values, loss) is written to
        ``<path_result>/<model_name>/trials.csv`` and logged, followed by the
        best setting.
        """
        trials = Trials()

        self.best_result = fmin(fn=self.get_loss,
                                space=self.search_space,
                                trials=trials,
                                algo=tpe.suggest,
                                max_evals=self.max_evals)

        param_names = list(self.search_space.keys())
        results = pd.DataFrame(columns=['iteration'] + param_names + ['loss'])

        for idx, trial in enumerate(trials.trials):
            # Each recorded value is a one-element list; unwrap it before
            # translating it back through the search space.
            sampled = {k: v[0] for k, v in trial['misc']['vals'].items()}
            chosen = space_eval(self.search_space, sampled)
            row = [idx] + [chosen[name] for name in param_names]
            row.append(trial['result']['loss'])
            results.loc[idx] = row

        out_dir = self.trainer.config.path_result / self.trainer.model.model_name
        out_dir.mkdir(parents=True, exist_ok=True)
        results.to_csv(str(out_dir / "trials.csv"), index=False)

        self._logger.info(results)
        self._logger.info('Found Golden Setting:')
        self._logger.info(space_eval(self.search_space, self.best_result))

    def return_best(self):
        """Return the best hyper-parameters translated through the search space.

        Reads ``self.best_result``, which ``optimize`` assigns.
        """
        space, best = self.search_space, self.best_result
        return space_eval(space, best)

    def get_loss(self, params):
        """Train one trial with ``params`` and report its loss.

        Args:
            params (dict): hyper-parameter name -> value for this trial.

        Returns:
            dict: ``{'loss': <tuning loss>, 'status': STATUS_OK}``.
        """
        config = self.trainer.config

        # Copy the hyper-parameters to the trainer config and to its
        # hyper-parameter set.
        for name, value in params.items():
            config.__dict__[name] = value
            config.hyperparameters[name] = value

        # Common settings for a tuning run: no display, no model saving.
        config.disp_result = False
        config.disp_summary = False
        config.save_model = False

        # Do not overwrite the test number if it is already set.
        if config.test_num is None:
            config.test_num = 1000

        if self.args.debug:
            config.epochs = 1
            config.hyperparameters['epochs'] = 1

        # Start the trial.
        self.trainer.build_model()
        loss = self.trainer.tune_model()

        return {'loss': loss, 'status': STATUS_OK}