Example #1
def insert_triples_from_graph_pattern(self, dataset_path, graph_pattern):
    # Loads the current dataset
    dtset = dataset.Dataset()
    dtset.load_from_binary(dataset_path)

    # Heavy task: expand the dataset with the triples matching the graph pattern
    dtset.load_from_graph_pattern(verbose=2, where=graph_pattern)
    # dtset.show()
    dtset.save_to_binary(dataset_path)

    # TODO Update values on dataset db

    return False
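
The task above is just a load, expand, save round trip over the binary dataset. Below is a minimal sketch of that same round trip outside Celery; it assumes the kgeserver.dataset module is importable, and the path and graph pattern are made-up placeholders.

from kgeserver import dataset

BIN_PATH = "wikidata_sample.bin"   # hypothetical binary dataset path
PATTERN = "?subject wdt:P31 wd:Q5 . ?subject ?predicate ?object ."  # placeholder

dtset = dataset.Dataset()
dtset.load_from_binary(BIN_PATH)            # load the existing binary dataset
dtset.load_from_graph_pattern(verbose=2,    # heavy step: fetch matching triples
                              where=PATTERN)
dtset.save_to_binary(BIN_PATH)              # persist the expanded dataset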
Example #2
    def build_dataset_object(self, dataset_dto):  # TODO: Maybe unneeded?
        """Returns a Dataset object if required by the REST service

        This method requires the dataset_dto to have its _binary_dataset
        attribute initialized.

        :returns: a Dataset object
        :rtype: kgeserver.dataset.Dataset
        """
        if dataset_dto and dataset_dto._binary_dataset:
            dtst = dataset.Dataset()
            path = os.path.join(self.bin_path, dataset_dto._binary_dataset)
            dtst.load_from_binary(path)
            return dtst
        else:
            return None
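
A sketch of how this helper might be called, assuming it lives on the same DatasetDAO that exposes get_dataset_by_id (as the other snippets suggest); the import path and the dataset id are placeholders.

import data_access                    # project module; import path assumed

dataset_dao = data_access.DatasetDAO()
dataset_dto, err = dataset_dao.get_dataset_by_id(1)    # hypothetical id
dtst = dataset_dao.build_dataset_object(dataset_dto)   # None if no binary file
if dtst is not None:
    dtst.show()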
Example #3
    def from_dict(self, result_dict, use_cache=True):
        """Fills this dataset DTO from a result dict with the proper fields

        :param dict result_dict: A dict with all the required fields
        :param bool use_cache: Reuse the cached triples/entities/relations
                               counts when they are present
        :return: None; the DTO is populated in place
        """  # INFO: This method is intended ONLY to fill a dataset DTO.
        # The query returned a row: copy its fields into this DTO
        self._binary_dataset = result_dict['binary_dataset']
        self._binary_model = result_dict['binary_model']
        self._binary_index = result_dict['binary_index']
        self.status = result_dict['status']
        self.name = result_dict['name']
        self.description = result_dict['description']
        self.id = int(result_dict['id'])
        self.dataset_type = result_dict['dataset_type']
        self.task = result_dict['task']

        if result_dict['triples'] is not None and\
           result_dict['relations'] is not None and\
           result_dict['entities'] is not None and use_cache:
            # These fields are already filled and can be read directly
            print("Using cached values")
            self.triples = result_dict['triples']
            self.entities = result_dict['entities']
            self.relations = result_dict['relations']
        else:
            # Fields must be read from the binary dataset file
            print("Without cache")
            dtst = dataset.Dataset()
            dtst_path = os.path.join(self._base, self._binary_dataset)
            try:
                dtst.load_from_binary(dtst_path)
            except OSError as err:
                self.error = "Dataset not found: " + str(err)
                self.is_error_dto()
            self.triples = len(dtst.subs)
            self.entities = len(dtst.entities)
            self.relations = len(dtst.relations)

        alg_dao = AlgorithmDAO()
        algorithm, err = alg_dao.get_algorithm_by_id(result_dict['algorithm'])
        if algorithm is None:
            raise LookupError(err)
        self.algorithm = algorithm

        return None
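
from_dict reads a fixed set of keys from the row, so a caller has to provide all of them. The sketch below shows such a dict with every key the method touches; all the values and the DTO class name are made up for illustration.

result_dict = {
    "id": 1,
    "name": "wikidata-sample",
    "description": "Small test dataset",
    "binary_dataset": "wikidata_sample.bin",
    "binary_model": None,
    "binary_index": None,
    "status": 0,
    "dataset_type": 0,
    "task": None,
    "triples": None,        # None forces a reload from the binary file
    "entities": None,
    "relations": None,
    "algorithm": 1,         # must exist in AlgorithmDAO or LookupError is raised
}

dataset_dto = DatasetDTO()           # assumed DTO class name
dataset_dto.from_dict(result_dict)   # fills the DTO in place, returns None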
Example #4
def find_embeddings_on_model(dataset_id, entities):
    """Returns a list with the corresponding embeddings

    This will return a list like this:

    [["Q1", [0, 1, -1, 0.4]], ["Q5", [1, -0.5, -0.1, 0]]]

    :param int dataset_id: The id of the dataset whose trained model is used
    :param list entities: A list with the URIs (or identifiers) of the entities
    :returns: The embedding vector of each entity
    :rtype: list
    """
    # Callers expect each entity paired with its embedding, IN THE SAME ORDER
    dataset_dao = data_access.DatasetDAO()
    # dataset, err = dataset_dao.get_dataset_by_id(dataset_id)
    # if dataset is None:
    #     raise LookupError("The dataset couldn't be located")

    dataset_path, err = dataset_dao.get_binary_path(dataset_id)
    if dataset_path is None:
        raise FileNotFoundError("The binary dataset doesn't exist on database")

    # Load dataset from binary
    dtset = dataset.Dataset()
    dtset.load_from_binary(dataset_path)

    model_path, err = dataset_dao.get_model(dataset_id)
    if model_path is None:
        raise FileNotFoundError("The model path does not exist on database")
    # Load the model and initialize the search index
    model = skge.TransE.load(model_path)

    return_list = []
    for entity in entities:
        position = dtset.get_entity_id(entity)
        if position is None or position < 0:
            continue
        else:
            embedding = model.E[position]
        return_list.append([entity, embedding.tolist()])
    return return_list
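
A usage sketch, assuming the dataset already has a trained TransE model registered; the dataset id and the entity identifiers are placeholders.

embeddings = find_embeddings_on_model(1, ["Q1", "Q5", "Q999999"])

# Entities missing from the dataset are skipped, so the result can be
# shorter than the input list
for entity, vector in embeddings:
    print(entity, len(vector))    # e.g. "Q1 100" for a 100-dimensional model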
Example #5
    def insert_triples(self, dataset_dto, triples_list):
        # TODO: This should not be here
        """Inserts a list of triples into the Dataset and saves the binary file.
        """
        dtst_path = dataset_dto.get_binary_dataset()
        if dtst_path is None:
            return None, (500, "Dataset couldn't be loaded")
        dtset = dataset.Dataset()
        dtset.load_from_binary(dtst_path)
        dtset.show()

        result = dtset.load_dataset_from_json(triples_list)

        # sql = "SELECT binary_dataset FROM dataset WHERE id=?"
        # result = self.execute_query(sql, self.dataset["id"])
        # bin_file = result[0]['binary_dataset']

        dtset.show()

        result = result and\
            dtset.save_to_binary(dataset_dto.get_binary_dataset())

        return result, None
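
A sketch of a call to insert_triples, assuming the method lives on the DatasetDAO. The exact schema accepted by load_dataset_from_json is not shown in these snippets, so the triple structure below is only a guess and should be checked against kgeserver.dataset.

# Assumed triple format (SPARQL-results style); verify against the real parser
triples_list = [
    {"subject": {"value": "http://www.wikidata.org/entity/Q1"},
     "predicate": {"value": "http://www.wikidata.org/prop/direct/P31"},
     "object": {"value": "http://www.wikidata.org/entity/Q5"}},
]

dataset_dao = data_access.DatasetDAO()
dataset_dto, err = dataset_dao.get_dataset_by_id(1)       # hypothetical id
result, err = dataset_dao.insert_triples(dataset_dto, triples_list)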
Example #6
def generate_dataset_from_sparql(self, dataset_id, graph_pattern, levels,
                                 **keyw_args):
    """Creates a dataset recursively from a seed graph pattern

    This method is intended to be called only through celery *.delay()*, so
    it runs in the background. The status of the generation can be queried
    through its celery UUID.

    :param int dataset_id: The id of the dataset to be expanded
    :param str graph_pattern: The main query containing the seed triples
    :param int levels: The number of levels to scan
    :kwparam limit_ent: Use only for testing purposes
    """
    from celery import current_task  # in task definition
    dataset_dao = data_access.DatasetDAO()
    dataset_dao.update_status(dataset_id, RUNNING_TASK_MASK)

    dataset_path, err = dataset_dao.get_binary_path(dataset_id)
    if dataset_path is None:
        raise FileNotFoundError("Dataset path is not on the system")

    # Load current dataset
    dtset = dataset.Dataset()
    dtset.load_from_binary(dataset_path)

    # Obtains the Redis connection from celery.
    redis = self.app.backend
    # The UUID of this celery task
    celery_uuid = self.request.id
    # Create an empty progress entry so it can be queried right away
    # redis.set(celery_uuid, "{}".encode("utf-8"))
    progres_dao = data_access.ProgressDAO()
    progres_dao.create_progress(celery_uuid, 1)
    progress = progres_dao.get_progress(celery_uuid)
    progress.total_steps = 1
    progress.current_steps = 1
    progress.current = 0
    progress.total = 0
    progres_dao.set_progress(celery_uuid, progress)

    def init_progress_callback(max_iter):
        progress = progres_dao.get_progress(celery_uuid)
        progress.total = max_iter
        progres_dao.set_progress(celery_uuid, progress)

    sv_kwargs = {}
    sv_kwargs['where'] = graph_pattern
    sv_kwargs['callback'] = lambda: progres_dao.add_progress(celery_uuid)
    sv_kwargs['start_callback'] = init_progress_callback

    # Batch limit has to be an integer
    try:
        sv_kwargs['batch_size'] = int(keyw_args.pop('batch_size'))
    except (LookupError, ValueError, TypeError):
        pass

    # Get the seed vector and load first entities
    seed_vector = dtset.load_from_graph_pattern(**sv_kwargs)

    celery_uuid = "celery-task-progress-" + self.request.id

    def status_callback(status):
        """Saves the progress of the task on redis db"""
        # Create progress object
        progress = {
            "current": status['it_analyzed'],
            "total": status['it_total'],
            "current_steps": status['round_curr'] + 1,
            "total_steps": status['round_total']
        }

        # Retrieve task from redis
        task = redis.get(celery_uuid).decode("utf-8")
        task = json.loads(task)

        # Add task progress
        task['progress'] = progress

        # Save again on redis
        task = json.dumps(task).encode("utf-8")
        redis.set(celery_uuid, task)
        return

    # Build the optional args dict
    keyw_args["ext_callback"] = status_callback

    # Call to the *heavy* method
    dtset.load_dataset_recurrently(levels, seed_vector, **keyw_args)

    # Save new binary
    dtset.save_to_binary(dataset_path)

    # Restore status
    dataset_dao.set_status(dataset_id, 0)

    return False
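
Since the task is meant to be launched through Celery, a caller queues it and then polls its progress. A rough sketch, with made-up dataset id, pattern and level count; it assumes the task object is importable as generate_dataset_from_sparql.

# Queue the generation on a worker (does not block the caller)
task = generate_dataset_from_sparql.delay(
    1,                                                         # hypothetical dataset id
    "?subject wdt:P31 wd:Q5 . ?subject ?predicate ?object .",  # placeholder pattern
    2)                                                         # levels to scan

# Poll the progress stored by ProgressDAO under the task UUID
progres_dao = data_access.ProgressDAO()
progress = progres_dao.get_progress(task.id)
print(progress.current, "/", progress.total)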
Example #7
def build_autocomplete_index(self, dataset_id, langs=['en', 'es']):
    """Generates an autocomplete index from a dataset using the chosen languages

    This method extracts labels, descriptions and other useful information
    from the SPARQL endpoint (or any other source) and stores it on the search
    database (Elasticsearch). As the dataset may contain too much information
    in many languages, only the selected languages are used.

    :param int dataset_id: The dataset ID
    :param list langs: A list of languages in ISO 639-1 format
    """
    # Creates the progress object in redis
    celery_uuid = self.request.id
    progres_dao = data_access.ProgressDAO()

    # Load binary dataset
    dataset_dao = data_access.DatasetDAO()
    dataset_path, err = dataset_dao.get_binary_path(dataset_id)
    dataset_dto, err = dataset_dao.get_dataset_by_id(dataset_id)
    dtset = dataset.Dataset()
    dtset.load_from_binary(dataset_path)
    # Set working status
    # TODO: update status, not overwrite it
    dataset_dao.update_status(dataset_id,
                              SEARCHINDEXED_MASK | RUNNING_TASK_MASK)
    # Update Progress
    progres_dao.create_progress(celery_uuid, len(dtset.entities))
    progres_dao.update_progress(celery_uuid, 0)

    entity_dao = data_access.EntityDAO(dataset_dto.dataset_type, dataset_id)

    def get_labels(entity):
        """Auxiliary wrapper around dtset.entity_labels.

        Receives a single entity, fetches its labels and stores them on the
        search index.
        """
        # Get the labels from endpoint
        labels, descriptions, alt_labels = dtset.entity_labels(entity,
                                                               langs=langs)

        # track progress: add one more step
        progres_dao.add_progress(celery_uuid)

        # Create the doc to be stored on elasticsearch and insert it
        entity_doc = {
            "entity_id": entity,
            "entity_uri": dtset.check_entity(entity),
            "label": labels,
            "alt_label": alt_labels,
            "description": descriptions
        }
        entity_dao.insert_entity(entity_doc)

    # Execute get_labels concurrently, using as many threads as CPU cores
    with ThreadPool(multiprocessing.cpu_count()) as p:
        p.map(get_labels, dtset.entities)

    # Update status on DB when finished
    dataset_dao.update_status(dataset_id, SEARCHINDEXED_MASK, statusAnd=0b1110)

    return False
Example #8
def train_dataset_from_algorithm(self, dataset_id, algorithm_dict):
    """Trains a dataset given an algorithm

    It is able to save the progress of the training.

    :param int dataset_id: The id of the dataset to be trained
    :param dict algorithm_dict: The algorithm to be used to train the dataset
    """

    dataset_dao = data_access.DatasetDAO()

    # If it all goes ok, add id of algorithm to db
    dataset_dao.set_algorithm(dataset_id, algorithm_dict["id"])
    dataset_dao.update_status(dataset_id, RUNNING_TASK_MASK | TRAINED_MASK)

    dataset_dto, err = dataset_dao.get_dataset_by_id(dataset_id)
    # Generate the filepath to the dataset
    dtset_path = dataset_dto.get_binary_dataset()
    # Loads the current dataset
    dtset = dataset.Dataset()
    dtset.load_from_binary(dtset_path)

    # Obtains the Redis connection from celery.
    redis = self.app.backend
    # The id of the object
    celery_uuid = "celery-task-progress-" + self.request.id
    # Saves the empty id to be retrieved first time without error
    progress = {
        "current": -1,
        "total": algorithm_dict['max_epochs'],
        "current_steps": None,
        "total_steps": None
    }
    redis.set(celery_uuid, json.dumps({"progress": progress}).encode("utf-8"))

    def status_callback(trainer):
        """Saves the progress of the task on redis db"""
        print("Status Callback. Trainer {}".format(trainer.epoch))
        # Retrieve task from redis
        task = redis.get(celery_uuid).decode("utf-8")
        task = json.loads(task)

        # Add task progress
        task['progress']['current'] = trainer.epoch

        # Save again on redis
        task = json.dumps(task).encode("utf-8")
        redis.set(celery_uuid, task)
        return

    # Creates an optional parameters dict for better readability
    kwargs = {
        'train_all': True,  # The whole dataset is used for training, not validation
        'test_all': -1,  # No validation is going to be performed
        'model_type': skge.TransE,  # The default model will be used
        'ncomp': algorithm_dict['embedding_size'],  # Provided by the algorithm
        'margin': algorithm_dict['margin'],  # Provided by the algorithm
        'max_epochs': algorithm_dict['max_epochs'],  # Max number of iterations
        'external_callback': status_callback,  # The status callback
    }

    # Heavy task: train the embedding model
    model = algorithm.ModelTrainer(dtset, **kwargs)
    modeloentrenado = model.run()  # the trained model instance
    model_path = dtset_path[:-4] + "_model.bin"
    modeloentrenado.save(model_path)

    # Update values on DB when model training has finished
    dataset_dao.update_status(dataset_id, TRAINED_MASK, statusAnd=0b1110)
    dataset_dao.set_model(dataset_id, model_path)

    return False
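
The algorithm_dict needs at least the keys read above: id, embedding_size, margin and max_epochs. A sketch of queuing a training run with made-up values; it assumes the task object is importable as train_dataset_from_algorithm.

algorithm_dict = {
    "id": 1,                 # hypothetical algorithm id already stored in the DB
    "embedding_size": 100,   # passed to the trainer as ncomp
    "margin": 2.0,           # margin used by the ranking loss
    "max_epochs": 500,       # upper bound on training iterations
}

# Queue the Celery task; the trained model is saved next to the binary dataset
train_dataset_from_algorithm.delay(1, algorithm_dict)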
Example #9
            threads.append(t)
            t.start()

        for th in threads:
            th.join()

        best = sorted(model_trainer_scores,
                      key=lambda t: t[1][0],
                      reverse=True)[0]

        kwdict = best[0].get_conf()
        kwdict['train_all'] = True
        kwdict['test_all'] = -1
        new_model_trainer = ModelTrainer(self.dataset, **kwdict)
        return (model_trainer_scores, best, new_model_trainer)


if __name__ == '__main__':

    dtset = dataset.Dataset()
    # dataset.load_from_binary("holographic-embeddings/data/wn18.bin")
    dtset.load_from_binary("wdata_15k.bin")

    alg = Algorithm(dtset)
    alg.find_best()
    # modeltrainer = ModelTrainer(dtset, model_type=skge.HolE, test_all=10,
    #                             max_epochs=200, margin=0.2, ncomp=50,
    #                             mode="rank")
    # modeltrained = modeltrainer.run()
    # print(modeltrainer.scores)
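
Assuming the return statement above belongs to find_best, a caller would unpack the per-configuration scores, the best configuration and the ModelTrainer already set up to retrain on the whole dataset. A sketch continuing the block above, with a made-up output path.

    scores, best, final_trainer = alg.find_best()

    # Retrain the best configuration on the full dataset and save the model
    trained_model = final_trainer.run()
    trained_model.save("wdata_15k_model.bin")   # hypothetical output path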