def insert_triples_from_graph_pattern(self, dataset_path, graph_pattern):
    # Loads the current dataset
    dtset = dataset.Dataset()
    dtset.load_from_binary(dataset_path)
    # Heavy task
    dtset.load_from_graph_pattern(verbose=2, where=graph_pattern)
    # dt.show()
    dtset.save_to_binary(dataset_path)
    # TODO Update values on dataset db
    return False
def build_dataset_object(self, dataset_dto):  # TODO: Maybe unneeded?
    """Returns a Dataset object if required by the REST service

    This method needs the DatasetDAO to have the binary_dataset
    variable initialized.

    :returns: a Dataset object
    :rtype: kgeserver.dataset.Dataset
    """
    if dataset_dto and dataset_dto._binary_dataset:
        dtst = dataset.Dataset()
        path = os.path.join(self.bin_path, dataset_dto._binary_dataset)
        dtst.load_from_binary(path)
        return dtst
    else:
        return None
def from_dict(self, result_dict, use_cache=True):
    """Given a result dict with the proper fields, fills this DatasetDTO

    :param dict result_dict: A dict with all required fields
    :param bool use_cache: Reuse the cached triples/entities/relations counts
    :return: None; the DTO attributes are filled in place
    """
    # INFO: This method is intended ONLY to fill a dataset DTO.
    # Query has an object
    self._binary_dataset = result_dict['binary_dataset']
    self._binary_model = result_dict['binary_model']
    self._binary_index = result_dict['binary_index']
    self.status = result_dict['status']
    self.name = result_dict['name']
    self.description = result_dict['description']
    self.id = int(result_dict['id'])
    self.dataset_type = result_dict['dataset_type']
    self.task = result_dict['task']

    if result_dict['triples'] is not None and\
       result_dict['relations'] is not None and\
       result_dict['entities'] is not None and use_cache:
        # These fields are already filled and can be read directly
        print("Using cached values")
        self.triples = result_dict['triples']
        self.entities = result_dict['entities']
        self.relations = result_dict['relations']
    else:
        # Fields should be read from the binary file
        print("Without cache")
        dtst = dataset.Dataset()
        dtst_path = os.path.join(self._base, self._binary_dataset)
        try:
            dtst.load_from_binary(dtst_path)
        except OSError as err:
            self.error = "Dataset not found: " + str(err)
            self.is_error_dto()
        self.triples = len(dtst.subs)
        self.entities = len(dtst.entities)
        self.relations = len(dtst.relations)

    alg_dao = AlgorithmDAO()
    algorithm, err = alg_dao.get_algorithm_by_id(result_dict['algorithm'])
    if algorithm is None:
        raise LookupError(err)
    self.algorithm = algorithm

    return None
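# Illustrative shape of the result_dict this method expects, inferred from
# the fields read above; every value below is made up:
#
#     {"id": 1, "name": "wdata_15k", "description": "A sample dataset",
#      "status": 2, "dataset_type": 1, "task": None,
#      "binary_dataset": "wdata_15k.bin", "binary_model": None,
#      "binary_index": None, "algorithm": 2,
#      "triples": 150000, "entities": 15000, "relations": 200}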
def find_embeddings_on_model(dataset_id, entities):
    """Returns a list with the corresponding embeddings

    This will return a list like this:
    [["Q1", [0, 1, -1, 0.4]], ["Q5", [1, -0.5, -0.1, 0]]]

    :param int dataset_id: The id of the dataset whose model will be used
    :param list entities: A list with the URIs (or identifiers) of entities
    :returns: The embedding vector of each entity
    :rtype: list
    """
    # Expected to return: {entities: [], embeddings: []} IN THE SAME ORDER!!
    dataset_dao = data_access.DatasetDAO()
    # dataset, err = dataset_dao.get_dataset_by_id(dataset_id)
    # if dataset is None:
    #     raise LookupError("The dataset couldn't be located")
    dataset_path, err = dataset_dao.get_binary_path(dataset_id)
    if dataset_path is None:
        raise FileNotFoundError("The binary dataset doesn't exist on database")

    # Load dataset from binary
    dtset = dataset.Dataset()
    dtset.load_from_binary(dataset_path)

    model_path, err = dataset_dao.get_model(dataset_id)
    if model_path is None:
        raise FileNotFoundError("The model path does not exist on database")

    # Load the trained model
    model = skge.TransE.load(model_path)

    return_list = []
    for entity in entities:
        position = dtset.get_entity_id(entity)
        if position is None or position < 0:
            # The entity is not present on the dataset; skip it
            continue
        embedding = model.E[position]
        return_list.append([entity, embedding.tolist()])

    return return_list
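# Usage sketch (illustrative only): assumes a dataset with id 1 whose model
# has already been trained and whose entities use Wikidata identifiers:
#
#     pairs = find_embeddings_on_model(1, ["Q1", "Q5", "Q42"])
#     for entity, vector in pairs:
#         print(entity, len(vector))
#
# Entities that cannot be resolved by dtset.get_entity_id() are skipped, so
# the result may contain fewer pairs than the number of entities requested.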
def insert_triples(self, dataset_dto, triples_list):
    # TODO: This should not be here
    """Insert triples on the Dataset."""
    dtst_path = dataset_dto.get_binary_dataset()
    if dtst_path is None:
        return None, (500, "Dataset couldn't be loaded")

    dtset = dataset.Dataset()
    dtset.load_from_binary(dtst_path)
    dtset.show()
    result = dtset.load_dataset_from_json(triples_list)
    # sql = "SELECT binary_dataset FROM dataset WHERE id=?"
    # result = self.execute_query(sql, self.dataset["id"])
    # bin_file = result[0]['binary_dataset']
    dtset.show()
    result = result and\
        dtset.save_to_binary(dataset_dto.get_binary_dataset())

    return result, None
def generate_dataset_from_sparql(self, dataset_id, graph_pattern, levels,
                                 **keyw_args):
    """Creates a recurrent dataset from a seed vector

    This method is intended to be called only with celery *.delay()*,
    to be executed in the background. The status of the generation can
    be queried through its celery UUID.

    :param int dataset_id: The id of the dataset to be generated
    :param string graph_pattern: The main query containing triples
    :param int levels: The number of levels to scan
    :kwparam limit_ent: Use only for testing purposes
    """
    from celery import current_task  # in task definition

    dataset_dao = data_access.DatasetDAO()
    dataset_dao.update_status(dataset_id, RUNNING_TASK_MASK)
    dataset_path, err = dataset_dao.get_binary_path(dataset_id)
    if dataset_path is None:
        raise FileNotFoundError("Dataset path is not on the system")

    # Load current dataset
    dtset = dataset.Dataset()
    dtset.load_from_binary(dataset_path)

    # Obtains the Redis connection from celery.
    redis = self.app.backend
    # The id of the object
    celery_uuid = self.request.id
    # Saves the empty id to be retrieved first time without error
    # redis.set(celery_uuid, "{}".encode("utf-8"))
    progres_dao = data_access.ProgressDAO()
    progres_dao.create_progress(celery_uuid, 1)
    progress = progres_dao.get_progress(celery_uuid)
    progress.total_steps = 1
    progress.current_steps = 1
    progress.current = 0
    progress.total = 0
    progres_dao.set_progress(celery_uuid, progress)

    def init_progress_callback(max_iter):
        progress = progres_dao.get_progress(celery_uuid)
        progress.total = max_iter
        progres_dao.set_progress(celery_uuid, progress)

    sv_kwargs = {}
    sv_kwargs['where'] = graph_pattern
    sv_kwargs['callback'] = lambda: progres_dao.add_progress(celery_uuid)
    sv_kwargs['start_callback'] = init_progress_callback

    # Batch limit has to be an integer
    try:
        sv_kwargs['batch_size'] = int(keyw_args.pop('batch_size'))
    except (LookupError, ValueError, TypeError):
        pass

    # Get the seed vector and load first entities
    seed_vector = dtset.load_from_graph_pattern(**sv_kwargs)

    celery_uuid = "celery-task-progress-" + self.request.id

    def status_callback(status):
        """Saves the progress of the task on redis db"""
        # Create progress object
        progress = {"current": status['it_analyzed'],
                    "total": status['it_total'],
                    "current_steps": status['round_curr'] + 1,
                    "total_steps": status['round_total']}

        # Retrieve task from redis
        task = redis.get(celery_uuid).decode("utf-8")
        task = json.loads(task)

        # Add task progress
        task['progress'] = progress

        # Save again on redis
        task = json.dumps(task).encode("utf-8")
        redis.set(celery_uuid, task)
        return

    # Build the optional args dict
    keyw_args["ext_callback"] = status_callback

    # Call to the *heavy* method
    dtset.load_dataset_recurrently(levels, seed_vector, **keyw_args)

    # Save new binary
    dtset.save_to_binary(dataset_path)

    # Restore status
    dataset_dao.set_status(dataset_id, 0)
    return False
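# Usage sketch (illustrative only; assumes this function is registered as a
# bound celery task, as its use of self.request.id suggests, and that the
# dataset id exists). The graph pattern below is a made-up SPARQL fragment:
#
#     pattern = "?subject wdt:P950 ?bne . ?subject ?predicate ?object ."
#     task = generate_dataset_from_sparql.delay(dataset_id, pattern, 2,
#                                               batch_size=50000)
#     # The generation progress can then be polled through the task id.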
def build_autocomplete_index(self, dataset_id, langs=['en', 'es']):
    """Generates an autocomplete index from a dataset using chosen languages

    This method extracts labels, descriptions and other useful information
    from the sparql endpoint (or any other source) and stores it on the
    search database (elasticsearch). As the dataset may contain too much
    information in many languages, only the selected languages are used.

    :param int dataset_id: The dataset ID
    :param list langs: A list of languages in ISO 639-1 format
    """
    # Creates the progress object in redis
    celery_uuid = self.request.id
    progres_dao = data_access.ProgressDAO()

    # Load binary dataset
    dataset_dao = data_access.DatasetDAO()
    dataset_path, err = dataset_dao.get_binary_path(dataset_id)
    dataset_dto, err = dataset_dao.get_dataset_by_id(dataset_id)
    dtset = dataset.Dataset()
    dtset.load_from_binary(dataset_path)

    # Set working status
    # TODO: update status, not overwrite it
    dataset_dao.update_status(dataset_id,
                              SEARCHINDEXED_MASK | RUNNING_TASK_MASK)

    # Update Progress
    progres_dao.create_progress(celery_uuid, len(dtset.entities))
    progres_dao.update_progress(celery_uuid, 0)

    entity_dao = data_access.EntityDAO(dataset_dto.dataset_type, dataset_id)

    def get_labels(entity):
        """Auxiliary method to wrap dtset.entity_labels.

        Receives only one entity and stores it on the search index.
        """
        # Get the labels from the endpoint
        labels, descriptions, alt_labels = dtset.entity_labels(entity,
                                                               langs=langs)
        # Track progress: add one more step
        progres_dao.add_progress(celery_uuid)
        # Create the doc to be stored on elasticsearch and insert it
        entity_doc = {
            "entity_id": entity,
            "entity_uri": dtset.check_entity(entity),
            "label": labels,
            "alt_label": alt_labels,
            "description": descriptions
        }
        entity_dao.insert_entity(entity_doc)

    # Execute get_labels concurrently, using as many threads as cpu cores
    with ThreadPool(multiprocessing.cpu_count()) as p:
        all_labels = p.map(get_labels, dtset.entities)

    # Update status on DB when finished
    dataset_dao.update_status(dataset_id, SEARCHINDEXED_MASK,
                              statusAnd=0b1110)
    return False
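# Usage sketch (illustrative only; assumes this function is registered as a
# bound celery task and that a dataset with id 3 exists):
#
#     build_autocomplete_index.delay(3, langs=['en', 'es', 'fr'])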
def train_dataset_from_algorithm(self, dataset_id, algorithm_dict):
    """Trains a dataset given an algorithm

    It is able to save the progress of training.

    :param int dataset_id: The id of the dataset to be trained
    :param dict algorithm_dict: The algorithm to be used in dataset training
    """
    dataset_dao = data_access.DatasetDAO()
    # If it all goes ok, add id of algorithm to db
    dataset_dao.set_algorithm(dataset_id, algorithm_dict["id"])
    dataset_dao.update_status(dataset_id,
                              RUNNING_TASK_MASK | TRAINED_MASK)
    dataset_dto, err = dataset_dao.get_dataset_by_id(dataset_id)

    # Generate the filepath to the dataset
    dtset_path = dataset_dto.get_binary_dataset()

    # Loads the current dataset
    dtset = dataset.Dataset()
    dtset.load_from_binary(dtset_path)

    # Obtains the Redis connection from celery.
    redis = self.app.backend
    # The id of the object
    celery_uuid = "celery-task-progress-" + self.request.id
    # Saves the empty id to be retrieved first time without error
    progress = {"current": -1,
                "total": algorithm_dict['max_epochs'],
                "current_steps": None,
                "total_steps": None}
    redis.set(celery_uuid,
              json.dumps({"progress": progress}).encode("utf-8"))

    def status_callback(trainer):
        """Saves the progress of the task on redis db"""
        print("Status Callback. Trainer {}".format(trainer.epoch))
        # Retrieve task from redis
        task = redis.get(celery_uuid).decode("utf-8")
        task = json.loads(task)

        # Add task progress
        task['progress']['current'] = trainer.epoch

        # Save again on redis
        task = json.dumps(task).encode("utf-8")
        redis.set(celery_uuid, task)
        return

    # Creates an optional parameters dict for better readability
    kwargs = {
        'train_all': True,   # All the dataset will be trained, not validated
        'test_all': -1,      # No validation is going to be performed
        'model_type': skge.TransE,  # The default model will be used
        'ncomp': algorithm_dict['embedding_size'],   # From the algorithm
        'margin': algorithm_dict['margin'],          # From the algorithm
        'max_epochs': algorithm_dict['max_epochs'],  # Max number of iterations
        'external_callback': status_callback,        # The status callback
    }

    # Heavy task
    model = algorithm.ModelTrainer(dtset, **kwargs)
    modeloentrenado = model.run()

    model_path = dtset_path[:-4] + "_model.bin"
    modeloentrenado.save(model_path)

    # Update values on DB when model training has finished
    dataset_dao.update_status(dataset_id, TRAINED_MASK, statusAnd=0b1110)
    dataset_dao.set_model(dataset_id, model_path)

    return False
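# Illustrative shape of algorithm_dict, inferred from the keys this task
# reads ("id", "embedding_size", "margin", "max_epochs"); the values below
# are made up:
#
#     algorithm_dict = {"id": 2, "embedding_size": 100,
#                       "margin": 2.0, "max_epochs": 500}
#     train_dataset_from_algorithm.delay(dataset_id, algorithm_dict)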
            threads.append(t)
            t.start()

        for th in threads:
            th.join()

        # Pick the trainer with the highest first score component
        best = sorted(model_trainer_scores,
                      key=lambda t: t[1][0], reverse=True)[0]
        kwdict = best[0].get_conf()
        kwdict['train_all'] = True
        kwdict['test_all'] = -1
        new_model_trainer = ModelTrainer(self.dataset, **kwdict)

        return (model_trainer_scores, best, new_model_trainer)


if __name__ == '__main__':
    dtset = dataset.Dataset()
    # dataset.load_from_binary("holographic-embeddings/data/wn18.bin")
    dtset.load_from_binary("wdata_15k.bin")
    alg = Algorithm(dtset)
    alg.find_best()

    # modeltrainer = ModelTrainer(dtset, model_type=skge.HolE, test_all=10,
    #                             max_epochs=200, margin=0.2, ncomp=50,
    #                             mode="rank")
    # modeltrained = modeltrainer.run()
    # print(modeltrainer.scores)