def __init__(
    self,
    depth: int,
    walks_per_graph: float,
    sampler: Sampler = None,
):
    """Stores the walk settings and selects the sampling strategy.

    Args:
        depth: Stored as ``self.depth``.
        walks_per_graph: Stored as ``self.walks_per_graph``.
        sampler: The sampling strategy to use. When None, a new
            UniformSampler is created for this instance.
    """
    self.depth = depth
    self.walks_per_graph = walks_per_graph
    # Fall back to a fresh uniform sampler when the caller gave none.
    self.sampler = UniformSampler() if sampler is None else sampler
def __init__(
    self,
    depth: int,
    walks_per_graph: float,
    sampler: Sampler = None,
):
    """Initializes the walker by delegating to the parent initializer.

    Args:
        depth: Forwarded to the parent initializer.
        walks_per_graph: Forwarded to the parent initializer.
        sampler: The sampling strategy. When None, a new UniformSampler
            is created for this instance.
    """
    # A ``UniformSampler()`` written directly in the signature is evaluated
    # once, when the function is defined, so every instance relying on the
    # default would share the very same sampler object. Build it per call.
    if sampler is None:
        sampler = UniformSampler()
    super().__init__(depth, walks_per_graph, sampler)
def __init__(
    self,
    depth: int,
    max_walks: Optional[int] = None,
    sampler: Optional[Sampler] = None,
    n_jobs: int = 1,
):
    """Initializes the walker by delegating to the parent initializer.

    Args:
        depth: Forwarded to the parent initializer.
        max_walks: Forwarded to the parent initializer.
        sampler: The sampling strategy. When None, a new UniformSampler
            is created for this instance.
        n_jobs: Forwarded to the parent initializer.
    """
    # A ``UniformSampler()`` default in the signature is created only once
    # (at definition time) and would be shared by all instances using the
    # default; create a separate one for each instance instead.
    if sampler is None:
        sampler = UniformSampler()
    super().__init__(depth, max_walks, sampler, n_jobs)
def __init__(
    self,
    depth: int,
    walks_per_graph: float,
    sampler: Sampler = None,
    freq_thresholds: List[float] = None,
):
    """Initializes the walker and records its frequency thresholds.

    Args:
        depth: Forwarded to the parent initializer.
        walks_per_graph: Forwarded to the parent initializer.
        sampler: The sampling strategy. When None, a new UniformSampler
            is created for this instance.
        freq_thresholds: Stored as ``self.freq_thresholds``. When None,
            a new list ``[0.001]`` is used.
    """
    # Defaults written as ``UniformSampler()`` / ``[0.001]`` in the signature
    # are evaluated once and shared between all calls; any mutation through
    # one instance would leak into the others. Build them per call.
    if sampler is None:
        sampler = UniformSampler()
    super().__init__(depth, walks_per_graph, sampler)
    self.freq_thresholds = (
        [0.001] if freq_thresholds is None else freq_thresholds
    )
def __init__(
    self,
    depth: int,
    max_walks: Optional[int] = None,
    sampler: Optional[Sampler] = None,
    n_jobs: int = 1,
    is_support_remote: bool = True,
):
    """Initializes the walker by delegating to the parent initializer.

    Args:
        depth: Forwarded to the parent initializer.
        max_walks: Forwarded to the parent initializer.
        sampler: The sampling strategy. When None, a new UniformSampler
            is created for this instance.
        n_jobs: Forwarded to the parent initializer.
        is_support_remote: Forwarded to the parent initializer.
    """
    # Avoid a ``UniformSampler()`` default in the signature: it would be
    # instantiated once at definition time and shared by every instance.
    if sampler is None:
        sampler = UniformSampler()
    super().__init__(depth, max_walks, sampler, n_jobs, is_support_remote)
def __init__(
    self,
    depth: int,
    max_walks: Optional[int] = None,
    sampler: Optional[Sampler] = None,
    wl_iterations: int = 4,
    n_jobs: int = 1,
):
    """Initializes the walker and records the number of WL iterations.

    Args:
        depth: Forwarded to the parent initializer.
        max_walks: Forwarded to the parent initializer.
        sampler: The sampling strategy. When None, a new UniformSampler
            is created for this instance.
        wl_iterations: Stored as ``self.wl_iterations``.
        n_jobs: Forwarded to the parent initializer.
    """
    # Avoid a ``UniformSampler()`` default in the signature: it would be
    # instantiated once at definition time and shared by every instance.
    if sampler is None:
        sampler = UniformSampler()
    # NOTE(review): the trailing ``False`` is presumably the parent's
    # ``is_support_remote`` flag -- confirm against the parent signature.
    super().__init__(depth, max_walks, sampler, n_jobs, False)
    self.wl_iterations = wl_iterations
def __init__(
    self,
    depth: int,
    max_walks: Optional[int] = None,
    sampler: Optional[Sampler] = None,
    freq_thresholds: Optional[List[float]] = None,
    n_jobs: int = 1,
):
    """Initializes the walker and records its frequency thresholds.

    Args:
        depth: Forwarded to the parent initializer.
        max_walks: Forwarded to the parent initializer.
        sampler: The sampling strategy. When None, a new UniformSampler
            is created for this instance.
        freq_thresholds: Stored as ``self.freq_thresholds``. When None,
            a new list ``[0.001]`` is used.
        n_jobs: Forwarded to the parent initializer.
    """
    # Defaults written as ``UniformSampler()`` / ``[0.001]`` in the signature
    # are evaluated once and shared between all calls; any mutation through
    # one instance would leak into the others. Build them per call.
    if sampler is None:
        sampler = UniformSampler()
    super().__init__(depth, max_walks, sampler, n_jobs)
    self.freq_thresholds = (
        [0.001] if freq_thresholds is None else freq_thresholds
    )
def __init__(
    self,
    depth: int,
    max_walks: Optional[int] = None,
    sampler: Optional[Sampler] = None,
    n_jobs: int = 1,
    is_support_remote: bool = True,
):
    """Stores the walking configuration on the instance.

    Args:
        depth: Stored as ``self.depth``.
        max_walks: Stored as ``self.max_walks``.
        sampler: The sampling strategy. When None, a new UniformSampler
            is created.
        n_jobs: The number of processes. The value -1 is replaced by the
            CPU count reported by ``multiprocessing.cpu_count()``.
        is_support_remote: Stored as ``self.is_support_remote``.
    """
    self.depth = depth
    self.is_support_remote = is_support_remote
    # -1 is the "use every core" sentinel.
    self.n_jobs = multiprocessing.cpu_count() if n_jobs == -1 else n_jobs
    self.max_walks = max_walks
    self.sampler = UniformSampler() if sampler is None else sampler
def __init__(
    self,
    depth: int,
    walks_per_graph: float,
    sampler: Sampler = None,
    hop_prob: float = 0.1,
    resolution: int = 1,
):
    """Initializes the walker and records its hop settings.

    Args:
        depth: Forwarded to the parent initializer.
        walks_per_graph: Forwarded to the parent initializer.
        sampler: The sampling strategy. When None, a new UniformSampler
            is created for this instance.
        hop_prob: Stored as ``self.hop_prob``.
        resolution: Stored as ``self.resolution``.
    """
    # Avoid a ``UniformSampler()`` default in the signature: it would be
    # instantiated once at definition time and shared by every instance.
    if sampler is None:
        sampler = UniformSampler()
    super().__init__(depth, walks_per_graph, sampler)
    self.hop_prob = hop_prob
    self.resolution = resolution
def __init__(
    self,
    depth: int,
    max_walks: Optional[int] = None,
    sampler: Optional[Sampler] = None,
    hop_prob: float = 0.1,
    resolution: int = 1,
    n_jobs: int = 1,
):
    """Initializes the walker and records its hop settings.

    Args:
        depth: Forwarded to the parent initializer.
        max_walks: Forwarded to the parent initializer.
        sampler: The sampling strategy. When None, a new UniformSampler
            is created for this instance.
        hop_prob: Stored as ``self.hop_prob``.
        resolution: Stored as ``self.resolution``.
        n_jobs: Forwarded to the parent initializer.
    """
    # Avoid a ``UniformSampler()`` default in the signature: it would be
    # instantiated once at definition time and shared by every instance.
    if sampler is None:
        sampler = UniformSampler()
    # NOTE(review): the trailing ``False`` is presumably the parent's
    # ``is_support_remote`` flag -- confirm against the parent signature.
    super().__init__(depth, max_walks, sampler, n_jobs, False)
    self.hop_prob = hop_prob
    self.resolution = resolution
def __init__(
    self,
    depth: int,
    walks_per_graph: float,
    sampler: Sampler = None,
    grams: int = 3,
    wildcards: list = None,
):
    """Initializes the walker and its n-gram bookkeeping.

    Args:
        depth: Forwarded to the parent initializer.
        walks_per_graph: Forwarded to the parent initializer.
        sampler: The sampling strategy. When None, a new UniformSampler
            is created for this instance.
        grams: Stored as ``self.grams``.
        wildcards: Stored as ``self.wildcards`` (may be None).
    """
    # Avoid a ``UniformSampler()`` default in the signature: it would be
    # instantiated once at definition time and shared by every instance.
    if sampler is None:
        sampler = UniformSampler()
    super().__init__(depth, walks_per_graph, sampler)
    self.grams = grams
    # Starts empty; populated elsewhere in the class.
    self.n_gram_map = {}  # type: Dict[Tuple, str]
    self.wildcards = wildcards
def __init__(
    self,
    embedder: Optional[Embedder] = None,
    walkers: Optional[Sequence[Walker]] = None,
):
    """Sets up the embedding technique and the walking strategies.

    Args:
        embedder: The embedding technique. When None, a ``Word2Vec()``
            instance is created.
        walkers: The walking strategies. When None, a single
            ``RandomWalker(2, None, UniformSampler(inverse=False))`` is
            used.
    """
    self.embedder = Word2Vec() if embedder is None else embedder
    self.walks_: List[rdflib.URIRef] = []
    if walkers is None:
        walkers = [RandomWalker(2, None, UniformSampler(inverse=False))]
    self.walkers = walkers
def check_walker(name, Walker):
    """Checks that a walker returns a set with a bounded number of walks.

    Args:
        name: The name of the walking strategy; selects the upper bound.
        Walker: The walker class, called as
            ``Walker(depth, walks_per_graph, sampler)``.
    """
    depth = 2
    walks_per_graph = 5

    walker = Walker(depth, walks_per_graph, UniformSampler())
    canonical_walks = walker.extract(KNOWLEDGE_GRAPH, ENTITIES_SUBSET)
    assert type(canonical_walks) == set

    if name == "WeisfeilerLehmanWalker":
        upper_bound = len(ENTITIES_SUBSET * walks_per_graph * 5)
    elif name == "WalkletWalker":
        # Sometimes, WalkletWalker returns one more walk than the number
        # specified. We need to fix that.
        upper_bound = len(ENTITIES_SUBSET * walks_per_graph * (depth + 1))
    else:
        upper_bound = len(ENTITIES_SUBSET * walks_per_graph)

    assert len(canonical_walks) <= upper_bound
class Walker(metaclass=abc.ABCMeta):
    """Abstract base of the walking strategies.

    Attributes:
        depth: The depth per entity.
        max_walks: The maximum number of walks per entity.
        sampler: The sampling strategy.
            Defaults to UniformSampler().
        n_jobs: The number of processes to use for multiprocessing. Use -1
            to allocate as many processes as there are CPU cores available
            in the machine.
            Defaults to 1.
        is_support_remote: If true, indicate that the walking strategy can
            be used to retrieve walks via a SPARQL endpoint server.
            Defaults to True.

    """

    # Class-level placeholder. Note that `_init_worker` and `_proc` declare
    # `global kg`, i.e. they use a module-level name, not this attribute.
    kg = None

    def __init__(
        self,
        depth: int,
        max_walks: Optional[int] = None,
        sampler: Optional[Sampler] = None,
        n_jobs: int = 1,
        is_support_remote: bool = True,
    ):
        self.depth = depth
        self.is_support_remote = is_support_remote
        # -1 is the "use every core" sentinel.
        self.n_jobs = multiprocessing.cpu_count() if n_jobs == -1 else n_jobs
        self.max_walks = max_walks
        self.sampler = UniformSampler() if sampler is None else sampler

    def extract(
        self, kg: KG, instances: List[rdflib.URIRef], verbose=False
    ) -> Set[Tuple[Any, ...]]:
        """Fits the sampler, then extracts the walks of every instance
        through a pool of worker processes running `_extract`.

        Args:
            kg: The Knowledge Graph.

                The graph from which the neighborhoods are extracted for the
                provided instances.
            instances: The instances to be extracted from the Knowledge
                Graph.
            verbose: If true, display a progress bar for the extraction of
                the walks.

        Returns:
            The set gathering the walks of all the provided instances.

        Raises:
            RemoteNotSupported: If the Knowledge Graph is remote and this
                walking strategy does not support remote graphs.

        """
        if kg.is_remote and not self.is_support_remote:
            raise RemoteNotSupported(
                "Invalid walking strategy. Please, choose a walking strategy "
                + "that can retrieve walks via a SPARQL endpoint server."
            )
        self.sampler.fit(kg)

        # The class is detected by name to avoid circular imports.
        if "CommunityWalker" in str(self):
            self._community_detection(kg)  # type: ignore

        if kg.is_remote:
            asyncio.run(kg._fill_entity_hops(instances))  # type: ignore

        with multiprocessing.Pool(
            self.n_jobs, self._init_worker, [kg]
        ) as pool:
            progress = tqdm(
                pool.imap_unordered(self._proc, instances),
                total=len(instances),
                disable=not verbose,
            )
            partial_results = list(progress)

        # Merge the per-instance dictionaries returned by the workers.
        walks_by_instance = {}  # type: ignore
        for partial in partial_results:
            walks_by_instance.update(partial)

        return {
            walk
            for instance in instances
            for walk in walks_by_instance[instance]
        }

    @abc.abstractmethod
    def _extract(
        self, kg: KG, instance: rdflib.URIRef
    ) -> Dict[Any, Tuple[Tuple[str, ...], ...]]:
        """Extracts the walks rooted at a single instance.

        Args:
            kg: The Knowledge Graph.

                The graph from which the neighborhoods are extracted for the
                provided instance.
            instance: The instance to be extracted from the Knowledge Graph.

        Returns:
            A dictionary of walks; `extract` looks the walks up by
            instance, so the instance is expected to be a key.

        """
        raise NotImplementedError("This must be implemented!")

    def _init_worker(self, init_kg):
        """Initializes each worker process.

        Args:
            init_kg: The Knowledge Graph to provide to each worker process.

        """
        global kg
        kg = init_kg

    def info(self):
        """Gets informations related to a Walker.

        Returns:
            A friendly display of the Walker.

        """
        fields = [
            f"depth={self.depth}",
            f"max_walks={self.max_walks}",
            f"sampler={type(self.sampler).__name__}",
            f"n_jobs={self.n_jobs}",
            f"is_support_remote={self.is_support_remote}",
        ]
        return f"{type(self).__name__}({','.join(fields)})"

    def print_walks(
        self,
        kg: KG,
        instances: List[rdflib.URIRef],
        filename: str,
    ) -> None:
        """Extracts the walks and writes them to a file, one per paragraph.

        Args:
            kg: The Knowledge Graph.

                The graph from which the neighborhoods are extracted for the
                provided instances.
            instances: The instances to be extracted from the Knowledge
                Graph.
            filename: The path of the file the walks are written to.

        """
        rendered = []
        for walk in self.extract(kg, instances):
            last = len(walk) - 1
            pieces = []
            for idx, hop in enumerate(walk):
                pieces.append(f"{hop} ")
                if idx < last:
                    pieces.append("--> ")
            rendered.append("".join(pieces))

        with open(filename, "w+") as f:
            for line in rendered:
                f.write(f"{line}\n\n")

    def _proc(
        self, instance: rdflib.URIRef
    ) -> Dict[Any, Tuple[Tuple[str, ...], ...]]:
        """Executed by each process.

        Args:
            instance: The instance to be extracted from the Knowledge Graph.

        Returns:
            The extraction of walk by the process.

        """
        global kg
        return self._extract(kg, instance)  # type:ignore
# Fixed seed value kept so that runs of this script are reproducible.
RANDOM_STATE = 22

test_data = pd.read_csv("samples/mutag/test.tsv", sep="\t")
train_data = pd.read_csv("samples/mutag/train.tsv", sep="\t")

# Entities come from the "bond" column, labels from "label_mutagenic".
train_entities = list(train_data["bond"])
train_labels = list(train_data["label_mutagenic"])

test_entities = list(test_data["bond"])
test_labels = list(test_data["label_mutagenic"])

entities = train_entities + test_entities

# (display name, sampler instance) pairs.
samplers = [
    ("Uniform", UniformSampler()),
    ("Object Frequency", ObjFreqSampler()),
    ("Inverse Object Frequency", ObjFreqSampler(inverse=True)),
    (
        "Inverse Object Frequency Split",
        ObjFreqSampler(inverse=True, split=True),
    ),
    ("Predicate Frequency", PredFreqSampler()),
    ("Inverse Predicate Frequency", PredFreqSampler(inverse=True)),
    ("Predicate + Object Frequency", ObjPredFreqSampler()),
    (
        "Inverse Predicate + Object Frequency",
        ObjPredFreqSampler(inverse=True),
    ),
    ("PageRank", PageRankSampler()),
    ("Inverse PageRank", PageRankSampler(inverse=True)),
    ("PageRank Split", PageRankSampler(split=True)),
    ("Inverse PageRank Split", PageRankSampler(inverse=True, split=True)),
]
from sklearn.svm import SVC from pyrdf2vec import RDF2VecTransformer from pyrdf2vec.embedders import Word2Vec from pyrdf2vec.graphs import KG from pyrdf2vec.samplers import UniformSampler from pyrdf2vec.walkers import RandomWalker, Walker DATASET = { "test": ["samples/mutag/test.tsv", "bond", "label_mutagenic"], "train": ["samples/mutag/train.tsv", "bond", "label_mutagenic"], } LABEL_PREDICATES = {"http://dl-learner.org/carcinogenesis#isMutagenic"} OUTPUT = "samples/mutag/mutag.owl" # We'll extract all possible walks of depth 4 (2 hops) WALKERS = [RandomWalker(2, None, UniformSampler(inverse=False))] PLOT_SAVE = "embeddings.png" PLOT_TITLE = "pyRDF2Vec" warnings.filterwarnings("ignore") np.random.seed(42) random.seed(42) def create_embeddings( kg: KG, entities: List[rdflib.URIRef], split: int, walkers: Sequence[Walker],
from pyrdf2vec.graphs import KG
from pyrdf2vec.samplers import UniformSampler
from pyrdf2vec.walkers import RandomWalker, Walker
import numpy as np
import logging

# Script: load a knowledge graph, compute embeddings for a saved list of
# entities, and store the result on disk. Progress is logged to a file.
# NOTE(review): RDF2VecTransformer and Word2Vec are used below but are not
# imported in this excerpt -- presumably imported above; confirm.

# Send log records (with millisecond timestamps) to "rdf2vec.log".
logging.basicConfig(
    filename="rdf2vec.log",
    level=logging.INFO,
    format=
    '%(asctime)s.%(msecs)03d %(levelname)s %(module)s - %(funcName)s: %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S')

# Define the label predicates: all triples with these predicates
# will be excluded from the graph. The list is empty here.
logging.info("Read in knowledge graph.")
label_predicates = []
kg = KG(location="data/dbp_graph.ttl", label_predicates=label_predicates)

# One random walker built with positional arguments (4, 5) and a uniform
# sampler; the embedder is Word2Vec configured with sg=1.
logging.info("Create walkers and transformers.")
walkers = [RandomWalker(4, 5, UniformSampler())]
transformer = RDF2VecTransformer(Word2Vec(sg=1), walkers=walkers)

logging.info("Read in entities.")
# Entities should be a list of URIs that can be found in the Knowledge Graph.
# NOTE(review): allow_pickle=True makes np.load unpickle object arrays, which
# can execute arbitrary code -- only use with a trusted "entities.npy".
entities = list(np.load("data/entities.npy", allow_pickle=True))

logging.info("Calculate embeddings.")
embeddings = transformer.fit_transform(kg, entities)

logging.info("Write embeddings to disk.")
np.save("data/embeddings.npy", embeddings)

logging.info("Finished job.")
from sklearn.metrics import accuracy_score, confusion_matrix from sklearn.svm import SVC from pyrdf2vec import RDF2VecTransformer from pyrdf2vec.embedders import Word2Vec from pyrdf2vec.graphs import KG from pyrdf2vec.samplers import UniformSampler from pyrdf2vec.walkers import RandomWalker, Walker DATASET = { "train": ["samples/products.csv", "product"], } LABEL_PREDICATES = ["http://dice-researcher.com/grocery-recommendation/recommendation#list"] OUTPUT = "samples/dataset.owl" WALKER = [RandomWalker(500, 4, UniformSampler())] PLOT_SAVE = "embeddings-new.png" PLOT_TITLE = "pyRDF2Vec" warnings.filterwarnings("ignore") def create_embeddings(kg, entities, split, walker=WALKER, sg=1): """Creates embeddings for a list of entities according to a knowledge graphs and a walking strategy. Args: kg (graph.KnowledgeGraph): The knowledge graph. The graph from which the neighborhoods are extracted for the provided instances. entities (array-like): The train and test instances to create the
def test_fit(self):
    """Checks that fitting a UniformSampler with None does not raise."""
    sampler = UniformSampler()
    sampler.fit(None)
class Walker(metaclass=abc.ABCMeta):
    """Abstract base of the walking strategies.

    Attributes:
        depth: The depth per entity.
        walks_per_graph: The maximum number of walks per entity.
        sampler: The sampling strategy.
            Default to UniformSampler().

    """

    def __init__(
        self,
        depth: int,
        walks_per_graph: float,
        sampler: Sampler = None,
    ):
        self.depth = depth
        self.walks_per_graph = walks_per_graph
        # Fall back to a fresh uniform sampler when the caller gave none.
        self.sampler = UniformSampler() if sampler is None else sampler

    def extract(
        self, kg: KG, instances: List[rdflib.URIRef]
    ) -> Set[Tuple[Any, ...]]:
        """Fits the sampling strategy on the graph, then delegates to the
        strategy-specific `_extract` method.

        Args:
            kg: The knowledge graph.

                The graph from which the neighborhoods are extracted for the
                provided instances.
            instances: The instances to extract from the knowledge graph.

        Returns:
            Whatever `_extract` returns for the given graph and instances
            (annotated as a set of walks).

        """
        self.sampler.fit(kg)
        walks = self._extract(kg, instances)
        return walks

    @abc.abstractmethod
    def _extract(
        self, kg: KG, instances: List[rdflib.URIRef]
    ) -> Set[Tuple[Any, ...]]:
        """Extracts the walks rooted at the provided instances.

        Args:
            kg: The knowledge graph.

                The graph from which the neighborhoods are extracted for the
                provided instances.
            instances: The instances to extract from the knowledge graph.

        Returns:
            The extracted walks (annotated as a set of tuples).

        """
        raise NotImplementedError("This must be implemented!")

    def print_walks(
        self,
        kg: KG,
        instances: List[rdflib.URIRef],
        file_name: str,
    ) -> None:
        """Extracts the walks and writes them to a file, one per paragraph.

        Args:
            kg: The knowledge graph.

                The graph from which the neighborhoods are extracted for the
                provided instances.
            instances: The instances to extract from the knowledge graph.
            file_name: The path of the file the walks are written to.

        """
        rendered = []
        for walk in self.extract(kg, instances):
            last = len(walk) - 1
            pieces = []
            for idx, hop in enumerate(walk):
                pieces.append(f"{hop} ")
                if idx < last:
                    pieces.append("--> ")
            rendered.append("".join(pieces))

        with open(file_name, "w+") as f:
            for line in rendered:
                f.write(f"{line}\n\n")
def test_weight(self, setup, kg, root, is_reverse):
    """Checks that the uniform sampler gives every hop a weight of 1."""
    uniform = UniformSampler()
    start = Vertex(f"{URL}#{root}")
    hops = kg.get_hops(start, is_reverse=is_reverse)
    for hop in hops:
        weight = uniform.get_weight(hop)
        assert weight == 1
class Walker(ABC):
    """Abstract base of the walking strategies.

    Attributes:
        _is_support_remote: True if the walking strategy can be used with a
            remote Knowledge Graph, False Otherwise
            Defaults to True.
        kg: The global KG used later on for the worker process.
            Defaults to None.
        max_depth: The maximum depth of one walk.
        max_walks: The maximum number of walks per entity.
            Defaults to None.
        n_jobs: The number of processes used for the extraction. -1 is
            replaced by the number of CPU cores; None means one process.
            Defaults to None.
        random_state: The random state to use to keep random determinism
            with the walking strategy.
            Defaults to None.
        sampler: The sampling strategy.
            Defaults to UniformSampler.
        with_reverse: True to extracts parents and children hops from an
            entity, creating (max_walks * max_walks) walks of 2 * depth,
            allowing also to centralize this entity in the walks. False
            otherwise. This doesn't work with NGramWalker and WLWalker.
            Defaults to False.

    """

    kg = attr.ib(init=False, repr=False, type=Optional[KG], default=None)

    max_depth = attr.ib(
        type=int,
        validator=[attr.validators.instance_of(int), _check_max_depth],
    )

    max_walks = attr.ib(  # type: ignore
        default=None,
        type=Optional[int],
        validator=[
            attr.validators.optional(attr.validators.instance_of(int)),
            _check_max_walks,
        ],
    )

    sampler = attr.ib(
        factory=lambda: UniformSampler(),
        type=Sampler,
        validator=attr.validators.instance_of(Sampler),  # type: ignore
    )

    n_jobs = attr.ib(  # type: ignore
        default=None,
        type=Optional[int],
        validator=[
            attr.validators.optional(attr.validators.instance_of(int)),
            _check_jobs,
        ],
    )

    with_reverse = attr.ib(
        kw_only=True,
        type=Optional[bool],
        default=False,
        validator=attr.validators.instance_of(bool),
    )

    random_state = attr.ib(
        kw_only=True,
        type=Optional[int],
        default=None,
        validator=attr.validators.optional(attr.validators.instance_of(int)),
    )

    _is_support_remote = attr.ib(
        init=False, repr=False, type=bool, default=True
    )

    def __attrs_post_init__(self):
        """Resolves the -1 job count and seeds the sampler."""
        if self.n_jobs == -1:
            self.n_jobs = multiprocessing.cpu_count()
        self.sampler.random_state = self.random_state

    def extract(
        self, kg: KG, entities: Entities, verbose: int = 0
    ) -> List[List[SWalk]]:
        """Fits the sampler, then extracts the walks of every entity through
        a pool of worker processes running `_extract`.

        Args:
            kg: The Knowledge Graph.
            entities: The entities to be extracted from the Knowledge Graph.
            verbose: The verbosity level.
                0: does not display anything;
                1: display of the progress of extraction and training of
                   walks;
                2: debugging.
                Defaults to 0.

        Returns:
            The per-entity walks, as assembled by `_post_extract`.

        Raises:
            WalkerNotSupported: If there is an attempt to use an invalid
                walking strategy to a remote Knowledge Graph.

        """
        if kg._is_remote and not self._is_support_remote:
            raise WalkerNotSupported(
                "Invalid walking strategy. Please, choose a walking strategy "
                + "that can fetch walks via a SPARQL endpoint server."
            )

        self.sampler.fit(kg)

        # An unset job count means a single worker process.
        process = 1 if self.n_jobs is None else self.n_jobs
        batched_remote = kg._is_remote and kg.mul_req

        if batched_remote and process >= 2:
            warnings.warn(
                "Using 'mul_req=True' and/or 'n_jobs>=2' speed up the "
                + "extraction of entity's walks, but may violate the policy "
                + "of some SPARQL endpoint servers.",
                category=RuntimeWarning,
                stacklevel=2,
            )

        if batched_remote:
            kg._fill_hops(entities)

        with multiprocessing.Pool(process, self._init_worker, [kg]) as pool:
            progress = tqdm(
                pool.imap(self._proc, entities),
                total=len(entities),
                disable=(verbose == 0),
            )
            res = list(progress)
        return self._post_extract(res)

    @abstractmethod
    def _extract(self, kg: KG, entity: Vertex) -> EntityWalks:
        """Extracts random walks for an entity based on a Knowledge Graph.

        Args:
            kg: The Knowledge Graph.
            entity: The root node to extract walks.

        Returns:
            A dictionary having the entity as key and a list of tuples as
            value corresponding to the extracted walks.

        Raises:
            NotImplementedError: If this method is called, without having
                provided an implementation.

        """
        raise NotImplementedError("This must be implemented!")

    def _init_worker(self, init_kg: KG) -> None:
        """Initializes each worker process.

        Args:
            init_kg: The Knowledge Graph to provide to each worker process.

        """
        global kg
        kg = init_kg  # type: ignore

    def _post_extract(self, res: List[EntityWalks]) -> List[List[SWalk]]:
        """Flattens the worker results into one list of per-entity walks.

        Args:
            res: the result of the walks extracted with multiprocessing.

        Returns:
            A list holding, for every dictionary in `res`, each of its
            values in order.

        """
        return [
            walks
            for entity_to_walks in res
            for walks in entity_to_walks.values()
        ]

    def _proc(self, entity: str) -> EntityWalks:
        """Executed by each process.

        Args:
            entity: The entity to be extracted from the Knowledge Graph.

        Returns:
            The extraction of walk by the process.

        """
        global kg
        return self._extract(kg, Vertex(entity))  # type: ignore