def check_sampler(Sampler):
    """Check that a ``RandomWalker`` using ``Sampler`` extracts a bounded set.

    A walker of depth 2 is built with ``walks_per_graph`` as its second
    argument and an instance of ``Sampler``; walks are then extracted from
    ``KNOWLEDGE_GRAPH`` for ``ENTITIES_SUBSET``.

    Args:
        Sampler: Zero-argument callable (typically a sampler class) whose
            result is handed to ``RandomWalker`` as its third argument.

    Raises:
        AssertionError: If the extracted walks are not a ``set`` or if there
            are more of them than ``len(ENTITIES_SUBSET * walks_per_graph)``.
    """
    walks_per_graph = 5
    canonical_walks = RandomWalker(2, walks_per_graph, Sampler()).extract(
        KNOWLEDGE_GRAPH, ENTITIES_SUBSET
    )
    # isinstance() is the idiomatic type check (and tolerates set subclasses).
    assert isinstance(canonical_walks, set)
    assert len(canonical_walks) <= len(ENTITIES_SUBSET * walks_per_graph)
def test_load_save_transformer(self):
    """Round-trip an ``RDF2VecTransformer`` through ``save()`` and ``load()``.

    The transformer is saved and loaded without an explicit filename; the
    loaded object must still hold the two walkers, in order and with their
    original types.
    """
    RDF2VecTransformer(
        walkers=[RandomWalker(2, None), WeisfeilerLehmanWalker(2, 2)]
    ).save()
    try:
        transformer = RDF2VecTransformer.load()
        assert len(transformer.walkers) == 2
        assert isinstance(transformer.walkers[0], RandomWalker)
        assert isinstance(transformer.walkers[1], WeisfeilerLehmanWalker)
    finally:
        # Always remove the file written by save(), even when loading or an
        # assertion fails, so a failing run does not leak it into later tests.
        os.remove("transformer_data")
def test_dfs(self, setup, kg, root, max_depth, max_walks, is_reverse):
    """Check the length and the root position of every walk from ``_dfs``.

    Each walk may hold at most ``max_depth * 2 + 1`` elements.  The root
    vertex must be the last element of a reversed walk and the first element
    otherwise.
    """
    root = f"{URL}#{root}"
    walker = RandomWalker(max_depth, max_walks, random_state=42)
    longest_allowed = (max_depth * 2) + 1
    # Reversed walks end at the root, forward walks start from it.
    root_index = -1 if is_reverse else 0

    for walk in walker._dfs(kg, Vertex(root), is_reverse):
        assert len(walk) <= longest_allowed
        assert walk[root_index].name == root
def __init__(
    self,
    embedder: Optional[Embedder] = None,
    walkers: Optional[Sequence[Walker]] = None,
):
    """Store the embedder and the walkers, falling back to defaults.

    Args:
        embedder: The embedding technique. A fresh ``Word2Vec()`` is used
            when None.
        walkers: The walking strategies. ``[RandomWalker(2, None)]`` is used
            when None.
    """
    self.embedder = Word2Vec() if embedder is None else embedder
    self.walkers = [RandomWalker(2, None)] if walkers is None else walkers
    # Starts empty; nothing in this constructor fills it.
    self.walks_: List[rdflib.URIRef] = []
def __init__(
    self,
    vector_size=500,
    walkers=None,
    n_jobs=1,
    window=5,
    sg=1,
    max_iter=10,
    negative=25,
    min_count=1,
):
    """Store the hyper-parameters on the instance.

    Every argument is saved unchanged under an attribute of the same name.
    NOTE(review): ``vector_size``, ``window``, ``sg``, ``max_iter``,
    ``negative`` and ``min_count`` look like Word2Vec settings — confirm
    against the code that consumes them.

    Args:
        vector_size: Stored as ``self.vector_size``. Defaults to 500.
        walkers: The walking strategies. When None, a new
            ``[RandomWalker(2, float("inf"))]`` is created for this instance.
        n_jobs: Stored as ``self.n_jobs``. Defaults to 1.
        window: Stored as ``self.window``. Defaults to 5.
        sg: Stored as ``self.sg``. Defaults to 1.
        max_iter: Stored as ``self.max_iter``. Defaults to 10.
        negative: Stored as ``self.negative``. Defaults to 25.
        min_count: Stored as ``self.min_count``. Defaults to 1.
    """
    # A list literal in the signature is evaluated once and then shared by
    # every instance; build the default per call instead.
    if walkers is None:
        walkers = [RandomWalker(2, float("inf"))]

    self.max_iter = max_iter
    self.min_count = min_count
    self.n_jobs = n_jobs
    self.negative = negative
    self.sg = sg
    self.vector_size = vector_size
    self.walkers = walkers
    self.window = window
def test_extract(
    self, setup, kg, root, max_depth, max_walks, with_reverse
):
    """Check the number, the content and the length of walks from ``_extract``.

    When ``max_walks`` is set, at most ``max_walks`` walks are expected
    (``max_walks * max_walks`` with ``with_reverse``).  Every element at
    index 2, 4, ... of a walk must start with ``b'``.  Without
    ``with_reverse`` a walk starts at the root and holds at most
    ``max_depth * 2 + 1`` elements; with it, the length bound is doubled.
    """
    root = f"{URL}#{root}"
    walker = RandomWalker(
        max_depth, max_walks, with_reverse=with_reverse, random_state=42
    )
    walks = walker._extract(kg, Vertex(root))[root]
    one_way_limit = (max_depth * 2) + 1

    if max_walks is not None:
        walks_limit = max_walks * max_walks if with_reverse else max_walks
        assert len(walks) <= walks_limit

    for walk in walks:
        for obj in walk[2::2]:
            assert obj.startswith("b'")
        if with_reverse:
            assert len(walk) <= one_way_limit * 2
        else:
            assert walk[0] == root
            assert len(walk) <= one_way_limit
from sklearn.svm import SVC

from pyrdf2vec import RDF2VecTransformer
from pyrdf2vec.embedders import Word2Vec
from pyrdf2vec.graphs import KG
from pyrdf2vec.samplers import UniformSampler
from pyrdf2vec.walkers import RandomWalker, Walker

# Per split: a TSV path followed by two column names.
# NOTE(review): presumably the entity column and the label column — confirm
# against the code that reads DATASET.
DATASET = {
    "test": ["samples/mutag/test.tsv", "bond", "label_mutagenic"],
    "train": ["samples/mutag/train.tsv", "bond", "label_mutagenic"],
}
# NOTE(review): presumably the predicates removed from the graph so that the
# label does not leak into the walks — confirm where this set is consumed.
LABEL_PREDICATES = {"http://dl-learner.org/carcinogenesis#isMutagenic"}
OUTPUT = "samples/mutag/mutag.owl"
# We'll extract all possible walks of depth 4 (2 hops)
WALKERS = [RandomWalker(2, None, UniformSampler(inverse=False))]

PLOT_SAVE = "embeddings.png"
PLOT_TITLE = "pyRDF2Vec"

# Silence every warning for the rest of the script.
warnings.filterwarnings("ignore")

# Seed NumPy's and the standard library's global random generators.
np.random.seed(42)
random.seed(42)


def create_embeddings(
    kg: KG,
    entities: List[rdflib.URIRef],
    split: int,
    walkers: Sequence[Walker],
from pyrdf2vec.graphs import KG
from pyrdf2vec.samplers import UniformSampler
from pyrdf2vec.walkers import RandomWalker, Walker
import numpy as np
import logging

# Send INFO-and-above records to "rdf2vec.log", with millisecond timestamps
# and the emitting module/function in every record.
logging.basicConfig(
    filename="rdf2vec.log",
    level=logging.INFO,
    format=
    '%(asctime)s.%(msecs)03d %(levelname)s %(module)s - %(funcName)s: %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S')

# Define the label predicates, all triples with these predicates
# will be excluded from the graph
logging.info("Read in knowledge graph.")
# Empty here: no predicate is passed to the KG for exclusion.
label_predicates = []
kg = KG(location="data/dbp_graph.ttl", label_predicates=label_predicates)

logging.info("Create walkers and transformers.")
# One RandomWalker(4, 5, ...) with a uniform sampler.
# NOTE(review): presumably depth 4 and at most 5 walks per entity — confirm
# against the RandomWalker signature of the installed version.
walkers = [RandomWalker(4, 5, UniformSampler())]
transformer = RDF2VecTransformer(Word2Vec(sg=1), walkers=walkers)

logging.info("Read in entities.")
# Entities should be a list of URIs that can be found in the Knowledge Graph
# NOTE(review): allow_pickle=True unpickles the file content; only load
# entity files from a trusted source.
entities = list(np.load("data/entities.npy", allow_pickle=True))

logging.info("Calculate embeddings.")
embeddings = transformer.fit_transform(kg, entities)

logging.info("Write embeddings to disk.")
np.save("data/embeddings.npy", embeddings)

logging.info("Finished job.")
train_labels = list(train_data["label_mutagenic"])

test_entities = [entity for entity in test_data["bond"]]
test_labels = list(test_data["label_mutagenic"])

# Train entities come first, so the combined lists can later be split again
# at len(train_entities).
entities = train_entities + test_entities
labels = train_labels + test_labels

# Build the transformer and immediately fit/transform all entities;
# fit_transform returns the embeddings together with the literals.
embeddings, literals = RDF2VecTransformer(
    # Ensure random determinism for Word2Vec.
    # Must be used with PYTHONHASHSEED.
    Word2Vec(workers=1, epochs=10),
    # Extract all walks with a maximum depth of 2 for each entity using two
    # processes and use a random state to ensure that the same walks are
    # generated for the entities.
    walkers=[RandomWalker(2, None, n_jobs=2, random_state=RANDOM_STATE)],
    verbose=1,
).fit_transform(
    KG(
        "samples/mutag/mutag.owl",
        # The label predicate is skipped when the graph is built.
        skip_predicates={"http://dl-learner.org/carcinogenesis#isMutagenic"},
        # One chain of two predicates (hasAtom, then charge).
        # NOTE(review): presumably followed from each entity to reach the
        # literal value — confirm against KG's `literals` documentation.
        literals=[
            [
                "http://dl-learner.org/carcinogenesis#hasAtom",
                "http://dl-learner.org/carcinogenesis#charge",
            ],
        ],
    ),
    entities,
)
("PageRank Split", PageRankSampler(split=True)),
("Inverse PageRank Split", PageRankSampler(inverse=True, split=True)),
]

print(f"Prediction of {len(test_entities)} entities:")

# One full embedding run per sampling strategy; the display name of the
# sampler is not used in the part of the loop shown here.
for _, sampler in samplers:
    embeddings, _ = RDF2VecTransformer(  # type:ignore
        # Use one worker threads for Word2Vec to ensure random determinism.
        # Must be used with PYTHONHASHSEED.
        Word2Vec(workers=1),
        # Extract a maximum of 100 walks of a maximum depth of 4 for each
        # entity using two processes and use a random state to ensure that the
        # same walks are generated for the entities.
        walkers=[
            RandomWalker(4, 100, sampler, n_jobs=2, random_state=RANDOM_STATE)
        ],
    ).fit_transform(
        KG(
            "samples/mutag/mutag.owl",
            # The label predicate is skipped when the graph is built.
            skip_predicates={
                "http://dl-learner.org/carcinogenesis#isMutagenic"
            },
        ),
        entities,
    )

    # The first len(train_entities) rows are the train embeddings, the
    # remaining rows are the test embeddings.
    train_embeddings = embeddings[:len(train_entities)]
    test_embeddings = embeddings[len(train_entities):]

    # Fit a Support Vector Machine on train embeddings and pick the best
from sklearn.svm import SVC

from pyrdf2vec import RDF2VecTransformer
from pyrdf2vec.embedders import Word2Vec
from pyrdf2vec.graphs import KG
from pyrdf2vec.walkers import RandomWalker, Walker

# Per split: a TSV path followed by two column names.
# NOTE(review): presumably the entity column and the label column — confirm
# against the code that reads DATASET.
DATASET = {
    "test": ["samples/mutag/test.tsv", "bond", "label_mutagenic"],
    "train": ["samples/mutag/train.tsv", "bond", "label_mutagenic"],
}
# NOTE(review): presumably the predicates removed from the graph so that the
# label does not leak into the walks — confirm where this set is consumed.
LABEL_PREDICATES = {"http://dl-learner.org/carcinogenesis#isMutagenic"}
OUTPUT = "samples/mutag/mutag.owl"
# We'll extract all possible walks of depth 2 with 4 processes.
WALKERS = [RandomWalker(2, None, n_jobs=4)]

# We'll extract all possible walks of depth 2 (without multi-processing)
# WALKERS = [RandomWalker(2, None)]

PLOT_SAVE = "embeddings.png"
PLOT_TITLE = "pyRDF2Vec"

# Silence every warning for the rest of the script.
warnings.filterwarnings("ignore")

# Seed NumPy's and the standard library's global random generators.
np.random.seed(42)
random.seed(42)


def create_embeddings(
    kg: KG,
    entities: List[rdflib.URIRef],
("Inverse Object Frequency", ObjFreqSampler(inverse=True)),
(
    "Inverse Object Frequency Split",
    ObjFreqSampler(inverse=True, split=True),
),
("Predicate Frequency", PredFreqSampler()),
("Inverse Predicate Frequency", PredFreqSampler(inverse=True)),
("Predicate + Object Frequency", ObjPredFreqSampler()),
("Inverse Predicate + Object Frequency", ObjPredFreqSampler(inverse=True)),
("PageRank", PageRankSampler()),
("Inverse PageRank", PageRankSampler(inverse=True)),
("PageRank Split", PageRankSampler(split=True)),
("Inverse PageRank Split", PageRankSampler(inverse=True, split=True)),
]

# Evaluate every (display name, sampler) pair with the same pipeline:
# embed, split, fit an SVM, print the test accuracy.
for name, sampler in samplers:
    # Create embeddings with random walks
    transformer = RDF2VecTransformer(walkers=[RandomWalker(4, 100, sampler)])
    walk_embeddings = transformer.fit_transform(kg, entities, verbose=True)

    # Split into train and test embeddings
    train_embeddings = walk_embeddings[:len(train_entities)]
    test_embeddings = walk_embeddings[len(train_entities):]

    # Fit a support vector machine on train embeddings and evaluate on test
    clf = SVC(random_state=42)
    clf.fit(train_embeddings, train_labels)

    # Print the prefix without a trailing newline so that the accuracy
    # printed next lands on the same output line.
    print(end=f"[{name}] Support Vector Machine: Accuracy = ")
    print(accuracy_score(test_labels, clf.predict(test_embeddings)))
class RDF2VecTransformer:
    """Transforms nodes in a Knowledge Graph into an embedding.

    Attributes:
        _embeddings: All the embeddings of the model.
            Defaults to [].
        _entities: All the entities of the model.
            Defaults to [].
        _is_extract_walks_literals: True if the session must be closed after
            the call to the `transform` function. False, otherwise.
            Defaults to False.
        _literals: All the literals of the model.
            Defaults to [].
        _pos_entities: The positions, inside `_entities`, of the already
            known entities found in the latest batch of entities.
            Defaults to [].
        _pos_walks: The positions, inside the latest batch, of those already
            known entities (parallel to `_pos_entities`).
            Defaults to [].
        _walks: All the walks of the model.
            Defaults to [].
        embedder: The embedding technique.
            Defaults to Word2Vec.
        walkers: The walking strategies.
            Defaults to [RandomWalker(2)]
        verbose: The verbosity level.
            0: does not display anything;
            1: display of the progress of extraction and training of walks;
            2: debugging.
            Defaults to 0.

    """

    embedder = attr.ib(
        factory=lambda: Word2Vec(),
        type=Embedder,
        validator=attr.validators.instance_of(Embedder),  # type: ignore
    )

    walkers = attr.ib(
        factory=lambda: [RandomWalker(2)],  # type: ignore
        type=Sequence[Walker],
        validator=attr.validators.deep_iterable(
            member_validator=attr.validators.instance_of(
                Walker  # type: ignore
            ),
            iterable_validator=attr.validators.instance_of(list),
        ),
    )

    verbose = attr.ib(
        kw_only=True,
        default=0,
        type=int,
        validator=attr.validators.in_([0, 1, 2]),
    )

    _is_extract_walks_literals = attr.ib(
        init=False,
        default=False,
        type=bool,
        repr=False,
        validator=attr.validators.instance_of(bool),
    )

    _embeddings = attr.ib(init=False, type=Embeddings, factory=list)
    _entities = attr.ib(init=False, type=Entities, factory=list)
    _literals = attr.ib(init=False, type=Literals, factory=list)
    _walks = attr.ib(init=False, type=List[List[SWalk]], factory=list)

    # Both lists hold integer indexes (see `_update`).
    _pos_entities = attr.ib(init=False, type=List[int], factory=list)
    _pos_walks = attr.ib(init=False, type=List[int], factory=list)

    def fit(self, walks: List[List[SWalk]], is_update: bool = False) -> RDF2VecTransformer:
        """Fits the embeddings based on the provided entities.

        Args:
            walks: The walks to fit.
            is_update: True if the new corpus should be added to old model's
                corpus, False otherwise.
                Defaults to False.

        Returns:
            The RDF2VecTransformer.

        """
        if self.verbose == 2:
            print(self.embedder)

        tic = time.perf_counter()
        self.embedder.fit(walks, is_update)
        toc = time.perf_counter()

        if self.verbose >= 1:
            n_walks = sum(len(entity_walks) for entity_walks in walks)
            print(f"Fitted {n_walks} walks ({toc - tic:0.4f}s)")
            # The stored walks differ from this batch (e.g. after an online
            # update): also report the overall total.
            if len(self._walks) != len(walks):
                n_walks = sum(
                    len(entity_walks) for entity_walks in self._walks
                )
                print(f"> {n_walks} walks extracted " +
                      f"for {len(self._entities)} entities.")
        return self

    def fit_transform(self, kg: KG, entities: Entities, is_update: bool = False) -> Tuple[Embeddings, Literals]:
        """Creates a model and generates embeddings and literals for the
        provided entities.

        Args:
            kg: The Knowledge Graph.
            entities: The entities including test entities to create the
                embeddings. Since RDF2Vec is unsupervised, there is no label
                leakage.
            is_update: True if the new corpus should be added to old model's
                corpus, False otherwise.
                Defaults to False.

        Returns:
            The embeddings and the literals of the provided entities.

        """
        # Tells `get_walks` to leave a remote connector open: `transform`
        # still needs it and closes it afterwards.
        self._is_extract_walks_literals = True
        self.fit(self.get_walks(kg, entities), is_update)
        return self.transform(kg, entities)

    def get_walks(self, kg: KG, entities: Entities) -> List[List[SWalk]]:
        """Gets the walks of an entity based on a Knowledge Graph and a
        list of walkers

        Args:
            kg: The Knowledge Graph.
            entities: The entities including test entities to create the
                embeddings. Since RDF2Vec is unsupervised, there is no label
                leakage.

        Returns:
            The walks for the given entities.

        Raises:
            ValueError: If the provided entities aren't in the Knowledge Graph.

        """
        # Membership can only be checked up front for a local graph.
        if not kg._is_remote and not all(
                Vertex(entity) in kg._vertices for entity in entities):
            raise ValueError(
                "The provided entities must be in the Knowledge Graph.")

        # Avoids duplicate entities for unnecessary walk extractions.
        entities = list(set(entities))

        if self.verbose == 2:
            print(kg)
            print(self.walkers[0])

        walks: List[List[SWalk]] = []
        tic = time.perf_counter()
        for walker in self.walkers:
            walks += walker.extract(kg, entities, self.verbose)
        toc = time.perf_counter()

        # The entities must be merged first: that call records which of them
        # are already known, which the merge of the walks relies on.
        self._update(self._entities, entities)
        self._update(self._walks, walks)

        if self.verbose >= 1:
            n_walks = sum(len(entity_walks) for entity_walks in walks)
            print(f"Extracted {n_walks} walks " +
                  f"for {len(entities)} entities ({toc - tic:0.4f}s)")
        if (kg._is_remote and kg.mul_req
                and not self._is_extract_walks_literals):
            asyncio.run(kg.connector.close())
        return walks

    def transform(self, kg: KG, entities: Entities) -> Tuple[Embeddings, Literals]:
        """Transforms the provided entities into embeddings and literals.

        Args:
            kg: The Knowledge Graph.
            entities: The entities including test entities to create the
                embeddings. Since RDF2Vec is unsupervised, there is no label
                leakage.

        Returns:
            The embeddings and the literals of the provided entities.

        """
        assert self.embedder is not None
        embeddings = self.embedder.transform(entities)

        tic = time.perf_counter()
        literals = kg.get_literals(entities, self.verbose)
        toc = time.perf_counter()

        self._update(self._embeddings, embeddings)
        if len(literals) > 0:
            self._update(self._literals, literals)

        if kg._is_remote and kg.mul_req:
            self._is_extract_walks_literals = False
            asyncio.run(kg.connector.close())

        if self.verbose >= 1 and len(literals) > 0:
            print(f"Extracted {len(literals)} literals for {len(entities)} " +
                  f"entities ({toc - tic:0.4f}s)")
        return embeddings, literals

    def save(self, filename: str = "transformer_data") -> None:
        """Saves a RDF2VecTransformer object.

        Args:
            filename: The binary file to save the RDF2VecTransformer object.

        """
        with open(filename, "wb") as f:
            pickle.dump(self, f)

    def _update(self, attr, values) -> None:
        """Updates an attribute with a variable.

        This method is useful to keep all entities, walks, literals and
        embeddings after several online training.

        When `values` holds strings (a batch of entities), the unknown ones
        are appended to `attr` and, for the known ones, the position in
        `attr` and the position in the batch are recorded in `_pos_entities`
        and `_pos_walks`. For any other batch, the values located at the
        recorded batch positions replace the stored ones and the remaining
        values are appended. `values` itself is never modified.

        Args:
            attr: The attribute to update
            values: The new values to add.

        """
        # Nothing to merge. This also avoids indexing an empty batch below.
        if attr is None or len(values) == 0:
            return

        if isinstance(values[0], str):
            # A batch of entities starts a new update cycle: positions left
            # over from a previous cycle would point at the wrong values.
            self._pos_entities.clear()
            self._pos_walks.clear()
            for i, entity in enumerate(values):
                if entity not in attr:
                    attr.append(entity)
                else:
                    self._pos_entities.append(attr.index(entity))
                    self._pos_walks.append(i)
            return

        # Read by index instead of popping: popping shifts the positions of
        # the remaining values and would empty the list that the caller
        # still returns to its own caller.
        # NOTE(review): assumes `attr` and `values` are aligned with the
        # entities recorded by the latest entity batch — confirm for
        # several walkers and for literals.
        replaced = set()
        for pos, idx in zip(self._pos_entities, self._pos_walks):
            attr[pos] = values[idx]
            replaced.add(idx)
        attr.extend(
            value for i, value in enumerate(values) if i not in replaced
        )

    @staticmethod
    def load(filename: str = "transformer_data") -> RDF2VecTransformer:
        """Loads a RDF2VecTransformer object.

        Args:
            filename: The binary file to load the RDF2VecTransformer object.

        Returns:
            The loaded RDF2VecTransformer.

        Raises:
            ValueError: If the loaded object is not a RDF2VecTransformer.

        """
        # SECURITY: unpickling can execute arbitrary code. Only load files
        # that come from a trusted source.
        with open(filename, "rb") as f:
            transformer = pickle.load(f)
            if not isinstance(transformer, RDF2VecTransformer):
                raise ValueError(
                    "Failed to load the RDF2VecTransformer object")
            return transformer
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.svm import SVC

from pyrdf2vec import RDF2VecTransformer
from pyrdf2vec.embedders import Word2Vec
from pyrdf2vec.graphs import KG
from pyrdf2vec.samplers import UniformSampler
from pyrdf2vec.walkers import RandomWalker, Walker

# Only a "train" split is defined: a CSV path followed by one column name.
DATASET = {
    "train": ["samples/products.csv", "product"],
}
LABEL_PREDICATES = ["http://dice-researcher.com/grocery-recommendation/recommendation#list"]
OUTPUT = "samples/dataset.owl"
# NOTE(review): RandomWalker(500, 4, ...) reads as depth 500 with 4 walks if
# the arguments are (depth, number of walks) — confirm they are not swapped.
WALKER = [RandomWalker(500, 4, UniformSampler())]

PLOT_SAVE = "embeddings-new.png"
PLOT_TITLE = "pyRDF2Vec"

# Silence every warning for the rest of the script.
warnings.filterwarnings("ignore")


def create_embeddings(kg, entities, split, walker=WALKER, sg=1):
    """Creates embeddings for a list of entities according to a knowledge
    graphs and a walking strategy.

    Args:
        kg (graph.KnowledgeGraph): The knowledge graph.
            The graph from which the neighborhoods are extracted for the
            provided instances.
        entities (array-like): The train and test instances to create the
import rdflib
from sklearn.manifold import TSNE
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.svm import SVC

from pyrdf2vec import RDF2VecTransformer
from pyrdf2vec.converters import rdflib_to_kg
from pyrdf2vec.walkers import RandomWalker

# Per split: a TSV path followed by two column names.
# NOTE(review): presumably the entity column and the label column — confirm
# against the code that reads DATASET.
DATASET = {
    "test": ["samples/mutag-test.tsv", "bond", "label_mutagenic"],
    "train": ["samples/mutag-train.tsv", "bond", "label_mutagenic"],
}
LABEL_PREDICATES = ["http://dl-learner.org/carcinogenesis#isMutagenic"]
OUTPUT = "samples/mutag.owl"
# float("inf") as second argument puts no finite cap on that setting.
# NOTE(review): presumably the number of walks per entity — confirm.
WALKERS = [RandomWalker(4, float("inf"))]

PLOT_SAVE = "embeddings.png"
PLOT_TITLE = "pyRDF2Vec"

# Silence every warning for the rest of the script.
warnings.filterwarnings("ignore")


def create_embeddings(kg, entities, split, walkers, sg=1):
    """Creates embeddings for a list of entities according to a knowledge
    graphs and a walking strategy.

    Args:
        kg (graph.KnowledgeGraph): The knowledge graph.
            The graph from which the neighborhoods are extracted for the
            provided instances.
from pyrdf2vec import RDF2VecTransformer
from pyrdf2vec.embedders import Word2Vec
from pyrdf2vec.graphs import KG
from pyrdf2vec.walkers import RandomWalker, Walker

# Silence every warning for the rest of the script.
warnings.filterwarnings("ignore")

# Seed NumPy's and the standard library's global random generators.
np.random.seed(42)
random.seed(42)

FILE = "samples/countries-cities/entities.tsv"
SPARQL_ENDPOINT = "https://dbpedia.org/sparql"
# NOTE(review): this IRI has no "http://" scheme — confirm that it matches
# the predicate exactly as the endpoint returns it.
LABEL_PREDICATES = {"www.w3.org/1999/02/22-rdf-syntax-ns#type"}

# We'll extract walks of depth 4 with RandomWalker(4, 25).
# NOTE(review): the second argument (25) is presumably the maximum number of
# walks per entity rather than a number of hops — confirm.
WALKERS = [RandomWalker(4, 25)]

# The same walker configuration with multi-processing. Using
# multi-processing improves the speed of extraction of walks, but this may
# conflict with the policy of the SPARQL endpoint server.
# WALKERS = [RandomWalker(4, 25, n_jobs=2)]

PLOT_TITLE = "pyRDF2Vec"


def create_embeddings(
    kg: KG,
    entities: List[rdflib.URIRef],
    walkers: Sequence[Walker],
    sg: int = 1,
) -> List[str]:
def test_extract_random_walks(self):
    """Check that ``extract_random_walks`` returns a list.

    The walker is built with depth 4 and ``float("inf")`` as its second
    argument; the walks start from the vertex named after
    ``str(generate_entities())``.
    """
    walks = RandomWalker(4, float("inf")).extract_random_walks(
        KG, Vertex(str(generate_entities()))
    )
    # isinstance() is the idiomatic type check (and tolerates list subclasses).
    assert isinstance(walks, list)
from pyrdf2vec.graphs import KG
from pyrdf2vec.walkers import RandomWalker

# Ensure the determinism of this script by initializing a pseudo-random number.
RANDOM_STATE = 22

# Tab-separated file listing the entities to embed.
data = pd.read_csv("samples/countries-cities/entities.tsv", sep="\t")

transformer = RDF2VecTransformer(
    # Use one worker threads for Word2Vec to ensure random determinism.
    # Must be used with PYTHONHASHSEED.
    Word2Vec(workers=1),
    # Extract a maximum of 10 walks of a maximum depth of 4 for each entity
    # using two processes and use a random state to ensure that the same walks
    # are generated for the entities.
    walkers=[RandomWalker(4, 10, n_jobs=2, random_state=RANDOM_STATE)],
    verbose=1,
)

# Train and save the Word2Vec model according to the KG, the entities, and
# a walking strategy.
embeddings, _ = transformer.fit_transform(
    # Defined the DBpedia endpoint server, as well as a set of predicates to
    # exclude from this KG.
    KG(
        "https://dbpedia.org/sparql",
        # NOTE(review): this IRI has no "http://" scheme — confirm that it
        # matches the predicate exactly as the endpoint returns it.
        skip_predicates={"www.w3.org/1999/02/22-rdf-syntax-ns#type"},
        literals=[
            [
                "http://dbpedia.org/ontology/wikiPageWikiLink",
                "http://www.w3.org/2004/02/skos/core#prefLabel",
def test_extract(self):
    """Check that ``extract`` returns a set of walks.

    The walker is built with depth 4 and ``float("inf")`` as its second
    argument and run on the entities from ``generate_entities()``.
    """
    canonical_walks = RandomWalker(4, float("inf")).extract(
        KG, generate_entities()
    )
    # isinstance() is the idiomatic type check (and tolerates set subclasses).
    assert isinstance(canonical_walks, set)