def join(join_type):
    dbpedia_person = graph1.entities("dbpo:Person", entities_col_name="person1") \
        .expand('person1', [('dbpp:birthPlace', 'country1'), ('dbpp:name', 'name1')]) \
        .filter({'country1': ['regex(str(?country1), "USA")']}) \
        .filter({'name1': ['regex(str(?name1), "Abraham")']}) \
        .select_cols(['name1'])

    yago_person = graph2.feature_domain_range("rdf:type", "person2", "p") \
        .expand('p', [('rdfs:label', 'label')]) \
        .expand('person2', [('yagoinfo:name', 'name2'), ('yago:isCitizenOf', 'country2')]) \
        .filter({'label': ['= "person"@eng']}) \
        .filter({'country2': ['= yago:United_States']}) \
        .filter({'name2': ['regex(str(?name2), "Abraham")']})

    dblp_person = graph3.feature_domain_range("dc:creator", "paper", "author") \
        .expand('author', [('rdfs:label', 'name3')]) \
        .expand('paper', [('dcterm:issued', 'date')]) \
        .filter({'date': ['>= 2015']}) \
        .filter({'name3': ['regex(str(?name3), "Abraham")']})  # .select_cols(['name3'])

    endpoint = 'http://10.161.202.101:8890/sparql/'
    output_format = HttpClientDataFormat.PANDAS_DF
    client = HttpClient(endpoint_url=endpoint, return_format=output_format)

    shared_ppl = dblp_person.join(yago_person, 'name3', 'name2', join_type=join_type)
    shared_ppl2 = shared_ppl.join(dbpedia_person, 'name3', 'name1', join_type=join_type)
    print(shared_ppl2.to_sparql())
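# Example usage (a sketch, not part of the original source): the join helper above
# accepts any JoinType value. It assumes graph1, graph2, and graph3 are configured
# as in the setup block at the end of this section, and that JoinType is imported
# from rdfframes.utils.constants.
join(JoinType.InnerJoin)  # people present in all three sources
join(JoinType.OuterJoin)  # people present in any of the three sources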
def test_query_with_limit(limit, offset, return_format=HttpClientDataFormat.DEFAULT, out_file=None):
    query = 'SELECT ?tweet (min(?tag) AS ?min_tags) FROM <http://twitter.com/> ' \
            'WHERE {{ ?tweet <http://rdfs.org/sioc/ns#has_creater> <http://twitter.com/9977822/> . ' \
            '?tweet <http://twitter.com/ontology/hashashtag> ?tag }} ' \
            'GROUP BY ?tweet LIMIT {} OFFSET {};'.format(limit, offset)
    endpoint = 'http://10.161.202.101:8890/sparql/'
    port = 8890
    output_format = return_format
    max_rows = 10
    timeout = 120
    default_graph_url = 'http://twitter.com'
    client = HttpClient(endpoint_url=endpoint,
                        port=port,
                        return_format=output_format,
                        timeout=timeout,
                        default_graph_uri=default_graph_url,
                        max_rows=max_rows)
    for res in client.execute_query(query, output_format, out_file):
        print('data with type {} and length {} retrieved'.format(type(res).__name__, len(res)))
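# Example usage (a sketch, not in the original): page through the result set by
# advancing OFFSET in steps of the requested LIMIT; the page size of 100 and the
# three-page loop are arbitrary illustrative values.
for page in range(3):
    test_query_with_limit(limit=100, offset=page * 100,
                          return_format=HttpClientDataFormat.PANDAS_DF)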
def test_small_results():
    print('test_small_results:')
    query = 'SELECT ?tweet (min(?tag) AS ?min_tags) FROM <http://twitter.com/> ' \
            'WHERE { ?tweet <http://rdfs.org/sioc/ns#has_creater> <http://twitter.com/9977822/> . ' \
            '?tweet <http://twitter.com/ontology/hashashtag> ?tag } ' \
            'GROUP BY ?tweet;'
    endpoint = 'http://10.161.202.101:8890/sparql/'
    port = 8890
    output_format = HttpClientDataFormat.CSV
    max_rows = 1000
    timeout = 120
    default_graph_url = 'http://twitter.com'
    client = HttpClient(endpoint_url=endpoint,
                        port=port,
                        return_format=output_format,
                        timeout=timeout,
                        default_graph_uri=default_graph_url,
                        max_rows=max_rows)
    for res in client.execute_query(query, HttpClientDataFormat.CSV, 'response_data.txt'):
        print('data with type {} and length {} retrieved'.format(type(res).__name__, len(res)))
def important_topics():
    """
    Identifies the hot areas of research in the field of databases.
    First, we identify a list of the top conferences in the computer science field
    of interest. We then identify the authors who have published more than 20 papers
    in these conferences since the year 2000. Finally, we find the titles of all
    papers published by these authors in the specified conferences since 2005.
    """
    graph = KnowledgeGraph(graph_name='dblp',
                           graph_uri='http://dblp.l3s.de',
                           prefixes={
                               "xsd": "http://www.w3.org/2001/XMLSchema#",
                               "swrc": "http://swrc.ontoware.org/ontology#",
                               "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
                               "dc": "http://purl.org/dc/elements/1.1/",
                               "dcterm": "http://purl.org/dc/terms/",
                               "dblprc": "http://dblp.l3s.de/d2r/resource/conferences/"
                           })
    endpoint = 'http://10.161.202.101:8890/sparql/'
    port = 8890
    output_format = HttpClientDataFormat.PANDAS_DF
    max_rows = 1000000
    timeout = 12000
    client = HttpClient(endpoint_url=endpoint,
                        port=port,
                        return_format=output_format,
                        timeout=timeout,
                        max_rows=max_rows)

    dataset = graph.entities('swrc:InProceedings', entities_col_name='paper') \
        .expand(src_col_name='paper',
                predicate_list=[('dc:creator', 'author'), ('dcterm:issued', 'date'),
                                ('swrc:series', 'conference'), ('dc:title', 'title')])
    dataset = dataset.cache()

    authors = dataset.filter({'date': ['>= 2000'],
                              'conference': ['IN (dblprc:vldb, dblprc:sigmod)']}) \
        .group_by(['author']) \
        .count('paper', 'papers_count') \
        .filter({'papers_count': ['>= 20']})

    titles = dataset.join(authors, 'author') \
        .filter({'date': ['>= 2005']}) \
        .select_cols(['title'])

    print("SPARQL Query = \n{}".format(titles.to_sparql()))
    df = titles.execute(client, return_format=output_format)
    print(df)
def join(join_type):
    dbpedia_actors = graph1.feature_domain_range('dbpp:starring', 'film1', 'actor1') \
        .expand('actor1', [('dbpp:birthPlace', 'actor_country1'), ('dbpp:name', 'name')]) \
        .filter({'actor_country1': ['regex(str(?actor_country1), "USA")']})

    yago_actors = graph2.feature_domain_range('yago:actedIn', 'actor2', 'film2') \
        .expand('actor2', [('yago:isCitizenOf', 'actor_country2'), ('yagoinfo:name', 'name')]) \
        .filter({'actor_country2': ['= yago:United_States']})

    actors = dbpedia_actors.join(yago_actors, 'name', join_type=join_type)

    output_format = HttpClientDataFormat.PANDAS_DF
    client = HttpClient(endpoint_url=endpoint, return_format=output_format)
    df = actors.execute(client, return_format=output_format)
    print(df.shape)
    print(actors.to_sparql())
def test_twitter_query():
    # TODO: remove endpoint URI
    endpoint = 'http://10.161.202.101:8890/sparql/'
    port = 8890
    output_format = HttpClientDataFormat.PANDAS_DF
    max_rows = 1000000
    timeout = 12000
    default_graph_url = 'http://twitter.com'
    client = HttpClient(endpoint_url=endpoint,
                        port=port,
                        return_format=output_format,
                        timeout=timeout,
                        default_graph_uri=default_graph_url,
                        max_rows=max_rows)
    graph = KnowledgeGraph('twitter', 'http://twitter.com/',
                           prefixes={
                               "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
                               "sioc": "http://rdfs.org/sioc/ns#",
                               "sioct": "http://rdfs.org/sioc/types#",
                               "to": "http://twitter.com/ontology/",
                               "dcterms": "http://purl.org/dc/terms/",
                               "xsd": "http://www.example.org/",
                               "foaf": "http://xmlns.com/foaf/0.1/"
                           })
    dataset = graph.entities(class_name='sioct:microblogPost', entities_col_name='tweet')
    ds = dataset.expand(src_col_name='tweet',
                        predicate_list=[RDFPredicate('sioc:has_creater', 'tweep')]) \
        .group_by(['tweep']) \
        .count('tweet', 'tweets_count') \
        .filter({'tweets_count': ['>= {}'.format(200), '<= {}'.format(300)]})
    ds = ds.expand('tweep', [RDFPredicate('sioc:has_creater', 'tweet',
                                          directionality=PredicateDirection.INCOMING)]) \
        .expand('tweet', [
            RDFPredicate('sioc:content', 'text', optional=False),
            RDFPredicate('dcterms:created', 'date', optional=True),
            RDFPredicate('to:hasmedia', 'multimedia', optional=True),
            RDFPredicate('to:hashashtag', 'hashtag', optional=True),
            RDFPredicate('sioc:mentions', 'users_mentioned', optional=True)
        ])
    ds = ds.select_cols(['tweet', 'tweep', 'text', 'date', 'multimedia',
                         'hashtag', 'users_mentioned', 'tweets_count'])
    print("Sparql Query = \n{}".format(ds.to_sparql()))
def explore_dblp():
    graph = KnowledgeGraph(graph_name='dblp',
                           graph_uri='http://dblp.l3s.de',
                           prefixes={
                               "xsd": "http://www.w3.org/2001/XMLSchema#",
                               "swrc": "http://swrc.ontoware.org/ontology#",
                               "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
                               "dc": "http://purl.org/dc/elements/1.1/",
                               "dcterm": "http://purl.org/dc/terms/",
                               "dblprc": "http://dblp.l3s.de/d2r/resource/conferences/"
                           })
    endpoint = 'http://10.161.202.101:8890/sparql/'
    port = 8890
    output_format = HttpClientDataFormat.PANDAS_DF
    max_rows = 1000000
    timeout = 12000
    client = HttpClient(endpoint_url=endpoint,
                        port=port,
                        return_format=output_format,
                        timeout=timeout,
                        max_rows=max_rows)

    classes = graph.classes_and_freq().sort({'frequency': 'DESC'})
    # class_with_max_freq = graph.classes_and_freq().max('frequency').to_sparql()
    attributes_of_papers = graph.features('swrc:InProceedings')
    attributes_of_papers_with_freq = graph.features_and_freq('swrc:InProceedings')
    papers = graph.entities('swrc:InProceedings')
    # papers_with_features = graph.entities_and_features('swrc:InProceedings').to_sparql()
    num_papers = graph.num_entities('swrc:InProceedings')

    print("{}".format(classes.to_sparql()))
    df = classes.execute(client, return_format=output_format)
    # print("{}".format(attributes_of_papers.to_sparql()))
    # df = attributes_of_papers.execute(client, return_format=output_format)
    print(df)
def movies_with_american_actors():
    start = time.time()
    graph = KnowledgeGraph(graph_uri='http://dbpedia.org',
                           prefixes={
                               'dcterms': 'http://purl.org/dc/terms/',
                               'rdfs': 'http://www.w3.org/2000/01/rdf-schema#',
                               'dbpprop': 'http://dbpedia.org/property/',
                               'dbpr': 'http://dbpedia.org/resource/',
                               'dbpo': 'http://dbpedia.org/ontology/'
                           })
    dataset = graph.feature_domain_range('dbpprop:starring', domain_col_name='film',
                                         range_col_name='actor') \
        .expand('actor', [RDFPredicate('dbpprop:birthPlace', 'actor_country'),
                          RDFPredicate('rdfs:label', 'actor_name')]) \
        .expand('film', [RDFPredicate('rdfs:label', 'film_name'),
                         RDFPredicate('dcterms:subject', 'subject'),
                         RDFPredicate('dbpprop:country', 'film_country'),
                         RDFPredicate('dbpo:genre', 'genre', optional=True)]) \
        .cache()

    # 26928 Rows. -- 4273 msec.
    american_actors = dataset.filter(
        {'actor_country': ['regex(str(?actor_country), "USA")']})

    # 1606 Rows. -- 7659 msec.
    prolific_actors = dataset.group_by(['actor']) \
        .count('film', 'film_count', unique=True) \
        .filter({'film_count': ['>= 20']})

    # 663,769 Rows. -- 76704 msec.
    films = american_actors.join(prolific_actors, join_col_name1='actor',
                                 join_type=JoinType.OuterJoin) \
        .join(dataset, join_col_name1='actor')
    # .select_cols(['film_name', 'actor_name', 'genre'])

    sparql_query = films.to_sparql()
    print(sparql_query)

    endpoint = 'http://10.161.202.101:8890/sparql/'
    output_format = HttpClientDataFormat.PANDAS_DF
    client = HttpClient(endpoint_url=endpoint, return_format=output_format)

    # [663769 rows x 8 columns]
    df = films.execute(client, return_format=output_format)
    print("duration = {} sec".format(time.time() - start))
    print(df)
def test_simple_query():
    start = time.time()
    # create a knowledge graph to store the graph uri and prefixes
    graph = KnowledgeGraph('twitter', 'https://twitter.com/',
                           prefixes={
                               "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
                               "sioc": "http://rdfs.org/sioc/ns#",
                               "sioct": "http://rdfs.org/sioc/types#",
                               "to": "http://twitter.com/ontology/",
                               "dcterms": "http://purl.org/dc/terms/",
                               "xsd": "http://www.example.org/",
                               "foaf": "http://xmlns.com/foaf/0.1/"
                           })
    # return all the instances of the tweet class
    dataset = graph.entities(class_name='sioct:MicroblogPost',
                             new_dataset_name='tweets',
                             entities_col_name='tweet')
    sparql_query = dataset.to_sparql()
    print("sparql_query to return tweets =\n{}\n".format(sparql_query))

    endpoint = 'http://10.161.202.101:8890/sparql/'
    port = 8890
    output_format = HttpClientDataFormat.PANDAS_DF
    max_rows = 1000000
    timeout = 12000
    default_graph_url = 'http://twitter.com/'
    client = HttpClient(endpoint_url=endpoint,
                        port=port,
                        return_format=output_format,
                        timeout=timeout,
                        default_graph_uri=default_graph_url,
                        max_rows=max_rows)
    # df = dataset.execute(client, return_format=output_format)
    duration = time.time() - start
    print("Done in {} secs".format(duration))
def movies_with_american_actors_optional():
    graph = KnowledgeGraph(graph_uri='http://dbpedia.org',
                           prefixes={'dcterms': 'http://purl.org/dc/terms/',
                                     'rdfs': 'http://www.w3.org/2000/01/rdf-schema#',
                                     'dbpprop': 'http://dbpedia.org/property/',
                                     'dbpr': 'http://dbpedia.org/resource/'})
    dataset = graph.feature_domain_range('dbpprop:starring', domain_col_name='movie',
                                         range_col_name='actor') \
        .expand('actor', [
            RDFPredicate('dbpprop:birthPlace', 'actor_country', optional=True),
            RDFPredicate('rdfs:label', 'actor_name', optional=True)]) \
        .expand('movie', [
            RDFPredicate('rdfs:label', 'movie_name', optional=True),
            RDFPredicate('dcterms:subject', 'subject', optional=True),
            RDFPredicate('dbpprop:country', 'movie_country', optional=True)]) \
        .cache()

    # 26928 Rows. -- 4273 msec.
    american_actors = dataset.filter({'actor_country': ['regex(str(?actor_country), "USA")']})

    # 1606 Rows. -- 7659 msec.
    prolific_actors = dataset.group_by(['actor']) \
        .count('movie', 'movie_count', unique=True) \
        .filter({'movie_count': ['>= 20', '<= 30']})

    # 663769 Rows. -- 76511 msec.
    movies = american_actors.join(prolific_actors, join_col_name1='actor',
                                  join_type=JoinType.OuterJoin) \
        .join(dataset, join_col_name1='actor')

    sparql_query = movies.to_sparql()
    print(sparql_query)

    endpoint = 'http://10.161.202.101:8890/sparql/'
    output_format = HttpClientDataFormat.PANDAS_DF
    client = HttpClient(endpoint_url=endpoint, return_format=output_format)
    df = movies.execute(client, return_format=output_format)
    print(df)
from rdfframes.knowledge_graph import KnowledgeGraph
from rdfframes.dataset.rdfpredicate import RDFPredicate
from rdfframes.client.http_client import HttpClientDataFormat, HttpClient

# External API imports: the ampligraph library
from ampligraph.latent_features import ComplEx
from ampligraph.evaluation import evaluate_performance, mrr_score, hits_at_n_score
from ampligraph.evaluation import train_test_split_no_unseen

# Client and the SPARQL endpoint
endpoint = 'http://10.161.202.101:8890/sparql/'
port = 8890
output_format = HttpClientDataFormat.PANDAS_DF
max_rows = 1000000                        # assumed value; left undefined in the original snippet
timeout = 12000                           # assumed value; left undefined in the original snippet
default_graph_url = 'http://dbpedia.org'  # assumed value; left undefined in the original snippet
client = HttpClient(endpoint_url=endpoint,
                    port=port,
                    return_format=output_format,
                    timeout=timeout,
                    default_graph_uri=default_graph_url,
                    max_rows=max_rows)

# Graph and column names (assumed; the original snippet leaves graph, s, p, o undefined)
graph = KnowledgeGraph(graph_name='dbpedia')
s, p, o = 'subject', 'predicate', 'object'

# Get all triples where the object is a URI
dataset = graph.feature_domain_range(s, p, o).filter({o: ['isURI']})

# execute
df = dataset.execute(client, return_format=output_format)

# Train/test split and create a ComplEx model from the ampligraph library
triples = df.to_numpy()
X_train, X_test = train_test_split_no_unseen(triples, test_size=10000)

# use the ComplEx model to build the embedding
model = ComplEx(batches_count=50, epochs=300, k=100, eta=20,
                optimizer='adam', optimizer_params={'lr': 1e-4},
                loss='multiclass_nll', regularizer='LP',
                regularizer_params={'p': 3, 'lambda': 1e-5},
                seed=0, verbose=True)
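# A minimal sketch (not part of the original snippet) of training and scoring the
# embedding with the evaluation helpers imported above. filter_triples excludes
# known triples from the ranking, and n=10 is the usual Hits@N cutoff.
import numpy as np

model.fit(X_train)
ranks = evaluate_performance(X_test, model=model,
                             filter_triples=np.concatenate((X_train, X_test)),
                             verbose=True)
print('MRR = {:.3f}'.format(mrr_score(ranks)))
print('Hits@10 = {:.3f}'.format(hits_at_n_score(ranks, n=10)))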
# RDFFrames imports, graph, prefixes, and client
import pandas as pd
from rdfframes.client.http_client import HttpClientDataFormat, HttpClient
from rdfframes.knowledge_graph import KnowledgeGraph
from rdfframes.utils.constants import JoinType

graph = KnowledgeGraph(
    graph_uri='http://dblp.l3s.de',
    prefixes={"xsd": "http://www.w3.org/2001/XMLSchema#",
              "swrc": "http://swrc.ontoware.org/ontology#",
              "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
              "dc": "http://purl.org/dc/elements/1.1/",
              "dcterm": "http://purl.org/dc/terms/",
              "dblprc": "http://dblp.l3s.de/d2r/resource/conferences/"})
endpoint = 'http://10.161.202.101:8890/sparql/'  # assumed value; left undefined in the original snippet
port = 8890                                      # assumed value; left undefined in the original snippet
output_format = HttpClientDataFormat.PANDAS_DF
client = HttpClient(endpoint_url=endpoint, port=port, return_format=output_format)

# RDFFrames code for creating the dataframe
papers = graph.entities('swrc:InProceedings', entities_col_name='paper')
papers = papers.expand('paper', [('dc:creator', 'author'), ('dcterm:issued', 'date'),
                                 ('swrc:series', 'conference'), ('dc:title', 'title')]).cache()
authors = papers.filter({'date': ['>= 2005'],
                         'conference': ['IN (dblprc:vldb, dblprc:sigmod)']}) \
    .group_by(['author']) \
    .count('paper', 'n_papers') \
    .filter({'n_papers': ['>= 20']})
titles = papers.join(authors, 'author', join_type=JoinType.InnerJoin).select_cols(['title'])
df = titles.execute(client, return_format=output_format)

# Preprocessing and cleaning
from nltk.corpus import stopwords
df['clean_title'] = df['title'].str.replace("[^a-zA-Z#]", " ", regex=True)
df['clean_title'] = df['clean_title'].apply(lambda x: x.lower())
df['clean_title'] = df['clean_title'].apply(
    lambda x: ' '.join([w for w in str(x).split() if len(w) > 3]))
stop_words = stopwords.words('english')
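# A minimal sketch (not part of the original snippet) that finishes the preprocessing
# with the stop_words list built above and fits a topic model over the paper titles;
# gensim and num_topics=10 are illustrative assumptions, not choices made by the source.
from gensim import corpora
from gensim.models import LdaModel

tokenized = df['clean_title'].apply(
    lambda x: [w for w in x.split() if w not in stop_words])
dictionary = corpora.Dictionary(tokenized)
corpus = [dictionary.doc2bow(tokens) for tokens in tokenized]
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=10, passes=10)
for topic_id, words in lda.print_topics():
    print(topic_id, words)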
from rdfframes.knowledge_graph import KnowledgeGraph
from rdfframes.client.http_client import HttpClientDataFormat, HttpClient
from rdfframes.client.sparql_endpoint_client import SPARQLEndpointClient
from rdfframes.utils.constants import JoinType

__author__ = "Ghadeer"

endpoint = 'http://10.161.202.101:8890/sparql/'
port = 8890
output_format = HttpClientDataFormat.PANDAS_DF
max_rows = 1000000
timeout = 12000
client = HttpClient(endpoint_url=endpoint,
                    port=port,
                    return_format=output_format,
                    timeout=timeout,
                    max_rows=max_rows)
client = SPARQLEndpointClient(endpoint)  # replaces the HttpClient defined above

graph1 = KnowledgeGraph(graph_name='dbpedia')
graph2 = KnowledgeGraph(graph_name='yago',
                        graph_uri='http://yago-knowledge.org/',
                        prefixes={
                            'rdfs': 'http://www.w3.org/2000/01/rdf-schema#',
                            'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
                            'yago': 'http://yago-knowledge.org/resource/',
                            'yagoinfo':