# Imports assume the module layout of the RDFFrames distribution.
import time

from rdfframes.knowledge_graph import KnowledgeGraph
from rdfframes.dataset.rdfpredicate import RDFPredicate
from rdfframes.client.http_client import HttpClient, HttpClientDataFormat


def important_vldb_authors():
    """
    Builds and prints the SPARQL query that finds all authors with 20 or more
    VLDB papers, using the DBLP data.
    """
    graph = KnowledgeGraph(
        graph_name='dblp',
        graph_uri='http://dblp.l3s.de',
        prefixes={
            "xsd": "http://www.w3.org/2001/XMLSchema#",
            "swrc": "http://swrc.ontoware.org/ontology#",
            "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
            "dc": "http://purl.org/dc/elements/1.1/",
            "dcterm": "http://purl.org/dc/terms/",
            "dblprc": "http://dblp.l3s.de/d2r/resource/conferences/"
        })

    # All instances of the swrc:InProceedings class.
    dataset = graph.entities(class_name='swrc:InProceedings',
                             new_dataset_name='papers',
                             entities_col_name='paper')
    # Expand each paper to its title, creator, and conference series,
    # then keep only VLDB papers.
    dataset = dataset.expand(src_col_name='paper', predicate_list=[
        RDFPredicate('dc:title', 'title'),
        RDFPredicate('dc:creator', 'author'),
        RDFPredicate('swrc:series', 'conference')])\
        .filter(conditions_dict={
            'conference': ['= <http://dblp.l3s.de/d2r/resource/conferences/vldb>']})

    # Count papers per author and keep authors with at least 20 of them.
    grouped_dataset = dataset.group_by(['author'])\
        .count('paper', 'papers_count')\
        .filter(conditions_dict={'papers_count': ['>= {}'.format(20)]})
    grouped_dataset = grouped_dataset.select_cols(['author', 'papers_count'])

    print("SPARQL Query = \n{}".format(grouped_dataset.to_sparql()))
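# Usage sketch (hypothetical helper, not part of the original examples): to
# execute a built dataset such as the one in important_vldb_authors() instead
# of only printing its SPARQL string, pair it with an HttpClient the same way
# important_topics() below does. The endpoint URL here is a placeholder for
# your own SPARQL endpoint.
def execute_dataset(dataset):
    output_format = HttpClientDataFormat.PANDAS_DF
    client = HttpClient(endpoint_url='http://localhost:8890/sparql/',  # placeholder
                        port=8890,
                        return_format=output_format,
                        timeout=12000,
                        max_rows=1000000)
    return dataset.execute(client, return_format=output_format)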
def test_expandable_expandable_3_joins(join_type):
    start = time.time()

    # create a knowledge graph to store the graph uri and prefixes
    graph = KnowledgeGraph('twitter', 'https://twitter.com/',
                           prefixes={
                               "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
                               "sioc": "http://rdfs.org/sioc/ns#",
                               "sioct": "http://rdfs.org/sioc/types#",
                               "to": "http://twitter.com/ontology/",
                               "dcterms": "http://purl.org/dc/terms/",
                               "xsd": "http://www.w3.org/2001/XMLSchema#",
                               "foaf": "http://xmlns.com/foaf/0.1/"
                           })

    # return all the instances of the tweet class
    dataset = graph.entities(class_name='sioct:microblogPost',
                             new_dataset_name='dataset1',
                             entities_col_name='tweet')
    dataset = dataset.expand(src_col_name='tweet', predicate_list=[
        RDFPredicate('sioc:has_creater', 'tweep', False),
        RDFPredicate('sioc:content', 'text', False)
    ])

    # all user accounts, expanded to their names and followers
    dataset2 = graph.entities(class_name='sioc:UserAccount',
                              new_dataset_name='dataset2',
                              entities_col_name='tweep')
    dataset2 = dataset2.expand(src_col_name='tweep', predicate_list=[
        RDFPredicate('sioc:has_name', 'name', False),
        RDFPredicate('sioc:has_follower', 'follower', False)
    ])
    dataset2.join(dataset, 'tweep', 'tweep', 'tweep', join_type)

    # join the followers with the accounts they follow
    dataset3 = graph.entities(class_name='sioc:UserAccount',
                              new_dataset_name='dataset3',
                              entities_col_name='tweeter')
    dataset3 = dataset3.expand(
        src_col_name='tweeter',
        predicate_list=[RDFPredicate('sioc:has_id', 'id', False)])
    dataset3.join(dataset2, 'tweeter', 'follower', 'follower', join_type)

    sparql_query = dataset3.to_sparql()
    print("SPARQL query with {} =\n{}\n".format(join_type, sparql_query))
    print("Query built in {:.2f} seconds".format(time.time() - start))
def important_topics():
    """
    Builds and runs the SPARQL query that identifies the hot areas of research
    in the field of databases. First, we fix a list of the top conferences of
    the computer science field of interest. We then identify the authors who
    have published 20 or more papers in these conferences since the year 2000.
    Finally, we find the titles of all papers published by these authors in
    the same conferences since 2005.
    """
    graph = KnowledgeGraph(
        graph_name='dblp',
        graph_uri='http://dblp.l3s.de',
        prefixes={
            "xsd": "http://www.w3.org/2001/XMLSchema#",
            "swrc": "http://swrc.ontoware.org/ontology#",
            "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
            "dc": "http://purl.org/dc/elements/1.1/",
            "dcterm": "http://purl.org/dc/terms/",
            "dblprc": "http://dblp.l3s.de/d2r/resource/conferences/"
        })

    endpoint = 'http://10.161.202.101:8890/sparql/'
    port = 8890
    output_format = HttpClientDataFormat.PANDAS_DF
    max_rows = 1000000
    timeout = 12000
    client = HttpClient(endpoint_url=endpoint,
                        port=port,
                        return_format=output_format,
                        timeout=timeout,
                        max_rows=max_rows)

    # Papers expanded to their creators, publication dates, conferences,
    # and titles.
    dataset = graph.entities('swrc:InProceedings', entities_col_name='paper')\
        .expand(src_col_name='paper', predicate_list=[
            RDFPredicate('dc:creator', 'author'),
            RDFPredicate('dcterm:issued', 'date'),
            RDFPredicate('swrc:series', 'conference'),
            RDFPredicate('dc:title', 'title')])
    # Cache the shared pattern so both queries below can reuse it.
    dataset = dataset.cache()

    # Authors with 20 or more papers in VLDB or SIGMOD since 2000.
    authors = dataset.filter({'date': ['>= 2000'],
                              'conference': ['IN (dblprc:vldb, dblprc:sigmod)']})\
        .group_by(['author'])\
        .count('paper', 'papers_count')\
        .filter({'papers_count': ['>= 20']})

    # Titles of the papers these authors published since 2005.
    titles = dataset.join(authors, 'author')\
        .filter({'date': ['>= 2005']})\
        .select_cols(['title'])

    print("SPARQL Query = \n{}".format(titles.to_sparql()))

    df = titles.execute(client, return_format=output_format)
    print(df)
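# Usage sketch: with HttpClientDataFormat.PANDAS_DF, execute() returns a
# pandas DataFrame (as in important_topics() above). A hypothetical helper
# for a quick look at the result:
def summarize_result(df):
    print("{} rows returned".format(len(df)))
    print(df.head(10))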
def test_grouped_expandable_join(join_type):
    # create a knowledge graph to store the graph uri and prefixes
    graph = KnowledgeGraph('twitter', 'https://twitter.com/',
                           prefixes={
                               "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
                               "sioc": "http://rdfs.org/sioc/ns#",
                               "sioct": "http://rdfs.org/sioc/types#",
                               "to": "http://twitter.com/ontology/",
                               "dcterms": "http://purl.org/dc/terms/",
                               "xsd": "http://www.w3.org/2001/XMLSchema#",
                               "foaf": "http://xmlns.com/foaf/0.1/"
                           })

    # return all the instances of the tweet class
    dataset = graph.entities(class_name='sioct:microblogPost',
                             new_dataset_name='dataset1',
                             entities_col_name='tweet')
    dataset = dataset.expand(src_col_name='tweet', predicate_list=[
        RDFPredicate('sioc:has_creater', 'tweep', False),
        RDFPredicate('sioc:content', 'text', False)
    ])

    # tweeters with between 200 and 300 tweets
    dataset2 = graph.entities(class_name='sioct:microblogPost',
                              new_dataset_name='tweets',
                              entities_col_name='tweet')
    dataset2 = dataset2.expand(
        src_col_name='tweet',
        predicate_list=[RDFPredicate('sioc:has_creater', 'tweeter')])\
        .group_by(['tweeter'])\
        .count('tweet', 'tweets_count')\
        .filter(conditions_dict={
            'tweets_count': ['>= {}'.format(200), '<= {}'.format(300)]
        })
    dataset2 = dataset2.expand(
        src_col_name='tweeter',
        predicate_list=[RDFPredicate('rdf:type', 'sioc:UserAccount')])

    dataset2.join(dataset, 'tweeter', 'tweep', 'user', join_type)
    dataset2.select_cols(['user'])

    sparql_query = dataset2.to_sparql()
    print("SPARQL query with {} =\n{}\n".format(join_type, sparql_query))
def test_grouped_grouped_join(join_type):
    # create a knowledge graph to store the graph uri and prefixes
    graph = KnowledgeGraph('twitter', 'https://twitter.com/',
                           prefixes={
                               "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
                               "sioc": "http://rdfs.org/sioc/ns#",
                               "sioct": "http://rdfs.org/sioc/types#",
                           })

    # return all the instances of the tweet class
    dataset = graph.entities(class_name='sioct:microblogPost',
                             new_dataset_name='dataset1',
                             entities_col_name='tweet')
    dataset = dataset.expand(src_col_name='tweet', predicate_list=[
        RDFPredicate('sioc:has_creater', 'tweep', False),
        RDFPredicate('sioc:content', 'text', False)])\
        .group_by(['tweep'])\
        .count('tweet', 'tweets_count')\
        .filter({'tweets_count': ['>= {}'.format(1000)]})

    graph2 = KnowledgeGraph('twitter', 'https://twitter.com/',
                            prefixes={
                                "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
                                "sioc2": "http://rdfs.org/sioc2/ns#",
                                "sioct2": "http://rdfs.org/sioc2/types#",
                            })
    dataset2 = graph2.entities(class_name='sioct2:twitterPost',
                               new_dataset_name='tweets',
                               entities_col_name='tweet')
    dataset2 = dataset2.expand(src_col_name='tweet', predicate_list=[
        RDFPredicate('sioc2:has_creater', 'tweeter')])\
        .group_by(['tweeter'])\
        .count('tweet', 'tweets_count2', unique=False)\
        .filter(conditions_dict={
            'tweets_count2': ['>= {}'.format(200), '<= {}'.format(300)]
        })

    dataset.join(dataset2, 'tweep', 'tweeter', 'user', join_type)
    dataset.select_cols(['user'])

    sparql_query = dataset.to_sparql()
    print("SPARQL query with {} =\n{}\n".format(join_type, sparql_query))
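# Driver sketch for the examples above. The JoinType constant names and the
# rdfframes.utils.constants module path are assumptions; substitute whatever
# join-type values your installed RDFFrames version defines. Note that
# important_topics() contacts the hard-coded SPARQL endpoint and will fail
# without a reachable server.
from rdfframes.utils.constants import JoinType  # assumed module path

if __name__ == '__main__':
    important_vldb_authors()
    important_topics()
    for join_type in [JoinType.InnerJoin,
                      JoinType.LeftOuterJoin,
                      JoinType.RightOuterJoin]:
        test_expandable_expandable_3_joins(join_type)
        test_grouped_expandable_join(join_type)
        test_grouped_grouped_join(join_type)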