Example #1
def join(join_type):
    dbpedia_person = graph1.entities("dbpo:Person", entities_col_name="person1") \
        .expand('person1', [('dbpp:birthPlace', 'country1'), ('dbpp:name', 'name1')]) \
        .filter({'country1': ['regex(str(?country1), "USA")']}) \
        .filter({'name1': ['regex(str(?name1), "Abraham")']}) \
        .select_cols(['name1'])

    yago_person = graph2.feature_domain_range("rdf:type", "person2", "p") \
        .expand('p', [('rdfs:label', 'label')]) \
        .expand('person2', [('yagoinfo:name', "name2"), ('yago:isCitizenOf', 'country2')]) \
        .filter({'label': ['="person"@eng']}) \
        .filter({'country2': ['= yago:United_States']}) \
        .filter({'name2': ['regex(str(?name2), "Abraham")']})

    dbpl_person = graph3.feature_domain_range("dc:creator", "paper", "author") \
        .expand('author', [('rdfs:label', 'name3')]) \
        .expand('paper', [('dcterm:issued', 'date')]) \
        .filter({'date': ['>= 2015']}) \
        .filter({'name3': ['regex(str(?name3), "Abraham")']})  # .select_cols(['name3'])

    endpoint = 'http://10.161.202.101:8890/sparql/'
    output_format = HttpClientDataFormat.PANDAS_DF
    client = HttpClient(endpoint_url=endpoint, return_format=output_format)
    shared_ppl = dbpl_person.join(yago_person,
                                  'name3',
                                  'name2',
                                  join_type=join_type)
    shared_ppl2 = shared_ppl.join(dbpedia_person,
                                  'name3',
                                  'name1',
                                  join_type=join_type)
    print(shared_ppl2.to_sparql())
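
    # A minimal addition (not in the original example): the HttpClient created above is otherwise
    # unused; following the execute() pattern of the other examples in this listing, the joined
    # dataset could be run against the endpoint as follows.
    df = shared_ppl2.execute(client, return_format=output_format)
    print(df)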
Example #2
def test_query_with_limit(limit, offset, return_format=HttpClientDataFormat.DEFAULT, out_file=None):
    query = 'SELECT ?tweet (min(?tag) AS ?min_tags) FROM <http://twitter.com/> WHERE {{?tweet <http://rdfs.org/sioc/ns#has_creater> <http://twitter.com/9977822/> . ?tweet <http://twitter.com/ontology/hashashtag> ?tag}} GROUP BY ?tweet LIMIT {} OFFSET {};'.format(limit, offset)
    endpoint = 'http://10.161.202.101:8890/sparql/'
    port = 8890
    output_format = return_format
    max_rows = 10
    timeout = 120
    default_graph_url = 'http://twitter.com'
    client = HttpClient(endpoint_url=endpoint,
                        port=port,
                        return_format=output_format,
                        timeout=timeout,
                        default_graph_uri=default_graph_url,
                        max_rows=max_rows
                        )
    for res in client.execute_query(query, output_format, out_file):
        print('data with type {} and length {} retrieved'.format(type(res).__name__, len(res)))
Example #3
def test_small_results():
    print('test_small_results:')
    query = 'SELECT ?tweet (min(?tag) AS ?min_tags)  FROM <http://twitter.com/> WHERE {?tweet <http://rdfs.org/sioc/ns#has_creater> <http://twitter.com/9977822/> . ?tweet <http://twitter.com/ontology/hashashtag> ?tag} GROUP BY ?tweet;'
    endpoint = 'http://10.161.202.101:8890/sparql/'
    port = 8890
    output_format = HttpClientDataFormat.CSV
    max_rows = 1000
    timeout = 120
    default_graph_url = 'http://twitter.com'
    client = HttpClient(endpoint_url=endpoint,
                        port=port,
                        return_format=output_format,
                        timeout=timeout,
                        default_graph_uri=default_graph_url,
                        max_rows=max_rows
                        )
    for res in client.execute_query(query, HttpClientDataFormat.CSV, 'response_data.txt'):
        print('data with type {} and length {} retrieved'.format(type(res).__name__, len(res)))
Example #4
def important_topics():
    """
    Builds and runs the SPARQL query that identifies the hot areas of research in the field of databases.
    First, we identify a list of the top conferences of the computer science field of interest.
    We then identify the authors who have published at least 20 papers in these conferences since the year 2000.
    Next, we find the titles of all papers published by these authors in the specified conferences since 2005.
    """
    graph = KnowledgeGraph(graph_name='dblp',
                           graph_uri='http://dblp.l3s.de',
                           prefixes={
                               "xsd":
                               "http://www.w3.org/2001/XMLSchema#",
                               "swrc":
                               "http://swrc.ontoware.org/ontology#",
                               "rdf":
                               "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
                               "dc":
                               "http://purl.org/dc/elements/1.1/",
                               "dcterm":
                               "http://purl.org/dc/terms/",
                               "dblprc":
                               "http://dblp.l3s.de/d2r/resource/conferences/"
                           })
    endpoint = 'http://10.161.202.101:8890/sparql/'
    port = 8890
    output_format = HttpClientDataFormat.PANDAS_DF
    max_rows = 1000000
    timeout = 12000
    client = HttpClient(endpoint_url=endpoint,
                        port=port,
                        return_format=output_format,
                        timeout=timeout,
                        max_rows=max_rows)

    dataset = graph.entities('swrc:InProceedings', entities_col_name='paper')\
        .expand(src_col_name='paper', predicate_list=[('dc:creator', 'author'), ('dcterm:issued', 'date'),
            ('swrc:series', 'conference'), ('dc:title', 'title')])
    dataset = dataset.cache()

    authors = dataset.filter({'date':['>= 2000'], 'conference': ['IN (dblprc:vldb, dblprc:sigmod)']})\
        .group_by(['author'])\
        .count('paper', 'papers_count')\
        .filter({'papers_count':['>= 20']})

    titles = dataset.join(authors, 'author').filter({
        'date': ['>= 2005']
    }).select_cols(['title'])

    print("SPARQL Query = \n{}".format(titles.to_sparql()))

    df = titles.execute(client, return_format=output_format)
    print(df)
Example #5
def join(join_type):
    dbpedia_actors = graph1.feature_domain_range('dbpp:starring', 'film1', 'actor1') \
        .expand('actor1', [('dbpp:birthPlace', 'actor_country1'), ('dbpp:name', 'name')]) \
        .filter({'actor_country1': ['regex(str(?actor_country1), "USA")']})

    yago_actors = graph2.feature_domain_range('yago:actedIn', 'actor2', 'film2') \
        .expand('actor2', [('yago:isCitizenOf', 'actor_country2'), ('yagoinfo:name', 'name')]) \
        .filter({'actor_country2': ['= yago:United_States']})

    actors = dbpedia_actors.join(yago_actors, 'name', join_type=join_type)
    output_format = HttpClientDataFormat.PANDAS_DF
    client = HttpClient(endpoint_url=endpoint, return_format=output_format)
    df = actors.execute(client, return_format=output_format)
    print(df.shape)
    print(actors.to_sparql())
Example #6
def test_twitter_query():
    # TODO: remove endpoint URI
    endpoint = 'http://10.161.202.101:8890/sparql/'
    port = 8890
    output_format = HttpClientDataFormat.PANDAS_DF
    max_rows = 1000000
    timeout = 12000
    default_graph_url = 'http://twitter.com'
    client = HttpClient(endpoint_url=endpoint,
                        port=port,
                        return_format=output_format,
                        timeout=timeout,
                        default_graph_uri=default_graph_url,
                        max_rows=max_rows
                        )

    graph = KnowledgeGraph('twitter',
                           'http://twitter.com/',
                           prefixes={
                               "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
                               "sioc": "http://rdfs.org/sioc/ns#",
                               "sioct": "http://rdfs.org/sioc/types#",
                               "to": "http://twitter.com/ontology/",
                               "dcterms": "http://purl.org/dc/terms/",
                               "xsd": "http://www.example.org/",
                               "foaf": "http://xmlns.com/foaf/0.1/"
                           })

    dataset = graph.entities(class_name='sioct:microblogPost',
                             entities_col_name='tweet')
    ds = dataset.expand(src_col_name='tweet', predicate_list=[RDFPredicate('sioc:has_creater', 'tweep')])\
        .group_by(['tweep'])\
        .count('tweet', 'tweets_count')\
        .filter({'tweets_count': ['>= {}'.format(200), '<= {}'.format(300)]})

    ds = ds.expand('tweep', [RDFPredicate('sioc:has_creater', 'tweet', directionality=PredicateDirection.INCOMING)])\
        .expand('tweet', [
            RDFPredicate('sioc:content', 'text', optional=False),
            RDFPredicate('dcterms:created', 'date', optional=True),
            RDFPredicate('to:hasmedia', 'multimedia', optional=True),
            RDFPredicate('to:hashashtag', 'hashtag', optional=True),
            RDFPredicate('sioc:mentions', 'users_mentioned', optional=True)
        ])

    ds = ds.select_cols(['tweet', 'tweep', 'text', 'date', 'multimedia', 'hashtag', 'users_mentioned', 'tweets_count'])

    print("Sparql Query = \n{}".format(ds.to_sparql()))
Example #7
def explore_dblp():
    graph = KnowledgeGraph(graph_name='dblp',
                           graph_uri='http://dblp.l3s.de',
                           prefixes={
                               "xsd":
                               "http://www.w3.org/2001/XMLSchema#",
                               "swrc":
                               "http://swrc.ontoware.org/ontology#",
                               "rdf":
                               "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
                               "dc":
                               "http://purl.org/dc/elements/1.1/",
                               "dcterm":
                               "http://purl.org/dc/terms/",
                               "dblprc":
                               "http://dblp.l3s.de/d2r/resource/conferences/"
                           })

    endpoint = 'http://10.161.202.101:8890/sparql/'
    port = 8890
    output_format = HttpClientDataFormat.PANDAS_DF
    max_rows = 1000000
    timeout = 12000
    client = HttpClient(endpoint_url=endpoint,
                        port=port,
                        return_format=output_format,
                        timeout=timeout,
                        max_rows=max_rows)

    classes = graph.classes_and_freq().sort({'frequency': 'DESC'})
    #class_with_max_freq = graph.classes_and_freq().max('frequency').to_sparql()
    attributes_of_papers = graph.features('swrc:InProceedings')
    attributes_of_papers_with_freq = graph.features_and_freq(
        'swrc:InProceedings')
    papers = graph.entities('swrc:InProceedings')
    #papers_with_features = graph.entities_and_features('swrc:InProceedings').to_sparql()
    num_papers = graph.num_entities('swrc:InProceedings')

    print("{}".format(classes.to_sparql()))
    df = classes.execute(client, return_format=output_format)

    #print("{}".format(attributes_of_papers.to_sparql()))
    #df = attributes_of_papers.execute(client, return_format=output_format)

    print(df)
Example #8
def movies_with_american_actors():
    start = time.time()
    graph = KnowledgeGraph(graph_uri='http://dbpedia.org',
                           prefixes={
                               'dcterms': 'http://purl.org/dc/terms/',
                               'rdfs': 'http://www.w3.org/2000/01/rdf-schema#',
                               'dbpprop': 'http://dbpedia.org/property/',
                               'dbpr': 'http://dbpedia.org/resource/',
                               'dbpo': 'http://dbpedia.org/ontology/'
                           })

    dataset = graph.feature_domain_range('dbpprop:starring', domain_col_name='film', range_col_name='actor')\
        .expand('actor', [RDFPredicate('dbpprop:birthPlace', 'actor_country'), RDFPredicate('rdfs:label', 'actor_name')])\
        .expand('film', [RDFPredicate('rdfs:label', 'film_name'), RDFPredicate('dcterms:subject', 'subject'),
                         RDFPredicate('dbpprop:country', 'film_country'), RDFPredicate('dbpo:genre', 'genre', optional=True)])\
        .cache()
    # 26928 Rows. -- 4273 msec.
    american_actors = dataset.filter(
        {'actor_country': ['regex(str(?actor_country), "USA")']})

    # 1606 Rows. -- 7659 msec.
    prolific_actors = dataset.group_by(['actor'])\
        .count('film', 'film_count', unique=True).filter({'film_count': ['>= 20']})

    #663,769 Rows. -- 76704 msec.
    films = american_actors.join(prolific_actors, join_col_name1='actor', join_type=JoinType.OuterJoin)\
        .join(dataset, join_col_name1='actor')
    #.select_cols(['film_name', 'actor_name', 'genre'])

    sparql_query = films.to_sparql()

    print(sparql_query)

    endpoint = 'http://10.161.202.101:8890/sparql/'
    output_format = HttpClientDataFormat.PANDAS_DF

    client = HttpClient(endpoint_url=endpoint, return_format=output_format)
    # [663769 rows x 8 columns]
    df = films.execute(client, return_format=output_format)
    print("duration = {} sec".format(time.time() - start))
    print(df)
Example #9
def test_simple_query():
    start = time.time()
    # create a knowledge graph to store the graph uri and prefixes
    graph = KnowledgeGraph('twitter',
                           'https://twitter.com/',
                           prefixes={
                               "rdf":
                               "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
                               "sioc": "http://rdfs.org/sioc/ns#",
                               "sioct": "http://rdfs.org/sioc/types#",
                               "to": "http://twitter.com/ontology/",
                               "dcterms": "http://purl.org/dc/terms/",
                               "xsd": "http://www.example.org/",
                               "foaf": "http://xmlns.com/foaf/0.1/"
                           })
    # return all the instances of the tweet class
    dataset = graph.entities(class_name='sioct:MicroblogPost',
                             new_dataset_name='tweets',
                             entities_col_name='tweet')
    sparql_query = dataset.to_sparql()
    print("sparql_query to return tweets =\n{}\n".format(sparql_query))

    endpoint = 'http://10.161.202.101:8890/sparql/'
    port = 8890
    output_format = HttpClientDataFormat.PANDAS_DF
    max_rows = 1000000
    timeout = 12000
    default_graph_url = 'http://twitter.com/'
    client = HttpClient(endpoint_url=endpoint,
                        port=port,
                        return_format=output_format,
                        timeout=timeout,
                        default_graph_uri=default_graph_url,
                        max_rows=max_rows)

    #df = dataset.execute(client, return_format=output_format)
    duration = time.time() - start
    print("Done in {} secs".format(duration))
Example #10
def movies_with_american_actors_optional():
    graph = KnowledgeGraph(graph_uri='http://dbpedia.org',
                           prefixes={'dcterms': 'http://purl.org/dc/terms/',
                                     'rdfs': 'http://www.w3.org/2000/01/rdf-schema#',
                                     'dbpprop': 'http://dbpedia.org/property/',
                                     'dbpr': 'http://dbpedia.org/resource/'})

    dataset = graph.feature_domain_range('dbpprop:starring', domain_col_name='movie', range_col_name='actor')\
        .expand('actor', [
            RDFPredicate('dbpprop:birthPlace', 'actor_country', optional=True),
            RDFPredicate('rdfs:label', 'actor_name', optional=True)])\
        .expand('movie', [
            RDFPredicate('rdfs:label', 'movie_name', optional=True),
            RDFPredicate('dcterms:subject', 'subject', optional=True),
            RDFPredicate('dbpprop:country', 'movie_country', optional=True)])\
        .cache()
    # 26928 Rows. -- 4273 msec.
    american_actors = dataset.filter({'actor_country': ['regex(str(?actor_country), "USA")']})

    # 1606 Rows. -- 7659 msec.
    prolific_actors = dataset.group_by(['actor'])\
        .count('movie', 'movie_count', unique=True).filter({'movie_count': ['>= 20', '<= 30']})

    # 663769 Rows. -- 76511 msec.
    movies = american_actors.join(prolific_actors, join_col_name1='actor', join_type=JoinType.OuterJoin)\
        .join(dataset, join_col_name1='actor')

    sparql_query = movies.to_sparql()

    print(sparql_query)

    endpoint = 'http://10.161.202.101:8890/sparql/'
    output_format = HttpClientDataFormat.PANDAS_DF

    client = HttpClient(endpoint_url=endpoint, return_format=output_format)
    df = movies.execute(client, return_format=output_format)
    print(df)
Example #11
from rdfframes.knowledge_graph import KnowledgeGraph
from rdfframes.dataset.rdfpredicate import RDFPredicate
from rdfframes.client.http_client import HttpClientDataFormat, HttpClient

# External API imports: ampligraph library
from ampligraph.latent_features import ComplEx
from ampligraph.evaluation import evaluate_performance, mrr_score, hits_at_n_score
from ampligraph.evaluation import train_test_split_no_unseen

# Client and the SPARQL endpoint
endpoint = 'http://10.161.202.101:8890/sparql/'
port = 8890
output_format = HttpClientDataFormat.PANDAS_DF
# timeout and max_rows follow the values used in the other examples; default_graph_url and the
# KnowledgeGraph object `graph` (with its subject/predicate/object column names s, p, o) are
# assumed to be defined elsewhere for whichever graph is being embedded
timeout = 12000
max_rows = 1000000
client = HttpClient(endpoint_url=endpoint, port=port, return_format=output_format, timeout=timeout,
                    default_graph_uri=default_graph_url, max_rows=max_rows)

# Get all triples where the object is a URI
dataset = graph.feature_domain_range(s, p, o).filter({o: ['isURI']})

# execute
df = dataset.execute(client, return_format=output_format)

# Train/test split and create a ComplEx model from the ampligraph library
triples = df.to_numpy()
X_train, X_test = train_test_split_no_unseen(triples, test_size=10000)

# use the ComplEx model to build the embedding
model = ComplEx(batches_count=50, epochs=300, k=100, eta=20, optimizer='adam', optimizer_params={'lr': 1e-4},
                loss='multiclass_nll', regularizer='LP', regularizer_params={'p': 3, 'lambda': 1e-5},
                seed=0, verbose=True)
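
# A minimal sketch (not in the original snippet), assuming the standard ampligraph 1.x API:
# fit the model on the training split and score it with the evaluation helpers imported above.
import numpy as np

model.fit(X_train)

# rank each test triple against corruptions, filtering out triples known to be true
filter_triples = np.concatenate((X_train, X_test))
ranks = evaluate_performance(X_test, model=model, filter_triples=filter_triples, verbose=True)

print("MRR: {:.3f}".format(mrr_score(ranks)))
print("Hits@10: {:.3f}".format(hits_at_n_score(ranks, n=10)))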
Example #12
# RDFFrames imports, graph, prefixes, and client
import pandas as pd
from rdfframes.client.http_client import HttpClientDataFormat, HttpClient
from rdfframes.knowledge_graph import KnowledgeGraph
from rdfframes.utils.constants import JoinType

# endpoint and port as used in the other examples in this listing
endpoint = 'http://10.161.202.101:8890/sparql/'
port = 8890

graph = KnowledgeGraph(
        graph_uri='http://dblp.l3s.de',
        prefixes={"xsd": "http://www.w3.org/2001/XMLSchema#",
                  "swrc": "http://swrc.ontoware.org/ontology#",
                  "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
                  "dc": "http://purl.org/dc/elements/1.1/",
                  "dcterm": "http://purl.org/dc/terms/",
                  "dblprc": "http://dblp.l3s.de/d2r/resource/conferences/"})
output_format = HttpClientDataFormat.PANDAS_DF
client = HttpClient(endpoint_url=endpoint, port=port, return_format=output_format)

# RDFFrames code for creating the dataframe
papers = graph.entities('swrc:InProceedings', entities_col_name='paper')
papers = papers.expand('paper', [('dc:creator', 'author'), ('dcterm:issued', 'date'),
                                 ('swrc:series', 'conference'), ('dc:title', 'title')]).cache()
authors = papers.filter({'date': ['>= 2005'], 'conference': ['IN (dblprc:vldb, dblprc:sigmod)']})\
    .group_by(['author'])\
    .count('paper', 'n_papers')\
    .filter({'n_papers': ['>= 20']})
titles = papers.join(authors, 'author', join_type=JoinType.InnerJoin).select_cols(['title'])
df = titles.execute(client, return_format=output_format)

# Preprocessing and cleaning
from nltk.corpus import stopwords
df['clean_title'] = df['title'].str.replace("[^a-zA-Z#]", " ", regex=True)
df['clean_title'] = df['clean_title'].apply(lambda x: x.lower())
df['clean_title'] = df['clean_title'].apply(lambda x: ' '.join([w for w in str(x).split() if len(w) > 3]))
stop_words = stopwords.words('english')
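
# A minimal continuation sketch (not in the original snippet): applying the stop_words list
# built above to 'clean_title', as typically done before topic modelling on the paper titles.
df['clean_title'] = df['clean_title'].apply(
    lambda x: ' '.join(w for w in x.split() if w not in stop_words))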
Example #13
from rdfframes.knowledge_graph import KnowledgeGraph
from rdfframes.client.http_client import HttpClientDataFormat, HttpClient
from rdfframes.client.sparql_endpoint_client import SPARQLEndpointClient
from rdfframes.utils.constants import JoinType

__author__ = "Ghadeer"

endpoint = 'http://10.161.202.101:8890/sparql/'
port = 8890
output_format = HttpClientDataFormat.PANDAS_DF
max_rows = 1000000
timeout = 12000

client = HttpClient(endpoint_url=endpoint,
                    port=port,
                    return_format=output_format,
                    timeout=timeout,
                    max_rows=max_rows)

client = SPARQLEndpointClient(endpoint)
graph1 = KnowledgeGraph(graph_name='dbpedia')
graph2 = KnowledgeGraph(graph_name='yago',
                        graph_uri='http://yago-knowledge.org/',
                        prefixes={
                            'rdfs':
                            'http://www.w3.org/2000/01/rdf-schema#',
                            'rdf':
                            'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
                            'yago':
                            'http://yago-knowledge.org/resource/',
                            'yagoinfo':