# Exploratory KnowledgeStore queries: SPARQL lookups, a mention query,
# a resource query, and a raw-article fetch, each printed to stdout.

def _dump(value):
    # Print a value followed by a blank separator line.
    print(value)
    print("")

# Which named graphs contain an rdfs:label triple for Angela Merkel?
get_graph_query = """SELECT ?label ?graph WHERE { GRAPH ?graph {dbpedia:Angela_Merkel rdfs:label ?label} }"""
_dump(ks.run_sparql_query(get_graph_query))

# Birth places of either Angela Merkel or Willy Brandt.
birth_location_query = """SELECT ?p WHERE { { dbpedia:Angela_Merkel dbo:birthPlace ?p } UNION { dbpedia:Willy_Brandt dbo:birthPlace ?p} }"""
_dump(ks.run_sparql_query(birth_location_query))

# Part-of-speech annotation of the mention at character span 62-67.
mention_result = ks.run_mention_query(
    "http://en.wikinews.org/wiki/SEALs_say_US_officer's_cover-up_was_reported_by_fake_SEAL#char=62,67",
    "nwr:pos")
_dump(mention_result)

# All mentions attached to the article resource; only the count is printed
# (preceded by a blank line, matching the original output order).
resource_result = ks.run_resource_query(
    "http://en.wikinews.org/wiki/SEALs_say_US_officer's_cover-up_was_reported_by_fake_SEAL",
    "ks:hasMention")
print("")
print(len(resource_result))

# Raw article text plus its character length.
text_result = ks.run_files_query(
    "http://en.wikinews.org/wiki/SEALs_say_US_officer's_cover-up_was_reported_by_fake_SEAL")
print(text_result)
print(len(text_result))
# NOTE(review): `counter` is assumed to have been computed above this chunk
# (question 1) — not visible here.
print("Answer for question 1: {0}".format(counter))

# second question: how many distinct articles mentioning Angela Merkel fall
# into the "Economy and business" top-level category?
sparql_query = "SELECT DISTINCT ?m WHERE { dbpedia:Angela_Merkel gaf:denotedBy ?m }"
sparql_result = ks.run_sparql_query(sparql_query)

# Deduplicate resource URIs while preserving first-occurrence order.
# A companion set gives O(1) membership tests instead of the O(n) scan of
# `uri not in list` (quadratic overall).
resource_uris = []
seen_uris = set()
for binding in sparql_result:
    mention_uri = binding['m']
    resource_uri = ks.mention_uri_to_resource_uri(mention_uri)
    if resource_uri not in seen_uris:
        seen_uris.add(resource_uri)
        resource_uris.append(resource_uri)

counter = 0
all_mappings = ks.get_all_resource_category_mappings(
    ks.top_level_category_names)
for resource_uri in resource_uris:
    # .get with an empty default: a resource missing from the mapping simply
    # doesn't count, instead of raising KeyError.
    if "Economy and business" in all_mappings.get(resource_uri, ()):
        counter += 1
print("Answer for question 2: {0}".format(counter))

# third question: named-entity chunk tree of the article's first sentence.
resource_uri = "http://en.wikinews.org/wiki/Christian_Wulff_elected_Germany's_new_president"
text = ks.run_files_query(resource_uri)
sentences = nltk.sent_tokenize(text)
# Guard against an empty article body (sent_tokenize("") returns []),
# which would otherwise raise IndexError on sentences[0].
if sentences:
    sentence = sentences[0]
    tokenized = nltk.word_tokenize(sentence)
    pos_tagged = nltk.pos_tag(tokenized)
    chunked = nltk.ne_chunk(pos_tagged)
    print(chunked)
# Setup: load the pickled train/test split and fetch the raw text of the
# first ten "Sports" training articles from the KnowledgeStore.
#
# Imports grouped stdlib / third-party / project-local.
import pickle  # was missing: pickle.load below raised NameError

import gensim
import nltk
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import knowledgestore.ks as ks

print('\nSETUP')

# SECURITY NOTE: pickle.load executes arbitrary code from the file — only
# load pickles you created yourself.
with open("data_set_split.pickle", "rb") as f:
    data_set = pickle.load(f)

# Each train entry is assumed to be a tuple whose first element is the
# article URI — TODO confirm against the pickle's producer.
train = data_set['Sports']['train']
articles = [ks.run_files_query(train[i][0]) for i in range(10)]

# # computing most frequent bigrams
# # NOTE(review): if re-enabled, this needs `import string` as well.
# print('\nMOST FREQUENT BIGRAMS')
# text = ' '.join(articles)
# tokens = nltk.word_tokenize(text)
# tokens = [token for token in tokens if token not in string.punctuation]
# bigrams = nltk.bigrams(tokens)
# freq_dist = nltk.FreqDist(bigrams)
# frequency_list = []
# for bigram, freq in freq_dist.items():
#     frequency_list.append([bigram, freq])
# frequency_list.sort(key = lambda x: x[1], reverse=True)
# for i in range(10):
#     print(frequency_list[i])
#