def main():
    """Translate a temponym via DBpedia.

    Reads one positional CLI argument, queries the DBpedia SPARQL endpoint
    for labels of the matching resource and prints the sorted
    (language, translation) pairs.
    """
    arg_parser = argparse.ArgumentParser(
        description=
        "Translate an input string by looking up DBpedia entries with that name."
    )
    arg_parser.add_argument("input", type=str, help="The temponym to translate")
    cli_args = arg_parser.parse_args()

    endpoint = SPARQLWrapper.SPARQLWrapper("http://dbpedia.org/sparql")
    endpoint.setQuery(sparql_resource_query_str(cli_args.input))
    endpoint.setReturnFormat(SPARQLWrapper.JSON)
    response = endpoint.query().convert()

    # Deduplicate (language, translation) pairs via a set before printing.
    pairs = set()
    if "results" in response and "bindings" in response["results"]:
        for binding in response["results"]["bindings"]:
            label_node = binding["label"]
            pairs.add((label_node["xml:lang"], label_node["value"]))
    print(sorted(list(pairs)))
def years():
    """Guess release years for song titles read from a dump file.

    For each line of processed_24_dec_2015_backup.txt the title is stripped
    of remix/edition decorations, tokenised into quoted uppercase keywords,
    and used in a Virtuoso free-text (bif:contains) search over DBpedia
    abstracts to find a resource carrying dbo:releaseDate.  Matched years
    are tallied and printed; unmatched titles are collected and printed.
    Python 2 only (print statements, str.translate(None, ...)).
    Stops after 5000 input lines.
    """
    sparql = SPARQLWrapper.SPARQLWrapper("http://dbpedia.org/sparql")
    # NOTE(review): both names alias the same Counter; `counter` is unused.
    year_counter = counter = collections.Counter()
    unfound = []
    total = 0
    with open("processed_24_dec_2015_backup.txt") as o:
        for line in o:
            total += 1
            words = []
            line = line.strip()
            # Strip remaster/edition/remix noise from the raw title.
            fixed_line = re.sub(
                'Original Mix|Explicit|Radio Version|Dirty Radio Edit|Radio Edit|Original Radio Edit|\(Download\)|, Richard Bedford|Remastered Version|Remastered Version|Remastered Album Version|Original Version| - Original Mix|Single Version| - Live|Single Edit|LP Version|BBC Radio 1 Live Lounge|Original London Cast|Michael Reed|Theme from|The| - Edit|\(feat .*\)|\(Feat .*\)|\(featuring .*\)|Theme From| - Original$|/ Mono|Digital Remaster|Original mix|/Soundtrack Version|Extended Version|New Sound Remastered|Explicit Version|\(Mono\)|Album Version|45 Version|Radio Mix|\(.*\)|New Stereo Mix|Stereo Remastered Version|Original Album Version|Original Mono Version|Remixed Version|Soundtrack Version|Radio edit|\[.*\]|/ Stereo$|Club Mix|Album Verision|Alternate Version|Dance Mix|Revised Album Version',
                '', line)
            blacklist = ["REMASTER", "REMASTERED", "Y"]
            # Keep alphanumeric, non-numeric, non-blacklisted words > 2 chars,
            # quoted and uppercased for the free-text search.
            for p in fixed_line.translate(None, ',()').split(" – "):
                for w in p.split(" "):
                    if w.isalnum() and not w.isdigit() and not w.upper(
                    ) in blacklist and len(w) > 2:
                        words.append("\"" + w.upper() + "\"")
            search_string = " AND ".join(words)
            # Virtuoso-specific: bif:contains free-text match ranked by a mix
            # of text score and IRI rank, keeping only the best hit.
            query = """
            PREFIX dbo: <http://dbpedia.org/ontology/>
            select ?s1, ?date where {
            ?s1 dbo:abstract ?o1 .
            ?o1 bif:contains ' (%s) ' option ( score ?sc ) .
            ?s1 dbo:releaseDate ?date .
            } order by desc ( ?sc * 3e-1 + sql:rnk_scale ( <LONG::IRI_RANK> ( ?s1 ) ) ) limit 1 offset 0 """ % (search_string)
            sparql.setQuery(query)
            sparql.setReturnFormat(SPARQLWrapper.JSON)
            results = sparql.query().convert()
            # print query
            b = results["results"]["bindings"]
            if len(b) > 0:
                print "FOUND__"
                print line
                print b[0]["date"]["value"]
                # Year = leading component of an ISO-style date literal.
                year = b[0]["date"]["value"].split("-")[0]
                year_counter[year] += 1
                print b[0]["s1"]["value"]
            else:
                print "\tUNFOUND__"
                print "\t", line
                print "\t", search_string
                unfound.append(line)
            # print
            if total >= 5000:
                break
    for x in year_counter.most_common():
        print x[1], "\t", x[0]
    print "\n".join(unfound)
def runQuery(query):
    """Execute *query* against the public Wikidata SPARQL endpoint.

    Returns the list of JSON result bindings.
    """
    endpoint = 'https://query.wikidata.org/sparql'
    wrapper = SPARQLWrapper.SPARQLWrapper(endpoint)
    wrapper.setReturnFormat(SPARQLWrapper.JSON)
    wrapper.setQuery(query)
    response = wrapper.query().convert()
    return response['results']['bindings']
def runQuery(query):
    """Execute *query* against the public Wikidata SPARQL endpoint.

    Sends a browser-like User-Agent (some endpoints reject the default
    urllib agent).  Returns the list of JSON result bindings.
    """
    endpoint = 'https://query.wikidata.org/sparql'
    wrapper = SPARQLWrapper.SPARQLWrapper(
        endpoint,
        agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36')
    wrapper.setReturnFormat(SPARQLWrapper.JSON)
    wrapper.setQuery(query)
    response = wrapper.query().convert()
    return response['results']['bindings']
def prop_has_range_or_comment(prop_value):
    """Return (has_range, has_comment) flags for the DBpedia property
    described by *prop_value* (a dict with a 'prop' URI).

    Fetches every predicate attached to the property URI and scans the
    predicate URIs for the substrings "range" and "comment".
    """
    target = prop_value['prop']
    endpoint = SPARQLWrapper("http://dbpedia.org/sparql")
    # NOTE(review): the property URI is spliced into the query verbatim;
    # assumes it is a well-formed IRI.
    lookup = """
    select distinct ?prop ?value where {
    <""" + target + """> ?prop ?value
    }
    """
    endpoint.setQuery(lookup)
    endpoint.setReturnFormat(JSON)
    response = endpoint.query().convert()
    flags = {'range': 0, 'comment': 0}
    for binding in response["results"]["bindings"]:
        predicate = binding["prop"]["value"]
        for key in flags:
            if key in predicate:
                flags[key] = 1
    return (flags['range'], flags['comment'])
def get_distractors(resource, resource_type):
    """Populate the module-level `similar_resources` list with up to 30
    DBpedia resources of *resource_type* that share more than one
    property/object pair with *resource*, ordered by shared-pair count.

    Each entry is [uri, similarity, 0, 0]; the trailing zeros are filled
    in later by the caller (alchemy score, path count).  Any previous
    contents of `similar_resources` are discarded.
    """
    sparql = SPARQLWrapper("http://dbpedia.org/sparql")
    # Similarity = number of (?p, ?o) pairs shared with the target resource.
    query1 = """
    select ?similar (count(?p) as ?similarity) where {
    values ?res {<http://dbpedia.org/resource/""" + resource + """>}
    ?similar ?p ?o ;
    a <""" + resource_type + """> .
    ?res ?p ?o .
    }
    group by ?similar ?res
    having (count(?p) > 1)
    order by desc(?similarity)
    LIMIT 30
    """
    sparql.setQuery(query1)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()
    # Clear the shared accumulator in place so other references stay valid.
    del similar_resources[:]
    for result in results["results"]["bindings"]:
        res = result["similar"]["value"]
        value = result["similarity"]["value"]
        similar_resources.append([res, int(value), 0, 0])
def total_one_degree_paths(res1, res2):
    """Count distinct one-hop intermediate nodes linking *res1* and *res2*.

    *res1* is a bare DBpedia resource name, *res2* a full URI.  A node
    counts if res1 points to it and either res2 also points to it, or it
    points to res2.  Returns the count value (a string) from the single
    binding, or None when the endpoint returns no bindings.
    """
    sparql = SPARQLWrapper("http://dbpedia.org/sparql")
    query1 = """
    select count(distinct ?var3) as ?cnt where {
    { SELECT distinct ?var3 WHERE {
    <http://dbpedia.org/resource/""" + res1 + """> ?prop1 ?var3 .
    <""" + res2 + """> ?pr ?var3. } }
    UNION
    { SELECT distinct ?var3 WHERE {
    <http://dbpedia.org/resource/""" + res1 + """> ?prop1 ?var3 .
    ?var3 ?prop <""" + res2 + """> . } }
    }
    """
    sparql.setQuery(query1)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()
    # Single aggregate row expected; return on the first binding.
    for result in results["results"]["bindings"]:
        return result["cnt"]["value"]
def run_query(query, endpoint):
    """
    Run a sparql query against an endpoint and return the results
    converted to JSON dictionary.
    """
    wrapper = SPARQLWrapper.SPARQLWrapper(endpoint)
    wrapper.setReturnFormat(SPARQLWrapper.JSON)
    wrapper.setQuery(query)
    return wrapper.query().convert()
def __init__(self, base_url):
    """Store a normalised base URL and build query/update SPARQL wrappers.

    A trailing slash is appended to *base_url* when missing so the
    'query'/'update' operation names can be joined directly onto it.
    """
    self.base_url = base_url if base_url.endswith('/') else base_url + '/'
    # One SPARQLWrapper per endpoint operation, keyed by operation name.
    self.wrappers = {}
    for op in ('query', 'update'):
        self.wrappers[op] = sprqlw.SPARQLWrapper(''.join((self.base_url, op)))
def get_results(endpoint_url, query):
    """Run *query* against *endpoint_url* (WDQS) and return the converted
    JSON response."""
    # TODO adjust user agent; see https://w.wiki/CX6
    major, minor = sys.version_info[0], sys.version_info[1]
    user_agent = "WDQS-example Python/%s.%s" % (major, minor)
    wrapper = SPARQLWrapper(endpoint_url, agent=user_agent)
    wrapper.setReturnFormat(JSON)
    wrapper.setQuery(query)
    return wrapper.query().convert()
def get_sparql_results(sparql_query_string):
    """Run a query against the Wikidata SPARQL endpoint.

    Returns the converted JSON result, or None when the request or
    conversion fails (the None fallback preserves the original
    best-effort contract).
    """
    sparql = SPARQLWrapper.SPARQLWrapper("https://query.wikidata.org/sparql")
    sparql.setQuery(sparql_query_string)
    try:
        # stream with the results in XML, see <http://www.w3.org/TR/rdf-sparql-XMLres/>
        sparql.setReturnFormat(SPARQLWrapper.JSON)
        result = sparql.query().convert()
    except Exception:
        # Fix: was a bare `except:` which also swallowed SystemExit and
        # KeyboardInterrupt; network/HTTP/parse errors still yield None.
        result = None
    return result
def _do_query(self, statement, access_point):
    """Run *statement* (with prefixes for *access_point* prepended) against
    the formatted endpoint URL and return the parsed raw XML response."""
    endpoint_url = self._query_url.format(url=self._base_url, ap=access_point)
    wrapper = SPARQLWrapper.SPARQLWrapper(endpoint_url)
    wrapper.setReturnFormat(SPARQLWrapper.XML)
    full_query = self._get_prefix_str(access_point) + statement
    wrapper.setQuery(full_query)
    return self._parse_response(wrapper.query().response)
def update_profile(profile):
    """Push the pending add/remove triples of *profile* to the update endpoint.

    Returns the decoded endpoint response, or a no-op status string when
    the profile carries no changes.
    """
    pending = profile.add | profile.remove
    if len(pending) == 0:
        return '200 No update'
    endpoint = SPARQLWrapper.SPARQLWrapper(updateUrl)
    endpoint.addParameter('email', user)
    endpoint.addParameter('password', passw)
    endpoint.setMethod(SPARQLWrapper.POST)
    endpoint.setQuery(write_update_query(profile))
    return endpoint.queryAndConvert().decode('utf-8')
def query_trainings(uris=None, faculty=None, link_text=None, link_address=None, rank=None):
    """DESCRIBE web-link resources filtered by faculty / link attributes and
    return them as models.WebLink objects.

    Fix: the original body opened with four orphaned `'key' : 'uri',`
    pairs (trainings/specialty/hospital/organization) -- a stray fragment
    of a property-map dict that is not a valid statement sequence; they
    were unused and have been removed.

    NOTE(review): the query DESCRIBEs ?train although only ?link is bound
    in the WHERE clause -- confirm against the endpoint's DESCRIBE
    semantics before relying on the result shape.
    """
    filters = []
    if faculty:
        filters.append( make_filter('?link',
            '<http://vivo.brown.edu/ontology/vivo-brown/drrbWebPageOf>',
            '?fac', '<{}>'.format(faculty) ) )
    if link_text:
        filters.append( make_filter('?link',
            '<http://vivoweb.org/ontology/core#linkAnchorText>',
            '?link_text', '{}'.format(json.dumps(link_text)) ) )
    if link_address:
        filters.append( make_filter('?link',
            '<http://vivoweb.org/ontology/core#linkURI>',
            '?link_address', '{}'.format(json.dumps(link_address)) ) )
    if rank:
        filters.append( make_filter('?link',
            '<http://vivoweb.org/ontology/core#rank>',
            '?rank', '{}'.format(json.dumps(rank)) ) )
    if uris:
        filters.append( make_filter(None, None, '?link',
            ''.join(['<{}>'.format(u) for u in uris]) ) )
    query = """
    PREFIX core: <http://vivoweb.org/ontology/core#>
    DESCRIBE ?train
    WHERE {{
    ?link a core:URLLink .
    {0}
    {1}
    }}
    """.format(''.join([ f['filter'] for f in filters if f.get('filter') ]),
               ''.join([ f['values'] for f in filters if f.get('values') ]) )
    remote = SPARQLWrapper.SPARQLWrapper(queryUrl, updateUrl)
    remote.addParameter('email', user)
    remote.addParameter('password', passw)
    remote.setMethod(SPARQLWrapper.POST)
    remote.setQuery( query )
    results = remote.queryAndConvert()
    # Group the returned DESCRIBE graph by subject, then by predicate.
    resources = defaultdict(lambda: defaultdict(list))
    for r in results.triples((None, None, None)):
        resources[r[0].toPython()][r[1].toPython()].append(r[2].toPython())
    out = []
    for r in resources:
        res = models.WebLink(uri=r)
        res.load(resources[r])
        out.append(res)
    return out
def clear_graph(sparql_endpoint=SPARQL_ENDPOINT, graph=EVAL_DATA_GRAPH):
    """Issue a CLEAR GRAPH update for *graph* on *sparql_endpoint*.

    A 406 HTTPError on success is deliberately ignored (endpoint quirk).
    """
    wrapper = SPARQLWrapper.SPARQLWrapper(sparql_endpoint)
    wrapper.setMethod(SPARQLWrapper.POST)
    statement = 'CLEAR GRAPH %s' % (URIRef(graph).n3(), )
    logger.info('Clearing graph %s on endpoint %s', graph, sparql_endpoint)
    wrapper.setQuery(statement)
    try:
        wrapper.query()
    except urllib2.HTTPError:
        # argh, don't ask me why, but it seems we get a 406 on success
        # TODO: report to SPARQLWrapper?
        pass
def query_collaborators(uris=None, faculty=None, label=None, fullName=None, alphaName=None):
    """DESCRIBE FacultyMember resources matching the given filters and
    return them as models.Collaborator objects.

    Filters: *faculty* restricts to collaborators of that faculty URI;
    *label*/*fullName*/*alphaName* match literal values; *uris* pins the
    candidate set.  All filters are combined conjunctively.
    """
    filters = []
    if faculty:
        # Direct triple pattern rather than a FILTER: faculty -> ?uri edge.
        filters.append({'filter': write_statement(
            ("<{}>".format(faculty),
             '<http://vivoweb.org/ontology/core#hasCollaborator>',
             '?uri') ) })
    if label:
        filters.append( make_filter('?uri',
            '<http://www.w3.org/2000/01/rdf-schema#label>',
            '?label', '{}'.format(json.dumps(label)) ) )
    if fullName:
        filters.append( make_filter('?uri',
            '<http://vivo.brown.edu/ontology/vivo-brown/fullName>',
            '?fullName', '{}'.format(json.dumps(fullName)) ) )
    if alphaName:
        filters.append( make_filter('?uri',
            '<http://vivo.brown.edu/ontology/vivo-brown/alphaName>',
            '?alphaName', '{}'.format(json.dumps(alphaName)) ) )
    if uris:
        filters.append( make_filter(None, None, '?uri',
            ''.join(['<{}>'.format(u) for u in uris]) ) )
    # {0} = FILTER clauses, {1} = VALUES clauses produced by make_filter.
    query = """
    PREFIX vivo: <http://vivoweb.org/ontology/core#>
    DESCRIBE ?uri
    WHERE {{
    ?uri a vivo:FacultyMember .
    {0}
    {1}
    }}
    """.format(''.join([ f['filter'] for f in filters if f.get('filter') ]),
               ''.join([ f['values'] for f in filters if f.get('values') ]) )
    remote = SPARQLWrapper.SPARQLWrapper(queryUrl, updateUrl)
    remote.addParameter('email', user)
    remote.addParameter('password', passw)
    remote.setMethod(SPARQLWrapper.POST)
    remote.setQuery( query )
    results = remote.queryAndConvert()
    # Group the DESCRIBE graph by subject, then by predicate.
    resources = defaultdict(lambda: defaultdict(list))
    for r in results.triples((None, None, None)):
        resources[r[0].toPython()][r[1].toPython()].append(r[2].toPython())
    out = []
    for r in resources:
        res = models.Collaborator(uri=r)
        res.load(resources[r])
        out.append(res)
    return out
def query_wikidata_service(searchterm, language_code):
    """Look up *searchterm* via the Wikidata EntitySearch mwapi service.

    Returns the converted JSON response containing the single best-matching
    item with English labels plus its optional subclass-of (P279),
    topic-category (P910) and instance-of (P31) links.

    NOTE(review): searchterm and language_code are spliced into the query
    text unescaped -- a single quote in either input breaks the query.
    """
    query = """SELECT ?item ?itemLabel ?subclass_of ?subclass_ofLabel ?category_of ?category_ofLabel ?instance_of ?instance_ofLabel WHERE {
    SERVICE wikibase:mwapi {
    bd:serviceParam wikibase:api "EntitySearch" .
    bd:serviceParam wikibase:endpoint "www.wikidata.org" .
    bd:serviceParam mwapi:search '""" + searchterm + """' .
    bd:serviceParam mwapi:language '""" + language_code + """' .
    bd:serviceParam wikibase:limit 1 .
    ?item wikibase:apiOutputItem mwapi:item .}
    SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
    OPTIONAL { ?item (wdt:P279) ?subclass_of.}
    OPTIONAL { ?item (wdt:P910) ?category_of.}
    OPTIONAL { ?item (wdt:P31) ?instance_of.}}"""
    url = 'https://query.wikidata.org/sparql'
    sparql = SPARQLWrapper(url)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    return sparql.query().convert()
def get_similar_resources(resource):
    """Distractor pipeline: fetch *resource*'s triples, find same-type
    similar resources, enrich them with Alchemy concept scores and
    one-degree path counts, then print a ranked JSON report.

    Python 2 only (print statement).  Mutates the module-level
    `similar_resources` list (via get_distractors and in place here).
    """
    sparql = SPARQLWrapper("http://dbpedia.org/sparql")
    query1 = """
    select distinct ?prop ?value where {
    <http://dbpedia.org/resource/""" + resource + """> ?prop ?value
    }
    """
    sparql.setQuery(query1)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()
    resource_type = get_resource_type(results)
    get_distractors(resource, resource_type)
    # Alchemy API Part starts
    abstract = get_abstract(results)
    concepts = alchemy_concepts(abstract)
    # concept = (name, score); copy the score into slot 2 of matching rows.
    for concept in concepts:
        for res in similar_resources:
            if concept[0] == res[0]:
                res[2] = concept[1]
    # Slot 3 = number of shared one-hop paths to the original resource.
    for res in similar_resources:
        res[3] = int(total_one_degree_paths(resource, res[0]))
    # Sort by alchemy score, then similarity, then path count (all desc).
    similar_resources.sort(key=lambda x: (-x[2], -x[1], -x[3]))
    # Alchemy API part ends
    tot_val = len(similar_resources)
    tot = '"total": "' + str(tot_val) + '", '
    ans = '{' + tot + ' "error": "0" , "resources": ['
    res = ""
    i = 1
    # Hand-build the JSON array of ranked resources (validated below).
    for x in similar_resources:
        res += """ { "rank": \"""" + str(i) + """\", "dbpedia": \"""" + x[0] + """\", "similarity": \"""" + str(x[1]) + """\", "alchemy": \"""" + str(x[2]) + """\", "paths": \"""" + str(x[3]) + """\" },"""
        i += 1
    ans += res
    # Drop the trailing comma before closing the array.
    ans = ans[0:-1]
    ans += ']}'
    # Round-trip through json to validate and pretty-print the string.
    json_obj = json.loads(ans, strict=False)
    ans = json.dumps(json_obj, indent=4)
    print ans
def query_faculty(shortId):
    """DESCRIBE the faculty resource for *shortId* and load it into a
    models.FacultyProfile."""
    uri = shortIdToUri(shortId)
    remote = SPARQLWrapper.SPARQLWrapper(queryUrl, updateUrl)
    remote.addParameter('email', user)
    remote.addParameter('password', passw)
    remote.setMethod(SPARQLWrapper.POST)
    remote.setQuery(
        "DESCRIBE ?uri WHERE {{ VALUES ?uri {{ <{0}> }} }}".format(uri))
    graph = remote.queryAndConvert()
    # Collapse the DESCRIBE graph into {predicate: [objects, ...]}.
    data = defaultdict(list)
    for subj, pred, obj in graph.triples((None, None, None)):
        data[pred.toPython()].append(obj.toPython())
    profile = models.FacultyProfile(uri)
    profile.load(data)
    return profile
def chech_dbpedia(item):
    """Return True when DBpedia has an rdfs:label for the resource named by
    the reversed *item* string; False on failure or when no label exists.

    NOTE(review): the [::-1] reversal of *item* is kept from the original
    -- presumably callers pass reversed names; confirm before changing.
    """
    sparql = SPARQLWrapper.SPARQLWrapper("http://dbpedia.org/sparql")
    try:
        sparql.setQuery("""
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        SELECT ?label
        WHERE {{ <http://dbpedia.org/resource/{}> rdfs:label ?label }}
        """.format(item[::-1]))
        sparql.setReturnFormat(SPARQLWrapper.JSON)
        results = sparql.query().convert()
    except Exception:
        # Narrowed from a bare `except:`: network/HTTP failures report False.
        return False
    else:
        # Fix: convert() returns a dict, so the old `results == []` test
        # could never be true and the function returned True even when no
        # label existed.  Inspect the actual result bindings instead.
        return bool(results["results"]["bindings"])
def querySPARQLtoRDF(query):
    """Try *query* against each endpoint in SPARQL_ENDPOINTS until one
    succeeds; return [] when every endpoint fails.  Python 2 only.

    NOTE(review): the return format is set to RDF, yet the result is
    indexed like a JSON response (results['results']['bindings']) -- an
    RDF graph result would raise here and be treated as a failed attempt;
    confirm the intended format.
    """
    repeats = 0
    while True:
        try:
            sparql = SPARQLWrapper.SPARQLWrapper(SPARQL_ENDPOINTS[repeats])
            sparql.setQuery(query)
            sparql.setReturnFormat(SPARQLWrapper.RDF)
            results = sparql.query().convert()
            return results['results']['bindings']
        except Exception, e:
            print 'Repeating query. Error', e
            # Move on to the next endpoint; give up after the last one.
            repeats += 1
            if repeats > len(SPARQL_ENDPOINTS) - 1:
                print 'Following query could not finish normally:', query
                return []
def query_faculty_association(shortId, assocProp):
    """DESCRIBE every resource linked from the faculty URI via *assocProp*.

    Returns {subject_node: {predicate: [objects, ...]}} built from the
    returned graph; subject keys are kept as raw graph nodes.
    """
    remote = SPARQLWrapper.SPARQLWrapper(queryUrl, updateUrl)
    remote.addParameter('email', user)
    remote.addParameter('password', passw)
    remote.setMethod(SPARQLWrapper.POST)
    describe = """
    DESCRIBE ?assoc
    WHERE {{
    ?uri <{1}> ?assoc.
    VALUES ?uri {{ <{0}> }}
    }}""".format(shortIdToUri(shortId), assocProp)
    remote.setQuery(describe)
    graph = remote.queryAndConvert()
    out = defaultdict(lambda: defaultdict(list))
    for subj, pred, obj in graph.triples((None, None, None)):
        out[subj][pred.toPython()].append(obj.toPython())
    return out
def SparqlRequest(rq_code, rq_uri, defgraph=None, fmt=None):
    """Execute *rq_code* at endpoint *rq_uri*.

    Returns the raw query result object, or None on any error (the error
    and traceback are logged).  *defgraph* is accepted for interface
    compatibility but unused here; *fmt* optionally sets the return format.
    """
    rq_results = None
    logging.debug('URI: %s' % (rq_uri))
    logging.debug('%s' % (rq_code))
    try:
        wrapper = SPARQLWrapper.SPARQLWrapper(rq_uri)
        wrapper.setQuery(rq_code)
        if fmt:
            wrapper.setReturnFormat(fmt)
        rq_results = wrapper.query()
    except Exception as e:
        logging.error('Error: %s' % e)
        errtype, val, traceback = sys.exc_info()
        logging.error('sys.exc_info:\n(%s)\n%s' % (errtype, val))
        if traceback:
            logging.info('traceback:\n%s>' % (traceback))
        logging.error('%s' % str(rq_code))
    return rq_results
def fetch_mathematicians(num):
    """Return the *num* Wikidata mathematicians with Mathematics Genealogy
    Project IDs (P549) that have the most sitelinks.

    Each entry: {'wiki_id': Q-number as int, 'id': MGP id as int,
    'score': sitelink count}.
    """
    sparql = SPARQLWrapper.SPARQLWrapper('https://query.wikidata.org/sparql')
    sparql.setQuery(
        """SELECT ?mathematician ?mathematicianLabel (COUNT(DISTINCT ?sitelink) AS ?sites) ?Mathematics_Genealogy_Project_ID
        WHERE {{
        SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }}
        ?mathematician wdt:P549 ?Mathematics_Genealogy_Project_ID.
        ?sitelink schema:about ?mathematician.
        }}
        GROUP BY ?mathematician ?mathematicianLabel ?Mathematics_Genealogy_Project_ID
        ORDER BY DESC(?sites)
        LIMIT {:d}""".format(num))
    sparql.setReturnFormat(SPARQLWrapper.JSON)
    results = sparql.query().convert()
    # 'Q1234' entity URI -> 1234 (strip path and leading 'Q').
    return [{
        'wiki_id': int(result['mathematician']['value'].split('/')[-1][1:]),
        'id': int(result['Mathematics_Genealogy_Project_ID']['value']),
        'score': int(result['sites']['value'])
    } for result in results['results']['bindings']]
def get(namehint: str) -> List[PlaceSuggestion]:
    """Return up to 15 DBpedia places whose label starts with *namehint*
    (case-insensitive), as PlaceSuggestion tuples; [] for an empty hint."""
    if namehint == '':
        return list()
    # Escape characters that would break the quoted literal in the query.
    escapednamehint = (namehint.lower().replace('\n', '\\n').replace(
        '\"', '\\\"').replace('\t', '\\t'))
    sw = SPARQLWrapper.SPARQLWrapper("http://dbpedia.org/sparql",
                                     returnFormat=SPARQLWrapper.JSON)
    q = f"""
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX dbo: <http://dbpedia.org/ontology/>
    PREFIX dbp: <http://dbpedia.org/property/>
    PREFIX dbr: <http://dbpedia.org/resource/>
    PREFIX geo: <http://www.w3.org/2003/01/geo/wgs84_pos#>
    SELECT DISTINCT ?plc (LANG(?nm) as ?lang) ?lat ?long ?nm ?abs WHERE {{
    ?plc rdfs:label ?nm ;
    dbo:abstract ?abs ;
    (dbp:latitude|geo:lat) ?lat ;
    (dbp:longitude|geo:long) ?long .
    FILTER( LANG(?nm)=LANG(?abs) ) .
    FILTER( STRSTARTS( LCASE(STR( ?nm )), "{escapednamehint}" ) ) .
    }} LIMIT 15
    """
    # sw.setTimeout(30)
    sw.setQuery(q)
    results = sw.query().convert()
    return [
        PlaceSuggestion(result["lang"]["value"], result["lat"]["value"],
                        result["long"]["value"], result["nm"]["value"],
                        result["abs"]["value"], result["plc"]["value"])
        for result in results["results"]["bindings"]
    ]
def query_training(shortId):
    """Fetch the training records of the faculty member *shortId*.

    DESCRIBEs the faculty URI plus its training, specialty, hospital and
    organization neighbours in one query, then inlines the labelled
    specialty/hospital/organization resources into each training dict.
    Returns a list of training dicts, each carrying a 'rabid' key.
    """
    uri = shortIdToUri(shortId)
    train_prop = property_map['trainings']
    spec_prop = property_map['specialty']
    hosp_prop = property_map['hospital']
    org_prop = property_map['organization']
    label_prop = property_map['label']
    remote = SPARQLWrapper.SPARQLWrapper(queryUrl, updateUrl)
    remote.addParameter('email', user)
    remote.addParameter('password', passw)
    remote.setMethod(SPARQLWrapper.POST)
    # ?x1 = trainings; ?x2/?x3/?x4 = optional specialty/hospital/organization.
    remote.setQuery("""
        DESCRIBE ?uri ?x1 ?x2 ?x3 ?x4
        WHERE {{
        ?uri <{1}> ?x1 .
        OPTIONAL {{?x1 <{2}> ?x2 .}}
        OPTIONAL {{?x1 <{3}> ?x3 .}}
        OPTIONAL {{?x1 <{4}> ?x4 .}}
        values ?uri {{ <{0}> }}
        }}""".format(uri, train_prop, spec_prop, hosp_prop, org_prop)
    )
    results = remote.queryAndConvert()
    # Group the DESCRIBE graph by subject, then by predicate.
    resources = defaultdict(lambda: defaultdict(list))
    for r in results.triples((None, None, None)):
        resources[r[0].toPython()][r[1].toPython()].append(r[2].toPython())
    fac = resources[uri]
    trainings = [ (train, resources[train])
                  for train in fac.get(train_prop, []) ]
    out = []
    for rabid, data in trainings:
        data['rabid'] = rabid
        # Replace bare linked URIs with {'rabid', 'label'} dicts.
        if data.get(spec_prop):
            data[spec_prop] = [ { 'rabid': spec,
                'label': resources[spec][label_prop] }
                for spec in data[spec_prop] ]
        if data.get(hosp_prop):
            data[hosp_prop] = [ { 'rabid': hosp,
                'label': resources[hosp][label_prop] }
                for hosp in data[hosp_prop] ]
        if data.get(org_prop):
            data[org_prop] = [ { 'rabid': org,
                'label': resources[org][label_prop] }
                for org in data[org_prop] ]
        out.append(data)
    return out
def get_items(self, query, item_name="item"):
    """Run *query*, extract entity ids from the *item_name* column, and
    yield wbinteract.Item objects fetched from the wiki API.

    Entities are requested in batches of 50 (the wbgetentities limit);
    rows whose JSON cannot be parsed into an Item are skipped.
    """
    wrapper = SPARQLWrapper.SPARQLWrapper(self.endpoint)
    wrapper.setQuery(query)
    wrapper.setReturnFormat(SPARQLWrapper.JSON)
    bindings = wrapper.query().convert()["results"]["bindings"]
    # Strip the entity-URL prefix to leave the bare entity id.
    ids = (row[item_name]["value"].replace(self.entity_url, "")
           for row in bindings)
    while True:
        chunk = list(itertools.islice(ids, 50))
        if not chunk:
            break
        response = self.site.api(action="wbgetentities", ids=chunk)
        for entity_json in response["entities"].values():
            try:
                yield wbinteract.Item.from_json(self.site, entity_json)
            except ValueError:
                pass
def mint_uri():
    """Generate a VIVO individual URI not yet present in the triplestore.

    Tries up to 50 random uuid4-based URIs, ASKing the store for each;
    returns the first unused URI, or False when every attempt collides.
    """
    qtext = "ASK WHERE {{ {{ <{0}> ?p ?o. }} UNION {{ ?s ?p2 <{0}> }} }}"
    remote = SPARQLWrapper.SPARQLWrapper(queryUrl, updateUrl)
    remote.setReturnFormat('json')
    remote.addParameter('email', user)
    remote.addParameter('password', passw)
    remote.setMethod(SPARQLWrapper.POST)
    for _ in range(50):
        candidate = 'http://vivo.brown.edu/individual/n{}'.format(
            uuid.uuid4().hex)
        remote.setQuery(qtext.format(candidate))
        resp = remote.queryAndConvert()
        # ASK returns {'boolean': True} when the URI is already in use.
        if not resp['boolean']:
            return candidate
    return False
def get_graph_count(config):
    """Return the number of skos:Concept subjects in the configured graph.

    *config* must provide 'fuseki' (endpoint base URL) and 'graph'
    (graph URI).
    """
    # logger.info('Querying {}/sparql'.format(config['fuseki']))
    sparql = SPARQLWrapper.SPARQLWrapper('{}/sparql'.format(config['fuseki']))
    sparql.setMethod(SPARQLWrapper.POST)  # to avoid caching
    sparql.setReturnFormat(SPARQLWrapper.JSON)
    sparql.setQuery(
        textwrap.dedent("""
        PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
        SELECT (COUNT(?s) as ?conceptCount)
        WHERE {
          GRAPH <%s> {
            ?s a skos:Concept .
          }
        }
        """ % (config['graph'])))
    results = sparql.query().convert()
    # Single aggregate row; the count arrives as a string literal.
    count = results['results']['bindings'][0]['conceptCount']['value']
    return int(count)
def create(wiki_directory, target_directory, subdir_start, subdir_end, process_no):
    """Process wiki-extractor subdirectories [subdir_start, subdir_end) of
    *wiki_directory*, writing per-file output under *target_directory*.

    Uses a local Virtuoso SPARQL endpoint and a perceptron POS tagger;
    progress is logged to ../logs/log<process_no>.txt.  Updates the
    module-level sentence counter `no_sents` and `logfile` handle.
    """
    global no_sents
    global logfile
    subdir_count = 0
    sent_count = 0
    logfile = open("../logs/log" + str(process_no) + ".txt", "w")
    if not os.path.exists(target_directory):
        os.makedirs(target_directory)
    tagger = PerceptronTagger()
    sparql = SPARQLWrapper.SPARQLWrapper("http://localhost:8890/sparql/")
    print("processing " + wiki_directory)
    logfile.write("processing " + wiki_directory + "\n")
    for subdir in os.listdir(wiki_directory):
        wiki_sub_directory = wiki_directory + "/" + subdir
        if not os.path.isdir(wiki_sub_directory):
            continue
        #process only subdirs from subdir_start to subdir_end
        # NOTE(review): subdir_count is only incremented while skipping, so
        # the subdir_end cutoff may never trigger once processing starts --
        # confirm intent.
        if (subdir_count < subdir_start):
            subdir_count += 1
            # print("skipping "+wiki_sub_directory)
            logfile.write("skipping " + wiki_sub_directory + "\n")
            continue
        if subdir_count >= subdir_end:
            break
        print("processing " + wiki_sub_directory)
        logfile.write("processing " + wiki_sub_directory + "\n")
        for file in os.listdir(wiki_directory + "/" + subdir):
            wiki_file_path = wiki_sub_directory + "/" + file
            target_subdir = target_directory + "/" + subdir
            sent_count = process_file(wiki_file_path, sparql, tagger,
                                      sent_count, target_subdir, file)
            # stop after single file
            # return
    no_sents = sent_count
    logfile.close()
def count_freq(resource_type, prop):
    '''
    Function which counts how many times has the property appeared w.r.t. the resource type.
    '''
    freq = 0
    endpoint = SPARQLWrapper("http://dbpedia.org/sparql")
    count_query = """
    SELECT COUNT(DISTINCT ?entity) WHERE {
    ?entity <""" + prop + """> ?value.
    ?entity <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <""" + resource_type + """>
    }
    """
    endpoint.setQuery(count_query)
    endpoint.setReturnFormat(JSON)
    response = endpoint.query().convert()
    # Virtuoso names the bare COUNT column "callret-0".
    for binding in response["results"]["bindings"]:
        freq = int(binding["callret-0"]["value"])
    return freq
def total_pages_for_type(resource_type):
    '''
    Function which returns the total number of resources belonging to given resource type.
    '''
    freq = 0
    endpoint = SPARQLWrapper("http://dbpedia.org/sparql")
    count_query = """
    SELECT COUNT(DISTINCT ?entity) WHERE {
    ?entity <http://dbpedia.org/ontology/wikiPageID> ?value.
    ?entity <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <""" + resource_type + """>
    }
    """
    endpoint.setQuery(count_query)
    endpoint.setReturnFormat(JSON)
    response = endpoint.query().convert()
    # Virtuoso names the bare COUNT column "callret-0".
    for binding in response["results"]["bindings"]:
        freq = int(binding["callret-0"]["value"])
    return freq
#!/usr/bin/env python3 from SPARQLWrapper import * import os import os.path as path import json import glob import time #BaseUrl="http://buda1.bdrc.io:13180/fuseki/bdrcrw/" BaseUrl="http://localhost:13180/fuseki/bdrcrw/" QueryEndpoint = SPARQLWrapper(BaseUrl+"query") QueryEndpoint.setRequestMethod(POSTDIRECTLY) QueryEndpoint.setMethod(POST) QueryEndpoint.setReturnFormat(JSON) UpdateEndpoint = SPARQLWrapper(BaseUrl+"update") UpdateEndpoint.setRequestMethod(POSTDIRECTLY) UpdateEndpoint.setMethod(POST) ThisPath = os.path.dirname(os.path.abspath(__file__)) def get_all_tests(testgroupname, specifictest=None): grouppath = path.join(ThisPath, testgroupname) res = [] if specifictest is not None: res.append(path.join(grouppath, specifictest)) return res for dirname in sorted(glob.glob(grouppath+"/*")): if path.isdir(dirname): res.append(dirname) return res
# NOTE(review): this chunk begins mid-function -- the lines below are the
# tail of a download helper whose signature/head lies outside this view;
# it writes the fetched response to CRAWLER_DIR and returns the local path.
    fh = open(CRAWLER_DIR+localName, "w")
    fh.write(response.read())
    fh.close()
    return CRAWLER_DIR+localName


def loadMetricConfiguration():
    """Parse config.ttl (Turtle) and return it serialised as compact JSON-LD."""
    g = rdflib.Graph();
    config = g.parse("config.ttl", format="turtle")
    return g.serialize(format="json-ld", indent=0)


def formatMetricConfiguration(configStr):
    """Flatten the configuration onto one line.

    NOTE(review): .replace('"','\"') is a no-op since '\"' == '"';
    possibly '\\\\"' (an escaped quote) was intended -- confirm with the
    downstream consumer before changing.
    """
    formattedStr = configStr.replace('\n', ' ').replace('\r', '').replace('"','\"')
    return formattedStr


# MAIN
# Query LOD Laundromat for the md5 of every document with >0 triples,
# then download each one and log the metric configuration.  Python 2
# (print statement).
sparql = SPARQLWrapper(LOD_LAUNDROMAT_SPARQL)
sparql.setQuery('PREFIX llo: <http://lodlaundromat.org/ontology/> SELECT ?md5 WHERE { ?d llo:triples ?n . ?d llo:md5 ?md5 . FILTER (?n > 0) }')
sparql.setReturnFormat(JSON)
results = sparql.query().convert()
if not os.path.exists(CRAWLER_DIR):
    os.makedirs(CRAWLER_DIR)
metricsConf = formatMetricConfiguration(loadMetricConfiguration())
for result in results["results"]["bindings"]:
    document = LOD_LAUNDROMAT_DOWNLOAD + result['md5']['value']
    print 'Downloading : '+document
    filename = download(document)
    logger_crawl.info("Metrics config: {0}".format(metricsConf))
def fes_date_filter(start_date='1900-01-01', stop_date='2100-01-01', constraint='overlaps'):
    """Build a (start, stop) pair of FES temporal filters for a CSW query.

    constraint='overlaps': records whose temporal extent intersects
    [start_date, stop_date].  constraint='within': records whose extent
    lies inside the window.

    Raises ValueError for any other *constraint* (fix: previously an
    unknown value fell through and triggered a confusing
    UnboundLocalError on return).
    """
    if constraint == 'overlaps':
        start = fes.PropertyIsLessThanOrEqualTo(propertyname='apiso:TempExtent_begin', literal=stop_date)
        stop = fes.PropertyIsGreaterThanOrEqualTo(propertyname='apiso:TempExtent_end', literal=start_date)
    elif constraint == 'within':
        start = fes.PropertyIsGreaterThanOrEqualTo(propertyname='apiso:TempExtent_begin', literal=start_date)
        stop = fes.PropertyIsLessThanOrEqualTo(propertyname='apiso:TempExtent_end', literal=stop_date)
    else:
        raise ValueError("constraint must be 'overlaps' or 'within', got %r" % (constraint,))
    return start, stop


#Establish bounding box filter for Geographic Range of IBAs
bbox = fes.BBox([-130.5, 47.9, 167.6, 74.7])


# In[50]:

# Look up temperature-related parameters in the MMI ontology registry.
sparql = SPARQLWrapper("http://mmisw.org/sparql")
queryString = """
PREFIX ioos: <http://mmisw.org/ont/ioos/parameter/>
SELECT DISTINCT ?parameter ?definition ?unit ?property ?value
WHERE {?parameter a ioos:Parameter .
?parameter ?property ?value .
?parameter ioos:Term ?term .
?parameter ioos:Definition ?definition .
?parameter ioos:Units ?unit .
FILTER (regex(str(?property), "(exactMatch|closeMatch)", "i") && regex(str(?value), "temperature", "i") )
} ORDER BY ?parameter
"""
sparql.setQuery(queryString)
sparql.setReturnFormat(JSON)
def handle_is_of_relations(resource, resource_type, total_pages):
    """Collect inverse ("is-of") relations pointing at *resource* into the
    module-level `ans_dict`, scoring each new property with the same
    feature set used for forward relations (frequency, Google
    autocomplete, range/comment, etc.).  Also updates `prop_val_count`.
    """
    sparql = SPARQLWrapper("http://dbpedia.org/sparql")
    # Incoming triples: ?value --?prop--> resource.
    query1 = """
    select ?prop ?value where {
    ?value ?prop <http://dbpedia.org/resource/""" + resource + """>
    }
    """
    sparql.setQuery(query1)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()
    for result in results["results"]["bindings"]:
        prop = result["prop"]["value"]
        value = result["value"]["value"]
        cleaned_property_label = get_label(prop)
        if blacklisted(cleaned_property_label):
            continue
        # Only DBpedia ontology/property/subject predicates are considered.
        if "ontology" not in prop and "property" not in prop and "subject" not in prop:
            continue
        if prop in ans_dict:
            # Already known property: just record the extra value and mark
            # it as also appearing in the inverse direction.
            ans_dict[prop].setdefault('value', []).append(value)
            prop_val_count[prop] += 1
            ans_dict[prop]['is_of_relation'] = 1
            continue
        prop_info = dict.fromkeys(parameter_list, 0)
        prop_info['score'] = 0
        prop_info['value'] = []
        # Keep only untagged or English-tagged subject values.
        if "xml:lang" not in result["value"] or 'en' in result["value"]["xml:lang"]:
            prop_value = {}
            prop_value['prop'] = prop
            prop_value['value'] = value
            if prop in prop_val_count:
                prop_val_count[prop] += 1
            else:
                prop_val_count[prop] = 1
            cleaned_property_label = get_label(prop)
            prop_info['label'] = cleaned_property_label
            prop_info.setdefault('value', []).append(value)
            prop_info['blacklisted'] = 0
            '''
            if prop_info['blacklisted']:
            ans_dict[prop] = prop_info
            continue
            '''
            # Feature extraction for the ranking model.
            google_autosuggest = google_autocomplete_ranker(
                resource, cleaned_property_label)
            prop_info['is_onto'] = is_onto(prop)
            prop_info['special_char'] = doesnt_contain_special_chars(
                cleaned_property_label)
            prop_info['no_of_words'] = no_of_words(cleaned_property_label)
            range_comment = prop_has_range_or_comment(prop_value)
            prop_info['has_range'] = range_comment[0]
            prop_info['has_comment'] = range_comment[1]
            prop_info['value_relevant'] = value_relevant(prop_value)
            prop_info['special_datatype'] = is_special_datatype(result)
            prop_info['google_keypress'] = google_autosuggest[0]
            prop_info['google_location'] = google_autosuggest[1]
            prop_info['is_of_relation'] = 1
            # Relative frequency of the property among pages of this type.
            prop_info['frequency'] = count_freq(
                resource_type, prop) / float(total_pages)
            ans_dict[prop] = prop_info
def start(resource):
    """Build `ans_dict` feature entries for every forward relation of
    *resource* (mirrors handle_is_of_relations for outgoing triples).

    Deduplicates properties whose normalised labels collide, then sets a
    per-property `total_values` penalty of 1 - 1/count.  Python 2 only
    (dict.iteritems).  Mutates the module-level ans_dict, prop_val_count
    and normalized_labels.
    """
    sparql = SPARQLWrapper("http://dbpedia.org/sparql")
    query1 = """
    select distinct ?prop ?value where {
    <http://dbpedia.org/resource/""" + resource + """> ?prop ?value
    }
    """
    sparql.setQuery(query1)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()
    resource_type = get_resource_type(results)
    total_pages = total_pages_for_type(resource_type)
    for result in results["results"]["bindings"]:
        prop = result["prop"]["value"]
        value = result["value"]["value"]
        cleaned_property_label = get_label(prop)
        # Only DBpedia ontology/property/subject predicates are considered.
        if "ontology" not in prop and "property" not in prop and "subject" not in prop:
            continue
        prop_info = dict.fromkeys(parameter_list, 0)
        prop_info['score'] = 0
        prop_info['value'] = []
        # Keep only untagged or English-tagged object values.
        if "xml:lang" not in result["value"] or 'en' in result["value"]["xml:lang"]:
            prop_value = {}
            prop_value['prop'] = prop
            prop_value['value'] = value
            if prop in prop_val_count:
                prop_val_count[prop] += 1
            else:
                # First sighting: skip properties whose normalised label
                # duplicates one already processed.
                normalized_label = cleaned_property_label.lower().replace(
                    ' ', '')
                if normalized_label in normalized_labels:
                    continue
                normalized_labels.append(normalized_label)
                prop_val_count[prop] = 1
            if prop in ans_dict:
                ans_dict[prop].setdefault('value', []).append(value)
                continue
            prop_info['label'] = cleaned_property_label
            prop_info.setdefault('value', []).append(value)
            prop_info['blacklisted'] = 0
            '''
            if prop_info['blacklisted']:
            ans_dict[prop] = prop_info
            continue
            '''
            # Feature extraction for the ranking model.
            google_autosuggest = google_autocomplete_ranker(
                resource, cleaned_property_label)
            prop_info['is_onto'] = is_onto(prop)
            prop_info['special_char'] = doesnt_contain_special_chars(
                cleaned_property_label)
            prop_info['no_of_words'] = no_of_words(cleaned_property_label)
            range_comment = prop_has_range_or_comment(prop_value)
            prop_info['has_range'] = range_comment[0]
            prop_info['has_comment'] = range_comment[1]
            prop_info['value_relevant'] = value_relevant(prop_value)
            prop_info['special_datatype'] = is_special_datatype(result)
            prop_info['google_keypress'] = google_autosuggest[0]
            prop_info['google_location'] = google_autosuggest[1]
            prop_info['is_of_relation'] = 0
            # Relative frequency of the property among pages of this type.
            prop_info['frequency'] = count_freq(
                resource_type, prop) / float(total_pages)
            ans_dict[prop] = prop_info
    #handle_is_of_relations(resource, resource_type, total_pages)
    # Penalise multi-valued properties: total_values -> 1 - 1/count.
    for prop, count in prop_val_count.iteritems():
        ans_dict[prop]['total_values'] = (1.0 - 1.0 / count)
        #score = raw_input("Enter score for: " + prop + " (from 1-5) \n")
        ans_dict[prop]['score'] = 0