def get_page_id(v):
    """
    Looks up the numeric Wikipedia page id of the article titled 'v'.

    Sends an action=query request (format=json) to the MediaWiki API at
    WPDOMAIN and returns, as an int, the first key of the 'pages' object
    found in the reply. Makes at most MAXTRIES - 1 attempts, waiting
    5 seconds between consecutive attempts, and returns None when every
    attempt fails (network error, unparsable reply, unexpected layout).
    """
    # Example request:
    # http://it.wikipedia.org/w/api.php?action=query&titles=Abbazia_di_San_Galgano&format=json
    # Example reply:
    # {"query":{"normalized":[{"from":"Abbazia_di_San_Galgano","to":"Abbazia di San Galgano"}],"pages":{"83117":{"pageid":83117,"ns":0,"title":"Abbazia di San Galgano"}}}}
    # NOTE(review): for a title that does not exist the API appears to key
    # the entry with a negative placeholder id, which would be returned
    # as-is -- confirm that callers cope with that.
    queryurl = UrlBuilder(domain=WPDOMAIN, path="w/api.php", params="action=query")
    queryurl.set_attr('titles', v)
    queryurl.set_attr('format', 'json')
    query = queryurl.build()

    # range(1, MAXTRIES) gives MAXTRIES - 1 attempts; the count is kept
    # unchanged so the number of requests sent stays the same as before.
    last_try = MAXTRIES - 1
    for ntry in range(1, MAXTRIES):
        print("Request no. %d - Requesting %s" % (ntry, query))
        try:
            # The request itself is inside the try so that a network or
            # HTTP failure is retried instead of aborting the lookup.
            jsonpage = urllib2.urlopen(query)
            try:
                jobj = json.load(jsonpage)
            finally:
                # Always release the connection, even on a parse error.
                jsonpage.close()
            return int(list(jobj['query']['pages'].keys())[0])
        except Exception as e:
            # Best effort: report the problem and try again.
            print(e)
            if ntry < last_try:
                # No point in waiting once there are no attempts left.
                time.sleep(5)
    return None
def query_api():
    """
    Returns the titles of the pages that embed the template WPTNAME.

    Queries the MediaWiki API at WPDOMAIN with generator=embeddedin
    (format=xml, 500 results per request, namespace 0) and collects the
    'title' attribute of every <page> element of each reply. While a reply
    carries an <embeddedin> element, its 'geicontinue' attribute is sent
    back with the next request, after a 5 second pause; the loop ends when
    no such element (or no continuation value) is present.
    """
    # NOTE(review): this relies on the reply exposing the continuation
    # token as <embeddedin geicontinue="...">; newer MediaWiki releases
    # may report continuation differently -- confirm against the wiki.
    queryurl = UrlBuilder(domain=WPDOMAIN, path="w/api.php", params="action=query")
    queryurl.set_attr('generator', 'embeddedin')
    queryurl.set_attr('geititle', WPTNAME)
    # Parameters meant for a generator need the 'g' prefix, like the other
    # gei* parameters here; the unprefixed 'einamespace' never reached it.
    queryurl.set_attr('geinamespace', '0')
    queryurl.set_attr('geilimit', '500')
    queryurl.set_attr('format', 'xml')

    inlist = []
    while True:
        url = queryurl.build()
        print("Requesting %s" % url)
        infile = urllib2.urlopen(url)
        try:
            inxml = infile.read()
        finally:
            # Release the connection before parsing and sleeping.
            infile.close()
        xml = parseString(inxml)

        inlist.extend(
            page.getAttribute("title")
            for page in xml.getElementsByTagName("page")
        )

        querycont = xml.getElementsByTagName("embeddedin")
        if not querycont:
            break
        geicontinue = querycont[0].getAttribute("geicontinue")
        if not geicontinue:
            # Without a token the same request would be repeated forever.
            break
        queryurl.set_attr("geicontinue", geicontinue)
        time.sleep(5)
    return inlist
u'FineCostr', u'Demolizione', u'Sito', u'lat', u'long' ] """ Utility functions """ _jsonu = UrlBuilder( domain="json.it.dbpedia.org", path="annotate/resource/json/it%3A{wp-page}", params="filter=__type:template" ) _jsonu.set_attr('flags','-Extractors,Structure,') _jsonbaseurl=_jsonu.build() def get_jsonpedia_page(v): """ Gets the corrisponding JSONpedia page (only templates) for Wikipedia article titled 'v'. Tries MAXTRIES times or returns none. """ vsafe = v.replace(' ','_') jsonurl = _jsonbaseurl.replace('{wp-page}',urllib.quote(vsafe)) for ntry in range(1,MAXTRIES): try: print "Request no. %d: requesting: %s" %(ntry,jsonurl) jsonpage = urllib2.urlopen(jsonurl) break