def getNewItems(root):
    """Scrape dossiers linked from *root* that are not yet in the database.

    Walks every anchor in rows containing a third-level listing cell,
    fetches the linked page, and for each dossier link on that page
    (anchors styled ``com_acronym``) runs ``oeil_scrape`` unless a record
    with the same ``meta.source`` URL already exists in ``db.dossiers``.
    """
    for listing_link in root.xpath('//td[@class="listlevelthree"]/../td/a'):
        # fetch() is handed a UTF-8 byte string (Python 2 convention here).
        listing_page = fetch((URL + listing_link.attrib['href']).encode('utf8'))
        for dossier_link in listing_page.xpath('//a[@class="com_acronym"]'):
            dossier_url = URL + dossier_link.attrib['href']
            # Skip dossiers we have already stored.
            if not db.dossiers.find_one({'meta.source': dossier_url}):
                oeil_scrape(dossier_url)
# NOTE(review): this redefines getNewItems from earlier in the file with
# identical logic — looks like a duplicated paste; confirm which copy is
# intended before removing either.
def getNewItems(root):
    """Scrape dossiers linked from *root* that are not yet stored.

    For each anchor in rows containing a ``listlevelthree`` cell, fetch the
    linked page; every ``com_acronym`` anchor there points at a dossier,
    which is scraped via ``oeil_scrape`` unless ``db.dossiers`` already has
    a record whose ``meta.source`` equals its URL.
    """
    for d in root.xpath('//td[@class="listlevelthree"]/../td/a'):
        # fetch() receives a UTF-8 byte string (Python 2 style).
        dossier = fetch((URL + d.attrib['href']).encode('utf8'))
        for e in dossier.xpath('//a[@class="com_acronym"]'):
            d_url = e.attrib['href']
            # Only scrape dossiers we have not seen before.
            if not db.dossiers.find_one({'meta.source': URL + d_url}):
                oeil_scrape(URL + d_url)
def scrape(url):
    """Check the OEIL index page at *url* for updates and (re)scrape.

    Compares the site's "Data updated on :" stamp against the value cached
    in ``LAST_UPDATED_CACHE``.  When they differ (or no cache exists yet),
    re-scrapes every dossier whose ``procedure.stage_reached`` is in
    ``STAGES``, refreshes the cache, then scans the page for new items.

    Returns True unconditionally.
    """
    root = fetch(url)
    # TODO optimize this!! (reduce steps)
    # Compute the site's update stamp once (was evaluated twice before).
    last_updated = strip(
        root.xpath('//div[text()="Data updated on :"]/span/text()')[0])
    cached = None
    if exists(LAST_UPDATED_CACHE):
        # was: open(...).read() — leaked the file handle; `with` closes it.
        with open(LAST_UPDATED_CACHE) as cache_file:
            cached = cache_file.read()
    if cached != last_updated:
        print >>sys.stderr, '[!] Site modification found, scraping unfinished dossiers....'
        # timeout=False keeps the cursor alive for this long-running loop.
        for d in db.dossiers.find({'procedure.stage_reached': {'$in': STAGES}},
                                  timeout=False):
            oeil_scrape(d['meta']['source'])
            print >>sys.stderr, '\t%s, %s' % (
                d['procedure']['reference'].encode('utf8'),
                d['procedure']['title'].encode('utf8'))
        # Persist the new stamp; `with` guarantees the handle is closed.
        with open(LAST_UPDATED_CACHE, "w+") as cache_file:
            cache_file.write(last_updated)
        print >>sys.stderr, '\n[!] Searching/scraping new items..'
        getNewItems(root)
    return True
def scrape(url): root = fetch(url) # TODO optimize this!! (reduce steps) if not exists(LAST_UPDATED_CACHE) or open(LAST_UPDATED_CACHE).read( ) != strip(root.xpath('//div[text()="Data updated on :"]/span/text()')[0]): print >> sys.stderr, '[!] Site modification found, scraping unfinished dossiers....' for d in db.dossiers.find({'procedure.stage_reached': { '$in': STAGES }}, timeout=False): oeil_scrape(d['meta']['source']) print >> sys.stderr, '\t%s, %s' % ( d['procedure']['reference'].encode('utf8'), d['procedure']['title'].encode('utf8')) f = open(LAST_UPDATED_CACHE, "w+") f.write( strip( root.xpath('//div[text()="Data updated on :"]/span/text()') [0])) f.close() print >> sys.stderr, '\n[!] Searching/scraping new items..' getNewItems(root) return True