return art def TidyURL( url ): """ Tidy up URL - trim off params, query, fragment... """ o = urlparse.urlparse( url ) url = urlparse.urlunparse( (o[0],o[1],o[2],'','','') ); return url def ContextFromURL( url ): """Build up an article scrape context from a bare url.""" url = TidyURL(url) context = {} context['srcurl'] = url context['permalink'] = url context['srcorgname'] = u'times' context['lastseen'] = datetime.now() return context if __name__ == "__main__": # create a url opener which remembers cookies (as well as throttling and all the other uber-opener stuff) cj = cookielib.LWPCookieJar() opener = ScraperUtils.build_uber_opener(cookiejar=cj) # large maxerrors to handle video-only pages ScraperUtils.scraper_main( FindArticles, ContextFromURL, Extract, max_errors=200, prep=Prep, sesh=opener )