def test_tidy_url(self):
    data = [
        ("http://menmedia.co.uk/asiannews/news/crime/s/1420665_man-wanted-in-connection-with-robbery-and-assault?rss=yes",
         "http://menmedia.co.uk/asiannews/news/crime/s/1420665_man-wanted-in-connection-with-robbery-and-assault"),
        ("http://www.belfasttelegraph.co.uk/news/health/diabetes-lsquocan-be-reversed-through-low-calorie-dietrsquo-16015584.html?r=RSS",
         "http://www.belfasttelegraph.co.uk/news/health/diabetes-lsquocan-be-reversed-through-low-calorie-dietrsquo-16015584.html"),
        ("http://nocruft.com/wibble-pibble",
         "http://nocruft.com/wibble-pibble"),
    ]
    for url, tidied in data:
        self.assertEqual(ScraperUtils.tidy_url(url), tidied)
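# For reference: a minimal sketch of a tidy_url that would pass the cases
# above, assuming it simply drops the query and fragment (the same approach
# the Times scraper's TidyURL takes). This is NOT the real ScraperUtils
# implementation, which may strip tracking params more selectively.
#
#   import urlparse
#
#   def tidy_url(url):
#       o = urlparse.urlparse(url)
#       return urlparse.urlunparse((o[0], o[1], o[2], '', '', ''))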
def testCanonicalURLs(self):
    # html, base_url, expected
    snippets = [
        ('<head><link rel="canonical" href="http://example.com/products" /></head>',
         "", "http://example.com/products"),
        ('<head><link href="http://example.com/products" rel="canonical" /></head>',
         "", "http://example.com/products"),
        ("""<HEAD><LINK foo="wibble" HRef ="http://example.com/products" class="pibble" REL = "canonical" / ></HEAD>""",
         "", "http://example.com/products"),
        ('<head><meta property="og:url" content="http://www.imdb.com/title/tt0117500/" /></head>',
         "", 'http://www.imdb.com/title/tt0117500/'),
        # test relative url
        ('<head><meta property="og:url" content="/title/tt0117500/" /></head>',
         "http://www.imdb.com/title/tt0117500/", 'http://www.imdb.com/title/tt0117500/'),
        # a live BBC example:
        # (urllib2.urlopen('http://www.bbc.co.uk/news/world-africa-13058694').read(), 'http://www.bbc.co.uk/news/world-africa-13058694'),
        # and one from the mirror:
        # (urllib2.urlopen('http://www.mirror.co.uk/news/top-stories/2011/05/11/william-and-kate-to-get-around-on-old-bikes-during-their-luxury-honeymoon-115875-23121689/').read(),
        #  "http://www.mirror.co.uk/news/royal-wedding/2011/05/11/royal-honeymoon-prince-william-and-kate-middleton-to-get-around-seychelles-island-on-rickety-old-bikes-115875-23121689/"),
    ]
    for html, base_url, expected in snippets:
        got = ScraperUtils.extract_canonical_url(html, base_url)
        self.assertEqual(got, expected)
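# A hedged sketch of what extract_canonical_url might look like, assuming
# lxml + cssselect: prefer <link rel="canonical">, fall back to the og:url
# meta property, and resolve relative URLs against base_url. This passes the
# snippets above, but the real ScraperUtils implementation may differ.
#
#   import urlparse
#   import lxml.html
#
#   def extract_canonical_url(html, base_url):
#       doc = lxml.html.fromstring(html)
#       for link in doc.cssselect('link[rel="canonical"]'):
#           if link.get('href'):
#               return urlparse.urljoin(base_url, link.get('href'))
#       for meta in doc.cssselect('meta[property="og:url"]'):
#           if meta.get('content'):
#               return urlparse.urljoin(base_url, meta.get('content'))
#       return None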
    body_div = article.cssselect('[itemprop~="articleBody"], [itemprop~="reviewBody"]')[0]
    # cruft removal
    for cruft in body_div.cssselect('.inline-pipes-list, #gigya-share-btns-2'):
        cruft.drop_tree()

    art['content'] = ukmedia.SanitiseHTML(unicode(lxml.html.tostring(body_div)))
    art['description'] = ukmedia.FirstPara(art['content'])
    art['srcorgname'] = u'independent'
    return art


def ContextFromURL(url):
    """Build up an article scrape context from a bare url."""
    # url = TidyURL(url)
    context = {}
    context['srcurl'] = url
    context['permalink'] = url
    context['srcorgname'] = u'independent'
    context['lastseen'] = datetime.now()
    return context


if __name__ == "__main__":
    ScraperUtils.scraper_main(FindArticles, ContextFromURL, Extract, max_errors=150)
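# Aside: the '~=' attribute selector matches a token within a space-separated
# attribute value, which is why it is used for itemprop (an element can carry
# several itemprop tokens at once). A small hypothetical demonstration:
#
#   import lxml.html
#
#   frag = lxml.html.fromstring(
#       '<div><div itemprop="articleBody">a</div>'
#       '<div itemprop="description articleBody">b</div></div>')
#   assert len(frag.cssselect('[itemprop~="articleBody"]')) == 2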
    m = art_idpat.search(url)
    if m:
        return 'ft_' + m.group(1)
    m = blog_idpat.search(url)
    if m:
        return 'ftblog_' + m.group(1)
    return None


def ContextFromURL(url):
    """Build up an article scrape context from a bare url."""
    context = {}
    context['srcurl'] = url
    context['permalink'] = url
    context['srcid'] = CalcSrcID(url)
    context['srcorgname'] = u'ft'
    context['lastseen'] = datetime.now()
    # to clean the url...
    context = ScrubFunc(context, None)
    return context


if __name__ == "__main__":
    ScraperUtils.scraper_main(FindArticles, ContextFromURL, Extract, max_errors=50, prep=Prep)
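# The art_idpat/blog_idpat patterns are defined earlier in the module and are
# not shown in this excerpt. Purely as an illustration of the scheme (this is
# a hypothetical pattern, not the real one), extracting a numeric id from a
# URL and prefixing it looks like:
#
#   import re
#
#   example_idpat = re.compile(r'/(\d+)\.html$')
#   m = example_idpat.search('http://example.com/story/12345.html')
#   assert m is not None
#   assert 'ft_' + m.group(1) == 'ft_12345'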
    srcid = CalcSrcID(context['srcurl'])
    if not srcid:
        return None    # suppress it
    context['srcid'] = srcid
    return context


def FindArticles(sesh):
    """ get a set of articles to scrape from the rss feeds """
    articles = ScraperUtils.FindArticlesFromRSS(blog_feeds, u'skynews', ScrubFunc)
    return articles


def ContextFromURL(url):
    """Build up an article scrape context from a bare url."""
    # NOTE: urls from the rss feed have a couple of extra components which
    # we _could_ strip out here...
    context = {}
    context['permalink'] = url
    context['srcurl'] = url
    context['srcid'] = CalcSrcID(url)
    # context['srcorgname'] = u'skynews'
    context['lastseen'] = datetime.now()
    return context


if __name__ == "__main__":
    ScraperUtils.scraper_main(FindArticles, ContextFromURL, Extract)
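# For clarity: ScrubFunc (top of this excerpt) doubles as a filter for
# FindArticlesFromRSS -- returning None drops the feed entry entirely.
# A hypothetical illustration of that contract (the url is made up, and
# whether it yields a srcid depends on CalcSrcID):
#
#   ctx = {'srcurl': 'http://news.sky.com/some/unrecognised/page'}
#   if ScrubFunc(ctx, None) is None:
#       print "suppressed: no srcid for", ctx['srcurl']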
    return art


def TidyURL(url):
    """ Tidy up URL - trim off params, query, fragment... """
    o = urlparse.urlparse(url)
    url = urlparse.urlunparse((o[0], o[1], o[2], '', '', ''))
    return url


def ContextFromURL(url):
    """Build up an article scrape context from a bare url."""
    url = TidyURL(url)
    context = {}
    context['srcurl'] = url
    context['permalink'] = url
    context['srcorgname'] = u'times'
    context['lastseen'] = datetime.now()
    return context


if __name__ == "__main__":
    # create a url opener which remembers cookies (as well as throttling and all the other uber-opener stuff)
    cj = cookielib.LWPCookieJar()
    opener = ScraperUtils.build_uber_opener(cookiejar=cj)
    # large max_errors to handle video-only pages
    ScraperUtils.scraper_main(FindArticles, ContextFromURL, Extract, max_errors=200, prep=Prep, sesh=opener)
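# Example of TidyURL in action (the URL is hypothetical; only the query and
# fragment stripping matters):
#
#   >>> TidyURL('http://www.thetimes.co.uk/tto/news/?shareToken=abc#comments')
#   'http://www.thetimes.co.uk/tto/news/'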