import urllib2
from mock import create_autospec, call

def test_crawler_recurses(self):
    # Arrange
    html = """
    <html><body><a href="http://testurl.com/testpage.html">Link text</a></body></html>
    """
    initial_url = 'http://www.initialurl.com/'
    mock_urllib = create_autospec(urllib2)
    # Wire the mocked urlopen to serve the fixture HTML so the crawler
    # actually sees the link it is expected to follow.
    mock_urllib.urlopen.return_value.read.return_value = html
    crawler = Crawler(mock_urllib)

    # Act
    crawler.crawl([initial_url])

    # Assert
    expected_calls = [
        call.urlopen(initial_url),
        call.urlopen('http://testurl.com/testpage.html'),
    ]
    mock_urllib.assert_has_calls(expected_calls)
def testItReturnsTheCrawledUrls(self):
    crawler = Crawler()
    urlsToCrawl = ['http://google.se', 'http://aftonbladet.se']
    for url in urlsToCrawl:
        crawler.add_to_crawl(url)
    result = crawler.crawl()
    self.assertEqual(
        urlsToCrawl, result,
        'Not all URLs that were supposed to be crawled were crawled.')
import pickle

def scrape_documents(min_count=0,
                     url_seeds=GlobalConfiguration.DEFAULT_URL_SEEDS):
    doc_count = 0
    s = Crawler(url_seeds)
    docs = s.crawl(min_count)
    # min_count <= 0 means "take everything the crawler produces".
    while min_count <= 0 or doc_count < min_count:
        for doc in docs:
            temp_file = get_new_file()
            pickle.dump(doc, temp_file)
            temp_file.close()
            log.debug('saved image doc from %s', doc.url)
            doc_count += 1
            if doc_count % 100 == 0:
                log.info('%d images and counting...', doc_count)
        # The document stream is exhausted; stop rather than spin on an
        # empty iterator, even if min_count was not reached.
        break
    log.info('finished indexing images.')
    log.info('%d documents indexed', doc_count)
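# scrape_documents() above relies on a get_new_file() helper that is not
# defined in this snippet. A minimal sketch, assuming the helper only needs
# to hand back a fresh writable binary file for one pickled document; the
# tempfile-based approach here is an assumption, not the original code.
import tempfile

def get_new_file():
    # Hypothetical implementation: a fresh, uniquely named binary file.
    # delete=False keeps the file on disk after close() so the pickled
    # document survives.
    return tempfile.NamedTemporaryFile(mode='wb', suffix='.pickle',
                                       delete=False)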
from crawler.Crawler import Crawler

if __name__ == '__main__':
    crawler = Crawler()
    crawler.crawl('http://www.prestigetime.com/')
def testItCanCrawl(self):
    crawler = Crawler()
    crawler.add_to_crawl('http://google.se')
    crawler.crawl()
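# The snippets above assume a small Crawler surface: a constructor that can
# take a urllib-style module (so tests inject a mock), add_to_crawl() to
# queue URLs, and crawl() that visits a given list (or the queued URLs),
# follows links, and returns the visited URLs. This is only a sketch of that
# assumed interface; the regex link extraction and the visited-list
# de-duplication are assumptions, not the original implementation.
import re
import urllib2


class Crawler(object):
    # Deliberately loose href extraction; a real crawler would use an
    # HTML parser instead of a regex.
    LINK_RE = re.compile(r'<a\s[^>]*href="([^"]+)"', re.IGNORECASE)

    def __init__(self, url_lib=urllib2):
        self._url_lib = url_lib  # injected so tests can pass a mocked urllib2
        self._to_crawl = []

    def add_to_crawl(self, url):
        self._to_crawl.append(url)

    def crawl(self, urls=None):
        # Crawl an explicit URL (or list of URLs) if given, otherwise the
        # queued URLs.
        if isinstance(urls, str):
            urls = [urls]
        pending = list(urls) if urls is not None else list(self._to_crawl)
        crawled = []
        while pending:
            url = pending.pop(0)
            if url in crawled:
                continue  # skip already-visited URLs so recursion terminates
            crawled.append(url)
            html = self._url_lib.urlopen(url).read()
            pending.extend(self.LINK_RE.findall(html))
        return crawled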