def testPopNextURLAndMarkAsVisitedHandlesCount(self):
    # Checks the order in which CrawlerThread.PopNextURLAndMarkAsVisited
    # hands back stored URLs: the row seeded with links_to=1000 is expected
    # on the first pop and the row seeded with links_to=500 on the second.

    # Seed the test database; rows are added in the opposite order to the
    # one the pops are expected to produce.
    seed_rows = (
        ('http://www.microsoft.com/', 500),
        ('http://www.google.com/', 1000),
    )
    db_session = self.database_handler.CreateSession()
    for address, inbound_links in seed_rows:
        record = URL(address, 1)
        record.links_to = inbound_links
        db_session.add(record)
    db_session.commit()

    # Pop twice from a single crawler thread, checking each result before
    # the next pop is issued.
    worker = CrawlerThread(self.database_handler, None, self.url_lock)
    expected_order = (
        'http://www.google.com/',
        'http://www.microsoft.com/',
    )
    for expected_address in expected_order:
        popped = worker.PopNextURLAndMarkAsVisited()
        self.assertEqual(expected_address, popped)
def testHandleHtmlResourceIncrementsLinksTo(self):
    # Checks that CrawlerThread.HandleHtmlResource, given a page containing
    # one anchor to an already-stored URL, leaves exactly one row for that
    # URL and raises its links_to value from 1000 to 1001.
    linked_address = 'http://www.google.com/'

    # Seed the test database with the URL the page will link to.
    db_session = self.database_handler.CreateSession()
    seeded = URL(linked_address, 1)
    seeded.links_to = 1000
    db_session.add(seeded)
    db_session.commit()

    # Build an in-memory stand-in for a fetched page; the handler receives
    # a file-like object that also carries a `url` attribute.
    markup = textwrap.dedent(
        """ <a href='http://www.google.com/'>Google</a> """)
    page = StringIO.StringIO(markup)
    page.url = 'http://www.test.com'

    # Run the HTML handler over the fake page.
    worker = CrawlerThread(self.database_handler, None, self.url_lock)
    worker.HandleHtmlResource(page)

    # Re-read the row through the original session and verify the counter.
    matches = db_session.query(URL).filter(URL.url == linked_address)
    self.assertEqual(1, matches.count())
    stored = matches.first()
    self.assertEqual(1001, stored.links_to)