def testPageCrawl2(self):
    """Run the page checker against stackoverflow.com and print its links.

    The site root is handed to ``PageChecker.check_internal_page`` first
    (its return value is discarded), then one question page is checked and
    the external and internal links returned for it are printed. Nothing
    is asserted; the links are only printed.
    """
    root_url = "http://stackoverflow.com/"
    site_checker = SiteThreadChecker(
        full_link=root_url,
        thread_pool_size=2,
        max_page=1000,
        max_level=10,
    )
    site_checker.agent = "VegeBot-Careful"

    # NOTE(review): response_code=999 looks like a placeholder for a page
    # that has not been fetched yet -- confirm against OnSiteLink.
    root_page = OnSiteLink(link=root_url, response_code=999)
    question_page = OnSiteLink(
        link="http://stackoverflow.com/questions/5836674/why-does-debug-false-setting-make-my-django-static-files-access-fail",
        response_code=999,
    )

    # The root page is checked before the question page; only the second
    # call's result is kept.
    PageChecker.check_internal_page(site_checker, root_page)
    internal, external = PageChecker.check_internal_page(site_checker, question_page)

    # External links are reported first, then internal ones.
    for heading, links in (("external links:", external),
                           ("internal links:", internal)):
        print(heading)
        for entry in links:
            print(entry)
def testPageCrawl(self):
    """Run the page checker on one secondcityhockey.com article and print its links.

    A single article page is passed to ``PageChecker.check_internal_page``
    and the external and internal links it returns are printed. Nothing is
    asserted; the links are only printed.
    """
    root_url = "http://www.secondcityhockey.com"
    site_checker = SiteThreadChecker(
        full_link=root_url,
        thread_pool_size=2,
        max_page=1000,
        max_level=10,
    )
    site_checker.agent = "VegeBot-Careful"

    # The root-page link object is still constructed, but the call that
    # checked it is commented out below, so it is otherwise unused.
    root_page = OnSiteLink(link=root_url, response_code=999)
    article_page = OnSiteLink(
        link="http://www.secondcityhockey.com/2014/10/9/6951991/state-of-the-blog-lets-party-and-be-nice-and-hip-and-cool/in/6645018",
        response_code=999,
    )
    # Alternatives left disabled by the original author:
    # article_page = OnSiteLink(link="http://www.secondcityhockey.com/2014/", response_code=999)
    # PageChecker.check_internal_page(site_checker, root_page)

    internal, external = PageChecker.check_internal_page(site_checker, article_page)

    # External links are reported first, then internal ones.
    for heading, links in (("external links:", external),
                           ("internal links:", internal)):
        print(heading)
        for entry in links:
            print(entry)
def testPageCrawl2(self):
    """Check two stackoverflow.com pages and print the second page's links.

    ``PageChecker.check_internal_page`` is called on the site root (result
    ignored) and then on a question page. The external links returned for
    the question page are printed, followed by the internal links. The
    method makes no assertions.
    """
    base = "http://stackoverflow.com/"

    crawler = SiteThreadChecker(full_link=base, thread_pool_size=2,
                                max_page=1000, max_level=10)
    crawler.agent = "VegeBot-Careful"

    # NOTE(review): 999 appears to be a placeholder response code for a
    # link that has not been requested yet -- confirm against OnSiteLink.
    home = OnSiteLink(link=base, response_code=999)
    target = OnSiteLink(
        link="http://stackoverflow.com/questions/5836674/why-does-debug-false-setting-make-my-django-static-files-access-fail",
        response_code=999,
    )

    # The root page is checked first; its return value is not used.
    PageChecker.check_internal_page(crawler, home)
    result = PageChecker.check_internal_page(crawler, target)
    internal, external = result

    print("external links:")
    for ext_link in external:
        print(ext_link)

    print("internal links:")
    for int_link in internal:
        print(int_link)
def testPageCrawl(self):
    """Check one secondcityhockey.com article page and print its links.

    ``PageChecker.check_internal_page`` is called on a single article URL.
    The external links it returns are printed, followed by the internal
    links. The method makes no assertions.
    """
    base = "http://www.secondcityhockey.com"

    crawler = SiteThreadChecker(full_link=base, thread_pool_size=2,
                                max_page=1000, max_level=10)
    crawler.agent = "VegeBot-Careful"

    # Constructed as in the original even though the check that used it
    # is commented out below.
    home = OnSiteLink(link=base, response_code=999)
    target = OnSiteLink(
        link="http://www.secondcityhockey.com/2014/10/9/6951991/state-of-the-blog-lets-party-and-be-nice-and-hip-and-cool/in/6645018",
        response_code=999,
    )
    # Disabled alternatives kept from the original:
    # target = OnSiteLink(link="http://www.secondcityhockey.com/2014/", response_code=999)
    # PageChecker.check_internal_page(crawler, home)

    result = PageChecker.check_internal_page(crawler, target)
    internal, external = result

    print("external links:")
    for ext_link in external:
        print(ext_link)

    print("internal links:")
    for int_link in internal:
        print(int_link)