def testRequestAllLink(self):
    # Fetch a page politely (no retries) and print every link on it after
    # normalising each one against the page's scheme and domain.
    url = "http://www.jehovahs-witness.com"
    agent = "VegeBot-Careful"
    source = LinkChecker.get_page_source(url, agent=agent, from_src="*****@*****.**", retries=0)
    links = LinkChecker.get_all_links_from_source(source)
    for link in links:
        paras = urlsplit(link)
        page_scheme, page_domain = paras[0], paras[1]
        print(LinkChecker.get_valid_link(page_domain, link.strip(), page_scheme))
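# A hedged, stdlib-only sketch of the idea behind get_valid_link above (not the
# project's actual implementation): relative hrefs are resolved against the
# page's scheme and domain. urljoin covers the same cases; resolve_link_sketch
# and the example URLs below are hypothetical.
from urllib.parse import urljoin

def resolve_link_sketch(base_url: str, href: str) -> str:
    # urljoin handles absolute URLs, root-relative paths and protocol-relative
    # "//host/path" forms in a single call
    return urljoin(base_url, href.strip())

print(resolve_link_sketch("http://www.example.com/articles/index.html", "/about"))
# -> http://www.example.com/about
print(resolve_link_sketch("http://www.example.com/articles/index.html", "page2.html"))
# -> http://www.example.com/articles/page2.html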
def testRemoveFootprint2(self):
    # Strip the Wayback Machine footprint from an archived page, then collect
    # every href/src attribute left in the cleaned tree.
    link = "http://web.archive.org/web/20140711025724/http://susodigital.com/"
    page_source = LinkChecker.get_page_source(link)
    bs4_tree = LinkUtility.remove_archive_org_footprint(page_source.text)
    link_list = []
    for child in bs4_tree.find_all():
        if isinstance(child, bs4.Tag):
            if "href" in child.attrs:
                link_list.append(child["href"])
            elif "src" in child.attrs:
                link_list.append(child["src"])
    for item in link_list:
        print(item)
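# Hedged sketch, not LinkUtility's actual code: remove_archive_org_footprint
# presumably rewrites Wayback Machine URLs like
#   http://web.archive.org/web/20140711025724/http://susodigital.com/
# back to their original targets. The regex below illustrates that one rewrite;
# strip_wayback_prefix is a hypothetical helper name.
import re

_WAYBACK = re.compile(r"^https?://web\.archive\.org/web/\d+(?:[a-z_]+)?/(.+)$")

def strip_wayback_prefix(url: str) -> str:
    match = _WAYBACK.match(url)
    return match.group(1) if match else url

print(strip_wayback_prefix("http://web.archive.org/web/20140711025724/http://susodigital.com/"))
# -> http://susodigital.com/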
def test_from_url(self):
    response = LinkChecker.get_page_source(
        link="http://www.frenchweb.fr/sisense-decroche-50-millions-de-dollars-pour-accelerer-dans-lanalyse-de-donnees/221848")
    print(langid.classify(response.text))
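# Hedged sketch of what the langid call above returns: classify() yields a
# (language_code, score) pair, so a simple language filter over fetched pages
# could look like this. looks_french is a hypothetical helper, not part of the
# project.
import langid

def looks_french(text: str) -> bool:
    lang, _score = langid.classify(text)
    return lang == "fr"

print(looks_french("Bonjour tout le monde, ceci est un article en français."))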
def test_get_all_links(self):
    # Extract and print every link found in an archived page's source.
    link = "http://web.archive.org/web/20140711025724/http://susodigital.com/"
    source = LinkChecker.get_page_source(link)
    all_links = LinkChecker.get_all_links_from_source(source)
    for found_link in all_links:  # renamed to avoid shadowing the source URL above
        print(found_link)
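# Hedged sketch, assuming get_all_links_from_source does something similar:
# pull every href out of a page body with BeautifulSoup. extract_hrefs and the
# "html.parser" choice are assumptions, not the project's code.
import bs4

def extract_hrefs(html: str) -> list:
    soup = bs4.BeautifulSoup(html, "html.parser")
    return [a["href"] for a in soup.find_all("a", href=True)]

print(extract_hrefs('<a href="/one">1</a><p>x</p><a href="http://example.com/two">2</a>'))
# -> ['/one', 'http://example.com/two']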
def testRequest(self):
    url = "http://127.0.0.1:8000/"
    agent = "VegeBot"
    source = LinkChecker.get_page_source(url, agent=agent, from_src="*****@*****.**")
    print(source)
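# Hedged sketch of what get_page_source presumably does under the hood: issue a
# GET with a custom User-Agent and a "From" contact header (a courtesy header
# some crawlers send). fetch_page is a hypothetical helper, not the project's
# implementation.
import requests

def fetch_page(url: str, agent: str, from_src: str, timeout: int = 10):
    headers = {"User-Agent": agent, "From": from_src}
    try:
        return requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException:
        return None

response = fetch_page("http://127.0.0.1:8000/", "VegeBot", "*****@*****.**")
print(response if response is None else response.status_code)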
def check_internal_page(checker: SiteChecker, page: OnSiteLink, timeout=10) -> (list, list):
    # Crawl a single on-site page: honour robots.txt, fetch the page under the
    # shared crawl-delay lock, then sort its links into internal pages (queued
    # for further crawling) and external domains (queued for DNS checks).
    internal_pages = []
    external_pages = []
    if isinstance(checker.robot_agent, Rules):
        try:
            if not checker.robot_agent.allowed(page.link, agent=checker.agent):
                return [], []
        except Exception:
            return [], []
    use_lxml_parser = checker.use_lxml_parser()
    with checker.task_control_lock:
        # serialise requests so the per-site crawl delay is respected
        time.sleep(checker.site_crawl_delay)
        response = LinkChecker.get_page_source(page.link, timeout,
                                               agent=checker.agent, from_src=checker.agent_from)
    if response is None or response.status_code == ResponseCode.LinkError:
        return [], []
    paras = urlsplit(page.link)
    page_scheme, page_domain = paras[0], paras[1]
    links = LinkChecker.get_webpage_links_from_source(response, use_lxml_parser)
    for link in links:
        link_type = OnSiteLink.TypeOutbound
        valid_link = LinkChecker.get_valid_link(page_domain, link.strip(), page_scheme)
        try:
            link_paras = urlsplit(valid_link)
            link_scheme, link_domain, link_path = link_paras[0], link_paras[1], link_paras[2]
            if link_domain.lower().startswith("mailto:"):
                continue
            if not LinkChecker.might_be_link_html_page(link_path):
                continue
        except Exception:
            continue
        if checker.sub_domain_no_local in link_domain:  # on-site link
            if checker.data_source.all_record > checker.max_page:
                continue
            link_type = OnSiteLink.TypeOnSite
        else:  # external link: keep only its scheme and domain
            valid_link = link_scheme + "://" + link_domain
        if link_type == OnSiteLink.TypeOnSite:
            if checker.is_link_in_cache(valid_link):
                continue
            checker.add_link_to_cache(valid_link)
            internal_page = (valid_link, ResponseCode.LinkOK,
                             page.link_level + 1, OnSiteLink.TypeOnSite)
            internal_pages.append(internal_page)
        else:
            stripped = str(link_domain).lower().strip()
            if stripped in checker.external_cache_list:
                continue
            if len(checker.external_cache_list) < checker.external_cache_size:
                checker.external_cache_list.append(stripped)
            external_page = (stripped, ResponseCode.DNSError)
            external_pages.append(external_page)
    return internal_pages, external_pages
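# Hedged, stdlib-only sketch of the on-site/off-site split performed above: a
# link counts as internal when the crawl's sub-domain appears in its netloc,
# and external links are collapsed to scheme://domain. classify_link is a
# hypothetical helper, not part of the project.
from urllib.parse import urlsplit

def classify_link(sub_domain: str, valid_link: str) -> tuple:
    parts = urlsplit(valid_link)
    scheme, domain = parts[0], parts[1]
    if sub_domain in domain:
        return "internal", valid_link
    return "external", scheme + "://" + domain

print(classify_link("example.com", "http://blog.example.com/post/1"))
# -> ('internal', 'http://blog.example.com/post/1')
print(classify_link("example.com", "https://other.org/page?x=1"))
# -> ('external', 'https://other.org')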