def testRequestAllLink(self):
     url = "http://www.jehovahs-witness.com"
     agent = "VegeBot-Careful"
     source = LinkChecker.get_page_source(url, agent=agent, from_src="*****@*****.**", retries=0)
     links = LinkChecker.get_all_links_from_source(source)
     for link in links:
         paras = urlsplit(link)
         page_scheme, page_domain = paras[0], paras[1]
         print(LinkChecker.get_valid_link(page_domain, link.strip(), page_scheme))
 def testRemoveFootprint2(self):
     link = "http://web.archive.org/web/20140711025724/http://susodigital.com/"
     page_source = LinkChecker.get_page_source(link)
     bs4_tree = LinkUtility.remove_archive_org_footprint(page_source.text)
     link_list = []
     for child in bs4_tree.find_all():
         if isinstance(child, bs4.Tag):
             if "href" in child.attrs:
                 link_list.append(child["href"])
             elif "src" in child.attrs:
                 link_list.append(child["src"])
     for item in link_list:
         print(item)
 def test_from_url(self):
     response = LinkChecker.get_page_source(link="http://www.frenchweb.fr/sisense-decroche-50-millions-de-dollars-pour-accelerer-dans-lanalyse-de-donnees/221848")
     print(langid.classify(response.text))
 def test_get_all_links(self):
     link = "http://web.archive.org/web/20140711025724/http://susodigital.com/"
     source = LinkChecker.get_page_source(link)
     all_links = LinkChecker.get_all_links_from_source(source)
     for link in all_links:
         print(link)
 def testRequest(self):
     url = "http://127.0.0.1:8000/"
     agent = "VegeBot"
     source = LinkChecker.get_page_source(url, agent=agent, from_src="*****@*****.**")
     print(source)
    def check_internal_page(checker: SiteChecker, page: OnSiteLink, timeout=10) -> ([], []):
        internal_pages = []
        external_pages = []
        #
        # if isinstance(checker.robot_agent, robotparser.RobotFileParser):
        #     if not checker.robot_agent.can_fetch(useragent=checker.agent, url=page.link):
        #         return [], []
        # print("checking internal_page", page)

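        # Respect robots.txt: bail out if the site's Rules object disallows this URL for the crawler's agent.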
        if isinstance(checker.robot_agent, Rules):
            try:
                if not checker.robot_agent.allowed(page.link, agent=checker.agent):
                    return [], []
            except Exception:  # treat a failed robots lookup as disallowed
                return [], []

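        # Fetch the page source while holding the shared task lock, waiting out the per-site crawl delay first.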
        use_lxml_parser = checker.use_lxml_parser()
        with checker.task_control_lock:
            time.sleep(checker.site_crawl_delay)
            response = LinkChecker.get_page_source(page.link, timeout, agent=checker.agent, from_src=checker.agent_from)
        if response is None or response.status_code == ResponseCode.LinkError:
            return [], []
        paras = urlsplit(page.link)
        page_scheme, page_domain = paras[0], paras[1]

        links = LinkChecker.get_webpage_links_from_source(response, use_lxml_parser)

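        # Classify every link on the page: on-site links are queued for further crawling, external links are recorded by domain only.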
        for link in links:
            link_type = OnSiteLink.TypeOutbound
            valid_link = LinkChecker.get_valid_link(page_domain, link.strip(), page_scheme)
            # if PageChecker.is_link_in_list(valid_link, new_pages):
            #     continue
            try:
                link_paras = urlsplit(valid_link)
                link_scheme, link_domain, link_path = link_paras[0], link_paras[1], link_paras[2]
                if link_domain.lower().startswith("mailto:"):
                    continue
                if not LinkChecker.might_be_link_html_page(link_path):
                    continue
            except Exception:  # skip links that cannot be parsed
                continue
            # if str(link_domain).endswith(checker.root_domain):
            if checker.sub_domain_no_local in link_domain:  # important change
                if checker.data_source.all_record > checker.max_page:
                    continue
                link_type = OnSiteLink.TypeOnSite
            else: # external
                valid_link = link_scheme + "://" + link_domain
            if link_type == OnSiteLink.TypeOnSite:
                if checker.is_link_in_cache(valid_link):
                    continue
                else:
                    checker.add_link_to_cache(valid_link)
                    internal_page = (valid_link, ResponseCode.LinkOK, page.link_level+1, OnSiteLink.TypeOnSite)
                    internal_pages.append(internal_page)
            else:
                stripped = str(link_domain).lower().strip()
                if stripped in checker.external_cache_list:
                    continue
                if len(checker.external_cache_list) < checker.external_cache_size:
                    checker.external_cache_list.append(stripped)
                external_page = (stripped, ResponseCode.DNSError)
                external_pages.append(external_page)
        return internal_pages, external_pages
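    # --- Illustration only (not from the original project) ---
    # A minimal sketch of how the (internal_pages, external_pages) tuples returned
    # above might be consumed by a crawl loop. PageChecker as the owning class and
    # the OnSiteLink(link, response_code, link_level, link_type) constructor are
    # assumptions made for this sketch, not confirmed APIs.
    def crawl_next_level(checker, frontier):
        next_frontier = []
        for page in frontier:  # each item is an OnSiteLink
            internal, external = PageChecker.check_internal_page(checker, page)
            # on-site entries: (link, ResponseCode.LinkOK, link_level + 1, OnSiteLink.TypeOnSite)
            for link, code, level, link_type in internal:
                next_frontier.append(OnSiteLink(link, code, level, link_type))  # assumed constructor
            # external entries: (domain, ResponseCode.DNSError) -- domain only, not crawled further
            for domain, code in external:
                print("external domain found:", domain)
        return next_frontier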