# Standard-library imports used by the snippets in this section; project-internal
# names (LinkChecker, ArchiveOrg, ArchiveStruct, ArchiveDetail, LinkUtility,
# test_response, OnSiteLink, ResponseCode, SiteChecker helpers, etc.) are assumed
# to be provided by the surrounding package.
import threading
import time
from collections import deque
from datetime import datetime
from multiprocessing import pool
from threading import Event
from urllib.parse import urlsplit


def get_archives_lang(root_domain: str, thread_size=10, profile_check=300) -> list:
    url = LinkChecker.get_valid_link(root_domain, link="")
    # negative limit requests the most recent `profile_check` snapshots
    profiles = ArchiveOrg.get_url_info(url, min_size=1, limit=-profile_check)
    today_stamp = datetime.utcnow().timestamp()  # computed but not yet used; see the sketch below
    for item in profiles:
        if isinstance(item, ArchiveStruct):
            timestamp = item.get_datestamp_unix_time()
            print(str(item), " converted:", str(timestamp))
    return []
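# A minimal sketch (not part of the original code) of how the unused today_stamp and
# the per-profile unix timestamps above could be combined to keep only recent snapshots.
# It assumes get_datestamp_unix_time() returns seconds since the epoch; the helper name
# filter_recent_profiles and the max_age_days parameter are hypothetical.
def filter_recent_profiles(profiles, max_age_days=365) -> list:
    today_stamp = datetime.utcnow().timestamp()
    cutoff = today_stamp - max_age_days * 24 * 3600
    return [item for item in profiles
            if isinstance(item, ArchiveStruct) and item.get_datestamp_unix_time() >= cutoff]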
def testRequestAllLink(self):
    url = "http://www.jehovahs-witness.com"
    agent = "VegeBot-Careful"
    source = LinkChecker.get_page_source(url, agent=agent, from_src="*****@*****.**", retries=0)
    links = LinkChecker.get_all_links_from_source(source)
    for link in links:
        paras = urlsplit(link)
        page_scheme, page_domain = paras[0], paras[1]
        print(LinkChecker.get_valid_link(page_domain, link.strip(), page_scheme))
def get_best_archive(root_domain: str, thread_size=100, profile_check=10,
                     pass_threshold=0.8, res_limit=2000) -> ArchiveDetail:
    """
    Get the best profile from archive.org by doing profile spectrum analysis, given a root domain name.
    Spectrum analysis: comparison between the resources of the current profile and all historic resources.
    :param root_domain: root domain in str, e.g.: "google.co.uk"
    :param thread_size: number of threads used to check resource links simultaneously
    :param profile_check: max number of profiles to check
    :param pass_threshold: threshold defining whether a profile is good enough.
    :param res_limit: number of resource links in the domain resource spectrum, including css, js, html etc.
    :return: an ArchiveDetail for the best archive found, with per-resource-type good rates
    """
    url = LinkChecker.get_valid_link(root_domain, link="")
    profiles = ArchiveOrg.get_url_info(url, min_size=1, limit=-profile_check)
    timestamp = ""
    info = ArchiveOrg.get_domain_urls(url, limit=res_limit)
    res_count = len(info)
    archive = None
    current_rate = 0.0
    min_broken_res_count = 0
    good_rate_web_page = 0
    good_rate_image = 0
    good_rate_css = 0
    good_rate_js = 0
    good_rate_other = 0
    total_web_page_min = 0
    total_js_min = 0
    total_css_min = 0
    total_image_min = 0
    total_other_min = 0
    if res_count > 0:
        for profile in profiles:
            if isinstance(profile, ArchiveStruct):
                total_web_page = 0
                total_js = 0
                total_css = 0
                total_image = 0
                total_other = 0
                broken_web_page = 0
                broken_js = 0
                broken_css = 0
                broken_image = 0
                broken_other = 0
                test_pool = pool.ThreadPool(processes=thread_size)
                timestamp = profile.date_stamp
                print("checking:", str(profile))
                links = []
                for item in info:
                    item.date_stamp = timestamp
                    links.append(ArchiveOrg.get_archive_link(item))
                results = [test_pool.apply_async(func=test_response, args=(x,)) for x in links]
                returned = [y.get() for y in results]
                test_pool.terminate()
                for result_good, link_cls in returned:
                    if link_cls == LinkUtility.EXT_WEBPAGE:
                        total_web_page += 1
                        if not result_good:
                            broken_web_page += 1
                    elif link_cls == LinkUtility.EXT_CSS:
                        total_css += 1
                        if not result_good:
                            broken_css += 1
                    elif link_cls == LinkUtility.EXT_JS:
                        total_js += 1
                        if not result_good:
                            broken_js += 1
                    elif link_cls == LinkUtility.EXT_IMAGE:
                        total_image += 1
                        if not result_good:
                            broken_image += 1
                    else:
                        total_other += 1
                        if not result_good:
                            broken_other += 1
                # total broken resources across all categories
                broken_res_count = broken_web_page + broken_css + broken_js + broken_image + broken_other
                passed = False
                # despite the name, this is the fraction of resources that are NOT broken
                total_broken_rate = 1 - broken_res_count / res_count
                if total_broken_rate >= pass_threshold:
                    passed = True
                if total_broken_rate > current_rate:
                    current_rate = total_broken_rate
                    archive = profile
                    good_rate_web_page = 0 if total_web_page == 0 else 1 - broken_web_page / total_web_page
                    good_rate_image = 0 if total_image == 0 else 1 - broken_image / total_image
                    good_rate_css = 0 if total_css == 0 else 1 - broken_css / total_css
                    good_rate_js = 0 if total_js == 0 else 1 - broken_js / total_js
                    good_rate_other = 0 if total_other == 0 else 1 - broken_other / total_other
                    total_web_page_min = total_web_page
                    total_js_min = total_js
                    total_css_min = total_css
                    total_image_min = total_image
                    total_other_min = total_other
                    min_broken_res_count = total_broken_rate  # best good-resource rate seen so far
                print("total:", res_count, " broken res:", broken_res_count,
                      " stamp: ", profile.date_stamp, " pass?", passed,
                      " rate:", total_broken_rate)
    return ArchiveDetail(root_domain, archive_link=ArchiveOrg.get_archive_link(archive),
                         total_res=res_count, good_res_rate=min_broken_res_count,
                         total_web_page=total_web_page_min, good_webpage_rate=good_rate_web_page,
                         total_css=total_css_min, good_css_rate=good_rate_css,
                         total_js=total_js_min, good_js_rate=good_rate_js,
                         total_image=total_image_min, good_image_rate=good_rate_image,
                         total_other=total_other_min, good_other_rate=good_rate_other)
def __init__(self, full_link: str="", data_source: SiteTempDataSrcInterface=None, controller: SiteCheckerController=None, max_level=10, max_page=1000, delegate=None, output_buff_size=2000, output_queue=None, output_all_external=False, result_delegate=None, memory_control_terminate_event=None, check_robot_text=True, **kwargs): """ :param full_link: The full link of a domain, e.g: https://www.google.co.uk :param domain: domain to crawl :param max_level: stop crawling if it reaches this level :param max_page: maximum pages to check within a site, also stop crawling :param delegate: if this is not None, then it will send the latest result of external domain of ResponseCode==404 or 999 :param result_delegate: send site_info upon finish :param memory_control_terminate_event: if this is not None and being set, it will be able to terminate an external memory controlled process. :return: """ FeedbackInterface.__init__(self, **kwargs) #super(SiteChecker, self).__init__(**kwargs) if full_link is None or len(full_link) == 0: raise ValueError() original_path = "" try: paras = urlsplit(full_link) self.scheme, self.domain, original_path = paras[0], paras[1], paras[2] except: pass domain_data = LinkChecker.get_root_domain(full_link, False) self.root_domain = domain_data[1] self.sub_domain = domain_data[4] self.domain_suffix = domain_data[5] self.sub_domain_no_local = self.sub_domain.strip(self.domain_suffix) if self.scheme == "": self.scheme = "http" if self.domain == "": self.domain = self.root_domain self.orginal_link = full_link self.domain_link = LinkChecker.get_valid_link(self.root_domain, full_link, self.scheme) self.max_level = max_level self.max_page = max_page self.page_count = 0 # keep track page done self._page_count_shadow = 0 # track previous count self._all_page_count_shadow = 0 #track previous count in datasource self.internal_page_count = 0 self.internal_page_last_count = 0 self.page_allocated = 0 self.current_level = 0 # if this = 0, it is root domain/home_page self._stop_event = Event() valid_file_name = SiteTempDataSrcInterface.get_valid_file_name(self.domain_link) self._external_db_buffer = ExternalTempDataDiskBuffer(valid_file_name+".ext.db", self, stop_event=self._stop_event, buf_size=int(output_buff_size/2), dir_path=get_db_buffer_default_dir(), convert_output=False) self._external_db_buffer.append_to_buffer([(self.root_domain, ResponseCode.DNSError),], convert_tuple=False) self._memory_control_terminate_event = memory_control_terminate_event self.task_control_lock = threading.RLock() if data_source is None: #self.data_source = SiteTempDataDisk(self.root_domain, ref_obj=self) self.data_source = SiteTempDataDiskWithBuff(ref=self.domain_link, output_buff_size=output_buff_size, ref_obj=self) else: self.data_source = data_source # a list of OnSiteLink self.delegate = delegate if LinkChecker.might_be_link_html_page(original_path): self.data_source.append(OnSiteLink(self.domain_link, response_code=ResponseCode.LinkOK, link_level=1)) # add the root domain as a starting point self.data_source.append(OnSiteLink(self.scheme + "://www."+self.sub_domain, ResponseCode.LinkOK, link_level=1)) self.data_source.append(OnSiteLink(self.scheme + "://" + self.domain, ResponseCode.LinkOK, link_level=1)) self.cache_list = [] # internal page cache self.page_need_look_up_temp = 0 self.cache_list.append(self.domain_link) if "www." 
not in self.sub_domain: self.cache_list.append(self.scheme + "://www."+self.sub_domain) self.cache_list.append(self.scheme + "://" + self.domain) self.page_need_look_up = self.data_source.count_all() self.cache_size = 500 # create a small cache list to avoid going to check link in file system with lots of read and write self._double_check_cache_lock = threading.RLock() self._double_check_cache = deque(maxlen=self.cache_size) self.external_cache_list = [] self.external_cache_size = 500 # cache that hold external sites self.external_links_checked = 0 self.add_internal_page_OK_only = True self.output_queue = output_queue self.output_all_external = output_all_external self.controller = controller self.result_delegate = result_delegate self.page_count_lock = threading.RLock() self.internal_page_count_lock = threading.RLock() self.level_lock = threading.RLock() self.page_look_up_lock = threading.RLock() self.external_link_check_lock = threading.RLock() self._finihsed = False self.task_control_max = 1 self.agent = "VegeBot (we follow your robots.txt settings before crawling, you can slow down the bot by change the Crawl-Delay parameter in the settings." \ "if you have an enquiry, please email to: [email protected])" self.agent_from = "*****@*****.**" if check_robot_text: self.robot_agent = LinkChecker.get_robot_agent(self.sub_domain, protocol=self.scheme) else: self.robot_agent = None self.site_crawl_delay = 0.60 if isinstance(self.robot_agent, Rules): delay_temp = self.robot_agent.delay(self.agent) if delay_temp is not None and delay_temp != self.site_crawl_delay: self.site_crawl_delay = delay_temp self.task_control_counter = 1 self._speed_penalty_count = 0 self._speed_penalty_threshold = 10 self._progress_logging_speed = 120 self._output_period = 120 self._output_batch_size = 100 self._death_wish_sent = False SiteChecker._is_lxml_parser_exist() self._output_thread = None self._output_queue = None self.progress_logger = ProgressLogger(self._progress_logging_speed, self, self._stop_event) self._status = "Start" self._populate_with_state() # restore laste known state
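# A hedged construction sketch: SiteChecker inherits FeedbackInterface, so the real
# project may require additional framework kwargs; only arguments visible in the
# signature above are used here, and the domain is a placeholder.
def _example_site_checker_setup():
    checker = SiteChecker(full_link="https://www.example.co.uk",
                          max_level=5, max_page=200, check_robot_text=True)
    # the crawl delay stays at the 0.60 s default unless a Rules robot agent
    # supplies a different Crawl-Delay for this user agent
    print(checker.domain_link, checker.site_crawl_delay)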
def check_internal_page(checker: SiteChecker, page: OnSiteLink, timeout=10) -> ([], []):
    internal_pages = []
    external_pages = []
    # if isinstance(checker.robot_agent, robotparser.RobotFileParser):
    #     if not checker.robot_agent.can_fetch(useragent=checker.agent, url=page.link):
    #         return [], []
    # print("checking internal_page", page)
    if isinstance(checker.robot_agent, Rules):
        try:
            if not checker.robot_agent.allowed(page.link, agent=checker.agent):
                return [], []
        except:
            return [], []
    use_lxml_parser = checker.use_lxml_parser()
    with checker.task_control_lock:
        time.sleep(checker.site_crawl_delay)
        response = LinkChecker.get_page_source(page.link, timeout, agent=checker.agent, from_src=checker.agent_from)
    if response is None or response.status_code == ResponseCode.LinkError:
        return [], []
    paras = urlsplit(page.link)
    page_scheme, page_domain = paras[0], paras[1]
    links = LinkChecker.get_webpage_links_from_source(response, use_lxml_parser)
    for link in links:
        link_type = OnSiteLink.TypeOutbound
        valid_link = LinkChecker.get_valid_link(page_domain, link.strip(), page_scheme)
        # if PageChecker.is_link_in_list(valid_link, new_pages):
        #     continue
        try:
            link_paras = urlsplit(valid_link)
            link_scheme, link_domain, link_path = link_paras[0], link_paras[1], link_paras[2]
            if link_domain.lower().startswith("mailto:"):
                continue
            if not LinkChecker.might_be_link_html_page(link_path):
                continue
        except:
            continue
        # if str(link_domain).endswith(checker.root_domain):
        if checker.sub_domain_no_local in link_domain:  # important change
            if checker.data_source.all_record > checker.max_page:
                continue
            link_type = OnSiteLink.TypeOnSite
        else:  # external
            valid_link = link_scheme + "://" + link_domain
        if link_type == OnSiteLink.TypeOnSite:
            if checker.is_link_in_cache(valid_link):
                continue
            else:
                checker.add_link_to_cache(valid_link)
                internal_page = (valid_link, ResponseCode.LinkOK, page.link_level + 1, OnSiteLink.TypeOnSite)
                internal_pages.append(internal_page)
        else:
            stripped = str(link_domain).lower().strip()
            if stripped in checker.external_cache_list:
                continue
            if len(checker.external_cache_list) < checker.external_cache_size:
                checker.external_cache_list.append(stripped)
            external_page = (stripped, ResponseCode.DNSError)
            external_pages.append(external_page)
    return internal_pages, external_pages
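# A minimal consumption sketch for check_internal_page(): it returns plain tuples,
# (link, response_code, level, type) for on-site pages and (domain, response_code)
# for external domains. Constructing the root page below assumes OnSiteLink accepts
# these positional arguments, mirroring the appends in SiteChecker.__init__ above.
def _example_check_internal_page(checker: SiteChecker):
    root_page = OnSiteLink(checker.domain_link, ResponseCode.LinkOK, link_level=1)
    internal, external = check_internal_page(checker, root_page, timeout=10)
    for link, code, level, link_type in internal:
        print("internal:", link, code, level, link_type)
    for domain, code in external:
        print("external:", domain, code)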