def check_whois_with_dns(page: OnSiteLink):
    """Check a link's domain with DNS first and fall back to a whois lookup.

    The link is split with ``LinkChecker.get_root_domain``. If the sub domain
    is empty or the suffix is not in ``TldUtility.TOP_TLD_LIST`` nothing is
    looked up. Otherwise the sub domain (and, when it does not already start
    with ``www.``, the ``www.`` form of the root domain) is tested with
    ``LinkChecker.is_domain_DNS_OK``. Only when no DNS test succeeds is
    ``check_whois`` called.

    :param page: link to examine; when a DNS lookup is attempted its
        ``response_code``, ``link_type`` and ``link`` are overwritten in place.
    :return: the result of ``check_whois(page)`` when the whois fallback runs,
        otherwise the tuple ``(page.link, page.response_code)``.
    """
    real_response_code = ResponseCode.DNSError
    skip_whois_check = False
    try:
        root_result = LinkChecker.get_root_domain(page.link)
        root_domain = root_result[1]
        sub_domain = root_result[4]
        suffix = root_result[5]

        if not sub_domain or suffix not in TldUtility.TOP_TLD_LIST:
            skip_whois_check = True
        else:
            # Check DNS first: try the sub domain, then the "www." variant.
            dns_ok = LinkChecker.is_domain_DNS_OK(sub_domain)
            if not dns_ok and not sub_domain.startswith("www."):
                dns_ok = LinkChecker.is_domain_DNS_OK("www." + root_domain)
            if dns_ok:
                real_response_code = ResponseCode.NoDNSError
                skip_whois_check = True

            # NOTE(review): the page is only rewritten when a DNS lookup was
            # attempted -- confirm this matches the intended layout.
            page.response_code = real_response_code
            page.link_type = OnSiteLink.TypeOutbound
            page.link = root_domain
    except Exception:
        # Best effort: any failure while parsing or resolving means the whois
        # lookup is skipped and the page is returned in its current state.
        skip_whois_check = True

    # Decided after the try block rather than in ``finally`` so that
    # KeyboardInterrupt / SystemExit are not silently discarded by a return.
    if not skip_whois_check and real_response_code == ResponseCode.DNSError:
        return check_whois(page)
    return page.link, page.response_code
def check_external_page(checker: SiteChecker, page: OnSiteLink, timeout=10):
    """Check an external link for a DNS error only and queue it for output.

    The link is reduced to its root domain. Links with an empty sub domain, or
    whose root domain is already in ``checker.external_cache_list``, are
    ignored. Otherwise the root domain is added to the cache (while the cache
    holds fewer than ``checker.external_cache_size`` entries), DNS is tested,
    and ``page`` is updated in place. The page is put on
    ``checker.output_queue`` when ``checker.output_all_external`` is set or
    ``ResponseCode.domain_might_be_expired`` accepts the resulting code.

    Any exception is printed and logged instead of being raised.

    :param checker: site checker that owns the cache, output queue and lock.
    :param page: external link to check; modified in place.
    :param timeout: not used by the current implementation.
    :return: None
    """
    try:
        domain_parts = LinkChecker.get_root_domain(page.link)
        root = domain_parts[1]
        host = domain_parts[4]

        # Nothing to do for an unparsable host or a domain seen before.
        if len(host) == 0 or root in checker.external_cache_list:
            return
        if len(checker.external_cache_list) < checker.external_cache_size:
            checker.external_cache_list.append(root)

        # Keep the page's current code unless one of the DNS checks succeeds.
        status = page.response_code
        resolves = LinkChecker.is_domain_DNS_OK(host)  # check DNS first
        if not resolves and not host.startswith("www."):
            resolves = LinkChecker.is_domain_DNS_OK("www." + root)
        if resolves:
            status = ResponseCode.NoDNSError

        page.response_code = status
        page.link_type = OnSiteLink.TypeOutbound
        page.link = root

        wanted = checker.output_all_external or ResponseCode.domain_might_be_expired(status)
        if wanted and checker.output_queue is not None:
            with checker._queue_lock:
                checker.output_queue.put(page)
    except Exception as err:
        PrintLogger.print(err)
        ErrorLogger.log_error("PageChecker", err, "check_external_page() " + page.link)