def testScrapePageBatch(self):
    """Download an archived copy of every link listed in the batch file and
    append one ArchiveDetail row per completed download to a CSV report."""
    report_csv = "/Users/superCat/Desktop/PycharmProjectPortable/test/profile_archive_downloaded.csv"
    batch_file = "/Users/superCat/Desktop/PycharmProjectPortable/test/profile_test_links.txt"
    # header row first, then one data row per crawled link
    CsvLogger.log_to_file_path(report_csv, [ArchiveDetail.get_title()])
    for archive_link in FileHandler.read_lines_from_file(batch_file):
        abort_event = multiprocessing.Event()
        # only the domain component of the parsed link is used below
        _, domain, _, _, _, _ = LinkUtility.get_link_detail(archive_link)
        root_domain = LinkChecker.get_root_domain(domain)[1]
        landing_path = "/index.html"
        start_attrs = LinkAttrs(link=archive_link,
                                path=landing_path,
                                ref_link="/",
                                shadow_ref_link="/",
                                source=landing_path,
                                res_type=LinkUtility.EXT_WEBPAGE,
                                level=0)
        crawler = ArchiveExplorer(original_domain=root_domain,
                                  link=archive_link,
                                  external_stop_event=abort_event,
                                  download_base_dir=FilePath.get_default_archive_dir(),
                                  max_thread=10,
                                  max_level=2)
        crawler.run()
        CsvLogger.log_to_file_path(report_csv, [crawler.get_archive_detail().to_tuple()])
 def testGetBestProfileBatch(self):
     # Batch profile test: for each root domain listed in the source file, ask
     # ArchiveOrg for its best archive profile and append the result to a CSV.
     # NOTE(review): a method with this exact name is defined again later in
     # this file; the later definition overrides this one, so this copy is
     # dead code — consider removing one of the two.
     file_path = "/Users/superCat/Desktop/PycharmProjectPortable/test/profile_test_source.txt"
     domains = FileHandler.read_lines_from_file(file_path)
     save_path = "/Users/superCat/Desktop/PycharmProjectPortable/test/profile_archive.csv"
     # write the CSV header row before any data rows
     CsvLogger.log_to_file_path(save_path, [ArchiveDetail.get_title()])
     for domain in domains:
         print("begin domain:", domain)
         try:
             archive = ArchiveOrg.get_best_archive(root_domain=domain, thread_size=100, profile_check=10, pass_threshold=0.9, res_limit=2000)
             CsvLogger.log_to_file_path(save_path, [archive.to_tuple()])
         except Exception as ex:
             # best-effort batch run: report the failure and move on to the
             # next domain instead of aborting the whole batch
             print(ex)
 def testGetBestProfileBatch(self):
     """For each root domain in the source file, find its best archive.org
     profile and append the result to a CSV report; failures are printed and
     skipped so the batch keeps going."""
     source_file = "/Users/superCat/Desktop/PycharmProjectPortable/test/profile_test_source.txt"
     output_csv = "/Users/superCat/Desktop/PycharmProjectPortable/test/profile_archive.csv"
     root_domains = FileHandler.read_lines_from_file(source_file)
     # CSV header row goes out before any data rows
     CsvLogger.log_to_file_path(output_csv, [ArchiveDetail.get_title()])
     for root in root_domains:
         print("begin domain:", root)
         try:
             best = ArchiveOrg.get_best_archive(root_domain=root,
                                                thread_size=100,
                                                profile_check=10,
                                                pass_threshold=0.9,
                                                res_limit=2000)
             CsvLogger.log_to_file_path(output_csv, [best.to_tuple()])
         except Exception as ex:
             # best-effort: report and continue with the next domain
             print(ex)
 def testScrapePage(self):
     """Scrape a single archived page end-to-end, then log its ArchiveDetail
     (header row plus one data row) to a CSV report."""
     target_link = "http://web.archive.org/web/20150425143742/http://susodigital.com/"
     halt_event = multiprocessing.Event()
     # only the domain component of the parsed link is used below
     _, domain, _, _, _, _ = LinkUtility.get_link_detail(target_link)
     root_domain = LinkChecker.get_root_domain(domain)[1]
     entry_path = "/index.html"
     start_attrs = LinkAttrs(link=target_link,
                             path=entry_path,
                             ref_link="/",
                             shadow_ref_link="/",
                             source=entry_path,
                             res_type=LinkUtility.EXT_WEBPAGE,
                             level=0)
     scraper = ArchiveExplorer(original_domain=root_domain,
                               link=target_link,
                               external_stop_event=halt_event,
                               download_base_dir=FilePath.get_default_archive_dir(),
                               max_thread=10,
                               max_level=2)
     scraper.run()
     report_csv = "/Users/superCat/Desktop/PycharmProjectPortable/test/profile_archive_downloaded.csv"
     CsvLogger.log_to_file_path(report_csv, [ArchiveDetail.get_title()])
     CsvLogger.log_to_file_path(report_csv, [scraper.get_archive_detail().to_tuple()])
Example #5
0
 def get_archive_detail(self) -> ArchiveDetail:
     """Summarise the crawl counters held on this instance as an ArchiveDetail:
     one good-rate per resource type plus an overall good-rate."""
     def good_rate(broken, total):
         # no resources of a type yields rate 0 instead of dividing by zero
         return 0 if total == 0 else 1 - broken / total

     all_broken = (self._broken_js_count + self._broken_css_count
                   + self._broken_image_count + self._broken_others_count
                   + self._broken_webpage_count)
     return ArchiveDetail(self._original_domain,
                          self._archive_link,
                          self._total_res_done,
                          good_res_rate=good_rate(all_broken, self._total_res_done),
                          total_web_page=self._total_webpage_count,
                          good_webpage_rate=good_rate(self._broken_webpage_count, self._total_webpage_count),
                          total_css=self._total_css_count,
                          good_css_rate=good_rate(self._broken_css_count, self._total_css_count),
                          total_js=self._total_js_count,
                          good_js_rate=good_rate(self._broken_js_count, self._total_js_count),
                          total_image=self._total_image_count,
                          good_image_rate=good_rate(self._broken_image_count, self._total_image_count),
                          total_other=self._total_others_count,
                          good_other_rate=good_rate(self._broken_others_count, self._total_others_count))
Example #6
0
    def get_best_archive(root_domain: str,
                         thread_size=100,
                         profile_check=10,
                         pass_threshold=0.8,
                         res_limit=2000) -> ArchiveDetail:
        """
        Get the best profile from archive.org by doing profile spectrum analysis,
        given a root domain name.
        Spectrum analysis: comparison between resources of each candidate profile
        and all historic resources of the domain.
        :param root_domain: root domain in str, e.g: "google.co.uk"
        :param thread_size: number of threads to check resource links simultaneously
        :param profile_check: max number of profiles to check
        :param pass_threshold: threshold defining if a profile is good enough.
        :param res_limit: number of resource links in the domain resource spectrum,
            including css, js, html etc.
        :return: an ArchiveDetail describing the best profile found
        """
        url = LinkChecker.get_valid_link(root_domain, link="")
        # negative limit: take the most recent `profile_check` profiles
        profiles = ArchiveOrg.get_url_info(url, min_size=1, limit=-profile_check)
        info = ArchiveOrg.get_domain_urls(url, limit=res_limit)
        res_count = len(info)

        archive = None       # best profile found so far
        best_rate = 0.0      # overall good-resource rate of the best profile
        # per-resource-type stats of the best profile
        best_rate_web_page = 0
        best_rate_image = 0
        best_rate_css = 0
        best_rate_js = 0
        best_rate_other = 0
        best_total_web_page = 0
        best_total_js = 0
        best_total_css = 0
        best_total_image = 0
        best_total_other = 0

        if res_count > 0:
            for profile in profiles:
                if not isinstance(profile, ArchiveStruct):
                    continue
                total_web_page = 0
                total_js = 0
                total_css = 0
                total_image = 0
                total_other = 0
                broken_web_page = 0
                broken_js = 0
                broken_css = 0
                broken_image = 0
                broken_other = 0

                print("checking:", str(profile))
                # rewrite every known resource link onto this profile's
                # timestamp so we probe this snapshot specifically
                timestamp = profile.date_stamp
                links = []
                for item in info:
                    item.date_stamp = timestamp
                    links.append(ArchiveOrg.get_archive_link(item))

                test_pool = pool.ThreadPool(processes=thread_size)
                try:
                    results = [
                        test_pool.apply_async(func=test_response, args=(x, ))
                        for x in links
                    ]
                    returned = [y.get() for y in results]
                finally:
                    # ensure worker threads are torn down even if a task raises
                    test_pool.terminate()

                for result_good, link_cls in returned:
                    if link_cls == LinkUtility.EXT_WEBPAGE:
                        total_web_page += 1
                        if not result_good:
                            broken_web_page += 1
                    elif link_cls == LinkUtility.EXT_CSS:
                        total_css += 1
                        if not result_good:
                            broken_css += 1
                    elif link_cls == LinkUtility.EXT_JS:
                        total_js += 1
                        if not result_good:
                            broken_js += 1
                    elif link_cls == LinkUtility.EXT_IMAGE:
                        total_image += 1
                        if not result_good:
                            broken_image += 1
                    else:
                        total_other += 1
                        if not result_good:
                            broken_other += 1

                # BUG FIX: the original sum counted broken_image twice and
                # omitted broken_css entirely, skewing the overall rate and
                # the pass decision.
                broken_res_count = (broken_web_page + broken_css + broken_js
                                    + broken_image + broken_other)
                good_rate = 1 - broken_res_count / res_count
                passed = good_rate >= pass_threshold
                if good_rate > best_rate:
                    best_rate = good_rate
                    archive = profile
                    best_rate_web_page = 0 if total_web_page == 0 else 1 - broken_web_page / total_web_page
                    best_rate_image = 0 if total_image == 0 else 1 - broken_image / total_image
                    best_rate_css = 0 if total_css == 0 else 1 - broken_css / total_css
                    best_rate_js = 0 if total_js == 0 else 1 - broken_js / total_js
                    best_rate_other = 0 if total_other == 0 else 1 - broken_other / total_other
                    best_total_web_page = total_web_page
                    best_total_js = total_js
                    best_total_css = total_css
                    best_total_image = total_image
                    best_total_other = total_other
                print("total:", res_count, " broken res:",
                      broken_res_count, " stamp: ", profile.date_stamp,
                      " pass? ", passed, " rate:", good_rate)
        # NOTE(review): if no profile qualified, `archive` is still None here
        # and get_archive_link(None) may fail — confirm callers guarantee at
        # least one valid ArchiveStruct profile.
        return ArchiveDetail(root_domain,
                             archive_link=ArchiveOrg.get_archive_link(archive),
                             total_res=res_count,
                             good_res_rate=best_rate,
                             total_web_page=best_total_web_page,
                             good_webpage_rate=best_rate_web_page,
                             total_css=best_total_css,
                             good_css_rate=best_rate_css,
                             total_js=best_total_js,
                             good_js_rate=best_rate_js,
                             total_image=best_total_image,
                             good_image_rate=best_rate_image,
                             total_other=best_total_other,
                             good_other_rate=best_rate_other)