def test_response(link: str) -> bool:
    """Probe *link* and return True when it answered HTTP 200, printing the outcome."""
    code, ctype = LinkChecker.get_response(link)
    ok = code == 200
    if ok:
        print(link, "status good:", code, " content: ", ctype)
    else:
        print(link, "status bad:", code, " content: ", ctype)
    return ok
def test_response(link: str) -> bool:
    """Return True when *link* responds with HTTP 200; print the result either way."""
    status, content = LinkChecker.get_response(link)
    if status == 200:
        print(link, "status good:", status, " content: ", content)
        return True
    print(link, "status bad:", status, " content: ", content)
    return False
    def scrape_web_res(self, page: LinkAttrs):
        """Fetch one archived resource described by *page* and persist it.

        Text resources (webpage/CSS/JS) are parsed via ``_parse_text_res``
        and written through the file manager; anything else is downloaded
        directly.  On any failure the page is recorded in the error log and
        the per-type broken counter is bumped; the ``finally`` clause bumps
        the per-type total counter so every page is counted exactly once.
        All counter updates happen under ``self._sync_lock`` — presumably
        this method runs on multiple worker threads (TODO confirm).
        """
        print("look:", page.link, "level: ", page.level)
        try:
            # Refuse paths the local file system could not store.
            if len(page.path) > ArchiveExplorer.MAX_PATH_LEN:  # max file path in any file system
                raise OSError("file path is too long:" + page.path)
            response_code, content_type = LinkChecker.get_response(page.link)
            # Only OK/Found/Redirect responses are usable; anything else is broken.
            if response_code not in [ResponseCode.LinkOK, ResponseCode.LinkFound, ResponseCode.LinkRedirect]:
                raise ConnectionError("res is not available: " + page.link)
            if page.res_type in [LinkUtility.EXT_WEBPAGE, LinkUtility.EXT_CSS, LinkUtility.EXT_JS]:  # parse a webpage
                save_text = self._parse_text_res(page)
                self._file_manager.write_to_file(page.path, save_text)
            # elif page.res_type != LinkUtility.EXT_OTHER:  # TODO: download normal resources
            #     response = LinkChecker.get_common_web_resource(page.link)
            #     if page.res_type == LinkUtility.EXT_IMAGE or page.res_type == LinkUtility.EXT_FONT:
            #         self._downloader.write_to_file(page.path, response.content, mode="b")
            #     else:
            #         self._downloader.write_to_file(page.path, response.text, mode="t")
            else:
                # Non-text resources are streamed straight to disk.
                # response = LinkChecker.get_common_web_resource(page.link)
                # self._downloader.write_to_file(page.path, response.content, mode="b")
                self._file_manager.download_file(sub_path=page.path, url=page.link, timeout=self._timeout,
                                                 redirect=self._max_redirect, retries=self._max_retries)
        except Exception as ex:
            print("exception:", ex)
            print("broken res:", page)
            # Record the failure and bump the matching broken-resource counter.
            with self._sync_lock:
                self._file_manager.write_to_error_log(page.to_tuple(str(ex)))
                if page.res_type == LinkUtility.EXT_WEBPAGE:
                    self._broken_webpage_count += 1
                elif page.res_type == LinkUtility.EXT_CSS:
                    self._broken_css_count += 1
                elif page.res_type == LinkUtility.EXT_IMAGE:
                    self._broken_image_count += 1
                elif page.res_type == LinkUtility.EXT_JS:
                    self._broken_js_count += 1
                else:
                    self._broken_others_count += 1

                self._broken_res_list.append(page)
        finally:
            # Runs on success and failure alike: every page counts toward the totals.
            with self._sync_lock:
                self._total_res_done += 1
                if page.res_type == LinkUtility.EXT_WEBPAGE:
                    self._total_webpage_count += 1
                elif page.res_type == LinkUtility.EXT_CSS:
                    self._total_css_count += 1
                elif page.res_type == LinkUtility.EXT_IMAGE:
                    self._total_image_count += 1
                elif page.res_type == LinkUtility.EXT_JS:
                    self._total_js_count += 1
                else:
                    self._total_others_count += 1
# Exemplo n.º 4  (scraped example separator — commented out so the file parses)
# 0
def test_response(link: str) -> "tuple[bool, str]":
    """Probe *link* and classify it by extension.

    Args:
        link: URL to check.

    Returns:
        ``(ok, link_cls)`` where ``ok`` is True when the link answered
        HTTP 200, and ``link_cls`` is the resource class derived from the
        link's extension.

    Fixes over the original: the ``(bool, str)`` tuple-expression
    annotation is replaced with a proper (string) ``tuple[bool, str]``
    annotation, the unused ``content_type``/``ext`` results are marked as
    discarded, dead commented-out code is removed, and the if/else is
    collapsed to a single boolean expression.
    """
    status_code, _content_type = LinkChecker.get_response(link)
    # Strip trailing "/" first — it would hide the real file extension.
    link_cls, _ext = LinkUtility.get_link_class(link.rstrip('/'))
    return status_code == 200, link_cls
def test_response(link: str) -> "tuple[bool, str]":
    """Check availability of *link* and derive its resource class.

    Args:
        link: URL to check.

    Returns:
        ``(ok, link_cls)``: ``ok`` is True for an HTTP 200 response;
        ``link_cls`` comes from the link's extension with any trailing
        slash removed first (a "/" suffix would mask the extension).

    Fixes over the original: replaces the non-standard ``(bool, str)``
    annotation with a string ``tuple[bool, str]`` annotation, drops the
    commented-out dead code, marks unused results as discarded, and
    returns ``status_code == 200`` directly instead of branching.
    """
    status_code, _content_type = LinkChecker.get_response(link)
    link_cls, _ext = LinkUtility.get_link_class(link.rstrip('/'))
    return status_code == 200, link_cls
# Exemplo n.º 6  (scraped example separator — commented out so the file parses)
# 0
    def scrape_web_res(self, page: LinkAttrs):
        """Download/parse one archived resource and update the tallies.

        Webpage/CSS/JS resources go through ``_parse_text_res`` and are
        written by the file manager; other types are downloaded directly.
        Errors are appended to the error log and counted per resource
        type; the ``finally`` block increments the per-type totals so each
        page is counted exactly once regardless of outcome.  Counter
        mutation is guarded by ``self._sync_lock`` — presumably because
        several workers call this concurrently (TODO confirm).
        """
        print("look:", page.link, "level: ", page.level)
        try:
            # Reject paths longer than the file system can store.
            if len(
                    page.path
            ) > ArchiveExplorer.MAX_PATH_LEN:  # max file path in any file system
                raise OSError("file path is too long:" + page.path)
            response_code, content_type = LinkChecker.get_response(page.link)
            # Anything other than OK/Found/Redirect means the resource is gone.
            if response_code not in [
                    ResponseCode.LinkOK, ResponseCode.LinkFound,
                    ResponseCode.LinkRedirect
            ]:
                raise ConnectionError("res is not available: " + page.link)
            if page.res_type in [
                    LinkUtility.EXT_WEBPAGE, LinkUtility.EXT_CSS,
                    LinkUtility.EXT_JS
            ]:  # parse a webpage
                save_text = self._parse_text_res(page)
                self._file_manager.write_to_file(page.path, save_text)
            # elif page.res_type != LinkUtility.EXT_OTHER:  # TODO: download normal resources
            #     response = LinkChecker.get_common_web_resource(page.link)
            #     if page.res_type == LinkUtility.EXT_IMAGE or page.res_type == LinkUtility.EXT_FONT:
            #         self._downloader.write_to_file(page.path, response.content, mode="b")
            #     else:
            #         self._downloader.write_to_file(page.path, response.text, mode="t")
            else:
                # Binary/other resources are fetched straight to disk.
                # response = LinkChecker.get_common_web_resource(page.link)
                # self._downloader.write_to_file(page.path, response.content, mode="b")
                self._file_manager.download_file(sub_path=page.path,
                                                 url=page.link,
                                                 timeout=self._timeout,
                                                 redirect=self._max_redirect,
                                                 retries=self._max_retries)
        except Exception as ex:
            print("exception:", ex)
            print("broken res:", page)
            # Log the failure and bump the broken counter for this type.
            with self._sync_lock:
                self._file_manager.write_to_error_log(page.to_tuple(str(ex)))
                if page.res_type == LinkUtility.EXT_WEBPAGE:
                    self._broken_webpage_count += 1
                elif page.res_type == LinkUtility.EXT_CSS:
                    self._broken_css_count += 1
                elif page.res_type == LinkUtility.EXT_IMAGE:
                    self._broken_image_count += 1
                elif page.res_type == LinkUtility.EXT_JS:
                    self._broken_js_count += 1
                else:
                    self._broken_others_count += 1

                self._broken_res_list.append(page)
        finally:
            # Executed for both success and failure paths.
            with self._sync_lock:
                self._total_res_done += 1
                if page.res_type == LinkUtility.EXT_WEBPAGE:
                    self._total_webpage_count += 1
                elif page.res_type == LinkUtility.EXT_CSS:
                    self._total_css_count += 1
                elif page.res_type == LinkUtility.EXT_IMAGE:
                    self._total_image_count += 1
                elif page.res_type == LinkUtility.EXT_JS:
                    self._total_js_count += 1
                else:
                    self._total_others_count += 1