Example #1
    def setUp(self):
        self.env = TestDlrobotEnv("data.ssl")

        TDownloadEnv.clear_cache_folder()
        THttpRequester.ENABLE = False
        logger = setup_logging(log_file_name="dlrobot.log")
        THttpRequester.initialize(logger)
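
Every example below repeats the same bootstrap: create a logger with setup_logging and hand it to THttpRequester.initialize before the first request; tests that must stay offline also set THttpRequester.ENABLE = False. A minimal sketch of that shared sequence pulled into a helper (the helper name is ours, not part of the project):

def init_requester(log_file_name="dlrobot.log", enable_http=True):
    # bootstrap sequence shared by the setUp examples in this section
    logger = setup_logging(log_file_name=log_file_name)
    # setting ENABLE to False keeps tests from hitting the real network
    THttpRequester.ENABLE = enable_http
    THttpRequester.initialize(logger)
    return logger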
Example #2
    def make_one_step(self, start_pages):
        self.logger.info("=== step {0} =========".format(self.step_name))
        self.logger.info(self.website.main_page_url)
        self.url_to_weight = dict()
        self.start_time = time.time()
        if self.is_last_step:
            self.website.create_export_folder()

        self.pages_to_process = dict(start_pages)
        self.processed_pages = set()

        if self.include_sources == "always":
            assert not self.is_last_step  # todo: should we export it?
            self.url_to_weight.update(self.pages_to_process)

        if self.need_search_engine_before():
            self.use_search_engine(self.website.main_page_url)
            self.pages_to_process.update(self.url_to_weight)

        self.add_links_from_sitemap_xml()

        self.apply_function_to_links(self.robot_link_func)

        if self.step_name == "sitemap":
            self.add_regional_main_pages()

        self.profiler = {
            "elapsed_time": time.time() - self.start_time,
            "step_request_rate": THttpRequester.get_request_rate(self.start_time),
            "site_request_rate": THttpRequester.get_request_rate()
        }
        self.logger.debug(str(self.profiler))
        self.delete_url_mirrors_by_www_and_protocol_prefix()
        self.logger.info('{0} source_url links -> {1} target links'.format(len(start_pages), len(self.url_to_weight)))
Example #3
    def test_unicode(self):
        THttpRequester.initialize(setup_logging())
        try:
            # the request may fail, but it must not raise a UnicodeError for this url
            THttpRequester.make_http_request("http://5%20июня%20запретят%20розничную%20продажу%20алкоголя", "GET")
        except THttpRequester.RobotHttpException:
            pass
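
As Example #4 and Example #18 show, make_http_request returns a (normal_url, headers, data) triple and reports failures with THttpRequester.RobotHttpException, which carries an http_code attribute. A hedged sketch of a defensive call built only on what these examples show (try_fetch is our name):

def try_fetch(logger, url):
    # returns the response body, or None if the robot-level request failed
    try:
        normal_url, headers, data = THttpRequester.make_http_request(url, "GET")
        return data
    except THttpRequester.RobotHttpException as exp:
        logger.error("GET {} failed with http code {}".format(url, exp.http_code))
        return None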
Example #4
    def test_423mb_video(self):
        # a 423 MB video: both GET and HEAD are expected to return an empty body
        url = "https://epp.genproc.gov.ru/documents/1664002/25630699/%D0%AD%D1%81%D1%82%D0%B0%D1%84%D0%B5%D1%82%D0%B0%2B%D0%B4%D0%BE%D0%B1%D1%80%D1%8B%D1%85%2B%D0%B4%D0%B5%D0%BB.mp4/08c1ddfb-c48f-8c7f-9c2e-f0c66363c393?version=1.10&t=1608287244923&download=true"
        normal_url, headers, data = THttpRequester.make_http_request(
            url, "GET")
        self.assertEqual(0, len(data))

        normal_url, headers, data = THttpRequester.make_http_request(
            url, "HEAD")
        self.assertEqual(0, len(data))
Example #5
    def setUp(self, website_folder):
        self.env = TestDlrobotEnv("data.{}".format(
            os.path.basename(website_folder)))
        shutil.copy2(
            os.path.join(os.path.dirname(__file__), website_folder,
                         "project.txt"), self.env.data_folder)
        THttpRequester.ENABLE = False
        self.logger = setup_logging(log_file_name="dlrobot.log")
        THttpRequester.initialize(self.logger)
Example #6
    def test_ssl(self):
        sites = [
            "http://www.yandex.ru", "http://chukotka.sledcom.ru/",
            "http://www.aot.ru", "http://officefinder.rs", "http://ozerny.ru",
            "http://ksl.spb.sudrf.ru", "http://spbogdo.ru", "http://akrvo.ru",
            "http://primorie.fas.gov.ru"
        ]
        for site in sites:
            THttpRequester.make_http_request(site, "GET")  # no exceptions
Example #7
    def setUp(self):
        self.server_address = '127.0.0.1:{}'.format(self.web_site_port)
        self.web_server = TestHTTPServer(self.web_site_port)
        threading.Thread(target=start_server, args=(self.web_server,)).start()
        time.sleep(1)  # give the test web server a moment to start
        self.env = TestDlrobotEnv("data.timeout")
        TDownloadEnv.clear_cache_folder()
        self.logger = setup_logging(log_file_name="dlrobot.log")
        THttpRequester.initialize(self.logger)
Example #8
    def click_all_selenium(self, main_url, check_link_func):
        self.logger.debug("find_links_with_selenium url={} ".format(main_url))
        THttpRequester.consider_request_policy(main_url, "GET_selenium")
        elements = self.get_selenium_driver().navigate_and_get_links_js(main_url, self.config.selenium_timeout)
        if elements is None:
            self.logger.error("cannot get child elements using javascript for url={}".format(main_url))
            return
        page_html = self.get_selenium_driver().the_driver.page_source
        if page_html is None:
            self.logger.error("cannot get html source_url for url={}".format(main_url))
            return
        self.logger.debug("html_size={}, elements_count={}".format(len(page_html), len(elements)))
        processed_elements = set()

        self.find_languages_links(elements, processed_elements)
        html_title = self.get_selenium_driver().the_driver.title
        link_infos = dict()
        not_empty_links = set()
        for element_index, element in enumerate(elements):
            link_info = self.build_link_info(main_url, page_html, element_index, element, html_title)
            link_infos[element['id']] = link_info
            if not self.can_follow_this_link(link_info):
                processed_elements.add(element['id'])
            else:
                if link_info.target_url is not None:
                    not_empty_links.add(link_info.target_url)

        if len(not_empty_links) > 30 and not_empty_links.issubset(self.unique_hrefs):
            self.logger.debug("skip page, since its links are similar to the previous page (speed optimization)")
            return
        self.unique_hrefs.update(not_empty_links)
        self.unique_hrefs.add(main_url)

        self.crawled_web_pages_count += 1
        for element_index, element in enumerate(elements):
            if element['id'] not in processed_elements:
                processed_elements.add(element['id'])
                self.process_selenium_element(link_infos[element['id']], element, check_link_func)

        # fetch the links once more: javascript may have rendered something new. Though perhaps
        # we should take the links before the scroll-down rather than after, and compare them by
        # href instead of by id, i.e. before the javascript started hiding them (closer to the
        # pristine page, as if we had downloaded it with curl)

        elements = self.get_selenium_driver().get_links_js(timeout=self.config.selenium_timeout)
        if elements is None:
            self.logger.error("cannot get child elements using javascript for url={} (second)".format(main_url))
            return
        for element_index, element in enumerate(elements):
            if element['id'] not in processed_elements:
                link_info = self.build_link_info(main_url, page_html, element_index, element, html_title)
                if self.can_follow_this_link(link_info):
                    self.process_selenium_element(link_info, element, check_link_func)
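
The speed optimization in the middle of click_all_selenium is easy to miss: a page with more than 30 outgoing links, all of which were already seen on previous pages, is skipped entirely. The heuristic in isolation (the function name and threshold parameter are ours; the value 30 comes from the example):

def should_skip_page(not_empty_links, unique_hrefs, threshold=30):
    # a link-heavy page that adds no unseen hrefs is almost certainly
    # a near-duplicate of an already crawled page
    return len(not_empty_links) > threshold and not_empty_links.issubset(unique_hrefs)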
Example #9
    def __init__(self, args):
        self.args = args
        self.logger = setup_logging(log_file_name=args.logfile,
                                    logger_name="dlr")
        self.config = TRobotConfig.read_by_config_type(self.args.config_type)
        self.config.update_from_program_args(self.args)
        self.logger.debug("crawling_timeout={}".format(
            self.config.crawling_timeout))
        TDownloadEnv.init_conversion(self.logger)
        THttpRequester.initialize(self.logger)
        if args.clear_cache_folder:
            TDownloadEnv.clear_cache_folder()
Example #10
def main():
    logger = setup_logging()
    args = parse_args()
    THttpRequester.initialize(logger)
    for url in args.urls:
        file = TDownloadedFile(url, args.use_cache)
        if args.action == "links":
            print_links(file)
        elif args.action == "text":
            print_text(file)
        elif args.action == "utf8_html":
            print_utf8_html(file)
        else:
            raise Exception("unknown action: {}".format(args.action))
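
parse_args is not shown, but main() pins down its interface: an urls list, an action of links/text/utf8_html, and a use_cache flag. A hedged argparse sketch matching those attributes (the flag spellings are our guess):

import argparse

def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--action", choices=["links", "text", "utf8_html"], required=True)
    parser.add_argument("--use-cache", dest="use_cache", action="store_true")
    parser.add_argument("urls", nargs="+")
    return parser.parse_args()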
Example #11
    def __init__(self):
        self.args = parse_args()
        self.logger = setup_logging(log_file_name=self.args.logfile)
        if self.args.input_offices is not None:
            offices = TOfficeTableInMemory()
            offices.read_from_local_file(self.args.input_offices)
            self.web_sites = TDeclarationWebSiteList(self.logger,
                                                     offices=offices)
        else:
            self.web_sites = TDeclarationWebSiteList(self.logger)

        # annotated first, then reset to None; a project is created later on demand
        self.temp_dlrobot_project: TRobotProject
        self.temp_dlrobot_project = None
        THttpRequester.initialize(self.logger)
Example #12
    def navigate(self, url):
        # restart the browser periodically to reduce memory usage
        if self.driver_processed_urls_count > 100:
            self.stop_executable()
            self.start_executable()
            self.driver_processed_urls_count = 0
        self.driver_processed_urls_count += 1

        # leave only the first window tab, close the others
        self.close_not_first_tab()
        self.logger.debug("selenium navigate to {}, window tabs count={}".format(url, len(self.the_driver.window_handles)))
        self.the_driver.switch_to.window(self.the_driver.window_handles[0])

        # navigation
        try:
            self.the_driver.set_page_load_timeout(self.page_load_timeout)
            self.the_driver.get(url)
        except IndexError:
            raise THttpRequester.RobotHttpException("general IndexError inside urllib.request.urlopen",
                                                    url, 520, "GET")
        except TimeoutException:
            # if the page managed to render a title, treat the timeout as non-fatal
            title = self.the_driver.title
            if len(title) == 0:
                raise
        self.check_http_code(url)
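
Restarting the browser after every 100 processed urls is a generic defense against WebDriver memory growth. The counter logic on its own, decoupled from the class (the names are ours; the limit of 100 comes from the example):

class ProcessRecycler:
    MAX_URLS_PER_PROCESS = 100  # value taken from navigate() above

    def __init__(self, stop_executable, start_executable):
        self.stop_executable = stop_executable
        self.start_executable = start_executable
        self.processed = 0

    def before_navigate(self):
        # recycle the browser process to cap its memory footprint
        if self.processed > self.MAX_URLS_PER_PROCESS:
            self.stop_executable()
            self.start_executable()
            self.processed = 0
        self.processed += 1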
Example #13
    def click_selenium_if_no_href(self, main_url, element, element_index, check_link_func):
        tag_name = element.tag_name
        link_text = element.text.strip('\n\r\t ')  # capture the text now: it can be invalidated after the click
        page_html = self.get_selenium_driver().the_driver.page_source
        THttpRequester.consider_request_policy(main_url + " elem_index=" + str(element_index), "click_selenium")

        link_info = TLinkInfo(TClickEngine.selenium, main_url, None,
                              source_html=page_html, anchor_text=link_text, tag_name=tag_name,
                              element_index=element_index,
                              source_page_title=self.get_selenium_driver().the_driver.title)

        self.get_selenium_driver().click_element(element, link_info)

        if self.normalize_and_check_link(link_info, check_link_func):
            if link_info.downloaded_file is not None:
                self.add_downloaded_file_wrapper(link_info)
            elif link_info.target_url is not None:
                self.add_link_wrapper(link_info)
Example #14
    def check_http_code(self, url):
        # a suspiciously small page whose title looks like "404 Not Found"
        # is treated as an HTTP error
        if len(self.the_driver.page_source) < 700:
            title = self.the_driver.title.strip()
            if title.startswith('4') or title.startswith('3') or title.startswith('5'):
                words = title.split(' ')
                if words[0] in POPULAR_ERROR_HTTP_CODES:
                    message = " ".join(words[1:])
                    if message == POPULAR_ERROR_HTTP_CODES[words[0]]:
                        raise THttpRequester.RobotHttpException(message, url, words[0], "GET")
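
check_http_code relies on a POPULAR_ERROR_HTTP_CODES mapping that is not shown in these examples; from the code we only know it maps a status-code string to the reason phrase that follows it in a page title such as "404 Not Found". A plausible shape, for illustration only:

# illustration only: the real mapping in the project may contain more codes
POPULAR_ERROR_HTTP_CODES = {
    "403": "Forbidden",
    "404": "Not Found",
    "500": "Internal Server Error",
}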
Example #15
    def recognize_protocol_and_www_selenium(self):
        for url in urllib_parse_pro.get_url_modifications(self.input_site_url):
            try:
                self.parent_project.selenium_driver.navigate(url)
                time.sleep(3)
                title = self.parent_project.selenium_driver.the_driver.title
                html = self.parent_project.selenium_driver.the_driver.page_source
                self.init_main_page_url_from_redirected_url(
                    self.parent_project.selenium_driver.the_driver.current_url,
                    title, html)
                return
            except WebDriverException:
                self.logger.error(
                    "cannot fetch {} with selenium, sleep 3 sec".format(url))
                time.sleep(3)
        raise THttpRequester.RobotHttpException(
            "there is no way to access {}".format(self.input_site_url),
            self.input_site_url, 404, "GET")
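
get_url_modifications from urllib_parse_pro is not shown; judging by the method name recognize_protocol_and_www_selenium, it enumerates protocol and www variants of the input url. A hedged sketch of such a generator (the real implementation may differ):

def get_url_modifications(site_url):
    # yield http/https and with/without-www variants of the host
    host = site_url.split("://")[-1].rstrip("/")
    bare = host[4:] if host.startswith("www.") else host
    for scheme in ("http", "https"):
        for h in (bare, "www." + bare):
            yield "{}://{}".format(scheme, h)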
Example #16
    def test_redirect_popular_site(self):
        THttpRequester.initialize(setup_logging())
        redirected_url, headers = THttpRequester.request_url_headers_with_global_cache("http://www.meduza.io")
        self.assertIsNotNone(headers)
        self.assertEqual(redirected_url, 'https://meduza.io/')
Example #17
    def test_redirects(self):
        dummy1, dummy2, data = THttpRequester.make_http_request_urllib(
            self.build_url("redirect1"), "GET", 10)
        self.assertTrue(data.decode('utf8').startswith("<html>"))
Example #18
    def test_gibdd(self):
        THttpRequester.initialize(setup_logging())
        try:
            THttpRequester.make_http_request("http://gibdd.ru", "GET")
        except THttpRequester.RobotHttpException as exp:
            self.assertEqual(exp.http_code, 520)
Example #19
    def test_video(self):
        # the requester is expected to return an empty body for video content
        url = "https://www.w3schools.com/html/mov_bbb.mp4"
        normal_url, headers, data = THttpRequester.make_http_request(
            url, "GET")
        self.assertEqual(0, len(data))