def setUp(self):
    """Create an isolated dlrobot test environment with real HTTP disabled."""
    self.env = TestDlrobotEnv("data.ssl")
    TDownloadEnv.clear_cache_folder()
    # the test must not hit the network through the requester
    THttpRequester.ENABLE = False
    test_logger = setup_logging(log_file_name="dlrobot.log")
    THttpRequester.initialize(test_logger)
def make_one_step(self, start_pages):
    """Run one crawling step over *start_pages* and collect target links.

    Populates self.url_to_weight (the step output), self.pages_to_process and
    self.profiler (timing/request-rate stats).  Statement order matters:
    pages_to_process must be filled before the link functions run.
    """
    self.logger.info("=== step {0} =========".format(self.step_name))
    self.logger.info(self.website.main_page_url)
    self.url_to_weight = dict()
    self.start_time = time.time()
    if self.is_last_step:
        # the last step writes found documents to the export folder
        self.website.create_export_folder()
    self.pages_to_process = dict(start_pages)
    self.processed_pages = set()
    if self.include_sources == "always":
        assert not self.is_last_step  # todo: should we export it?
        self.url_to_weight.update(self.pages_to_process)
    if self.need_search_engine_before():
        # seed the step with search-engine results for the main page
        self.use_search_engine(self.website.main_page_url)
        self.pages_to_process.update(self.url_to_weight)
    self.add_links_from_sitemap_xml()
    # main work: apply the per-step link predicate to every pending page
    self.apply_function_to_links(self.robot_link_func)
    if self.step_name == "sitemap":
        self.add_regional_main_pages()
    self.profiler = {
        "elapsed_time": time.time() - self.start_time,
        "step_request_rate": THttpRequester.get_request_rate(self.start_time),
        "site_request_rate": THttpRequester.get_request_rate()
    }
    self.logger.debug("{}".format(str(self.profiler)))
    self.delete_url_mirrors_by_www_and_protocol_prefix()
    self.logger.info('{0} source_url links -> {1} target links'.format(len(start_pages), len(self.url_to_weight)))
def test_unicode(self):
    """A URL with percent-encoded cyrillic must not raise UnicodeError.

    RobotHttpException is acceptable (the host does not exist); the test only
    checks that the requester survives a non-ascii URL.
    """
    try:
        THttpRequester.initialize(setup_logging())
        # fix: the result was bound to an unused local `s`; drop it
        THttpRequester.make_http_request("http://5%20июня%20запретят%20розничную%20продажу%20алкоголя", "GET")
    except THttpRequester.RobotHttpException:
        # no UnicodeException for this url
        pass
def test_423mb_video(self):
    """A 423 MB video must be skipped: both GET and HEAD return an empty body."""
    url = "https://epp.genproc.gov.ru/documents/1664002/25630699/%D0%AD%D1%81%D1%82%D0%B0%D1%84%D0%B5%D1%82%D0%B0%2B%D0%B4%D0%BE%D0%B1%D1%80%D1%8B%D1%85%2B%D0%B4%D0%B5%D0%BB.mp4/08c1ddfb-c48f-8c7f-9c2e-f0c66363c393?version=1.10&t=1608287244923&download=true"
    for method in ("GET", "HEAD"):
        normal_url, headers, data = THttpRequester.make_http_request(url, method)
        self.assertEqual(0, len(data))
def setUp(self, website_folder):
    """Prepare a test environment seeded with project.txt from *website_folder*."""
    folder_name = os.path.basename(website_folder)
    self.env = TestDlrobotEnv("data.{}".format(folder_name))
    project_file = os.path.join(os.path.dirname(__file__), website_folder, "project.txt")
    shutil.copy2(project_file, self.env.data_folder)
    # tests run against local fixtures only
    THttpRequester.ENABLE = False
    self.logger = setup_logging(log_file_name="dlrobot.log")
    THttpRequester.initialize(self.logger)
def test_ssl(self):
    """All of these sites must be fetchable over GET without raising."""
    sites = (
        "http://www.yandex.ru",
        "http://chukotka.sledcom.ru/",
        "http://www.aot.ru",
        "http://officefinder.rs",
        "http://ozerny.ru",
        "http://ksl.spb.sudrf.ru",
        "http://spbogdo.ru",
        "http://akrvo.ru",
        "http://primorie.fas.gov.ru",
    )
    for site in sites:
        # no exceptions expected
        THttpRequester.make_http_request(site, "GET")
def setUp(self):
    """Start a local HTTP server in a background thread plus a test env."""
    self.server_address = '127.0.0.1:{}'.format(self.web_site_port)
    self.web_server = TestHTTPServer(self.web_site_port)
    server_thread = threading.Thread(target=start_server, args=(self.web_server,))
    server_thread.start()
    # give the server thread a moment to start listening
    time.sleep(1)
    self.env = TestDlrobotEnv("data.timeout")
    TDownloadEnv.clear_cache_folder()
    self.logger = setup_logging(log_file_name="dlrobot.log")
    THttpRequester.initialize(self.logger)
def click_all_selenium(self, main_url, check_link_func):
    """Open *main_url* with selenium and process every clickable link.

    check_link_func decides which links are followed.  The page's links are
    fetched a second time at the end, because javascript may render new
    elements after the first pass.

    Fixes: removed a dead no-op loop (``for x in not_empty_links: if x not in
    self.unique_hrefs: pass``) and normalized stray trailing commas in the
    ``for`` targets; translated the Russian comments.
    """
    self.logger.debug("find_links_with_selenium url={} ".format(main_url))
    THttpRequester.consider_request_policy(main_url, "GET_selenium")
    elements = self.get_selenium_driver().navigate_and_get_links_js(main_url, self.config.selenium_timeout)
    if elements is None:
        self.logger.error("cannot get child elements using javascript for url={}".format(main_url))
        return
    page_html = self.get_selenium_driver().the_driver.page_source
    if page_html is None:
        self.logger.error("cannot get html source_url for url={}".format(main_url))
        return
    self.logger.debug("html_size={}, elements_count={}".format(len(page_html), len(elements)))
    processed_elements = set()
    self.find_languages_links(elements, processed_elements)
    html_title = self.get_selenium_driver().the_driver.title
    link_infos = dict()
    not_empty_links = set()
    # first pass: build link infos and filter out elements we will not follow
    for element_index, element in enumerate(elements):
        link_info = self.build_link_info(main_url, page_html, element_index, element, html_title)
        link_infos[element['id']] = link_info
        if not self.can_follow_this_link(link_info):
            processed_elements.add(element['id'])
        elif link_info.target_url is not None:
            not_empty_links.add(link_info.target_url)
    if len(not_empty_links) > 30 and not_empty_links.issubset(self.unique_hrefs):
        self.logger.debug("skip page, since its links are similar to the previous page (speed optimization)")
        return
    self.unique_hrefs.update(not_empty_links)
    self.unique_hrefs.add(main_url)
    self.crawled_web_pages_count += 1
    # second pass: actually click/process the remaining elements
    for element_index, element in enumerate(elements):
        if element['id'] not in processed_elements:
            processed_elements.add(element['id'])
            self.process_selenium_element(link_infos[element['id']], element, check_link_func)
    # Fetch the links once more: javascript may have rendered something new.
    # Perhaps we should take the links before the scroll-down instead, and
    # compare them by href rather than by id — i.e. before javascript started
    # hiding them (closer to the clean page, as if we had fetched it with curl).
    elements = self.get_selenium_driver().get_links_js(timeout=self.config.selenium_timeout)
    if elements is None:
        self.logger.error("cannot get child elements using javascript for url={} (second)".format(main_url))
        return
    for element_index, element in enumerate(elements):
        if element['id'] not in processed_elements:
            link_info = self.build_link_info(main_url, page_html, element_index, element, html_title)
            if self.can_follow_this_link(link_info):
                self.process_selenium_element(link_info, element, check_link_func)
def __init__(self, args):
    """Build the robot from parsed command-line *args*: logging, config, caches."""
    self.args = args
    self.logger = setup_logging(log_file_name=args.logfile, logger_name="dlr")
    self.config = TRobotConfig.read_by_config_type(self.args.config_type)
    self.config.update_from_program_args(self.args)
    self.logger.debug("crawling_timeout={}".format(self.config.crawling_timeout))
    TDownloadEnv.init_conversion(self.logger)
    THttpRequester.initialize(self.logger)
    if args.clear_cache_folder:
        # start from an empty download cache when requested
        TDownloadEnv.clear_cache_folder()
def main():
    """Download each url and print links/text/html depending on the action."""
    logger = setup_logging()
    args = parse_args()
    THttpRequester.initialize(logger)
    # dispatch table instead of an if/elif chain
    printers = {
        "links": print_links,
        "text": print_text,
        "utf8_html": print_utf8_html,
    }
    for url in args.urls:
        file = TDownloadedFile(url, args.use_cache)
        printer = printers.get(args.action)
        if printer is None:
            raise Exception("unknown action")
        printer(file)
def __init__(self):
    """Initialize logging, the web-site list (optionally office-filtered) and HTTP."""
    self.args = parse_args()
    self.logger = setup_logging(log_file_name=self.args.logfile)
    site_list_kwargs = {}
    if self.args.input_offices is not None:
        offices = TOfficeTableInMemory()
        offices.read_from_local_file(self.args.input_offices)
        site_list_kwargs["offices"] = offices
    self.web_sites = TDeclarationWebSiteList(self.logger, **site_list_kwargs)
    # lazily created per-site project
    self.temp_dlrobot_project: TRobotProject = None
    THttpRequester.initialize(self.logger)
def navigate(self, url):
    """Navigate the selenium driver to *url*, restarting the browser periodically.

    Raises THttpRequester.RobotHttpException on an internal IndexError (mapped
    to code 520) or — via check_http_code — when the loaded page looks like an
    HTTP error page.  A TimeoutException is swallowed if a page title was
    obtained (partial load), otherwise re-raised.
    """
    # to reduce memory usage: restart the browser every ~100 navigations
    if self.driver_processed_urls_count > 100:
        self.stop_executable()
        self.start_executable()
        self.driver_processed_urls_count = 0
    self.driver_processed_urls_count += 1
    # leave only one window tab, close other tabs
    self.close_not_first_tab()
    self.logger.debug("selenium navigate to {}, window tabs count={}".format(url, len(self.the_driver.window_handles)))
    self.the_driver.switch_to.window(self.the_driver.window_handles[0])
    # navigation
    try:
        self.the_driver.set_page_load_timeout(self.page_load_timeout)
        self.the_driver.get(url)
    except IndexError as exp:
        # seen inside urllib internals; surface it as an HTTP 520 to callers
        raise THttpRequester.RobotHttpException("general IndexError inside urllib.request.urlopen", url, 520, "GET")
    except TimeoutException as exp:
        # a non-empty title means the page partially loaded — accept it
        title = self.the_driver.title
        if len(title) == 0:
            raise
    self.check_http_code(url)
def click_selenium_if_no_href(self, main_url, element, element_index, check_link_func):
    """Click an element that has no href and register where the click leads."""
    driver = self.get_selenium_driver()
    tag_name = element.tag_name
    anchor_text = element.text.strip('\n\r\t ')
    # capture the page source now — it can be broken after the click
    page_html = driver.the_driver.page_source
    THttpRequester.consider_request_policy(main_url + " elem_index=" + str(element_index), "click_selenium")
    link_info = TLinkInfo(
        TClickEngine.selenium, main_url, None,
        source_html=page_html,
        anchor_text=anchor_text,
        tag_name=tag_name,
        element_index=element_index,
        source_page_title=driver.the_driver.title)
    driver.click_element(element, link_info)
    if not self.normalize_and_check_link(link_info, check_link_func):
        return
    if link_info.downloaded_file is not None:
        self.add_downloaded_file_wrapper(link_info)
    elif link_info.target_url is not None:
        self.add_link_wrapper(link_info)
def check_http_code(self, url):
    """Raise RobotHttpException if a tiny page's title looks like 'NNN message'."""
    # only very small pages are candidates for browser-rendered error pages
    if len(self.the_driver.page_source) >= 700:
        return
    title = self.the_driver.title.strip()
    if not title.startswith(('4', '3', '5')):
        return
    code, _, message = title.partition(' ')
    if code in POPULAR_ERROR_HTTP_CODES and message == POPULAR_ERROR_HTTP_CODES[code]:
        raise THttpRequester.RobotHttpException(message, url, code, "GET")
def recognize_protocol_and_www_selenium(self):
    """Probe url variants (http/https, with/without www) until selenium succeeds.

    Raises THttpRequester.RobotHttpException (404) if no variant is reachable.
    """
    driver_holder = self.parent_project.selenium_driver
    for candidate in urllib_parse_pro.get_url_modifications(self.input_site_url):
        try:
            driver_holder.navigate(candidate)
            time.sleep(3)
            title = driver_holder.the_driver.title
            html = driver_holder.the_driver.page_source
            self.init_main_page_url_from_redirected_url(
                driver_holder.the_driver.current_url, title, html)
            return
        except WebDriverException as exp:
            self.logger.error(
                "cannot fetch {} with selenium, sleep 3 sec".format(candidate))
            time.sleep(3)
    raise THttpRequester.RobotHttpException(
        "there is no way to access {}".format(self.input_site_url),
        self.input_site_url, 404, "GET")
def test_redirect_popular_site(self):
    """www.meduza.io must redirect to the canonical https url."""
    THttpRequester.initialize(setup_logging())
    final_url, response_headers = THttpRequester.request_url_headers_with_global_cache("http://www.meduza.io")
    self.assertIsNotNone(response_headers)
    self.assertEqual(final_url, 'https://meduza.io/')
def test_redirects(self):
    """Following the local redirect chain must land on an html page."""
    _, _, data = THttpRequester.make_http_request_urllib(
        self.build_url("redirect1"), "GET", 10)
    self.assertEqual(data.decode('utf8').startswith("<html>"), True)
def test_gibdd(self):
    """gibdd.ru blocks robots; the requester is expected to raise with code 520."""
    try:
        THttpRequester.initialize(setup_logging())
        # fix: the result was bound to an unused local `s`; drop it
        THttpRequester.make_http_request("http://gibdd.ru", "GET")
    except THttpRequester.RobotHttpException as exp:
        self.assertEqual(exp.http_code, 520)
    # NOTE(review): if no exception is raised the test passes vacuously —
    # if a block is always expected, add self.fail() after the request.
def test_video(self):
    """An mp4 file must be skipped by the requester: the body stays empty."""
    video_url = "https://www.w3schools.com/html/mov_bbb.mp4"
    normal_url, headers, data = THttpRequester.make_http_request(video_url, "GET")
    self.assertEqual(len(data), 0)