def request_the_same(url):
    # the second download of the same url must be served from the cache,
    # so the GET counter must not grow
    global HTTP_GET_REQUESTS_COUNT
    file1 = TDownloadedFile(url)
    assert HTTP_GET_REQUESTS_COUNT == 1
    file2 = TDownloadedFile(url)
    assert HTTP_GET_REQUESTS_COUNT == 1
def copy_files(args, toloka_results):
    assert args.positive_folder is not None
    assert args.negative_folder is not None
    with TRobotProject(args.project, ROBOT_STEPS) as project:
        project.read_project(fetch_morda_url=False)
        office_info = project.offices[0]
        index = 0
        domain = strip_html_url(office_info.morda_url)
        for export_record in office_info.exported_files:
            index += 1
            cached_file = export_record['cached_file']
            url = export_record['url']
            print()
            extension = TDownloadedFile(url).file_extension
            out_file = "{}_{}_{}{}".format(domain, index, int(time.time()), extension)
            # route the file according to the toloka verdict
            tol_res = toloka_results.get(cached_file)
            if tol_res == "YES":
                folder = args.positive_folder
            elif tol_res == "NO":
                folder = args.negative_folder
            else:
                folder = None
            if folder is not None:
                out_file = os.path.join(folder, out_file)
                print("{} -> {}".format(url, out_file))
                shutil.copy(cached_file, out_file)
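# A minimal sketch of the argument parsing that copy_files above expects: the
# destination attributes mirror the names used in copy_files, but the exact
# flag spellings are assumptions, not taken from the original script.
import argparse

def parse_copy_files_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--project", dest='project', required=True)
    parser.add_argument("--positive-folder", dest='positive_folder', required=True)
    parser.add_argument("--negative-folder", dest='negative_folder', required=True)
    return parser.parse_args()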
def add_link_wrapper(self, link_info: TLinkInfo):
    assert link_info.target_url is not None
    try:
        downloaded_file = TDownloadedFile(link_info.target_url)
    except RobotHttpException as err:
        self.logger.error(err)
        return

    href = link_info.target_url
    self.website.url_nodes[link_info.source_url].add_child_link(href, link_info.to_json())

    # keep the maximal weight seen for this url on the current step
    link_info.weight = max(link_info.weight, self.step_urls[href])
    self.step_urls[href] = link_info.weight

    if href not in self.website.url_nodes:
        if link_info.target_title is None and downloaded_file.file_extension == DEFAULT_HTML_EXTENSION:
            link_info.target_title = get_html_title(downloaded_file.data)
        self.website.url_nodes[href] = TUrlInfo(title=link_info.target_title, step_name=self.get_step_name())
    self.website.url_nodes[href].parent_nodes.add(link_info.source_url)

    if self.is_last_step():
        self.website.export_env.export_file_if_relevant(downloaded_file, link_info)

    if self.step_passport.get('transitive', False):
        if href not in self.processed_pages:
            if downloaded_file.file_extension == DEFAULT_HTML_EXTENSION:
                # on transitive steps newly found html pages are crawled further
                self.pages_to_process[href] = link_info.weight

    self.logger.debug("add link {0}".format(href))
def request_timeouted(url):
    got_timeout_exception = False
    try:
        TDownloadedFile(url)
    except RobotHttpException:
        got_timeout_exception = True
    assert got_timeout_exception
def request_too_many_404(url):
    codes = list()
    for i in range(4):
        try:
            TDownloadedFile(url)
        except RobotHttpException as exp:
            codes.append(exp.http_code)
    # three plain 404 answers, then the request is rejected with 429 (Too Many Requests)
    assert codes == [404, 404, 404, 429]
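# A minimal test-server sketch consistent with request_too_many_404 above,
# under the assumption that the server itself answers 404 three times and then
# starts rate-limiting with 429 (it may equally be the robot's request policy
# that synthesizes the 429); the handler name and counter are hypothetical.
import http.server

class T404ThenRateLimitHandler(http.server.BaseHTTPRequestHandler):
    request_count = 0

    def do_GET(self):
        T404ThenRateLimitHandler.request_count += 1
        # the first three requests get 404, the fourth one gets 429
        code = 404 if T404ThenRateLimitHandler.request_count <= 3 else 429
        self.send_response(code)
        self.end_headers()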
def add_page_links(self, url, use_selenium=True, use_urllib=True):
    try:
        downloaded_file = TDownloadedFile(url)
    except RobotHttpException as err:
        self.logger.error(err)
        return
    if downloaded_file.file_extension != DEFAULT_HTML_EXTENSION:
        return

    html_parser = None
    already_processed_by_urllib = None
    try:
        if use_urllib:
            html_parser = THtmlParser(downloaded_file.data)
            already_processed_by_urllib = self.website.find_a_web_page_with_a_similar_html(
                self, url, html_parser.html_text)
    except Exception as e:
        self.logger.error('cannot parse html of {}, exception: {}'.format(url, e))
        return

    try:
        if use_urllib and already_processed_by_urllib is None:
            find_links_in_html_by_text(self, url, html_parser)
        else:
            if use_urllib:
                self.logger.debug(
                    'skip processing {} in find_links_in_html_by_text, a similar file was already '
                    'processed on this step: {}'.format(url, already_processed_by_urllib))
            if not use_selenium and len(list(html_parser.soup.findAll('a'))) < 10:
                self.logger.debug('switch on selenium temporarily, since this page may be built entirely by javascript')
                use_selenium = True

        if use_selenium:  # switching off selenium is almost a panic mode (too many links)
            if downloaded_file.get_file_extension_only_by_headers() != DEFAULT_HTML_EXTENSION:
                # selenium reads only http headers, while downloaded_file.file_extension
                # is determined by the file contents and can still be DEFAULT_HTML_EXTENSION
                self.logger.debug("do not browse {} with selenium, since it has wrong http headers".format(url))
            else:
                click_all_selenium(self, url, self.website.parent_project.selenium_driver)
    except (RobotHttpException, WebDriverException, InvalidSwitchToTargetException) as e:
        self.logger.error('add_links failed on url={}, exception: {}'.format(url, e))
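# A hypothetical driver loop for add_page_links: pages_to_process and
# processed_pages are the structures filled by add_link_wrapper above, while
# the method name process_pages and the weight-ordered traversal are
# assumptions, not taken from the original class.
def process_pages(self):
    while len(self.pages_to_process) > 0:
        # crawl the page with the highest link weight first
        url = max(self.pages_to_process, key=self.pages_to_process.get)
        del self.pages_to_process[url]
        self.processed_pages.add(url)
        self.add_page_links(url)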
def create_toloka_pool(project_path, toloka_stream):
    with TRobotProject(logging, project_path, ROBOT_STEPS, None) as project:
        project.read_project(fetch_morda_url=False)
        office_info = project.offices[0]
        toloka_stream.write("INPUT:url\tINPUT:file_link\tINPUT:file_extension\tINPUT:html\n")
        ec = TExternalConverters()
        cnt = 0
        all_files = 0
        for export_record in office_info.exported_files:
            all_files += 1
            sys.stderr.write("{}/{}\n".format(all_files, len(office_info.exported_files)))
            sys.stderr.flush()
            url = export_record['url']
            cached_file = export_record['cached_file']
            extension = TDownloadedFile(url).file_extension
            temp_file = "dummy" + extension
            shutil.copy(cached_file, temp_file)
            html = ec.convert_to_html_with_soffice(temp_file)
            os.unlink(temp_file)
            if html is not None:
                # tsv cells must not contain tabs or line breaks
                html = html.replace("\t", " ").replace("\n", " ").replace("\r", " ")
                toloka_stream.write("\t".join((url, cached_file, extension, html)) + "\n")
                cnt += 1
        sys.stderr.write("written {} lines out of {}\n".format(cnt, all_files))
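# A minimal usage sketch for create_toloka_pool; the project file name
# "office.txt" and the output name "toloka_pool.tsv" are illustrative only.
if __name__ == '__main__':
    with open("toloka_pool.tsv", "w", encoding="utf8") as toloka_stream:
        create_toloka_pool("office.txt", toloka_stream)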
    fh.setLevel(logging.DEBUG)
    fh.setFormatter(formatter)
    logger.addHandler(fh)
    return logger


if __name__ == '__main__':
    logger = setup_logging()
    TDownloadEnv.clear_cache_folder()
    web_addr = sys.argv[1]
    host, port = web_addr.split(":")
    HTTP_SERVER = http.server.HTTPServer((host, int(port)), THttpServer)
    server_thread = threading.Thread(target=start_server)
    server_thread.start()
    time.sleep(1)

    url = web_addr + "/somepath"
    wrong_extension = get_file_extension_only_by_headers(url)
    assert wrong_extension == ".doc"  # see minvr.ru for this error

    downloaded_file = TDownloadedFile(url)
    right_extension = downloaded_file.file_extension  # read the file contents to determine its type
    assert right_extension == DEFAULT_HTML_EXTENSION
    assert HTTP_GET_REQUESTS_COUNT == 1
    assert HTTP_HEAD_REQUESTS_COUNT == 1

    # test redirects
    dummy1, dummy2, data = make_http_request_urllib(logger, web_addr + "/redirect1", "GET")
    assert data.decode('utf8').startswith("<html>")

    HTTP_SERVER.shutdown()
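# A hypothetical sketch of the THttpServer request handler that the test above
# plugs into http.server.HTTPServer: HEAD advertises a wrong content type (the
# minvr.ru error), while the GET body is real html, so only reading the content
# reveals the true extension. The redirect endpoints used above are omitted and
# the mapping "application/msword" -> ".doc" is an assumption.
import http.server

HTTP_GET_REQUESTS_COUNT = 0   # module-level counters as used in the asserts above
HTTP_HEAD_REQUESTS_COUNT = 0

class THttpServerSketch(http.server.BaseHTTPRequestHandler):
    def do_HEAD(self):
        global HTTP_HEAD_REQUESTS_COUNT
        HTTP_HEAD_REQUESTS_COUNT += 1
        self.send_response(200)
        self.send_header("Content-Type", "application/msword")  # wrong header
        self.end_headers()

    def do_GET(self):
        global HTTP_GET_REQUESTS_COUNT
        HTTP_GET_REQUESTS_COUNT += 1
        self.send_response(200)
        self.send_header("Content-Type", "application/msword")
        self.end_headers()
        self.wfile.write(b"<html><body>some text</body></html>")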
    logger.setLevel(logging.DEBUG)

    # create a formatter and add it to the handlers
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    if os.path.exists(logfilename):
        os.remove(logfilename)

    # create a file handler which logs even debug messages
    fh = logging.FileHandler(logfilename, encoding="utf8")
    fh.setLevel(logging.DEBUG)
    fh.setFormatter(formatter)
    logger.addHandler(fh)
    return logger


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--url", dest='url', required=True)
    return parser.parse_args()


if __name__ == '__main__':
    logger = setup_logging("dlrobot.log")
    args = parse_args()
    TDownloadEnv.clear_cache_folder()

    # see https://stackoverflow.com/questions/38015537/python-requests-exceptions-sslerror-dh-key-too-small
    # for http://primorie.fas.gov.ru
    TRequestPolicy.ENABLE = False

    file = TDownloadedFile(args.url)
    assert file is not None
def init_morda_url_if_necessary(self):
    # create the root node of the url graph (morda_url is the site's main page)
    # if the project was not initialized before
    if len(self.url_nodes) == 0:
        title = get_html_title(TDownloadedFile(self.morda_url).data)
        self.url_nodes[self.morda_url] = TUrlInfo(title=title)