#!/usr/bin/env python3
import pickle

from tbselenium.tbdriver import TorBrowserDriver

tbpath = "tor-browser_en-US"

with open('onions.sav', 'rb') as f:
    potential_onions = pickle.load(f)
print("Loaded {} onions".format(len(potential_onions)))

driver = TorBrowserDriver(tbpath)
driver.set_page_load_timeout(60)

# Try to load each onion; keep the ones that load without raising.
good_onions = []
for onion in potential_onions:
    try:
        driver.load_url(onion)
        good_onions.append(onion)
    except Exception as e:
        print(e)

driver.quit()  # the original never closed the browser; release it here

print("Good onions")
for onion in good_onions:
    print(onion)

with open('good-onions.sav', 'wb') as f:
    pickle.dump(good_onions, f)
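# Hedged sketch, not part of the original script: the checker above opens a
# visible browser window. On a headless machine it could be wrapped in a
# virtual display with xvfbwrapper, the same way the benchmarking script
# further down does. Only the Xvfb wrapping is new here.
from xvfbwrapper import Xvfb

vdisplay = Xvfb()
vdisplay.start()
try:
    driver = TorBrowserDriver(tbpath)
    driver.set_page_load_timeout(60)
    # ... same load loop as above ...
    driver.quit()
finally:
    vdisplay.stop()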
import http.client
import pickle
import platform
import random
import re
import urllib.error
import urllib.parse
from ast import literal_eval
from os import SEEK_END, SEEK_SET, mkdir
from os.path import join
from sys import exc_info
from time import sleep
from urllib.request import urlopen

import stem
import stem.connection
import stem.util.log
from selenium.common.exceptions import TimeoutException
from stem.control import Controller
from stem.process import launch_tor_with_config
from tbselenium.common import USE_RUNNING_TOR
from tbselenium.tbdriver import TorBrowserDriver
from tbselenium.utils import start_xvfb, stop_xvfb

# Project-local helpers assumed to be in scope (they are not shown in this
# snippet): setup_logging, find_free_port, panic, get_timestamp,
# timestamp_file, symlink_cur_to_latest, _log_dir, _version,
# _sketchy_exceptions, and the Crawler*Error exception classes.


class Crawler:
    """Crawls your onions, but also manages Tor, drives Tor Browser, and uses
    information from your Tor cell log and stem to collect cell sequences."""

    def __init__(self,
                 take_ownership=True,  # Tor dies when the Crawler does
                 torrc_config={"CookieAuth": "1"},
                 tor_log="/var/log/tor/tor.log",
                 tor_cell_log="/var/log/tor/tor_cell_seq.log",
                 control_port=9051,
                 socks_port=9050,
                 run_in_xvfb=True,
                 tbb_path=join("/opt", "tbb", "tor-browser_en-US"),
                 tb_log_path=join(_log_dir, "firefox.log"),
                 tb_tor_cfg=USE_RUNNING_TOR,
                 page_load_timeout=20,
                 wait_on_page=5,
                 wait_after_closing_circuits=0,
                 restart_on_sketchy_exception=True,
                 additional_control_fields={},
                 db_handler=None):
        self.logger = setup_logging(_log_dir, "crawler")
        # Set stem logging level to INFO - "high level library activity"
        stem.util.log.get_logger().setLevel(stem.util.log.Runlevel.INFO)

        self.torrc_config = torrc_config
        self.socks_port = find_free_port(socks_port, control_port)
        self.torrc_config.update({"SocksPort": str(self.socks_port)})
        self.control_port = find_free_port(control_port, self.socks_port)
        self.torrc_config.update({"ControlPort": str(self.control_port)})
        self.torrc_config.update({"Log": "INFO file {}".format(tor_log)})
        self.logger.info("Starting tor process with config "
                         "{torrc_config}.".format(**locals()))
        self.tor_process = launch_tor_with_config(
            config=self.torrc_config,
            take_ownership=take_ownership)
        self.authenticate_to_tor_controlport()

        self.logger.info("Opening cell log stream...")
        self.cell_log = open(tor_cell_log, "rb")

        if run_in_xvfb:
            self.logger.info("Starting Xvfb...")
            self.run_in_xvfb = True
            self.virtual_framebuffer = start_xvfb()

        self.logger.info("Starting Tor Browser...")
        # Keep the paths around so restart_tb() can rebuild the driver
        # (the original referenced undefined module-level names there).
        self.tbb_path = tbb_path
        self.tb_log_path = tb_log_path
        self.tb_driver = TorBrowserDriver(tbb_path=tbb_path,
                                          tor_cfg=tb_tor_cfg,
                                          tbb_logfile_path=tb_log_path,
                                          socks_port=self.socks_port,
                                          control_port=self.control_port)

        self.wait_after_closing_circuits = wait_after_closing_circuits
        self.page_load_timeout = page_load_timeout
        self.tb_driver.set_page_load_timeout(page_load_timeout)
        self.wait_on_page = wait_on_page
        self.restart_on_sketchy_exception = restart_on_sketchy_exception

        self.control_data = self.get_control_data(page_load_timeout,
                                                  wait_on_page,
                                                  wait_after_closing_circuits,
                                                  additional_control_fields)
        self.db_handler = db_handler
        if db_handler:
            self.crawlid = self.db_handler.add_crawl(self.control_data)

    def authenticate_to_tor_controlport(self):
        self.logger.info("Authenticating to the tor controlport...")
        try:
            self.controller = Controller.from_port(port=self.control_port)
        except stem.SocketError as exc:
            panic("Unable to connect to tor on port {self.control_port}: "
                  "{exc}".format(**locals()))
        try:
            self.controller.authenticate()
        except stem.connection.MissingPassword:
            panic("Unable to authenticate to tor controlport. Please add "
                  "`CookieAuth 1` to your tor configuration file.")

    def get_control_data(self, page_load_timeout, wait_on_page,
                         wait_after_closing_circuits,
                         additional_control_fields):
        """Gather metadata about the crawler instance."""
        control_data = {}
        # Configuration settings
        control_data["page_load_timeout"] = page_load_timeout
        control_data["wait_on_page"] = wait_on_page
        control_data["wait_after_closing_circuits"] = \
            wait_after_closing_circuits
        if additional_control_fields:
            control_data.update(additional_control_fields)
        # System facts
        control_data["kernel"] = platform.system()
        control_data["kernel_version"] = platform.release()
        control_data["os"] = platform.version()
        control_data["python_version"] = platform.python_version()
        ip = urlopen("https://api.ipify.org").read().decode()
        control_data["ip"] = ip
        # This API seems to be unstable and we haven't found a suitable
        # alternative :(
        try:
            asn_geoip = urlopen("http://api.moocher.io/ip/{}".format(ip))
            asn_geoip = literal_eval(asn_geoip.read().decode())
            control_data["asn"] = asn_geoip.get("ip").get("as").get("asn")
            control_data["city"] = asn_geoip.get("ip").get("city")
            control_data["country"] = asn_geoip.get("ip").get("country")
        except urllib.error.HTTPError:
            self.logger.warning("Unable to query ASN API and thus some "
                                "control data may be missing from this run.")
        control_data["tor_version"] = self.controller.get_version().version_str
        control_data["tb_version"] = self.tb_driver.tb_version
        # Tor will have multiple entry nodes in its state file, but will
        # choose the first sequential one that is up as its entry guard.
        entry_nodes = self.controller.get_info("entry-guards").split('\n')
        control_data["entry_node"] = next(
            re.search("[0-9A-F]{40}", g).group(0)
            for g in entry_nodes if re.search("up", g))
        control_data["crawler_version"] = _version
        return control_data

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        # The original returned self here, which is truthy and would
        # silently swallow any in-flight exception.
        return False

    def __del__(self):
        self.close()

    def close(self):
        self.logger.info("Beginning Crawler exit process...")
        if "tb_driver" in dir(self):
            self.logger.info("Closing Tor Browser...")
            self.tb_driver.quit()
        if "virtual_framebuffer" in dir(self):
            self.logger.info("Closing the virtual framebuffer...")
            # A bug in pyvirtualdisplay triggers a KeyError exception when
            # closing a virtual framebuffer if the $DISPLAY environment
            # variable is not set.
            try:
                stop_xvfb(self.virtual_framebuffer)
            except KeyError:
                pass
        if "cell_log" in dir(self):
            self.logger.info("Closing the Tor cell stream...")
            self.cell_log.close()
        if "tor_process" in dir(self):
            self.logger.info("Killing the tor process...")
            self.tor_process.kill()
        self.logger.info("Crawler exit completed.")

    def collect_onion_trace(self, url, hsid=None, extra_fn=None,
                            trace_dir=None, iteration=0):
        """Crawl an onion service and collect a complete cell sequence for
        the activity at the time. Also, record additional information about
        the circuits with stem. Optionally, pass a function to execute
        additional actions after the page has loaded."""
        # TODO: create collect_trace method that works for regular sites as
        # well
        assert ".onion" in url, ("This method is only suitable for crawling "
                                 "onion services.")

        self.logger.info("{url}: closing existing circuits before starting "
                         "crawl.".format(**locals()))
        for circuit in self.controller.get_circuits():
            self.controller.close_circuit(circuit.id)

        sleep(self.wait_after_closing_circuits)

        if not trace_dir:
            trace_dir = self.make_ts_dir()
        trace_name = urllib.parse.quote(url, safe="") + "-" + str(iteration)
        trace_path = join(trace_dir, trace_name)

        start_idx = self.get_cell_log_pos()

        try:
            self.crawl_url(url)
            rend_circ_ids = self.get_rend_circ_ids(url)
            if extra_fn:
                self.execute_extra_fn(url, extra_fn, trace_path, start_idx)
        except CrawlerLoggedError:
            return "failed"
        except CrawlerNoRendCircError:
            self.save_debug_log(url, trace_path, start_idx)
            return "failed"
        except:
            self.logger.exception("{url}: unusual exception "
                                  "encountered:".format(**locals()))
            # Also log active circuit info
            self.controller.get_circuits()

            exc_type, exc_value, exc_traceback = exc_info()
            if exc_type in _sketchy_exceptions:
                self.save_debug_log(url, trace_path, start_idx)
                if self.restart_on_sketchy_exception:
                    self.restart_tb()
            return "failed"

        self.logger.info("{url}: saving full trace...".format(**locals()))
        end_idx = self.get_cell_log_pos()
        full_trace = self.get_full_trace(start_idx, end_idx)

        # Save the trace to the database or write to file
        if self.db_handler:
            try:
                new_example = {'hsid': hsid,
                               'crawlid': self.crawlid,
                               't_scrape': get_timestamp("db")}
            except NameError:
                panic("If using the database, and calling "
                      "collect_onion_trace directly, you must specify the "
                      "hsid of the site.")
            exampleid = self.db_handler.add_example(new_example)
            self.db_handler.add_trace(str(full_trace), exampleid)
        else:
            with open(trace_path + "-full", "wb") as fh:
                fh.write(full_trace)

        return "succeeded"

    def make_ts_dir(self, parent_dir=_log_dir, raw_dir_name="batch"):
        """Creates a timestamped folder to hold a group of traces."""
        raw_dirpath = join(parent_dir, raw_dir_name)
        ts = get_timestamp("log")
        ts_dir = timestamp_file(raw_dirpath, ts, is_dir=True)
        symlink_cur_to_latest(raw_dirpath, ts)
        with open(join(ts_dir, "control.pickle"), "wb") as fh:
            pickle.dump(self.control_data, fh)
        return ts_dir

    def get_cell_log_pos(self):
        """Returns the current position of the last byte in the Tor cell
        log."""
        return self.cell_log.seek(0, SEEK_END)

    def crawl_url(self, url):
        """Load a web page in Tor Browser and optionally pass a function to
        execute custom actions on it."""
        self.logger.info("{url}: starting page load...".format(**locals()))
        try:
            self.tb_driver.load_url(url, wait_on_page=self.wait_on_page,
                                    wait_for_page_body=True)
        except TimeoutException:
            self.logger.warning("{url}: timed out.".format(**locals()))
            raise CrawlerLoggedError
        except http.client.CannotSendRequest:
            self.logger.warning("{url}: cannot send request--improper "
                                "connection state.".format(**locals()))
            raise CrawlerLoggedError

        # Make sure we haven't just hit an error page or nothing loaded
        try:
            if (self.tb_driver.is_connection_error_page
                    or self.tb_driver.current_url == "about:newtab"):
                raise CrawlerReachedErrorPage
        except CrawlerReachedErrorPage:
            self.logger.warning("{url}: reached connection error "
                                "page.".format(**locals()))
            raise CrawlerLoggedError

        self.logger.info("{url}: successfully loaded.".format(**locals()))

    def get_rend_circ_ids(self, url):
        """Returns the rendezvous circuit id(s) associated with a given
        onion service."""
        self.logger.info("{url}: collecting circuit "
                         "information...".format(**locals()))
        active_circs = self.controller.get_circuits()
        rend_circ_ids = set()

        for circ in active_circs:
            if (circ.purpose == "HS_CLIENT_REND"
                    and circ.socks_username
                    and circ.socks_username in url):
                rend_circ_ids.add(circ.id)

        # If everything goes perfect, we should only see one. Multiple
        # indicate the first failed. Zero indicates one closed abruptly (or
        # there's an error with stem--still waiting on data to confirm or
        # deny).
        rend_circ_ct = len(rend_circ_ids)
        self.logger.info("{url}: {rend_circ_ct} associated rendezvous "
                         "circuits discovered.".format(**locals()))
        if rend_circ_ct == 0:
            raise CrawlerNoRendCircError

        return rend_circ_ids

    def execute_extra_fn(self, url, extra_fn, trace_path, start_idx):
        # extra_fn is now passed in explicitly; the original called an
        # undefined name here.
        self.logger.info("{url}: executing extra function "
                         "code...".format(**locals()))
        extra_fn(self, url, trace_path, start_idx)
        self.logger.info("{url}: extra function executed "
                         "successfully.".format(**locals()))

    def save_debug_log(self, url, trace_path, start_idx):
        self.logger.warning("{url}: saving debug log...".format(**locals()))
        exc_time = self.get_cell_log_pos()
        trace = self.get_full_trace(start_idx, exc_time)
        with open(trace_path + "@debug", "wb") as fh:
            fh.write(trace)

    def get_full_trace(self, start_idx, end_idx):
        """Returns the Tor DATA cells transmitted over a circuit during a
        specified time period."""
        # Sanity check
        assert start_idx >= 0 and end_idx > 0, ("Invalid (negative) logfile "
                                                "position")
        assert end_idx > start_idx, ("logfile section end_idx must come "
                                     "after start_idx")
        self.cell_log.seek(start_idx, SEEK_SET)
        return self.cell_log.read(end_idx - start_idx)

    def restart_tb(self):
        """Restarts the Tor Browser."""
        self.logger.info("Restarting the Tor Browser...")
        self.tb_driver.quit()
        self.tb_driver = TorBrowserDriver(tbb_path=self.tbb_path,
                                          tor_cfg=USE_RUNNING_TOR,
                                          tbb_logfile_path=self.tb_log_path,
                                          socks_port=self.socks_port,
                                          control_port=self.control_port)
        self.logger.info("Tor Browser restarted...")

    def collect_set_of_traces(self, url_set, extra_fn=None, trace_dir=None,
                              iteration=0, shuffle=True, retry=True,
                              url_to_id_mapping=None):
        """Collect a set of traces."""
        if self.db_handler:
            if not url_to_id_mapping:
                url_to_id_mapping = url_set
            trace_dir = None
        elif not trace_dir:
            trace_dir = self.make_ts_dir()

        set_size = len(url_set)
        self.logger.info("Saving set of {set_size} traces to "
                         "{trace_dir}.".format(**locals()))

        # Converts both sets (from pickle files) and dicts (whose keys are
        # URLs--from database) to URL lists
        url_set = list(url_set)
        if shuffle:
            random.shuffle(url_set)

        failed_urls = []
        for url_idx in range(set_size):
            self.logger.info("Collecting trace {} of "
                             "{set_size}...".format(url_idx + 1, **locals()))
            url = url_set[url_idx]
            if self.db_handler:
                hsid = url_to_id_mapping[url]
            else:
                hsid = None

            if (self.collect_onion_trace(url, hsid=hsid, extra_fn=extra_fn,
                                         trace_dir=trace_dir,
                                         iteration=iteration) == "failed"
                    and retry):
                failed_urls.append(url)

        if failed_urls:
            failed_ct = len(failed_urls)
            self.logger.info("Retrying {failed_ct} of {set_size} traces "
                             "that failed.".format(**locals()))
            self.collect_set_of_traces(failed_urls, extra_fn=extra_fn,
                                       trace_dir=trace_dir,
                                       iteration=iteration,
                                       shuffle=shuffle, retry=False,
                                       url_to_id_mapping=url_to_id_mapping)

    def crawl_monitored_nonmonitored(self, monitored_class,
                                     nonmonitored_class, extra_fn=None,
                                     shuffle=True, retry=True,
                                     monitored_name="monitored",
                                     nonmonitored_name="nonmonitored",
                                     url_to_id_mapping=None, ratio=40):
        """Crawl a monitored class ratio times interspersed between the
        crawling of a(n ostensibly larger) non-monitored class."""
        if self.db_handler:
            if not url_to_id_mapping:
                url_to_id_mapping = nonmonitored_class
                url_to_id_mapping.update(monitored_class)
            trace_dir, mon_trace_dir, nonmon_trace_dir = (None,) * 3
        else:
            trace_dir = self.make_ts_dir()
            mon_trace_dir = join(trace_dir, monitored_name)
            mkdir(mon_trace_dir)
            nonmon_trace_dir = join(trace_dir, nonmonitored_name)
            mkdir(nonmon_trace_dir)

        # db: calling list on a dict returns a list of its keys (URLs)
        # pickle: calling list on set is necessary to make it shuffleable
        nonmonitored_class = list(nonmonitored_class)
        monitored_class = list(monitored_class)
        nonmonitored_class_ct = len(nonmonitored_class)
        chunk_size = int(nonmonitored_class_ct / ratio)

        if shuffle:
            random.shuffle(nonmonitored_class)
            random.shuffle(monitored_class)

        for iteration in range(ratio):
            self.logger.info("Beginning iteration {i} of {ratio} in the "
                             "{monitored_name} class".format(i=iteration + 1,
                                                             **locals()))
            self.collect_set_of_traces(monitored_class,
                                       trace_dir=mon_trace_dir,
                                       iteration=iteration,
                                       url_to_id_mapping=url_to_id_mapping)

            slice_lb = iteration * chunk_size
            slice_ub = min((iteration + 1) * chunk_size,
                           nonmonitored_class_ct)
            self.logger.info("Crawling services {} through {slice_ub} of "
                             "{nonmonitored_class_ct} in the "
                             "{nonmonitored_name} "
                             "class".format(slice_lb + 1, **locals()))
            self.collect_set_of_traces(nonmonitored_class[slice_lb:slice_ub],
                                       trace_dir=nonmon_trace_dir,
                                       iteration=iteration,
                                       url_to_id_mapping=url_to_id_mapping)
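# Hedged usage sketch, not part of the original crawler module: one way the
# Crawler above might be driven against a pickled URL set. The file name
# "class-data.pickle" and its tuple layout are illustrative assumptions.
if __name__ == "__main__":
    with open("class-data.pickle", "rb") as fh:
        monitored, nonmonitored = pickle.load(fh)
    # __enter__/__exit__ make the class usable as a context manager, so tor,
    # Xvfb, and Tor Browser are torn down even if the crawl raises.
    with Crawler(page_load_timeout=30) as crawler:
        crawler.crawl_monitored_nonmonitored(monitored, nonmonitored,
                                             ratio=40)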
import csv
import errno
import os
import re
from time import sleep

import pyautogui
from bs4 import BeautifulSoup
from selenium.webdriver.common.keys import Keys
from tbselenium.tbdriver import TorBrowserDriver

# Project-local helpers assumed to be in scope (not shown in this snippet):
# usuarioUsadosLeer, usuarioUsadosReescrivir, datosDescarga, validarFormato,
# guardarNumeroDescargas, tiempoDescarga, informaiconPdf.
# Spanish print strings and comments have been translated to English;
# identifiers are kept as-is.


class DescargarPdf:
    def __init__(self):
        self.tbb_dir = "/usr/local/share/tor-browser_en-US"
        self.usuario = []
        self.contraseñaTxT = []
        self.conversor = '?convertedTo=pdf'

    def iniciarTor(self):
        # Start Tor Browser through tbselenium
        self.zLibraty = TorBrowserDriver(self.tbb_dir,
                                         tbb_logfile_path='test.log')

    def iniciarSecion(self):
        # Fill in the login form with the current email/password pair
        self.element = self.zLibraty.find_element_by_name("email")
        self.element.send_keys(self.correo)
        sleep(2)
        self.element2 = self.zLibraty.find_elements_by_class_name(
            "form-control")[1]
        self.element2.send_keys(self.contraseña)
        self.element2.send_keys(Keys.RETURN)

    def paginaDescargas(self):
        print("inside the paginaDescargas function")
        sleep(4)
        self.zLibraty.get(self.url)
        self.html = self.zLibraty.page_source

    def paginaPrinsipal(self, añoInicial, añoFinal):
        # Build the search URL for the given year range
        self.urlAños = ('http://zlibraryexau2g3p.onion/s/?yearFrom='
                        + str(añoInicial) + '&yearTo=' + str(añoFinal))
        self.url = self.urlAños

    def cambiarPagina(self, x):
        self.url += '&page=' + str(x)

    def Crearcsv(self):
        # Create the output folder (ignoring "already exists") and open two
        # CSV writers: one for book pages, one for download links.
        self.carpetaUrl = '/home/dgc7/Documentos/zlibrary/libros1920-1921/url'
        try:
            os.mkdir(self.carpetaUrl)
        except OSError as e:
            if e.errno != errno.EEXIST:
                raise
        self.escrivirUrlWed = csv.writer(open(
            '/home/dgc7/Documentos/zlibrary/libros1920-1921/url/url2.csv',
            'w'))
        self.imprimirUrlPdf = csv.writer(open(
            '/home/dgc7/Documentos/zlibrary/libros1920-1921/url/'
            'urlDowload2.csv', 'w'))

    def credenciales(self, numeroUsuario):
        print("made it here")
        self.correo = self.usuario[numeroUsuario]
        self.contraseña = self.contraseñaTxT[numeroUsuario]
        self.urlLoguin = 'http://zlibraryexau2g3p.onion'
        self.zLibraty.get(self.urlLoguin)

    def UsuariosYcontraseñas(self):
        # The credentials file alternates email and password lines
        self.dir = ('/home/dgc7/Documentos/zlibrary/credenciales/'
                    'contraseñasYcorreos.txt')
        self.data = open(self.dir, 'r+')
        for self.i in range(0, 200):
            if self.i % 2 == 0:
                self.usuario.append(self.data.readline())
            if self.i % 2 != 0:
                self.contraseñaTxT.append(self.data.readline())

    def urlPdf(self):
        self.boleanoPdf = 0
        self.respaldoContador = 0
        self.contadorUsuarios = usuarioUsadosLeer()
        self.contadorLibros = datosDescarga(4)
        self.contadorLibros2 = self.contadorLibros % 10
        self.Crearcsv()
        self.soup = BeautifulSoup(self.html, 'html.parser')
        try:
            for self.urlwed in self.soup.find_all(itemprop="name"):
                self.contador = 0
                self.urlwed = self.urlwed.find('a', href=re.compile(''))
                self.urlDowload = self.urlwed.get('href')
                self.urlpdfGeleneralH = re.sub('/book/',
                                               'https://b-ok.cc/book/',
                                               self.urlDowload)
                self.urlDowload = re.sub(
                    '/book/', 'http://zlibraryexau2g3p.onion/book/',
                    self.urlDowload)
                self.escrivirUrlWed.writerow([self.urlDowload])
                print(self.urlDowload)
                voleano = validarFormato(self.urlpdfGeleneralH)
                guardarNumeroDescargas(self.contadorLibros)
                print(self.respaldoContador)
                if self.contadorLibros == self.respaldoContador:
                    for self.urlRedirec in range(0, 1):
                        # Follow the book page to find the real download link
                        self.zLibraty.get(self.urlDowload)
                        sleep(5)
                        self.htmlPdf = self.zLibraty.page_source
                        self.soupRedirec = BeautifulSoup(self.htmlPdf,
                                                         'html.parser')
                        self.urlDowloadPDF = self.soupRedirec.find(
                            class_="btn btn-primary dlButton "
                                   "addDownloadedBook")
                        self.urlDowloadPDF = self.urlDowloadPDF.get('href')
                        self.urlDowloadPDF = re.sub(
                            '/dl/', 'http://zlibraryexau2g3p.onion/dl/',
                            self.urlDowloadPDF)
                        self.imprimirUrlPdf.writerow([self.urlDowloadPDF])
                        print(self.urlDowloadPDF)
                    print("heading into the if")
                    sleep(10)
                    if voleano:
                        self.zLibraty.set_page_load_timeout(8)
                        try:
                            self.zLibraty.get(self.urlDowloadPDF)
                        except Exception:
                            self.zLibraty.set_page_load_timeout(70)
                            self.zLibraty.refresh()
                        print("PDF works")
                        voleano = False
                        sleep(5)
                        self.contadorLibros += 1
                        self.contadorLibros2 += 1
                    else:
                        try:
                            self.zLibraty.set_page_load_timeout(5)
                            try:
                                self.zLibraty.get(self.urlDowloadPDF)
                            except Exception:
                                # Drive the download dialog with the keyboard
                                sleep(4)
                                pyautogui.press("down")
                                sleep(2)
                                pyautogui.press("enter")
                                self.zLibraty.set_page_load_timeout(70)
                        except Exception:
                            print("\nerror driving the keyboard to press "
                                  "enter\n")
                            raise
                        sleep(5)
                        self.zLibraty.refresh()
                        self.contadorLibros += 1
                        self.contadorLibros2 += 1
                    sleep(20)
                    tiempoDescarga()
                    informaiconPdf(self.urlpdfGeleneralH)
                    self.respaldoContador += 1
                if self.contadorLibros == self.respaldoContador:
                    if self.contadorLibros2 % 10 == 0:
                        print((self.contadorLibros2 - 1) % 10)
                        self.contador += 1
                        # Request a New Identity via Tor Browser's shortcut
                        pyautogui.hotkey("ctrl", "shift", "u")
                        sleep(2)
                        pyautogui.press("enter")
                        sleep(7)
                        pyautogui.press("enter")
                        sleep(15)
                        self.contadorUsuarios += 1
                        print(self.contadorUsuarios)
                        try:
                            self.zLibraty.switch_to_window(
                                self.zLibraty.window_handles[0])
                        except Exception:
                            print("error switching windows")
                        usuarioUsadosReescrivir(self.contadorUsuarios)
                        print("through here??????")
                        self.credenciales(self.contadorUsuarios)
                        print("not through here??????")
                        sleep(23)
                        self.iniciarSecion()
                        sleep(7)
                        self.contadorLibros2 = 0
                        sleep(15)
                        print("number of books for this user",
                              self.contadorLibros2)
                    if self.contador == 5:
                        self.contador = 0
        except OSError as e:
            print(e.strerror)
            print("error in urlPdf:::::")
            guardarNumeroDescargas(self.contadorLibros)
            usuarioUsadosReescrivir(self.contadorUsuarios)
            print(self.contadorLibros)
            raise
        print("finished the page")

    def DescargarContenido(self, _html):
        self.contenido = _html

    def serrarTor(self):
        self.zLibraty.close()
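# Hedged usage sketch (not in the original): the rough call order the class
# above appears to expect. The year range and user index are illustrative
# assumptions.
descargador = DescargarPdf()
descargador.iniciarTor()
descargador.UsuariosYcontraseñas()       # load email/password pairs
descargador.credenciales(0)              # open the login page as user 0
descargador.iniciarSecion()              # submit the login form
descargador.paginaPrinsipal(1920, 1921)  # search URL for 1920-1921
descargador.paginaDescargas()            # fetch the first results page
descargador.urlPdf()                     # scrape and download the PDFs
descargador.serrarTor()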
import os
import time
from os.path import join
from shutil import copyfile

from scapy.all import IP, PcapReader, wrpcap
from selenium.webdriver.common.keys import Keys
from xvfbwrapper import Xvfb

# Project-local names assumed to be in scope (not shown in this snippet):
# cm (common constants), ut (utils), wl_log, Sniffer, TorBrowserDriver,
# LOCALHOST_IP, and BAREBONE_HOME_PAGE.


class Visit(object):
    """Hold info about a particular visit to a page."""

    def __init__(self, batch_num, site_num, instance_num, page_url,
                 base_dir, tor_controller, bg_site=None,
                 experiment=cm.EXP_TYPE_WANG_AND_GOLDBERG, xvfb=False,
                 capture_screen=True):
        self.batch_num = batch_num
        self.site_num = site_num
        self.instance_num = instance_num
        self.page_url = page_url
        self.bg_site = bg_site
        self.experiment = experiment
        self.base_dir = base_dir
        self.visit_dir = None
        self.visit_log_dir = None
        self.tbb_version = cm.RECOMMENDED_TBB_VERSION
        self.capture_screen = capture_screen
        self.tor_controller = tor_controller
        self.xvfb = xvfb
        self.init_visit_dir()
        self.pcap_path = os.path.join(
            self.visit_dir, "{}.pcap".format(self.get_instance_name()))

        if self.xvfb and not cm.running_in_CI:
            wl_log.info("Starting Xvfb %sx%s" % (cm.XVFB_W, cm.XVFB_H))
            self.vdisplay = Xvfb(width=cm.XVFB_W, height=cm.XVFB_H)
            self.vdisplay.start()

        # Create new instance of TorBrowser driver
        TorBrowserDriver.add_exception(self.page_url)
        self.tb_driver = TorBrowserDriver(
            tbb_path=cm.TBB_PATH,
            tbb_logfile_path=join(self.visit_dir, "logs", "firefox.log"))
        self.sniffer = Sniffer()  # sniffer to capture the network traffic

    def init_visit_dir(self):
        """Create results and logs directories for this visit."""
        visit_name = str(self.instance_num)
        self.visit_dir = os.path.join(self.base_dir, visit_name)
        ut.create_dir(self.visit_dir)
        self.visit_log_dir = os.path.join(self.visit_dir, 'logs')
        ut.create_dir(self.visit_log_dir)

    def get_instance_name(self):
        """Construct and return a filename for the instance."""
        inst_file_name = '{}_{}_{}' \
            .format(self.batch_num, self.site_num, self.instance_num)
        return inst_file_name

    def filter_guards_from_pcap(self):
        """Rewrite the pcap so it keeps only traffic to/from guard IPs."""
        guard_ips = set(self.tor_controller.get_all_guard_ips())
        wl_log.debug("Found %s guards in the consensus.", len(guard_ips))
        orig_pcap = self.pcap_path + ".original"
        copyfile(self.pcap_path, orig_pcap)
        try:
            preader = PcapReader(orig_pcap)
            pcap_filtered = []
            for p in preader:
                if IP not in p:
                    pcap_filtered.append(p)
                    continue
                ip = p.payload
                if ip.dst in guard_ips or ip.src in guard_ips:
                    pcap_filtered.append(p)
            wrpcap(self.pcap_path, pcap_filtered)
        except Exception as e:
            wl_log.error("ERROR: filtering pcap file: %s. Check old pcap: %s",
                         e, orig_pcap)
        else:
            os.remove(orig_pcap)

    def post_crawl(self):
        pass
        # TODO: add some sanity checks?

    def cleanup_visit(self):
        """Kill sniffer and Tor Browser if they're running."""
        wl_log.info("Cleaning up visit.")
        wl_log.info("Cancelling timeout")
        ut.cancel_timeout()
        if self.sniffer and self.sniffer.is_recording:
            wl_log.info("Stopping sniffer...")
            self.sniffer.stop_capture()
        # remove non-tor traffic
        self.filter_guards_from_pcap()
        if self.tb_driver and self.tb_driver.is_running:
            # shutil.rmtree(self.tb_driver.prof_dir_path)
            wl_log.info("Quitting selenium driver...")
            self.tb_driver.quit()
        # close all open streams to prevent pollution
        self.tor_controller.close_all_streams()
        if self.xvfb and not cm.running_in_CI:
            wl_log.info("Stopping display...")
            self.vdisplay.stop()
        # after closing the driver and stopping the sniffer, run post_crawl
        self.post_crawl()

    def take_screenshot(self):
        try:
            out_png = os.path.join(self.visit_dir, 'screenshot.png')
            wl_log.info("Taking screenshot of %s to %s"
                        % (self.page_url, out_png))
            self.tb_driver.get_screenshot_as_file(out_png)
            if cm.running_in_CI:
                wl_log.debug("Screenshot data:image/png;base64,%s"
                             % self.tb_driver.get_screenshot_as_base64())
        except Exception:
            wl_log.info("Exception while taking screenshot of: %s"
                        % self.page_url)

    def get_wang_and_goldberg(self):
        """Visit the site according to Wang and Goldberg (WPES'13)
        settings."""
        ut.timeout(cm.HARD_VISIT_TIMEOUT)  # set timeout to stop the visit
        self.sniffer.start_capture(
            self.pcap_path,
            'tcp and not host %s and not tcp port 22 and not tcp port 20'
            % LOCALHOST_IP)
        time.sleep(cm.PAUSE_BETWEEN_INSTANCES)
        try:
            self.tb_driver.set_page_load_timeout(cm.SOFT_VISIT_TIMEOUT)
        except Exception:
            wl_log.info("Exception setting a timeout {}".format(
                self.page_url))

        wl_log.info("Crawling URL: {}".format(self.page_url))

        t1 = time.time()
        self.tb_driver.get(self.page_url)
        page_load_time = time.time() - t1
        wl_log.info("{} loaded in {} sec"
                    .format(self.page_url, page_load_time))
        time.sleep(cm.WAIT_IN_SITE)
        if self.capture_screen:
            self.take_screenshot()
        self.cleanup_visit()

    def get_multitab(self):
        """Open two tabs, use one to load a background site and the other to
        load the real site."""
        PAUSE_BETWEEN_TAB_OPENINGS = 0.5
        ut.timeout(cm.HARD_VISIT_TIMEOUT)  # set timeout to kill running procs
        # load a blank page - a page is needed to send keys to the browser
        self.tb_driver.get(BAREBONE_HOME_PAGE)
        self.sniffer.start_capture(
            self.pcap_path,
            'tcp and not host %s and not tcp port 22 and not tcp port 20'
            % LOCALHOST_IP)

        time.sleep(cm.PAUSE_BETWEEN_INSTANCES)
        try:
            self.tb_driver.set_page_load_timeout(cm.SOFT_VISIT_TIMEOUT)
        except Exception:
            wl_log.info("Exception setting a timeout {}".format(
                self.page_url))

        wl_log.info("Crawling URL: {} with {} in the background"
                    .format(self.page_url, self.bg_site))

        body = self.tb_driver.find_element_by_tag_name("body")
        body.send_keys(Keys.CONTROL + 't')  # open a new tab
        # now that the focus is on the address bar, load the background
        # site by "typing" it to the address bar and "pressing" ENTER (\n)
        # simulated by send_keys function
        body.send_keys('%s\n' % self.bg_site)

        # the delay between the loading of background and real sites
        time.sleep(PAUSE_BETWEEN_TAB_OPENINGS)
        body = self.tb_driver.find_element_by_tag_name("body")
        body.send_keys(Keys.CONTROL + 't')  # open a new tab

        t1 = time.time()
        self.tb_driver.get(self.page_url)  # load the real site in the 2nd tab
        page_load_time = time.time() - t1
        wl_log.info("{} loaded in {} sec"
                    .format(self.page_url, page_load_time))
        time.sleep(cm.WAIT_IN_SITE)
        if self.capture_screen:
            self.take_screenshot()
        self.cleanup_visit()

    def get(self):
        """Call the specific visit function depending on the experiment."""
        if self.experiment == cm.EXP_TYPE_WANG_AND_GOLDBERG:
            self.get_wang_and_goldberg()
        elif self.experiment == cm.EXP_TYPE_MULTITAB_ALEXA:
            self.get_multitab()
        else:
            raise ValueError("Cannot determine experiment type")
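# Hedged usage sketch (not in the original): how a single Visit might be
# driven. The controller value and URL are illustrative assumptions; the
# real crawler constructs these inside a batch loop.
tor_controller = ...  # a project TorController wired to the running tor
visit = Visit(batch_num=0, site_num=0, instance_num=0,
              page_url="https://check.torproject.org",
              base_dir="/tmp/results", tor_controller=tor_controller,
              experiment=cm.EXP_TYPE_WANG_AND_GOLDBERG, xvfb=True)
visit.get()  # dispatches to get_wang_and_goldberg() for this experiment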
from datetime import datetime

from tbselenium.tbdriver import TorBrowserDriver
from xvfbwrapper import Xvfb

tor_dir = '../tor-browser-patched/Primary/'

vdisplay = Xvfb()
vdisplay.start()

# open list of urls for testing
with open('alexa-top-1000.txt', 'r') as url_file:
    test_urls = url_file.readlines()

driver = TorBrowserDriver(tor_dir)  # , pref_dict=rfp)
driver.set_page_load_timeout(15)

# do 10 runs
uses = 0
notUses = 0
inconclusive = 0
for i, url in enumerate(test_urls):
    try:
        # request url from list
        # print("Fetching " + str(url), end='')
        url = 'https://' + url.strip()  # strip the newline readlines() keeps
        driver.get(url)
        # pull window.performance.timing after loading the page and add
        # information about url and number of run
        perf_timings = driver.execute_script(
            "return window.performance.getEntries()")
        # print(perf_timings)
        # (the snippet ends here in the original; the classification into
        # uses/notUses/inconclusive is not shown)
    except Exception as e:  # assumed handler so the truncated loop parses
        print(e)
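# Hedged continuation sketch (the original cuts off above): one plausible
# body for the end of the loop, recording per-resource entries with a
# timestamp. Field names follow the standard PerformanceEntry interface;
# run_log and the cleanup calls are assumptions, not the original code.
run_log = []
for entry in perf_timings:
    run_log.append({
        'run': i,
        'url': url,
        'name': entry.get('name'),
        'entryType': entry.get('entryType'),
        'duration': entry.get('duration'),
        'recorded_at': datetime.utcnow().isoformat(),
    })

# after the loop, release the browser and the virtual display
driver.quit()
vdisplay.stop()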
import csv
import errno
import os
import re
from time import sleep

import pyautogui
from bs4 import BeautifulSoup
from selenium.webdriver.common.keys import Keys
from tbselenium.tbdriver import TorBrowserDriver

# Later revision of the DescargarPdf class above: it logs in through
# singlelogin.org, uses load_url() instead of get(), and polls the
# about:downloads page to watch each download's progress. Project-local
# helpers assumed in scope: datosDescarga, usuarioUsadosLeer,
# usuarioUsadosReescrivir, validarFormato, guardarNumeroDescargas,
# informaiconPdf, contarNueroArchivos. Spanish strings and comments have
# been translated to English; identifiers are kept as-is.


class DescargarPdf:
    def __init__(self):
        self.contadorCredenciales = 0
        self.tbb_dir = "/usr/local/share/tor-browser_en-US"
        self.usuario = []
        self.contraseñaTxT = []
        self.conversor = '?convertedTo=pdf'

    def iniciarTor(self):
        self.zLibraty = TorBrowserDriver(self.tbb_dir,
                                         tbb_logfile_path='test.log')

    def iniciarSecion(self):
        self.element = self.zLibraty.find_element_by_name("email")
        self.element.send_keys(self.correo)
        sleep(2)
        self.element2 = self.zLibraty.find_elements_by_class_name(
            "form-control")[1]
        self.element2.send_keys(self.contraseña)
        self.element2.send_keys(Keys.RETURN)

    def paginaDescargas(self):
        print("inside the paginaDescargas function")
        self.zLibraty.load_url(self.url)
        sleep(4)
        self.html = self.zLibraty.page_source

    def paginaPrinsipal(self, añoInicial, añoFinal):
        self.urlAños = ('http://zlibraryexau2g3p.onion/s/?yearFrom='
                        + str(añoInicial) + '&yearTo=' + str(añoFinal))
        self.url = self.urlAños

    def cambiarPagina(self, x):
        print("inside cambiarPagina")
        self.url += '&page=' + str(x)
        print(self.url)

    def Crearcsv(self):
        desde = datosDescarga(1)
        asta = datosDescarga(2)
        self.carpetaUrl = ('/home/dd/Documentos/zlibrary/libros'
                           + str(desde) + '-' + str(asta) + '/url')
        try:
            os.mkdir(self.carpetaUrl)
        except OSError as e:
            if e.errno != errno.EEXIST:
                raise
        self.escrivirUrlWed = csv.writer(open(
            '/home/dd/Documentos/zlibrary/libros' + str(desde) + '-'
            + str(asta) + '/url/url2.csv', 'w'))
        self.imprimirUrlPdf = csv.writer(open(
            '/home/dd/Documentos/zlibrary/libros' + str(desde) + '-'
            + str(asta) + '/url/urlDowload2.csv', 'w'))

    def credenciales(self, numeroUsuario):
        print("made it here")
        if self.contadorCredenciales == 0 or self.contadorCredenciales == 20:
            # Go through the singlelogin.org redirect flow with the keyboard
            self.zLibraty.load_url("https://singlelogin.org/")
            self.zLibraty.find_element_by_name("redirectToHost").click()
            sleep(3)
            pyautogui.press("down")
            sleep(2)
            pyautogui.press("down")
            sleep(1)
            pyautogui.press("enter")
            sleep(5)
        self.correo = self.usuario[numeroUsuario]
        self.contraseña = self.contraseñaTxT[numeroUsuario]

    def UsuariosYcontraseñas(self):
        # The credentials file alternates email and password lines
        self.dir = ('/home/dd/Documentos/zlibrary/credenciales/'
                    'contraseñasYcorreos.txt')
        self.data = open(self.dir, 'r+')
        for self.i in range(0, 200):
            if self.i % 2 == 0:
                self.usuario.append(self.data.readline())
            if self.i % 2 != 0:
                self.contraseñaTxT.append(self.data.readline())

    def urlPdf(self):
        self.contadorCredenciales = 1
        self.boleanoPdf = 0
        self.respaldoContador = 0
        self.contadorUsuarios = usuarioUsadosLeer()
        self.contadorLibros = datosDescarga(4)
        self.contadorLibros2 = self.contadorLibros % 10
        self.Crearcsv()
        self.soup = BeautifulSoup(self.html, 'html.parser')
        try:
            for self.urlwed in self.soup.find_all(itemprop="name"):
                self.contador = 0
                self.urlwed = self.urlwed.find('a', href=re.compile(''))
                self.urlDowload = self.urlwed.get('href')
                self.urlpdfGeleneralH = re.sub('/book/',
                                               'https://b-ok.cc/book/',
                                               self.urlDowload)
                self.urlDowload = re.sub(
                    '/book/', 'http://zlibraryexau2g3p.onion/book/',
                    self.urlDowload)
                self.escrivirUrlWed.writerow([self.urlDowload])
                print(self.urlDowload)
                self.voleano = validarFormato(self.urlpdfGeleneralH)
                guardarNumeroDescargas(self.contadorLibros)
                print(self.respaldoContador)
                if self.contadorLibros == self.respaldoContador:
                    for self.urlRedirec in range(0, 1):
                        self.zLibraty.load_url(self.urlDowload)
                        sleep(5)
                        self.htmlPdf = self.zLibraty.page_source
                        self.soupRedirec = BeautifulSoup(self.htmlPdf,
                                                         'html.parser')
                        self.urlDowloadPDF = self.soupRedirec.find(
                            class_="btn btn-primary dlButton "
                                   "addDownloadedBook")
                        self.urlDowloadPDF = self.urlDowloadPDF.get('href')
                        self.urlDowloadPDF = re.sub(
                            '/dl/', 'http://zlibraryexau2g3p.onion/dl/',
                            self.urlDowloadPDF)
                        self.imprimirUrlPdf.writerow([self.urlDowloadPDF])
                        print(self.urlDowloadPDF)
                    print("heading into the if")
                    sleep(15)
                    if self.voleano:
                        self.zLibraty.set_page_load_timeout(12)
                        try:
                            self.zLibraty.load_url(self.urlDowloadPDF)
                        except Exception:
                            sleep(5)
                            self.zLibraty.set_page_load_timeout(7000)
                        print("PDF works")
                        self.voleano = False
                        sleep(5)
                        self.contadorLibros += 1
                        self.contadorLibros2 += 1
                    else:
                        self.zLibraty.set_page_load_timeout(12)
                        try:
                            self.zLibraty.load_url(self.urlDowloadPDF)
                        except Exception:
                            sleep(8)
                            pyautogui.press("down")
                            sleep(2)
                            pyautogui.press("enter")
                            self.zLibraty.set_page_load_timeout(7000)
                        sleep(5)
                        self.contadorLibros += 1
                        self.contadorLibros2 += 1
                    # Watch the download's progress from about:downloads
                    self.zLibraty.load_url("about:downloads")
                    self.datosEsperaDescarga()
                    self.peticiones()
                    self.zLibraty.back()
                    informaiconPdf(self.urlpdfGeleneralH)
                    guardarNumeroDescargas(self.contadorLibros)
                    self.respaldoContador += 1
                if self.contadorLibros == self.respaldoContador:
                    if self.contadorLibros2 % 10 == 0:
                        print((self.contadorLibros2 - 1) % 10)
                        self.contador += 1
                        if self.contadorLibros == 20:
                            self.contadorCredenciales = 20
                            print("logging out!!!")
                            pyautogui.moveTo(1707, 245)
                            pyautogui.hotkey("ctrl", "shift", "u")
                            sleep(2)
                            pyautogui.press("enter")
                            sleep(7)
                            pyautogui.press("enter")
                            sleep(15)
                        else:
                            print("logging out")
                            self.zLibraty.get(
                                "http://zlibraryexau2g3p.onion/logout.php")
                        self.contadorUsuarios += 1
                        print(self.contadorUsuarios)
                        try:
                            self.zLibraty.switch_to_window(
                                self.zLibraty.window_handles[0])
                        except Exception:
                            print("error switching windows")
                        usuarioUsadosReescrivir(self.contadorUsuarios)
                        print("through here??????")
                        self.credenciales(self.contadorUsuarios)
                        self.contadorCredenciales = 1
                        print("not through here??????")
                        sleep(20)
                        self.iniciarSecion()
                        sleep(15)
                        self.paginaDescargas()
                        sleep(7)
                        self.contadorLibros2 = 0
                        sleep(15)
                        print("number of books for this user",
                              self.contadorLibros2)
                    if self.contador == 5:
                        self.contador = 0
        except OSError as e:
            print(e.strerror)
            print("error in urlPdf:::::")
            guardarNumeroDescargas(self.contadorLibros)
            usuarioUsadosReescrivir(self.contadorUsuarios)
            print(self.contadorLibros)
            archivos = int(contarNueroArchivos())
            print(archivos)
            self.zLibraty.load_url("about:downloads")
            self.datosEsperaDescarga()
            self.peticiones()
            self.zLibraty.back()
            informaiconPdf(self.urlpdfGeleneralH)

    def DescargarContenido(self, _html):
        self.contenido = _html

    def serrarTor(self):
        self.zLibraty.close()

    def datosEsperaDescarga(self):
        # Snapshot about:downloads so peticiones() can parse the progress bar
        sleep(4)
        self.htmlValidador = self.zLibraty.page_source

    def validarDescarga(self):
        # Check the download's status text and click retry if it failed
        self.htmlFalce = self.zLibraty.page_source
        self.soupFalce = BeautifulSoup(self.htmlFalce, "html.parser")
        self.validarfalce = self.soupFalce.find_all(
            "description", class_="downloadDetails downloadDetailsNormal")
        self.respuestafalce = re.search("value=.+", str(self.validarfalce))
        self.buscarFalse = self.respuestafalce.group()
        if re.search("Canceled", self.buscarFalse):
            print("the download broke =(")
            sleep(5)
            pyautogui.click(1393, 139)
            sleep(5)
        else:
            if re.search("Failed", self.buscarFalse):
                print("the download broke, but we are going to fix it =(")
                sleep(5)
                pyautogui.click(1393, 139)
                sleep(5)
            else:
                print("the download is going fine =)")

    def peticiones(self):
        # Poll the progress bar in about:downloads until it reaches 100%
        self.validarDescarga()
        self.carga = 0
        self.daño = 0
        self.conteo = 0
        while self.carga < 100:
            self.soup = BeautifulSoup(self.htmlValidador, "html.parser")
            try:
                self.archivoDescarga = self.soup.find_all(
                    "progress", class_="downloadProgress")
                self.respaldo = re.split("value", str(self.archivoDescarga))
                self.tiempo = re.search("[0-9]+", self.respaldo[1])
                print(self.tiempo.group())
                self.carga = int(self.tiempo.group())
                self.datosEsperaDescarga()
                sleep(3)
                self.validarDescarga()
                if self.conteo == 3:
                    pyautogui.press("enter")
                    self.conteo = 0
            except Exception:
                print("oh no, the download broke and I have not been able "
                      "to restart it")
                if self.daño == 7:
                    os.system('rm -r /home/dd/zlibros/libros1920-1921/'
                              'libro/*.*')
                    raise
                self.daño += 1
                sleep(5)
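# Hedged usage sketch (not in the original): the call order this revision
# appears to expect. The year range, page number, and user index are
# illustrative assumptions; note cambiarPagina() appends to the URL rather
# than replacing the page parameter.
descargador = DescargarPdf()
descargador.iniciarTor()
descargador.UsuariosYcontraseñas()       # load email/password pairs
descargador.credenciales(0)              # singlelogin.org redirect flow
descargador.iniciarSecion()
descargador.paginaPrinsipal(1920, 1921)
descargador.cambiarPagina(1)             # appends &page=1 to the search URL
descargador.paginaDescargas()
descargador.urlPdf()
descargador.serrarTor()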