Example #1
#!/usr/bin/env python3
from tbselenium.tbdriver import TorBrowserDriver
import pickle

tbpath = "tor-browser_en-US"

with open('onions.sav', 'rb') as f:
    potential_onions = pickle.load(f)

print("Loaded {} onions".format(len(potential_onions)))

driver = TorBrowserDriver(tbpath)
driver.set_page_load_timeout(60)

good_onions = []

for onion in potential_onions:
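    # A successful load within the 60-second timeout keeps the onion; any
    # exception (timeout, unreachable service, etc.) discards it.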
    try:
        driver.load_url(onion)
        good_onions.append(onion)
    except Exception as e:
        print(e)

print("Good onions")
for onion in good_onions:
    print(onion)

with open('good-onions.sav', 'wb') as f:
    pickle.dump(good_onions, f)

# Shut down Tor Browser now that the results are saved
driver.quit()
Example #2
class Crawler:
    """Crawls your onions, but also manages Tor, drives Tor Browser, and uses
    information from your Tor cell log and stem to collect cell sequences."""
    def __init__(
            self,
            take_ownership=True,  # Tor dies when the Crawler does
            torrc_config={"CookieAuth": "1"},
            tor_log="/var/log/tor/tor.log",
            tor_cell_log="/var/log/tor/tor_cell_seq.log",
            control_port=9051,
            socks_port=9050,
            run_in_xvfb=True,
            tbb_path=join("/opt", "tbb", "tor-browser_en-US"),
            tb_log_path=join(_log_dir, "firefox.log"),
            tb_tor_cfg=USE_RUNNING_TOR,
            page_load_timeout=20,
            wait_on_page=5,
            wait_after_closing_circuits=0,
            restart_on_sketchy_exception=True,
            additional_control_fields={},
            db_handler=None):

        self.logger = setup_logging(_log_dir, "crawler")
        # Set stem logging level to INFO - "high level library activity"
        stem.util.log.get_logger().setLevel(stem.util.log.Runlevel.INFO)

        self.torrc_config = torrc_config
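        # find_free_port presumably returns a usable port (avoiding the one passed
        # as its second argument); the chosen SOCKS/control ports are written into
        # the torrc so the launched tor process and Tor Browser agree on them.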
        self.socks_port = find_free_port(socks_port, control_port)
        self.torrc_config.update({"SocksPort": str(self.socks_port)})
        self.control_port = find_free_port(control_port, self.socks_port)
        self.torrc_config.update({"ControlPort": str(self.control_port)})
        self.torrc_config.update({"Log": "INFO file {}".format(tor_log)})
        self.logger.info("Starting tor process with config "
                         "{torrc_config}.".format(**locals()))
        self.tor_process = launch_tor_with_config(
            config=self.torrc_config, take_ownership=take_ownership)
        self.authenticate_to_tor_controlport()

        self.logger.info("Opening cell log stream...")
        self.cell_log = open(tor_cell_log, "rb")

        if run_in_xvfb:
            self.logger.info("Starting Xvfb...")
            self.run_in_xvfb = True
            self.virtual_framebuffer = start_xvfb()

        self.logger.info("Starting Tor Browser...")
        self.tb_driver = TorBrowserDriver(tbb_path=tbb_path,
                                          tor_cfg=tb_tor_cfg,
                                          tbb_logfile_path=tb_log_path,
                                          socks_port=self.socks_port,
                                          control_port=self.control_port)

        self.wait_after_closing_circuits = wait_after_closing_circuits
        self.page_load_timeout = page_load_timeout
        self.tb_driver.set_page_load_timeout(page_load_timeout)
        self.wait_on_page = wait_on_page
        self.restart_on_sketchy_exception = restart_on_sketchy_exception

        self.control_data = self.get_control_data(page_load_timeout,
                                                  wait_on_page,
                                                  wait_after_closing_circuits,
                                                  additional_control_fields)
        self.db_handler = db_handler
        if db_handler:
            self.crawlid = self.db_handler.add_crawl(self.control_data)

    def authenticate_to_tor_controlport(self):
        self.logger.info("Authenticating to the tor controlport...")
        try:
            self.controller = Controller.from_port(port=self.control_port)
        except stem.SocketError as exc:
            panic("Unable to connect to tor on port {self.control_port}: "
                  "{exc}".format(**locals()))
        try:
            self.controller.authenticate()
        except stem.connection.MissingPassword:
            panic("Unable to authenticate to tor controlport. Please add "
                  "`CookieAuth 1` to your tor configuration file.")

    def get_control_data(self, page_load_timeout, wait_on_page,
                         wait_after_closing_circuits,
                         additional_control_fields):
        """Gather metadata about the crawler instance."""
        control_data = {}
        # Configuration settings
        control_data["page_load_timeout"] = page_load_timeout
        control_data["wait_on_page"] = wait_on_page
        control_data["wait_after_closing_circuits"] = \
                wait_after_closing_circuits
        if additional_control_fields:
            control_data.update(additional_control_fields)
        # System facts
        control_data["kernel"] = platform.system()
        control_data["kernel_version"] = platform.release()
        control_data["os"] = platform.version()
        control_data["python_version"] = platform.python_version()
        ip = urlopen("https://api.ipify.org").read().decode()
        control_data["ip"] = ip
        # This API seems to be unstable and we haven't found a suitable
        # alternative :(
        try:
            asn_geoip = urlopen("http://api.moocher.io/ip/{}".format(ip))
            asn_geoip = literal_eval(asn_geoip.read().decode())
            control_data["asn"] = asn_geoip.get("ip").get("as").get("asn")
            control_data["city"] = asn_geoip.get("ip").get("city")
            control_data["country"] = asn_geoip.get("ip").get("country")
        except urllib.error.HTTPError:
            self.logger.warning("Unable to query ASN API and thus some "
                                "control data may be missing from this run.")
        control_data["tor_version"] = self.controller.get_version().version_str
        control_data["tb_version"] = self.tb_driver.tb_version
        # Tor will have multiple entry nodes in its state file, but will
        # choose the first sequential one that is up as its entry guard.
        entry_nodes = self.controller.get_info("entry-guards").split('\n')
        control_data["entry_node"] = next(
            re.search("[0-9A-F]{40}", g).group(0) for g in entry_nodes
            if re.search("up", g))
        control_data["crawler_version"] = _version
        return control_data

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return self

    def __del__(self):
        self.close()

    def close(self):
        self.logger.info("Beginning Crawler exit process...")
        if "tb_driver" in dir(self):
            self.logger.info("Closing Tor Browser...")
            self.tb_driver.quit()
        if "virtual_framebuffer" in dir(self):
            self.logger.info("Closing the virtual framebuffer...")
            # A bug in pyvirtualdisplay triggers a KeyError exception when closing a
            # virtual framebuffer if the $DISPLAY environment variable is not set.
            try:
                stop_xvfb(self.virtual_framebuffer)
            except KeyError:
                pass
        if "cell_log" in dir(self):
            self.logger.info("Closing the Tor cell stream...")
            self.cell_log.close()
        if "tor_process" in dir(self):
            self.logger.info("Killing the tor process...")
            self.tor_process.kill()
        self.logger.info("Crawler exit completed.")

    def collect_onion_trace(self,
                            url,
                            hsid=None,
                            extra_fn=None,
                            trace_dir=None,
                            iteration=0):
        """Crawl an onion service and collect a complete cell sequence for the
        activity at the time. Also, record additional information about the
        circuits with stem. Optionally, pass a function to execute additional
        actions after the page has loaded."""
        # Todo: create collect_trace method that works for regular sites as
        # well
        assert ".onion" in url, ("This method is only suitable for crawling "
                                 "onion services.")

        self.logger.info("{url}: closing existing circuits before starting "
                         "crawl.".format(**locals()))
        for circuit in self.controller.get_circuits():
            self.controller.close_circuit(circuit.id)

        sleep(self.wait_after_closing_circuits)

        if not trace_dir:
            trace_dir = self.make_ts_dir()
        trace_name = urllib.parse.quote(url, safe="") + "-" + str(iteration)
        trace_path = join(trace_dir, trace_name)

        start_idx = self.get_cell_log_pos()

        try:
            self.crawl_url(url)
            rend_circ_ids = self.get_rend_circ_ids(url)
            if extra_fn:
                self.execute_extra_fn(extra_fn, url, trace_path, start_idx)
        except CrawlerLoggedError:
            return "failed"
        except CrawlerNoRendCircError:
            self.save_debug_log(url, trace_path, start_idx)
            return "failed"
        except:
            self.logger.exception("{url}: unusual exception "
                                  "encountered:".format(**locals()))
            # Also log active circuit info
            self.controller.get_circuits()

            exc_type, exc_value, exc_traceback = exc_info()
            if exc_type in _sketchy_exceptions:
                self.save_debug_log(url, trace_path, start_idx)
                if self.restart_on_sketchy_exception:
                    self.restart_tb()

            return "failed"

        self.logger.info("{url}: saving full trace...".format(**locals()))
        end_idx = self.get_cell_log_pos()
        full_trace = self.get_full_trace(start_idx, end_idx)

        # Save the trace to the database or write to file
        if self.db_handler:
            try:
                new_example = {
                    'hsid': hsid,
                    'crawlid': self.crawlid,
                    't_scrape': get_timestamp("db")
                }
            except NameError:
                panic("If using the database, and calling collect_onion_trace "
                      "directly, you must specify the hsid of the site.")
            exampleid = self.db_handler.add_example(new_example)
            self.db_handler.add_trace(str(full_trace), exampleid)
        else:
            with open(trace_path + "-full", "wb") as fh:
                fh.write(full_trace)

        return "succeeded"

    def make_ts_dir(self, parent_dir=_log_dir, raw_dir_name="batch"):
        """Creates a timestamped folder to hold a group of traces."""
        raw_dirpath = join(parent_dir, raw_dir_name)
        ts = get_timestamp("log")
        ts_dir = timestamp_file(raw_dirpath, ts, is_dir=True)
        symlink_cur_to_latest(raw_dirpath, ts)

        with open(join(ts_dir, "control.pickle"), "wb") as fh:
            pickle.dump(self.control_data, fh)

        return ts_dir

    def get_cell_log_pos(self):
        """Returns the current position of the last byte in the Tor cell log."""
        return self.cell_log.seek(0, SEEK_END)

    def crawl_url(self, url):
        """Load a web page in Tor Browser and optionally pass a function
        to execute custom actions on it."""

        self.logger.info("{url}: starting page load...".format(**locals()))

        try:
            self.tb_driver.load_url(url,
                                    wait_on_page=self.wait_on_page,
                                    wait_for_page_body=True)
        except TimeoutException:
            self.logger.warning("{url}: timed out.".format(**locals()))
            raise CrawlerLoggedError
        except http.client.CannotSendRequest:
            self.logger.warning("{url}: cannot send request--improper "
                                "connection state.".format(**locals()))
            raise CrawlerLoggedError

        # Make sure we haven't just hit an error page or nothing loaded
        try:
            if (self.tb_driver.is_connection_error_page
                    or self.tb_driver.current_url == "about:newtab"):
                raise CrawlerReachedErrorPage
        except CrawlerReachedErrorPage:
            self.logger.warning("{url}: reached connection error "
                                "page.".format(**locals()))
            raise CrawlerLoggedError

        self.logger.info("{url}: successfully loaded.".format(**locals()))

    def get_rend_circ_ids(self, url):
        """Returns the rendezvous circuit id(s) associated with a given onion
        service."""
        self.logger.info("{url}: collecting circuit "
                         "information...".format(**locals()))
        active_circs = self.controller.get_circuits()
        rend_circ_ids = set()

        for circ in active_circs:
            if (circ.purpose == "HS_CLIENT_REND" and circ.socks_username
                    and circ.socks_username in url):
                rend_circ_ids.add(circ.id)

        # If everything goes perfectly, we should see only one. Multiple indicate
        # the first failed. Zero indicates one closed abruptly (or there's an
        # error with stem--still waiting on data to confirm or deny).
        rend_circ_ct = len(rend_circ_ids)
        self.logger.info(
            "{url}: {rend_circ_ct} associated rendezvous circuits "
            "discovered.".format(**locals()))
        if rend_circ_ct == 0:
            raise CrawlerNoRendCircError

        return rend_circ_ids

    def execute_extra_fn(self, extra_fn, url, trace_path, start_idx):
        self.logger.info("{url}: executing extra function "
                         "code...".format(**locals()))
        extra_fn(self, url, trace_path, start_idx)
        self.logger.info("{url}: extra function executed "
                         "successfully.".format(**locals()))

    def save_debug_log(self, url, trace_path, start_idx):
        self.logger.warning("{url}: saving debug log...".format(**locals()))
        exc_time = self.get_cell_log_pos()
        trace = self.get_full_trace(start_idx, exc_time)
        with open(trace_path + "@debug", "wb") as fh:
            fh.write(trace)

    def get_full_trace(self, start_idx, end_idx):
        """Returns the Tor DATA cells transmitted over a circuit during a
        specified time period."""
        # Sanity check
        assert start_idx >= 0 and end_idx > 0, ("Invalid (negative) logfile "
                                                "position")
        assert end_idx > start_idx, ("logfile section end_idx must come "
                                     "after start_idx")

        self.cell_log.seek(start_idx, SEEK_SET)
        return self.cell_log.read(end_idx - start_idx)

    def restart_tb(self):
        """Restarts the Tor Browser."""
        self.logger.info("Restarting the Tor Browser...")
        self.tb_driver.quit()
        self.tb_driver = TorBrowserDriver(tbb_path=tbb_path,
                                          tor_cfg=USE_RUNNING_TOR,
                                          tbb_logfile_path=tb_log_path,
                                          socks_port=self.socks_port,
                                          control_port=self.control_port)
        self.logger.info("Tor Browser restarted...")

    def collect_set_of_traces(self,
                              url_set,
                              extra_fn=None,
                              trace_dir=None,
                              iteration=0,
                              shuffle=True,
                              retry=True,
                              url_to_id_mapping=None):
        """Collect a set of traces."""
        if self.db_handler:
            if not url_to_id_mapping:
                url_to_id_mapping = url_set
            trace_dir = None
        elif not trace_dir:
            trace_dir = self.make_ts_dir()

        set_size = len(url_set)
        self.logger.info("Saving set of {set_size} traces to "
                         "{trace_dir}.".format(**locals()))

        # Converts both sets (from pickle files) and dicts (whose keys are
        # URLs--from database) to URL lists
        url_set = list(url_set)
        if shuffle:
            random.shuffle(url_set)

        failed_urls = []

        for url_idx in range(set_size):
            self.logger.info("Collecting trace {} of "
                             "{set_size}...".format(url_idx + 1, **locals()))
            url = url_set[url_idx]
            if self.db_handler:
                hsid = url_to_id_mapping[url]
            else:
                hsid = None

            if (self.collect_onion_trace(url,
                                         hsid=hsid,
                                         extra_fn=extra_fn,
                                         trace_dir=trace_dir,
                                         iteration=iteration) == "failed"
                    and retry):
                failed_urls.append(url)

        if failed_urls:
            failed_ct = len(failed_urls)
            self.logger.info("Retrying {failed_ct} of {set_size} traces that "
                             "failed.".format(**locals()))
            self.collect_set_of_traces(failed_urls,
                                       extra_fn=extra_fn,
                                       trace_dir=trace_dir,
                                       iteration=iteration,
                                       shuffle=shuffle,
                                       retry=False,
                                       url_to_id_mapping=url_to_id_mapping)

    def crawl_monitored_nonmonitored(self,
                                     monitored_class,
                                     nonmonitored_class,
                                     extra_fn=None,
                                     shuffle=True,
                                     retry=True,
                                     monitored_name="monitored",
                                     nonmonitored_name="nonmonitored",
                                     url_to_id_mapping=None,
                                     ratio=40):
        """Crawl a monitored class ratio times interspersed between the
        crawling of a(n ostensibly larger) non-monitored class."""
        if self.db_handler:
            if not url_to_id_mapping:
                url_to_id_mapping = nonmonitored_class
                url_to_id_mapping.update(monitored_class)
            trace_dir, mon_trace_dir, nonmon_trace_dir = (None, ) * 3
        else:
            trace_dir = self.make_ts_dir()
            mon_trace_dir = join(trace_dir, monitored_name)
            mkdir(mon_trace_dir)
            nonmon_trace_dir = join(trace_dir, nonmonitored_name)
            mkdir(nonmon_trace_dir)

        # db: calling list on a dict returns a list of its keys (URLs)
        # pickle: calling list on set is necessary to make it shuffleable
        nonmonitored_class = list(nonmonitored_class)
        monitored_class = list(monitored_class)

        nonmonitored_class_ct = len(nonmonitored_class)
        chunk_size = int(nonmonitored_class_ct / ratio)

        if shuffle:
            random.shuffle(nonmonitored_class)
            random.shuffle(monitored_class)

        for iteration in range(ratio):
            self.logger.info("Beginning iteration {i} of {ratio} in the "
                             "{monitored_name} class".format(i=iteration + 1,
                                                             **locals()))
            self.collect_set_of_traces(monitored_class,
                                       trace_dir=mon_trace_dir,
                                       iteration=iteration,
                                       url_to_id_mapping=url_to_id_mapping)

            slice_lb = iteration * chunk_size
            slice_ub = min((iteration + 1) * chunk_size, nonmonitored_class_ct)
            self.logger.info("Crawling services {} through {slice_ub} of "
                             "{nonmonitored_class_ct} in the "
                             "{nonmonitored_name} "
                             "class".format(slice_lb + 1, **locals()))
            self.collect_set_of_traces(nonmonitored_class[slice_lb:slice_ub],
                                       trace_dir=nonmon_trace_dir,
                                       iteration=iteration,
                                       url_to_id_mapping=url_to_id_mapping)
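
A minimal sketch of driving the Crawler above, assuming the module that defines it (together with helpers such as setup_logging and find_free_port) is importable as crawler, and reusing the pickled URL set produced in Example #1:

import pickle
from crawler import Crawler  # hypothetical module name for the class above

with open('good-onions.sav', 'rb) as f' if False else 'good-onions.sav', 'rb') as f:
    onions = pickle.load(f)

# __enter__/__exit__ make Crawler a context manager, so Tor, Xvfb and
# Tor Browser are all torn down when the block ends.
with Crawler(page_load_timeout=30) as crawler:
    crawler.collect_set_of_traces(onions)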
Example #3
class DescargarPdf:
    def __init__(self):
        self.tbb_dir = "/usr/local/share/tor-browser_en-US"
        self.usuario = []
        self.contraseñaTxT = []
        self.conversor = '?convertedTo=pdf'

    def iniciarTor(self):
        self.zLibraty = TorBrowserDriver(self.tbb_dir,
                                         tbb_logfile_path='test.log')

    def iniciarSecion(self):
        self.element = self.zLibraty.find_element_by_name("email")
        self.element.send_keys(self.correo)
        sleep(2)
        self.element2 = self.zLibraty.find_elements_by_class_name(
            "form-control")[1]
        self.element2.send_keys(self.contraseña)
        self.element2.send_keys(Keys.RETURN)

    def paginaDescargas(self):
        print("estoy en la funcion paginaDescagas")
        sleep(4)
        self.zLibraty.get(self.url)
        self.html = self.zLibraty.page_source

    def paginaPrinsipal(self, añoInicial, añoFinal):
        self.urlAños = 'http://zlibraryexau2g3p.onion/s/?yearFrom=' + str(
            añoInicial) + '&yearTo=' + str(añoFinal)
        self.url = self.urlAños

    def cambiarPagina(self, x):
        self.url += '&page=' + str(x)

    def Crearcsv(self):
        self.carpetaUrl = '/home/dgc7/Documentos/zlibrary/libros1920-1921/url'
        try:
            os.mkdir(self.carpetaUrl)
        except OSError as e:
            if e.errno != errno.EEXIST:
                raise
        self.escrivirUrlWed = csv.writer(
            open('/home/dgc7/Documentos/zlibrary/libros1920-1921/url/url2.csv',
                 'w'))
        self.imprimirUrlPdf = csv.writer(
            open(
                '/home/dgc7/Documentos/zlibrary/libros1920-1921/url/urlDowload2.csv',
                'w'))

    def credenciales(self, numeroUsuario):
        print("llegue")
        self.correo = self.usuario[numeroUsuario]
        self.contraseña = self.contraseñaTxT[numeroUsuario]
        self.urlLoguin = 'http://zlibraryexau2g3p.onion'
        self.zLibraty.get(self.urlLoguin)

    def UsuariosYcontraseñas(self):
        self.dir = '/home/dgc7/Documentos/zlibrary/credenciales/contraseñasYcorreos.txt'
        self.data = open(self.dir, 'r+')
        for self.i in range(0, 200):
            if self.i % 2 == 0:
                self.usuario.append(self.data.readline())
            if self.i % 2 != 0:
                self.contraseñaTxT.append(self.data.readline())

    def urlPdf(self, ):
        self.boleanoPdf = 0
        self.respaldoContador = 0
        self.contadorUsuarios = usuarioUsadosLeer()
        self.contadorLibros = datosDescarga(4)
        self.contadorLibros2 = self.contadorLibros % 10
        self.Crearcsv()
        self.soup = BeautifulSoup(self.html, 'html.parser')
        try:
            for self.urlwed in self.soup.find_all(itemprop="name"):
                self.contador = 0
                self.urlwed = self.urlwed.find('a', href=re.compile(''))
                self.urlDowload = self.urlwed.get('href')
                self.urlpdfGeleneralH = re.sub('/book/',
                                               'https://b-ok.cc/book/',
                                               self.urlDowload)
                self.urlDowload = re.sub(
                    '/book/', 'http://zlibraryexau2g3p.onion/book/',
                    self.urlDowload)
                self.escrivirUrlWed.writerow([self.urlDowload])
                print(self.urlDowload)
                voleano = validarFormato(self.urlpdfGeleneralH)
                guardarNumeroDescargas(self.contadorLibros)
                print(self.respaldoContador)
                if self.contadorLibros == self.respaldoContador:
                    for self.urlRedirec in range(0, 1):
                        self.zLibraty.get(self.urlDowload)
                        sleep(5)
                        self.htmlPdf = self.zLibraty.page_source
                        self.soupRedirec = BeautifulSoup(
                            self.htmlPdf, 'html.parser')
                        self.urlDowloadPDF = self.soupRedirec.find(
                            class_="btn btn-primary dlButton addDownloadedBook"
                        )
                        self.urlDowloadPDF = self.urlDowloadPDF.get('href')
                        self.urlDowloadPDF = re.sub(
                            '/dl/', 'http://zlibraryexau2g3p.onion/dl/',
                            self.urlDowloadPDF)
                        self.imprimirUrlPdf.writerow([self.urlDowloadPDF])
                        print(self.urlDowloadPDF)
                        print("vamos a por el if")
                        sleep(10)
                        if voleano == True:
                            self.zLibraty.set_page_load_timeout(8)
                            try:
                                self.zLibraty.get(self.urlDowloadPDF)
                            except:
                                self.zLibraty.set_page_load_timeout(70)
                                self.zLibraty.refresh()
                                print("funciona PDF ")

                            voleano = False
                            sleep(5)
                            self.contadorLibros += 1
                            self.contadorLibros2 += 1
                        else:
                            try:
                                self.zLibraty.set_page_load_timeout(5)
                                try:
                                    self.zLibraty.get(self.urlDowloadPDF)
                                except:
                                    sleep(4)
                                    pyautogui.press("down")
                                    sleep(2)
                                    pyautogui.press("enter")
                                self.zLibraty.set_page_load_timeout(70)
                            except:
                                print(
                                    "\nerror al controlasr el teclado y dar enter\n"
                                )
                                raise
                            sleep(5)
                            self.zLibraty.refresh()
                            self.contadorLibros += 1
                            self.contadorLibros2 += 1
                        sleep(20)
                        tiempoDescarga()
                        informaiconPdf(self.urlpdfGeleneralH)
                self.respaldoContador += 1
                if self.contadorLibros == self.respaldoContador:
                    if self.contadorLibros2 % 10 == 0:
                        print((self.contadorLibros2 - 1) % 10)
                        self.contador += 1
                        pyautogui.hotkey("ctrl", "shift", "u")
                        sleep(2)
                        pyautogui.press("enter")
                        sleep(7)
                        pyautogui.press("enter")
                        sleep(15)
                        self.contadorUsuarios += 1
                        print(self.contadorUsuarios)
                        try:
                            self.zLibraty.switch_to_window(
                                self.zLibraty.window_handles[0])
                        except:
                            print("error al cambian de  ventana")
                        usuarioUsadosReescrivir(self.contadorUsuarios)
                        print("por aqui¿¿¿¿¿¿")
                        self.credenciales(self.contadorUsuarios)
                        print("no por aqui¿¿¿¿¿¿")
                        sleep(23)
                        self.iniciarSecion()
                        sleep(7)
                        self.contadorLibros2 = 0
                        sleep(15)
                        print("numero de li bros por usuario ",
                              self.contadorLibros2)
                        if self.contador == 5:
                            self.contador = 0
        except OSError as e:
            print(e.strerror)
            print("error en la urlPdf:::::")
            guardarNumeroDescargas(self.contadorLibros)
            usuarioUsadosReescrivir(self.contadorUsuarios)
            print(self.contadorLibros)
            raise
        print("termine la pagina")

    def DescargarContenido(self, _html):
        self.contenido = _html

    def serrarTor(self):
        self.zLibraty.close()
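
A rough call sequence for the class above (a sketch; helpers such as datosDescarga and usuarioUsadosLeer are defined elsewhere in the original project and are assumed to be in scope):

descargador = DescargarPdf()
descargador.UsuariosYcontraseñas()       # read the stored e-mail/password pairs
descargador.iniciarTor()                 # start Tor Browser via tbselenium
descargador.credenciales(0)              # pick the first account and open the login page
descargador.iniciarSecion()              # submit the login form
descargador.paginaPrinsipal(1920, 1921)  # build the search URL for a year range
descargador.paginaDescargas()            # load it and keep the page source
descargador.urlPdf()                     # walk the results and download each PDF
descargador.serrarTor()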
Example #4
class Crawler:
    """Crawls your onions, but also manages Tor, drives Tor Browser, and uses
    information from your Tor cell log and stem to collect cell sequences."""
    def __init__(self,
                 take_ownership=True, # Tor dies when the Crawler does
                 torrc_config={"CookieAuth": "1"},
                 tor_log="/var/log/tor/tor.log",
                 tor_cell_log="/var/log/tor/tor_cell_seq.log",
                 control_port=9051,
                 socks_port=9050,
                 run_in_xvfb=True,
                 tbb_path=join("/opt","tbb","tor-browser_en-US"),
                 tb_log_path=join(_log_dir,"firefox.log"),
                 tb_tor_cfg=USE_RUNNING_TOR,
                 page_load_timeout=20,
                 wait_on_page=5,
                 wait_after_closing_circuits=0,
                 restart_on_sketchy_exception=True,
                 additional_control_fields={},
                 db_handler=None):

        self.logger = setup_logging(_log_dir, "crawler")
        # Set stem logging level to INFO - "high level library activity"
        stem.util.log.get_logger().setLevel(stem.util.log.Runlevel.INFO)

        self.torrc_config = torrc_config
        self.socks_port = find_free_port(socks_port, control_port)
        self.torrc_config.update({"SocksPort": str(self.socks_port)})
        self.control_port = find_free_port(control_port, self.socks_port)
        self.torrc_config.update({"ControlPort": str(self.control_port)})
        self.torrc_config.update({"Log": "INFO file {}".format(tor_log)})
        self.logger.info("Starting tor process with config "
                         "{torrc_config}.".format(**locals()))
        self.tor_process = launch_tor_with_config(config=self.torrc_config,
                                                  take_ownership=take_ownership)
        self.authenticate_to_tor_controlport()

        self.logger.info("Opening cell log stream...")
        self.cell_log = open(tor_cell_log, "rb")

        if run_in_xvfb:
            self.logger.info("Starting Xvfb...")
            self.run_in_xvfb = True
            self.virtual_framebuffer = start_xvfb()

        self.logger.info("Starting Tor Browser...")
        self.tb_driver = TorBrowserDriver(tbb_path=tbb_path,
                                          tor_cfg=tb_tor_cfg,
                                          tbb_logfile_path=tb_log_path,
                                          socks_port=self.socks_port,
                                          control_port=self.control_port)

        self.wait_after_closing_circuits = wait_after_closing_circuits
        self.page_load_timeout = page_load_timeout
        self.tb_driver.set_page_load_timeout(page_load_timeout)
        self.wait_on_page = wait_on_page
        self.restart_on_sketchy_exception = restart_on_sketchy_exception

        self.control_data = self.get_control_data(page_load_timeout,
                                                  wait_on_page,
                                                  wait_after_closing_circuits,
                                                  additional_control_fields)
        self.db_handler = db_handler
        if db_handler:
            self.crawlid = self.db_handler.add_crawl(self.control_data)


    def authenticate_to_tor_controlport(self):
        self.logger.info("Authenticating to the tor controlport...")
        try:
            self.controller = Controller.from_port(port=self.control_port)
        except stem.SocketError as exc:
            panic("Unable to connect to tor on port {self.control_port}: "
                  "{exc}".format(**locals()))
        try:
            self.controller.authenticate()
        except stem.connection.MissingPassword:
            panic("Unable to authenticate to tor controlport. Please add "
                  "`CookieAuth 1` to your tor configuration file.")


    def get_control_data(self, page_load_timeout, wait_on_page,
                         wait_after_closing_circuits,
                         additional_control_fields):
        """Gather metadata about the crawler instance."""
        control_data = {}
        # Configuration settings
        control_data["page_load_timeout"] = page_load_timeout
        control_data["wait_on_page"] = wait_on_page
        control_data["wait_after_closing_circuits"] = \
                wait_after_closing_circuits
        if additional_control_fields:
            control_data.update(additional_control_fields)
        # System facts
        control_data["kernel"] = platform.system()
        control_data["kernel_version"] = platform.release()
        control_data["os"] = platform.version()
        control_data["python_version"] = platform.python_version()
        ip = urlopen("https://api.ipify.org").read().decode()
        control_data["ip"] = ip
        # This API seems to be unstable and we haven't found a suitable
        # alternative :(
        try:
            asn_geoip = urlopen("http://api.moocher.io/ip/{}".format(ip))
            asn_geoip = literal_eval(asn_geoip.read().decode())
            control_data["asn"] = asn_geoip.get("ip").get("as").get("asn")
            control_data["city"] = asn_geoip.get("ip").get("city")
            control_data["country"] = asn_geoip.get("ip").get("country")
        except urllib.error.HTTPError:
            self.logger.warning("Unable to query ASN API and thus some "
                                "control data may be missing from this run.")
        control_data["tor_version"] = self.controller.get_version().version_str
        control_data["tb_version"] = self.tb_driver.tb_version
        # Tor will have multiple entry nodes in its state file, but will
        # choose the first sequential one that is up as its entry guard.
        entry_nodes = self.controller.get_info("entry-guards").split('\n')
        control_data["entry_node"] = next(re.search("[0-9A-F]{40}", g).group(0)
                                          for g in entry_nodes
                                          if re.search("up", g))
        control_data["crawler_version"] = _version
        return control_data


    def __enter__(self):
        return self


    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return self


    def __del__(self):
        self.close()


    def close(self):
        self.logger.info("Beginning Crawler exit process...")
        if "tb_driver" in dir(self):
            self.logger.info("Closing Tor Browser...")
            self.tb_driver.quit()
        if "virtual_framebuffer" in dir(self):
            self.logger.info("Closing the virtual framebuffer...")
            # A bug in pyvirtualdisplay triggers a KeyError exception when closing a
            # virtual framebuffer if the $DISPLAY environment variable is not set.
            try:
                stop_xvfb(self.virtual_framebuffer)
            except KeyError:
                pass
        if "cell_log" in dir(self):
            self.logger.info("Closing the Tor cell stream...")
            self.cell_log.close()
        if "tor_process" in dir(self):
            self.logger.info("Killing the tor process...")
            self.tor_process.kill()
        self.logger.info("Crawler exit completed.")


    def collect_onion_trace(self, url, hsid=None, extra_fn=None, trace_dir=None,
                            iteration=0):
        """Crawl an onion service and collect a complete cell sequence for the
        activity at the time. Also, record additional information about the
        circuits with stem. Optionally, pass a function to execute additional
        actions after the page has loaded."""
        # Todo: create collect_trace method that works for regular sites as
        # well
        assert ".onion" in url, ("This method is only suitable for crawling "
                                 "onion services.")

        self.logger.info("{url}: closing existing circuits before starting "
                         "crawl.".format(**locals()))
        for circuit in self.controller.get_circuits():
            self.controller.close_circuit(circuit.id)

        sleep(self.wait_after_closing_circuits)

        if not trace_dir:
            trace_dir = self.make_ts_dir()
        trace_name = urllib.parse.quote(url, safe="") + "-" + str(iteration)
        trace_path = join(trace_dir, trace_name)

        start_idx = self.get_cell_log_pos()

        try:
            self.crawl_url(url)
            rend_circ_ids = self.get_rend_circ_ids(url)
            if extra_fn:
                self.execute_extra_fn(extra_fn, url, trace_path, start_idx)
        except CrawlerLoggedError:
            return "failed"
        except CrawlerNoRendCircError:
            self.save_debug_log(url, trace_path, start_idx)
            return "failed"
        except:
            self.logger.exception("{url}: unusual exception "
                                  "encountered:".format(**locals()))
            # Also log active circuit info
            self.controller.get_circuits()

            exc_type, exc_value, exc_traceback = exc_info()
            if exc_type in _sketchy_exceptions:
                self.save_debug_log(url, trace_path, start_idx)
                if self.restart_on_sketchy_exception:
                    self.restart_tb()

            return "failed"

        self.logger.info("{url}: saving full trace...".format(**locals()))
        end_idx = self.get_cell_log_pos()
        full_trace = self.get_full_trace(start_idx, end_idx)

        # Save the trace to the database or write to file
        if self.db_handler:
            try:
                new_example = {'hsid': hsid,
                               'crawlid': self.crawlid,
                               't_scrape': get_timestamp("db")}
            except NameError:
                panic("If using the database, and calling collect_onion_trace "
                      "directly, you must specify the hsid of the site.")
            exampleid = self.db_handler.add_example(new_example)
            self.db_handler.add_trace(str(full_trace), exampleid)
        else:
            with open(trace_path+"-full", "wb") as fh:
                fh.write(full_trace)

        return "succeeded"


    def make_ts_dir(self, parent_dir=_log_dir, raw_dir_name="batch"):
        """Creates a timestamped folder to hold a group of traces."""
        raw_dirpath = join(parent_dir, raw_dir_name)
        ts = get_timestamp("log")
        ts_dir = timestamp_file(raw_dirpath, ts, is_dir=True)
        symlink_cur_to_latest(raw_dirpath, ts)

        with open(join(ts_dir, "control.pickle"), "wb") as fh:
            pickle.dump(self.control_data, fh)

        return ts_dir


    def get_cell_log_pos(self):
        """Returns the current position of the last byte in the Tor cell log."""
        return self.cell_log.seek(0, SEEK_END)


    def crawl_url(self, url):
        """Load a web page in Tor Browser and optionally pass a function
        to execute custom actions on it."""

        self.logger.info("{url}: starting page load...".format(**locals()))

        try:
            self.tb_driver.load_url(url, wait_on_page=self.wait_on_page,
                                    wait_for_page_body=True)
        except TimeoutException:
            self.logger.warning("{url}: timed out.".format(**locals()))
            raise CrawlerLoggedError
        except http.client.CannotSendRequest:
            self.logger.warning("{url}: cannot send request--improper "
                                "connection state.".format(**locals()))
            raise CrawlerLoggedError

        # Make sure we haven't just hit an error page or nothing loaded
        try:
            if (self.tb_driver.is_connection_error_page
                or self.tb_driver.current_url == "about:newtab"):
                raise CrawlerReachedErrorPage
        except CrawlerReachedErrorPage:
            self.logger.warning("{url}: reached connection error "
                                "page.".format(**locals()))
            raise CrawlerLoggedError

        self.logger.info("{url}: successfully loaded.".format(**locals()))


    def get_rend_circ_ids(self, url):
        """Returns the rendezvous circuit id(s) associated with a given onion
        service."""
        self.logger.info("{url}: collecting circuit "
                         "information...".format(**locals()))
        active_circs = self.controller.get_circuits()
        rend_circ_ids = set()

        for circ in active_circs:
            if (circ.purpose == "HS_CLIENT_REND" and
                circ.socks_username and
                circ.socks_username in url):
                rend_circ_ids.add(circ.id)

        # If everything goes perfectly, we should see only one. Multiple indicate
        # the first failed. Zero indicates one closed abruptly (or there's an
        # error with stem--still waiting on data to confirm or deny).
        rend_circ_ct = len(rend_circ_ids)
        self.logger.info("{url}: {rend_circ_ct} associated rendezvous circuits "
                         "discovered.".format(**locals()))
        if rend_circ_ct == 0:
            raise CrawlerNoRendCircError

        return rend_circ_ids


    def execute_extra_fn(self, extra_fn, url, trace_path, start_idx):
        self.logger.info("{url}: executing extra function "
                         "code...".format(**locals()))
        extra_fn(self, url, trace_path, start_idx)
        self.logger.info("{url}: extra function executed "
                         "successfully.".format(**locals()))


    def save_debug_log(self, url, trace_path, start_idx):
        self.logger.warning("{url}: saving debug log...".format(**locals()))
        exc_time = self.get_cell_log_pos()
        trace = self.get_full_trace(start_idx, exc_time)
        with open(trace_path + "@debug", "wb") as fh:
            fh.write(trace)



    def get_full_trace(self, start_idx, end_idx):
        """Returns the Tor DATA cells transmitted over a circuit during a
        specified time period."""
        # Sanity check
        assert start_idx >= 0 and end_idx > 0, ("Invalid (negative) logfile "
                                                "position")
        assert end_idx > start_idx, ("logfile section end_idx must come "
                                     "after start_idx")

        self.cell_log.seek(start_idx, SEEK_SET)
        return self.cell_log.read(end_idx - start_idx)


    def restart_tb(self):
        """Restarts the Tor Browser."""
        self.logger.info("Restarting the Tor Browser...")
        self.tb_driver.quit()
        self.tb_driver = TorBrowserDriver(tbb_path=tbb_path,
                                          tor_cfg=USE_RUNNING_TOR,
                                          tbb_logfile_path=tb_log_path,
                                          socks_port=self.socks_port,
                                          control_port=self.control_port)
        self.logger.info("Tor Browser restarted...")


    def collect_set_of_traces(self, url_set, extra_fn=None, trace_dir=None,
                              iteration=0, shuffle=True, retry=True,
                              url_to_id_mapping=None):
        """Collect a set of traces."""
        if self.db_handler:
            if not url_to_id_mapping:
                url_to_id_mapping = url_set
            trace_dir = None
        elif not trace_dir:
            trace_dir = self.make_ts_dir()

        set_size = len(url_set)
        self.logger.info("Saving set of {set_size} traces to "
                         "{trace_dir}.".format(**locals()))

        # Converts both sets (from pickle files) and dicts (whose keys are
        # URLs--from database) to URL lists
        url_set = list(url_set)
        if shuffle:
            random.shuffle(url_set)

        failed_urls = []

        for url_idx in range(set_size):
            self.logger.info("Collecting trace {} of "
                             "{set_size}...".format(url_idx+1, **locals()))
            url = url_set[url_idx]
            if self.db_handler:
                hsid = url_to_id_mapping[url]
            else:
                hsid = None

            if (self.collect_onion_trace(url, hsid=hsid, extra_fn=extra_fn,
                                         trace_dir=trace_dir,
                                         iteration=iteration) == "failed"
                and retry):
                failed_urls.append(url)

        if failed_urls:
            failed_ct = len(failed_urls)
            self.logger.info("Retrying {failed_ct} of {set_size} traces that "
                             "failed.".format(**locals()))
            self.collect_set_of_traces(failed_urls, extra_fn=extra_fn,
                                       trace_dir=trace_dir,
                                       iteration=iteration, shuffle=shuffle,
                                       retry=False,
                                       url_to_id_mapping=url_to_id_mapping)


    def crawl_monitored_nonmonitored(self, monitored_class, nonmonitored_class,
                                     extra_fn=None, shuffle=True, retry=True,
                                     monitored_name="monitored",
                                     nonmonitored_name="nonmonitored",
                                     url_to_id_mapping=None, ratio=40):
        """Crawl a monitored class ratio times interspersed between the
        crawling of a(n ostensibly larger) non-monitored class."""
        if self.db_handler:
            if not url_to_id_mapping:
                url_to_id_mapping = nonmonitored_class
                url_to_id_mapping.update(monitored_class)
            trace_dir, mon_trace_dir, nonmon_trace_dir = (None,) * 3
        else:
            trace_dir = self.make_ts_dir()
            mon_trace_dir = join(trace_dir, monitored_name)
            mkdir(mon_trace_dir)
            nonmon_trace_dir = join(trace_dir, nonmonitored_name)
            mkdir(nonmon_trace_dir)

        # db: calling list on a dict returns a list of its keys (URLs)
        # pickle: calling list on set is necessary to make it shuffleable
        nonmonitored_class = list(nonmonitored_class)
        monitored_class = list(monitored_class)

        nonmonitored_class_ct = len(nonmonitored_class)
        chunk_size = int(nonmonitored_class_ct / ratio)

        if shuffle:
            random.shuffle(nonmonitored_class)
            random.shuffle(monitored_class)

        for iteration in range(ratio):
            self.logger.info("Beginning iteration {i} of {ratio} in the "
                             "{monitored_name} class".format(i=iteration+1,
                                                             **locals()))
            self.collect_set_of_traces(monitored_class,
                                       trace_dir=mon_trace_dir,
                                       iteration=iteration,
                                       url_to_id_mapping=url_to_id_mapping)

            slice_lb = iteration * chunk_size
            slice_ub = min((iteration + 1) * chunk_size, nonmonitored_class_ct)
            self.logger.info("Crawling services {} through {slice_ub} of "
                             "{nonmonitored_class_ct} in the "
                             "{nonmonitored_name} "
                             "class".format(slice_lb + 1, **locals()))
            self.collect_set_of_traces(nonmonitored_class[slice_lb:slice_ub],
                                       trace_dir=nonmon_trace_dir,
                                       iteration=iteration,
                                       url_to_id_mapping=url_to_id_mapping)
Example #5
class Visit(object):
    """Hold info about a particular visit to a page."""

    def __init__(self, batch_num, site_num, instance_num, page_url, base_dir, tor_controller, bg_site=None,
                 experiment=cm.EXP_TYPE_WANG_AND_GOLDBERG, xvfb=False, capture_screen=True):
        self.batch_num = batch_num
        self.site_num = site_num
        self.instance_num = instance_num
        self.page_url = page_url
        self.bg_site = bg_site
        self.experiment = experiment
        self.base_dir = base_dir
        self.visit_dir = None
        self.visit_log_dir = None
        self.tbb_version = cm.RECOMMENDED_TBB_VERSION
        self.capture_screen = capture_screen
        self.tor_controller = tor_controller
        self.xvfb = xvfb
        self.init_visit_dir()
        self.pcap_path = os.path.join(
            self.visit_dir, "{}.pcap".format(self.get_instance_name()))

        if self.xvfb and not cm.running_in_CI:
            wl_log.info("Starting XVFBm %sX%s" % (cm.XVFB_W, cm.XVFB_H))
            self.vdisplay = Xvfb(width=cm.XVFB_W, height=cm.XVFB_H)
            self.vdisplay.start()

        # Create new instance of TorBrowser driver
        TorBrowserDriver.add_exception(self.page_url)
        self.tb_driver = TorBrowserDriver(tbb_path=cm.TBB_PATH,
                                          tbb_logfile_path=join(self.visit_dir, "logs", "firefox.log"))
        self.sniffer = Sniffer()  # sniffer to capture the network traffic

    def init_visit_dir(self):
        """Create results and logs directories for this visit."""
        visit_name = str(self.instance_num)
        self.visit_dir = os.path.join(self.base_dir, visit_name)
        ut.create_dir(self.visit_dir)
        self.visit_log_dir = os.path.join(self.visit_dir, 'logs')
        ut.create_dir(self.visit_log_dir)

    def get_instance_name(self):
        """Construct and return a filename for the instance."""
        inst_file_name = '{}_{}_{}' \
            .format(self.batch_num, self.site_num, self.instance_num)
        return inst_file_name

    def filter_guards_from_pcap(self):
        guard_ips = set(self.tor_controller.get_all_guard_ips())
        wl_log.debug("Found %s guards in the concensus.", len(guard_ips))
        orig_pcap = self.pcap_path + ".original"
        copyfile(self.pcap_path, orig_pcap)
        try:
            preader = PcapReader(orig_pcap)
            pcap_filtered = []
            for p in preader:
                if IP not in p:
                    pcap_filtered.append(p)
                    continue
                ip = p.payload
                if ip.dst in guard_ips or ip.src in guard_ips:
                    pcap_filtered.append(p)
            wrpcap(self.pcap_path, pcap_filtered)
        except Exception as e:
            wl_log.error("ERROR: filtering pcap file: %s. Check old pcap: %s",
                         e, orig_pcap)
        else:
            os.remove(orig_pcap)

    def post_crawl(self):
        pass
        # TODO: add some sanity checks?

    def cleanup_visit(self):
        """Kill sniffer and Tor browser if they're running."""
        wl_log.info("Cleaning up visit.")
        wl_log.info("Cancelling timeout")
        ut.cancel_timeout()

        if self.sniffer and self.sniffer.is_recording:
            wl_log.info("Stopping sniffer...")
            self.sniffer.stop_capture()

        # remove non-tor traffic
        self.filter_guards_from_pcap()

        if self.tb_driver and self.tb_driver.is_running:
            # shutil.rmtree(self.tb_driver.prof_dir_path)
            wl_log.info("Quitting selenium driver...")
            self.tb_driver.quit()

        # close all open streams to prevent pollution
        self.tor_controller.close_all_streams()
        if self.xvfb and not cm.running_in_CI:
            wl_log.info("Stopping display...")
            self.vdisplay.stop()

        # after closing the driver and stopping the sniffer, run post-crawl checks
        self.post_crawl()

    def take_screenshot(self):
        try:
            out_png = os.path.join(self.visit_dir, 'screenshot.png')
            wl_log.info("Taking screenshot of %s to %s" % (self.page_url,
                                                           out_png))
            self.tb_driver.get_screenshot_as_file(out_png)
            if cm.running_in_CI:
                wl_log.debug("Screenshot data:image/png;base64,%s"
                             % self.tb_driver.get_screenshot_as_base64())
        except:
            wl_log.info("Exception while taking screenshot of: %s"
                        % self.page_url)

    def get_wang_and_goldberg(self):
        """Visit the site according to Wang and Goldberg (WPES'13) settings."""
        ut.timeout(cm.HARD_VISIT_TIMEOUT)  # set timeout to stop the visit
        self.sniffer.start_capture(self.pcap_path,
                                   'tcp and not host %s and not tcp port 22 and not tcp port 20'
                                   % LOCALHOST_IP)
        time.sleep(cm.PAUSE_BETWEEN_INSTANCES)
        try:
            self.tb_driver.set_page_load_timeout(cm.SOFT_VISIT_TIMEOUT)
        except:
            wl_log.info("Exception setting a timeout {}".format(self.page_url))

        wl_log.info("Crawling URL: {}".format(self.page_url))

        t1 = time.time()
        self.tb_driver.get(self.page_url)
        page_load_time = time.time() - t1
        wl_log.info("{} loaded in {} sec"
                    .format(self.page_url, page_load_time))
        time.sleep(cm.WAIT_IN_SITE)
        if self.capture_screen:
            self.take_screenshot()
        self.cleanup_visit()

    def get_multitab(self):
        """Open two tab, use one to load a background site and the other to
        load the real site."""
        PAUSE_BETWEEN_TAB_OPENINGS = 0.5
        ut.timeout(cm.HARD_VISIT_TIMEOUT)  # set timeout to kill running procs
        # load a blank page - a page is needed to send keys to the browser
        self.tb_driver.get(BAREBONE_HOME_PAGE)
        self.sniffer.start_capture(self.pcap_path,
                                   'tcp and not host %s and not tcp port 22 and not tcp port 20'
                                   % LOCALHOST_IP)

        time.sleep(cm.PAUSE_BETWEEN_INSTANCES)
        try:
            self.tb_driver.set_page_load_timeout(cm.SOFT_VISIT_TIMEOUT)
        except:
            wl_log.info("Exception setting a timeout {}".format(self.page_url))

        wl_log.info("Crawling URL: {} with {} in the background".
                    format(self.page_url, self.bg_site))

        body = self.tb_driver.find_element_by_tag_name("body")
        body.send_keys(Keys.CONTROL + 't')  # open a new tab
        # now that the focus is on the address bar, load the background
        # site by "typing" it to the address bar and "pressing" ENTER (\n)
        # simulated by send_keys function
        body.send_keys('%s\n' % self.bg_site)

        # the delay between the loading of background and real sites
        time.sleep(PAUSE_BETWEEN_TAB_OPENINGS)

        body = self.tb_driver.find_element_by_tag_name("body")
        body.send_keys(Keys.CONTROL + 't')  # open a new tab

        t1 = time.time()
        self.tb_driver.get(self.page_url)  # load the real site in the 2nd tab

        page_load_time = time.time() - t1
        wl_log.info("{} loaded in {} sec"
                    .format(self.page_url, page_load_time))
        time.sleep(cm.WAIT_IN_SITE)
        if self.capture_screen:
            self.take_screenshot()
        self.cleanup_visit()

    def get(self):
        """Call the specific visit function depending on the experiment."""
        if self.experiment == cm.EXP_TYPE_WANG_AND_GOLDBERG:
            self.get_wang_and_goldberg()
        elif self.experiment == cm.EXP_TYPE_MULTITAB_ALEXA:
            self.get_multitab()
        else:
            raise ValueError("Cannot determine experiment type")
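
A sketch of launching a single visit (the tor controller wrapper and the cm constants come from the surrounding crawler project; TorController and the paths used here are placeholders):

controller = TorController(cm.TBB_PATH)  # hypothetical controller wrapper
visit = Visit(batch_num=0, site_num=3, instance_num=1,
              page_url="https://example.com", base_dir="results/0_3",
              tor_controller=controller,
              experiment=cm.EXP_TYPE_WANG_AND_GOLDBERG,
              xvfb=True, capture_screen=True)
visit.get()  # dispatches to get_wang_and_goldberg() for this experiment type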
Example #6
from datetime import datetime

from tbselenium.tbdriver import TorBrowserDriver
from xvfbwrapper import Xvfb

tor_dir = '../tor-browser-patched/Primary/'

# Run the browser inside a virtual framebuffer so the crawl works headlessly
vdisplay = Xvfb()
vdisplay.start()

# open list of urls for testing
with open('alexa-top-1000.txt', 'r') as url_file:
    test_urls = url_file.readlines()

driver = TorBrowserDriver(tor_dir)  #, pref_dict=rfp)
driver.set_page_load_timeout(15)

# do 10 runs
uses = 0
notUses = 0
inconclusive = 0
for i, url in enumerate(test_urls):
    try:
        # request url from list
        #print("Fetching " + str(url),end='')
        url = 'https://' + url.strip()  # strip the trailing newline left by readlines()
        driver.get(url)
        # pull the browser's performance entries after loading the page
        perf_timings = driver.execute_script(
            "return window.performance.getEntries()")
        #print(perf_timings)
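        # Hypothetical continuation (not part of the original, truncated
        # snippet): derive a coarse load time from the Navigation Timing API
        # for the URL just visited and log it with a timestamp.
        nav = driver.execute_script("return window.performance.timing")
        load_ms = nav["loadEventEnd"] - nav["navigationStart"]
        print("{} (run {}): loaded in {} ms at {}".format(
            url, i, load_ms, datetime.now().isoformat()))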
Example #7
# NOTE: the original snippet omits its imports; the ones below are the obvious
# candidates. It also calls project-specific helpers (datosDescarga,
# validarFormato, guardarNumeroDescargas, informaiconPdf, usuarioUsadosLeer,
# usuarioUsadosReescrivir, contarNueroArchivos) that are defined elsewhere and
# are not shown here.
import csv
import errno
import os
import re
from time import sleep

import pyautogui
from bs4 import BeautifulSoup
from selenium.webdriver.common.keys import Keys
from tbselenium.tbdriver import TorBrowserDriver


class DescargarPdf:
    def __init__(self):
        self.contadorCredenciales=0
        self.tbb_dir = "/usr/local/share/tor-browser_en-US"
        self.usuario=[]
        self.contraseñaTxT=[]
        self.conversor='?convertedTo=pdf'
    def iniciarTor(self):
        self.zLibraty = TorBrowserDriver(self.tbb_dir, tbb_logfile_path='test.log')
    def iniciarSecion(self):
        self.element=self.zLibraty.find_element_by_name("email")
        self.element.send_keys(self.correo)
        sleep(2)
        self.element2=self.zLibraty.find_elements_by_class_name("form-control")[1]
        self.element2.send_keys(self.contraseña)
        self.element2.send_keys(Keys.RETURN)
    def paginaDescargas(self):
        print("estoy en la funcion paginaDescagas")
        self.zLibraty.load_url(self.url)
        sleep(4)
        self.html=self.zLibraty.page_source
    def paginaPrinsipal(self,añoInicial,añoFinal):
        self.urlAños='http://zlibraryexau2g3p.onion/s/?yearFrom='+str(añoInicial)+'&yearTo='+str(añoFinal)
        self.url=self.urlAños  
    def cambiarPagina(self,x):
        print("estoy en cambiar pagina prinsipal")
        self.url+='&page='+str(x)
        print(self.url)
    def Crearcsv(self):
        desde=datosDescarga(1)
        asta=datosDescarga(2)
        self.carpetaUrl='/home/dd/Documentos/zlibrary/libros'+str(desde)+'-'+str(asta)+'/url'
        try:
            os.mkdir(self.carpetaUrl)
        except OSError as e:
            if e.errno != errno.EEXIST:
                raise
        self.escrivirUrlWed=csv.writer(open('/home/dd/Documentos/zlibrary/libros'+str(desde)+'-'+str(asta)+'/url/url2.csv','w'))
        self.imprimirUrlPdf=csv.writer(open('/home/dd/Documentos/zlibrary/libros'+str(desde)+'-'+str(asta)+'/url/urlDowload2.csv','w'))
    def credenciales(self,numeroUsuario):
        print("llegue")
        if self.contadorCredenciales==0 or self.contadorCredenciales==20:
            self.zLibraty.load_url("https://singlelogin.org/")
            self.zLibraty.find_element_by_name("redirectToHost").click()
            sleep(3)
            pyautogui.press("down")
            sleep(2)
            pyautogui.press("down")
            sleep(1)
            pyautogui.press("enter")
        sleep(5)
        self.correo=self.usuario[numeroUsuario]
        self.contraseña=self.contraseñaTxT[numeroUsuario]
    def UsuariosYcontraseñas(self):
        self.dir='/home/dd/Documentos/zlibrary/credenciales/contraseñasYcorreos.txt'
        self.data=open(self.dir,'r+')
        for self.i in range(0,200):
            if self.i%2==0 :
                self.usuario.append(self.data.readline())
            if self.i%2!=0:
                self.contraseñaTxT.append(self.data.readline())
    def urlPdf(self):
        self.contadorCredenciales=1
        self.boleanoPdf=0
        self.respaldoContador=0
        self.contadorUsuarios=usuarioUsadosLeer()
        self.contadorLibros=datosDescarga(4)
        self.contadorLibros2=self.contadorLibros%10
        self.Crearcsv()
        self.soup=BeautifulSoup(self.html,'html.parser')
        try:
            for self.urlwed in self.soup.find_all(itemprop = "name") :
                self.contador=0
                self.urlwed=self.urlwed.find('a',href=re.compile(''))
                self.urlDowload=self.urlwed.get('href')
                self.urlpdfGeleneralH=re.sub('/book/','https://b-ok.cc/book/',self.urlDowload)
                self.urlDowload=re.sub('/book/','http://zlibraryexau2g3p.onion/book/',self.urlDowload)
                self.escrivirUrlWed.writerow([self.urlDowload])
                print(self.urlDowload)
                self.voleano=validarFormato(self.urlpdfGeleneralH)
                guardarNumeroDescargas(self.contadorLibros) 
                print(self.respaldoContador) 
                if self.contadorLibros==self.respaldoContador:
                    for self.urlRedirec in range(0,1):
                        self.zLibraty.load_url(self.urlDowload)
                        sleep(5)
                        self.htmlPdf=self.zLibraty.page_source
                        self.soupRedirec=BeautifulSoup(self.htmlPdf,'html.parser')
                        self.urlDowloadPDF=self.soupRedirec.find(class_="btn btn-primary dlButton addDownloadedBook")
                        self.urlDowloadPDF=self.urlDowloadPDF.get('href')
                        self.urlDowloadPDF=re.sub('/dl/','http://zlibraryexau2g3p.onion/dl/',self.urlDowloadPDF)
                        self.imprimirUrlPdf.writerow([self.urlDowloadPDF])
                        print(self.urlDowloadPDF)
                        print("vamos a por el if")
                        sleep(15)
                        if self.voleano==True:
                            self.zLibraty.set_page_load_timeout(12)
                            try:
                                self.zLibraty.load_url(self.urlDowloadPDF)
                            except:
                                sleep(5)
                                self.zLibraty.set_page_load_timeout(7000)
                                print("funciona PDF ")                                
                            self.voleano=False
                            sleep(5)
                            self.contadorLibros+=1
                            self.contadorLibros2+=1
                        else:                          
                            self.zLibraty.set_page_load_timeout(12)
                            try:
                                self.zLibraty.load_url(self.urlDowloadPDF)
                            except:
                                sleep(8)
                                pyautogui.press("down")
                                sleep(2)
                                pyautogui.press("enter")
                            self.zLibraty.set_page_load_timeout(7000)
                            sleep(5)
                            self.contadorLibros+=1
                            self.contadorLibros2+=1
                        self.zLibraty.load_url("about:downloads")
                        self.datosEsperaDescarga()
                        self.peticiones()
                        self.zLibraty.back()
                        informaiconPdf(self.urlpdfGeleneralH)
                        guardarNumeroDescargas(self.contadorLibros)
                self.respaldoContador+=1                   
                if self.contadorLibros==self.respaldoContador:
                    if self.contadorLibros2%10==0:
                        print((self.contadorLibros2-1)%10)
                        self.contador+=1
                        if self.contadorLibros==20:
                            self.contadorCredenciales=20
                            print("saliendo de secion¡¡¡¡¡¡")
                            pyautogui.moveTo(1707,245)
                            pyautogui.hotkey("ctrl","shift","u")
                            sleep(2)
                            pyautogui.press("enter")
                            sleep(7)
                            pyautogui.press("enter")
                            sleep(15)
                        else:
                            print("saliendo de secion")
                            self.zLibraty.get("http://zlibraryexau2g3p.onion/logout.php")          
                        self.contadorUsuarios+=1
                        print(self.contadorUsuarios)
                        try:
                            self.zLibraty.switch_to_window(self.zLibraty.window_handles[0])
                        except:
                            print("error al cambian de  ventana")
                       
                        usuarioUsadosReescrivir(self.contadorUsuarios)
                        print("por aqui¿¿¿¿¿¿")
                        self.credenciales(self.contadorUsuarios)
                        self.contadorCredenciales=1
                        print("no por aqui¿¿¿¿¿¿")
                        sleep(20)
                        self.iniciarSecion()
                        sleep(15)
                        self.paginaDescargas()
                        sleep(7)
                        self.contadorLibros2=0
                        sleep(15)
                        print("numero de li bros por usuario ",self.contadorLibros2)
                        if self.contador==5:
                            self.contador=0  
        except OSError as e:
            print(e.strerror)
            print("error in urlPdf")
            guardarNumeroDescargas(self.contadorLibros)
            usuarioUsadosReescrivir(self.contadorUsuarios)
            print(self.contadorLibros)
            archivos=int(contarNueroArchivos())
            print(archivos)
            self.zLibraty.load_url("about:downloads")
            self.datosEsperaDescarga()
            self.peticiones()
            self.zLibraty.back()
            informaiconPdf(self.urlpdfGeleneralH)
    def DescargarContenido(self,_html):         
        self.contenido=_html
    def serrarTor(self):
        self.zLibraty.close()
    def datosEsperaDescarga(self):
        sleep(4)
        self.htmlValidador=self.zLibraty.page_source
    def validarDescarga(self):
        self.htmlFalce=self.zLibraty.page_source
        self.soupFalce=BeautifulSoup(self.htmlFalce,"html.parser")
        self.validarfalce=self.soupFalce.find_all("description",class_="downloadDetails downloadDetailsNormal")
        self.respuestafalce=re.search("value=.+",str(self.validarfalce))
        self.buscarFalse=self.respuestafalce.group()
        if re.search("Canceled",self.buscarFalse):
            print("se daño al descarga =(")
            sleep(5)
            pyautogui.click(1393,139)
            sleep(5)
        else :
            if re.search("Failed",self.buscarFalse):
                print("se daño al descarga pero vamos a solucionarlo =( ")
                sleep(5)
                pyautogui.click(1393,139)
                sleep(5)
            else:    
                print("la descarga va bien =)")
    def peticiones(self):   
        self.validarDescarga()      
        self.carga=0
        self.daño=0
        self.conteo=0
        while self.carga<100:
            self.soup=BeautifulSoup(self.htmlValidador,"html.parser")
            try:
                self.archivoDescarga=self.soup.find_all("progress",class_="downloadProgress")
                self.respaldo=re.split("value",str(self.archivoDescarga))
                self.tiempo=re.search("[0-9]+",self.respaldo[1])
                print(self.tiempo.group())
                self.carga=int(self.tiempo.group())
                self.datosEsperaDescarga()
                sleep(3)
                self.validarDescarga()
                if self.conteo==3:
                    pyautogui.press("enter")
                    self.conteo=0
            except:
                print("o  no ,se daño la descargar y no la e podido volver a iniciar")
                if self.daño==7:
                    os.system('rm -r /home/dd/zlibros/libros1920-1921/libro/*.*')         
                    raise
                self.daño+=1
                sleep(5)
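
The download monitoring in datosEsperaDescarga(), validarDescarga() and peticiones() boils down to polling about:downloads and reading the <progress class="downloadProgress"> element out of the page source. Below is a condensed, standalone sketch of that idea; it assumes an already-running TorBrowserDriver instance is passed in and the same markup as above, which may vary across Tor Browser versions.

import re
from time import sleep

from bs4 import BeautifulSoup


def wait_for_download(driver, poll_seconds=3, max_polls=200):
    # Sketch only: poll about:downloads until the first download's <progress>
    # element (class "downloadProgress") reports a value of 100.
    driver.load_url("about:downloads")
    for _ in range(max_polls):
        soup = BeautifulSoup(driver.page_source, "html.parser")
        bar = soup.find("progress", class_="downloadProgress")
        if bar is not None:
            match = re.search(r"[0-9]+", str(bar.get("value", "")))
            if match and int(match.group()) >= 100:
                return True   # download finished
        sleep(poll_seconds)
    return False              # gave up waiting

Unlike peticiones(), this sketch does not try to restart a stalled download; it only reports whether the progress bar reached 100% within the polling budget.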