from os import chdir, getcwd

from tbselenium.tbdriver import TorBrowserDriver


def getTorDriver(tor_installation_path: str, driver_path: str):
    # remember the original working directory (Tor Browser changes it while launching)
    originalPath = getcwd()
    driver = TorBrowserDriver(tor_installation_path,
                              executable_path=driver_path)
    driver.get("https://www.facebookcorewwwi.onion/")
    chdir(originalPath)
    return driver
Example #2
def capture(website,epoch):
    #print("TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT")
    #print(sys.argv)
    if 'tor' in sys.argv:
        print("TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT")
        browser = TorBrowserDriver(TBB_dir, socks_port=socks_port, control_port=control_port)
    else:
        profile = firfox_proxy(webdriver)
        browser = webdriver.Firefox(firefox_profile=profile, firefox_binary=firefox_dir)
    browser.delete_all_cookies()
    browser.get('http://' + website)
    def test_close_all_streams(self):
        streams_open = False
        new_tb_drv = TorBrowserDriver(cm.TBB_PATH)
        new_tb_drv.get('http://www.google.com')
        time.sleep(30)
        self.tor_controller.close_all_streams()
        for stream in self.tor_controller.controller.get_streams():
            print(stream.id, stream.purpose, stream.target_address, "open!")
            streams_open = True
        new_tb_drv.quit()
        self.assertFalse(streams_open, 'Could not close all streams.')
    def test_close_all_streams(self):
        streams_open = False
        new_tb_drv = TorBrowserDriver(cm.TBB_DIR, tbb_logfile_path='test.log')
        new_tb_drv.get('http://www.google.com')
        time.sleep(30)
        self.tor_controller.close_all_streams()
        for stream in self.tor_controller.controller.get_streams():
            print(stream.id, stream.purpose, stream.target_address, "open!")
            streams_open = True
        new_tb_drv.quit()
        self.assertFalse(streams_open, 'Could not close all streams.')
class TruliaHelper():

    def __init__(self):
        self.url = 'https://www.trulia.com'
        # need to set the Tor Browser path here.
        tbpath = "/home/XX/XXXX/tor-browser-linux64-8.0.8_en-US/tor-browser_en-US"
        self.driver = TorBrowserDriver(tbb_path=tbpath, tbb_logfile_path='test.log')
        # self.driver = webdriver.Firefox(firefox_profile=profile, firefox_binary=binary)
        # self.driver = webdriver.Chrome(executable_path='../utility/chromedriver.exe', chrome_options=chrome_options)

    # method to get items from given link.
    def getItems(self):
        df = pd.read_excel("/home/XXXXX/XXXXX/XXXXXX.xlsx")
        a = df['Site Address']
        b = df['Site City']
        c = df['Site State']
        d = df['Site Zip']
        items = []
        # keywords = ['512 W 10th St Perris CA 92570', 'New York, NY', 'San Francisco, CA', 'Washington, CA']
        for keyword in pd.concat([a, b, c, d], axis=1).values.tolist():
#         keywords = ['512 W 10th St Perris CA 92570'] * 10
#         for keyword in keywords:
            self.driver.get(self.url)
            search_box = self.driver.find_element_by_id("homepageSearchBoxTextInput")
            search_box.clear()
            search_box.send_keys(str(keyword))
            search_btn = self.driver.find_element_by_xpath("//button[@data-auto-test-id='searchButton']")
            if search_btn:
                search_btn.click()
                time.sleep(10)
                items.append(self.getItemDetail())
            # break
        self.driver.close()
        return items


    def getItemDetail(self):
        data = {}
        try:
            soup = BeautifulSoup(self.driver.page_source, u'html.parser')
            #image = soup.find("div", attrs={"class": "Tiles__TileBackground-fk0fs3-0 cSObNX"}).find("img")["src"]
            price = soup.find("div", attrs={"class": "Text__TextBase-sc-1cait9d-0-div Text__TextContainerBase-sc-1cait9d-1 hlvKRM"}).text
            # container = soup.find("div", attrs={"class": "resultsColumn"}).find("ul")
            # items = container.findAll("li", recursive=False)
            print(price)
        except:
            pass
        return data
  
    # method to start process.
    def start(self):
        items = self.getItems()
        print("Items : ",items)
Example #6
class TorBrowserThread(threading.Thread):
    def __init__(self, name, url):
        threading.Thread.__init__(self)
        self.name = name
        self.url = url

    def terminate(self):
        self.browser.quit()

    def run(self):
        self.browser = TorBrowserDriver("/home/dev/tor-browser")
        self.browser.profile.set_preference("browser.cache.disk.enable", False)
        self.browser.profile.set_preference("browser.cache.memory.enable",
                                            False)
        self.browser.profile.set_preference("browser.cache.offline.enable",
                                            False)
        self.browser.profile.set_preference("network.http.use-cache", False)
        self.browser.profile.set_preference("network.cookie.cookieBehavior", 2)
        self.browser.get(self.url)
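
# A minimal driver sketch for the thread class above, assuming the hard-coded
# Tor Browser path in run() is valid. Note that preferences set on
# browser.profile after the driver has launched may not take effect;
# tbselenium's pref_dict argument applies preferences at launch time instead.
crawler = TorBrowserThread("visit-1", "https://check.torproject.org")
crawler.start()
crawler.join(timeout=120)   # wait for run() to finish loading the page
crawler.terminate()         # quits the Tor Browser instance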
class RunDriverWithControllerTest(unittest.TestCase):
    """
    This test shows how to run tor with TorController and browse with TorBrowserDriver.
    """

    @unittest.skip("Only for didactic purposes.")
    def test_run_driver_with_controller(self):
        # run controller on port N
        custom_socks_port = 6666
        self.tor_controller = TorController(cm.TBB_PATH, torrc_dict={'SocksPort': str(custom_socks_port)})
        self.tor_process = self.tor_controller.launch_tor_service()

        # set driver and get a page
        self.tor_driver = TorBrowserDriver(cm.TBB_PATH, socks_port=custom_socks_port)
        self.tor_driver.get("http://google.com")

        # shutdown
        self.tor_driver.quit()
        self.tor_controller.kill_tor_proc()
class RunDriverWithControllerTest(unittest.TestCase):
    """
    This test shows how to run tor with TorController and browse with TorBrowserDriver.
    """
    @unittest.skip("Only for didactic purposes.")
    def test_run_driver_with_controller(self):
        # run controller on port N
        custom_socks_port = 6666
        self.tor_controller = TorController(
            cm.TBB_DIR, torrc_dict={'SocksPort': str(custom_socks_port)})
        self.tor_process = self.tor_controller.launch_tor_service()

        # set driver and get a page
        self.tor_driver = TorBrowserDriver(cm.TBB_DIR,
                                           socks_port=custom_socks_port)
        self.tor_driver.get("http://google.com")

        # shutdown
        self.tor_driver.quit()
        self.tor_controller.quit()
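
# A related minimal sketch using tbselenium's own stem helper instead of the
# project's TorController wrapper (TBB_PATH is assumed to point to the Tor
# Browser Bundle directory).
import tbselenium.common as cm
from tbselenium.tbdriver import TorBrowserDriver
from tbselenium.utils import launch_tbb_tor_with_stem

tor_process = launch_tbb_tor_with_stem(tbb_path=TBB_PATH)
with TorBrowserDriver(TBB_PATH, tor_cfg=cm.USE_STEM) as driver:
    driver.load_url("https://check.torproject.org")
tor_process.kill()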
Example #9
class DescargarPdf:
    def __init__(self):
        self.tbb_dir = "/usr/local/share/tor-browser_en-US"
        self.usuario = []
        self.contraseñaTxT = []
        self.conversor = '?convertedTo=pdf'

    def iniciarTor(self):
        self.zLibraty = TorBrowserDriver(self.tbb_dir,
                                         tbb_logfile_path='test.log')

    def iniciarSecion(self):
        self.zLibraty.refresh()
        sleep(10)
        self.element = self.zLibraty.find_element_by_name("email")
        self.element.send_keys(self.correo)
        sleep(2)
        self.element2 = self.zLibraty.find_elements_by_class_name(
            "form-control")[1]
        self.element2.send_keys(self.contraseña)
        self.element2.send_keys(Keys.RETURN)

    def paginaDescargas(self):
        self.zLibraty.load_url(self.url)
        self.html = self.zLibraty.page_source

    def paginaPrinsipal(self, añoInicial, añoFinal):
        self.urlAños = 'http://zlibraryexau2g3p.onion/s/?yearFrom=' + str(
            añoInicial) + '&yearTo=' + str(añoFinal)
        self.url = self.urlAños

    def cambiarPagina(self, x):
        self.url += '&page=' + str(x)

    def Crearcsv(self):
        print("hola")
        self.carpetaUrl = '/home/dgc7/Documentos/zlibrary/libros1920-1921/url'
        try:
            os.mkdir(self.carpetaUrl)
        except OSError as e:
            if e.errno != errno.EEXIST:
                raise
        self.escrivirUrlWed = csv.writer(
            open('/home/dgc7/Documentos/zlibrary/libros1920-1921/url/url2.csv',
                 'w'))
        self.imprimirUrlPdf = csv.writer(
            open(
                '/home/dgc7/Documentos/zlibrary/libros1920-1921/url/urlDowload2.csv',
                'w'))

    def credenciales(self, numeroUsuario):
        self.correo = self.usuario[numeroUsuario]
        self.contraseña = self.contraseñaTxT[numeroUsuario]
        self.urlLoguin = 'http://zlibraryexau2g3p.onion'
        self.zLibraty.load_url(self.urlLoguin)

    def UsuariosYcontraseñas(self):
        self.dir = '/home/dgc7/ejersiciosLibros/pyaton/ejemplos/scrapin/zlibrari/descargarLIbros/descargarparte1/contraseñasYcorreos.txt'
        self.data = open(self.dir, 'r+')
        for self.i in range(0, 200):
            if self.i % 2 == 0:
                self.usuario.append(self.data.readline())
            if self.i % 2 != 0:
                self.contraseñaTxT.append(self.data.readline())

    def urlPdf(self, contador, _contadorusuarios):
        self.boleanoPdf = 0
        self.contadorUsuariosCon = _contadorusuarios
        self.contadorLibros2 = 0
        self.contadorLibros = 0
        self.Crearcsv()
        self.soup = BeautifulSoup(self.html, 'html.parser')
        for self.urlwed in self.soup.find_all(itemprop="name"):
            self.contador = 0
            self.urlwed = self.urlwed.find('a', href=re.compile(''))
            self.urlDowload = self.urlwed.get('href')
            self.urlpdfGeleneralH = re.sub('/book/', 'https://b-ok.cc/book/',
                                           self.urlDowload)
            self.urlDowload = re.sub('/book/',
                                     'http://zlibraryexau2g3p.onion/book/',
                                     self.urlDowload)
            self.escrivirUrlWed.writerow([self.urlDowload])
            print(self.urlDowload)
            self.contadorLibros += 1
            self.contadorLibros2 += 1
            if self.contadorLibros2 == 10:
                self.contador += 1
                self.serrarTor()
                sleep(4)
                self.iniciarTor()
                self.contadorUsuariosCon += 1
                print(self.contadorUsuariosCon)
                self.credenciales(self.contadorUsuariosCon)
                self.iniciarSecion()
                sleep(7)
                self.contadorLibros2 = 0
                sleep(15)
                if self.contador == 5:
                    self.contador = 0
            voleano = validarFormato(self.urlpdfGeleneralH)
            for self.urlRedirec in range(0, 1):
                self.zLibraty.load_url(self.urlDowload)
                sleep(5)
                self.htmlPdf = self.zLibraty.page_source
                self.soupRedirec = BeautifulSoup(self.htmlPdf, 'html.parser')
                self.urlDowloadPDF = self.soupRedirec.find(
                    class_="btn btn-primary dlButton addDownloadedBook")
                self.urlDowloadPDF = self.urlDowloadPDF.get('href')
                self.urlDowloadPDF = re.sub(
                    '/dl/', 'http://zlibraryexau2g3p.onion/dl/',
                    self.urlDowloadPDF)
                self.imprimirUrlPdf.writerow([self.urlDowloadPDF])
                print(self.urlDowloadPDF)
                if voleano == True:
                    self.zLibraty.get(self.urlDowloadPDF)
                    voleano = False
                else:
                    self.convertirpdf = str(self.urlDowloadPDF) + str(
                        self.conversor)
                    self.zLibraty.get(self.convertirpdf)
                sleep(20)
                tiempoDescarga()
                informaiconPDf(self.urlpdfGeleneralH)

    def DescargarContenido(self, _html):
        self.contenido = _html

    def serrarTor(self):
        self.zLibraty.close()
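
# A hypothetical end-to-end sequence for the class above, assuming the
# credential file and the helper functions it references (validarFormato,
# tiempoDescarga, informaiconPDf) are available in the surrounding module.
descargador = DescargarPdf()
descargador.iniciarTor()
descargador.UsuariosYcontraseñas()
descargador.credenciales(0)
descargador.iniciarSecion()
descargador.paginaPrinsipal(1920, 1921)
descargador.paginaDescargas()
descargador.urlPdf(0, 0)
descargador.serrarTor()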
Example #10
def tor_web_crawler(index, link, ip_address):
    """
    This function is a web crawler for collection of traffic traces and saving those traces to pcap files.
    :param index: current trace of the link
    :param link: webpage address from where traffic is to be collected
    :param ip_address: ip-addres of the machine from which traffic is to be collected
    :param timeout: duration upto which traffic information needs to be collected
    :param pkt_count: number of packets to be collected for a particular trace
    :return:
    """

    # Extracting domain name for saving trace separately
    url = link
    lnk = tldextract.extract(url)
    domain_name = lnk.domain + '.' + lnk.suffix
    # print('Processing trace for domain name crawl : ', domain_name)

    # interface = 'enp0s31f6'
    # interface = 'any'
    interface = 'eth0'
    cap = DesiredCapabilities().FIREFOX
    cap["marionette"] = True  # optional
    # driver = TorBrowserDriver(TBB_PATH)
    # paths for saving the capture files (pcap) and the screenshots
    PP = PACP_PATH + '/' + domain_name
    SS = SCREEN_SHOT + '/' + domain_name

    try:
        driver = TorBrowserDriver(TBB_PATH)
        driver.get(url)
    except wde as e:
        print('Browser crashed:')
        print(e)
        print('Trying again in 10 seconds ...')
        time.sleep(10)
        # retry once with a fresh driver instance
        driver = TorBrowserDriver(TBB_PATH)
        driver.get(url)
        print('Success!\n')
    except Exception as e:
        raise Exception(e)

    if not os.path.isdir(PP):
        print('Creating directory for saving capture files (pcap) ...')
        os.makedirs(PP)
    else:
        pass

    if not os.path.isdir(SS):
        print('Creating directory for saving screenshots ...')
        os.makedirs(SS)
    else:
        pass

    # command to be executed for capturing the trace
    # command = "sudo tcpdump -i " + str(interface) + " -n host " + str(ip_address) + " -c " + str(pkt_count) + " -w " + PP + "/" + domain_name + "_" + str(index) + ".pcap "
    command = "sudo timeout 60 tcpdump -i " + str(
        interface) + " -n host " + str(
            ip_address) + " -w " + PP + "/" + domain_name + "_" + str(
                index) + ".pcap"
    print('Capture trace ...')
    capture = subprocess.Popen(command, shell=True)
    #     time.sleep(1)
    capture.wait()
    print('Traffic trace captured and saved successfully.')
    # save the screenshot
    driver.save_screenshot(SS + '/' + domain_name + '-' + str(index) + '.png')
    print('Screen shot of the webpage saved successfully.')
    driver.quit()
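
# A hypothetical invocation, assuming TBB_PATH, PACP_PATH and SCREEN_SHOT are
# configured and the machine's own IP address is passed for the tcpdump filter.
tor_web_crawler(index=0, link='https://example.com', ip_address='192.0.2.10')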
Example #11
def makeRequest(url, domain):
    """
    Makes HTTP request to url given as argument, after changing IP.
    """
    import time

    # Opening log file
    f = open(logfile_name, 'a')

    print('Changing IP...\n')

    # Below is method A using requests library without opening real TOR browser.
    # Method B will be used instead, which opens a real browser, so that JS code is executed
    # and Google Analytics tracks us as a real user.
    """
    # Resetting IP
    tr.reset_identity()
    # This command changes restarts tor service, resulting in IP address change. After '-p' flag insert user password.
    #os.system('sudo systemctl restart tor -p 0000')

    #Creating empty session object
    session = requests.session()
    session.proxies = {}

    # Adding proxies to session
    session.proxies['http'] = 'socks5h://localhost:9050'
    session.proxies['https'] = 'socks5h://localhost:9050'

    #Changing request headers
    headers = {}
    headers['User-agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'
    print('Request headers were set.\n') 
    new_ip = session.get('http://ipecho.net/plain').text
    

    # Executing requests 

    #Executing request and assigning response status code
    status_code = session.get(url).status_code
    """

    # Method B, using complete TOR Browser

    driver = TorBrowserDriver("/home/manos/Desktop/tor-browser_en-US")
    # driver.get('https://ipecho.net/plain')
    # new_ip = driver.find_element_by_tag_name('body').text

    checkConn()

    driver.get(url)
    time.sleep(2.0)
    driver.close()

    # Request logging
    log_time = 'Date: ' + str(datetime.datetime.now())[0:10] + '\nTime: ' + str(
        datetime.datetime.now())[11:19]
    f.write(
        log_time + '\nDomain: ' + domain + '\n'
        'Request sent to ' + url + '.' + '\nResponse status code: ' +
        str(200) +
        '\n*******************************************************************************************\n\n'
    )
    f.close()
    os.system('clear')
Example #12
class DescargarPdf:
    def __init__(self):
        self.tbb_dir = "/usr/local/share/tor-browser_en-US"
        self.usuario = []
        self.contraseñaTxT = []
        self.conversor = '?convertedTo=pdf'

    def iniciarTor(self):
        self.zLibraty = TorBrowserDriver(self.tbb_dir,
                                         tbb_logfile_path='test.log')

    def iniciarSecion(self):
        self.element = self.zLibraty.find_element_by_name("email")
        self.element.send_keys(self.correo)
        sleep(2)
        self.element2 = self.zLibraty.find_elements_by_class_name(
            "form-control")[1]
        self.element2.send_keys(self.contraseña)
        self.element2.send_keys(Keys.RETURN)

    def paginaDescargas(self):
        print("estoy en la funcion paginaDescagas")
        sleep(4)
        self.zLibraty.get(self.url)
        self.html = self.zLibraty.page_source

    def paginaPrinsipal(self, añoInicial, añoFinal):
        self.urlAños = 'http://zlibraryexau2g3p.onion/s/?yearFrom=' + str(
            añoInicial) + '&yearTo=' + str(añoFinal)
        self.url = self.urlAños

    def cambiarPagina(self, x):
        self.url += '&page=' + str(x)

    def Crearcsv(self):
        self.carpetaUrl = '/home/dgc7/Documentos/zlibrary/libros1920-1921/url'
        try:
            os.mkdir(self.carpetaUrl)
        except OSError as e:
            if e.errno != errno.EEXIST:
                raise
        self.escrivirUrlWed = csv.writer(
            open('/home/dgc7/Documentos/zlibrary/libros1920-1921/url/url2.csv',
                 'w'))
        self.imprimirUrlPdf = csv.writer(
            open(
                '/home/dgc7/Documentos/zlibrary/libros1920-1921/url/urlDowload2.csv',
                'w'))

    def credenciales(self, numeroUsuario):
        print("llegue")
        self.correo = self.usuario[numeroUsuario]
        self.contraseña = self.contraseñaTxT[numeroUsuario]
        self.urlLoguin = 'http://zlibraryexau2g3p.onion'
        self.zLibraty.get(self.urlLoguin)

    def UsuariosYcontraseñas(self):
        self.dir = '/home/dgc7/Documentos/zlibrary/credenciales/contraseñasYcorreos.txt'
        self.data = open(self.dir, 'r+')
        for self.i in range(0, 200):
            if self.i % 2 == 0:
                self.usuario.append(self.data.readline())
            if self.i % 2 != 0:
                self.contraseñaTxT.append(self.data.readline())

    def urlPdf(self):
        self.boleanoPdf = 0
        self.respaldoContador = 0
        self.contadorUsuarios = usuarioUsadosLeer()
        self.contadorLibros = datosDescarga(4)
        self.contadorLibros2 = self.contadorLibros % 10
        self.Crearcsv()
        self.soup = BeautifulSoup(self.html, 'html.parser')
        try:
            for self.urlwed in self.soup.find_all(itemprop="name"):
                self.contador = 0
                self.urlwed = self.urlwed.find('a', href=re.compile(''))
                self.urlDowload = self.urlwed.get('href')
                self.urlpdfGeleneralH = re.sub('/book/',
                                               'https://b-ok.cc/book/',
                                               self.urlDowload)
                self.urlDowload = re.sub(
                    '/book/', 'http://zlibraryexau2g3p.onion/book/',
                    self.urlDowload)
                self.escrivirUrlWed.writerow([self.urlDowload])
                print(self.urlDowload)
                voleano = validarFormato(self.urlpdfGeleneralH)
                guardarNumeroDescargas(self.contadorLibros)
                print(self.respaldoContador)
                if self.contadorLibros == self.respaldoContador:
                    for self.urlRedirec in range(0, 1):
                        self.zLibraty.get(self.urlDowload)
                        sleep(5)
                        self.htmlPdf = self.zLibraty.page_source
                        self.soupRedirec = BeautifulSoup(
                            self.htmlPdf, 'html.parser')
                        self.urlDowloadPDF = self.soupRedirec.find(
                            class_="btn btn-primary dlButton addDownloadedBook"
                        )
                        self.urlDowloadPDF = self.urlDowloadPDF.get('href')
                        self.urlDowloadPDF = re.sub(
                            '/dl/', 'http://zlibraryexau2g3p.onion/dl/',
                            self.urlDowloadPDF)
                        self.imprimirUrlPdf.writerow([self.urlDowloadPDF])
                        print(self.urlDowloadPDF)
                        print("vamos a por el if")
                        sleep(10)
                        if voleano == True:
                            self.zLibraty.set_page_load_timeout(8)
                            try:
                                self.zLibraty.get(self.urlDowloadPDF)
                            except:
                                self.zLibraty.set_page_load_timeout(70)
                                self.zLibraty.refresh()
                                print("funciona PDF ")

                            voleano = False
                            sleep(5)
                            self.contadorLibros += 1
                            self.contadorLibros2 += 1
                        else:
                            try:
                                self.zLibraty.set_page_load_timeout(5)
                                try:
                                    self.zLibraty.get(self.urlDowloadPDF)
                                except:
                                    sleep(4)
                                    pyautogui.press("down")
                                    sleep(2)
                                    pyautogui.press("enter")
                                self.zLibraty.set_page_load_timeout(70)
                            except:
                                print(
                                    "\nerror al controlasr el teclado y dar enter\n"
                                )
                                raise
                            sleep(5)
                            self.zLibraty.refresh()
                            self.contadorLibros += 1
                            self.contadorLibros2 += 1
                        sleep(20)
                        tiempoDescarga()
                        informaiconPdf(self.urlpdfGeleneralH)
                self.respaldoContador += 1
                if self.contadorLibros == self.respaldoContador:
                    if self.contadorLibros2 % 10 == 0:
                        print((self.contadorLibros2 - 1) % 10)
                        self.contador += 1
                        pyautogui.hotkey("ctrl", "shift", "u")
                        sleep(2)
                        pyautogui.press("enter")
                        sleep(7)
                        pyautogui.press("enter")
                        sleep(15)
                        self.contadorUsuarios += 1
                        print(self.contadorUsuarios)
                        try:
                            self.zLibraty.switch_to_window(
                                self.zLibraty.window_handles[0])
                        except:
                            print("error al cambian de  ventana")
                        usuarioUsadosReescrivir(self.contadorUsuarios)
                        print("por aqui¿¿¿¿¿¿")
                        self.credenciales(self.contadorUsuarios)
                        print("no por aqui¿¿¿¿¿¿")
                        sleep(23)
                        self.iniciarSecion()
                        sleep(7)
                        self.contadorLibros2 = 0
                        sleep(15)
                        print("numero de li bros por usuario ",
                              self.contadorLibros2)
                        if self.contador == 5:
                            self.contador = 0
        except OSError as e:
            print(e.strerror)
            print("error en la urlPdf:::::")
            guardarNumeroDescargas(self.contadorLibros)
            usuarioUsadosReescrivir(self.contadorUsuarios)
            print(self.contadorLibros)
            raise
        print("termine la pagina")

    def DescargarContenido(self, _html):
        self.contenido = _html

    def serrarTor(self):
        self.zLibraty.close()
Example #13
class Visit(object):
    """Hold info about a particular visit to a page."""

    def __init__(self, batch_num, site_num, instance_num, page_url, base_dir, tor_controller, bg_site=None,
                 experiment=cm.EXP_TYPE_WANG_AND_GOLDBERG, xvfb=False, capture_screen=True):
        self.batch_num = batch_num
        self.site_num = site_num
        self.instance_num = instance_num
        self.page_url = page_url
        self.bg_site = bg_site
        self.experiment = experiment
        self.base_dir = base_dir
        self.visit_dir = None
        self.visit_log_dir = None
        self.tbb_version = cm.RECOMMENDED_TBB_VERSION
        self.capture_screen = capture_screen
        self.tor_controller = tor_controller
        self.xvfb = xvfb
        self.init_visit_dir()
        self.pcap_path = os.path.join(
            self.visit_dir, "{}.pcap".format(self.get_instance_name()))

        if self.xvfb and not cm.running_in_CI:
            wl_log.info("Starting XVFBm %sX%s" % (cm.XVFB_W, cm.XVFB_H))
            self.vdisplay = Xvfb(width=cm.XVFB_W, height=cm.XVFB_H)
            self.vdisplay.start()

        # Create new instance of TorBrowser driver
        TorBrowserDriver.add_exception(self.page_url)
        self.tb_driver = TorBrowserDriver(tbb_path=cm.TBB_PATH,
                                          tbb_logfile_path=join(self.visit_dir, "logs", "firefox.log"))
        self.sniffer = Sniffer()  # sniffer to capture the network traffic

    def init_visit_dir(self):
        """Create results and logs directories for this visit."""
        visit_name = str(self.instance_num)
        self.visit_dir = os.path.join(self.base_dir, visit_name)
        ut.create_dir(self.visit_dir)
        self.visit_log_dir = os.path.join(self.visit_dir, 'logs')
        ut.create_dir(self.visit_log_dir)

    def get_instance_name(self):
        """Construct and return a filename for the instance."""
        inst_file_name = '{}_{}_{}' \
            .format(self.batch_num, self.site_num, self.instance_num)
        return inst_file_name

    def filter_guards_from_pcap(self):
        guard_ips = set([ip for ip in self.tor_controller.get_all_guard_ips()])
        wl_log.debug("Found %s guards in the concensus.", len(guard_ips))
        orig_pcap = self.pcap_path + ".original"
        copyfile(self.pcap_path, orig_pcap)
        try:
            preader = PcapReader(orig_pcap)
            pcap_filtered = []
            for p in preader:
                if IP not in p:
                    pcap_filtered.append(p)
                    continue
                ip = p.payload
                if ip.dst in guard_ips or ip.src in guard_ips:
                    pcap_filtered.append(p)
            wrpcap(self.pcap_path, pcap_filtered)
        except Exception as e:
            wl_log.error("ERROR: filtering pcap file: %s. Check old pcap: %s",
                         e, orig_pcap)
        else:
            os.remove(orig_pcap)

    def post_crawl(self):
        pass
        # TODO: add some sanity checks?

    def cleanup_visit(self):
        """Kill sniffer and Tor browser if they're running."""
        wl_log.info("Cleaning up visit.")
        wl_log.info("Cancelling timeout")
        ut.cancel_timeout()

        if self.sniffer and self.sniffer.is_recording:
            wl_log.info("Stopping sniffer...")
            self.sniffer.stop_capture()

        # remove non-tor traffic
        self.filter_guards_from_pcap()

        if self.tb_driver and self.tb_driver.is_running:
            # shutil.rmtree(self.tb_driver.prof_dir_path)
            wl_log.info("Quitting selenium driver...")
            self.tb_driver.quit()

        # close all open streams to prevent pollution
        self.tor_controller.close_all_streams()
        if self.xvfb and not cm.running_in_CI:
            wl_log.info("Stopping display...")
            self.vdisplay.stop()

        # after closing the driver and stopping the sniffer, run post_crawl
        self.post_crawl()

    def take_screenshot(self):
        try:
            out_png = os.path.join(self.visit_dir, 'screenshot.png')
            wl_log.info("Taking screenshot of %s to %s" % (self.page_url,
                                                           out_png))
            self.tb_driver.get_screenshot_as_file(out_png)
            if cm.running_in_CI:
                wl_log.debug("Screenshot data:image/png;base64,%s"
                             % self.tb_driver.get_screenshot_as_base64())
        except:
            wl_log.info("Exception while taking screenshot of: %s"
                        % self.page_url)

    def get_wang_and_goldberg(self):
        """Visit the site according to Wang and Goldberg (WPES'13) settings."""
        ut.timeout(cm.HARD_VISIT_TIMEOUT)  # set timeout to stop the visit
        self.sniffer.start_capture(self.pcap_path,
                                   'tcp and not host %s and not tcp port 22 and not tcp port 20'
                                   % LOCALHOST_IP)
        time.sleep(cm.PAUSE_BETWEEN_INSTANCES)
        try:
            self.tb_driver.set_page_load_timeout(cm.SOFT_VISIT_TIMEOUT)
        except:
            wl_log.info("Exception setting a timeout {}".format(self.page_url))

        wl_log.info("Crawling URL: {}".format(self.page_url))

        t1 = time.time()
        self.tb_driver.get(self.page_url)
        page_load_time = time.time() - t1
        wl_log.info("{} loaded in {} sec"
                    .format(self.page_url, page_load_time))
        time.sleep(cm.WAIT_IN_SITE)
        if self.capture_screen:
            self.take_screenshot()
        self.cleanup_visit()

    def get_multitab(self):
        """Open two tab, use one to load a background site and the other to
        load the real site."""
        PAUSE_BETWEEN_TAB_OPENINGS = 0.5
        ut.timeout(cm.HARD_VISIT_TIMEOUT)  # set timeout to kill running procs
        # load a blank page - a page is needed to send keys to the browser
        self.tb_driver.get(BAREBONE_HOME_PAGE)
        self.sniffer.start_capture(self.pcap_path,
                                   'tcp and not host %s and not tcp port 22 and not tcp port 20'
                                   % LOCALHOST_IP)

        time.sleep(cm.PAUSE_BETWEEN_INSTANCES)
        try:
            self.tb_driver.set_page_load_timeout(cm.SOFT_VISIT_TIMEOUT)
        except:
            wl_log.info("Exception setting a timeout {}".format(self.page_url))

        wl_log.info("Crawling URL: {} with {} in the background".
                    format(self.page_url, self.bg_site))

        body = self.tb_driver.find_element_by_tag_name("body")
        body.send_keys(Keys.CONTROL + 't')  # open a new tab
        # now that the focus is on the address bar, load the background
        # site by "typing" it to the address bar and "pressing" ENTER (\n)
        # simulated by send_keys function
        body.send_keys('%s\n' % self.bg_site)

        # the delay between the loading of background and real sites
        time.sleep(PAUSE_BETWEEN_TAB_OPENINGS)

        body = self.tb_driver.find_element_by_tag_name("body")
        body.send_keys(Keys.CONTROL + 't')  # open a new tab

        t1 = time.time()
        self.tb_driver.get(self.page_url)  # load the real site in the 2nd tab

        page_load_time = time.time() - t1
        wl_log.info("{} loaded in {} sec"
                    .format(self.page_url, page_load_time))
        time.sleep(cm.WAIT_IN_SITE)
        if self.capture_screen:
            self.take_screenshot()
        self.cleanup_visit()

    def get(self):
        """Call the specific visit function depending on the experiment."""
        if self.experiment == cm.EXP_TYPE_WANG_AND_GOLDBERG:
            self.get_wang_and_goldberg()
        elif self.experiment == cm.EXP_TYPE_MULTITAB_ALEXA:
            self.get_multitab()
        else:
            raise ValueError("Cannot determine experiment type")
Example #14
class Browser:
    
    def __init__(self, config, browser, pet, env_type, proxy_setting):

        """
        If given valid proxy settings, this function will configure socks5 proxy properly on chrome (brave) and firefox.
        """
        def setup_socks5_proxy(browser, profile, proxy_setting):
            if proxy_setting is not None:
                address = proxy_setting["address"]
                port = proxy_setting["port"]
                bypass_list = proxy_setting["bypass-list"]

                if browser == "chrome":
                    # https://sordidfellow.wordpress.com/2015/05/21/ssh-tunnel-for-chrome/
                    profile.add_argument("--proxy-server=socks5://%s:%s" % (address, port))
                    profile.add_argument("--proxy-bypass-list=%s" % bypass_list)
                    print("socks5 proxy configured on chrome")

                elif browser == "firefox":
                    # https://developer.mozilla.org/en-US/docs/Mozilla/Preferences/Mozilla_networking_preferences
                    profile.set_preference("network.proxy.type", 1)
                    profile.set_preference("network.proxy.socks", address)
                    profile.set_preference("network.proxy.socks_port", port)
                    profile.set_preference("network.proxy.socks_version", 5)
                    profile.set_preference("network.proxy.socks_remote_dns", "true")
                    profile.set_preference("network.proxy.no_proxies_on", bypass_list)
                    print("socks5 proxy configured on firefox")

        """
            If the program is run in a virtual machine, xvfbwrapper has to get installed first.        
        """
        self.env_type = env_type
        if (env_type == "vm"):
            print("xvfb")
            from xvfbwrapper import Xvfb
            width, height, depth = get_display_parameters(config)
            self.vdisplay = Xvfb(width=width, height=height, colordepth=depth)
            self.vdisplay.start()

        print("Browser:", browser, "PET:", pet)
        pet_config = PetConfig()

        if pet == "brave":
            print("brave")
            chrome_options = ChromeOptions()
            bPath, dPath = pet_config.getPetBrowserDriverPath(pet,browser,env_type)
            print(bPath, dPath)
            chromedriver = dPath
            chrome_options.binary_location = bPath
            setup_socks5_proxy("chrome", chrome_options, proxy_setting)
            os.environ["webdriver.chrome.driver"] = chromedriver
            if env_type == "vm":
                chrome_options.add_argument("--no-sandbox")
            self.driver = webdriver.Chrome(executable_path=chromedriver, chrome_options=chrome_options)
            press_enter(1)
            return

        elif pet == "tor":
            plt= platform.system().lower()
            if plt == "darwin" or plt == "windows": # https://stackoverflow.com/questions/15316304/open-tor-browser-with-selenium
                print("native tor")
                bPath, dPath = pet_config.getPetBrowserDriverPath(pet,browser,env_type)
                print(bPath, dPath)
                profile = FirefoxProfile()
                profile.set_preference("network.proxy.type", 0)
                binary = FirefoxBinary(bPath)
                self.driver = webdriver.Firefox(firefox_profile = profile, firefox_binary= binary, executable_path = dPath)
            elif plt == "linux": # https://medium.com/@manivannan_data/selenium-with-tor-browser-using-python-7b3606b8c55c
                print("vm tor")
                from tbselenium.tbdriver import TorBrowserDriver
                pref_dict = {"network.proxy.no_proxies_on": "http://10.0.2.2/, http://192.168.4.204/"}
                self.driver = TorBrowserDriver(os.environ['TBB_PATH'], pref_dict = pref_dict)
            return


        aPath, bPath, dPath, pref = pet_config.getPetBrowserDriverPath(pet,browser,env_type)
        if (browser == "firefox"):
            fp = FirefoxProfile()
            setup_socks5_proxy("firefox", fp, proxy_setting)
            binary = FirefoxBinary(bPath)
            if pref != None:
                fp.set_preference(pref[0],pref[1])
            self.driver = webdriver.Firefox(firefox_profile=fp, firefox_binary=binary, executable_path=dPath)

            if (aPath):
                self.driver.install_addon(aPath)            

        elif (browser == "chrome"):
            chrome_options = ChromeOptions()
            chrome_options = webdriver.ChromeOptions() #https://github.com/SeleniumHQ/selenium/issues/5966
            setup_socks5_proxy("chrome", chrome_options, proxy_setting)

            if aPath:
                chrome_options.add_extension(aPath)
            if pref != None:
                chrome_options.add_experimental_option(pref[0], pref[1])
            chrome_options.binary_location = bPath
            os.environ["webdriver.chrome.driver"] = dPath

            time.sleep(1)
            self.driver = webdriver.Chrome(executable_path=dPath, chrome_options=chrome_options)
            # to escape the alert chrome display on first visit
            time.sleep(1)
            press_enter(1)
        elif(browser == "safari"):
            self.driver = webdriver.Safari()
        else:
            print("Unsupported Browser")
            sys.exit(0)

    def quit(self):
        try:
            self.driver.quit()
        except:
            self.driver.close()     # for Tor
        if (self.env_type == "vm"):
            self.vdisplay.stop()


    def visit_sites(self, site_list, delay=5): 
        """Visits all pages in site_list with delay"""
        for site in site_list:
            sys.stdout.write(".")
            sys.stdout.flush()
            try:
                self.driver.get(site)
                time.sleep(delay)
            except:
                print("Unexpected error:", sys.exc_info()[0])
Example #15
    "/home/ubuntu/Downloads/tor-browser-linux64-8.5.4_en-US/tor-browser_en-US")
insightsLogin(driver, "*****@*****.**", "hotmail143")

while (True):
    try:
        sql = "SELECT `term` FROM `insights` where completed is null order by rand() limit 1"
        cursor.execute(sql)
        result = cursor.fetchall()
        originalTerm = result[0]['term']
        print(originalTerm)
        term = originalTerm.replace("-", " ")

        driver.implicitly_wait(10)

        driver.get(
            "https://www.udemy.com/instructor/marketplace-insights/?q=" +
            term + "&lang=en")
        sleep(random.randint(5, 8))

        try:
            demandEl = driver.find_element_by_xpath(
                '//div[contains(@class,"panel-body")]/div[contains(@class,"course-label-metrics-opportunity")]/div[1]/div/div[2]'
            )
            print(demandEl.text)
        except NoSuchElementException:
            print("trying hyphenated...")
            term = term.replace(" ", "-")
            driver.get(
                "https://www.udemy.com/instructor/marketplace-insights/?q=" +
                term + "&lang=en")
            sleep(random.randint(5, 8))
Example #16
# open list of urls for testing
with open('markMeasureResults.txt', 'r') as url_file:
    test_urls = url_file.readlines()

driver = TorBrowserDriver(tor_dir)  #, pref_dict=rfp)
driver.set_page_load_timeout(60)
cached = set()
# loop repeatedly over shuffled copies of the URL list
while True:
    random.shuffle(test_urls)
    for i, url in enumerate(test_urls):
        try:
            # request url from list
            print("Fetching " + str(url), end='')
            driver.get(url)

            # pull window.performance.timing after loading the page and add information about url and number of run
            perf_timings = driver.execute_script(
                "return window.performance.timing")
            perf_timings['timestamp'] = datetime.now()
            perf_timings['path'] = tor_dir
            perf_timings['cached'] = str(url in cached)
            perf_timings['url'] = str(url)
            perf_timings['error'] = 'NONE'

            #print(str(set(perf_timings.keys())-set(colList)))
            #TODO Put in Database
            insertDict(sql, perf_timings)
            cached.add(url)
        except Exception as E:  # what to do in case that an exception is thrown (which happens usually upon page load timeout)
Example #17
n = 0
w_ = open('%s/11th/11th_odds' % PATH, 'w')
def parser(element):
    x = driver.find_element_by_id(element).text
    time.sleep(3)
    #x_h = x_el.get_attribute('outerHTML')
    #x_ = x_h.split('">')
    #x = x_[1]
    #x = x[:x.index('<')]    
    return x

for line in f:
    n += 1
    

    driver.get(line[:-1])
    time.sleep(3)
    
    elem1 = parser('lpRow1')
    elem2 = parser('lpRow2')
    x = elem1+elem2 
    x_ = x.split('\n')
    #print(x_)
    x0 = x_[0]
    
    if x0[0] == ' ':
        x0 = x0[1:]
    x1 = x_[1]
    x1_ = x1.split(' ')
    sell = x1_[1]
    buy = x1_[2]
Example #18
class TruliaHelper():
    def __init__(self):
        self.url = 'https://www.trulia.com'
        # need to set Tor Browser path here.
        tbpath = "/home/gc14/Documents/softwares/tor-browser_en-US"
        self.driver = TorBrowserDriver(tbb_path=tbpath,
                                       tbb_logfile_path='test.log')
        # self.driver = webdriver.Firefox(firefox_profile=profile, firefox_binary=binary)
        # self.driver = webdriver.Chrome(executable_path='../utility/chromedriver.exe', chrome_options=chrome_options)

    # method to get items from given link.
    def getItems(self):
        items = []
        # keywords = ['512 W 10th St Perris CA 92570', 'New York, NY', 'San Francisco, CA', 'Washington, CA']
        keywords = ['512 W 10th St Perris CA 92570'] * 2
        for keyword in keywords:
            self.driver.get(self.url)
            search_box = self.driver.find_element_by_id(
                "homepageSearchBoxTextInput")
            search_box.clear()
            search_box.send_keys(keyword)
            search_btn = self.driver.find_element_by_xpath(
                "//button[@data-auto-test-id='searchButton']")
            if search_btn:
                print("Going to click")
                search_btn.click()
                time.sleep(10)
                items.append(self.getItemDetail())

        self.driver.close()
        return items

    def getItemDetail(self):
        data = {}
        try:
            soup = BeautifulSoup(self.driver.page_source, u'html.parser')
            image = soup.find("div",
                              attrs={
                                  "class":
                                  "Tiles__TileBackground-fk0fs3-0 cSObNX"
                              }).find("img")["src"]
            price = soup.find(
                "div",
                attrs={
                    "class":
                    "Text__TextBase-sc-1cait9d-0-div Text__TextContainerBase-sc-1cait9d-1 hlvKRM"
                }).text
            # container = soup.find("div", attrs={"class": "resultsColumn"}).find("ul")
            # items = container.findAll("li", recursive=False)
            data.update({"image": image, "price": price})
        except:
            pass
        return data

    # method to write csv file
    def writeCSVFile(self, data):
        try:
            with open(
                    '/home/gc14/Documents/fiverr/custom_scrapers/home/trulia.csv',
                    mode='w') as csv_file:
                fieldnames = ['Image', 'Price']
                writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
                writer.writeheader()
                for d in data:
                    writer.writerow({'Image': d['image'], 'Price': d['price']})
                csv_file.close()
            print("File written successfully.")
        except:
            print(sys.exc_info())
            pass

    # method to start process.
    def start(self):
        items = self.getItems()
        print("Items : ", len(items))
        if items:
            self.writeCSVFile(items)
Example #19
class DescargarPdf:
    def __init__(self):
        self.contadorCredenciales=0
        self.tbb_dir = "/usr/local/share/tor-browser_en-US"
        self.usuario=[]
        self.contraseñaTxT=[]
        self.conversor='?convertedTo=pdf'
    def iniciarTor(self):
        self.zLibraty = TorBrowserDriver(self.tbb_dir, tbb_logfile_path='test.log')
    def iniciarSecion(self):
        self.element=self.zLibraty.find_element_by_name("email")
        self.element.send_keys(self.correo)
        sleep(2)
        self.element2=self.zLibraty.find_elements_by_class_name("form-control")[1]
        self.element2.send_keys(self.contraseña)
        self.element2.send_keys(Keys.RETURN)
    def paginaDescargas(self):
        print("estoy en la funcion paginaDescagas")
        self.zLibraty.load_url(self.url)
        sleep(4)
        self.html=self.zLibraty.page_source
    def paginaPrinsipal(self,añoInicial,añoFinal):
        self.urlAños='http://zlibraryexau2g3p.onion/s/?yearFrom='+str(añoInicial)+'&yearTo='+str(añoFinal)
        self.url=self.urlAños  
    def cambiarPagina(self,x):
        print("estoy en cambiar pagina prinsipal")
        self.url+='&page='+str(x)
        print(self.url)
    def Crearcsv(self):
        desde=datosDescarga(1)
        asta=datosDescarga(2)
        self.carpetaUrl='/home/dd/Documentos/zlibrary/libros'+str(desde)+'-'+str(asta)+'/url'
        try :
             os.mkdir(self.carpetaUrl)
        except OSError as e:
            if e.errno != errno.EEXIST:
                raise
        self.escrivirUrlWed=csv.writer(open('/home/dd/Documentos/zlibrary/libros'+str(desde)+'-'+str(asta)+'/url/url2.csv','w'))
        self.imprimirUrlPdf=csv.writer(open('/home/dd/Documentos/zlibrary/libros'+str(desde)+'-'+str(asta)+'/url/urlDowload2.csv','w'))
    def credenciales(self,numeroUsuario):
        print("llegue")
        if self.contadorCredenciales==0 or self.contadorCredenciales==20:
            self.zLibraty.load_url("https://singlelogin.org/")
            self.zLibraty.find_element_by_name("redirectToHost").click()
            sleep(3)
            pyautogui.press("down")
            sleep(2)
            pyautogui.press("down")
            sleep(1)
            pyautogui.press("enter")
        sleep(5)
        self.correo=self.usuario[numeroUsuario]
        self.contraseña=self.contraseñaTxT[numeroUsuario]
    def UsuariosYcontraseñas(self):
        self.dir='/home/dd/Documentos/zlibrary/credenciales/contraseñasYcorreos.txt'
        self.data=open(self.dir,'r+')
        for self.i in range(0,200):
            if self.i%2==0 :
                self.usuario.append(self.data.readline())
            if self.i%2!=0:
                self.contraseñaTxT.append(self.data.readline())
    def urlPdf(self):
        self.contadorCredenciales=1
        self.boleanoPdf=0
        self.respaldoContador=0
        self.contadorUsuarios=usuarioUsadosLeer()
        self.contadorLibros=datosDescarga(4)
        self.contadorLibros2=self.contadorLibros%10
        self.Crearcsv()
        self.soup=BeautifulSoup(self.html,'html.parser')
        try:
            for self.urlwed in self.soup.find_all(itemprop="name"):
                self.contador=0
                self.urlwed=self.urlwed.find('a',href=re.compile(''))
                self.urlDowload=self.urlwed.get('href')
                self.urlpdfGeleneralH=re.sub('/book/','https://b-ok.cc/book/',self.urlDowload)
                self.urlDowload=re.sub('/book/','http://zlibraryexau2g3p.onion/book/',self.urlDowload)
                self.escrivirUrlWed.writerow([self.urlDowload])
                print(self.urlDowload)
                self.voleano=validarFormato(self.urlpdfGeleneralH)
                guardarNumeroDescargas(self.contadorLibros) 
                print(self.respaldoContador) 
                if self.contadorLibros==self.respaldoContador:
                    for self.urlRedirec in range(0,1):
                        self.zLibraty.load_url(self.urlDowload)
                        sleep(5)
                        self.htmlPdf=self.zLibraty.page_source
                        self.soupRedirec=BeautifulSoup(self.htmlPdf,'html.parser')
                        self.urlDowloadPDF=self.soupRedirec.find(class_="btn btn-primary dlButton addDownloadedBook")
                        self.urlDowloadPDF=self.urlDowloadPDF.get('href')
                        self.urlDowloadPDF=re.sub('/dl/','http://zlibraryexau2g3p.onion/dl/',self.urlDowloadPDF)
                        self.imprimirUrlPdf.writerow([self.urlDowloadPDF])
                        print(self.urlDowloadPDF)
                        print("vamos a por el if")
                        sleep(15)
                        if self.voleano==True:
                            self.zLibraty.set_page_load_timeout(12)
                            try:
                                self.zLibraty.load_url(self.urlDowloadPDF)
                            except:
                                sleep(5)
                                self.zLibraty.set_page_load_timeout(7000)
                                print("funciona PDF ")                                
                            self.voleano=False
                            sleep(5)
                            self.contadorLibros+=1
                            self.contadorLibros2+=1
                        else:                          
                            self.zLibraty.set_page_load_timeout(12)
                            try:
                                self.zLibraty.load_url(self.urlDowloadPDF)
                            except:
                                sleep(8)
                                pyautogui.press("down")
                                sleep(2)
                                pyautogui.press("enter")
                            self.zLibraty.set_page_load_timeout(7000)
                            sleep(5)
                            self.contadorLibros+=1
                            self.contadorLibros2+=1
                        self.zLibraty.load_url("about:downloads")
                        self.datosEsperaDescarga()
                        self.peticiones()
                        self.zLibraty.back()
                        informaiconPdf(self.urlpdfGeleneralH)
                        guardarNumeroDescargas(self.contadorLibros)
                self.respaldoContador+=1                   
                if self.contadorLibros==self.respaldoContador:
                    if self.contadorLibros2%10==0:
                        print((self.contadorLibros2-1)%10)
                        self.contador+=1
                        if self.contadorLibros==20:
                            self.contadorCredenciales=20
                            print("saliendo de secion¡¡¡¡¡¡")
                            pyautogui.moveTo(1707,245)
                            pyautogui.hotkey("ctrl","shift","u")
                            sleep(2)
                            pyautogui.press("enter")
                            sleep(7)
                            pyautogui.press("enter")
                            sleep(15)
                        else:
                            print("saliendo de secion")
                            self.zLibraty.get("http://zlibraryexau2g3p.onion/logout.php")          
                        self.contadorUsuarios+=1
                        print(self.contadorUsuarios)
                        try:
                            self.zLibraty.switch_to_window(self.zLibraty.window_handles[0])
                        except:
                            print("error al cambian de  ventana")
                       
                        usuarioUsadosReescrivir(self.contadorUsuarios)
                        print("por aqui¿¿¿¿¿¿")
                        self.credenciales(self.contadorUsuarios)
                        self.contadorCredenciales=1
                        print("no por aqui¿¿¿¿¿¿")
                        sleep(20)
                        self.iniciarSecion()
                        sleep(15)
                        self.paginaDescargas()
                        sleep(7)
                        self.contadorLibros2=0
                        sleep(15)
                        print("numero de li bros por usuario ",self.contadorLibros2)
                        if self.contador==5:
                            self.contador=0  
        except OSError as e :
            print(e.strerror)
            print("error en la urlPdf:::::")
            guardarNumeroDescargas(self.contadorLibros)
            usuarioUsadosReescrivir(self.contadorUsuarios)
            print(self.contadorLibros)
            archivos=int(contarNueroArchivos())
            print(archivos)
            self.zLibraty.load_url("about:downloads")
            self.datosEsperaDescarga()
            self.peticiones()
            self.zLibraty.back()
            informaiconPdf(self.urlpdfGeleneralH)
    def DescargarContenido(self,_html):         
        self.contenido=_html
    def serrarTor(self):
         self.zLibraty.close()
    def datosEsperaDescarga(self):
        sleep(4)
        self.htmlValidador=self.zLibraty.page_source
    def validarDescarga(self):
        self.htmlFalce=self.zLibraty.page_source
        self.soupFalce=BeautifulSoup(self.htmlFalce,"html.parser")
        self.validarfalce=self.soupFalce.find_all("description",class_="downloadDetails downloadDetailsNormal")
        self.respuestafalce=re.search("value=.+",str(self.validarfalce))
        self.buscarFalse=self.respuestafalce.group()
        if re.search("Canceled",self.buscarFalse):
            print("se daño al descarga =(")
            sleep(5)
            pyautogui.click(1393,139)
            sleep(5)
        else :
            if re.search("Failed",self.buscarFalse):
                print("se daño al descarga pero vamos a solucionarlo =( ")
                sleep(5)
                pyautogui.click(1393,139)
                sleep(5)
            else:    
                print("la descarga va bien =)")
    def peticiones(self):   
        self.validarDescarga()      
        self.carga=0
        self.daño=0
        self.conteo=0
        while self.carga<100:
            self.soup=BeautifulSoup(self.htmlValidador,"html.parser")
            try:
                self.archivoDescarga=self.soup.find_all("progress",class_="downloadProgress")
                self.respaldo=re.split("value",str(self.archivoDescarga))
                self.tiempo=re.search("[0-9]+",self.respaldo[1])
                print(self.tiempo.group())
                self.carga=int(self.tiempo.group())
                self.datosEsperaDescarga()
                sleep(3)
                self.validarDescarga()
                if self.conteo==3:
                    pyautogui.press("enter")
                    self.conteo=0
            except:
                print("o  no ,se daño la descargar y no la e podido volver a iniciar")
                if self.daño==7:
                    os.system('rm -r /home/dd/zlibros/libros1920-1921/libro/*.*')         
                    raise
                self.daño+=1
                sleep(5)
Example #20
    else:
        sys.stderr.write("Please choose 1 for Chrome or 2 for Tor")

    os.chdir(
        '/home/kasai/Documents/adscape/code/newScape/adtool')  #settool dir

    frames_folder = domain + "/Frames"
    if not os.path.exists(frames_folder):
        os.makedirs(frames_folder)

    ads_folder = domain + "/Ads"
    if not os.path.exists(ads_folder):
        os.makedirs(ads_folder)

    start_time = time.time()
    driver.get(site)
    end_time = time.time()
    time.sleep(2)
    plt = round(end_time - start_time, 3)

    scroll_page(driver)  # scroll the page
    time.sleep(3)

    frame_count = 0
    image_count = 0
    ad_count = 0
    tracking_pix = 0
    processed_list_frame = []
    processed_list_img = []
    emb = 0
Example #21
class InstagramScraper():
    """InstagramScraper: Web scraper class.

    This class is used represent the various browser types using numeric values instead
    of string values. This to allow for easy checking and changing/expanding of 
    values.
    """
    def __init__(self, browser_type, user_data_dir=None):
        # internal flag so we know what sort of web browser we are instantiating
        self.WebBrowserType = browser_type 

        # various browser initiation according to different browser types
        if (browser_type == WebBrowserType.CHROME):
            print_dbg_msg_L1("\t[+] Starting Chrome...")
            options = webdriver.chrome.options.Options()
            options.add_argument("--headless")
            options.add_argument("--disable-gpu")
            options.add_argument("--no-sandbox")
            options.add_argument("--no-default-browser-check")
            self.browser = webdriver.Chrome(chrome_options=options)
        elif (browser_type == WebBrowserType.CHROME_DEBUG):
            print_dbg_msg_L1("\t[+] Starting Chrome in debug mode...")
            options = webdriver.chrome.options.Options()
            options.add_argument("--headless")
            options.add_argument("--disable-gpu")
            options.add_argument("--no-sandbox")
            options.add_argument("--no-default-browser-check")
            options.add_argument("--remote-debugging-port=9222")
            if user_data_dir == None:
                user_data_dir=chrome_debug_profile + "/" + str(secrets.token_hex(16))
            #print_dbg_msg_L1("\t\t[+] User data dir: " + user_data_dir)
            if not os.path.exists(user_data_dir):
                os.makedirs(user_data_dir)
            options.add_argument("--user-data-dir=" + user_data_dir)
            self.browser = webdriver.Chrome(chrome_options=options)
        elif (browser_type == WebBrowserType.TOR):
            '''
            Sometimes the Tor process fails to launch or the web browser fails
            to instantiate properly. Regardless, loop until both the Tor
            process and the browser are instantiated correctly. So far, across
            30,000 runs, instantiation has usually succeeded after at most one
            failure.
            '''
            while True:
                try:
                    self.tor_process = launch_tbb_tor_with_stem(tbb_path=tbb_dir)
                    self.browser = TorBrowserDriver(tbb_dir, \
                        tor_cfg=cm.USE_STEM, \
                        tbb_profile_path=tbb_ff_default_dir, \
                        tbb_logfile_path=tbb_log_dir)
                except Exception as e:
                    print_dbg_msg_L1("\t[+] " + str(e))
                    print_dbg_msg_L1("\t[+] Error instantiating browser, retrying...")
                    time.sleep(1)
                    continue
                else:
                    break   
        else:
            self.browser = webdriver.Firefox()

    def get(self, targetWebAddress):
        self.browser.get(targetWebAddress)

    def close(self):
        self.browser.quit()
        if self.WebBrowserType == WebBrowserType.TOR:
            self.tor_process.kill()

    def __exit__(self, exc_type, exc_value, traceback):
        self.browser.quit()
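
# Hypothetical usage of the scraper above over Tor, assuming the WebBrowserType
# enum and the tbb_* path globals referenced in __init__ are defined.
scraper = InstagramScraper(WebBrowserType.TOR)
scraper.get("https://check.torproject.org")
scraper.close()  # quits the browser and kills the Tor process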