# Assumed imports for this helper: os for working-directory handling,
# tbselenium for the driver itself.
from os import getcwd, chdir
from tbselenium.tbdriver import TorBrowserDriver


def getTorDriver(tor_installation_path: str, driver_path: str):
    # Store the original working directory (Tor changes the path when executing).
    originalPath = getcwd()
    driver = TorBrowserDriver(tor_installation_path, executable_path=driver_path)
    driver.get("https://www.facebookcorewwwi.onion/")
    chdir(originalPath)
    return driver
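# A minimal usage sketch for the helper above, assuming tbselenium and a
# geckodriver binary are installed; both paths are hypothetical placeholders.
TBB_PATH = "/opt/tor-browser_en-US"
GECKODRIVER_PATH = "/usr/local/bin/geckodriver"

driver = getTorDriver(TBB_PATH, GECKODRIVER_PATH)
try:
    print(driver.title)
finally:
    driver.quit()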
def capture(website, epoch):
    if 'tor' in sys.argv:
        browser = TorBrowserDriver(TBB_dir, socks_port=socks_port,
                                   control_port=control_port)
    else:
        # firfox_proxy is an external helper (name kept as defined elsewhere).
        profile = firfox_proxy(webdriver)
        browser = webdriver.Firefox(firefox_profile=profile,
                                    firefox_binary=firefox_dir)
    browser.delete_all_cookies()
    browser.get('http://' + website)
def test_close_all_streams(self):
    streams_open = False
    new_tb_drv = TorBrowserDriver(cm.TBB_PATH)
    new_tb_drv.get('http://www.google.com')
    time.sleep(30)
    self.tor_controller.close_all_streams()
    for stream in self.tor_controller.controller.get_streams():
        print(stream.id, stream.purpose, stream.target_address, "open!")
        streams_open = True
    new_tb_drv.quit()
    self.assertFalse(streams_open, 'Could not close all streams.')
def test_close_all_streams(self):
    streams_open = False
    new_tb_drv = TorBrowserDriver(cm.TBB_DIR, tbb_logfile_path='test.log')
    new_tb_drv.get('http://www.google.com')
    time.sleep(30)
    self.tor_controller.close_all_streams()
    for stream in self.tor_controller.controller.get_streams():
        print(stream.id, stream.purpose, stream.target_address, "open!")
        streams_open = True
    new_tb_drv.quit()
    self.assertFalse(streams_open, 'Could not close all streams.')
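# Both test variants above read their Tor Browser location from a shared cm
# constants module (cm.TBB_PATH / cm.TBB_DIR). A minimal sketch of what such a
# module might contain; the default path is a hypothetical placeholder.
# cm.py
import os

TBB_DIR = os.environ.get("TBB_PATH", "/opt/tor-browser_en-US")
TBB_PATH = TBB_DIR  # older snippets refer to the same location as TBB_PATH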
class TruliaHelper():

    def __init__(self):
        self.url = 'https://www.trulia.com'
        # Need to set the Tor Browser path here.
        tbpath = "/home/XX/XXXX/tor-browser-linux64-8.0.8_en-US/tor-browser_en-US"
        self.driver = TorBrowserDriver(tbb_path=tbpath, tbb_logfile_path='test.log')
        # self.driver = webdriver.Firefox(firefox_profile=profile, firefox_binary=binary)
        # self.driver = webdriver.Chrome(executable_path='../utility/chromedriver.exe', chrome_options=chrome_options)

    # Method to get items for every address listed in the spreadsheet.
    def getItems(self):
        df = pd.read_excel("/home/XXXXX/XXXXX/XXXXXX.xlsx")
        a = df['Site Address']
        b = df['Site City']
        c = df['Site State']
        d = df['Site Zip']
        items = []
        # keywords = ['512 W 10th St Perris CA 92570', 'New York, NY', 'San Francisco, CA', 'Washington, CA']
        for keyword in (pd.concat([a, b, c, d], axis=1)).values.tolist():
            self.driver.get(self.url)
            search_box = self.driver.find_element_by_id("homepageSearchBoxTextInput")
            search_box.clear()
            search_box.send_keys(str(keyword))
            search_btn = self.driver.find_element_by_xpath(
                "//button[@data-auto-test-id='searchButton']")
            if search_btn:
                search_btn.click()
                time.sleep(10)
                items.append(self.getItemDetail())
        self.driver.close()
        return items

    def getItemDetail(self):
        data = {}
        try:
            soup = BeautifulSoup(self.driver.page_source, u'html.parser')
            # image = soup.find("div", attrs={"class": "Tiles__TileBackground-fk0fs3-0 cSObNX"}).find("img")["src"]
            price = soup.find("div", attrs={
                "class": "Text__TextBase-sc-1cait9d-0-div Text__TextContainerBase-sc-1cait9d-1 hlvKRM"
            }).text
            # container = soup.find("div", attrs={"class": "resultsColumn"}).find("ul")
            # items = container.findAll("li", recursive=False)
            print(price)
        except Exception:
            pass
        return data

    # Method to start the process.
    def start(self):
        items = self.getItems()
        print("Items : ", items)
class TorBrowserThread(threading.Thread):

    def __init__(self, name, url):
        threading.Thread.__init__(self)
        self.name = name
        self.url = url

    def terminate(self):
        self.browser.quit()

    def run(self):
        self.browser = TorBrowserDriver("/home/dev/tor-browser")
        # Note: preferences set after the driver has launched do not take
        # effect; tbselenium expects them in the pref_dict constructor
        # argument instead (see the sketch below).
        self.browser.profile.set_preference("browser.cache.disk.enable", False)
        self.browser.profile.set_preference("browser.cache.memory.enable", False)
        self.browser.profile.set_preference("browser.cache.offline.enable", False)
        self.browser.profile.set_preference("network.http.use-cache", False)
        self.browser.profile.set_preference("network.cookie.cookieBehavior", 2)
        self.browser.get(self.url)
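# As the comment in run() notes, preferences must reach tbselenium before the
# browser launches. A minimal sketch of the same cache-disabling preferences
# passed through the pref_dict constructor argument; the bundle path is a
# placeholder.
from tbselenium.tbdriver import TorBrowserDriver

CACHE_OFF_PREFS = {
    "browser.cache.disk.enable": False,
    "browser.cache.memory.enable": False,
    "browser.cache.offline.enable": False,
    "network.http.use-cache": False,
    "network.cookie.cookieBehavior": 2,
}

browser = TorBrowserDriver("/home/dev/tor-browser", pref_dict=CACHE_OFF_PREFS)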
class RunDriverWithControllerTest(unittest.TestCase):
    """This test shows how to run tor with TorController
    and browse with TorBrowserDriver."""

    @unittest.skip("Only for didactic purposes.")
    def test_run_driver_with_controller(self):
        # run controller on a custom port
        custom_socks_port = 6666
        self.tor_controller = TorController(
            cm.TBB_PATH, torrc_dict={'SocksPort': str(custom_socks_port)})
        self.tor_process = self.tor_controller.launch_tor_service()
        # set up the driver and get a page
        self.tor_driver = TorBrowserDriver(cm.TBB_PATH,
                                           socks_port=custom_socks_port)
        self.tor_driver.get("http://google.com")
        # shutdown
        self.tor_driver.quit()
        self.tor_controller.kill_tor_proc()
class RunDriverWithControllerTest(unittest.TestCase):
    """This test shows how to run tor with TorController
    and browse with TorBrowserDriver."""

    @unittest.skip("Only for didactic purposes.")
    def test_run_driver_with_controller(self):
        # run controller on a custom port
        custom_socks_port = 6666
        self.tor_controller = TorController(
            cm.TBB_DIR, torrc_dict={'SocksPort': str(custom_socks_port)})
        self.tor_process = self.tor_controller.launch_tor_service()
        # set up the driver and get a page
        self.tor_driver = TorBrowserDriver(cm.TBB_DIR,
                                           socks_port=custom_socks_port)
        self.tor_driver.get("http://google.com")
        # shutdown
        self.tor_driver.quit()
        self.tor_controller.quit()
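# Newer tbselenium versions also ship a stem-based launcher that replaces the
# TorController pattern shown in the two tests above. A minimal sketch,
# assuming tbselenium and stem are installed; TBB_DIR is a placeholder path.
from tbselenium.tbdriver import TorBrowserDriver
from tbselenium.utils import launch_tbb_tor_with_stem
import tbselenium.common as cm

TBB_DIR = "/opt/tor-browser_en-US"

tor_process = launch_tbb_tor_with_stem(tbb_path=TBB_DIR)
with TorBrowserDriver(TBB_DIR, tor_cfg=cm.USE_STEM) as driver:
    driver.get("https://check.torproject.org")
tor_process.kill()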
class DescargarPdf:

    def __init__(self):
        self.tbb_dir = "/usr/local/share/tor-browser_en-US"
        self.usuario = []
        self.contraseñaTxT = []
        self.conversor = '?convertedTo=pdf'

    def iniciarTor(self):
        self.zLibraty = TorBrowserDriver(self.tbb_dir, tbb_logfile_path='test.log')

    def iniciarSecion(self):
        self.zLibraty.refresh()
        sleep(10)
        self.element = self.zLibraty.find_element_by_name("email")
        self.element.send_keys(self.correo)
        sleep(2)
        self.element2 = self.zLibraty.find_elements_by_class_name("form-control")[1]
        self.element2.send_keys(self.contraseña)
        self.element2.send_keys(Keys.RETURN)

    def paginaDescargas(self):
        self.zLibraty.load_url(self.url)
        self.html = self.zLibraty.page_source

    def paginaPrinsipal(self, añoInicial, añoFinal):
        self.urlAños = ('http://zlibraryexau2g3p.onion/s/?yearFrom=' +
                        str(añoInicial) + '&yearTo=' + str(añoFinal))
        self.url = self.urlAños

    def cambiarPagina(self, x):
        self.url += '&page=' + str(x)

    def Crearcsv(self):
        print("hello")
        self.carpetaUrl = '/home/dgc7/Documentos/zlibrary/libros1920-1921/url'
        try:
            os.mkdir(self.carpetaUrl)
        except OSError as e:
            if e.errno != errno.EEXIST:
                raise
        self.escrivirUrlWed = csv.writer(open(
            '/home/dgc7/Documentos/zlibrary/libros1920-1921/url/url2.csv', 'w'))
        self.imprimirUrlPdf = csv.writer(open(
            '/home/dgc7/Documentos/zlibrary/libros1920-1921/url/urlDowload2.csv', 'w'))

    def credenciales(self, numeroUsuario):
        self.correo = self.usuario[numeroUsuario]
        self.contraseña = self.contraseñaTxT[numeroUsuario]
        self.urlLoguin = 'http://zlibraryexau2g3p.onion'
        self.zLibraty.load_url(self.urlLoguin)

    def UsuariosYcontraseñas(self):
        # Alternate lines of the credentials file hold usernames and passwords.
        self.dir = '/home/dgc7/ejersiciosLibros/pyaton/ejemplos/scrapin/zlibrari/descargarLIbros/descargarparte1/contraseñasYcorreos.txt'
        self.data = open(self.dir, 'r+')
        for self.i in range(0, 200):
            if self.i % 2 == 0:
                self.usuario.append(self.data.readline())
            if self.i % 2 != 0:
                self.contraseñaTxT.append(self.data.readline())

    def urlPdf(self, contador, _contadorusuarios):
        self.boleanoPdf = 0
        self.contadorUsuariosCon = _contadorusuarios
        self.contadorLibros2 = 0
        self.contadorLibros = 0
        self.Crearcsv()
        self.soup = BeautifulSoup(self.html, 'html.parser')
        for self.urlwed in self.soup.find_all(itemprop="name"):
            self.contador = 0
            self.urlwed = self.urlwed.find('a', href=re.compile(''))
            self.urlDowload = self.urlwed.get('href')
            self.urlpdfGeleneralH = re.sub('/book/', 'https://b-ok.cc/book/',
                                           self.urlDowload)
            self.urlDowload = re.sub('/book/',
                                     'http://zlibraryexau2g3p.onion/book/',
                                     self.urlDowload)
            self.escrivirUrlWed.writerow([self.urlDowload])
            print(self.urlDowload)
            self.contadorLibros += 1
            self.contadorLibros2 += 1
            # Rotate the Tor instance and the account every 10 downloads.
            if self.contadorLibros2 == 10:
                self.contador += 1
                self.serrarTor()
                sleep(4)
                self.iniciarTor()
                self.contadorUsuariosCon += 1
                print(self.contadorUsuariosCon)
                self.credenciales(self.contadorUsuariosCon)
                self.iniciarSecion()
                sleep(7)
                self.contadorLibros2 = 0
                sleep(15)
            if self.contador == 5:
                self.contador = 0
            voleano = validarFormato(self.urlpdfGeleneralH)
            for self.urlRedirec in range(0, 1):
                self.zLibraty.load_url(self.urlDowload)
                sleep(5)
                self.htmlPdf = self.zLibraty.page_source
                self.soupRedirec = BeautifulSoup(self.htmlPdf, 'html.parser')
                self.urlDowloadPDF = self.soupRedirec.find(
                    class_="btn btn-primary dlButton addDownloadedBook")
                self.urlDowloadPDF = self.urlDowloadPDF.get('href')
                self.urlDowloadPDF = re.sub('/dl/',
                                            'http://zlibraryexau2g3p.onion/dl/',
                                            self.urlDowloadPDF)
                self.imprimirUrlPdf.writerow([self.urlDowloadPDF])
                print(self.urlDowloadPDF)
                if voleano == True:
                    self.zLibraty.get(self.urlDowloadPDF)
                    voleano = False
                else:
                    self.convertirpdf = str(self.urlDowloadPDF) + str(self.conversor)
                    self.zLibraty.get(self.convertirpdf)
            sleep(20)
            tiempoDescarga()
            informaiconPDf(self.urlpdfGeleneralH)

    def DescargarContenido(self, _html):
        self.contenido = _html

    def serrarTor(self):
        self.zLibraty.close()
def tor_web_crawler(index, link, ip_address):
    """
    Web crawler that collects traffic traces and saves them to pcap files.

    :param index: current trace of the link
    :param link: webpage address from which traffic is to be collected
    :param ip_address: IP address of the machine from which traffic is collected
    """
    # Extract the domain name so each trace is saved separately.
    url = link
    lnk = tldextract.extract(url)
    domain_name = lnk.domain + '.' + lnk.suffix

    # interface = 'enp0s31f6'
    # interface = 'any'
    interface = 'eth0'

    cap = DesiredCapabilities().FIREFOX
    cap["marionette"] = True  # optional

    # Paths for the pcap files and the screenshots.
    PP = PACP_PATH + '/' + domain_name
    SS = SCREEN_SHOT + '/' + domain_name

    try:
        driver = TorBrowserDriver(TBB_PATH)
        driver.get(url)
    except wde as e:  # selenium WebDriverException
        print('Browser crashed:')
        print(e)
        print('Trying again in 10 seconds ...')
        time.sleep(10)
        driver = TorBrowserDriver(TBB_PATH)
        print('Success!\n')
    except Exception as e:
        raise Exception(e)

    if not os.path.isdir(PP):
        print('Creating directory for saving capture files (pcap) ...')
        os.makedirs(PP)
    if not os.path.isdir(SS):
        print('Creating directory for saving screenshots ...')
        os.makedirs(SS)

    # Command to be executed for capturing the trace (60-second capture).
    command = ("sudo timeout 60 tcpdump -i " + str(interface) +
               " -n host " + str(ip_address) +
               " -w " + PP + "/" + domain_name + "_" + str(index) + ".pcap")
    print('Capture trace ...')
    capture = subprocess.Popen(command, shell=True)
    capture.wait()
    print('Traffic trace captured and saved successfully.')

    # Save the screenshot.
    driver.save_screenshot(SS + '/' + domain_name + '-' + str(index) + '.png')
    print('Screen shot of the webpage saved successfully.')
    driver.quit()
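# A sketch of the capture step with tcpdump's arguments passed as a list
# rather than through the shell, which avoids quoting problems with unusual
# domain names; capture_trace is a hypothetical helper, not part of the
# crawler above.
import subprocess


def capture_trace(interface, ip_address, pcap_path, seconds=60):
    # Each argument is passed to tcpdump verbatim; no shell interpolation.
    cmd = ["sudo", "timeout", str(seconds), "tcpdump",
           "-i", interface, "-n", "host", ip_address, "-w", pcap_path]
    subprocess.run(cmd, check=True)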
def makeRequest(url, domain):
    """Make an HTTP request to the url given as argument, after changing IP."""
    import time

    # Open the log file.
    f = open(logfile_name, 'a')
    print('Changing IP...\n')

    # Below is method A, using the requests library without opening a real Tor
    # browser. Method B is used instead: it opens a real browser, so that JS
    # code is executed and Google Analytics tracks us as a real user.
    """
    # Resetting IP. This restarts the tor service, resulting in an IP address
    # change. After the '-p' flag insert the user password.
    tr.reset_identity()
    #os.system('sudo systemctl restart tor -p 0000')

    # Create an empty session object and add the Tor SOCKS proxies to it.
    session = requests.session()
    session.proxies = {}
    session.proxies['http'] = 'socks5h://localhost:9050'
    session.proxies['https'] = 'socks5h://localhost:9050'

    # Change the request headers.
    headers = {}
    headers['User-agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'
    print('Request headers were set.\n')

    new_ip = session.get('http://ipecho.net/plain').text
    # Execute the request and record the response status code.
    status_code = session.get(url).status_code
    """

    # Method B, using the complete Tor Browser.
    driver = TorBrowserDriver("/home/manos/Desktop/tor-browser_en-US")
    # driver.get('https://ipecho.net/plain')
    # new_ip = driver.find_element_by_tag_name('body').text
    checkConn()
    driver.get(url)
    time.sleep(2.0)
    driver.close()

    # Request logging.
    timestamp = ('Date: ' + str(datetime.datetime.now())[0:10] +
                 '\nTime: ' + str(datetime.datetime.now())[11:19])
    f.write(timestamp + '\nDomain: ' + domain + '\n'
            'Request sent to ' + url + '.' +
            '\nResponse status code: ' + str(200) +
            '\n*******************************************************************************************\n\n')
    f.close()
    os.system('clear')
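# A small sketch of the exit-IP check that the commented-out lines above
# gesture at, reading the address through the already-open driver; ipecho.net
# is the service the original snippet referenced.
def current_exit_ip(driver):
    driver.get('https://ipecho.net/plain')
    return driver.find_element_by_tag_name('body').text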
class DescargarPdf:

    def __init__(self):
        self.tbb_dir = "/usr/local/share/tor-browser_en-US"
        self.usuario = []
        self.contraseñaTxT = []
        self.conversor = '?convertedTo=pdf'

    def iniciarTor(self):
        self.zLibraty = TorBrowserDriver(self.tbb_dir, tbb_logfile_path='test.log')

    def iniciarSecion(self):
        self.element = self.zLibraty.find_element_by_name("email")
        self.element.send_keys(self.correo)
        sleep(2)
        self.element2 = self.zLibraty.find_elements_by_class_name("form-control")[1]
        self.element2.send_keys(self.contraseña)
        self.element2.send_keys(Keys.RETURN)

    def paginaDescargas(self):
        print("inside the paginaDescargas function")
        sleep(4)
        self.zLibraty.get(self.url)
        self.html = self.zLibraty.page_source

    def paginaPrinsipal(self, añoInicial, añoFinal):
        self.urlAños = ('http://zlibraryexau2g3p.onion/s/?yearFrom=' +
                        str(añoInicial) + '&yearTo=' + str(añoFinal))
        self.url = self.urlAños

    def cambiarPagina(self, x):
        self.url += '&page=' + str(x)

    def Crearcsv(self):
        self.carpetaUrl = '/home/dgc7/Documentos/zlibrary/libros1920-1921/url'
        try:
            os.mkdir(self.carpetaUrl)
        except OSError as e:
            if e.errno != errno.EEXIST:
                raise
        self.escrivirUrlWed = csv.writer(open(
            '/home/dgc7/Documentos/zlibrary/libros1920-1921/url/url2.csv', 'w'))
        self.imprimirUrlPdf = csv.writer(open(
            '/home/dgc7/Documentos/zlibrary/libros1920-1921/url/urlDowload2.csv', 'w'))

    def credenciales(self, numeroUsuario):
        print("got here")
        self.correo = self.usuario[numeroUsuario]
        self.contraseña = self.contraseñaTxT[numeroUsuario]
        self.urlLoguin = 'http://zlibraryexau2g3p.onion'
        self.zLibraty.get(self.urlLoguin)

    def UsuariosYcontraseñas(self):
        # Alternate lines of the credentials file hold usernames and passwords.
        self.dir = '/home/dgc7/Documentos/zlibrary/credenciales/contraseñasYcorreos.txt'
        self.data = open(self.dir, 'r+')
        for self.i in range(0, 200):
            if self.i % 2 == 0:
                self.usuario.append(self.data.readline())
            if self.i % 2 != 0:
                self.contraseñaTxT.append(self.data.readline())

    def urlPdf(self):
        self.boleanoPdf = 0
        self.respaldoContador = 0
        self.contadorUsuarios = usuarioUsadosLeer()
        self.contadorLibros = datosDescarga(4)
        self.contadorLibros2 = self.contadorLibros % 10
        self.Crearcsv()
        self.soup = BeautifulSoup(self.html, 'html.parser')
        try:
            for self.urlwed in self.soup.find_all(itemprop="name"):
                self.contador = 0
                self.urlwed = self.urlwed.find('a', href=re.compile(''))
                self.urlDowload = self.urlwed.get('href')
                self.urlpdfGeleneralH = re.sub('/book/', 'https://b-ok.cc/book/',
                                               self.urlDowload)
                self.urlDowload = re.sub('/book/',
                                         'http://zlibraryexau2g3p.onion/book/',
                                         self.urlDowload)
                self.escrivirUrlWed.writerow([self.urlDowload])
                print(self.urlDowload)
                voleano = validarFormato(self.urlpdfGeleneralH)
                guardarNumeroDescargas(self.contadorLibros)
                print(self.respaldoContador)
                if self.contadorLibros == self.respaldoContador:
                    for self.urlRedirec in range(0, 1):
                        self.zLibraty.get(self.urlDowload)
                        sleep(5)
                        self.htmlPdf = self.zLibraty.page_source
                        self.soupRedirec = BeautifulSoup(self.htmlPdf, 'html.parser')
                        self.urlDowloadPDF = self.soupRedirec.find(
                            class_="btn btn-primary dlButton addDownloadedBook")
                        self.urlDowloadPDF = self.urlDowloadPDF.get('href')
                        self.urlDowloadPDF = re.sub(
                            '/dl/', 'http://zlibraryexau2g3p.onion/dl/',
                            self.urlDowloadPDF)
                        self.imprimirUrlPdf.writerow([self.urlDowloadPDF])
                        print(self.urlDowloadPDF)
                        print("on to the if")
                        sleep(10)
                        if voleano == True:
                            self.zLibraty.set_page_load_timeout(8)
                            try:
                                self.zLibraty.get(self.urlDowloadPDF)
                            except:
                                self.zLibraty.set_page_load_timeout(70)
                                self.zLibraty.refresh()
                            print("PDF works")
                            voleano = False
                            sleep(5)
                            self.contadorLibros += 1
                            self.contadorLibros2 += 1
                        else:
                            try:
                                self.zLibraty.set_page_load_timeout(5)
                                try:
                                    self.zLibraty.get(self.urlDowloadPDF)
                                except:
                                    sleep(4)
                                    pyautogui.press("down")
                                    sleep(2)
                                    pyautogui.press("enter")
                                    self.zLibraty.set_page_load_timeout(70)
                            except:
                                print("\nerror controlling the keyboard to press enter\n")
                                raise
                            sleep(5)
                            self.zLibraty.refresh()
                            self.contadorLibros += 1
                            self.contadorLibros2 += 1
                        sleep(20)
                        tiempoDescarga()
                        informaiconPdf(self.urlpdfGeleneralH)
                    self.respaldoContador += 1
                if self.contadorLibros == self.respaldoContador:
                    if self.contadorLibros2 % 10 == 0:
                        print((self.contadorLibros2 - 1) % 10)
                        self.contador += 1
                        # New Tor identity via the browser shortcut.
                        pyautogui.hotkey("ctrl", "shift", "u")
                        sleep(2)
                        pyautogui.press("enter")
                        sleep(7)
                        pyautogui.press("enter")
                        sleep(15)
                        self.contadorUsuarios += 1
                        print(self.contadorUsuarios)
                        try:
                            self.zLibraty.switch_to_window(
                                self.zLibraty.window_handles[0])
                        except:
                            print("error switching windows")
                        usuarioUsadosReescrivir(self.contadorUsuarios)
                        print("this way??????")
                        self.credenciales(self.contadorUsuarios)
                        print("not this way??????")
                        sleep(23)
                        self.iniciarSecion()
                        sleep(7)
                        self.contadorLibros2 = 0
                        sleep(15)
                    print("number of books per user ", self.contadorLibros2)
                    if self.contador == 5:
                        self.contador = 0
        except OSError as e:
            print(e.strerror)
            print("error in urlPdf:::::")
            guardarNumeroDescargas(self.contadorLibros)
            usuarioUsadosReescrivir(self.contadorUsuarios)
            print(self.contadorLibros)
            raise
        print("finished the page")

    def DescargarContenido(self, _html):
        self.contenido = _html

    def serrarTor(self):
        self.zLibraty.close()
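# The identity rotation above drives Tor Browser's "New Identity" shortcut
# with pyautogui keystrokes. A minimal alternative sketch using stem to
# request a fresh circuit over the control port; the port number is an
# assumption (9151 is Tor Browser's usual control port), not something the
# snippet above defines.
from stem import Signal
from stem.control import Controller


def new_tor_identity(control_port=9151):
    # Ask the running tor process for a new circuit; the browser stays open.
    with Controller.from_port(port=control_port) as controller:
        controller.authenticate()
        controller.signal(Signal.NEWNYM)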
class Visit(object):
    """Hold info about a particular visit to a page."""

    def __init__(self, batch_num, site_num, instance_num, page_url, base_dir,
                 tor_controller, bg_site=None,
                 experiment=cm.EXP_TYPE_WANG_AND_GOLDBERG, xvfb=False,
                 capture_screen=True):
        self.batch_num = batch_num
        self.site_num = site_num
        self.instance_num = instance_num
        self.page_url = page_url
        self.bg_site = bg_site
        self.experiment = experiment
        self.base_dir = base_dir
        self.visit_dir = None
        self.visit_log_dir = None
        self.tbb_version = cm.RECOMMENDED_TBB_VERSION
        self.capture_screen = capture_screen
        self.tor_controller = tor_controller
        self.xvfb = xvfb
        self.init_visit_dir()
        self.pcap_path = os.path.join(
            self.visit_dir, "{}.pcap".format(self.get_instance_name()))

        if self.xvfb and not cm.running_in_CI:
            wl_log.info("Starting Xvfb %sx%s" % (cm.XVFB_W, cm.XVFB_H))
            self.vdisplay = Xvfb(width=cm.XVFB_W, height=cm.XVFB_H)
            self.vdisplay.start()

        # Create a new instance of the Tor Browser driver.
        TorBrowserDriver.add_exception(self.page_url)
        self.tb_driver = TorBrowserDriver(
            tbb_path=cm.TBB_PATH,
            tbb_logfile_path=join(self.visit_dir, "logs", "firefox.log"))
        self.sniffer = Sniffer()  # sniffer to capture the network traffic

    def init_visit_dir(self):
        """Create results and logs directories for this visit."""
        visit_name = str(self.instance_num)
        self.visit_dir = os.path.join(self.base_dir, visit_name)
        ut.create_dir(self.visit_dir)
        self.visit_log_dir = os.path.join(self.visit_dir, 'logs')
        ut.create_dir(self.visit_log_dir)

    def get_instance_name(self):
        """Construct and return a filename for the instance."""
        inst_file_name = '{}_{}_{}'.format(
            self.batch_num, self.site_num, self.instance_num)
        return inst_file_name

    def filter_guards_from_pcap(self):
        """Keep only packets that go to or come from the Tor guard IPs."""
        guard_ips = set([ip for ip in self.tor_controller.get_all_guard_ips()])
        wl_log.debug("Found %s guards in the consensus.", len(guard_ips))
        orig_pcap = self.pcap_path + ".original"
        copyfile(self.pcap_path, orig_pcap)
        try:
            preader = PcapReader(orig_pcap)
            pcap_filtered = []
            for p in preader:
                if IP not in p:
                    pcap_filtered.append(p)
                    continue
                ip = p.payload
                if ip.dst in guard_ips or ip.src in guard_ips:
                    pcap_filtered.append(p)
            wrpcap(self.pcap_path, pcap_filtered)
        except Exception as e:
            wl_log.error("ERROR: filtering pcap file: %s. Check old pcap: %s",
                         e, orig_pcap)
        else:
            os.remove(orig_pcap)

    def post_crawl(self):
        # TODO: add some sanity checks?
        pass
    def cleanup_visit(self):
        """Kill sniffer and Tor browser if they're running."""
        wl_log.info("Cleaning up visit.")
        wl_log.info("Cancelling timeout")
        ut.cancel_timeout()
        if self.sniffer and self.sniffer.is_recording:
            wl_log.info("Stopping sniffer...")
            self.sniffer.stop_capture()
            # remove non-tor traffic
            self.filter_guards_from_pcap()
        if self.tb_driver and self.tb_driver.is_running:
            # shutil.rmtree(self.tb_driver.prof_dir_path)
            wl_log.info("Quitting selenium driver...")
            self.tb_driver.quit()
        # close all open streams to prevent pollution
        self.tor_controller.close_all_streams()
        if self.xvfb and not cm.running_in_CI:
            wl_log.info("Stopping display...")
            self.vdisplay.stop()
        # after closing the driver and stopping the sniffer, run post_crawl
        self.post_crawl()

    def take_screenshot(self):
        try:
            out_png = os.path.join(self.visit_dir, 'screenshot.png')
            wl_log.info("Taking screenshot of %s to %s" % (self.page_url, out_png))
            self.tb_driver.get_screenshot_as_file(out_png)
            if cm.running_in_CI:
                wl_log.debug("Screenshot data:image/png;base64,%s" %
                             self.tb_driver.get_screenshot_as_base64())
        except:
            wl_log.info("Exception while taking screenshot of: %s" % self.page_url)

    def get_wang_and_goldberg(self):
        """Visit the site according to Wang and Goldberg (WPES'13) settings."""
        ut.timeout(cm.HARD_VISIT_TIMEOUT)  # set timeout to stop the visit
        self.sniffer.start_capture(
            self.pcap_path,
            'tcp and not host %s and not tcp port 22 and not tcp port 20' %
            LOCALHOST_IP)
        time.sleep(cm.PAUSE_BETWEEN_INSTANCES)
        try:
            self.tb_driver.set_page_load_timeout(cm.SOFT_VISIT_TIMEOUT)
        except:
            wl_log.info("Exception setting a timeout {}".format(self.page_url))

        wl_log.info("Crawling URL: {}".format(self.page_url))
        t1 = time.time()
        self.tb_driver.get(self.page_url)
        page_load_time = time.time() - t1
        wl_log.info("{} loaded in {} sec".format(self.page_url, page_load_time))
        time.sleep(cm.WAIT_IN_SITE)
        if self.capture_screen:
            self.take_screenshot()
        self.cleanup_visit()

    def get_multitab(self):
        """Open two tabs, use one to load a background site and the other
        to load the real site."""
        PAUSE_BETWEEN_TAB_OPENINGS = 0.5
        ut.timeout(cm.HARD_VISIT_TIMEOUT)  # set timeout to kill running procs
        # load a blank page - a page is needed to send keys to the browser
        self.tb_driver.get(BAREBONE_HOME_PAGE)
        self.sniffer.start_capture(
            self.pcap_path,
            'tcp and not host %s and not tcp port 22 and not tcp port 20' %
            LOCALHOST_IP)
        time.sleep(cm.PAUSE_BETWEEN_INSTANCES)
        try:
            self.tb_driver.set_page_load_timeout(cm.SOFT_VISIT_TIMEOUT)
        except:
            wl_log.info("Exception setting a timeout {}".format(self.page_url))

        wl_log.info("Crawling URL: {} with {} in the background".
                    format(self.page_url, self.bg_site))
        body = self.tb_driver.find_element_by_tag_name("body")
        body.send_keys(Keys.CONTROL + 't')  # open a new tab
        # now that the focus is on the address bar, load the background site
        # by "typing" it into the address bar and "pressing" ENTER (\n),
        # simulated by the send_keys function
        body.send_keys('%s\n' % self.bg_site)
        # the delay between the loading of background and real sites
        time.sleep(PAUSE_BETWEEN_TAB_OPENINGS)
        body = self.tb_driver.find_element_by_tag_name("body")
        body.send_keys(Keys.CONTROL + 't')  # open a new tab
        t1 = time.time()
        self.tb_driver.get(self.page_url)  # load the real site in the 2nd tab
        page_load_time = time.time() - t1
        wl_log.info("{} loaded in {} sec".format(self.page_url, page_load_time))
        time.sleep(cm.WAIT_IN_SITE)
        if self.capture_screen:
            self.take_screenshot()
        self.cleanup_visit()

    def get(self):
        """Call the specific visit function depending on the experiment."""
        if self.experiment == cm.EXP_TYPE_WANG_AND_GOLDBERG:
            self.get_wang_and_goldberg()
        elif self.experiment == cm.EXP_TYPE_MULTITAB_ALEXA:
            self.get_multitab()
        else:
            raise ValueError("Cannot determine experiment type")
class Browser:

    def __init__(self, config, browser, pet, env_type, proxy_setting):

        def setup_socks5_proxy(browser, profile, proxy_setting):
            """Given valid proxy settings, configure a socks5 proxy
            properly on chrome (brave) or firefox."""
            if proxy_setting is not None:
                address = proxy_setting["address"]
                port = proxy_setting["port"]
                bypass_list = proxy_setting["bypass-list"]
                if browser == "chrome":
                    # https://sordidfellow.wordpress.com/2015/05/21/ssh-tunnel-for-chrome/
                    profile.add_argument("--proxy-server=socks5://%s:%s" % (address, port))
                    profile.add_argument("--proxy-bypass-list=%s" % bypass_list)
                    print("socks5 proxy configured on chrome")
                elif browser == "firefox":
                    # https://developer.mozilla.org/en-US/docs/Mozilla/Preferences/Mozilla_networking_preferences
                    profile.set_preference("network.proxy.type", 1)
                    profile.set_preference("network.proxy.socks", address)
                    profile.set_preference("network.proxy.socks_port", port)
                    profile.set_preference("network.proxy.socks_version", 5)
                    profile.set_preference("network.proxy.socks_remote_dns", "true")
                    profile.set_preference("network.proxy.no_proxies_on", bypass_list)
                    print("socks5 proxy configured on firefox")

        # If the program is run in a virtual machine, xvfbwrapper has to be
        # installed first.
        self.env_type = env_type
        if env_type == "vm":
            print("xvfb")
            from xvfbwrapper import Xvfb
            width, height, depth = get_display_parameters(config)
            self.vdisplay = Xvfb(width=width, height=height, colordepth=depth)
            self.vdisplay.start()

        print("Browser:", browser, "PET:", pet)
        pet_config = PetConfig()

        if pet == "brave":
            print("brave")
            chrome_options = ChromeOptions()
            bPath, dPath = pet_config.getPetBrowserDriverPath(pet, browser, env_type)
            print(bPath, dPath)
            chromedriver = dPath
            chrome_options.binary_location = bPath
            setup_socks5_proxy("chrome", chrome_options, proxy_setting)
            os.environ["webdriver.chrome.driver"] = chromedriver
            if env_type == "vm":
                chrome_options.add_argument("--no-sandbox")
            self.driver = webdriver.Chrome(executable_path=chromedriver,
                                           chrome_options=chrome_options)
            press_enter(1)
            return
        elif pet == "tor":
            plt = platform.system().lower()
            if plt == "darwin" or plt == "windows":
                # https://stackoverflow.com/questions/15316304/open-tor-browser-with-selenium
                print("native tor")
                bPath, dPath = pet_config.getPetBrowserDriverPath(pet, browser, env_type)
                print(bPath, dPath)
                profile = FirefoxProfile()
                profile.set_preference("network.proxy.type", 0)
                binary = FirefoxBinary(bPath)
                self.driver = webdriver.Firefox(firefox_profile=profile,
                                                firefox_binary=binary,
                                                executable_path=dPath)
            elif plt == "linux":
                # https://medium.com/@manivannan_data/selenium-with-tor-browser-using-python-7b3606b8c55c
                print("vm tor")
                from tbselenium.tbdriver import TorBrowserDriver
                pref_dict = {"network.proxy.no_proxies_on":
                             "http://10.0.2.2/, http://192.168.4.204/"}
                self.driver = TorBrowserDriver(os.environ['TBB_PATH'],
                                               pref_dict=pref_dict)
            return

        aPath, bPath, dPath, pref = pet_config.getPetBrowserDriverPath(pet, browser, env_type)
        if browser == "firefox":
            fp = FirefoxProfile()
            setup_socks5_proxy("firefox", fp, proxy_setting)
            binary = FirefoxBinary(bPath)
            if pref != None:
                fp.set_preference(pref[0], pref[1])
            self.driver = webdriver.Firefox(firefox_profile=fp,
                                            firefox_binary=binary,
                                            executable_path=dPath)
            if aPath:
                self.driver.install_addon(aPath)
        elif browser == "chrome":
            chrome_options = ChromeOptions()
            chrome_options = webdriver.ChromeOptions()  # https://github.com/SeleniumHQ/selenium/issues/5966
            setup_socks5_proxy("chrome", chrome_options, proxy_setting)
            if aPath:
                chrome_options.add_extension(aPath)
            if pref != None:
                chrome_options.add_experimental_option(pref[0], pref[1])
            chrome_options.binary_location = bPath
            os.environ["webdriver.chrome.driver"] = dPath
            time.sleep(1)
            self.driver = webdriver.Chrome(executable_path=dPath,
                                           chrome_options=chrome_options)
            # to escape the alert chrome displays on first visit
            time.sleep(1)
            press_enter(1)
        elif browser == "safari":
            self.driver = webdriver.Safari()
        else:
            print("Unsupported Browser")
            sys.exit(0)

    def quit(self):
        try:
            self.driver.quit()
        except:
            self.driver.close()  # for Tor
        if self.env_type == "vm":
            self.vdisplay.stop()

    def visit_sites(self, site_list, delay=5):
        """Visit all pages in site_list with the given delay."""
        for site in site_list:
            sys.stdout.write(".")
            sys.stdout.flush()
            try:
                self.driver.get(site)
                time.sleep(delay)
            except:
                print("Unexpected error:", sys.exc_info()[0])
"/home/ubuntu/Downloads/tor-browser-linux64-8.5.4_en-US/tor-browser_en-US") insightsLogin(driver, "*****@*****.**", "hotmail143") while (True): try: sql = "SELECT `term` FROM `insights` where completed is null order by rand() limit 1" cursor.execute(sql) result = cursor.fetchall() originalTerm = result[0]['term'] print(originalTerm) term = originalTerm.replace("-", " ") driver.implicitly_wait(10) driver.get( "https://www.udemy.com/instructor/marketplace-insights/?q=" + term + "&lang=en") sleep(random.randint(5, 8)) try: demandEl = driver.find_element_by_xpath( '//div[contains(@class,"panel-body")]/div[contains(@class,"course-label-metrics-opportunity")]/div[1]/div/div[2]' ) print(demandEl.text) except NoSuchElementException: print("trying hyphenated...") term = term.replace(" ", "-") driver.get( "https://www.udemy.com/instructor/marketplace-insights/?q=" + term + "&lang=en") sleep(random.randint(5, 8))
# open the list of urls for testing
with open('markMeasureResults.txt', 'r') as url_file:
    test_urls = url_file.readlines()

driver = TorBrowserDriver(tor_dir)  # , pref_dict=rfp)
driver.set_page_load_timeout(60)
cached = set()

# loop over the shuffled url list indefinitely
while True:
    random.shuffle(test_urls)
    for i, url in enumerate(test_urls):
        try:
            # request url from the list
            print("Fetching " + str(url), end='')
            driver.get(url)
            # pull window.performance.timing after loading the page and add
            # information about the url and the number of the run
            perf_timings = driver.execute_script(
                "return window.performance.timing")
            perf_timings['timestamp'] = datetime.now()
            perf_timings['path'] = tor_dir
            perf_timings['cached'] = str(url in cached)
            perf_timings['url'] = str(url)
            perf_timings['error'] = 'NONE'
            insertDict(sql, perf_timings)
            cached.add(url)
        except Exception as E:
            # handle the exception (usually thrown on page load timeout)
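# The performance.timing dictionary collected above reduces to a single
# page-load figure; a minimal sketch of the arithmetic using the standard
# Navigation Timing fields (both are epoch milliseconds).
def page_load_seconds(perf_timings):
    return (perf_timings['loadEventEnd'] - perf_timings['navigationStart']) / 1000.0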
n = 0
w_ = open('%s/11th/11th_odds' % PATH, 'w')


def parser(element):
    x = driver.find_element_by_id(element).text
    time.sleep(3)
    # x_h = x_el.get_attribute('outerHTML')
    # x_ = x_h.split('">')
    # x = x_[1]
    # x = x[:x.index('<')]
    return x


for line in f:
    n += 1
    driver.get(line[:-1])
    time.sleep(3)
    elem1 = parser('lpRow1')
    elem2 = parser('lpRow2')
    x = elem1 + elem2
    x_ = x.split('\n')
    x0 = x_[0]
    if x0[0] == ' ':
        x0 = x0[1:]  # drop the leading space
    x1 = x_[1]
    x1_ = x1.split(' ')
    sell = x1_[1]
    buy = x1_[2]
class TruliaHelper():

    def __init__(self):
        self.url = 'https://www.trulia.com'
        # need to set Tor Browser path here.
        tbpath = "/home/gc14/Documents/softwares/tor-browser_en-US"
        self.driver = TorBrowserDriver(tbb_path=tbpath, tbb_logfile_path='test.log')
        # self.driver = webdriver.Firefox(firefox_profile=profile, firefox_binary=binary)
        # self.driver = webdriver.Chrome(executable_path='../utility/chromedriver.exe', chrome_options=chrome_options)

    # method to get items from given link.
    def getItems(self):
        items = []
        # keywords = ['512 W 10th St Perris CA 92570', 'New York, NY', 'San Francisco, CA', 'Washington, CA']
        keywords = ['512 W 10th St Perris CA 92570'] * 2
        for keyword in keywords:
            self.driver.get(self.url)
            search_box = self.driver.find_element_by_id("homepageSearchBoxTextInput")
            search_box.clear()
            search_box.send_keys(keyword)
            search_btn = self.driver.find_element_by_xpath(
                "//button[@data-auto-test-id='searchButton']")
            if search_btn:
                print("Going to click")
                search_btn.click()
                time.sleep(10)
                items.append(self.getItemDetail())
        self.driver.close()
        return items

    def getItemDetail(self):
        data = {}
        try:
            soup = BeautifulSoup(self.driver.page_source, u'html.parser')
            image = soup.find("div", attrs={
                "class": "Tiles__TileBackground-fk0fs3-0 cSObNX"
            }).find("img")["src"]
            price = soup.find("div", attrs={
                "class": "Text__TextBase-sc-1cait9d-0-div Text__TextContainerBase-sc-1cait9d-1 hlvKRM"
            }).text
            # container = soup.find("div", attrs={"class": "resultsColumn"}).find("ul")
            # items = container.findAll("li", recursive=False)
            data.update({"image": image, "price": price})
        except:
            pass
        return data

    # method to write csv file
    def writeCSVFile(self, data):
        try:
            with open('/home/gc14/Documents/fiverr/custom_scrapers/home/trulia.csv',
                      mode='w') as csv_file:
                fieldnames = ['Image', 'Price']
                writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
                writer.writeheader()
                for d in data:
                    writer.writerow({'Image': d['image'], 'Price': d['price']})
                csv_file.close()
            print("File written successfully.")
        except:
            print(sys.exc_info())
            pass

    # method to start process.
    def start(self):
        items = self.getItems()
        print("Items : ", len(items))
        if items:
            self.writeCSVFile(items)
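# Both TruliaHelper variants pause with fixed time.sleep(10) calls after
# clicking search. A sketch of an explicit wait that resumes as soon as
# results render, using Selenium's WebDriverWait; the CSS selector is a
# hypothetical placeholder for the real results markup.
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


def wait_for_results(driver, timeout=15):
    # Blocks until a result element is present, or raises TimeoutException.
    return WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "ul li[data-testid]")))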
class DescargarPdf:

    def __init__(self):
        self.contadorCredenciales = 0
        self.tbb_dir = "/usr/local/share/tor-browser_en-US"
        self.usuario = []
        self.contraseñaTxT = []
        self.conversor = '?convertedTo=pdf'

    def iniciarTor(self):
        self.zLibraty = TorBrowserDriver(self.tbb_dir, tbb_logfile_path='test.log')

    def iniciarSecion(self):
        self.element = self.zLibraty.find_element_by_name("email")
        self.element.send_keys(self.correo)
        sleep(2)
        self.element2 = self.zLibraty.find_elements_by_class_name("form-control")[1]
        self.element2.send_keys(self.contraseña)
        self.element2.send_keys(Keys.RETURN)

    def paginaDescargas(self):
        print("inside the paginaDescargas function")
        self.zLibraty.load_url(self.url)
        sleep(4)
        self.html = self.zLibraty.page_source

    def paginaPrinsipal(self, añoInicial, añoFinal):
        self.urlAños = ('http://zlibraryexau2g3p.onion/s/?yearFrom=' +
                        str(añoInicial) + '&yearTo=' + str(añoFinal))
        self.url = self.urlAños

    def cambiarPagina(self, x):
        print("inside cambiarPagina")
        self.url += '&page=' + str(x)
        print(self.url)

    def Crearcsv(self):
        desde = datosDescarga(1)
        asta = datosDescarga(2)
        self.carpetaUrl = ('/home/dd/Documentos/zlibrary/libros' + str(desde) +
                           '-' + str(asta) + '/url')
        try:
            os.mkdir(self.carpetaUrl)
        except OSError as e:
            if e.errno != errno.EEXIST:
                raise
        self.escrivirUrlWed = csv.writer(open(
            '/home/dd/Documentos/zlibrary/libros' + str(desde) + '-' +
            str(asta) + '/url/url2.csv', 'w'))
        self.imprimirUrlPdf = csv.writer(open(
            '/home/dd/Documentos/zlibrary/libros' + str(desde) + '-' +
            str(asta) + '/url/urlDowload2.csv', 'w'))

    def credenciales(self, numeroUsuario):
        print("got here")
        if self.contadorCredenciales == 0 or self.contadorCredenciales == 20:
            self.zLibraty.load_url("https://singlelogin.org/")
            self.zLibraty.find_element_by_name("redirectToHost").click()
            sleep(3)
            pyautogui.press("down")
            sleep(2)
            pyautogui.press("down")
            sleep(1)
            pyautogui.press("enter")
            sleep(5)
        self.correo = self.usuario[numeroUsuario]
        self.contraseña = self.contraseñaTxT[numeroUsuario]

    def UsuariosYcontraseñas(self):
        # Alternate lines of the credentials file hold usernames and passwords.
        self.dir = '/home/dd/Documentos/zlibrary/credenciales/contraseñasYcorreos.txt'
        self.data = open(self.dir, 'r+')
        for self.i in range(0, 200):
            if self.i % 2 == 0:
                self.usuario.append(self.data.readline())
            if self.i % 2 != 0:
                self.contraseñaTxT.append(self.data.readline())

    def urlPdf(self):
        self.contadorCredenciales = 1
        self.boleanoPdf = 0
        self.respaldoContador = 0
        self.contadorUsuarios = usuarioUsadosLeer()
        self.contadorLibros = datosDescarga(4)
        self.contadorLibros2 = self.contadorLibros % 10
        self.Crearcsv()
        self.soup = BeautifulSoup(self.html, 'html.parser')
        try:
            for self.urlwed in self.soup.find_all(itemprop="name"):
                self.contador = 0
                self.urlwed = self.urlwed.find('a', href=re.compile(''))
                self.urlDowload = self.urlwed.get('href')
                self.urlpdfGeleneralH = re.sub('/book/', 'https://b-ok.cc/book/',
                                               self.urlDowload)
                self.urlDowload = re.sub('/book/',
                                         'http://zlibraryexau2g3p.onion/book/',
                                         self.urlDowload)
                self.escrivirUrlWed.writerow([self.urlDowload])
                print(self.urlDowload)
                self.voleano = validarFormato(self.urlpdfGeleneralH)
                guardarNumeroDescargas(self.contadorLibros)
                print(self.respaldoContador)
                if self.contadorLibros == self.respaldoContador:
                    for self.urlRedirec in range(0, 1):
                        self.zLibraty.load_url(self.urlDowload)
                        sleep(5)
                        self.htmlPdf = self.zLibraty.page_source
                        self.soupRedirec = BeautifulSoup(self.htmlPdf, 'html.parser')
                        self.urlDowloadPDF = self.soupRedirec.find(
                            class_="btn btn-primary dlButton addDownloadedBook")
                        self.urlDowloadPDF = self.urlDowloadPDF.get('href')
                        self.urlDowloadPDF = re.sub(
                            '/dl/', 'http://zlibraryexau2g3p.onion/dl/',
                            self.urlDowloadPDF)
                        self.imprimirUrlPdf.writerow([self.urlDowloadPDF])
                        print(self.urlDowloadPDF)
                        print("on to the if")
                        sleep(15)
                        if self.voleano == True:
                            self.zLibraty.set_page_load_timeout(12)
                            try:
                                self.zLibraty.load_url(self.urlDowloadPDF)
                            except:
                                sleep(5)
                                self.zLibraty.set_page_load_timeout(7000)
                            print("PDF works")
                            self.voleano = False
                            sleep(5)
                            self.contadorLibros += 1
                            self.contadorLibros2 += 1
                        else:
                            self.zLibraty.set_page_load_timeout(12)
                            try:
                                self.zLibraty.load_url(self.urlDowloadPDF)
                            except:
                                sleep(8)
                                pyautogui.press("down")
                                sleep(2)
                                pyautogui.press("enter")
                                self.zLibraty.set_page_load_timeout(7000)
                            sleep(5)
                            self.contadorLibros += 1
                            self.contadorLibros2 += 1
                        self.zLibraty.load_url("about:downloads")
                        self.datosEsperaDescarga()
                        self.peticiones()
                        self.zLibraty.back()
                        informaiconPdf(self.urlpdfGeleneralH)
                        guardarNumeroDescargas(self.contadorLibros)
                    self.respaldoContador += 1
                if self.contadorLibros == self.respaldoContador:
                    if self.contadorLibros2 % 10 == 0:
                        print((self.contadorLibros2 - 1) % 10)
                        self.contador += 1
                        if self.contadorLibros == 20:
                            self.contadorCredenciales = 20
                            print("logging out!!!!!!")
                            pyautogui.moveTo(1707, 245)
                            pyautogui.hotkey("ctrl", "shift", "u")
                            sleep(2)
                            pyautogui.press("enter")
                            sleep(7)
                            pyautogui.press("enter")
                            sleep(15)
                        else:
                            print("logging out")
                            self.zLibraty.get("http://zlibraryexau2g3p.onion/logout.php")
                        self.contadorUsuarios += 1
                        print(self.contadorUsuarios)
                        try:
                            self.zLibraty.switch_to_window(
                                self.zLibraty.window_handles[0])
                        except:
                            print("error switching windows")
                        usuarioUsadosReescrivir(self.contadorUsuarios)
                        print("this way??????")
                        self.credenciales(self.contadorUsuarios)
                        self.contadorCredenciales = 1
                        print("not this way??????")
                        sleep(20)
                        self.iniciarSecion()
                        sleep(15)
                        self.paginaDescargas()
                        sleep(7)
                        self.contadorLibros2 = 0
                        sleep(15)
                    print("number of books per user ", self.contadorLibros2)
                    if self.contador == 5:
                        self.contador = 0
        except OSError as e:
            print(e.strerror)
            print("error in urlPdf:::::")
            guardarNumeroDescargas(self.contadorLibros)
            usuarioUsadosReescrivir(self.contadorUsuarios)
            print(self.contadorLibros)
        archivos = int(contarNueroArchivos())
        print(archivos)
        self.zLibraty.load_url("about:downloads")
        self.datosEsperaDescarga()
        self.peticiones()
        self.zLibraty.back()
        informaiconPdf(self.urlpdfGeleneralH)

    def DescargarContenido(self, _html):
        self.contenido = _html

    def serrarTor(self):
        self.zLibraty.close()

    def datosEsperaDescarga(self):
        sleep(4)
        self.htmlValidador = self.zLibraty.page_source

    def validarDescarga(self):
        self.htmlFalce = self.zLibraty.page_source
        self.soupFalce = BeautifulSoup(self.htmlFalce, "html.parser")
        self.validarfalce = self.soupFalce.find_all(
            "description", class_="downloadDetails downloadDetailsNormal")
        self.respuestafalce = re.search("value=.+", str(self.validarfalce))
        self.buscarFalse = self.respuestafalce.group()
        if re.search("Canceled", self.buscarFalse):
            print("the download broke =(")
            sleep(5)
            pyautogui.click(1393, 139)
            sleep(5)
        elif re.search("Failed", self.buscarFalse):
            print("the download broke, but we are going to fix it =(")
            sleep(5)
            pyautogui.click(1393, 139)
            sleep(5)
        else:
            print("the download is going well =)")

    def peticiones(self):
        self.validarDescarga()
        self.carga = 0
        self.daño = 0
        self.conteo = 0
        while self.carga < 100:
            self.soup = BeautifulSoup(self.htmlValidador, "html.parser")
            try:
                self.archivoDescarga = self.soup.find_all(
                    "progress", class_="downloadProgress")
                self.respaldo = re.split("value", str(self.archivoDescarga))
                self.tiempo = re.search("[0-9]+", self.respaldo[1])
                print(self.tiempo.group())
                self.carga = int(self.tiempo.group())
                self.datosEsperaDescarga()
                sleep(3)
                self.validarDescarga()
                if self.conteo == 3:
                    pyautogui.press("enter")
                    self.conteo = 0
            except:
                print("oh no, the download broke and I have not been able to restart it")
                if self.daño == 7:
                    os.system('rm -r /home/dd/zlibros/libros1920-1921/libro/*.*')
                    raise
                self.daño += 1
                sleep(5)
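# The peticiones/validarDescarga pair above polls about:downloads and drives
# the UI with pyautogui. A simpler sketch that watches the download directory
# on disk instead: Firefox keeps a ".part" file next to an in-progress
# download, so its disappearance signals completion. The directory path is a
# caller-supplied placeholder.
import os
import time


def wait_for_downloads(dir_path, timeout=300):
    # Return True once no .part file remains in dir_path, False on timeout.
    deadline = time.time() + timeout
    while time.time() < deadline:
        if not any(name.endswith(".part") for name in os.listdir(dir_path)):
            return True
        time.sleep(2)
    return False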
else:
    sys.stderr.write("Please choose 1 for Chrome or 2 for Tor")

os.chdir('/home/kasai/Documents/adscape/code/newScape/adtool')  # set tool dir

frames_folder = domain + "/Frames"
if not os.path.exists(frames_folder):
    os.makedirs(frames_folder)
ads_folder = domain + "/Ads"
if not os.path.exists(ads_folder):
    os.makedirs(ads_folder)

start_time = time.time()
driver.get(site)
end_time = time.time()
time.sleep(2)
plt = round(end_time - start_time, 3)  # page load time in seconds

scroll_page(driver)  # scroll the page
time.sleep(3)

frame_count = 0
image_count = 0
ad_count = 0
tracking_pix = 0
processed_list_frame = []
processed_list_img = []
emb = 0
class InstagramScraper():
    """InstagramScraper: web scraper class.

    Represents the various browser types using numeric values instead of
    string values, to allow for easy checking and changing/expanding of
    values.
    """

    def __init__(self, browser_type, user_data_dir=None):
        # internal flag so we know what sort of web browser we are instantiating
        self.WebBrowserType = browser_type

        # browser initiation according to the different browser types
        if browser_type == WebBrowserType.CHROME:
            print_dbg_msg_L1("\t[+] Starting Chrome...")
            options = webdriver.chrome.options.Options()
            options.add_argument("--headless")
            options.add_argument("--disable-gpu")
            options.add_argument("--no-sandbox")
            options.add_argument("--no-default-browser-check")
            self.browser = webdriver.Chrome(chrome_options=options)
        elif browser_type == WebBrowserType.CHROME_DEBUG:
            print_dbg_msg_L1("\t[+] Starting Chrome in debug mode...")
            options = webdriver.chrome.options.Options()
            options.add_argument("--headless")
            options.add_argument("--disable-gpu")
            options.add_argument("--no-sandbox")
            options.add_argument("--no-default-browser-check")
            options.add_argument("--remote-debugging-port=9222")
            if user_data_dir is None:
                user_data_dir = (chrome_debug_profile + "/" +
                                 str(secrets.token_hex(16)))
            if not os.path.exists(user_data_dir):
                os.makedirs(user_data_dir)
            options.add_argument("--user-data-dir=" + user_data_dir)
            self.browser = webdriver.Chrome(chrome_options=options)
        elif browser_type == WebBrowserType.TOR:
            # Sometimes the Tor process fails to launch or the web browser
            # fails to instantiate properly. Regardless, loop until both the
            # Tor process and the browser are instantiated correctly. So far,
            # over 30,000 runs, the instantiation usually kicks in after at
            # most 1 failure.
            while True:
                try:
                    self.tor_process = launch_tbb_tor_with_stem(tbb_path=tbb_dir)
                    self.browser = TorBrowserDriver(
                        tbb_dir,
                        tor_cfg=cm.USE_STEM,
                        tbb_profile_path=tbb_ff_default_dir,
                        tbb_logfile_path=tbb_log_dir)
                except Exception as e:
                    print_dbg_msg_L1("\t[+] " + str(e))
                    print_dbg_msg_L1("\t[+] Error instantiating browser, retrying...")
                    time.sleep(1)
                    continue
                else:
                    break
        else:
            self.browser = webdriver.Firefox()

    def get(self, targetWebAddress):
        self.browser.get(targetWebAddress)

    def close(self):
        self.browser.quit()
        if self.WebBrowserType == WebBrowserType.TOR:
            self.tor_process.kill()

    def __exit__(self, exc_type, exc_value, traceback):
        self.browser.quit()
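# A short usage sketch for the class above; WebBrowserType is the numeric
# constant holder its docstring describes, so the TOR member is assumed to
# exist exactly as referenced in the dispatch code.
scraper = InstagramScraper(WebBrowserType.TOR)
try:
    scraper.get("https://check.torproject.org")
finally:
    scraper.close()  # quits the browser and kills the Tor process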