def _get_ff_profile(self, ff_profile_dir):
    """Resolve *ff_profile_dir* into a FirefoxProfile instance.

    Accepts an existing FirefoxProfile (returned unchanged), a falsy
    value (a fresh default profile is created), or a profile directory
    path (a profile is loaded from that directory).
    """
    if isinstance(ff_profile_dir, FirefoxProfile):
        profile = ff_profile_dir
    elif is_falsy(ff_profile_dir):
        profile = webdriver.FirefoxProfile()
    else:
        profile = webdriver.FirefoxProfile(ff_profile_dir)
    return profile
#!/usr/bin/env python from selenium import webdriver from selenium.webdriver.common.by import By from selenium.webdriver.common.keys import Keys from selenium.webdriver.support.ui import Select from selenium.common.exceptions import NoSuchElementException from selenium.common.exceptions import NoAlertPresentException import time, sys, os # note: user profile varys with computer profile = webdriver.FirefoxProfile("/Users/js/Library/Application Support/Firefox/Profiles/l40nl96j.default") driver = webdriver.Firefox(profile) driver.implicitly_wait(30) base_url = "http://irsa.ipac.caltech.edu/" lines=[tmp[:-1] for tmp in open(sys.argv[-1],'r').readlines()] for gal in lines: print gal, driver.get(base_url + "/data/SPITZER/Enhanced/SEIP/") window_start = driver.window_handles[0] driver.find_element_by_name("locstr").clear() driver.find_element_by_name("locstr").send_keys(gal) driver.find_element_by_name("region").click() # time.sleep(30) window_after = driver.window_handles[1] driver.switch_to_window(window_after) page=driver.page_source.encode("utf-8") if 'NOTIFICATION' in page:
'''
Finding the data's source address with Chrome's "Inspect" tool is usually easy.
But some sites are very complex -- e.g. the Tmall product reviews discussed
earlier -- where "Inspect" makes it hard to locate the called URL. Besides
that, some real data URLs are long and convoluted, and some sites obfuscate
the address to deter scraping, leaving variables that make little sense.
'''
'''
Dynamic page scraping -- method two: use a browser rendering engine. Let a
real browser parse the HTML, apply CSS styles and execute the JavaScript
while displaying the page.
'''
from selenium import webdriver

profile_directory = R"C:\Users\R\AppData\Roaming\Mozilla\Firefox\Profiles\yjdic0n5.default"
profile = webdriver.FirefoxProfile(profile_directory)
driver = webdriver.Firefox(profile)
driver.get('http://www.santostang.com/2018/07/04/hello-world/')

# The page's JavaScript renders the comments into an iframe
# (<iframe title="livere" scrolling="no"...>), i.e. all comments live inside
# that frame and are not parsed into the top document, which is why
# div.reply-content could not be found. We must switch into the iframe first.
# driver.switch_to.frame(0)  # 1. locate the frame by index (first one is 0)
# driver.switch_to.frame("frame1")  # 2. locate by id
# driver.switch_to.frame("myframe")  # 3. locate by name
# driver.switch_to.frame(driver.find_element_by_tag_name("iframe"))  # 4. locate by WebElement
driver.switch_to.frame(
    driver.find_element_by_css_selector("iframe[title='livere']"))
comments = driver.find_elements_by_css_selector('div.reply-content')
# Iterate over the list of comment elements.
for each in comments:
    content = each.find_element_by_tag_name('p')
    print(content.text)
'''
'''
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

driver_status = "good"
test_status = "good"

########################## SETUP FIREFOX PROFILE ########################################
# NOTE(review): machine-specific profile path -- will not exist elsewhere.
abrahamAzam = webdriver.FirefoxProfile(
    "C:/Users/jared/AppData/Roaming/Mozilla/Firefox/Profiles/4rv3vvf2.AbrahamAzam"
)
abrahamAzam.set_preference("webdriver_accept_untrusted_certs", True)
#driver = webdriver.Firefox(firefox_profile = abrahamAzam)
driver = webdriver.Firefox(abrahamAzam)
driver.get("https://test.dontracker.navy.mil")
# NOTE(review): everything below this exit() is unreachable while it stands.
exit()

########################## LOG INTO DON TRACKER #########################################
# NOTE(review): chunk is truncated here -- the except/finally of this try is
# not visible in this view.
try:
    #driver = webdriver.Chrome()
    driver.get("https://test.dontracker.navy.mil")
    driver.find_element_by_id("button-1005-btnIconEl").click()
def main():
    """Load one website in headless Firefox under a chosen DNS transport
    (plain DNS, DoH, or DoT via Stubby) and write the resulting HAR file
    to stdout.

    Side effects: may reconfigure the system resolver via sudo (DoT mode),
    launches Firefox, and blocks up to --timeout seconds waiting for the
    HAR-export extension to write its output.
    """
    # Parse the command line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('website')
    parser.add_argument('dns_type', choices=['dns', 'doh', 'dot'])
    parser.add_argument('trr_resolver_ip')
    parser.add_argument('trr_resolver_uri')
    parser.add_argument('--timeout', type=int, default=30)
    args = parser.parse_args()

    # Enable devtools in Firefox
    options = Options()
    options.headless = True
    options.add_argument('-devtools')

    # Enable the netmonitor toolbox in devtools so we can save HARs
    profile = webdriver.FirefoxProfile()
    profile.set_preference('devtools.toolbox.selectedTool', 'netmonitor')

    # If we're running a DoT measurement, turn on Stubby with the config
    # matching the requested resolver (table-driven instead of three
    # copy-pasted branches).
    if args.dns_type == 'dot':
        stubby_configs = {
            '1.1.1.1': 'stubby-cf.yml',
            '9.9.9.9': 'stubby-quad9.yml',
            '8.8.8.8': 'stubby-google.yml',
        }
        config = stubby_configs.get(args.trr_resolver_ip)
        if config is not None:
            subprocess.run(["sudo", "/usr/local/bin/bin/stubby",
                            "-C", config, "-g"])
            # Point the system resolver at the local Stubby listener.
            subprocess.run(["sudo", "cp", "resolv.conf", "/etc/resolv.conf"])

    # Configure the DNS settings in Firefox.
    # network.trr.mode: 0 = TRR off (system DNS), 3 = DoH only.
    if args.dns_type == 'dns' or args.dns_type == 'dot':
        options.set_preference('network.trr.mode', 0)
    elif args.dns_type == 'doh':
        options.set_preference('network.trr.mode', 3)
        options.set_preference('network.trr.request-timeout', 1500)
        options.set_preference('network.trr.max-fails', 5)
        trr_resolver_ip = args.trr_resolver_ip
        trr_resolver_uri = args.trr_resolver_uri
        if trr_resolver_ip:
            options.set_preference('network.trr.bootstrapAddress',
                                   trr_resolver_ip)
        if trr_resolver_uri:
            options.set_preference('network.trr.uri', trr_resolver_uri)

    # Launch Firefox and install our extension for getting HARs
    driver = webdriver.Firefox(options=options, firefox_profile=profile,
                               firefox_binary="/opt/firefox/firefox-bin")
    driver.install_addon("/home/seluser/measure/harexporttrigger-0.6.2-fx.xpi")
    driver.set_page_load_timeout(30)

    # Make a page load
    started = datetime.now()
    driver.get(args.website)

    # Once the HAR is on disk in the container, write it to stdout so the
    # host machine can get it. The extension drops a ".ready" marker file
    # when the HAR is complete.
    har_file = "/home/seluser/measure/har.json"

    def har_file_ready():
        return os.path.exists(har_file + ".ready")

    while (datetime.now() - started).total_seconds() < args.timeout \
            and not har_file_ready():
        time.sleep(1)

    if har_file_ready():
        with open(har_file, 'rb') as f:
            sys.stdout.buffer.write(f.read())
    driver.quit()
def get_firefox_profile() -> webdriver.FirefoxProfile:
    """Build a FirefoxProfile tuned for scraping: no images, no Flash.

    Returns:
        A profile with image loading blocked and the Flash plugin disabled.
    """
    firefox_profile: webdriver.FirefoxProfile = webdriver.FirefoxProfile()
    # 2 = block all images (permissions.default.image is an integer pref).
    firefox_profile.set_preference('permissions.default.image', 2)
    # Bug fix: this is a *boolean* pref. The original passed the string
    # 'false', which writes a string preference and therefore does not
    # actually disable the plugin.
    firefox_profile.set_preference('dom.ipc.plugins.enabled.libflashplayer.so', False)
    return firefox_profile
def symmapzong(herbs):
    """For each herb name in *herbs*, scrape SymMap (www.symmap.org) for the
    herb's ingredients and each ingredient's targets.

    Side effects: drives a Firefox browser per herb, downloads and renames
    CSVs under /Users/huanjiaming/webscraping/, and appends one
    '$'-separated line per compound/target pair to symmapct.txt.
    """
    for herb in herbs:
        # Profile: auto-save CSV downloads into the scraping directory
        # without a prompt.
        profile = webdriver.FirefoxProfile()
        profile.set_preference('browser.download.folderList', 2)
        profile.set_preference('browser.download.dir', '/Users/huanjiaming/webscraping/')
        profile.set_preference('browser.download.manager.showWhenStarting', False)
        profile.set_preference('browser.helperApps.neverAsk.saveToDisk', 'text/csv')
        browsersymmap = webdriver.Firefox(firefox_profile=profile)

        # Search the herb by name.
        browsersymmap.get('http://www.symmap.org/search/')
        name = browsersymmap.find_element_by_id('herb_ipt')
        name.send_keys(herb)
        buttons = browsersymmap.find_element_by_id('herb_search')
        #buttons = browsersymmap.find_element_by_xpath('//*[@id="herb_ser_box"]/span')
        buttons.click()

        # The first result cell holds the herb's SymMap id; open its detail page.
        compoundnum = browsersymmap.find_element_by_tag_name('td')
        compoundnumsymmap = compoundnum.text
        symmapurl = 'http://www.symmap.org/detail/' + compoundnumsymmap
        browsersymmap.get(symmapurl)
        herbname = browsersymmap.find_element_by_xpath(
            '/html/body/div/section/div/div/div/div[1]/table/tbody/tr[1]/td[2]')
        sherbname = herbname.text
        print(sherbname)

        # Open the export dropdown, pick "ingredient", download the CSV.
        browsersymmap.find_element_by_xpath(
            '/html/body/div/section/div/div/div/div[3]/div[2]/form/div/div[1]/div/button/span[2]/span').click()
        ingredientbuttonurl = '/html/body/div/section/div/div/div/div[3]/div[2]/form/div/div[1]/div/div/ul/li[3]/a'
        browsersymmap.find_element_by_xpath(ingredientbuttonurl).click()
        browsersymmap.find_element_by_id('dl-btn').click()

        # SymMap always saves as data.csv; rename it per herb.
        oldname = '/Users/huanjiaming/webscraping/data.csv'
        newname = '/Users/huanjiaming/webscraping/' + 'A' + sherbname + ' ' + 'symmap.csv'
        os.rename(oldname, newname)

        # Fix: use context managers so the CSV files are always closed
        # (the original left herbfile/targetfile open).
        with open('A' + sherbname + ' ' + 'symmap.csv') as herbfile:
            herbreader = csv.reader(herbfile)
            for row in herbreader:
                if herbreader.line_num == 1:  # skip the header row
                    continue
                compoundnum = row[0]
                compoundname = row[1]

                # Open the compound's detail page.
                compoundssurl = 'http://www.symmap.org/detail/' + compoundnum
                browsersymmap.get(compoundssurl)
                compoundssearchname = browsersymmap.find_element_by_xpath(
                    '/html/body/div[1]/section/div/div/div/div[1]/table/tbody/tr[1]/td[2]')
                compoundsname = compoundssearchname.text

                # Export the compound's targets CSV.
                browsersymmap.find_element_by_xpath(
                    '/html/body/div[1]/section/div/div/div/div[3]/div[2]/form/div/div[1]/div/button').click()
                targesbuttonurlclick = '/html/body/div[1]/section/div/div/div/div[3]/div[2]/form/div/div[1]/div/div/ul/li[4]/a'
                browsersymmap.find_element_by_xpath(targesbuttonurlclick).click()
                browsersymmap.find_element_by_id('dl-btn').click()

                compoundoldname = '/Users/huanjiaming/webscraping/data.csv'
                compoundnewname = ('/Users/huanjiaming/webscraping/' + sherbname +
                                   ' ' + compoundname + ' ' + 'symmap.csv')
                os.rename(compoundoldname, compoundnewname)

                with open(sherbname + ' ' + compoundname + ' ' + 'symmap.csv') as targetfile:
                    targetreader = csv.reader(targetfile)
                    for targetrow in targetreader:
                        if targetreader.line_num == 1:  # skip the header row
                            continue
                        with open('symmapct.txt', 'a+') as symmapct:
                            symmapct.write(herb + '$' + sherbname + '$' +
                                           'symmap' + '$' + compoundname + '$' +
                                           targetrow[1] + '$' + targetrow[3] +
                                           '$' + targetrow[6] + '\n')
        browsersymmap.quit()
def get_historical_data(name): stock_name = name # url = "https://finance.yahoo.com/quote/AMZN?p=AMZN&.tsrc=fin-srch" url = "https://finance.yahoo.com/quote/" + stock_name + "?p=" + stock_name + "&.tsrc=fin-srch" driver = webdriver.Firefox(executable_path="/usr/bin/geckodriver") # webdriver.FirefoxProfile() webdriver.FirefoxProfile().set_preference( "browser.download.manager.showWhenStarting", False) webdriver.FirefoxProfile().set_preference( "browser.download.manager.showAlertOnComplete", False) webdriver.FirefoxProfile().set_preference( "browser.helperApps.neverAsk.saveToDisk", "text/csv") webdriver.FirefoxProfile().set_preference("browser.download.dir", "~/Downloads") # url = "http://finance.yahoo.com/quote/AMZN/history?p=AMZN" try: driver.get(url) # delay = 3 print "Page is ready!" except TimeoutException: print "Loading took too much time!" print "Page loading is done" time.sleep(.5) print "Finding tag span Done" elm_lists = driver.find_elements_by_tag_name("span") for elm in elm_lists: try: # print elm.get_attribute('href'), elm.text if elm.text == "Historical Data": print "Found!!" 
print elm.text elm.click() # print self.url time.sleep(2.5) len_of_input_elm = 0 while len_of_input_elm < 5: input_elm_lists = driver.find_elements_by_tag_name("input") len_of_input_elm = len(input_elm_lists) print len(input_elm_lists) for input_elm in input_elm_lists: if input_elm.get_attribute( "class" ) == "C(t) O(n):f Tsh($actionBlueTextShadow) Bd(n) Bgc(t) Fz(14px) Pos(r) T(-1px) Bd(n):f Bxsh(n):f Cur(p) W(190px)": print "find right input tag" print input_elm.get_attribute("data-test") input_elm.click() time.sleep(2.5) elm = driver.find_element_by_name("startDate") print "Found startDate" elm.clear() elm.send_keys("6/25/2012") elm = driver.find_element_by_name("endDate") print "Found endDate" elm.clear() elm.send_keys("6/25/2015") break button_elm_lists = driver.find_elements_by_tag_name("button") print len(button_elm_lists) for button_elm in button_elm_lists: if button_elm.get_attribute( "class" ) == " Bgc($c-fuji-blue-1-b) Bdrs(3px) Px(20px) Miw(100px) Whs(nw) Fz(s) Fw(500) C(white) Bgc($actionBlueHover):h Bd(0) D(ib) Cur(p) Td(n) Py(9px) Miw(80px)! 
Fl(start)": print "Found Done" button_elm.click() time.sleep(5.5) # if button_elm.get_attribute("class") == " Bgc($c-fuji-blue-1-b) Bdrs(3px) Px(20px) Miw(100px) Whs(nw) Fz(s) Fw(500) C(white) Bgc($actionBlueHover):h Bd(0) D(ib) Cur(p) Td(n) Py(9px) Fl(end)": # print "Found Apply" # button_elm.click() # time.sleep(5.5) # # print input_elm.get_attribute("class") # break break except: pass button_elm_lists = driver.find_elements_by_tag_name("button") for button_elm in button_elm_lists: if button_elm.get_attribute( "class" ) == " Bgc($c-fuji-blue-1-b) Bdrs(3px) Px(20px) Miw(100px) Whs(nw) Fz(s) Fw(500) C(white) Bgc($actionBlueHover):h Bd(0) D(ib) Cur(p) Td(n) Py(9px) Fl(end)": print "Found Apply" button_elm.click() time.sleep(5.5) # print input_elm.get_attribute("class") break a_elm_lists = driver.find_elements_by_tag_name("a") for a_elm in a_elm_lists: if a_elm.get_attribute("class") == "Fl(end) Mt(3px) Cur(p)": print "Found download" url = a_elm.get_attribute('href') print url break driver.get(url)
def download(cfg):
    """Fetch a supplier's price file through Firefox using settings from *cfg*.

    Downloads into ./tmp (emptied first so the new file is detectable as a
    set difference), then rotates filename_new -> filename_old and copies the
    downloaded file over filename_new. Zip archives are unpacked via a legacy,
    unfinished branch. Returns True on success, False otherwise.
    """
    from selenium import webdriver
    from selenium.webdriver.common.keys import Keys
    from selenium.webdriver.remote.remote_connection import LOGGER
    LOGGER.setLevel(logging.WARNING)
    retCode = False
    filename_new = cfg.get('download', 'filename_new')
    filename_old = cfg.get('download', 'filename_old')
    login = cfg.get('download', 'login')
    password = cfg.get('download', 'password')
    url_lk = cfg.get('download', 'url_lk')
    url_file = cfg.get('download', 'url_file')
    # Work in ./tmp; start from an empty directory so any file that appears
    # afterwards must be the new download.
    download_path = os.path.join(os.getcwd(), 'tmp')
    if not os.path.exists(download_path):
        os.mkdir(download_path)
    for fName in os.listdir(download_path):
        os.remove(os.path.join(download_path, fName))
    dir_befo_download = set(os.listdir(download_path))
    if os.path.exists('geckodriver.log'):
        os.remove('geckodriver.log')
    try:
        # Profile that silently saves all known document MIME types into
        # download_path (folderList=2 means "use browser.download.dir").
        ffprofile = webdriver.FirefoxProfile()
        ffprofile.set_preference("browser.download.dir", download_path)
        ffprofile.set_preference("browser.download.folderList", 2)
        ffprofile.set_preference(
            "browser.helperApps.neverAsk.saveToDisk",
            ",application/octet-stream" + ",application/vnd.ms-excel" +
            ",application/vnd.msexcel" + ",application/x-excel" +
            ",application/x-msexcel" + ",application/zip" +
            ",application/xls" + ",application/vnd.ms-excel" +
            ",application/vnd.ms-excel.addin.macroenabled.12" +
            ",application/vnd.ms-excel.sheet.macroenabled.12" +
            ",application/vnd.ms-excel.template.macroenabled.12" +
            ",application/vnd.ms-excelsheet.binary.macroenabled.12" +
            ",application/vnd.ms-fontobject" + ",application/vnd.ms-htmlhelp" +
            ",application/vnd.ms-ims" + ",application/vnd.ms-lrm" +
            ",application/vnd.ms-officetheme" +
            ",application/vnd.ms-pki.seccat" + ",application/vnd.ms-pki.stl" +
            ",application/vnd.ms-word.document.macroenabled.12" +
            ",application/vnd.ms-word.template.macroenabed.12" +
            ",application/vnd.ms-works" + ",application/vnd.ms-wpl" +
            ",application/vnd.ms-xpsdocument" +
            ",application/vnd.openofficeorg.extension" +
            ",application/vnd.openxmformats-officedocument.wordprocessingml.document" +
            ",application/vnd.openxmlformats-officedocument.presentationml.presentation" +
            ",application/vnd.openxmlformats-officedocument.presentationml.slide" +
            ",application/vnd.openxmlformats-officedocument.presentationml.slideshw" +
            ",application/vnd.openxmlformats-officedocument.presentationml.template" +
            ",application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" +
            ",application/vnd.openxmlformats-officedocument.spreadsheetml.template" +
            ",application/vnd.openxmlformats-officedocument.wordprocessingml.template" +
            ",application/x-ms-application" + ",application/x-ms-wmd" +
            ",application/x-ms-wmz" + ",application/x-ms-xbap" +
            ",application/x-msaccess" + ",application/x-msbinder" +
            ",application/x-mscardfile" + ",application/x-msclip" +
            ",application/x-msdownload" + ",application/x-msmediaview" +
            ",application/x-msmetafile" + ",application/x-mspublisher" +
            ",application/x-msschedule" + ",application/x-msterminal" +
            ",application/x-mswrite" + ",application/xml" +
            ",application/xml-dtd" + ",application/xop+xml" +
            ",application/xslt+xml" + ",application/xspf+xml" +
            ",application/xv+xml" + ",application/excel")
        if os.name == 'posix':
            #driver = webdriver.Firefox(ffprofile, executable_path=r'/usr/local/Cellar/geckodriver/0.19.1/bin/geckodriver')
            driver = webdriver.Firefox(
                ffprofile, executable_path=r'/usr/local/bin/geckodriver')
        elif os.name == 'nt':
            driver = webdriver.Firefox(ffprofile)
        driver.implicitly_wait(10)
        # Visit the personal-account page first (session), then the file URL,
        # which triggers the silent download configured above.
        driver.get(url_lk)
        time.sleep(2)
        driver.get(url_file)
        time.sleep(2)
        driver.close()
        #driver.find_element_by_link_text(u"Выход").click()
        driver.quit()
    except Exception as e:
        log.debug('Exception: <' + str(e) + '>')
    # Whatever appeared in the directory is the downloaded file.
    dir_afte_download = set(os.listdir(download_path))
    new_files = list(dir_afte_download.difference(dir_befo_download))
    print(new_files)
    if len(new_files) == 0:
        log.error('Не удалось скачать файл прайса ')
        retCode = False
    elif len(new_files) > 1:
        log.error('Скачалось несколько файлов. Надо разбираться ...')
        retCode = False
    else:
        new_file = new_files[0]  # exactly one file was downloaded
        new_ext = os.path.splitext(new_file)[-1].lower()
        DnewFile = os.path.join(download_path, new_file)
        new_file_date = os.path.getmtime(DnewFile)
        log.info(
            'Скачанный файл ' + new_file + ' имеет дату ' +
            time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(new_file_date)))
        print(new_ext)
        if new_ext in ('.xls', '.xlsx', '.xlsb', '.xlsm', '.csv'):
            # Rotate: old is dropped, new becomes old, download becomes new.
            if os.path.exists(filename_new) and os.path.exists(filename_old):
                os.remove(filename_old)
                os.rename(filename_new, filename_old)
            if os.path.exists(filename_new):
                os.rename(filename_new, filename_old)
            shutil.copy2(DnewFile, filename_new)
            retCode = True
        elif new_ext == '.zip':
            # Legacy branch, untested. Archive: handling is unfinished.
            log.debug('Zip-архив. Разархивируем.')
            work_dir = os.getcwd()
            os.chdir(os.path.join(download_path))
            dir_befo_download = set(os.listdir(os.getcwd()))
            os.system('unzip -oj ' + new_file)
            os.remove(new_file)
            dir_afte_download = set(os.listdir(os.getcwd()))
            new_files = list(dir_afte_download.difference(dir_befo_download))
            os.chdir(work_dir)
            if len(new_files) == 1:
                new_file = new_files[0]  # exactly one file was unpacked
                new_ext = os.path.splitext(new_file)[-1]
                DnewFile = os.path.join(download_path, new_file)
                new_file_date = os.path.getmtime(DnewFile)
                log.debug('Файл из архива ' + DnewFile + ' имеет дату ' +
                          time.strftime("%Y-%m-%d %H:%M:%S",
                                        time.localtime(new_file_date)))
                filename_in = cfg.get('basic', 'filename_in')
                # Same rotation as the spreadsheet branch above.
                if os.path.exists(filename_new) and os.path.exists(
                        filename_old):
                    os.remove(filename_old)
                    os.rename(filename_new, filename_old)
                if os.path.exists(filename_new):
                    os.rename(filename_new, filename_old)
                shutil.copy2(DnewFile, filename_new)
                retCode = True
            elif len(new_files) > 1:
                log.debug('В архиве не единственный файл. Надо разбираться.')
                retCode = False
            else:
                log.debug(
                    'Нет новых файлов после разархивации. Загляни в папку юниттеста поставщика.'
                )
                retCode = False
    return retCode
def picture_screenshot_html(self, keyword, ckurl, searchDevice, spidertype,
                            searchPage, returnType):
    """Search Baidu for *keyword*, look for *ckurl* in up to *searchPage*
    result pages, and return screenshot/rank data as a JSON string.

    searchDevice == 1 selects desktop Baidu; anything else uses the mobile
    site with an overridden user agent. Returns -2 when nothing is captured.
    (Python 2 code: xrange / `except Exception, e`.)
    """
    starttime = datetime.datetime.now()
    picturedata = None
    try:
        if int(searchDevice) == 1:
            browser = webdriver.Firefox(
                executable_path=sxconfig.geckodriverPath)
            browser.set_page_load_timeout(sxconfig.page_load_timeout)
            browser.set_script_timeout(sxconfig.script_timeout)
            browser.maximize_window()
            browser.get(sxconfig.baiduPcUrl)  # Load page
            browser.find_element_by_id('kw').clear()  # clear the search box
            browser.find_element_by_id('kw').send_keys(u'' + keyword)  # type the keyword
            browser.find_element_by_id('su').click()  # click the search button
            browser.find_element_by_id('su').submit()  # submit the form
            # browser.find_element_by_name("")
            self.util.fullloaded(browser)
            for currentpage in xrange(1, searchPage + 1):
                # print currentpage
                # print browser.current_url
                # Measure the rendered page size for the screenshot.
                jsClientWidth = '''return document.body.clientWidth'''
                tatalWidth = browser.execute_script(jsClientWidth)
                # print tatalWidth
                jsScrollHeight = '''return document.body.parentNode.scrollHeight'''
                tatalHeight = browser.execute_script(jsScrollHeight)
                # print tatalHeight
                html_source = browser.page_source  # page markup
                if int(returnType) > 0:
                    # Extract the rank list for this result page.
                    rankitem = self.baiduPc.getRankListByHtmlPc(
                        html_source, ckurl, spidertype)
                    ranklist = rankitem['rankList']
                    nextPageUrl = rankitem['nextPageUrl']
                    if len(ranklist) > 0:
                        picturedata = self.baiduPc.getPictureAndScreenPc(
                            browser, ranklist, tatalWidth, tatalHeight,
                            returnType)
                        break
                    else:
                        # Give up after 5 pages or when there is no next page.
                        if currentpage == 5:
                            break
                        if nextPageUrl is None:
                            break
                        browser.get(sxconfig.baiduPcUrl + nextPageUrl)
                        self.util.fullloaded(browser)
                else:
                    break
        else:
            # mobile
            firefoxProfile = webdriver.FirefoxProfile()
            # Override the user agent with a mobile one.
            firefoxProfile.set_preference("general.useragent.override",
                                          sxconfig.mobileUserAgent)
            browser = webdriver.Firefox(
                firefox_profile=firefoxProfile,
                executable_path=sxconfig.geckodriverPath)
            browser.set_page_load_timeout(sxconfig.page_load_timeout)
            browser.set_script_timeout(sxconfig.script_timeout)
            browser.set_window_size(sxconfig.baiduMobileWidth,
                                    sxconfig.baiduMobileHeight)
            browser.get(sxconfig.baiduMobileUrl)  # Load page
            browser.find_element_by_id('index-kw').clear()  # clear the search box
            browser.find_element_by_id('index-kw').send_keys(
                u'' + keyword)  # type the keyword
            browser.find_element_by_id('index-bn').click()  # click the search button
            self.util.fullloaded(browser)
            for currentpage in xrange(1, searchPage + 1):
                html_source = browser.page_source  # page markup
                if int(returnType) > 0:
                    # Extract the rank list for this result page.
                    rankitem = self.baiduMobile.getRankListByHtmlMobile(
                        html_source, ckurl, spidertype)
                    ranklist = rankitem['rankList']
                    nextPageUrl = rankitem['nextPageUrl']
                    if len(ranklist) > 0:
                        picturedata = self.baiduMobile.getPictureAndScreenMobile(
                            browser, ranklist, sxconfig.baiduMobileWidth,
                            sxconfig.baiduMobileHeight, returnType)
                        break
                    else:
                        # Give up after 5 pages or when there is no next page.
                        if currentpage == 5:
                            break
                        if nextPageUrl is None:
                            break
                        browser.get(nextPageUrl)
                        self.util.fullloaded(browser)
                else:
                    break
        browser.delete_all_cookies()
        browser.close()
        # browser.quit()
        endtime = datetime.datetime.now()
        print((endtime - starttime).seconds)
        if picturedata:
            picturedata["html"] = html_source
            picturedata["page"] = currentpage
            return json.dumps(picturedata)
        else:
            return -2
    except Exception, e:
        print e
        browser.close()
def InitDriver(self):
    """(Re)create self.driver according to the configured browser type
    (my_browser: firefox / chrome / ie / phantomjs, default firefox).

    Returns True when the browser opened successfully, False otherwise.
    """
    tag = False
    try:
        # If a driver already exists, quit it before creating a new one.
        if not self.driver:
            pass
        else:
            self.driver.quit()
    except Exception as e:
        log_error("释放浏览器资源失败")
        log_error(e)
    # Create the browser object.
    try:
        browser = my_browser
        if browser == "firefox":
            options = webdriver.FirefoxOptions()
            if my_sys_platform == "Linux":
                options.set_headless()  # or use options.add_argument('-headless')
                options.add_argument('--disable-gpu')  # disable GPU acceleration
            firefox_profile = webdriver.FirefoxProfile()
            user_agent = get_header()  # random user_agent
            my_log.logger.info("get random user_agent:%s" % user_agent)
            firefox_profile.set_preference("general.useragent.override",
                                           user_agent)
            # If screenshots are wanted, keep loading images; otherwise skip them.
            # if is_screenshot != '1':
            #     firefox_profile.set_preference('permissions.default.image', 2)  # block images; some Firefox builds only need this
            firefox_profile.update_preferences()
            # firefox_profile.set_preference('browser.migration.version', 9001)  # block images; some builds also need this
            # firefox_profile.set_preference('permissions.default.stylesheet', 2)  # disable CSS
            # firefox_profile.set_preference('dom.ipc.plugins.enabled.libflashplayer.so', 'false')  # disable Flash
            # firefox_profile.set_preference('javascript.enabled', 'false')  # disable JS
            if my_sys_platform == "Linux":
                self.driver = webdriver.Firefox(
                    executable_path="./geckodriver",
                    firefox_profile=firefox_profile,
                    firefox_options=options)
            else:
                self.driver = webdriver.Firefox(
                    firefox_profile=firefox_profile,
                    firefox_options=options)
            # self.driver = webdriver.Firefox()
        elif browser == "chrome":
            # Commented-out mobile-emulation example kept for reference:
            # WIDTH = 320
            # HEIGHT = 640
            # PIXEL_RATIO = 3.0
            # UA = 'Mozilla/5.0 (Linux; Android 4.1.1; GT-N7100 Build/JRO03C) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/35.0.1916.138 Mobile Safari/537.36 T7/6.3'
            # mobileEmulation = {"deviceMetrics": {"width": WIDTH, "height": HEIGHT, "pixelRatio": PIXEL_RATIO}, "userAgent": UA}
            # options = webdriver.ChromeOptions()
            # options.add_experimental_option('mobileEmulation', mobileEmulation)
            # self.driver = webdriver.Chrome(chrome_options=options)
            # prefs = {"profile.managed_default_content_settings.images": 2}  # block image loading
            # options.add_experimental_option("prefs", prefs)
            self.driver = webdriver.Chrome()
        elif browser == "ie":
            self.driver = webdriver.Ie()
        elif browser == "phantomjs":
            from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
            dcap = dict(DesiredCapabilities.PHANTOMJS)
            # dcap['phantomjs.page.settings.userAgent'] = ('Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36')
            dcap['phantomjs.page.settings.userAgent'] = (
                'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'
            )
            self.driver = webdriver.PhantomJS(
                executable_path=r"./phantomjs",
                desired_capabilities=dcap,
                service_args=['--ignore-ssl-errors=true'])
            # self.driver = webdriver.PhantomJS(executable_path = "./phantomjs")
        else:
            self.driver = webdriver.Firefox()
        # Set request timeouts.
        self.driver.set_page_load_timeout(5)
        self.driver.set_script_timeout(5)
        tag = True
        log_info("打开浏览器成功Open browser successfully")
    except Exception as e:
        log_error("打开浏览器异常Failed to open browser")
        log_error(e)
        tag = False
    return tag
def get_result_type1(url, result_db, path):
    """Log in repeatedly on a results portal and download one PDF per
    (registration number, password) row in *result_db*.

    url: login page; result_db: sequence of rows where row[0] is the
    registration number and row[1] the DoB/password; path: download
    directory. Progress is reported in the global Tk textBox widget.
    """
    options = Options()
    options.headless = True
    # Firefox profile: save PDFs straight into *path* without a dialog.
    profile = webdriver.FirefoxProfile()
    profile.set_preference("browser.download.folderList", 2)
    profile.set_preference("browser.download.manager.showWhenStarting", False)
    profile.set_preference("browser.download.dir", path)
    profile.set_preference("browser.helperApps.neverAsk.saveToDisk",
                           "application/pdf")
    profile.set_preference("pdfjs.disabled", True)  # download, don't preview
    driver = webdriver.Firefox(firefox_profile=profile, options=options,
                               executable_path='../driver/geckodriver.exe')
    driver.get(url)
    # enumerate replaces the hand-maintained `no` counter of the original.
    for no, row in enumerate(result_db):
        root.update()
        id_num = row[0]
        pwd = row[1]
        test = True
        try:
            if driver.find_elements_by_id("regnum"):
                RegNo = driver.find_element_by_id("regnum")
                RegNo.send_keys(id_num)
                if driver.find_elements_by_id("dob"):
                    DoB = driver.find_element_by_id("dob")
                    DoB.send_keys(pwd)
                login = driver.find_element_by_name('sub')
                login.click()
                RegNo.clear()
                DoB.clear()
            # Bug fix: the original tested find_element_by_name here, which
            # *raises* (instead of returning an empty list) when the field is
            # absent, so the "Something missing" branch was unreachable.
            elif driver.find_elements_by_name("regno"):
                temp = driver.find_element_by_name("regno")
                temp.send_keys(id_num)
                if driver.find_element_by_id("dob"):
                    temp1 = driver.find_element_by_id("dob")
                    temp1.send_keys(pwd)
                if driver.find_elements_by_name('but'):
                    login = driver.find_element_by_name('but')
                    login.click()
                else:
                    login = driver.find_element_by_xpath(
                        '/html/body/form/table/tbody/tr[5]/td/input')
                    login.click()
                temp1.clear()
                temp.clear()
            else:
                textBox.insert(tk.END, "Something missing Contact dev\n")
                textBox.see(tk.END)
        except TimeoutException as e:
            # Retry this single record through the one-off helper.
            textBox.insert(tk.END, "timeout retrying...\n")
            textBox.see(tk.END)
            individual(url, id_num, pwd, path)
        except Exception as exception:
            test = False
            log(exception, reg=id_num, value=pwd)
            textBox.insert(tk.END, "Issue generated,check log file \n")
            textBox.see(tk.END)
        if test:
            val = no + 1
            textBox.insert(tk.END, '{} file downloaded \n'.format(val))
            textBox.see(tk.END)
        else:
            val = no + 1
            textBox.insert(tk.END, ' {} file not downloaded \n'.format(val))
            textBox.see(tk.END)
    driver.quit()
    # Fix: compare for equality -- the original `in "Windows"` was a
    # substring test that happened to work.
    if platform.system() == "Windows":
        os.system("Taskkill /IM firefox.exe /F")
from urllib.request import urlopen
from bs4 import BeautifulSoup
from time import sleep
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.utils import keys_to_typing
from selenium.webdriver.common.keys import Keys
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from unicodedata import normalize
import re
import random
from unicodedata import normalize  # NOTE(review): duplicate import (already imported above)

# Firefox profile set up to auto-save .zip downloads into the books directory
# without showing the download panel.
profile = webdriver.FirefoxProfile('/home/dgc7/.mozilla/firefox/7aebrp31.dd7')
profile.set_preference("browser.download.panel.shown", False)
profile.set_preference("browser.helperApps.neverAsk.saveToDisk",
                       "application/zip")
profile.set_preference('browser.download.folderList', 2)
profile.set_preference('browser.download.dir',
                       '/home/dgc7/zlibros/libros1920-1921')

# NOTE(review): missing the leading '/' -- compare the absolute path used
# inside crearCorros.__init__ below.
dirNombre = 'home/dgc7/ejersiciosLibros/pyaton/ejemplos/scrapin/zlibrari/crearCorreos'


class crearCorros:
    # Helper for creating mail accounts used by the scraper.
    def __init__(self):
        # Tuple of service URLs: DuckDuckGo onion mirror, ProtonMail signup,
        # z-library registration page.
        self.urlProtocoe = 'http://3g2upl4pq6kufc4m.onion', 'https://mail.protonmail.com/create/new', 'https://singlelogin.org/registration.php'
        print(self.urlProtocoe[2])
        # File with candidate account names.
        self.dirNombre = '/home/dgc7/ejersiciosLibros/pyaton/ejemplos/scrapin/zlibrari/crearCorreos/nombre.txt'
def findBookUrl(self):
    """Log in to packtpub.com, list the account's ebooks, then try to claim
    the current "free learning" ebook if it is not already owned.
    (Python 2 code: print statements.)
    """
    directory_name = '.'
    binary = FirefoxBinary('/docs/python_projects/firefox/firefox')
    # Profile: silent downloads of common document types into the cwd.
    fp = webdriver.FirefoxProfile()
    fp.set_preference("webdriver.log.file", "/tmp/firefox_console")
    fp.set_preference("browser.download.folderList", 2)
    fp.set_preference('browser.download.manager.showWhenStarting', False)
    fp.set_preference('browser.download.manager.focusWhenStarting', False)
    fp.set_preference("browser.download.dir", directory_name)
    fp.set_preference("browser.download.manager.scanWhenDone", False)
    fp.set_preference("browser.download.manager.useWindow", False)
    # fp.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/octet-stream")
    fp.set_preference(
        "browser.helperApps.neverAsk.saveToDisk",
        "application/octet-stream,application/xml,application/pdf,text/plain,text/xml,image/jpeg,text/csv,application/zip,application/x-rar-compressed"
    )
    fp.set_preference("browser.helperApps.alwaysAsk.force", False)
    fp.set_preference("browser.popups.showPopupBlocker", False)
    fp.update_preferences()
    driver = webdriver.Firefox(firefox_profile=fp, firefox_binary=binary)
    # driver.find_element_by_xpath("html/body/table/tbody/tr[2]/td/div/table/tbody/tr/td[1]/img")
    driver.get(self.baseUrl)
    # Open the login popup.
    efd_link = driver.find_element_by_css_selector(
        ".login-popup > div:nth-child(1)")
    efd_link.click()
    try:
        emailEl = driver.find_element_by_css_selector(
            '#packt-user-login-form > div:nth-child(1) > div:nth-child(1) > div:nth-child(1) > div:nth-child(1) > input:nth-child(1)'
        )
        # emailEl = driver.find_element_by_name("email")
        ''' Login with user credential '''
        emailEl.send_keys('*****@*****.**')
        passwordEl = driver.find_element_by_css_selector(
            "#packt-user-login-form > div:nth-child(1) > div:nth-child(1) > div:nth-child(2) > div:nth-child(1) > input:nth-child(1)"
        )
        passwordEl.send_keys('default')
        loginEl = driver.find_element_by_css_selector(
            "#packt-user-login-form > div:nth-child(1) > div:nth-child(1) > div:nth-child(3) > input:nth-child(1)"
        )
        loginEl.click()
        if True:
            ''' clicking on My Account '''
            myAccountEl = driver.find_element_by_css_selector(
                '#account-bar-logged-in > a:nth-child(1) > div:nth-child(1) > strong:nth-child(1)'
            )
            myAccountEl.click()
            ''' clicking My ebooks '''
            myEbook = driver.get(self.baseUrl + 'account/my-ebooks')
            # Collect the books already owned by this account.
            productListEls = driver.find_elements_by_css_selector(
                'div.product-line')
            print len(productListEls)
            bookList = list()
            for productEl in productListEls:
                print productEl
                try:
                    bookName = productEl.find_element_by_css_selector(
                        '.title').text
                    book = self.createBookDetail(bookName)
                    productEl.click()
                    readMeEl = productEl.find_element_by_css_selector(
                        '.fake-button-text')
                    print 'new page',
                    isbnEl = productEl.find_elements_by_css_selector(
                        'div > div:nth-child(2) > div:nth-child(1)> a:nth-child(1) > div:nth-child(1)'
                    )
                    book.isbn_13 = isbnEl[0].get_attribute('isbn')
                    # readMeEl.click()
                    print 'div.product-line:nth-child(1) > div:nth-child(2) > div:nth-child(1) > a:nth-child(1) > div:nth-child(1)',
                    # readMeEl.find_element_by_css_selector('h2.ng-binding')
                    # # readingEl = driver.get('https://www.packtpub.com/mapt/book/All%20Books/' + book.isbn_13)
                    # bookName1=driver.find_elements_by_css_selector('h2.ng-binding')[0].text
                    bookList.append(book)
                except Exception as e:
                    print e
            # product_account_list_el=driver.find_elements_by_css_selector('#product-account-list')
            driver.get('https://www.packtpub.com/packt/offers/free-learning')
            try:
                ''' clicking on Claim your free ebook '''
                bookNameEl_1 = driver.find_element_by_css_selector(
                    '.dotd-title > h2:nth-child(1)')
                isBookAlreadyAvailable = False
                bookName_1 = bookNameEl_1.text
                # Only claim the ebook if it is not in the owned list.
                for book in bookList:
                    if bookName_1 in book.bookName:
                        isBookAlreadyAvailable = True
                        break
                if not isBookAlreadyAvailable:
                    claimFreeEbookEl = driver.find_element_by_css_selector(
                        '.book-claim-token-inner > input:nth-child(3)')
                    claimFreeEbookEl.click()
            except Exception as e:
                print e
            # myEbook.click()
    except Exception as e:
        print e
    finally:
        print 'completed'
    print 'hi'
def spider(self, url, time=time):
    """Generator: drive Firefox through tianyancha (天眼查) search result
    pages and yield each page's HTML source.

    A clean pass through 5 result pages increments `num` and exits the
    retry loop; any exception keeps `num` at 0 and retries with a fresh
    browser. Calls: self.get_ip() (proxy "ip:port" string or falsy),
    self.exit(driver, xpath) (element-present check).
    """
    # loop counter: only incremented after a fully successful pass
    num = 0
    while 1:
        #f=xlrd.open_workbook('tianyancha.xlsx')
        # default proxy, replaced by self.get_ip() when available
        ip='61.161.46.179'
        port=8118
        proxies = self.get_ip()
        if proxies:
            items = re.findall('(.*?):(.*)', proxies)
            ip=items[0][0]
            port=items[0][1]
        firefox_options = webdriver.FirefoxOptions()
        ff_profile = webdriver.FirefoxProfile()
        # manual proxy configuration (type 1) for http/ssl/ftp
        ff_profile.set_preference("network.proxy.type", 1)
        ff_profile.set_preference("network.proxy.http", ip)
        ff_profile.set_preference("network.proxy.http_port", int(port))
        ff_profile.set_preference("network.proxy.ssl", ip)
        ff_profile.set_preference("network.proxy.ssl_port", int(port))
        ff_profile.set_preference("network.proxy.ftp", ip)
        ff_profile.set_preference("network.proxy.ftp_port", int(port))
        ff_profile.set_preference("general.useragent.override", "Mozilla/5.0 (Windows NT 6.1; Win64; x64)AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36")
        ff_profile.update_preferences()
        # open the browser here
        # NOTE(review): the configured profile/options are NOT passed below,
        # so the proxy settings above are currently unused -- confirm intent.
        # driver = webdriver.Firefox(firefox_options=firefox_options, firefox_profile=ff_profile)
        driver = webdriver.Firefox()
        wait=WebDriverWait(driver,15)
        try:
            driver.maximize_window()  # maximize the window
            driver.get(url)  # tianyancha listing of manufacturing companies
            time.sleep(5)  # 5s delay; `time` must be bound, not used bare in the function
            # wait until the "next page" element is clickable
            submit=wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR,'#searchHotelPanel > div.b_tool.clr_after > div.pager.js_pager > div > ul > li.item.next > a > span:nth-of-type(1)')))
            for i in range(5):
                '''for j in range(1, 3):
                    height = 20000 * j  # 每次滑动20000像素
                    strWord = "window.scrollBy(0," + str(height) + ")"
                    driver.execute_script(strWord)
                    time.sleep(2)'''
                selector=driver.page_source
                yield selector
                # open new window/tab (disabled experiment)
                # for i in range(10):
                #     driver.execute_script('window.open()')
                #     driver.switch_to_window(driver.window_handles[i + 1])
                #     driver.get('http://www.douban.com/')
                #     time.sleep(1)
                # dismiss the small banner pop-up when present
                e=self.exit(driver,'//*[@id="tyc_banner_close"]')
                if e==True:
                    print('有小弹窗')
                    driver.find_element_by_xpath('//*[@id="tyc_banner_close"]').click()
                else:
                    print('无弹窗')
                #s=driver.find_element_by_xpath('//*[@id="tyc_banner_close"]')
                '''if s!='':  # check whether the element exists
                    print('有小弹窗')
                    s.click()  # click next page
                else:
                    print('无弹窗')'''
                time.sleep(2)
                # click "next page"
                driver.find_element_by_css_selector('html body.font-bb49248c div#web-content.mt74 div.container.pt25 div.container-left div.search-block div.result-footer div ul.pagination li a.num.-next').click()
                time.sleep(5)
            num += 1
            driver.close()
        except Exception as e:
            print('出错了:',e)
            driver.close()
        # Exit decision: if we arrived here via the except branch, num was not
        # incremented, so keep looping; a clean pass sets num==1 and breaks.
        if num==1:
            print('正常退出浏览器')
            break
        print('非正常退出浏览器,继续访问')
def main(ip_of_wp, port_of_wp, admin_pwd):
    """Provision a WordPress instance with fake content via selenium.

    Installs the FakerPress / "Export All URLs" / application-passwords
    plugins, generates fake users/posts/comments, exports all URLs to a CSV
    under ./wp_csv_loc, creates a new application password and writes it to
    ../wordpress_setup/wordpress_api_pwd.txt.

    Python 2 code (print statements). Uses two browser instances (globals
    `driver` / `driver_two`) plus module-level helpers admin_login,
    install_pluggin, user_page_code, terms_page_code, posts_page_code,
    comments_page_code, export_urls_code, make_new_application_passwd.
    Returns: the new application password string.
    """
    #os.setuid(pwd.getpwnam(username).pw_uid) # ncessary b/c cannot run selenium as root
    global driver
    global driver_two
    try:
        os.makedirs('./wp_csv_loc')
    except OSError as e:
        print e
    # if the dictory already exists,then we want to clear it (to make the result easy to find)
    # (taken from: https://stackoverflow.com/questions/185936/how-to-delete-the-contents-of-a-folder-in-python)
    # file_name = None
    for file_name in os.listdir('./wp_csv_loc'):
        file_path = os.path.join('./wp_csv_loc', file_name)
        try:
            if os.path.isfile(file_path):
                os.unlink(file_path)
        except Exception as e:
            print(e)
            if e.errno != errno.EEXIST:
                raise
    options = Options()
    options.headless = True
    #options.add_argument('--no-sandbox')
    # from: https://selenium-python.readthedocs.io/faq.html (literally copy-pasted)
    fp = webdriver.FirefoxProfile()
    fp.set_preference("browser.download.folderList", 2)
    fp.set_preference("browser.download.manager.showWhenStarting", False)
    fp.set_preference("browser.download.dir", os.getcwd() + '/wp_csv_loc/')
    # NOTE(review): the same preference is set twice; the second call
    # overwrites the first, so only "text/csv" is auto-saved -- confirm.
    fp.set_preference("browser.helperApps.neverAsk.saveToDisk",
                      "application/octet-stream")
    fp.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/csv")
    fp.accept_untrusted_certs = True
    driver = webdriver.Firefox(fp, options=options)
    driver_two = webdriver.Firefox(fp, options=options)
    admin_login(admin_pwd, driver)
    admin_login(admin_pwd, driver_two)
    time.sleep(5)
    # okay, first install fakerpress
    # '''
    page_about_fakerpress = 'https://' + ip_of_wp + ':' + port_of_wp + '/wp-admin/plugin-install.php?tab=plugin-information&plugin=fakerpress&TB_iframe=true&height=-34%22&width=772'
    # print driver.page_source.encode("utf-8")
    driver.get(page_about_fakerpress)
    time.sleep(5)
    install_pluggin()
    time.sleep(10)
    # then install "Export All URLs"
    page_about_export_all_urls = 'https://' + ip_of_wp + ':' + port_of_wp + '/wp-admin/plugin-install.php?tab=plugin-information&plugin=export-all-urls&TB_iframe=true&width=772&height=627'
    driver.get(page_about_export_all_urls)
    install_pluggin()
    time.sleep(10)
    # then install "app_pass"
    page_about_app_pass = '******' + ip_of_wp + ':' + port_of_wp + '/wp-admin/plugin-install.php?tab=plugin-information&plugin=application-passwords&TB_iframe=true&width=772&height=627'
    driver.get(page_about_app_pass)
    install_pluggin()
    # '''
    # now generate the fake data using fakerpress
    max_num = 'fakerpress-field-qty-max'
    max_num_class = 'fp-field fp-type-number fp-size-tiny'
    global min_num
    min_num = 'fakerpress-field-qty-min'
    drop_down_id = 's2id_fakerpress-field-meta-type'
    # '''
    # this is good.
    user_page = 'https://' + ip_of_wp + ':' + port_of_wp + '/wp-admin/admin.php?page=fakerpress&view=users'
    driver.get(user_page)
    #print(driver.page_source)
    user_page_code()
    time.sleep(170)
    # NOTE(review): the bare triple-quoted strings below are leftover
    # comment-toggle markers; they are no-op string expressions as written.
    '''
    #'''
    # this is good.
    terms_page = 'https://' + ip_of_wp + ':' + port_of_wp + '/wp-admin/admin.php?page=fakerpress&view=terms'
    driver.get(terms_page)
    terms_page_code()
    time.sleep(60)
    '''
    #'''
    # this is good
    post_page = 'https://' + ip_of_wp + ':' + port_of_wp + '/wp-admin/admin.php?page=fakerpress&view=posts'
    driver.get(post_page)
    posts_page_code()
    time.sleep(300)
    # '''
    # '''
    # this is good
    comments_page = 'https://' + ip_of_wp + ':' + port_of_wp + '/wp-admin/admin.php?page=fakerpress&view=comments'
    driver.get(comments_page)
    comments_page_code()
    time.sleep(300)
    # '''
    export_all_urls_page = 'https://' + ip_of_wp + ':' + port_of_wp + '/wp-admin/options-general.php?page=extract-all-urls-settings'
    # ''' This is good
    driver.get(export_all_urls_page)
    # export_urls_code(driver_two)
    # run the URL export in a background thread while this thread continues
    thread = threading.Thread(target=export_urls_code, args=(driver, ))
    thread.start()
    print "will now sleep for 15"
    time.sleep(15)
    print "done sleeping"
    # '''
    new_pdw = make_new_application_passwd(driver_two)
    time.sleep(35)
    # todo: first finish loading wp (via fakerpress) -- okay, I think this might be done (just gotta test it...)
    # then modify so that passwd is a cmd line arg -- okay, I think this might be done (just gotta test it...)
    # then make it run on cloudlab <----- start from here
    # then modify so called before run_experiment (should be easy...)
    # and needs to modify wordpress_background to take the passwd from this function as a cmdline argument...
    # might need to write to a file or something... (b/c gets wierd with python scripting)
    # might need to get tricky but shouldn't be too bad either...
    driver.close()
    driver_two.close()
    # let's also return the name of the resulting csv folder...
    folders_in_csv_path = os.listdir('./wp_csv_loc')
    print "folders_in_csv_path", folders_in_csv_path
    path_to_csv_file = './wp_csv_loc/' + folders_in_csv_path[0]
    # replace any stale copies of the exported user CSV
    try:
        os.remove("../" + "wordpress_users.csv")
    except OSError:
        pass
    try:
        os.remove("../wordpress_setup/" + "wordpress_users.csv")
    except OSError:
        pass
    shutil.copy(path_to_csv_file, "../" + "wordpress_users.csv")
    shutil.copy(path_to_csv_file, "../wordpress_setup/" + "wordpress_users.csv")
    with open('../wordpress_setup/wordpress_api_pwd.txt', 'w') as f:
        f.write(new_pdw)
    with open('../wordpress_setup/failures_list.txt', 'w') as f:
        f.write('')
    return new_pdw  # , path_to_csv_file
def start_browser(self): try: self.proxy = self.server.create_proxy() except Exception as e: print("Browser " + str(self.id) + ": Proxy server is offline: ", e) try: self.barrier.wait(3 * self.timeout) except BrokenBarrierError: print( "Browser " + str(self.id) + ": Timed out waiting for a browser", e) exit(1) self.proxy = self.server.create_proxy() self.proxy.timeouts = { 'request': 5, 'read': 5, 'connection': 5, 'dns': 5 } self.profile = webdriver.FirefoxProfile() # Download files self.profile.set_preference("browser.download.folderList", 2) self.profile.set_preference("browser.download.dir", self.temp_dir) # A comma-separated list of MIME types to save to disk without asking # what to use to open the file self.profile.set_preference( "browser.helperApps.neverAsk.saveToDisk", "application/x-msexcel," + "application/excel," + "application/x-excel," + "application/vnd.ms-excel," + "application/pdf," + "application/msword," + "application/xml," + "application/octet-stream," + "image/png," + "image/jpeg," + "text/html," + "text/plain," + "text/csv") # Do not show the Download Manager self.profile.set_preference( "browser.download.manager.showWhenStarting", False) self.profile.set_preference( "browser.download.manager.focusWhenStarting", False) self.profile.set_preference("browser.download.manager.useWindow", False) self.profile.set_preference( "browser.download.manager.showAlertOnComplete", False) self.profile.set_preference("browser.download.manager.closeWhenDone", False) # Do not ask what to do with an unknown MIME type self.profile.set_preference("browser.helperApps.alwaysAsk.force", False) self.profile.set_proxy(self.proxy.selenium_proxy()) self.driver = webdriver.Firefox(firefox_profile=self.profile) self.driver.set_page_load_timeout(self.timeout)
# -*- coding: utf-8 -*- from selenium import webdriver import urllib2 import os import time adblockfile = 'c:/Users/julio/Downloads/adblock_plus-2.6.11-sm+tb+fx+an.xpi' ffprofile = webdriver.FirefoxProfile("C:/Users/julio/AppData/Local/Mozilla/Firefox/Profiles") ffprofile.add_extension(adblockfile) driver = webdriver.Firefox(ffprofile) mainpage = "http://en.dm5.com/manhua-yiquanchaoren/" base = "http://en.dm5.com" driver.get(mainpage) #%% chapters = driver.find_elements_by_xpath("//ul[@id='cbc_3']/li/a") chapter_links = [] for chapter in chapters: chapter_links.append(chapter.get_attribute('href')) #len(chapter_links) #chapter_links = chapter_links[0:2] #%% for chapter_link in chapter_links: driver.get(chapter_link) driver.execute_script( "window.onbeforeunload = function(e){};" ) # turn off all js title = driver.find_element_by_xpath("//div[@class='view_bt']/h1").text chapter_id = title.replace(u"一拳超人","").replace(u"原作版","").replace(u"话","") pages = driver.find_elements_by_xpath("//select[@id='pagelist']/option") option_value = pages[0].get_attribute('value').replace("-p1/","") for x in range(1, len(pages) + 1):
def get_profile(): profile = webdriver.FirefoxProfile() profile.set_preference("browser.privatebrowsing.autostart", True) profile.update_preferences() return profile
# '/usr/bin/google-chrome') driver_arguments['executable_path'] = chromedriver # Travis-CI uses OpenVZ containers which are incompatible with the sandbox # technology. # See https://code.google.com/p/chromium/issues/detail?id=31077 for more # information. if 'TRAVIS' in os.environ: driver_arguments['chrome_options'].add_argument('--no-sandbox') driver_arguments['chrome_options'].add_argument( '--disable-setuid-sandbox') driver_arguments['chrome_options'].add_argument( '--allow-sandbox-debugging') elif args.browser == "Firefox": driver_arguments['firefox_profile'] = webdriver.FirefoxProfile() # Firefox will often pop-up a dialog saying "script is taking too long" or # similar. So we can notice this problem we use "accept" rather then the # default "dismiss". webdriver.DesiredCapabilities.FIREFOX[ "unexpectedAlertBehaviour"] = "accept" elif args.browser == "PhantomJS": driver_arguments['executable_path'] = phantomjs driver_arguments['service_args'] = ['--remote-debugger-port=9000'] elif args.browser == "Remote": driver_arguments['command_executor'] = args.remote_executor for arg in args.remote_caps: if not arg.strip():
from selenium.webdriver.firefox.options import Options from selenium.webdriver.common.action_chains import ActionChains import time import psycopg2 from sqlalchemy import create_engine from collections import defaultdict import re from sklearn.metrics.pairwise import cosine_similarity from sklearn.preprocessing import StandardScaler import pickle import matplotlib.pyplot as plt #GRAB ALL STATES FROM THE WORLD WIDE WEB options = Options() options.set_headless(True) firefox_profile = webdriver.FirefoxProfile() firefox_profile.set_preference("browser.privatebrowsing.autostart", True) browser = webdriver.Firefox(options=options, firefox_profile=firefox_profile, executable_path='/usr/local/bin/geckodriver') url = 'https://alphabetizer.flap.tv/lists/list-of-states-in-alphabetical-order.php' browser.get(url) page_content = BeautifulSoup(browser.page_source, 'html.parser') scrape_results = page_content.findAll('li') states = [] for res in scrape_results: states.append(res.text.replace(' ', '-')) states.append('washington-dc') ##THIS IS FOR DATA IMPORT AND SQL EXPORT, IT DOES NOT NEED TO BE RUN AGAIN exporter = DatabaseExport('az_trail_recommender')
def crawler(self):
    """Scrape recent Instagram posts for the tag 무신사 (Musinsa).

    Walks the tag feed post-by-post (up to self.count posts), collecting
    poster id, caption, like count, and hashtags from both the caption and
    the poster's own comments, then aggregates hashtag frequencies.

    Returns a dict with 'crawlingList' (run summary), 'tagList' (hashtag
    counts sorted descending) and 'excelList' (raw per-post rows).
    """
    url = "https://www.instagram.com/explore/tags/무신사/"
    # list collecting the hashtags found in each post
    tagList = []
    # number of posts processed (also used to re-seek after navigation errors)
    pagedowns = 0
    # dict(hashtag,cnt)
    hashtag = {}
    # per-post rows for spreadsheet export
    feedList = []
    # return payload
    returnList = {}
    # crawl-summary data
    crawlingList = {}
    # Chrome option setup (disabled -- Firefox is used below)
    # options = webdriver.ChromeOptions()
    # print(options)
    # # headless mode
    # options.add_argument('headless')
    # options.add_argument('window-size=1920x1080')
    # options.add_argument('disable-gpu')
    # # headless-detection avoidance: user agent / language overrides
    # options.add_argument("user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36")
    # options.add_argument("lang=ko_KR")  # Korean!
    # print(options)
    # driver = webdriver.Chrome('chromedriver',chrome_options=options)
    # navigator spoofing so the environment looks like a real browser (disabled):
    #driver.execute_script("Object.defineProperty(navigator, 'plugins', {get: function() {return[1, 2, 3, 4, 5];},});")
    # language
    #driver.execute_script("Object.defineProperty(navigator, 'languages', {get: function() {return ['ko-KR', 'ko']}})")
    # fake the blocked GPU rendering info for disguise
    #driver.execute_script("const getParameter = WebGLRenderingContext.getParameter;WebGLRenderingContext.prototype.getParameter = function(parameter) {if (parameter === 37445) {return 'NVIDIA Corporation'} if (parameter === 37446) {return 'NVIDIA GeForce GTX 980 Ti OpenGL Engine';}return getParameter(parameter);};")
    # the browser starts and navigates to the url below
    # Firefox options: route through a local SOCKS proxy
    profile = webdriver.FirefoxProfile()
    profile.set_preference("network.proxy.type", 1)
    profile.set_preference("network.proxy.socks", "127.0.0.1")
    # NOTE(review): this overwrites "network.proxy.type" set just above; it
    # was presumably meant to be "network.proxy.socks_port" (Tor's 9150) --
    # confirm.
    profile.set_preference("network.proxy.type", 9150)
    profile.set_preference('general.useragent.override', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:65.0) Gecko/20100101 Firefox/65.0')
    profile.update_preferences()
    options = webdriver.FirefoxOptions()
    options.add_argument("--headless")
    try:
        driver = webdriver.Firefox(executable_path='/crawler/repo/blog/geckodriver.exe',firefox_profile=profile,firefox_options=options)
    except WebDriverException:
        # NOTE(review): `webdriver` is the selenium module and has no
        # close(); this handler itself would raise AttributeError.
        webdriver.close()
    # start time of the run
    start = datetime.datetime.now().strftime("%Y_%m_%d %H:%M:%S")
    print(start)
    driver.get(url)
    # wait for page resources
    driver.implicitly_wait(1)
    # total post count for the tag, located by class name
    ttlFeed = WebDriverWait(driver,5).until(EC.presence_of_element_located((By.CLASS_NAME,"g47SY")))
    print("총 게시물:", ttlFeed.text)
    # find the body tag by tag name
    time.sleep(1)
    # click the first post on the page
    driver.find_elements_by_class_name("eLAPa")[0].click()
    # failCnt
    failCnt = 0
    count = self.count
    # start scraping
    while pagedowns < count:
        # wait after the page call
        #driver.implicitly_wait(5)
        # post body
        try:
            post = WebDriverWait(driver,20).until(EC.presence_of_element_located((By.CLASS_NAME,"C4VMK")))
            # expand the comment list when the "load more" button exists
            try:
                driver.find_element_by_class_name('XQXOT').send_keys(Keys.HOME)
                driver.find_element_by_class_name('XQXOT').find_element_by_xpath("//ul/li/div/button").click()
                driver.find_element_by_class_name('XQXOT').send_keys(Keys.HOME)
            except (NoSuchElementException,ElementNotInteractableException):
                pass
            # caption max 160 chars; up to 30 comments;
            # a single hashtag max 100 chars
            #id = driver.execute_script("document.body.getElementsByClassName('C4VMK')[0].getElementsByTagName('a')[0].innerText")
            #content = driver.execute_script("document.body.getElementsByClassName('C4VMK')[0].getElementsByTagName('span')[0].innerText")
            req = driver.page_source
            soup = BeautifulSoup(req,'html.parser')
            # C4VMK blocks: [0] is the caption, the rest are comments
            replyCount = soup.find_all("div",class_="C4VMK")
            tagCount = replyCount[0].select('span>a')
            id = replyCount[0].find_all(class_="_6lAjh")[0].select("a")[0].text
            content = replyCount[0].select('span')[0].text
            like = '0'
            tags=[]
            feedRow = {}
            # like count lives in one of two layouts (Nm9Fw or vcOH2)
            try:
                #like = driver.find_element_by_class_name("Nm9Fw").find_element_by_tag_name("span").text
                like = soup.find_all("div",class_="Nm9Fw")[0].select("span")[0].text
            except (NoSuchElementException,IndexError):
                try:
                    like = soup.find_all("span",class_="vcOH2")[0].select("span")[0].text
                except IndexError:
                    pass
            # data cleaning: strip emoji from the caption
            emoji_pattern = re.compile("[\U00010000-\U0010ffff]", flags=re.UNICODE)
            content = emoji_pattern.sub('',content)
            # TODO: after the tag logic, add a positive/negative check method
            # hashtags in the caption
            if len(tagCount) > 0:
                for i in range(0,len(tagCount)):
                    tag = tagCount[i].text
                    if "#" in tag:
                        tag = tag.replace("#","").replace(" ","")
                        tags.append(tag)
            # hashtags in comments (only comments written by the poster)
            if len(replyCount) > 0:
                for i in range(1,len(replyCount)):
                    #replyid = "document.body.getElementsByClassName('C4VMK')["+i+"].getElementsByTagName('a')[0].innerText"
                    replyid = replyCount[i].find_all("a")[0].text
                    if id == replyid:
                        #replyTagCount = driver.execute_script("document.body.getElementsByClassName('C4VMK')["+i+"].getElementsByTagName('span')[0].getElementsByTagName('a').length")
                        replyTagCount = replyCount[i].find_all("a")
                        if len(replyCount) > 1:
                            for j in range(0,len(replyTagCount)):
                                #reply = driver.execute_script("document.body.getElementsByClassName('C4VMK')["+i+"].getElementsByTagName('span')[0].getElementsByTagName('a')["+j+"].innerText")
                                reply = replyTagCount[j].text
                                if "#" in reply:
                                    reply = reply.replace("#","").replace(" ","")
                                    tags.append(reply)
            # de-duplicate
            tags = list(set(tags))
            tagList.append(tags)
            print("=======================================================================================")
            print("====================================pagedowns : ",pagedowns,"====================================")
            print("=======================================================================================")
            print("id===============================",id)
            print("content==========================",content)
            print("like=============================",like)
            print("finaltag=========================",tags)
            feedRow["id"] = id
            feedRow["content"] = content
            feedRow["tag"] = tags
            feedRow["like"] = like
            feedList.append(feedRow)
            time.sleep(1)
            # click through to the next post
            try:
                driver.find_element_by_class_name("HBoOv").click()
            except NoSuchElementException:
                # wait for page resources, then re-seek to the current post
                driver.get(url)
                driver.implicitly_wait(1)
                for i in range(0,pagedowns):
                    driver.find_elements_by_class_name("eLAPa")[0].click()
            #html = driver.find_element_by_tag_name("html")
            #html.send_keys(Keys.DOWN)
            pagedowns += 1
            print("=======================================================================================")
            print("=======================================================================================")
        except (NoSuchElementException,StaleElementReferenceException,TimeoutException):
            failCnt += 1
            print("=======================================================================================")
            print("====================================failcount : ",failCnt,"=====================================")
            print("=======================================================================================")
            # after repeated failures, force a next-post click and back off
            if failCnt > 3:
                driver.find_element_by_class_name("HBoOv").click()
                time.sleep(120)
            pass
    print("끝!!")
    # re-assign with per-post duplicate hashtags removed
    tagList = list([tuple(set(tag)) for tag in tagList])
    # count hashtag occurrences
    for htags in tagList:
        for htag in htags:
            # bump the count for this hashtag
            if not (htag in hashtag):
                hashtag[htag] = 1
            else:
                hashtag[htag] += 1
    # sort by count, descending
    keys = sorted(hashtag.items(), key = lambda x:x[1], reverse = True)
    # print the top n
    for k, v in keys[:15]:
        print("{}({})".format(k, v))
    end = datetime.datetime.now().strftime("%Y_%m_%d %H:%M:%S")
    print("start======",start)
    print("end======",end)
    print("enddivision=========",datetime.datetime.strptime(end,"%Y_%m_%d %H:%M:%S")-datetime.datetime.strptime(start,"%Y_%m_%d %H:%M:%S"))
    # result = pd.DataFrame(feedList)
    # result.columns = ['id','content','tag','like']
    # result.head()
    # close web resources
    # NOTE(review): missing parentheses -- this accesses the bound method
    # without calling it, so the browser is never actually closed here.
    driver.close
    crawlingList["ttlfeed"] = ttlFeed.text
    crawlingList["crwfeed"] = len(tagList)
    crawlingList["succnt"] = pagedowns
    crawlingList["failcnt"] = failCnt
    crawlingList["created_at"] = start
    crawlingList["updated_at"] = end
    crawlingList["working_while"] = str(datetime.datetime.strptime(end,"%Y_%m_%d %H:%M:%S")-datetime.datetime.strptime(start,"%Y_%m_%d %H:%M:%S"))
    returnList["crawlingList"] = crawlingList
    returnList["tagList"] = keys
    returnList["excelList"] = feedList
    return returnList
def run(x):
    """Scrape foerderportal.bund.de search results for municipality `x` and
    download every Verbund project's CSV into Output\\<x>\\, finally merging
    them with the Windows `copy` command.

    Runs under the module-level semaphore `sema`; uses module globals
    `umlautdict` (umlaut transliteration map) and `options`
    (bundesland / lfdvorhaben / laufzeit settings). Windows-only paths.
    """
    save_location = x
    currentdir = os.getcwd()
    #sema = threading.BoundedSemaphore(maxthreads)
    sema.acquire()
    # transliterate umlauts so the name is filesystem-safe
    for item in umlautdict.keys():
        save_location = save_location.replace(item, umlautdict[item])
    path = currentdir + '\\Output\\' + str(save_location) + "\\"
    #print(path)
    # Set Firefox preferences so that the file automatically saves to disk when downloaded
    if not os.path.exists('Output'):
        os.makedirs('Output')
    fp = webdriver.FirefoxProfile()
    fp.set_preference("browser.preferences.instantApply", True)
    fp.set_preference(
        "browser.helperApps.neverAsk.saveToDisk",
        "text/plain, application/octet-stream, application/binary, text/csv, application/csv, application/excel, text/comma-separated-values, text/xml, application/xml"
    )
    fp.set_preference("browser.helperApps.alwaysAsk.force", False)
    fp.set_preference("browser.download.manager.showWhenStarting", False)
    fp.set_preference("browser.download.folderList", 2)
    fp.set_preference("browser.download.dir",
                      currentdir + "\\Output\\" + str(save_location) + "\\")
    fp.set_preference("browser.download.downloadDir",
                      currentdir + "\\Output\\" + str(save_location) + "\\")
    fp.set_preference("browser.download.defaultFolder",
                      currentdir + "\\Output\\" + str(save_location) + "\\")
    driver = webdriver.Firefox(firefox_profile=fp)
    driver.get(
        "https://foerderportal.bund.de/foekat/jsp/SucheAction.do?actionMode=searchmask"
    )
    # elem.clear()
    # bundesland.send_keys(options.bundesland)
    # add one extra bundesland input row per additional selection
    for i in range(len(options.bundesland) - 1):
        driver.find_element_by_css_selector(
            '#gemeindeZeile > td:nth-child(7) > input:nth-child(1)').click()
    for i in range(len(options.bundesland)):
        driver.find_element_by_css_selector(
            f'#suche_bundeslandSuche_{i}_').send_keys(options.bundesland[i])
    if options.lfdvorhaben == False:
        driver.find_element_by_css_selector('#suche_lfdVhbN').click()
    # restrict the search to Verbund (joint) projects
    driver.find_element_by_css_selector('#suche_nurVerbundJ').click()
    submit_button = driver.find_element_by_css_selector(
        "#suche_general_search")
    driver.find_element_by_css_selector("#suche_gemeindeSuche_0_").send_keys(x)
    # bundesland = driver.find_element_by_css_selector('#suche_bundeslandSuche_0_')
    driver.find_element_by_css_selector(
        '#suche_laufzeitVonSuche_0_').send_keys(options.laufzeit)
    submit_button.click()
    # total result count, parsed out of the heading text
    items = driver.find_element_by_css_selector(
        ".content_background_outer > h1:nth-child(3)").text  #cosmetics
    items = int("".join(filter(str.isdigit, str(items))))  #cosmetics
    with tqdm(total=items) as progress_bar:
        # result-page selector drives the outer pagination loop
        select = Select(
            WebDriverWait(driver, 100).until(
                EC.element_to_be_clickable(
                    (By.ID, 'listselect_suche_listrowfrom'))))
        progress = len(select.options)
        for index in range(len(select.options)):
            # re-fetch: the select goes stale after each navigation
            select = Select(
                WebDriverWait(driver, 5).until(
                    EC.element_to_be_clickable(
                        (By.ID, 'listselect_suche_listrowfrom'))))
            select.select_by_index(index)
            percentage = (index / progress) * 100
            # print ("" + sys.argv[1] + " "+ str(percentage) + "%")
            # verbundprojekte = len(driver.find_elements_by_partial_link_text('J'))
            verbundprojekte = len(
                driver.find_elements_by_css_selector(
                    "[title^='Detailansicht von Förderkennzeichen']"))
            #print (verbundprojekte)
            for index in range((verbundprojekte)):
                progress_bar.update(1)  # update progress
                try:
                    #counter = counter + 1
                    #print(str(counter) + "/" + "max." + str((progress*10)))
                    #link = WebDriverWait(driver, 5).until(EC.visibility_of_all_elements_located((By.PARTIAL_LINK_TEXT, 'J')))[index].click()
                    WebDriverWait(driver, 5).until(
                        EC.visibility_of_all_elements_located(
                            (By.CSS_SELECTOR,
                             "[title^='Detailansicht von Förderkennzeichen']"
                             )))[index].click()  # Detailansicht
                    WebDriverWait(driver, 5).until(
                        EC.element_to_be_clickable(
                            (By.CSS_SELECTOR, '#sucheVerbund > a:nth-child(3)'
                             ))).click()  #verbundliste
                    WebDriverWait(driver, 5).until(
                        EC.element_to_be_clickable(
                            (By.CSS_SELECTOR,
                             'li.nobreak_hz:nth-child(4)'))).click()  #download
                    #breakpoint()
                    # back out of the 3-deep detail navigation
                    driver.execute_script("window.history.go(-3)")
                    # retry until the result list is interactable again
                    attempts = 0
                    while attempts < 5:
                        try:
                            select = Select(
                                WebDriverWait(driver, 100).until(
                                    EC.element_to_be_clickable(
                                        (By.ID,
                                         'listselect_suche_listrowfrom'))))
                            attempts += 1
                        except StaleElementReferenceException as ex:
                            select = Select(
                                WebDriverWait(driver, 100).until(
                                    EC.element_to_be_clickable(
                                        (By.ID,
                                         'listselect_suche_listrowfrom'))))
                            #print ("stale")
                            continue
                        except NoSuchElementException as ex:
                            # print("NoSuchElement " + str(ex))
                            continue
                except TimeoutException as ex:
                    # print("Timeout " + str(ex))
                    # driver.back()
                    # driver.back()
                    driver.execute_script("window.history.go(-2)")
                    continue
                except NoSuchElementException as ex:
                    # print("NoSuchElement " + str(ex))
                    driver.execute_script("window.history.go(-2)")
                    continue
            select = Select(
                WebDriverWait(driver, 5).until(
                    EC.element_to_be_clickable(
                        (By.ID, 'listselect_suche_listrowfrom'))))
    # Windows shell merge of all per-project CSVs
    subprocess.call('copy *.csv merged.csv', shell=True, cwd=path)
    driver.close()
    sema.release()
def get_default_firefox_options(): firefoxOptions = webdriver.FirefoxOptions() firefoxOptions.setProfile(webdriver.FirefoxProfile()) return firefoxOptions
except Exception as e: print(str(e)) video_id = re.findall('"videoId": "(.*?)"', str(respData)) video_title = re.findall('"title": "(.*?)"', str(respData)) video_files = [] for titles in video_title: titles += ".mp3" video_files.insert(len(video_files), titles) dictionary = dict(zip(video_files, video_id)) video_dl = [] for eachP in dictionary.values(): video_dl.insert(len(video_dl), 'https://www.youtubeinmp3.com/download/?video=https://www.youtube.com/watch?v=' + (str(eachP))) dictionary = dict(zip(video_files, video_dl)) failed_dl = [] fp = webdriver.FirefoxProfile(r"C:\Users\Shlok Khandelwal\AppData\Roaming\Mozilla\Firefox\Profiles\4hlau0sw.Selenium") driver = webdriver.Firefox(executable_path=r"C:\Users\Shlok Khandelwal\Desktop\geckodriver.exe", firefox_profile=fp) driver.set_page_load_timeout(60) videoCount = 0 while(len(dictionary)> 0): songsToDelete = [] for eachLink in dictionary.values(): try: alert = driver.switch_to_alert() alert.dismiss() except Exception as e: print("No alert") try: driver.get(eachLink)
''' HTML 旨在显示信息,而 XML 旨在传输信息。 XML 没有预定义的标签。XML 允许创作者定义自己的标签和自己的文档结构。 在 HTML 中使用的标签(以及 HTML 的结构)是预定义的。HTML 文档只使用在 HTML 标准中定义过的标签(比如 <p> 、<h1> 等等)。 ''' from selenium import webdriver import time profile_dictionary = R"C:\Users\R\AppData\Roaming\Mozilla\Firefox\Profiles\yjdic0n5.default" profile = webdriver.FirefoxProfile(profile_dictionary) driver = webdriver.Firefox(profile) driver.get("Http://www.baidu.com") # 1. xpath 属性定位 # 1)通过元素ID、class、name等属性定位 *代表任意标签 # driver.find_element_by_xpath("//*[@id='kw']").send_keys("python") # driver.find_element_by_xpath("//*[@class='s_ipt']").send_keys("python") # driver.find_element_by_xpath("//*[@name='wd']").send_keys("python") # 2)如果一个元素id、name、class属性都没有,这时候也可以通过其它属性定位到 # 3) xpath:标签 *代表任意标签;如果有具体标签直接写标签即可 # driver.find_element_by_xpath("//input[@autocomplete='off']").send_keys("python") # 4)xpath:层级 1.如果一个元素,它的属性不是很明显,无法直接定位到,这时候我们可以先找父元素2.再找下个层级就能定位到了。 # 要是其父属性也不是很明显,就找它父元素的父元素。 # driver.find_element_by_xpath("//form[@id='form']/span/input").send_keys("python") # 5)xpath:索引 如果一个元素和其兄弟元素tag相同,那么使用层级来定位,就需要索引指定。索引从1开始算起 # driver.find_element_by_xpath("//select[@id='nr']/option[1]").click() # driver.find_element_by_xpath("//select[@id='nr']/option[2]").click()
def setUpClass(cls): super().setUpClass() profile = webdriver.FirefoxProfile( os.path.join(cls.profilesDir, 'test')) cls.browser = webdriver.Firefox(profile)
help="List the groups you want to scrape for recent posts") parser.add_argument("-d", "--depth", action="store", dest="depth", default=5, type=int, help="How many recent posts you want to gather -- in multiples of (roughly) 8.") args = parser.parse_args() BROWSER_EXE = '/usr/bin/firefox' GECKODRIVER = '/usr/local/bin/geckodriver' FIREFOX_BINARY = FirefoxBinary(BROWSER_EXE) # Code to disable notifications pop up of Chrome Browser PROFILE = webdriver.FirefoxProfile() # PROFILE.DEFAULT_PREFERENCES['frozen']['javascript.enabled'] = False PROFILE.set_preference("dom.webnotifications.enabled", False) PROFILE.set_preference("app.update.enabled", False) PROFILE.update_preferences() class CollectPosts(object): """Collector of recent FaceBook posts. Note: We bypass the FaceBook-Graph-API by using a selenium FireFox instance! This is against the FB guide lines and thus not allowed. USE THIS FOR EDUCATIONAL PURPOSES ONLY. DO NOT ACTAULLY RUN IT. """
info = thread.xpath('.//span[@class="GIEUOX-DOQ"]') parsed['seen'] = int(info[1].text.split()[0]) parsed['posts'] = int(info[0].text.split()[0]) return parsed GOOGLE_GROUP_BASE = 'https://groups.google.com/forum/' # GOOGLE_GROUP_URL = GOOGLE_GROUP_BASE + '#!forum/{}' GOOGLE_GROUP_URL = GOOGLE_GROUP_BASE + '#!forum/{}/?hl=en' GROUP_URL = GOOGLE_GROUP_URL.format('nsndev') proxy = os.environ.get('http_proxy') if proxy: PROXY_HOST, PROXY_PORT = os.environ.get('http_proxy').split('//')[1].split(':') PROXY_PORT = int(PROXY_PORT) fp = webdriver.FirefoxProfile() fp.set_preference("network.proxy.type", 1) fp.set_preference("network.proxy.http", PROXY_HOST) fp.set_preference("network.proxy.http_port", PROXY_PORT) fp.set_preference("network.proxy.ftp", PROXY_HOST) fp.set_preference("network.proxy.ftp_port", PROXY_PORT) fp.set_preference("network.proxy.ssl", PROXY_HOST) fp.set_preference("network.proxy.ssl_port", PROXY_PORT) # fp.set_preference("general.useragent.override", "whater_useragent") fp.update_preferences() browser = webdriver.Firefox(firefox_profile=fp) else: browser = webdriver.Firefox() browser.implicitly_wait(30)
from selenium import webdriver from selenium.webdriver.firefox.options import Options from pyvirtualdisplay import Display from time import sleep display = Display(visible=0, size=(800, 600)) display.start() options = Options() options.headless = False _browser_profile = webdriver.FirefoxProfile() _browser_profile.set_preference("dom.webnotifications.enabled", False) driver = webdriver.Firefox(options=options, firefox_profile=_browser_profile, executable_path=r'/root/ytbot/geckodriver') try: print("loaded") driver.get("https://www.ytmonster.net/campaigns/views") user_name = driver.find_element_by_id('inputUsername') user_name.send_keys('vinay221097') password = driver.find_element_by_id('inputPassword') password.send_keys('Musha22@') login = driver.find_element_by_xpath( '/html/body/div[2]/div/div/div/div[1]/div/form/button') login.click() print("logged in successfully") sleep(3) driver.get("https://www.ytmonster.net/exchange/views") sleep(4) except Exception as e: print("error occured", e)