# Selenium demo: window/frame switching and wait strategies.
#
# Fixes over the original notes:
#  * `dr.switch_to_window(...)` / `dr.switch_to_frame(...)` are deprecated and
#    removed in Selenium 4 — replaced with the `dr.switch_to.*` API.
#  * The window/frame examples originally appeared before `dr` was created
#    (NameError); the script is reordered so the driver exists first.
from selenium import webdriver
from time import sleep
import selenium.webdriver.support.ui as ui

dr = webdriver.Firefox()
dr.get('http://www.moore.ren/')
# dr.maximize_window()

# --- Window handles and frames -------------------------------------------
wd = dr.window_handles           # all window handles
dr.switch_to.window(wd[-1])      # switch to the newest window
# dr.switch_to.frame('frame_name')  # switch into a named inner frame
dr.switch_to.default_content()   # back to the top-level page
# dr.switch_to.parent_frame()    # up one frame level (only valid inside a frame)

# --- Waits ----------------------------------------------------------------
# Hard wait: always blocks for the full duration.
sleep(2)

# Explicit wait: polls until the condition holds, raises TimeoutException
# if it is still false after 10 seconds.
wait = ui.WebDriverWait(dr, 10)
un = wait.until(lambda dr: dr.find_element_by_xpath(
    '/html/body/div[2]/div/div[2]/div[6]/div/a/img').is_displayed())
# is_displayed(): element is visible on screen; is_enabled(): not greyed out.
print(
    dr.find_element_by_xpath(
        '/html/body/div[2]/div/div[2]/div[6]/div/a/img').is_displayed())
dr.quit()
# _*_ coding: utf-8 _*_ from selenium import webdriver import selenium.webdriver.support.ui as ui import time print('----------------SYSTEM LOADIUNG, please wait........') SUMRESOURCES = 0 #全局变量 driver_item = webdriver.PhantomJS( executable_path= '/Users/sallyfan/downloads/phantomjs-2.1.1-macosx/bin/phantomjs') driver_detail = webdriver.Chrome( executable_path='/Users/sallyfan/downloads/chromedriver') url = "https://movie.douban.com/explore#!type=movie&tag=%E7%83%AD%E9%97%A8&sort=recommend&page_limit=20&page_start=0" #等待页面家在方法 wait1 = ui.WebDriverWait(driver_item, 15) wait = ui.WebDriverWait(driver_detail, 15) #获取url和文章标题 def getURL_Title(): global SUMRESOURCES # 需要输入获取信息,例 种类,排序,想看多少 print('please select:') kind = input( "1-Hot\n2-Newest\n3-Classics\n4-Playable\n5-High Scores\n6-Wonderful but not popular\n7-Chinese film\n8-Hollywood\n9-Korea\n10-Japan\n11-Action movies\n12-Comedy\n13-Love story\n14-Science fiction\n15-Thriller\n16-Horror film\n17-Cartoon\nplease select:" ) print("-----------------------------------------------") sort = input( "1-Sort by hot\n2-Sort by time\n2-Sort by score\nplease select:")
def get_music(self):
    """Worker loop: pull user ids from ``self.music_task``, scrape each
    user's NetEase Music listening-rank page, and push a result dict
    (``userId``, ``all_music``, ``week_music``, ``listen_num``) onto
    ``self.user_result_queue``.

    Runs forever; on driver failure it flags ``self.option_driver`` so the
    Firefox driver is recreated on the next iteration.
    """
    while True:
        if self.option_driver is False:
            self.driver_firefox()
            self.option_driver = True
        if self.driver is None:
            self.option_driver = False
            continue
        user_id = self.music_task.get()
        userId = user_id.strip()
        print('开始获取用户ID为:%s的歌曲。。。' % userId)
        try:
            self.driver.get(
                "http://music.163.com/user/songs/rank?id=%s" % userId
            )  # the user's listening-rank page; note: this id is the ranking id, not the plain user id
            self.driver.switch_to.frame(
                'g_iframe')  # switch from the window into the frame holding the song list
        except:
            # navigation failed — force a driver rebuild and retry the queue
            self.option_driver = False
            continue
        try:
            time.sleep(1)
            wait = ui.WebDriverWait(self.driver, 60)
            # locate the parent tag of the song list
            if wait.until(lambda driver: driver.find_element_by_class_name(
                    'g-bd')):
                WebDriverWait(self.driver, 10).until(
                    EC.presence_of_element_located(
                        (By.CLASS_NAME, "z-sel")))
                if self.driver.find_element_by_class_name(
                        'z-sel').text == '所有时间':
                    # "all time" tab already selected — parse it directly
                    soup = BeautifulSoup(self.driver.page_source,
                                         features='lxml')
                    listen_num, all_time = self.music_info(soup)
                    week_time = ''
                else:
                    # weekly tab is shown first; parse it, then click through
                    # to the all-time tab and parse that too
                    soup = BeautifulSoup(self.driver.page_source,
                                         features='lxml')
                    listen_num, week_time = self.music_info(soup)
                    self.driver.find_element_by_id('songsall').click()
                    time.sleep(1)
                    soup = BeautifulSoup(self.driver.page_source,
                                         features='lxml')
                    listen_num, all_time = self.music_info(soup)
            else:
                all_time, week_time, listen_num = '', '', '0'
            result = {
                'userId': user_id,
                'all_music': all_time,
                'week_music': week_time,
                'listen_num': listen_num
            }
            self.user_result_queue.put(result)
        except Exception as e:
            print(e)
            # fallback: at least try to recover the "songs listened" count
            # from the page header before reporting an empty result
            try:
                soup = BeautifulSoup(self.driver.page_source,
                                     features='lxml')
                listen_num = soup.find(
                    'div',
                    attrs={
                        'class': 'u-title u-title-1 f-cb m-record-title'
                    }).find('h4').string
                listen_num = re.findall(r'累积听歌(.*?)首',
                                        listen_num)[0].strip()
            except:
                listen_num = '0'
            result = {
                'userId': user_id,
                'all_music': '',
                'week_music': '',
                'listen_num': listen_num
            }
            self.user_result_queue.put(result)
def get_kindle_text(driver_type, book_title, first_page, last_page):
    """Log into the Kindle Cloud Reader, open *book_title*, and OCR pages
    *first_page*..*last_page* by screenshotting each page.

    Python 2 code (``print`` statement, ``xrange``).  ``driver_type`` is
    either ``'chrome'`` or ``'phantomjs'``.
    """
    if driver_type == 'chrome':
        path_to_chromedriver = '/Users/rwest/Downloads/chromedriver'  # change path as needed
        driver = webdriver.Chrome(executable_path=path_to_chromedriver)
    elif driver_type == 'phantomjs':
        # phantom js settings — spoof a desktop Chrome user agent so the
        # Cloud Reader serves the full UI
        dcap = dict(webdriver.DesiredCapabilities.PHANTOMJS)
        dcap["phantomjs.page.settings.userAgent"] = (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/45.0.2454.99")
        #path_to_phantomjs = '/Users/rwest/Desktop/phantomjs' # change path as needed
        path_to_phantomjs = 'phantomjs'
        driver = webdriver.PhantomJS(executable_path=path_to_phantomjs,
                                     desired_capabilities=dcap)
    url = 'https://www.amazon.com/ap/signin?openid.assoc_handle=amzn_kweb&openid.'\
        'return_to=https%3A%2F%2Fread.amazon.com%2F&openid.mode='\
        'checkid_setup&openid.ns=http%3A%2F%2Fspecs.openid.'\
        'net%2Fauth%2F2.0&openid.identity=http%3A%2F%2Fspecs.openid.'\
        'net%2Fauth%2F2.0%2Fidentifier_select&openid.claimed_id='\
        'http%3A%2F%2Fspecs.openid.net%2Fauth%2F2.0%2Fidentifier_select&'\
        'pageId=amzn_kcr'
    driver.get(url)
    # Fill out login form and submit
    email = driver.find_element_by_id("ap_email")
    password = driver.find_element_by_id("ap_password")
    email.send_keys("*****@*****.**")
    with open('kindle_credentials.txt', 'rb') as file:
        password_str = file.read()
    password.send_keys(password_str)
    driver.find_element_by_id("signInSubmit-input").click()
    # Wait until logged in then switch to KindleLibraryIFrame
    wait = ui.WebDriverWait(driver, 10)
    iFrame = wait.until(
        lambda driver: driver.find_element_by_id('KindleLibraryIFrame'))
    driver.switch_to.frame(iFrame)
    # close pop up message to use offline reader
    driver.find_element_by_id('kindle_dialog_firstRun_button').click()
    # select chosen book
    books = wait.until(
        lambda driver: driver.find_elements_by_class_name('book_title'))
    #books = driver.find_elements_by_class_name('book_title')
    for book in books:
        print book.text
        if book.text == book_title:
            book.click()
    # wait until book opened then switch to KindleReaderIFrame
    driver.switch_to.default_content()
    iFrame2 = wait.until(
        lambda driver: driver.find_element_by_id('KindleReaderIFrame'))
    driver.switch_to.frame(iFrame2)
    # select first page of book
    page_number = '2'

    def copy_page(page_number):
        # Navigate the reader's "go to" dialog to *page_number*, screenshot
        # the page, and OCR it via get_page_text().
        page_selector = wait.until(lambda driver: driver.find_element_by_id(
            'kindleReader_button_goto'))
        wait.until(lambda driver: driver.find_element_by_id(
            'kindleReader_button_goto'))
        hover = ActionChains(driver).move_to_element(
            page_selector)  # make button visible
        condition = ''
        # retry clicking until the toolbar button is actually visible
        while (condition == ''):
            try:
                page_selector.click()
            except ElementNotVisibleException:
                hover = ActionChains(driver).move_to_element(
                    page_selector)  # make button visible
            else:
                condition = 'passed'
        condition = ''
        # retry until the goto-location menu item is present
        while (condition == ''):
            try:
                driver.find_element_by_id(
                    'kindleReader_goToMenuItem_goToLocation').click()
            except NoSuchElementException:
                page_selector.click()
            else:
                condition = 'passed'
        enter_page_num = wait.until(lambda driver: driver.find_element_by_id(
            "kindleReader_dialog_gotoField"))
        enter_page_num.send_keys(page_number)
        buttons = driver.find_elements_by_class_name("ui-button")
        for b in buttons:
            if b.text == 'Go to location':
                b.click()
        time.sleep(7)  # let the page render before the screenshot
        driver.get_screenshot_as_file('temp.png')
        text = get_page_text('temp.png')
        os.remove('temp.png')
        return text

    results = []
    for i in xrange(first_page, last_page + 1):
        text = copy_page(page_number=str(i))
        results.append(text)
    print results
    # use this code to change page
    # NOTE(review): the triple-quoted block below is opened here and closed
    # past the end of this chunk.
    '''
def daily_task():
    """Daily scrape: walk the site's category sidebar, page through every
    product listing, fetch each product's detail page in a second headless
    Chrome, and write one CSV row per product (category, name, brand,
    availability, price, old price, date).  Raw HTML snapshots are saved via
    write_html() and the output is compressed at the end.

    Relies on module globals: DATE, BASE_URL, CHROME_DRIVER_PATH, and the
    helpers write_html(), write_csv(), compress_data().
    """
    global DATE
    DATE = str(datetime.date.today())
    chromeOptions = webdriver.ChromeOptions()
    prefs = {"profile.managed_default_content_settings.images":2}  # 2 = block images, faster loads
    chromeOptions.add_argument("--headless")
    chromeOptions.add_experimental_option("prefs",prefs)
    # browser walks listings; browser2 fetches individual product pages
    browser2 = webdriver.Chrome(chrome_options=chromeOptions,executable_path=CHROME_DRIVER_PATH)
    browser = webdriver.Chrome(chrome_options=chromeOptions,executable_path=CHROME_DRIVER_PATH)
    # browser2 = webdriver.Chrome(chrome_options=chromeOptions)
    # browser = webdriver.Chrome(chrome_options=chromeOptions)
    # browser = webdriver.Chrome()
    browser.set_window_position(400, 40)
    browser.set_window_size(1300, 1024)
    wait = ui.WebDriverWait(browser,60)
    wait2 = ui.WebDriverWait(browser,10)
    browser.get(BASE_URL)
    urls = []
    write_html(browser.page_source, "All_cat_")
    soup = BeautifulSoup(browser.page_source, 'lxml')
    main_list = soup.find('ul', class_='nav-verticalmenu').find_all('li')
    k=0
    # collect sub-category URLs; k>=1 break limits this to the first main category
    for main_item in main_list:
        href = BASE_URL + main_item.find('a').get('href')
        browser.get(href)
        if k >= 1:
            break
        soup = BeautifulSoup(browser.page_source, 'lxml')
        list = soup.find('ul', class_='listSidebar').find_all('li')
        for item in list:
            if item.find('span', class_='pull-right') == None:
                continue
            else:
                url = BASE_URL + item.find('a').get('href')
                if url not in urls:
                    urls.append(url)
        k+=1
    j=0
    while j < len(urls):
        print('Scraping', urls[j])
        browser.get(urls[j])
        soup = BeautifulSoup(browser.page_source, 'lxml')
        category_titles = soup.find('ol', class_='breadcrumb').find_all('li')
        # NOTE(review): both branches are identical — the length check is a no-op
        if len(category_titles) == 2:
            category = category_titles[1].find('span').text.strip()
        else:
            category = category_titles[1].find('span').text.strip()
        i=0
        pagination = True
        while pagination:
            if i != 0:
                # past the first page: find the pagination item after the
                # "active" one and navigate to it; any failure ends paging
                try:
                    wait2.until(lambda browser: browser.find_element_by_css_selector('#pagination > ul'))
                    elements = browser.find_elements_by_css_selector('#pagination > ul > li')
                    c=0
                    while c < len(elements)-1:
                        class_name = elements[c].get_attribute("class")
                        if "active" in class_name:
                            if len(elements)-2 >= c+1:
                                href_glob = elements[c+1].find_element_by_css_selector('a').get_attribute("href")
                                browser.get(href_glob)
                                c+=1
                                break
                            else:
                                pagination = False
                                c+=1
                                break
                        c+=1
                except NoSuchElementException:
                    pagination = False
                except TimeoutException:
                    pagination = False
                except:
                    pagination = False
                try:
                    soup = BeautifulSoup(browser.page_source, 'lxml')
                    list = soup.find('div', class_='product_list').find_all('div', class_='product_block')
                except:
                    pagination = False
            if i == 0:
                try:
                    soup = BeautifulSoup(browser.page_source, 'lxml')
                    list = soup.find('div', class_='product_list').find_all('div', class_='product_block')
                except:
                    pagination = False
            if pagination == False:
                break
            # print(len(list))
            # print(i+1)
            for item in list:
                if item.find('a', class_='product-name') == None:
                    continue
                else:
                    # open the product's detail page in the second browser
                    href = BASE_URL + item.find('a', class_='product-name').get('href')
                    browser2.get(href)
                    soup = BeautifulSoup(browser2.page_source, 'lxml')
                    if soup.find('h1', itemprop='name') != None:
                        product_name = soup.find('h1', itemprop='name').text.strip()
                    else:
                        product_name = None
                    # ---brand, (shown as Nhãn hiệu)
                    # ---availability (shown as Tình trạng)
                    # ---delivery fee, (if exists)
                    # ---1111111111product name,
                    # ---111111111price,
                    # ---1111111111old_price (previous price if exists),
                    # ---1111111111category (name of category),
                    # ---1111111111current date
                    # if item.find('div', class_='english_name') != None:
                    #     title_English = item.find('div', class_='english_name').text.strip()
                    # else:
                    #     title_English = None
                    # print("Title: " + title)
                    if soup.find('span', class_='price') != None:
                        price = soup.find('span', class_='price').text.strip()
                        # price = price.split('₫')[1]
                        # price = price.strip()
                    else:
                        price = None
                    if soup.find('span', class_='availability') != None:
                        availability = soup.find('span', class_='availability').text.strip()
                    else:
                        availability = None
                    # print("Price: " + str(price))
                    if soup.find('span', class_='product-price-old') != None:
                        old_price = soup.find('span',
                                              class_='product-price-old').text.strip()
                        # old_price = old_price.split('₫')[1]
                        # old_price = old_price.strip()
                    else:
                        old_price = None
                    brand = None
                    # availability = None
                    # brand is the first link in the description list
                    m_list = soup.find('ul', class_='description').find_all('li')[0]
                    brand = m_list.find('a').text.strip()
                    data = {'category': category,
                            'product_name': product_name,
                            'brand': brand,
                            'availability': availability,
                            'price': price,
                            'old_price': old_price,
                            'date': DATE}
                    write_csv(data)
            # snapshot the listing page we just processed
            file_name = str(j+1) + "_" + str(i+1) + "_"
            write_html(browser.page_source, file_name)
            i+=1
        # print(j)
        j+=1
    # Close browser
    browser.close()
    browser.service.process.send_signal(signal.SIGTERM)
    browser.quit()
    # Close browser
    browser2.close()
    browser2.service.process.send_signal(signal.SIGTERM)
    browser2.quit()
    compress_data()
def _wait_for_element(self, args):
    """Block until the element described by *args* is clickable.

    *args* must provide ``"find_by"`` (a locator strategy) and
    ``"find_text"`` (the locator value).  Raises a timeout error after
    ``self.timeout`` seconds if the element never becomes clickable.
    """
    locator = (args["find_by"], args["find_text"])
    clickable = expected_conditions.element_to_be_clickable(locator)
    ui.WebDriverWait(self.browser, self.timeout).until(clickable)
# encoding:utf-8 import requests import re import selenium.webdriver.support.ui as ui from bs4 import BeautifulSoup from selenium import webdriver from selenium.webdriver.common.by import By from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support import expected_conditions as EC brower = webdriver.Chrome()#选择打开的浏览器 wait = ui.WebDriverWait(brower, 10) # 设置浏览器最长的加载时间 #查找内容 def search(): try: brower.get('http://gou.jd.com/') # 打开URL地址 imput = wait.until( EC.presence_of_element_located((By.CSS_SELECTOR, '#inputkey'))# 查找输入框 元件的存在presence_of_element_located 使用CSS_SELECTOR选择器 ) submit = wait.until( EC.element_to_be_clickable((By.CSS_SELECTOR, '#search_2015 > div.search_box_2015.clearfix > a'))) # 点击按钮框 可以点击的元素element_to_be_clickable imput.send_keys('魅族手机') # 输入查找关键字 submit.click() # 点击查找 total = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#page > span > b'))) # 关于查找小辣椒手机所有的页数 return total.text except TimeoutError:#判定失败继续返回执行查找
def find_element_by_suffix(driver, key, val, buffer=None):
    """Interpret one scripted test step.

    ``key`` encodes the action and locator option as ``action_option``
    (e.g. ``click_xpath``); ``val`` carries comma-separated arguments.
    Results of ``get``/``getAttribute``/``getByfilter`` steps are stored in
    ``buffer[key]``.  Python 2 code (``except Exception, ex``).  Relies on
    module-level ``time_out``, ``find_element_by_syn``,
    ``find_locator_by_option``, ``WebsiteHelper``, ``VarFail``, ``VarAbort``.
    """
    try:
        function = key.split('_')[0]
        option = key.split('_')[-1]
        # 'syn' option means the value names a synthetic multi-step locator
        isSyn = lambda option: True if option == 'syn' else False
        if function == 'url':
            driver.get(val)
            #old_page = driver.find_element_by_tag_name('html')
            #ui.WebDriverWait(driver, time_out).until(EC.staleness_of(old_page))
        elif function == 'click':
            if isSyn(option):
                element = find_element_by_syn(driver, val.split(','))
            else:
                locator = find_locator_by_option(option, val)
                ui.WebDriverWait(driver, time_out).until(
                    EC.visibility_of_element_located(locator))
                element = EC.element_to_be_clickable(locator)(driver)
            element.click()
        elif function == 'sendkeys':
            # val = "<text>,<locator>" — first field is the text to type
            if isSyn(option):
                element = find_element_by_syn(driver, val.split(',')[1:])
            else:
                locator = find_locator_by_option(option, val.split(',')[-1])
                ui.WebDriverWait(driver, time_out).until(
                    EC.visibility_of_element_located(locator))
                element = EC.visibility_of_element_located(locator)(driver)
            element.send_keys(val.split(',')[0])
        elif function == 'none':
            # just assert the element becomes visible; no interaction
            if isSyn(option):
                find_element_by_syn(driver, val.split(','))
            else:
                locator = find_locator_by_option(option, val)
                ui.WebDriverWait(driver, time_out).until(
                    EC.visibility_of_element_located(locator))
                EC.visibility_of_element_located(locator)(driver)
        elif function == 'get':
            # store the element's text under this step's key
            if isSyn(option):
                element = find_element_by_syn(driver, val.split(','))
            else:
                locator = find_locator_by_option(option, val)
                ui.WebDriverWait(driver, time_out).until(
                    EC.visibility_of_element_located(locator))
                element = EC.visibility_of_element_located(locator)(driver)
            buffer[key] = element.text
        elif function == 'getAttribute':
            # val = "<attribute>,<locator>"
            if isSyn(option):
                element = find_element_by_syn(driver, val.split(','))
            else:
                locator = find_locator_by_option(option, val.split(',')[-1])
                ui.WebDriverWait(driver, time_out).until(
                    EC.visibility_of_element_located(locator))
                element = EC.visibility_of_element_located(locator)(driver)
            buffer[key] = element.get_attribute(val.split(',')[0])
        elif function == 'getByfilter':
            # val = "<python expression for a filter callable>,<locator>"
            if isSyn(option):
                element = find_element_by_syn(driver, val.split(','))
            else:
                locator = find_locator_by_option(option, val.split(',')[-1])
                ui.WebDriverWait(driver, time_out).until(
                    EC.visibility_of_element_located(locator))
                element = EC.visibility_of_element_located(locator)(driver)
            # NOTE(review): eval of script-provided text — only safe if the
            # step files are trusted input
            myfilter = eval(val.split(',')[0])
            buffer[key] = myfilter(element.text)
        elif function == 'verify':
            # val = "<expected text or $buffer_key>,<locator>"
            if isSyn(option):
                element = find_element_by_syn(driver, val.split(','))
            else:
                verify_context = ""
                # '$name' dereferences a previously stored buffer value
                if val.split(',')[0][0] == '$':
                    verify_context = buffer[val.split(',')[0][1:]]
                else:
                    verify_context = val.split(',')[0]
                locator = find_locator_by_option(option, val.split(',')[-1])
                ui.WebDriverWait(driver, time_out).until(
                    EC.visibility_of_element_located(locator))
                element = EC.visibility_of_element_located(locator)(driver)
                if verify_context not in element.text:
                    raise VarFail("Verify failed:" + verify_context)
        elif function == 'touch':
            if isSyn(option):
                element = find_element_by_syn(driver, val.split(','))
            else:
                touch = webdriver.TouchActions(driver)
                locator = find_locator_by_option(option, val)
                ui.WebDriverWait(driver, time_out).until(
                    EC.visibility_of_element_located(locator))
                element = EC.visibility_of_element_located(locator)(driver)
            # NOTE(review): `touch` is only bound on the non-syn path; the
            # syn branch would raise NameError here — confirm intent.
            touch.tap(element).perform()
        elif function == 'script':
            # val = "<js>[,<arg or $buffer_key>...]"
            js = val.split(',')[0]
            if len(val.split(',')) == 1:
                driver.execute_script(js)
            else:
                parms = map(lambda pa: buffer[pa[1:]] if pa[0] == '$' else pa,
                            val.split(',')[1:])
                driver.execute_script(js, parms)
        elif function == 'post':
            # val = "<account>,<password>,<method>[,<arg>...]"
            account, password, method = val.split(',')[:3]
            parms = map(lambda pa: buffer[pa[1:]] if pa[0] == '$' else pa,
                        val.split(',')[3:])
            # NOTE(review): eval-based dispatch on the method name
            eval("WebsiteHelper()()." + method)(*parms)
        elif function == 'alert':
            alert = driver.switch_to.alert
            eval("alert." + val + "()")
    except Exception, ex:
        # wrap any failure so callers can abort the whole scripted run
        raise VarAbort(ex.__str__())
def contracts(date):
    """Scrape the four NSE "OI spurts" contract tables (rise/rise,
    rise/slide, slide/rise, slide/slide) and append them as one sheet named
    "NSE CONTRACTS" to the existing workbook ``<date>.xlsx``.

    Relies on module-level: webdriver, time, ui, pd, os, load_workbook,
    and the page_is_loaded wait predicate.
    """
    print("NSE CONTRACTS")
    driver = webdriver.Chrome(r'E:\chromedriver.exe')
    #driver.get("https://www.zaubacorp.com/")
    driver.get("https://www.nseindia.com/products/content/equities/equities/oi_spurts.htm")
    time.sleep(4);
    wait = ui.WebDriverWait(driver, 10)
    wait.until(page_is_loaded)
    ##END OF UNDERLYNING
    button=driver.find_element_by_xpath('//*[@id="tab8"]')
    button.click()
    time.sleep(5)
    df1=pd.DataFrame()
    # table columns + three synthetic columns appended per row:
    # "Type of OI Spurts", current and previous business dates
    columns1=["Instrument","Symbol","Expiry","Strike Price","Type","LTP","Prev.Close","%Change in LTP",date+" OI","Jan24,2018 OI","OI Change","Volume in contracts","TurnOver in crores","Premium Turnover in crores","Underlyning Value","Type of OI Spurts","Current Business Date","Previous Business Date"]
    #s=['Rise in OI-Rise in Price','Rise in OI-Slide in Price','Slide in OI-Rise in Price','Slide in OI-Slide in Price']
    k=0
    ul=driver.find_elements_by_xpath('//*[@id="replacetext"]/div/ul')
    ul=['//*[@id="replacetext"]/div/ul/li[2]','//*[@id="replacetext"]/div/ul/li[3]','//*[@id="replacetext"]/div/ul/li[4]']
    time.sleep(5)
    row_count=len(driver.find_elements_by_xpath('//*[@id="replacetext"]/table/tbody/tr'))
    col_count=len(driver.find_elements_by_xpath('//*[@id="replacetext"]/table/tbody/tr[2]/td'))
    print (row_count)
    print (col_count)
    i=2
    d1={}
    # xpath template pieces: row i, column j of the results table
    first_str='//*[@id="replacetext"]/table/tbody/tr['
    second_str=']/td['
    third_str=']'
    print ("FIRST ")
    # table 1: Rise in OI - Rise in Price (rows start at tr[2])
    while i<=row_count:
        j=1
        while j<=col_count:
            final_str=first_str+str(i)+second_str+str(j)+third_str
            d1[columns1[j-1]]=driver.find_element_by_xpath(final_str).text
            j=j+1
        d1[columns1[j-1]]="Rise in OI-Rise in Price"
        j=j+1
        d1[columns1[j-1]]=date
        j=j+1
        d1[columns1[j-1]]='Jan24,2018'
        i=i+1
        df1=df1.append(d1,ignore_index=True)
    button=driver.find_element_by_xpath('//*[@id="riseinOIslideinPrice"]')
    button.click()
    time.sleep(5)
    print ("SECOND ")
    # table 2: Rise in OI - Slide in Price
    row_count=len(driver.find_elements_by_xpath('//*[@id="replacetext"]/table/tbody/tr'))
    col_count=len(driver.find_elements_by_xpath('//*[@id="replacetext"]/table/tbody/tr[2]/td'))
    i=2
    while i<=row_count:
        j=1
        while j<=col_count:
            final_str=first_str+str(i)+second_str+str(j)+third_str
            d1[columns1[j-1]]=driver.find_element_by_xpath(final_str).text
            j=j+1
        d1[columns1[j-1]]="Rise in OI-Slide in Price"
        j=j+1
        d1[columns1[j-1]]=date
        j=j+1
        d1[columns1[j-1]]='Jan24,2018'
        i=i+1
        df1=df1.append(d1,ignore_index=True)
    button=driver.find_element_by_xpath('//*[@id="slideinOIriseinPrice"]')
    button.click()
    time.sleep(5)
    print ("third")
    # table 3: Slide in OI - Rise in Price
    row_count=len(driver.find_elements_by_xpath('//*[@id="replacetext"]/table/tbody/tr'))
    col_count=len(driver.find_elements_by_xpath('//*[@id="replacetext"]/table/tbody/tr[2]/td'))
    i=2
    while i<=row_count:
        j=1
        while j<=col_count:
            final_str=first_str+str(i)+second_str+str(j)+third_str
            d1[columns1[j-1]]=driver.find_element_by_xpath(final_str).text
            j=j+1
        d1[columns1[j-1]]="Slide in OI-Rise in Price"
        j=j+1
        d1[columns1[j-1]]=date
        j=j+1
        d1[columns1[j-1]]='Jan24,2018'
        i=i+1
        df1=df1.append(d1,ignore_index=True)
    button=driver.find_element_by_xpath('//*[@id="slideinOIslideinPrice"]')
    button.click()
    time.sleep(5)
    print ("fourth")
    # table 4: Slide in OI - Slide in Price
    row_count=len(driver.find_elements_by_xpath('//*[@id="replacetext"]/table/tbody/tr'))
    col_count=len(driver.find_elements_by_xpath('//*[@id="replacetext"]/table/tbody/tr[2]/td'))
    i=2
    while i<=row_count:
        j=1
        while j<=col_count:
            final_str=first_str+str(i)+second_str+str(j)+third_str
            d1[columns1[j-1]]=driver.find_element_by_xpath(final_str).text
            j=j+1
        d1[columns1[j-1]]="Slide in OI-Slide in Price"
        j=j+1
        d1[columns1[j-1]]=date
        j=j+1
        d1[columns1[j-1]]='Jan24,2018'
        i=i+1
        df1=df1.append(d1,ignore_index=True)
    # NOTE(review): the file is opened only to resolve its real path and is
    # never closed — os.path.realpath(date + ".xlsx") would do without the
    # leaked handle.
    f = open(date + ".xlsx")
    st = os.path.realpath(f.name)
    path = st
    book = load_workbook(path)
    writer = pd.ExcelWriter(path, engine='openpyxl')
    writer.book = book
    df1.to_excel(writer,sheet_name="NSE CONTRACTS",columns=["Instrument","Symbol","Expiry","Strike Price","Type","LTP","Prev.Close","%Change in LTP",date+" OI","Jan24,2018 OI","OI Change","Volume in contracts","TurnOver in crores","Premium Turnover in crores","Underlyning Value","Type of OI Spurts","Current Business Date","Previous Business Date"])
    writer.save()
    writer.close()
    driver.close()
def is_visible(self, locator_type, locator, timeout=2):
    """Return True if the located element becomes visible within *timeout*
    seconds, False on timeout."""
    condition = EC.visibility_of_element_located((locator_type, locator))
    try:
        ui.WebDriverWait(self.driver, timeout).until(condition)
    except TimeoutException:
        return False
    return True
def is_clickable(self, locator_type, locator, timeout=2):
    """Return True if the located element becomes clickable within *timeout*
    seconds, False on timeout."""
    condition = EC.element_to_be_clickable((locator_type, locator))
    try:
        ui.WebDriverWait(self.driver, timeout).until(condition)
    except TimeoutException:
        return False
    return True
# Login automation: optionally starts a virtual display, opens the login
# page, submits the username, waits for the name span to populate, then
# submits the password.
# NOTE(review): credentials are hard-coded in this file — move them to a
# config/secret store.
import os
import os.path
import re
import unicodedata

disp = False  # set True to run under a headless virtual display
usuario = 'julioazt'
senha = '9517539'
if disp:
    display = Display(visible=0, size=(800, 600))
    aux = display.start()
chromedriver = "chromedriver"
os.environ["webdriver.chrome.driver"] = chromedriver
driver = webdriver.Chrome(chromedriver)
wait = ui.WebDriverWait(driver, 20)
url_login = '******'
driver.get(url_login)
# wait until the login control is present
wait.until(lambda driver: len(driver.find_elements_by_id("_CTL")) > 0)
time.sleep(1)
driver.find_element_by_id('_USR_LOGIN').send_keys(usuario)
driver.find_element_by_id('_CTL').click()
# wait until the user-name span has been filled in (login step 1 accepted)
wait.until(lambda driver: len(
    driver.find_element_by_id("span__USR_NOME").get_attribute('innerHTML')) > 0
)
driver.find_element_by_id('_CTL').send_keys(senha + Keys.ENTER)
time.sleep(5)


# Wait helper.  NOTE(review): truncated in this chunk — the lambda body
# continues past this view.
def espera():
    wait.until(lambda driver: ('display: none' in driver.find_element_by_id(
def get_main_data():
    # Scrape the USPTO PAIR bibliographic view plus its optional tabs
    # (PTA/PTE, correspondence, continuity, foreign priority) into one dict,
    # save it via the saver, and return it.
    #
    # NOTE(review): closure fragment — `driver`, `alias`, `self` and
    # `_normalize_date_str` come from the enclosing scope, which is outside
    # this chunk.
    table = driver.find_element_by_id("bibview")
    data = {"Alias": alias}
    # bib table alternates key cells and value cells
    for tr in table.find_elements_by_tag_name("tr"):
        tds = tr.find_elements_by_tag_name("td")
        even = False
        key = None
        for td in tds:
            if not even:
                key = td.text.strip()
            else:
                if key and td.text.strip() not in ["", "-"]:
                    data[key] = _normalize_date_str(
                        td.text.strip().replace(" all Inventors", ""))
            even = not even
    table = driver.find_element_by_id("bibviewTitle")
    td = table.find_elements_by_tag_name("td")
    data["Title"] = td[1].text.strip()
    try:
        print("ptaptetab")
        self._switch_page("javascript:submitTab('ptaptetab')")
        table = webDriverUi.WebDriverWait(driver, 20).until(
            (EC.presence_of_element_located([
                By.XPATH, "//table[@id='ptaptesummarytable' and "
                "@cellpadding='3']"
            ])))
        key = None
        for td in table.find_elements_by_tag_name("td"):
            if not key:
                key = td.text.strip()
            else:
                data[key] = _normalize_date_str(td.text.strip())
                key = None
    except SiteUsLinkNotFound:
        print("No tab Patent term Adjustments")
    try:
        print("Correspondencetab")
        self._switch_page("javascript:submitTab('Correspondencetab')")
        table = webDriverUi.WebDriverWait(driver, 20).until(
            (EC.presence_of_element_located(
                [By.XPATH, "//table[@id='correspondence']"])))
        key = None
        for td in table.find_elements_by_tag_name("td")[1:]:
            if not key:
                key = td.text.strip()
            else:
                data[key] = _normalize_date_str(td.text.strip())
                key = None
        data["Agent"] = "Name: " + data["Name:"] + "\n\n" + \
                        "Address:\n" + data["Address:"]
    except SiteUsLinkNotFound:
        print("No tab Address & Attorney/Agent")
    try:
        print("continuitytab")
        self._switch_page("javascript:submitTab('continuitytab')")
        table = webDriverUi.WebDriverWait(driver, 10).until(
            (EC.presence_of_element_located(
                [By.XPATH, "//table[@id='continuityparent']"])))
        try:
            con_data = []
            keys = []
            for td in table.find_elements_by_tag_name("th"):
                keys.append(td.text.strip())
            for tr in table.find_elements_by_id("parentdata0"):
                key = None
                value = ""
                one = {
                    "Alias": alias,
                }
                con_data.append(one)
                i = 0
                for td in tr.find_elements_by_tag_name("td"):
                    one[keys[i]] = _normalize_date_str(td.text.strip())
                    i += 1
            self._saver.save_parent_continuity(con_data)
        except:
            print("No parent data.")
        try:
            str_ = ""
            for tr in driver.find_elements_by_id("childdata0"):
                str_ += "\n" + _normalize_date_str(tr.text.strip())
            data["Child Continuity Data"] = str_.strip()
        except:
            print("No child data.")
    except SiteUsLinkNotFound:
        print("No tab Continuity Data")
    try:
        print("foreignPrioritiestab")
        self._switch_page(
            "javascript:submitTab('foreignPrioritiestab')")
        td = webDriverUi.WebDriverWait(driver, 20).until(
            (EC.presence_of_element_located(
                [By.XPATH, "//td[@id='forpriority']"])))
        table = td.find_element_by_tag_name("table")
        key = "Country |Priority |Priority Date ;"
        value = ""
        for tr in table.find_elements_by_xpath(
                "//tr[@class='wpsTableNrmRow' or @class='wpsTableShdRow']"
        ):
            next_val = ""
            for td in tr.find_elements_by_tag_name("td"):
                next_val += " |" + td.text.strip()
            next_val = next_val[2:]
            value += "\n" + next_val
        data[key] = _normalize_date_str(value.strip())
    except SiteUsLinkNotFound:
        print("No tab Foreign Priority")
    self._saver.save_main_data(data)
    return data
def _process_number(self, number, number_type, alias):
    """Look up one number (application/publication/patent) in USPTO PAIR and
    scrape main, history and document data, saving each via ``self._saver``.

    ``number_type`` is one of ``"USA"``, ``"USPUB"``, ``"USPAT"``.  Retries
    itself after 5 s when the site reports overload or unavailability.
    """

    def _normalize_date_str(date_str):
        # convert dd-mm-yyyy to mm/dd/yyyy
        date_str = re.sub(r'(\d{2})\-(\d{2})\-(\d{4})',
                          r'\g<2>/\g<1>/\g<3>', date_str)
        return date_str

    def get_main_data():
        # Scrape the bibliographic view plus optional tabs into one dict.
        table = driver.find_element_by_id("bibview")
        data = {"Alias": alias}
        # bib table alternates key cells and value cells
        for tr in table.find_elements_by_tag_name("tr"):
            tds = tr.find_elements_by_tag_name("td")
            even = False
            key = None
            for td in tds:
                if not even:
                    key = td.text.strip()
                else:
                    if key and td.text.strip() not in ["", "-"]:
                        data[key] = _normalize_date_str(
                            td.text.strip().replace(" all Inventors", ""))
                even = not even
        table = driver.find_element_by_id("bibviewTitle")
        td = table.find_elements_by_tag_name("td")
        data["Title"] = td[1].text.strip()
        try:
            print("ptaptetab")
            self._switch_page("javascript:submitTab('ptaptetab')")
            table = webDriverUi.WebDriverWait(driver, 20).until(
                (EC.presence_of_element_located([
                    By.XPATH, "//table[@id='ptaptesummarytable' and "
                    "@cellpadding='3']"
                ])))
            key = None
            for td in table.find_elements_by_tag_name("td"):
                if not key:
                    key = td.text.strip()
                else:
                    data[key] = _normalize_date_str(td.text.strip())
                    key = None
        except SiteUsLinkNotFound:
            print("No tab Patent term Adjustments")
        try:
            print("Correspondencetab")
            self._switch_page("javascript:submitTab('Correspondencetab')")
            table = webDriverUi.WebDriverWait(driver, 20).until(
                (EC.presence_of_element_located(
                    [By.XPATH, "//table[@id='correspondence']"])))
            key = None
            for td in table.find_elements_by_tag_name("td")[1:]:
                if not key:
                    key = td.text.strip()
                else:
                    data[key] = _normalize_date_str(td.text.strip())
                    key = None
            data["Agent"] = "Name: " + data["Name:"] + "\n\n" + \
                            "Address:\n" + data["Address:"]
        except SiteUsLinkNotFound:
            print("No tab Address & Attorney/Agent")
        try:
            print("continuitytab")
            self._switch_page("javascript:submitTab('continuitytab')")
            table = webDriverUi.WebDriverWait(driver, 10).until(
                (EC.presence_of_element_located(
                    [By.XPATH, "//table[@id='continuityparent']"])))
            try:
                con_data = []
                keys = []
                for td in table.find_elements_by_tag_name("th"):
                    keys.append(td.text.strip())
                for tr in table.find_elements_by_id("parentdata0"):
                    key = None
                    value = ""
                    one = {
                        "Alias": alias,
                    }
                    con_data.append(one)
                    i = 0
                    for td in tr.find_elements_by_tag_name("td"):
                        one[keys[i]] = _normalize_date_str(td.text.strip())
                        i += 1
                self._saver.save_parent_continuity(con_data)
            except:
                print("No parent data.")
            try:
                str_ = ""
                for tr in driver.find_elements_by_id("childdata0"):
                    str_ += "\n" + _normalize_date_str(tr.text.strip())
                data["Child Continuity Data"] = str_.strip()
            except:
                print("No child data.")
        except SiteUsLinkNotFound:
            print("No tab Continuity Data")
        try:
            print("foreignPrioritiestab")
            self._switch_page(
                "javascript:submitTab('foreignPrioritiestab')")
            td = webDriverUi.WebDriverWait(driver, 20).until(
                (EC.presence_of_element_located(
                    [By.XPATH, "//td[@id='forpriority']"])))
            table = td.find_element_by_tag_name("table")
            key = "Country |Priority |Priority Date ;"
            value = ""
            for tr in table.find_elements_by_xpath(
                    "//tr[@class='wpsTableNrmRow' or @class='wpsTableShdRow']"
            ):
                next_val = ""
                for td in tr.find_elements_by_tag_name("td"):
                    next_val += " |" + td.text.strip()
                next_val = next_val[2:]
                value += "\n" + next_val
            data[key] = _normalize_date_str(value.strip())
        except SiteUsLinkNotFound:
            print("No tab Foreign Priority")
        self._saver.save_main_data(data)
        return data

    def get_event_data():
        # Scrape the transaction-history table into a list of dicts.
        table = driver.find_element_by_id("bibcontents")
        data = []
        for tr in table.find_elements_by_xpath(
                "//tr[@class='wpsTableNrmRow' or @class='wpsTableShdRow']"
        ):
            tds = tr.find_elements_by_tag_name("td")
            data.append({
                "Alias": alias,
                "Input": input_field,
                "Date": _normalize_date_str(tds[0].text.strip()),
                "Action": tds[1].text.strip(),
            })
        self._saver.save_evt_history_data(data)
        return data

    def get_documents_data():
        # Scrape the image-file-wrapper table, attach per-document download
        # links, and optionally download each PDF.
        table = driver.find_element_by_id("ifwinnertable")
        data = []
        for tr in table.find_elements_by_xpath(
                "//tr"
                "[@class='wpsTableNrmRow' or @class='wpsTableShdRow']"):
            tds = tr.find_elements_by_tag_name("td")
            data.append({
                "Alias": alias,
                "Input": input_field,
                "Date": _normalize_date_str(tds[0].text.strip()),
                "Document type": tds[2].text.strip(),
                "Category": tds[3].text.strip(),
                "Number of pages": tds[4].text.strip(),
            })
        # the dossier number is only available inside the page's JS
        dosnum = re.search(
            "document\.downloadForm\.dosnum\.value='(\d+)';",
            driver.page_source)
        dosnum = dosnum.group(1)
        # selection mask: one character per document, '1' selects that doc
        sels = "0" * len(data)
        try:
            i = 0
            for d in data:
                sel = sels[:i] + "1" + sels[i + 1:]
                url = "http://portal.uspto.gov/pair/download/ShowPdfBook?" \
                      "dosnum=%s&sels=%s" % (dosnum, sel)
                d["Link"] = url
                i += 1
        except Exception as e:
            print(e)
            raise
        self._saver.save_documents_data(data)
        print("")
        if not self._do_download:
            print("Download is turned off.")
            return
        print("Downloading files.")
        # reuse the browser's session cookies for the raw HTTP downloads
        self._fetcher.clear_cookies()
        s = self._fetcher.get_session()
        for cook in driver.get_cookies():
            s.cookies[cook["name"]] = cook["value"]
        files_dir = os.path.dirname(os.path.realpath(__file__))
        files_dir = os.path.join(files_dir, "../Output/%s" % alias)
        try:
            os.stat(files_dir)
        except:
            os.mkdir(files_dir)
        for i in range(len(sels)):
            sel = sels[:i] + "1" + sels[i + 1:]
            url = "http://portal.uspto.gov/pair/download/ShowPdfBook?" \
                  "dosnum=%s&sels=%s" % (dosnum, sel)
            filename = "%d - %s.pdf" % \
                       (i, fs.clean_filename(data[i]["Document type"]))
            print("Downloading file: %s" % filename)
            self._fetcher.download_file(url,
                                        os.path.join(files_dir, filename))
        return data

    print("Processing number: %s (%s)" % (number, number_type))
    input_field = "%s (%s)" % (number, number_type
                               )  # setting up input field
    print("")
    print("Entering number...")
    self._switch_page("javascript:submitTab('pair_search')")
    driver = self._driver
    # waiting for JavaScript to finish
    webDriverUi.WebDriverWait(driver, 20) \
        .until(EC.presence_of_element_located([By.ID, "SubmitPAIR"]))
    # pick the radio button matching the number type
    if number_type == "USA":
        driver.find_element_by_xpath(
            "//input[@title='application number']").click()
    elif number_type == "USPUB":
        driver.find_element_by_xpath(
            "//input[@title='publication number']").click()
    elif number_type == "USPAT":
        driver.find_element_by_xpath(
            "//input[@title='patent number']").click()
    else:
        raise Exception("Unknown number type: %s" % number_type)
    driver.find_element_by_id("number_id").send_keys(number)
    driver.find_element_by_id("SubmitPAIR").click()
    print("WAITING...")
    # wait for either the result page or one of the known error markers
    element = webDriverUi.WebDriverWait(driver, 20).until(
        self._wait([
            "//img[@alt='Application Data']", "//div[@id='ERRORDIV']",
            "//div[@id='ERRORDIVPALMPROBLEM']",
            "//table[@class='epoTableBorder']//font[@color='red']"
        ]))
    if element.get_attribute('id') == 'ERRORDIVPALMPROBLEM':
        print("Overloaded, trying again in 5 seconds...")
        time.sleep(5)
        return self._process_number(number, number_type, alias)
    if element.text:
        if "Service not available at this time" in element.text:
            print("Service not available, trying again in 5 seconds...")
            time.sleep(5)
            return self._process_number(number, number_type, alias)
        raise SiteUsNoNumberException("Error: %s" % element.text.strip())
    self._switch_page("javascript:submitTab('detailstab')")
    try:
        element = webDriverUi.WebDriverWait(driver, 20).until(
            self._wait(["//img[@src='/pair/img/tabs/image1on.gif']"]))
    except:
        print("Wrong tab opened?...")
        self._process_number(number, number_type, alias)
    # main data
    print("")
    print("Getting main data...")
    get_main_data()
    # history data
    print("")
    print("Getting history data...")
    try:
        self._switch_page("javascript:submitTab('fileHistorytab')")
        webDriverUi.WebDriverWait(driver, 20).until(
            (EC.presence_of_element_located([By.ID, "bibcontents"])))
        get_event_data()
    except SiteUsException:
        print("No history data")
    print("")
    print("Getting document data...")
    try:
        self._switch_page("javascript:submitTab('ifwtab')")
        webDriverUi.WebDriverWait(driver, 20).until(
            (EC.presence_of_element_located([By.ID, "ifwinnertable"])))
        get_documents_data()
    except SiteUsException:
        print("No document data")
def wait_element_is_clickable(self, method, element, sec=10):
    """Block until the element located by (method, element) is clickable.

    :param method: locator strategy (e.g. By.ID, By.XPATH).
    :param element: locator value for that strategy.
    :param sec: maximum seconds to wait before TimeoutException.
    :return: the clickable WebElement.
    """
    locator = (method, element)
    waiter = ui.WebDriverWait(self.driver, sec)
    return waiter.until(EC.element_to_be_clickable(locator))
def is_not_visible(locator, timeout=20):
    """Return True once the element at XPATH *locator* stops being visible.

    Waits up to *timeout* seconds on the shared driver; returns False when
    the element is still visible after the wait expires.
    """
    condition = EC.visibility_of_element_located((By.XPATH, locator))
    waiter = ui.WebDriverWait(WebOp.shared_wd, timeout)
    try:
        waiter.until_not(condition)
    except TimeoutException:
        return False
    return True
def wait_for_sidebar_is_loaded(self, sec=10):
    """Wait until the sidebar shows an active item, then settle briefly.

    :param sec: maximum seconds to wait before TimeoutException.
    """
    active_item = (by.By.CSS_SELECTOR, "div#sidebar li.active")
    ui.WebDriverWait(self.driver, sec).until(
        EC.presence_of_element_located(active_item))
    # Short pause so sidebar animations finish before interaction.
    time.sleep(0.5)
def downloadWebPMUpage():
    """Scrape the PMU daily race programmes for days 1..7 and update the
    HIPPODROME column of the COURSE table in BasePMU.db.

    Side effects: opens a Firefox browser, overwrites
    '../02 - Page Web/listeProgramme.html', commits UPDATEs to SQLite.
    """
    try:
        connexion = sqlite3.connect('../03 - BDD/BasePMU.db')
    except sqlite3.Error as er:
        # Fixed: sqlite3.Error has no .message attribute in Python 3;
        # interpolate the exception itself instead.
        print('une erreur est survenue lors de la connection de la base %s' % er)
        exit(1)
    cursor = connexion.cursor()
    caps = DesiredCapabilities.FIREFOX.copy()
    caps['marionette'] = True
    br = webdriver.Firefox(capabilities=caps,
                           executable_path='./Package/geckodriver.exe')
    br.get('https://info.pmu.fr')
    # Dismiss the CNIL cookie banner before interacting with the page.
    br.find_element_by_class_name('cnil-close').click()
    for v in range(1, 8):
        time.sleep(2)
        # Open the date picker and select day v of the current month.
        br.find_element_by_xpath("//div[@class='date']").click()
        br.find_element_by_xpath(u'//a[text()="' + str(v) + '"]').click()
        time.sleep(10)  # let the day's programme finish loading
        link = br.find_element_by_xpath(
            "(//button[contains(text(),'Programme en détails')])")
        link.click()
        soup = BeautifulSoup(br.page_source, 'lxml')
        writeFile(soup.prettify(), '../02 - Page Web/listeProgramme.html')
        # Context manager closes the handle (it was leaked before).
        with codecs.open('../02 - Page Web/listeProgramme.html', 'r', 'utf-8') as pagePMU:
            listCourse = listeCoursePMU(pagePMU)
        print(listCourse)
        for course in listCourse:
            # Parameterized query: the old '%'-built SQL broke on quotes
            # and was injectable.
            print(course[0], course[3])
            cursor.execute('UPDATE COURSE SET HIPPODROME=? WHERE URL = ?',
                           (course[0], course[3]))
            connexion.commit()
    br.close()
def _do_login_step_2(self, args):
    """Second login step: block until the logout link is clickable, which
    signals that authentication completed.
    """
    logout_link = (By.ID, "nav_logout")
    waiter = ui.WebDriverWait(self.browser, self.timeout)
    waiter.until(expected_conditions.element_to_be_clickable(logout_link))
url = "https://www.instagram.com/p/CH-MgQOn-7E/" # Chagent this with any other competition " " #We go to the url time.sleep(timer) driver.get(url) time.sleep(timer) #Technical things counter = 0 friendList = ['@','@','@'] # Vale ta onomata ton filon sou opos vlepeis edo me komma endiamesa '@','@','@', .. while(True): try: # Create the comment send = friendList[random.randint(0,len(friendList)-1)] + " " + friendList[random.randint(0,len(friendList)-1)] + " " + friendList[random.randint(0,len(friendList)-1)] # lETS TRY comment_box = ui.WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "textarea.Ypffh"))) comment_box.send_keys(send) comment_box.send_keys(Keys.ENTER) time.sleep(timer) driver.refresh() # Doesnt sleep so this is an alternative #Counter increase and wait 5 seconds counter+=1 if(counter>=100): # If more than 20 comments, sleep time.sleep(60*20) except Exception as e: #If exception continue print(e) continue
def getURL_Title():
    """Set up a Firefox driver and waiter for scraping movie.douban.com.

    NOTE(review): looks truncated/unfinished -- SUMRESOURES (likely a typo
    of SUMRESOURCES) is assigned but never used, and url/wait are unused
    in the visible body; confirm against the full original.
    """
    global save_name  # presumably a filename set elsewhere -- TODO confirm
    SUMRESOURES = 0
    url = 'https://movie.douban.com'
    driver_item = webdriver.Firefox()  # opens a real Firefox window
    wait = ui.WebDriverWait(driver_item, 15)  # up to 15s for page loads
def get_element_id(self, el_name, sec=10):
    """Wait for the element named *el_name* and return the trailing part
    of its DOM id (the text after the last '__').

    :param el_name: value substituted into the consts.AppPackages XPATH.
    :param sec: maximum seconds to wait before TimeoutException.
    :return: suffix of the element's "id" attribute.
    """
    xpath = consts.AppPackages.format(el_name)
    element = ui.WebDriverWait(self.driver, sec).until(
        EC.presence_of_element_located((by.By.XPATH, xpath)))
    full_id = element.get_attribute("id")
    return full_id.rsplit('__', 1)[-1]
def is_not_visible(driver, locator, method, timeout=10):
    """Return True once the element at (method, locator) is not visible.

    Waits up to *timeout* seconds on *driver*; returns False when the
    element is still visible after the wait expires.
    """
    condition = EC.visibility_of_element_located((method, locator))
    try:
        ui.WebDriverWait(driver, timeout).until_not(condition)
    except TimeoutException:
        return False
    return True
def check_element_on_page(self, method, value, sec=10):
    """Assert that an element located by (method, value) appears within *sec*
    seconds; otherwise fail the test with a descriptive message.

    :param method: locator strategy (e.g. By.ID, By.XPATH).
    :param value: locator value for that strategy.
    :param sec: maximum seconds to wait before failing.
    """
    try:
        ui.WebDriverWait(self.driver, sec).until(
            EC.presence_of_element_located((method, value)))
    except exc.TimeoutException:
        # Fixed typo in the user-facing message: "preset" -> "present".
        self.fail("Element {0} is not present on the page".format(value))
from selenium import webdriver
import time
import selenium.webdriver.support.ui as ui

# Log in to 163 Mail (mail.163.com) using Selenium + Firefox.
driver = webdriver.Firefox()
url = "http://mail.163.com/"
wait = ui.WebDriverWait(driver, 10)  # NOTE(review): created but never used below
driver.get(url)
time.sleep(5)
# Switch into the login iframe (the login form lives inside a nested frame).
frame = driver.find_element_by_id('mainBg').find_element_by_class_name(
    'loginWrap').find_element_by_id('loginDiv').find_element_by_css_selector(
    'iframe')
driver.switch_to.frame(frame)
time.sleep(5)
# Fill in the mailbox credentials and submit the login form.
your_mail = ''  # your 163 account name
your_pwd = ''   # your password
driver.find_element_by_name("email").send_keys(your_mail)
driver.find_element_by_name("password").send_keys(your_pwd)
time.sleep(3)
driver.find_element_by_id("dologin").click()
time.sleep(6)
driver.quit()
print("login in")
def wait_for_alert_message(self, sec=5):
    """Block until a success alert (div.alert-success) appears on the page.

    :param sec: maximum seconds to wait before TimeoutException.
    """
    logger.debug("Waiting for a success message")
    success_alert = (by.By.CSS_SELECTOR, 'div.alert-success')
    condition = EC.presence_of_element_located(success_alert)
    ui.WebDriverWait(self.driver, sec).until(condition)
def wait_element_visible(locator, timeOut=5):
    """Wait up to *timeOut* seconds for *locator* to become visible.

    :return: the visible WebElement, or False when the wait fails for any
        reason (broad catch kept deliberately to preserve behavior).
    """
    waiter = ui.WebDriverWait(browser, timeOut)
    condition = EC.visibility_of_element_located(locator)
    try:
        element = waiter.until(condition)
    except Exception:
        return False
    return element
def wait_for_error_message(self, sec=20):
    """Wait for an error alert to appear and return its text.

    Polls every 1 second for up to *sec* seconds.

    :return: text of the div.alert-danger paragraph.
    """
    logger.debug("Waiting for an error message")
    error_alert = (by.By.CSS_SELECTOR, 'div.alert-danger > p')
    waiter = ui.WebDriverWait(self.driver, sec, 1)
    waiter.until(EC.presence_of_element_located(error_alert))
    return self.driver.find_element(*error_alert).text
def daily_task():
    """Daily scrape of careerlink.vn job listings.

    Collects every job-category URL from the landing page, walks each
    category's paginated result pages, opens every job posting, extracts
    the fields below, and appends one CSV row per posting via write_csv().
    Raw page snapshots are saved with write_html() along the way.
    """
    global DATE
    DATE = str(datetime.date.today())  # run date, stored with every row
    # Headless Chrome with images disabled (prefs value 2) to speed up loads.
    chromeOptions = webdriver.ChromeOptions()
    prefs = {"profile.managed_default_content_settings.images":2}
    # chromeOptions.add_argument("--disable-javascript")
    chromeOptions.add_argument("--headless")
    chromeOptions.add_experimental_option("prefs",prefs)
    browser = webdriver.Chrome(options=chromeOptions,executable_path=CHROME_DRIVER_PATH)
    # browser = webdriver.Chrome()
    browser.set_window_position(400, 40)
    browser.set_window_size(1300, 1024)
    wait = ui.WebDriverWait(browser,60)  # shared 60s wait for page loads
    # --- Collect the category URLs from the landing page -------------------
    urls = []
    browser.get('https://www.careerlink.vn/en')
    soup = BeautifulSoup(browser.page_source, 'lxml')
    category_list = soup.find('div', id='search-by-category').find_all('a')
    for item in category_list:
        url = BASE_URL + item.get('href')
        if url not in urls:  # de-duplicate while keeping order
            urls.append(url)
    write_html(browser.page_source, "All_cat_")
    # --- Walk each category (index j) and its result pages (index i) -------
    j=0
    while j < len(urls):
        browser.get(urls[j])
        wait.until(lambda browser: browser.find_element_by_xpath('/html/body/div[1]/div[2]/div[2]/div[1]/div[1]/div[1]/p'))
        soup = BeautifulSoup(browser.page_source, 'lxml')
        category = soup.find('p', class_='lead-sm').find('strong').text.strip()
        category = category.replace('"', '')  # strip quotes for clean CSV output
        i=0
        pagination = True
        while pagination:
            soup = BeautifulSoup(browser.page_source, 'lxml')
            if i != 0:
                # Re-open the page to paginate from: the first result page
                # via the category URL, later pages via the last "next page"
                # href captured below in href_glob.
                if i == 1:
                    browser.get(urls[j])
                    file_name = str(j+1) + "_" + str(i) + "_"
                    write_html(browser.page_source, file_name)
                else:
                    browser.get(href_glob)
                    file_name = str(j+1) + "_" + str(i) + "_"
                    write_html(browser.page_source, file_name)
                wait.until(lambda browser: browser.find_element_by_xpath('/html/body/div[1]/div[2]/div[2]/div[1]/div[3]/nav/ul'))
                elements = browser.find_elements_by_css_selector('ul.pagination > li')
                if len(elements) == 1:
                    # A single pagination item means there are no more pages.
                    pagination = False
                    break
                # Find the "active" page marker and follow the entry after it.
                c=0
                while c < len(elements):
                    class_name = elements[c].get_attribute("class")
                    if "active" in class_name:
                        if len(elements)-1 >= c+1:
                            href_glob = elements[c+1].find_element_by_css_selector('a').get_attribute("href")
                            browser.get(href_glob)
                            c+=1
                            break
                        else:
                            # The active item is the last one: no next page.
                            pagination = False
                            c+=1
                            break
                    c+=1
                wait.until(lambda browser: browser.find_element_by_xpath('/html/body/div[1]/div[2]/div[2]/div[1]/div[3]/nav/ul'))
                soup = BeautifulSoup(browser.page_source, 'lxml')
                # NOTE(review): 'list' shadows the builtin -- rename if this
                # block is ever reworked.
                list = soup.find('div', class_='list-group').find_all('div', class_='list-group-item')
            if i == 0:
                soup = BeautifulSoup(browser.page_source, 'lxml')
                list = soup.find('div', class_='list-group').find_all('div', class_='list-group-item')
            if pagination == False:
                break
            # --- Visit every job posting on the current result page --------
            for item in list:
                href = BASE_URL + item.find('a').get('href')
                browser.get(href)
                wait.until(lambda browser: browser.find_element_by_xpath('/html/body/div[1]/div[2]/div[2]/div[1]'))
                soup = BeautifulSoup(browser.page_source, 'lxml')
                # Fields read from itemprop-annotated elements; each falls
                # back to None when missing or unparsable.
                try:
                    if soup.find('span', itemprop='baseSalary') != None:
                        Salary = soup.find('span', itemprop='baseSalary').text.strip()
                    else:
                        Salary = None
                except:
                    Salary = None
                try:
                    if soup.find('span', itemprop='address') != None:
                        Work_location = soup.find('span', itemprop='address').text.strip()
                    else:
                        Work_location = None
                except:
                    Work_location = None
                try:
                    if soup.find('div', itemprop='skills') != None:
                        Job_Requirement = soup.find('div', itemprop='skills').text.strip()
                    else:
                        Job_Requirement = None
                except:
                    Job_Requirement = None
                try:
                    if soup.find('div', itemprop='description') != None:
                        Job_Description = soup.find('div', itemprop='description').text.strip()
                    else:
                        Job_Description = None
                except:
                    Job_Description = None
                # Remaining fields come from the second "job-data" list; on
                # the page they are labeled: Career Level, Job Category,
                # Position Type, Age, Gender Require, Experience Level,
                # Education Level.
                Job_level = None
                Industry = None
                Job_type = None
                Age = None
                Gender = None
                Experience = None
                Education = None
                try:
                    ul = soup.select('div.job-data > ul.list-unstyled')[1]
                    lis = ul.find_all('li')
                    for li in lis:
                        txt = li.text.strip()
                        if "Career Level" in txt:
                            Job_level = txt
                            Job_level = Job_level.replace('Career Level:','')
                            Job_level = Job_level.strip()
                            continue
                        if "Job Category" in txt:
                            Industry = txt
                            Industry = Industry.replace('Job Category:','')
                            Industry = Industry.strip()
                            continue
                        if "Position Type" in txt:
                            Job_type = txt
                            Job_type = Job_type.replace('Position Type:','')
                            Job_type = Job_type.strip()
                            continue
                        if "Age" in txt:
                            Age = txt
                            Age = Age.replace('Age:','')
                            Age = Age.strip()
                            continue
                        if "Gender Require" in txt:
                            Gender = txt
                            Gender = Gender.replace('Gender Require:','')
                            Gender = Gender.strip()
                            continue
                        if "Experience Level" in txt:
                            Experience = txt
                            Experience = Experience.replace('Experience Level:','')
                            Experience = Experience.strip()
                            continue
                        if "Education Level" in txt:
                            Education = txt
                            Education = Education.replace('Education Level:','')
                            Education = Education.strip()
                            continue
                except:
                    # Any parsing failure blanks the whole field group.
                    Job_level = None
                    Industry = None
                    Job_type = None
                    Age = None
                    Gender = None
                    Experience = None
                    Education = None
                data = {'category': category, 'Salary': Salary, 'Work_location': Work_location, 'Job_level': Job_level, 'Industry': Industry, 'Job_type': Job_type, 'Age': Age, 'Gender': Gender, 'Experience': Experience, 'Education': Education, 'Job_Description': Job_Description, 'Job_Requirement': Job_Requirement, 'date': DATE}
                write_csv(data)  # one CSV row per posting
            i+=1
        j+=1
    # Close browser
    browser.close()
    browser.service.process.send_signal(signal.SIGTERM)  # ensure chromedriver exits
    browser.quit()
    compress_data()
def read(path, loadjs=False, session=None, driver=None, timeout=60,
         clear_cookies=True, loadjs_wait_time=3, loadjs_wait_for_callback=None,
         strict=True):
    """Reads from source and returns contents

    Args:
        path: (str) url or local path to download
        loadjs: (boolean) indicates whether to load js (optional)
        session: (requests.Session) session to use to download (optional)
        driver: (selenium.webdriver) webdriver to use to download (optional)
        timeout: (int) Maximum number of seconds to wait for the request to
            complete; also bounds the JS-callback wait (previously hardcoded
            to 60 seconds regardless of this argument).
        clear_cookies: (boolean) whether to clear cookies.
        loadjs_wait_time: (int) if loading JS, seconds to wait after the page
            has loaded before grabbing the page source
        loadjs_wait_for_callback: (function<selenium.webdriver>) if loading JS,
            a callback that will be invoked to determine when we can grab the
            page source. The callback will be called with the webdriver, and
            should return True when we're ready to grab the page source. For
            example, pass in an argument like:
            ``lambda driver: driver.find_element_by_id('list-container')``
            to wait for the #list-container element to be present before
            rendering.
        strict: (bool) If False, when download fails, retry but allow parsing
            even if there is still minimal network traffic happening.
            NOTE(review): not referenced in this implementation -- presumably
            consumed elsewhere; confirm before removing.

    Returns:
        str content from file or page
    """
    session = session or DOWNLOAD_SESSION
    if clear_cookies:
        session.cookies.clear()
    try:
        if loadjs:
            # Render the page with a JS-capable backend, then return its
            # post-render source.
            if USE_PYPPETEER:
                content = asyncio.get_event_loop().run_until_complete(load_page(path))
                return content
            if PHANTOMJS_PATH:
                driver = driver or webdriver.PhantomJS(executable_path=PHANTOMJS_PATH)
            else:
                driver = driver or webdriver.PhantomJS()
            driver.get(path)
            if loadjs_wait_for_callback:
                # Fixed: honor the caller's timeout instead of a hardcoded 60s
                # (behavior unchanged at the default timeout=60).
                selenium_ui.WebDriverWait(driver, timeout).until(loadjs_wait_for_callback)
            time.sleep(loadjs_wait_time)
            return driver.page_source
        else:
            # Plain HTTP download with linear-backoff retries on transient
            # connection errors.
            retry_count = 0
            max_retries = 5
            while True:
                try:
                    response = session.get(path, stream=True, timeout=timeout)
                    break
                except (requests.exceptions.ConnectionError, requests.exceptions.ReadTimeout) as e:
                    retry_count += 1
                    print("Error with connection ('{msg}'); about to perform retry {count} of {trymax}."
                          .format(msg=str(e), count=retry_count, trymax=max_retries))
                    # Fixed: raise before sleeping -- the old order slept
                    # pointlessly right before giving up on the last attempt.
                    if retry_count >= max_retries:
                        raise e
                    time.sleep(retry_count * 1)
            response.raise_for_status()
            return response.content
    except (requests.exceptions.MissingSchema, requests.exceptions.InvalidSchema):
        # Not a URL: treat path as a local file and return its raw bytes.
        with open(path, 'rb') as fobj:
            return fobj.read()