def test_saysth():
    """UI test: a logged-in user can publish a post and then delete it again."""
    browser = Browser()
    try:
        # The user opens the home page.
        browser.get(index_url)
        left_bar = browser.find_element_by_class_name('layout-aside-left')
        # The user sees the login button.
        login_button = left_bar.find_element_by_class_name('q-card-main')
        assert '登录' in login_button.text
        # The user clicks login.
        login_button.click()
        sleep(1)
        # The user logs into the site.
        assert try_login_with(browser, TESTING_CARD_ID, TESTING_PASSWORD)
        # The user clicks the "plaza" entry in the sidebar.
        browser.find_element_by_xpath(
            '//*[@id="q-app"]/div/aside/div[1]/div[2]/div[3]/div[2]/div').click()
        # The user clicks the "+" button.
        browser.find_element_by_css_selector(
            'div.layout-page-container.transition-generic > main > div:nth-child(1) > div.z-fixed.fixed-bottom-right > button'
        ).click()
        sleep(1)
        # The user types some text into the new-post dialog.
        browser.find_element_by_xpath(
            '/html/body/div[3]/div/div/div[2]/div[1]/div/div/div/div/a/div[2]/div[2]/textarea'
        ).send_keys('测试')
        # The user clicks "publish".
        browser.find_element_by_xpath(
            '/html/body/div[3]/div/div/div[1]/div/button[2]/span').click()
        sleep(1)
        # The user sees their own post.
        the_post = browser.find_element_by_css_selector(
            '#q-app > div > div.layout-page-container.transition-generic > main > div:nth-child(1) > div.q-infinite-scroll > div.q-infinite-scroll-content > div:nth-child(3)'
        )
        assert '测试' in the_post.text
        # The user clicks delete, then confirms in the modal dialog.
        the_post.find_element_by_tag_name('button').click()
        sleep(1)
        browser.find_element_by_class_name(
            'modal-buttons').find_elements_by_tag_name('button')[1].click()
        sleep(1)
        # The user can no longer see the post.
        try:
            the_post = browser.find_element_by_css_selector(
                '#q-app > div > div.layout-page-container.transition-generic > main > div:nth-child(1) > div.q-infinite-scroll > div.q-infinite-scroll-content > div:nth-child(3)'
            )
        except Exception:
            # The slot is gone entirely -- that also counts as deleted.
            pass
        else:
            assert '测试' not in the_post.text
    finally:
        # Close the browser even when an assertion fails so test runs do
        # not leak browser processes (the original closed it only on success).
        browser.close()
def extract_citation_for_publication(link):
    """Crawl the list of citing articles from a Google Scholar citation page.

    Follows the "next page" link until there is none left.

    @param[in] link the URL of the Google Scholar citation page to crawl
    @return dict mapping article title -> {'link': download URL or None,
            'author': {author name: profile URL or None}}
    """
    # The page is rendered by JavaScript, so plain urllib/BeautifulSoup
    # cannot see its content; drive a real browser via Selenium instead.
    browser = Browser('chromedriver.exe')
    citation = {}
    browser.get(link)
    while True:
        citation_root = browser.find_element_by_id('gs_ccl')
        for citation_item in citation_root.find_elements_by_class_name('gs_r'):
            # Title of the citing article.
            title = citation_item.find_element_by_class_name('gs_rt').text
            # Try to get the download link, if the entry has one.
            # (Use a dedicated name; the original clobbered the `link`
            # parameter here.)
            try:
                download = citation_item.find_element_by_id('gs_ggsW2')
                download = download.find_element_by_link_text(
                    download.text).get_attribute('href')
            except Exception:
                download = None
            # Author line: for each author, record a profile link if one exists.
            author_line = citation_item.find_element_by_class_name('gs_a')
            author = {}
            for name in author_line.text.split(', '):
                try:
                    print('.', end='')  # progress marker
                    # There is a Google Scholar profile for this author.
                    item = author_line.find_element_by_link_text(name)
                    author[name] = item.get_attribute('href')
                except Exception:
                    # No such profile.
                    author[name] = None
            # Pressing the "cite" button would give the detailed citation
            # formats; skipped here.
            citation[title] = {'link': download, 'author': author}
        # Go to the next page, if there is one.
        if not next_page(browser):
            break
    browser.close()
    return citation
def extract_movies(max_page_num=5):
    """Scrape movie titles and ratings, following pagination.

    @param max_page_num maximum number of pages to visit (at least one
           page is always scraped)
    @return dict mapping movie title -> rating string
    """
    browser = Browser()
    browser.get(URL)
    movies = {}
    pages_visited = 0
    while True:
        for movie in browser.find_elements_by_class_name('item'):
            title = movie.find_element_by_tag_name("p").text.strip()
            rating = movie.find_element_by_tag_name("strong").text.strip()
            movies[title] = rating
        pages_visited += 1
        # Stop when the page budget is reached (the original's
        # decrement-then-test order visited one page too many) or when the
        # site has no further pages.
        if pages_visited >= max_page_num or not have_more(browser):
            break
    browser.close()
    return movies
def extract_hongren(max_page_num=5):
    """Scrape 'hongren' wall items, scrolling to load more pages.

    @param max_page_num maximum number of pages/scroll steps to process
           (at least one page is always scraped)
    @return dict mapping item href -> description text
    """
    suffix = "hongren"
    # Normally adding the driver's directory to PATH would suffice, but
    # that did not work here, so the driver binary sits next to the code.
    browser = Browser('chromedriver.exe')
    browser.get(BASE_URL + suffix)
    items = {}
    pages_visited = 0
    while True:
        for item in browser.find_elements_by_class_name('wall_item'):
            href = item.find_element(
                By.CSS_SELECTOR, ".pic_box.pic").get_attribute("href")
            desc = item.find_elements_by_class_name("desc")[0].text.strip()
            items[href] = desc
        pages_visited += 1
        # Stop when the page budget is reached (the original's
        # decrement-then-test order processed one page too many) or when
        # scrolling yields no new content.
        if pages_visited >= max_page_num or not scroll_to_next(browser):
            break
    browser.close()
    return items
def test_login():
    """UI test: login fails with a wrong password and succeeds with the right one."""
    browser = Browser()
    try:
        # The user opens the home page.
        browser.get(index_url)
        left_bar = browser.find_element_by_class_name('layout-aside-left')
        # The user sees the login button.
        login_button = left_bar.find_element_by_class_name('q-card-main')
        assert '登录' in login_button.text
        # The user clicks login.
        login_button.click()
        sleep(1)
        # The user tries to log in with a wrong password ...
        success = try_login_with(browser, TESTING_CARD_ID, '000000')
        # ... and it does not succeed.
        assert not success
        # The user retries with the correct password.
        browser.refresh()
        sleep(1)
        success = try_login_with(browser, TESTING_CARD_ID, TESTING_PASSWORD)
        assert success
        # Login succeeded: the home-page toolbar title is shown.
        assert '首页' in browser.find_element_by_class_name('q-toolbar-title').text
    finally:
        # Close the browser even when an assertion fails so test runs do
        # not leak browser processes (the original closed it only on success).
        browser.close()
def thread(queue: Queue) -> None:
    """Worker loop: take 'login:password' strings off *queue* and submit
    each one through the web login form.

    Runs forever; opens a fresh Browser for every credential pair.

    @param queue queue of raw credential strings matching LOGIN_PASSWORD_FORMAT
    """
    while True:
        lp = queue.get()
        br = Browser()
        matches = LOGIN_PASSWORD_FORMAT.match(lp)
        login, password = matches.group('login'), matches.group('password')
        br.get(LOGIN_URL)
        # Wait until the email field is actually rendered before typing.
        while not (tag := br.find_element_by_id("email")).is_displayed():
            time.sleep(0.1)
        tag.send_keys(login)
        completed = False
        while not completed:
            # The site sometimes asks for the login first and only shows
            # the password field after a "next" click.
            if (tag := br.find_element_by_id("password")).is_displayed():
                tag.send_keys(password)
                try:
                    br.find_element_by_id("btnLogin").click()
                    completed = True
                except Exception:
                    # Button not clickable yet -- retry on the next pass.
                    pass
            elif (tag := br.find_element_by_id("btnNext")).is_displayed():
                try:
                    tag.click()
                except Exception:
                    pass
        # NOTE(review): `br` is never quit/closed, so one browser process
        # accumulates per queue item -- confirm whether unseen code closes it.
# Script: scrape the proxy list from spys.one/socks by driving PhantomJS
# and switching the country / page-size <select> controls.
# NOTE(review): this chunk is truncated -- the final `while` header has no
# body in the visible source, so the code below it is missing.
from selenium.webdriver import PhantomJS as Browser
import json
import time
import re

proxy_list_url = "http://spys.one/socks/"
proxies = []
br = Browser()
br.get(proxy_list_url)
# Page-size options offered by the site's "show N per page" selector.
sizes = [25, 50, 100, 200, 300, 500]
# Extracts the proxy count shown in parentheses after an option's label.
pattern = re.compile(r"[.\s]+\((\d+)\)")
for country_id in range(1, 171):
    try_counter = 0
    count = 0
    # Click the country option until the 'tldc' select reports it as
    # selected; give up after two attempts.
    while (elm := br.find_element_by_id('tldc')).find_element_by_xpath(
            f"./option[@selected]").get_attribute("value") != str(country_id):
        elm = elm.find_element_by_xpath(f'./option[@value="{country_id}"]')
        elm.click()
        try_counter += 1
        if try_counter >= 2:
            break
    if try_counter >= 2:
        # Could not select this country -- skip it.
        continue
    # Proxy count for the selected country, parsed from the option label.
    count = int(pattern.findall(elm.text)[0])
    # Pick the smallest page size that covers the whole count.
    key = 0
    for key, size in enumerate(sizes):
        if int(size) > count:
            break
    try_counter = 0
    # Same select-until-applied dance for the page-size ('xpp') control.
    # NOTE(review): loop body missing past this point in the visible source.
    while (elm := br.find_element_by_id("xpp")).find_element_by_xpath(
            "./option[@selected]").get_attribute("value") != str(key):
# Minimal smoke test: launch Firefox and open Google.
from selenium.webdriver import Firefox as Browser

driver = Browser()
# WebDriver's get() requires an absolute URL including the scheme; the
# scheme-less "www.google.com" the script originally passed makes the
# driver raise InvalidArgumentException.
driver.get("https://www.google.com")
# NOTE(review): the browser is left open (no driver.quit()) -- confirm
# that is intended for interactive use.
def extract_publication(profile_url, verbose=verbose_citation_list):
    """Crawl the publication list from a Google Scholar profile.

    @param[in] profile_url the URL of the Google Scholar profile to crawl
    @param[in] verbose the level of detail to crawl; by default the
           detailed citation list would be fetched per publication, but
           that step is currently disabled (see note below)
    @return dict mapping publication title -> {'link', 'author', 'vendor',
            'citation', 'cited by', 'year'}
    """
    browser = Browser()
    browser.get(profile_url)
    publication = {}
    while True:
        for publication_item in browser.find_elements_by_class_name('gsc_a_tr'):
            title = publication_item.find_element_by_class_name('gsc_a_at').text
            print(title)  # progress output
            # Both gray lines: [0] is the author list, [1] the venue.
            gray = publication_item.find_elements_by_class_name('gs_gray')
            author = gray[0].text.split(', ')
            vendor = gray[1].text
            # The "cited by" cell is absent/empty for uncited papers.
            try:
                cited_cell = publication_item.find_element_by_class_name('gsc_a_ac')
                citation = int(cited_cell.text)
                link = cited_cell.get_attribute('href')
            except Exception:
                citation = 0
                link = None
            # The year cell can be empty.
            try:
                year = int(
                    publication_item.find_element_by_class_name('gsc_a_h').text)
            except Exception:
                year = None
            # Fetching the full per-paper citation list via
            # extract_citation_for_publication is disabled because Google
            # Scholar detects it as a robot.
            # NOTE(review): 'cited by' therefore just repeats the citation
            # count -- confirm that placeholder is intended.
            publication[title] = {
                'link': link,
                'author': author,
                'vendor': vendor,
                'citation': citation,
                'cited by': citation,
                'year': year
            }
        if not next_page_new(browser):
            break
    browser.close()
    return publication