def main():
    # Read the username and password first
    infile = open("user.txt", "r")
    username = str(infile.readline())
    password = str(infile.readline())
    print("username", username, type(username))
    infile.close()

    url = Links.login()
    browser = Browser('firefox')
    browser.visit(url)
    time.sleep(1)
    browser.find_by_id('user_name').fill(username)
    browser.find_by_id('password').fill(password)
    browser.find_by_id('submit_button').click()
    time.sleep(1)
    browser.click_link_by_href(Links.xuankejieguo())
    # time.sleep(8)

    # Drive the scraper from the command line
    print("ready to catch information!")
    cmd = input("main>>>")
    while cmd != "exit":
        strlst = cmd.split(' ')
        cmd0 = strlst[0]
        if cmd0 == "get":
            print("begin catching information")
            pkuget = PkuGet(browser, 3)
            if pkuget.state == 1:
                pkuget.getinfo(strlst)
        cmd = input("main>>>")
    browser.quit()
def jpl_mars():
    # JPL Mars Space Images - Featured Image
    # Importing packages
    from selenium import webdriver
    import pandas as pd
    from splinter import Browser
    from bs4 import BeautifulSoup
    import requests

    global featured_image_url
    executable_path = {'executable_path': 'C:/webdrivers/chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)
    browser.click_link_by_partial_text('more news')

    for x in range(1, 3):
        html = browser.html
        soup = BeautifulSoup(html, 'html.parser')

        # Get the number of pics to be reviewed within this page
        vl_jpl_pics = soup.find_all('li', class_='slide')
        # news_title = '33'
        v_count = 0
        v_pag = 0
        vl_href = []
        for vc_jpl_pics in vl_jpl_pics:
            link_href = vc_jpl_pics.find('a')
            link_href = link_href['href']
            vl_links_t = vc_jpl_pics.find('div', class_='content_title')
            # Finding the picture (news_title is expected to be set by an earlier news-scrape step)
            if vl_links_t.text.strip() == news_title[0]:
                print('===found====')
                print('Title:', v_count, " ", vl_links_t.text.strip())
                # Click on the picture link reference
                browser.click_link_by_href(link_href)
                html = browser.html
                soup = BeautifulSoup(html, 'html.parser')
                xvl_jpl_pics = soup.find_all('div', class_='article_image_container')
                for xvc_jpl_pics in xvl_jpl_pics:
                    xlink_href = xvc_jpl_pics.find('a')
                    featured_image_url = xlink_href['href']
                    print('Link to Img :', featured_image_url)
                break
            vl_href.append(vl_links_t)
            v_count += 1
        v_pag += 1
        print('pag', x, "Count", v_count)
        # browser.click_link_by_partial_text('MORE')

    # Close the browser after scraping
    browser.quit()
def parse_wordstat(login, password, request_word, number_of_request_pages=10):
    # Open the Wordstat page and log in
    browser = Browser()
    url = "https://wordstat.yandex.ru/"
    browser.visit(url)
    browser.click_link_by_href('https://passport.yandex.ru/passport?mode=auth&msg=&retpath=https%3A%2F%2Fwordstat.yandex.ru%2F')
    browser.find_by_id('b-domik_popup-username').fill(login)
    sleep(random.randint(5, 10) / 10)
    browser.find_by_id('b-domik_popup-password').fill(password)
    sleep(random.randint(5, 10) / 10)
    button = browser.find_by_css('input[class="b-form-button__input"]')[2]
    button.click()
    sleep(random.randint(5, 10) / 10)

    # Enter the search query
    word_search_input = browser.find_by_css('input[class="b-form-input__input"]').first
    word_search_input = word_search_input.fill(request_word)
    sleep(random.randint(5, 10) / 10)
    span_button = browser.find_by_css('input[class="b-form-button__input"]')[0]
    span_button.click()
    sleep(random.randint(5, 10) / 10)

    queries = []
    frequency = []
    # Walk through the requested number of result pages
    for _ in range(number_of_request_pages):
        # Collect the table cells; query text and frequency cells alternate
        iter_element = browser.find_by_css('td[class*="b-word-statistics__td"]')
        word_flag = True
        for i in range(100):
            if word_flag:
                queries.append(iter_element[i].text)
            else:
                frequency.append(int(''.join(iter_element[i].text.split())))
            word_flag = not word_flag
        browser.click_link_by_href('#next_page')
        sleep(random.randint(5, 10) / 10)

    result = pd.DataFrame(dict(queries=queries, frequency=frequency))
    result = result.sort_values(by=['frequency'], ascending=False)
    result.to_excel("output.xlsx", columns=['queries', 'frequency'], index=False)
    browser.quit()
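The randomized sleep(random.randint(5, 10) / 10) throttle recurs before every action above; a minimal helper makes the intent explicit (the name human_pause is mine, not from the original):

import random
from time import sleep

def human_pause(lo=0.5, hi=1.0):
    # Sleep for a random 0.5-1.0 s, mimicking the per-action pacing
    # used between browser actions in parse_wordstat above.
    sleep(random.uniform(lo, hi))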
def advanced_search(pub_id, search_term, user_name, password,
                    year=None, sports=False, phantom=False):
    if phantom:
        browser = Browser('phantomjs')
    else:
        browser = Browser('firefox')
    login(browser, user_name, password)
    button = browser.find_by_name('doLogin').first.click()
    browser.click_link_by_href('advanced')
    browser.fill('queryTermField', pub_id)
    browser.fill('queryTermField_0', search_term)
    browser.select('fieldsSelect_0', 'all')
    if not sports:
        browser.select('opsSelect_0', 'NOT')
        browser.fill('queryTermField_1', 'sports')
        browser.select('fieldsSelect_1', 'ti')
    if year is not None:
        browser.select('select_multiDateRange', 'ON')
        browser.fill('year2', year)
    search_but = browser.find_by_name('searchToResultPage').first.click()
    return browser
def get_login(url):
    browser = Browser("phantomjs",
                      service_args=['--ignore-ssl-errors=true', '--ssl-protocol=any'])
    browser.visit(url)
    browser.find_by_id('username').fill('*****@*****.**')  # MORPH
    # browser.find_by_id('username').fill('*****@*****.**')
    # browser.find_by_id('username').fill('*****@*****.**')
    # browser.find_by_id('username').fill('*****@*****.**')  # Morph uk
    browser.find_by_id('password').fill('Nrjn1gsa')
    browser.find_by_name('submit').first.click()
    time.sleep(1)
    print(browser.url)
    browser.click_link_by_href("/business/opportunitySearchForm.html")
    time.sleep(1)
    browser.select('status', "")
    browser.find_by_value("Search").first.click()
    time.sleep(2)
    print(browser.url)
    return browser
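PhantomJS is no longer maintained and recent Splinter releases have dropped its driver; a minimal sketch of the same login flow on a headless browser instead (field ids mirror the example above, credentials are placeholders):

import time
from splinter import Browser

def get_login_headless(url, username, password):
    # headless=True replaces the PhantomJS service_args setup above.
    browser = Browser('firefox', headless=True)  # Browser('chrome', headless=True) also works
    browser.visit(url)
    browser.find_by_id('username').fill(username)
    browser.find_by_id('password').fill(password)
    browser.find_by_name('submit').first.click()
    time.sleep(1)
    return browser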
def start_requests(self):
    display = Display(visible=0, size=(1024, 768))
    display.start()
    browser = Browser()
    browser.visit('https://msu.us.company.com')
    browser.fill('ssousername', '*****@*****.**')
    browser.fill('password', 'password')
    browser.click_link_by_href('javascript:doLogin(document.LoginForm);')
    self.cookies = browser.cookies.all()
    browser.quit()
    display.stop()

    file = open("input.bin")
    for line in file:
        line = line.strip('\n')
        self.machine_ids.append(line)
    print(len(self.machine_ids))

    for i, machine_id in enumerate(self.machine_ids):
        url = "https://msu.us.company.com/index.php?search_hostname=&search_domain=&search_serial_no=" + machine_id + "&search_cost_center=&search_building=&search_room=&search_location=&search_group_name=&search_logical_group=&search_system_usage=&search_contact=&search_sa_contact=&search_security_level=&search_in_service=&search_lastmodtime=&search_lastmodflds=&search_codename=&search_model=&search_notes=&search_admin_notes=&search_motherboard_model=&search_bios_info=&search_cpu_info=&compare_cpu_count=%3D&search_cpu_count=&compare_cpu_speed=%3D&search_cpu_speed=&search_architecture=&compare_memory=%3D&search_memory=&search_disk=&compare_storage_numdevices=%3D&search_storage_numdevices=&compare_storage_capacity=%3D&search_storage_capacity=&search_ether=&search_expansion_slots=&search_sound_cards=&search_video_cards=&search_ethernet_cards=&search_network_comments=&search_usertag1=&search_usertag2=&search_usertag3=&search_usertag4=&search_os_bits=&search_ip_addr=&search_os_version=&search_os_build=&search_system_state=&search_system_state_detail=&search_system_state_timestamp=&search_last_ping=&search_last_ping_att=&search_uptime=&search_console_access=&search_console_type=&search_sp_access=&search_sp_type=&search_console_patch_port=&search_conserver_server=&search_rpc=&search_rpc_type=&search_outlet=&search_outlet_type=&search_reservation_status=Any&search_reservation_type=&search_reserved_by=&compare_reserved_start_time=%3D&search_reserved_start_time_mm=&search_reserved_start_time_dd=&search_reserved_start_time_yy=&compare_reserved_end_time=%3D&search_reserved_end_time_mm=&search_reserved_end_time_dd=&search_reserved_end_time_yy=&search_reserved_comment=&search_reserved_project=&search_gq_pri=&search_gq_max=&search_gq_beg=&search_gq_end=&search_reservation_limit=&output=Standard&AdvancedSearch=1&Search=Search"
        yield Request(url, cookies=self.cookies, headers=self.headers,
                      meta={'id': machine_id}, callback=self.search_result,
                      dont_filter=True)
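The cookie handoff in start_requests is not Scrapy-specific; a sketch of the same pattern with plain requests (URL and form fields as above, credentials are placeholders):

import requests
from splinter import Browser

browser = Browser('firefox', headless=True)
browser.visit('https://msu.us.company.com')
browser.fill('ssousername', 'user@example.com')  # placeholder
browser.fill('password', 'password')             # placeholder
browser.click_link_by_href('javascript:doLogin(document.LoginForm);')
cookies = browser.cookies.all()  # dict of cookie name -> value
browser.quit()

# Reuse the authenticated session for plain HTTP requests.
session = requests.Session()
session.cookies.update(cookies)
response = session.get('https://msu.us.company.com/index.php')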
def getRoutes(start, end):
    browser = Browser(driver_name="firefox")
    browser.visit('https://www.hopstop.com/search?xfr=cityscape')
    print(browser.url)
    browser.fill('address1', str(start))
    browser.fill('address2', str(end))
    browser.find_by_name('get_dirs').click()
    print(browser.url)
    if browser.is_text_present('Did you mean?'):
        browser.click_link_by_href("#")
    if browser.is_text_present('Did you mean?'):
        browser.click_link_by_href("#")
    browser.click_link_by_href("#")
    links = browser.find_link_by_partial_href("/station?stid")
    results = []
    for link in links:
        results.append(link.value)
    return results
def get_login(url):
    browser = Browser("phantomjs",
                      service_args=['--ignore-ssl-errors=true', '--ssl-protocol=any'])
    browser.visit(url)
    # browser.find_by_id('username').fill('*****@*****.**')  # MORPH
    # browser.find_by_id('username').fill('*****@*****.**')
    # browser.find_by_id('username').fill('*****@*****.**')
    browser.find_by_id('username').fill('*****@*****.**')  # MORPH UK
    browser.find_by_id('password').fill('Nrjn1gsa')
    browser.find_by_name('submit').first.click()
    time.sleep(1)
    print(browser.url)
    try:
        browser.click_link_by_href("/business/opportunitySearchForm.html")
        time.sleep(1)
        browser.click_link_by_href("opportunityAdvancedSearchForm.html")
        time.sleep(2)
        # browser.find_by_value('All').first.click()
        browser.select('status', "")
        browser.select('area', "9")  # 'area' is 'class name', not just name?
        time.sleep(3)
        print(browser.find_by_value('Add All'))  # TODO
        print(browser.html)
        browser.find_by_value('Add All').first.click()
        print('added all England only')  # TODO
        time.sleep(2)
        browser.find_by_value("Search").first.click()
        time.sleep(2)
    except Exception as e:
        print('error: ', e)
        browser.click_link_by_href("/business/logoutHosts.html")
        time.sleep(4)
        browser.quit()
        sys.exit("login failed")
    print(browser.url)
    return browser
user_email = input("enter users email address ")
user_pass = input("enter users password ")
browser.visit('http://www.facebook.com')
browser.fill('email', user_email)
browser.fill('pass', user_pass)
# Here is where I made a slight change
button = browser.find_by_id('loginbutton')
button.click()
# I didn't find a page-saving function for Facebook in Splinter, but the
# screenshot feature works as an alternative.

# The site we will navigate into, handling its session
browser.visit('http://www.baseball-reference.com/my/auth.cgi?return_to=http://www.baseball-reference.com/')
browser.click_link_by_href('/my/auth.cgi?do=oauth_login&service=facebook&return_to=')
# print(response.read())

# scrape_new_data(starting_year, ending_year, browser, only_pitching_data)
if (pitching_data == 1) and (batting_data == 0):
    scrape_new_data(starting_year, ending_year, browser, 1)
else:
    scrape_new_data(starting_year, ending_year, browser, 0)

if start_time == 1:
    get_start_time_data()

if team_schedules == 1:
    get_team_schedules(2004, 2015)
    convert_excel_date_format('team_schedules.csv', 0)

if batting_order == 1:
change_brig(screenshot)
crop_img("captcha.png")
img = threshold("captcha.png")
captcha = tesseract(img)
# time.sleep(2)
print(captcha)
bro.fill('usuario', 'J311968199')
bro.fill('contrasenia', 'J-311968199a')
bro.fill('captcha', str(captcha))
bro.find_by_id('btnLoginSisap').click()

flag = False
while not flag:
    ejecutar()
    principal_menu = bro.find_by_id("principal-menu")
    if principal_menu != []:
        principal_menu.click()
        bro.click_link_by_href("/informacion-general/informacion-seniat")
        bro.click_link_by_href("#inf_accionistas")
        bro.click_link_by_href("/accionistas/gestion")
        bro.select("id_tipo_relacion_empresa", "526")
        bro.select("id_pais", "229")
        bro.fill("correo", "*****@*****.**")
        bro.fill("cantidad_acciones", "1234")
        # bro.find_by_id("btnAccionistas").mouse_over()
        flag = True
# ipdb.set_trace()
def scrape_Mars1():
    # NASA Mars news: latest title and teaser paragraph
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    url = "https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest"
    browser.visit(url)
    html = browser.html
    soup = bs(html, 'html.parser')
    News_Title = soup.find('div', class_='content_title').text
    News_Paragraph = soup.find('div', class_='article_teaser_body').text

    # ----------------------------------
    # JPL featured image
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(url)
    browser.click_link_by_partial_text('FULL IMAGE')
    time.sleep(10)
    browser.click_link_by_partial_text('more info')
    html = browser.html
    soup = bs(html, 'html.parser')
    img = soup.find('figure', class_='lede')
    full_size = img.find("a")["href"]
    browser.click_link_by_href(full_size)
    html = browser.html
    soup = bs(html, 'html.parser')
    featured_image_url = soup.find('img')['src']

    # --------------------------------------------
    # Latest Mars weather tweet
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    url = "https://twitter.com/marswxreport?lang=en"
    response = requests.get(url)
    soup = bs(response.text, 'html.parser')
    results = soup.find('div', class_="js-tweet-text-container")
    mars_weather = results.p.text

    # -----------------------------------------------
    # Mars facts table
    url = "https://space-facts.com/mars/"
    mars_table = pd.read_html(url)
    df = mars_table[0]
    df.columns = ["Measurements", "Results"]
    df.set_index("Measurements", inplace=True)
    mars_html_table = df.to_html()
    df.to_html('table.html')

    # -------------------------------------------
    # Hemisphere images
    Url_List = []
    Hemispheres = [
        'Cerberus Hemisphere Enhanced',
        'Schiaparelli Hemisphere Enhanced',
        'Syrtis Major Hemisphere Enhanced',
        'Valles Marineris Hemisphere Enhanced'
    ]
    for Hemisphere in Hemispheres:
        executable_path = {'executable_path': 'chromedriver.exe'}
        browser = Browser('chrome', **executable_path, headless=False)
        url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
        browser.visit(url)
        browser.click_link_by_partial_text(Hemisphere)
        html = browser.html
        soup = bs(html, 'html.parser')
        img_url = soup.find(
            'div', class_="downloads").find('ul').find('li').find('a')['href']
        Url_List.append(img_url)

    Hem_List = [[Hemispheres[0], Url_List[0]], [Hemispheres[1], Url_List[1]],
                [Hemispheres[2], Url_List[2]], [Hemispheres[3], Url_List[3]]]
    labels = {0: "Title", 1: "Img_Url"}
    Hem_Dict = [{labels[idx]: val for idx, val in enumerate(item)}
                for item in Hem_List]
    H1, H2, H3, H4 = Hem_Dict

    scrape = {
        "News_Title": News_Title,
        "News_Paragraph": News_Paragraph,
        "Featured_Image": featured_image_url,
        "Mars_Tweet": mars_weather,
        "Mars_Table": mars_html_table,
        'Cerberus': H1["Title"],
        'Cerberus_Img': H1["Img_Url"],
        'Schiaparelli': H2["Title"],
        'Schiaparelli_Img': H2["Img_Url"],
        'Syrtis': H3["Title"],
        'Syrtis_Img': H3["Img_Url"],
        'Valles': H4["Title"],
        'Valles_Img': H4["Img_Url"],
    }
    return scrape
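scrape_Mars1 opens a fresh Chrome instance for each hemisphere; a sketch of the same loop reusing one browser, assuming the Hemispheres list, the bs alias, and the page markup from the example above:

browser = Browser('chrome', **executable_path, headless=False)
url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
Url_List = []
for Hemisphere in Hemispheres:
    # Revisit the index page, follow the hemisphere link, grab the
    # full-size image URL, then loop without restarting Chrome.
    browser.visit(url)
    browser.click_link_by_partial_text(Hemisphere)
    soup = bs(browser.html, 'html.parser')
    Url_List.append(soup.find('div', class_="downloads").find('a')['href'])
browser.quit()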
def scrape_iwp(a_startpage=1, a_pagecount=20000):
    # Initialize PyMongo to work with MongoDB
    conn = 'mongodb://localhost:27017'
    client = pymongo.MongoClient(conn)

    # Define database and collection
    db = client.etl_db

    # Set up the splinter Browser
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)

    # URL of page to be scraped
    # url_iwp = 'https://iwaspoisoned.com'
    # UPDATE: Added the "?page=" to restart scraping on pages not already obtained
    url_iwp = 'https://iwaspoisoned.com/?page=' + str(a_startpage)

    # Visit the IWP page
    browser.visit(url_iwp)

    # Extract incidents from multiple pages
    page_target = int(a_pagecount)

    # How long to wait between pages to avoid triggering issues on the website
    page_wait = 2

    # Count the number of pages visited
    n_pages = 0

    # Loop until no more pages or until the page target is reached
    full_incident_list = []
    for j in range(page_target):
        # Get a page full of incidents from the USA
        i_list = parse_incident_page(browser.html)
        n_pages += 1

        # Add this list of incidents to a running list
        # full_incident_list.extend(i_list)

        # Add this list of incidents to the Mongo database
        try:
            # Attempt the insert
            insert_results = db.iwp.insert_many(i_list)
            # Print a progress marker
            print(f"Page {n_pages} of {a_pagecount}: {len(insert_results.inserted_ids)} of {len(i_list)} incidents added to DB. Total incidents: {db.iwp.count_documents({})}")
        except TypeError:
            # It's possible the incident list was empty, which could trigger a TypeError.
            # This is the case since it is being filtered for only Illinois, USA incidents
            print(f">> Page {n_pages}: No incidents captured")

        # Check to see if a hyperlink with attribute 'rel' = 'next' is present
        soup_thispage = BeautifulSoup(browser.html, 'lxml')
        next_tag = soup_thispage.find('a', {'rel': 'next'})
        if next_tag:
            # Ok, there is a next page - get the hyperlink
            # print(f"DEBUG: Going to next page (next_tag = '{next_tag}' ")
            try:
                next_page_url = next_tag['href']
                # Wait for a specified number of seconds
                time.sleep(page_wait)
                # Click it!
                browser.click_link_by_href(next_page_url)
                # DEBUG ****************************************
                # if n_pages > 3:
                #     break
            # If KeyError occurs, then this tag has no html link for some reason
            except KeyError:
                break
        else:
            # No more pages - break out of this loop
            break

    # Close the Browser
    browser.quit()

    # Return the number of pages scraped
    return n_pages

# EXAMPLE:
# Command to start at page 1 of iwaspoisoned.com and scrape 10 pages,
# only keeping incidents that occurred in Illinois, USA
#
# In a _separate_ Python file, include the code below:
#*******************************************************************************
# Import ETL Scraper function `scrape_iwp` from the local file `etl_scrape_iwp`
# from etl_scrape_iwp import scrape_iwp
#
# Use the function to scrape pages
# pages_scraped = scrape_iwp(1, 10)
#*******************************************************************************
browser.find_by_xpath('//h1')
browser.find_by_tag('h1')
browser.find_by_name('name')
browser.find_by_text('Hello World!')
browser.find_by_id('firstheader')
browser.find_by_value('query')

# Get an element
first_found = browser.find_by_name('name').first
last_found = browser.find_by_name('name').last
second_found = browser.find_by_name('name')[1]

# Get the value of an element
browser.find_by_css('h1').first.value

# Clicking links: each clicks the first matching link
browser.click_link_by_href('http://www.the_site.com/my_link')
browser.click_link_by_partial_href('my_link')
browser.click_link_by_text('my link')
browser.click_link_by_partial_text('part of link text')
browser.click_link_by_id('link_id')

# Check whether an element is visible or invisible
browser.find_by_css('h1').first.visible

# Fill content
browser.find_by_id('productName').fill(
    'splinter - python acceptance testing for web applications')
browser.fill('q', 'splinter - python acceptance testing for web applications')

# Verify whether an element has a given class name
browser.find_by_css('.content').first.has_class('content')
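Recent Splinter releases deprecate the click_link_by_* helpers listed above in favor of the browser.links namespace; the equivalents look like this (worth checking against the installed Splinter version):

# Newer-Splinter equivalents: each finder returns an element list,
# and clicking the first match reproduces click_link_by_*.
browser.links.find_by_href('http://www.the_site.com/my_link').first.click()
browser.links.find_by_partial_href('my_link').first.click()
browser.links.find_by_text('my link').first.click()
browser.links.find_by_partial_text('part of link text').first.click()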
# Parse home page to gather links to traverse
bs = BeautifulSoup(browser.html, 'html.parser')
results = bs.find_all('a', class_="player-name")
aref_list = []
for result in results:
    aref_list.append(result['href'])

# Utilize the list of hrefs to scrape player bio and stats pages and store
# them into a list of DataFrames.
bs = BeautifulSoup(browser.html, 'html.parser')
list_df_hitter = []
list_df_hitter_stats = []
for aref in aref_list[:TopN]:
    bio_dict = {}
    time.sleep(1)
    browser.click_link_by_href(aref)
    time.sleep(1)
    bio = BeautifulSoup(browser.html, 'html.parser')

    # Calculate player name
    bio_name = bio.find('div', 'pull-left primary-heading-subheading')
    player = bio_name.text.lstrip().split('\n')[0].rstrip()
    print(player, flush=True)
    bio_results = bio.find_all('span', 'bio-detail')

    # Get player bio information
    college = ''
    for bio_result in bio_results:
        attr = bio_result.text.split(':')[0]
        if attr == "Age":
            age = int(bio_result.text.split(':')[1])
        elif attr == "College":
class ChopeBrowser:
    def __init__(self, headless=False):
        self.chrome = Browser('chrome', headless=headless)

    def time_delay(self, time):
        # Abuse the wait_time of a never-matching selector as a sleep
        self.chrome.is_element_present_by_name('!@#$%^&*())(*&^%$#@!',
                                               wait_time=time)

    def login(self, usr, pwd, domain='STUDENT'):
        url = 'https://ntupcb.ntu.edu.sg'
        url += '/fbscbs/Account/SignIn?ReturnUrl=%2ffbscbs'
        self.chrome.visit(url)
        dropdown = self.chrome.find_by_tag('option')
        for option in dropdown:
            if option.text == domain:
                option.click()
        self.chrome.fill('Username', usr)
        self.chrome.fill('Password', pwd + '\n')

    def first_setup(self):
        button = self.chrome.find_by_id('tdFacilityBook')
        button.click()
        self.chrome.click_link_by_href('#8')
        self.chrome.click_link_by_href('#-1')
        self.chrome.click_link_by_href('/fbscbs/Booking/Create?resourceId=69')
        self.chrome.click_link_by_id('book')
        self.chrome.click_link_by_id('changeResource')
        self.chrome.click_link_by_href('#-1')
        self.chrome.click_link_by_id('book')

    def is_registered(event):
        if event.has_class('noShowWhite'):
            return False
        if event.has_class('currentEvent'):
            return False
        return True

    def check_facility(self, evFacilities):
        columnWeek = self.chrome.find_by_css('.wc-event-column')
        evWeek = []
        for columnDay in columnWeek:
            evToday = []
            evList = columnDay.find_by_css('.ui-corner-all')
            for event in evList:
                if not event.has_class('noShowWhite'):
                    if not event.has_class('currentEvent'):
                        event = event.text
                        if not event.find('—') == -1:
                            if event == '':
                                continue
                            evToday.append(event.split('—'))
            evWeek.append(evToday)
        evFacilities.append(evWeek)

    def click_next(self, counter, evFacilities):
        # Works together with check_facility; selects the facility
        # option based on counter.
        dropdown = self.chrome.find_by_id('ResourceId')
        options = dropdown.find_by_tag('option')
        if counter < len(options):
            nextOption = options[counter]
            nextOption.click()
            self.check_facility(evFacilities)
        else:
            return evFacilities

    def scrape_seats(self, usr, pwd):
        self.login(usr, pwd)
        self.first_setup()
        evFacilities = []
        dropdown = self.chrome.find_by_id('ResourceId')
        options = dropdown.find_by_tag('option')
        for opt in options:
            nextOption = opt
            nextOption.click()
            self.time_delay(0.1)
            # while loadingTitle.visible:
            #     pass
            evFacilities.append(opt.text)
            self.check_facility(evFacilities)
        return evFacilities

    def quit(self):
        self.chrome.quit()
parser = SafeConfigParser()
parser.read('config.ini')

browser = Browser(parser.get('Config', 'Browser'))
browser.driver.maximize_window()
browser.visit('https://fsweb.no/studentweb/login.jsf?inst=' +
              parser.get('Config', 'Institution'))
browser.find_by_text('Norwegian ID number and PIN').first.click()
browser.find_by_id('login-box')
browser.fill('j_idt129:j_idt131:fodselsnummer', parser.get('Config', 'Fodselsnummer'))
browser.fill('j_idt129:j_idt131:pincode', parser.get('Config', 'Pin'))
browser.find_by_text('Log in').first.click()
browser.click_link_by_href('/studentweb/resultater.jsf')

tags = browser.find_by_tag('tr')
chars = []
for tag in tags:
    if tag.has_class('resultatTop') or tag.has_class('none'):
        inner_tags = tag.find_by_tag('td')
        course_id = inner_tags[1].text.split("\n")[0]
        course_name = inner_tags[1].text.split("\n")[1]
        grade = inner_tags[5].text
        if grade != 'passed':
            chars.append(grade)
            print("%s\t%-30s\t%s" % (course_id, course_name, grade))
executable_path = {"executable_path": "chromedriver"}
browser = Browser("chrome", **executable_path, headless=False)

base_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
browser.visit(base_url)

# html = browser.html
# soup = BeautifulSoup(html, 'html.parser')
# img_urls = []
# pic_dict = {'title': [], 'img_url': []}
# pictures = soup.find_all('div', {"class": "item"})
# a = pictures[0].find_all("a")
# print(a[0]["href"])

browser.click_link_by_href('/search/map/Mars/Viking/cerberus_enhanced')

# print(pictures)
# for pic in pictures:
#     try:
#         t = pic.get_text()
#         title = t.strip('Enhanced')
#         time.sleep(5)
#         browser.click_link_by_href('/search/map/Mars/Viking/cerberus_enhanced')
#     except:
#         raise
#     finally:
#         browser.quit()
# browser.click_link_by_partial_href('Enhanced')
# browser.click_link_by_partial_text(t)
assert browser.url != "http://54.191.193.7:5000/email"
print("=========================================================")

# Test Case 6
print("Running Test Case 6: Classify image and click on link for more information")
browser.visit('http://54.191.193.7:5000/')
print("Visiting browser...")
time.sleep(2)
element = browser.driver.find_element_by_id("imageFile")
pathToImage = os.path.abspath("static/testing/Capture5.JPG")
element.send_keys(pathToImage)
print("Image chosen...")
time.sleep(2)
browser.click_link_by_id('submit')
assert browser.is_text_present('Image Uploaded') == True
browser.click_link_by_href("https://www.hse.ie/eng/health/az/h/hives%20-%20acute/causes-of-urticaria.html")
print("Link clicked...")
time.sleep(2)
browser.windows.current = browser.windows[1]
assert browser.url == "https://www.hse.ie/eng/health/az/h/hives%20-%20acute/causes-of-urticaria.html"
browser.windows[1].close()
browser.windows.current = browser.windows[0]
print("=========================================================")

# Test Case 7
print("Running Test Case 7: Classify image and click on both links for more information")
browser.visit('http://54.191.193.7:5000/')
print("Visiting browser...")
time.sleep(2)
element = browser.driver.find_element_by_id("imageFile")
pathToImage = os.path.abspath("static/testing/Capture5.JPG")
def looping(self):
    NETID = self.controller.frames["MainPage"].ID_entry.get()
    PASSWD = self.controller.frames["MainPage"].PW_entry.get()
    CLS_LST = self.controller.frames["MainPage"].targets
    URL = "https://schedule.msu.edu"
    URL_PLAN = "https://schedule.msu.edu/Planner.aspx"
    b = Browser('chrome', headless=HEADLESS, **executable_path)

    for course in CLS_LST:
        tar = course.split()
        TERM = "{} {}".format(tar[1], tar[0])
        SUB = tar[2]
        SUB_NUM = tar[3]
        SEC = "{:03}".format(int(tar[4][3:]))
        try:
            # Put every listed class into the user's account planner
            b.visit(URL)
            # term = b.find_by_text(TERM).value
            term = re.findall(
                '<option .*?value="(.+)??".*?>{}(-Tentative)?</option>'.format(TERM),
                b.html)[0][0]
            # b.find_by_id("MainContent_SrearchUC_ddlTerm").select(term)
            # b.find_by_id("MainContent_SrearchUC_ddlSubject").select(SUB)
            # b.find_by_id("MainContent_SrearchUC_txtCourseNumber").fill(SUB_NUM)
            # b.find_by_id("MainContent_SrearchUC_btnSubmit").click()
            b.find_by_id("MainContent_ddlTerm").select(term)
            b.find_by_id("MainContent_ddlSubject").select(SUB)
            b.find_by_id("MainContent_txtCourseNumber").fill(SUB_NUM)
            b.find_by_id("MainContent_btnSubmit").click()
            combo = "{} {} Section {}".format(SUB, SUB_NUM, SEC)
            link = re.findall(
                '<a href="(.+)?" title="[^"]+add {} to your planner"?>'.format(combo),
                b.html)[0]
            b.click_link_by_href(link)
            self.checkLogin([b], URL_PLAN, NETID, PASSWD)
            self.status_table[course] = "READY"
            self.ready_table[course] = ["-1", combo]
        except:
            # print("Error:", sys.exc_info()[0])
            self.status_table[course] = "ERROR"
    self.updateStatus(CLS_LST)

    # Now go to the planner
    b.visit(URL_PLAN)
    self.checkLogin([b], URL_PLAN, NETID, PASSWD)
    # Find the plan idx
    self.updateReady(b.html)
    # print(self.ready_table)

    STATUS_CODE = "MainContent_UCPlan_rptPlanner_tdStatus_"
    ENROLL_CODE = "MainContent_UCPlan_rptPlanner_imgEnroll_"
    CONTINUE_CODE = "MainContent_btnContinue"
    to_delete = None

    # Loop until every watched course is resolved
    while len(self.ready_table) > 0:
        b.visit(URL_PLAN)
        self.checkLogin([b], URL_PLAN, NETID, PASSWD)
        for course in self.ready_table:
            plan_idx = self.ready_table[course][0]
            combo = self.ready_table[course][1]
            # print(b.find_by_id(STATUS_CODE + plan_idx).text)
            if "Open" in b.find_by_id(STATUS_CODE + plan_idx).text:
                # Section open!! Enroll in the class
                b.find_by_id(ENROLL_CODE + plan_idx).click()
                b.find_by_id(CONTINUE_CODE).click()
                if b.html.find("The course has been added to your schedule.") != -1:
                    # Enrolled successfully
                    self.status_table[course] = "ENROLLED"
                else:
                    # FAILED
                    self.status_table[course] = "FAILED"
                to_delete = course
                self.updateStatus(CLS_LST)
                break
        if to_delete is not None:
            b.visit(URL_PLAN)
            self.checkLogin([b], URL_PLAN, NETID, PASSWD)
            del self.ready_table[to_delete]
            self.updateReady(b.html)
            to_delete = None
        else:
            time.sleep(1)  # sleep 1 second
    self.updateStatus(CLS_LST, True)
    b.quit()
class ChopeBrowser:
    def __init__(self, headless=False):
        self.chrome = Browser('chrome', headless=headless)

    def time_delay(self, time):
        # Abuse the wait_time of a never-matching selector as a sleep
        self.chrome.is_element_present_by_name('!@#$%^&*())(*&^%$#@!',
                                               wait_time=time)

    def login(self, usr, pwd, domain='STUDENT'):
        url = 'https://ntupcb.ntu.edu.sg'
        url += '/fbscbs/Account/SignIn?ReturnUrl=%2ffbscbs'
        self.chrome.visit(url)
        dropdown = self.chrome.find_by_tag('option')
        for option in dropdown:
            if option.text == domain:
                option.click()
        self.chrome.fill('Username', usr)
        self.chrome.fill('Password', pwd + '\n')

    # PC BOOKING STARTS HERE
    # Tries to book a PC of the selected type
    def pc_setup(self, usr, pwd, Type):
        self.login(usr, pwd)
        button = self.chrome.find_by_id('tdPcBook')
        button.click()
        time.sleep(2)
        with self.chrome.get_iframe('frmAdminViewControls') as iframe:
            iframe.find_by_id('pnlInsLoc3').click()
        self.type_number(Type)
        data = self.scrape_pc()
        can_book = self.book_pc(data[1], data[2])
        self.chrome.quit()
        return data[0], can_book

    # Identify the PC type requested
    def type_number(self, Types):
        for i in range(0, 4):
            with self.chrome.get_iframe('frmAdminViewControls') as iframe:
                page = iframe.find_by_id('pnlInsPcGrp' + str(i))
                if page != []:
                    page = page.html
                    page = BeautifulSoup(page, "lxml")
                    page = page.find("span", {
                        "style": "display:inline-block;height:20px;width:80px;"
                    })
                    page = page.get_text()
                    if page == Types:
                        page = iframe.find_by_id('pnlInsPcGrp' + str(i)).click()
                        return
        return 0

    # Scrape all PCs in the current screen
    def scrape_pc(self):
        with self.chrome.get_iframe('frmSeating') as iframe:
            for i in range(0, 6):
                for j in range(1, 11):
                    btnID = 'grdSeating_tblCol' + str(j) + '_' + str(i)
                    parse = iframe.find_by_id(btnID)
                    if parse == []:
                        return 'no pc', 100, 100
                    if parse != []:
                        color = self.color(parse.html)
                        if color == '#FFFFFF':
                            return self.name_pc(parse.html), j, i
            no_pc = 'no pc'
            j = 100
            i = 100
            return no_pc, j, i

    # Identify the name of a PC
    def name_pc(self, codes):
        soup = BeautifulSoup(codes, "lxml")
        mydivs = soup.findAll("span", {"class": "lblPcName"})
        return mydivs[0].get_text()

    # Check availability of a PC by detecting its background color
    def color(self, code):
        soup = BeautifulSoup(code, "lxml")
        tag = soup.findAll('td', {"style": "background-color: #FFFFFF"})
        if tag != []:
            return '#FFFFFF'
        else:
            return 'blabla'

    # Try to book the selected PC
    def book_pc(self, col, row):
        with self.chrome.get_iframe('frmSeating') as iframe:
            if (col != 100) and (row != 100):
                try:
                    time.sleep(1)
                    butt = iframe.find_by_id("grdSeating_divOuterCol" +
                                             str(col) + "_" + str(row))
                    if butt != []:
                        butt.click()
                        time.sleep(1)
                        sub = iframe.find_by_name("btnsumit")
                        sub.click()
                        return "booked"
                except:
                    pyautogui.press('enter')
                    return "cannot book"
            return "cannot book"

    # Initialize the booking site until arriving at the booking table
    def first_setup(self):
        button = self.chrome.find_by_id('tdFacilityBook')
        button.click()
        self.chrome.click_link_by_href('#8')
        self.chrome.click_link_by_href('#-1')
        self.chrome.click_link_by_href('/fbscbs/Booking/Create?resourceId=69')
        self.chrome.click_link_by_id('book')
        self.chrome.click_link_by_id('changeResource')
        self.chrome.click_link_by_href('#-1')
        self.chrome.click_link_by_id('book')

    # Eliminates unnecessary booking slots
    def is_registered(event):
        if event.has_class('noShowWhite'):
            return False
        if event.has_class('currentEvent'):
            return False
        return True

    # Adds weekly booked slots for the selected facility.
    # Each list of weekly bookings contains lists of daily bookings,
    # each containing booked slots given by start and end time.
    def check_facility(self, evFacilities):
        columnWeek = self.chrome.find_by_css('.wc-event-column')
        evWeek = []
        for columnDay in columnWeek:
            evToday = []
            evList = columnDay.find_by_css('.ui-corner-all')
            for event in evList:
                if not event.has_class('noShowWhite'):
                    if not event.has_class('currentEvent'):
                        event = event.text
                        if not event.find('—') == -1:
                            if event == '':
                                continue
                            evToday.append(event.split('—'))
            evWeek.append(evToday)
        evFacilities.append(evWeek)

    def click_next(self, counter, evFacilities):
        # Recursively check facilities.
        # Choose the facility based on counter
        dropdown = self.chrome.find_by_id('ResourceId')
        options = dropdown.find_by_tag('option')
        if counter < len(options):
            nextOption = options[counter]
            nextOption.click()
            self.check_facility(evFacilities)
        else:
            return evFacilities

    # Scrape seats main function
    # OPTIMIZE: by multithreading
    # and by running multiple browsers at once
    def scrape_seats(self, usr, pwd):
        self.login(usr, pwd)
        self.first_setup()
        evFacilities = []
        dropdown = self.chrome.find_by_id('ResourceId')
        options = dropdown.find_by_tag('option')
        optRange = range(len(options))
        for i in optRange:
            opt = options[i]
            nextOption = opt
            nextOption.click()
            self.time_delay(0.2)
            # while loadingTitle.visible:
            #     pass
            evFacilities.append(opt.text)
            self.check_facility(evFacilities)
        self.quit()
        return evFacilities

    def quit(self):
        self.chrome.quit()
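The time_delay trick above piggybacks on the wait_time of a selector that never matches; recent Splinter finders accept wait_time directly, which expresses the same implicit wait without the sentinel (a sketch, reusing an element id from the page above):

from splinter import Browser

browser = Browser('chrome', headless=True)
browser.visit('https://ntupcb.ntu.edu.sg/fbscbs')
# wait_time retries the lookup for up to 5 seconds before returning,
# replacing fixed sleeps around slow page loads.
dropdown = browser.find_by_id('ResourceId', wait_time=5)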
self.classes.append(each_class)

print("Visiting: https://gymbox.legendonlineservices.co.uk/enterprise/account/Login")
browser.visit("https://gymbox.legendonlineservices.co.uk/enterprise/account/Login")
print("\tSuccess")

# LOGIN #
print("Logging In..")
browser.find_by_id("login_Email").fill(username)
browser.find_by_id("login_Password").fill(password)
browser.find_by_id("login").click()
print("\tSuccess")

# NAVIGATE TO CLASSES #
print("Navigating to: /enterprise/BookingsCentre/MemberTimetable")
browser.click_link_by_href("/enterprise/BookingsCentre/MemberTimetable")
print("\tSuccess")

# CREATE LIST OF OBJECTS CONTAINING DAYS AND CLASSES #
print("Building Classes Timetable..")
list_of_days = []
for each_row in browser.find_by_id("MemberTimetable").find_by_tag("tr"):
    each_class = {}
    if each_row.has_class("dayHeader"):
        day_obj = Day(each_row)
    else:
        day_obj.add_class(each_row)
    if day_obj not in list_of_days:
        list_of_days.append(day_obj)
print("\tSuccess\n")
def getRoutes(start, end):
    browser = Browser(driver_name="firefox")
    browser.visit('https://www.hopstop.com/search?xfr=cityscape')
    print(browser.url)
    browser.fill('address1', str(start))
    browser.fill('address2', str(end))
    browser.find_by_name('get_dirs').click()
    print(browser.url)
    if browser.is_text_present('Did you mean?'):
        print("better at least get here")
        # browser.click_link_by_href("#")
        for link in browser.find_link_by_href("#"):
            print("Okay")
            if link.visible == True:
                print(link.text)
                browser.click_link_by_text(link.text)
                break
    browser.click_link_by_href("#")
    links = browser.find_link_by_partial_href("/station?stid")
    results = []
    for link in links:
        results.append(link.value)
    browser.quit()
    return results