class CaptchaPage():
    """Drive a local reCAPTCHA demo page (index.html in the CWD) via Firefox.

    Legacy Python 2 module: uses print statements, ConfigParser and
    os.getcwdu(). Reuses the user's default Firefox profile (Profile0).
    """

    def __init__(self):
        print "Captcha Page Initializing"
        parser = ConfigParser.ConfigParser()
        base_path = os.path.join(os.environ['HOME'], '.mozilla/firefox/')
        parser.read(os.path.join(base_path, "profiles.ini"))
        # Resolve the profile directory: the 'Path' key of the [Profile0] section.
        profile_path = os.path.join(base_path, filter(lambda x: x[0].lower() == 'path', parser.items('Profile0'))[0][1])
        try:
            profile = FirefoxProfile(profile_path)
        except OSError:
            # The profile dir is often unreadable when the script runs as another
            # user; surface the exact chmod command needed to fix it.
            raise Exception("You must execute the following command:\nsudo chmod +r -R %s" % profile_path)
        self.driver = Firefox(profile)
        self.driver.get("file://%s/index.html" % os.getcwdu())

    def get_url_sound(self):
        # Switch the widget to audio mode, then return the audio challenge URL.
        self.driver.find_element_by_xpath('//*[@id="recaptcha_switch_audio"]').click()
        return self.driver.find_element_by_xpath('//*[@id="recaptcha_audio_download"]').get_attribute('href')

    def get_recaptcha_challenge_field(self):
        # Hidden form field carrying the current challenge token.
        return self.driver.find_element_by_xpath('//*[@id="recaptcha_challenge_field"]').get_attribute('value')

    def get_captcha_textbox(self):
        # Wraps the response input in a project-level Textbox helper.
        print "Getting Captcha Textbox"
        return Textbox(self.driver.find_element_by_xpath('//*[@id="recaptcha_response_field"]'))

    def get_submit_button(self):
        # Wraps the form's submit input in a project-level Button helper.
        print "Getting Submit Form Button"
        return Button(self.driver.find_element_by_xpath("/html/body/form/input"))

    def close(self):
        print "Closing Captcha Page"
        self.driver.close()
def main(argv=sys.argv[1:]):
    """Open the target page in a browser and print the text of every <li>."""
    cli = argparse.ArgumentParser()
    cli.add_argument('--url', default='http://127.0.0.1:8000/static/index.html')
    target = cli.parse_args(argv).url

    driver = WebDriver()
    driver.get(target)
    for item in driver.find_elements_by_css_selector('li'):
        print(item.text)
    driver.close()
class FunctionalTests(LiveServerTestCase):
    """Base class for the functional (browser-driven) tests."""

    def setUp(self):
        """Start the services needed to run the functional tests."""
        self.driver = Firefox()
        self.driver.maximize_window()
        self.driver.implicitly_wait(5)

    def tearDown(self):
        """Shut the browser down.

        BUG FIX: previously called close(), which only closes the current
        window and can leave the geckodriver process and its temporary
        profile behind after every test; quit() ends the whole session.
        """
        self.driver.quit()

    def get_live_url(self, url_name):
        """Return url_name resolved against the live test server's base URL."""
        return '{}{}'.format(self.live_server_url, reverse(url_name))
class ContentRetrieverUsingSelenium:
    """Fetch pages with a real Firefox browser and return them parsed by BeautifulSoup."""

    def __init__(self, timeout):
        # timeout: seconds to wait after navigation so JS-rendered content settles.
        self.browser = Firefox()
        self.timeout = timeout

    def getContentOfPage(self, url):
        """Load `url`, wait `timeout` seconds, return (final_url, soup).

        The source is re-encoded to GBK with errors ignored -- presumably the
        target pages are Chinese/GBK; confirm before reusing on other sites.
        """
        self.browser.get(url)
        time.sleep(self.timeout)
        page_source = self.browser.page_source
        page_source = page_source.encode('gbk', 'ignore')
        # current_url may differ from `url` after redirects.
        return (self.browser.current_url, BeautifulSoup(page_source))

    def close(self):
        # Closes the current browser window.
        self.browser.close()
def read_url(url):
    """Open `url` in Firefox and scroll to the bottom in random, human-like steps.

    Relies on a module-level `options` (Firefox Options) defined elsewhere.
    """
    driver = Firefox(options=options)
    driver.maximize_window()
    driver.get(url)
    time.sleep(4)  # let the initial page render before measuring its height
    height = driver.execute_script("return document.body.scrollHeight")
    print(height)
    position = 0
    while position < height:
        driver.execute_script(f"window.scrollTo(0, {position});")
        delta = random.randint(50, 500)
        position += delta
        # Pause roughly proportional to the distance scrolled (2-25 s).
        duration = delta // 20
        # print(height, position, delta, duration)
        time.sleep(duration)
    driver.close()
## id/pwd 정보 set 후 login id_txt.send_keys('korea7030') pw_txt.send_keys('akachiki10!') login_btn.submit() chrome.implicitly_wait(50) for article in article_url: chrome.get(article) chrome.switch_to.frame('cafe_main') soup_body = BeautifulSoup(chrome.page_source, "lxml") try: with open('content.csv', 'a', encoding='utf8') as f: writer = csv.writer(f) article_body = soup_body.find('div', {"id": "tbody"}) print(article_body.text) writer.writerow([article_body.text]) reply_tag = soup_body.find_all('span', class_='comm_body') for reply in reply_tag: with open('reply.csv', 'a', encoding='utf8') as f: writer_reply = csv.writer(f) print("reply text : -----------" + reply.text) writer_reply.writerow([reply.text]) except AttributeError as e: print("Error row : " + article) # print(reply_tag) chrome.close()
#!/usr/bin/python3
import requests
from bs4 import BeautifulSoup
from selenium.webdriver import Firefox

# Print every anchor tag found on the Moodle course page.
b = Firefox()
b.get("https://service.cloud.teu.ac.jp/moodle/course/view.php?id=7661")
anchors = BeautifulSoup(b.page_source, "html.parser").find_all("a")
print(anchors)
b.close()
button.click() # # Identify all states in the list, read as text using Selenium list_item = browser.find_element_by_class_name('drop-down-list') states = list_item.text # sel_st = raw_input('Type in 2 letter st abbreviation: ') find_st = browser.find_element_by_link_text('CO') find_st.click() sleep(1) list_region = browser.find_element_by_xpath('//*[@id="select-region"]/div[2]') # find and click Choose Region button list_region.click() cur_state = browser.find_element_by_xpath('//*[@id="select-region"]/div[3]') # text_region = cur_state.find_elements_by_tag_name('a') # find all regions by <a> tag l_regions = cur_state.text # creates text file with all the region regions = open('Regions.text', 'w') regions.write(l_regions) print l_regions # Place region points on map gmaps2.gmaps_mapit(l_regions) # url = './mymap.html' # webbrowser.open_new_tab(url) browser.close()
help='Number of adults: default 0', default=0) url, is_round_trip = get_url(argparser.parse_args()) opts = Options() opts.set_headless() driver = Firefox(options=opts) print("Departure Arrival Price Duration ") if is_round_trip: print("-> Return Flight Details") get_flights(driver, url, is_round_trip) driver.close() # -------------------------------------------------------------------------------------------------------------------------- # Source: https://github.com/Shruti-Pattajoshi/Travel-Website-Scraping-for-the-Cheapest-Fares/blob/master/flights_scraping.py from selenium import webdriver from selenium.webdriver.common.by import By from selenium.webdriver.common.keys import Keys import pandas as pd import time import datetime import smtplib from email.mime.multipart import MIMEMultipart
from selenium.webdriver import Firefox
from time import sleep

browser = Firefox()

url = 'http://selenium.dunossauro.live/aula_06_a.html'

browser.get(url)

browser.find_elements_by_css_selector('div.form-group')

# <br> that is a sibling of a div with class form-group
browser.find_elements_by_css_selector(
    'div.form-group + br'
)[1].get_attribute('id')

# From the div with class form-group, take the child with id "dentro-nome".
# BUG FIX: this line previously used the undefined name `b` (NameError);
# the driver in this script is called `browser`.
browser.find_element_by_css_selector('div.form-group > #dentro-nome')

sleep(10)

browser.close()
browser.quit()
def find(self):
    """Scrape yesterday's finished football matches and store teams, scores
    and 1X2 odds in the local SQLite database (table `allResults`).

    The table is rebuilt from scratch on every run.
    """
    # CONNECT THE DATABASE
    connector = sqlite3.connect('games-db')
    cursor = connector.cursor()
    # BUG FIX: a plain DROP TABLE raised sqlite3.OperationalError on a fresh
    # database; IF EXISTS makes the rebuild idempotent.
    cursor.execute('DROP TABLE IF EXISTS allResults')
    cursor.execute('CREATE TABLE allResults(win TEXT,'
                   ' home_team TEXT, away_team TEXT, home_score DECIMAL,'
                   ' away_score DECIMAL, home_odd REAL,'
                   ' draw_odd REAL, away_odd REAL)')
    # SET UP THE DRIVER (headless Firefox; geckodriver path is Windows-specific)
    options = FirefoxOptions()
    options.headless = True
    driver = Firefox(options=options, executable_path='C://Windows/geckodriver.exe')
    driver.get(self.WEB_LINKS["football"])
    time.sleep(1)
    # Switch the live table to the previous day's calendar tab.
    driver.find_element_by_css_selector(
        '#live-table > div.tabs > div.calendar > div:nth-child(1) > div'
    ).click()
    time.sleep(1)
    html = driver.execute_script('return document.documentElement.outerHTML;')
    # CLOSE THE BROWSER
    driver.close()
    # GET THE DATA
    soup = bs4.BeautifulSoup(html, 'html.parser')
    matches = soup.find_all(class_=re.compile('event__match'))
    # Keep only finished games; drop the first two child elements of each row.
    all_games = [list(game)[2:] for game in matches if 'Finished' in str(game)]
    # WORK WITH THE DATA
    # NOTE(review): the [A-z] character class also matches [\]^_` -- left as-is
    # to preserve current matching behavior; consider [A-Za-z0-9].
    for game in all_games:
        items = {
            "home_team": "",
            "away_team": "",
            "home_score": "",
            "away_score": "",
            "home_odd": "",
            "draw_odd": "",
            "away_odd": ""
        }
        for element in game:
            if 'participant--home' in str(element):
                pattern = r'(\"\>([A-z0-9]+.+)\<[s][v][g][ ]|\"\>[A-z0-9].+\<\/[d][i])'
                home_team = re.search(pattern, str(element))
                home_team_token = home_team.group(1)[2:].split('<')
                items["home_team"] = home_team_token[0]
            elif 'participant--away' in str(element):
                pattern = r'(\"\>([A-z0-9]+.+)\<[s][v][g][ ]|\"\>[A-z0-9].+\<\/[d][i])'
                away_team = re.search(pattern, str(element))
                team_away_token = away_team.group(1)[2:].split('<')
                items["away_team"] = team_away_token[0]
            elif 'event__scores' in str(element):
                pattern = r'[n]\>(\d+)\<\/'
                tokens = re.findall(pattern, str(element))
                items["home_score"] = int(tokens[0])
                items["away_score"] = int(tokens[1])
            elif 'o_1' in str(element):
                pattern = r'\"\>(\d{1,2}\.\d{2})\<\/[s]'
                try:
                    home_odd = re.search(pattern, str(element))
                    items["home_odd"] = home_odd.group(1)
                except AttributeError:
                    # Odd not offered for this game; store the neutral 1.00.
                    items["home_odd"] = "1.00"
            elif 'o_0' in str(element):
                pattern = r'\"\>(\d{1,2}\.\d{2})\<\/[s]'
                try:
                    draw_odd = re.search(pattern, str(element))
                    items["draw_odd"] = draw_odd.group(1)
                except AttributeError:
                    items["draw_odd"] = "1.00"
            elif 'o_2' in str(element):
                pattern = r'\"\>(\d{1,2}\.\d{2})\<\/[s]'
                try:
                    away_odd = re.search(pattern, str(element))
                    items["away_odd"] = away_odd.group(1)
                except AttributeError:
                    items["away_odd"] = "1.00"
        # INSERT THE DATA INTO THE DATABASE (the `win` column stays NULL here).
        cursor.execute(
            'INSERT INTO allResults(home_team, away_team, home_score,'
            ' away_score, home_odd, draw_odd, away_odd) VALUES'
            ' (?, ?, ?, ?, ?, ?, ?)',
            (items["home_team"], items["away_team"], items["home_score"],
             items["away_score"], items["home_odd"], items["draw_odd"],
             items["away_odd"]))
    connector.commit()
    connector.close()
class BandLeader():
    """Headless-Firefox Bandcamp player that logs played tracks to a CSV file.

    A daemon thread periodically snapshots the currently playing track into
    an in-memory list of TrackRec records, persisted at `csvpath`.
    """

    def __init__(self, csvpath=None):
        # Database states
        self.database_path = csvpath        # CSV file backing the play history
        self.database = []                  # list of TrackRec entries
        self._current_track_record = None   # TrackRec for the track now playing

        # Load database from disk if possible
        # NOTE(review): isfile(None) raises TypeError when csvpath is omitted.
        if isfile(self.database_path):
            with open(self.database_path, newline='') as dbfile:
                dbreader = csv.reader(dbfile)
                next(dbreader)  # To ignore the header line
                self.database = [TrackRec._make(rec) for rec in dbreader]

        # Create a headless browser
        opts = Options()
        opts.headless = True
        self.browser = Firefox(options=opts)
        self.browser.get(BANDCAMP_FRONTPAGE)

        # Track list related state
        self._current_track_number = 1
        self.track_list = []
        self.tracks()

        # Database maintenance thread
        self.thread = Thread(target=self._maintain)
        self.thread.daemon = True  # Kills the thread when the main process dies
        self.thread.start()

    def save_db(self):
        # Rewrite the whole CSV: header row, then one row per TrackRec.
        with open(self.database_path, 'w', newline='') as dbfile:
            dbwriter = csv.writer(dbfile)
            dbwriter.writerow(list(TrackRec._fields))
            for entry in self.database:
                dbwriter.writerow(list(entry))

    def _maintain(self):
        # Daemon loop: persist play history while the program runs.
        while True:
            self._update_db()
            sleep(20)  # Check every 20 seconds

    def _update_db(self):
        # Append the current track once per distinct playing track.
        try:
            check = (self._current_track_record is not None
                     and (len(self.database) == 0
                          or self.database[-1] != self._current_track_record)
                     and self.is_playing())
            if check:
                self.database.append(self._current_track_record)
                self.save_db()
        except Exception as e:
            print('error while updating the db: {}'.format(e))

    def tracks(self):
        '''
        Query the page to populate a list of available tracks
        '''
        # Sleep to give the browser time to render and finish any animations
        sleep(1)
        # Get the container for the visible track list
        discover_section = self.browser.find_elements_by_class_name(
            'discover-results')
        left_x = discover_section[0].location['x']
        right_x = left_x + discover_section[0].size['width']
        # Filter the items in the list to include only those we can click
        # (i.e. those horizontally inside the visible results container).
        discover_items = self.browser.find_elements_by_class_name(
            'discover-item')
        self.track_list = [
            t for t in discover_items
            if t.location['x'] >= left_x and t.location['x'] < right_x
        ]
        # Print the available tracks to the screen
        for (i, track) in enumerate(self.track_list):
            print('[{}]'.format(i + 1))
            lines = track.text.split('\n')
            print('Album : {}'.format(lines[0]))
            print('Artist : {}'.format(lines[1]))
            if len(lines) > 2:
                print('Genre : {}'.format(lines[2]))

    def catalogue_pages(self):
        '''
        Print the available pages in the catalogue that are presently
        accessible
        '''
        print('PAGES')
        for e in self.browser.find_elements_by_class_name('item-page'):
            print(e.text)
        print('')

    def more_tracks(self, page='next'):
        '''
        Advance the catalogue and repopulates the track list. We can pass in a
        number to advance any of the available pages
        '''
        next_btn = [
            e for e in self.browser.find_elements_by_class_name('item-page')
            if e.text.lower().strip() == str(page)
        ]
        if next_btn:
            next_btn[0].click()
            self.tracks()

    def play(self, track=None):
        '''
        Play a track. If no track number is supplied, the presently selected
        track will play
        '''
        if track is None:
            # Toggles play/pause on the current track.
            self.browser.find_elements_by_class_name('playbutton')[0].click()
        elif type(track) is int and track <= len(
                self.track_list) and track >= 1:
            self._current_track_number = track
            self.track_list[self._current_track_number - 1].click()
        sleep(0.5)
        if self.is_playing():
            self._current_track_record = self.currently_playing()

    def play_next(self):
        '''
        Plays the next available track
        '''
        if self._current_track_number < len(self.track_list):
            self.play(self._current_track_number + 1)
        else:
            # End of the visible list: advance the catalogue page first.
            self.more_tracks()
            self.play(1)

    def pause(self):
        '''
        Pauses the playback
        '''
        self.play()  # play() with no argument toggles play/pause

    def is_playing(self):
        '''
        Returns `True` if a track is presently playing
        '''
        playbtn = self.browser.find_elements_by_class_name('playbutton')
        return playbtn[0].get_attribute('class').find('playing') > -1

    def currently_playing(self):
        '''
        Returns the record for the currently playing track or None if nothing
        is playing
        '''
        try:
            if self.is_playing():
                title = self.browser.find_elements_by_class_name(
                    'title')[0].text
                album_detail = self.browser.find_elements_by_css_selector(
                    '.detail-album > a')
                album_title = album_detail[0].text
                # Strip query strings so URLs are stable database keys.
                album_url = album_detail[0].get_attribute('href').split('?')[0]
                artist_detail = self.browser.find_elements_by_css_selector(
                    '.detail-artist > a')
                artist = artist_detail[0].text
                artist_url = artist_detail[0].get_attribute('href').split(
                    '?')[0]
                return TrackRec(title, artist, artist_url, album_title,
                                album_url, ctime())
        except Exception as e:
            print('There was an error: {}'.format(e))
        return None

    def quit(self):
        '''
        Quit out of Selenium and close browser
        '''
        self.browser.close()
        # NOTE(review): this calls the builtin quit() and exits the
        # interpreter -- presumably intended for interactive use; confirm.
        quit()
} for x, y, w, z in zip( d.find_elements_by_css_selector("._3wU53n"), d.find_elements_by_css_selector("._2rQ-NK"), d.find_elements_by_xpath("//div[@class='_3BTv9X']/img"), d.find_elements_by_xpath("//a[@class='_31qSD5']")) if containsall(x.text.lower(), pn.lower().split()) }) try: d.execute_script('''window.open("{}","_blank");'''.format( d.find_element_by_xpath( "//span[contains(text(),'Next')]/parent::a[@class='_3fVaIS']"). get_attribute("href"))) sleep(3) d.close() d.switch_to.window(d.window_handles[0]) sleep(3) except: break d.close() try: os.remove(w + r"\Database\flipkart.json") except: pass with open(w + r"\Database\flipkart.json", "w") as p: json.dump( { a: b for i in m for a, b in i.items() if b["Price"] == min([j[n]["Price"] for j in m for n in j])
class WeixinSelenium(Base):
    """Crawl Sogou's WeChat search with Firefox and store article URLs in MongoDB.

    Depends on module-level config/helpers defined elsewhere in the file:
    START_PAGE, END_PAGE, REFER_FIRST, HOST/PORT/DB/COLLECTION, storage_word,
    in_collection, in_client and the Article extractor.
    """

    def __init__(self):
        self.start_page = START_PAGE
        self.end_page = END_PAGE
        self.weixin_url = REFER_FIRST
        self.driver = Firefox()
        self.client = MongoClient(HOST, PORT)
        self.collection = self.client[DB][COLLECTION]
        # uids already stored, used to de-duplicate new results.
        self.all_uids = self.uids

    def open_weixin_browser(self, word):
        # Search `word`; return True when the search failed (browser is closed).
        try:
            self.driver.get(self.weixin_url)
            self.driver.set_page_load_timeout(3)
            self.driver.find_element_by_id('upquery').send_keys(word)
            self.driver.find_element_by_class_name('swz').click()
            self.driver.implicitly_wait(3)
            urls_uids = self.extract_urls_uids(word=word)
            Article(urls_uids=urls_uids, word=word).extract()
        except Exception as e:
            # Record the word (page 0) so the crawl can be resumed later.
            storage_word.append([word, 0])
            self.logger.info('Open weixin error: type <{}>, mag <{}>'.format(
                e.__class__, e))
            self.close_browser()
            return True
        return False

    def get_total_pages_to_word(self):
        # Read the pager: returns the last page number seen before a
        # non-numeric token, 1 when all tokens are digits, or None (implicit)
        # when the pager element is missing.
        pages = []
        page_id_css = 'pagebar_container'
        try:
            e = self.driver.find_element_by_id(page_id_css)
            for _p in e.text.split():
                _p = _p.strip()
                if not _p.isdigit():
                    return pages[-1]
                else:
                    pages.append(int(_p))
            return 1
        except (NoSuchElementException, NoSuchWindowException, TypeError,
                IndexError):
            pass

    def get_query_words(self):
        # Collect search words: each doc's 'conp' plus its 'rel' list,
        # de-duplicated while preserving insertion order.
        query_words = []
        for docs in self.collection.find({}, {
                'rel': 1,
                'conp': 1
        }).sort([('_id', 1)]):
            w = docs['conp']
            if w not in query_words:
                query_words.append(w)
            for item in docs['rel']:
                if item not in query_words:
                    query_words.append(item)
        self.client.close()
        return query_words

    @property
    def uids(self):
        # All uids already present in the module-level `in_collection`.
        return {
            docs['uid']
            for docs in in_collection.find({}, {'uid': 1}) if 'uid' in docs
        }

    def extract_urls_uids(self, word):
        # Pair each result link with a uid derived from (timestamp, title, word).
        urls_uids = []
        timestamp = [
            _t.get_attribute('t')
            for _t in self.driver.find_elements_by_css_selector('div.s-p')
        ]
        urls_tits = [(t.get_attribute('href'), self.trim(t.text))
                     for t in self.driver.find_elements_by_css_selector('h4 a')
                     ]
        # Bail out when timestamps and titles don't line up one-to-one.
        if len(urls_tits) != len(timestamp):
            return urls_uids
        for index, url_tit in enumerate(urls_tits):
            try:
                uid = self.md5(timestamp[index] + url_tit[1] + word)
                if uid not in self.all_uids:
                    self.all_uids.add(uid)
                    urls_uids.append({'url': url_tit[0], 'uid': uid})
            except (TypeError, IndexError):
                pass
        return urls_uids

    @staticmethod
    def query_index(words, cut_word):
        # Index of cut_word in words, or 0 when absent (restart from the top).
        try:
            index = words.index(cut_word)
            return index
        except ValueError:
            pass
        return 0

    @property
    def is_forbidden(self):
        # True when Sogou shows its captcha form (crawler has been blocked).
        css_id = 'seccodeForm'
        try:
            if self.driver.find_element_by_id(css_id):
                return True
        except NoSuchElementException:
            pass
        return False

    def appear_element(self, by):
        # Wait up to 20s for element `by` (an id) and click it; False on timeout.
        try:
            # Have `click` function to specified element
            tag = WebDriverWait(
                self.driver, 20).until(lambda driver: driver.find_element_by_id(by))
            tag.click()
            return True
        except (TimeoutException, NoSuchWindowException,
                NoSuchElementException):
            pass
        return False

    def crawl(self, word=None, go=0):
        """Crawl all query words; `word`/`go` resume from a given word and page."""
        is_go = True
        is_break = False
        go_page = int(go)
        next_page_css = 'sogou_page_%s'
        query_words = self.get_query_words()
        ind = self.query_index(query_words, word)
        for index, word in enumerate(query_words[ind:], 1):
            next_ind = ind + index
            is_break = self.open_weixin_browser(word)
            pages = self.get_total_pages_to_word()
            for page in range(self.start_page + 1,
                              (pages or self.end_page) + 1):
                # Skip pages until the resume point is reached.
                if is_go and page < go_page:
                    continue
                else:
                    is_go = False
                if not self.appear_element(by=next_page_css % page):
                    is_break = True
                    msg = '\tNot appear next page element, will break, new open browser!'
                elif self.is_forbidden:
                    is_break = True
                    msg = '\tSpider was forbidden, crawling again after sleeping a moment!'
                if is_break:
                    # Remember where we stopped so the crawl can resume here.
                    storage_word.append([word, page])
                    self.logger.info(msg)
                    break
                urls_uids = self.extract_urls_uids(word=word)
                Article(urls_uids=urls_uids, word=word).extract()
                # self.driver.find_element_by_id(next_page_css % page).click()
                # Longer random pause every 5th page to look less bot-like.
                wt = randint(10, 40) if page % 5 == 0 else randint(5, 18)
                self.logger.info(
                    'Index <{}>, Word <{}>, Page <{}> Done, sleeping {}s!'.
                    format(next_ind, word, page, wt))
                self.driver.implicitly_wait(wt)
            if is_break:
                break
        in_client.close()
        self.close_browser()

    def close_browser(self):
        # Best-effort close: ignore the window already being gone.
        try:
            self.driver.close()
        except (NoSuchWindowException, ):
            pass
def query(phone_number, debug, proxy, tor_proxy, outfile):
    """
    :param phone_number: query this number
    :param debug: don't use headless mode (for debugging)
    :param proxy: use this proxy server ("host:port")
    :param tor_proxy: configure browser for tor proxy (SOCKS only)
    :param outfile: log to this file
    :return: --
    :raises ValueError: when the lookup site has blacklisted this IP
    """
    opts = Options()
    if not debug:
        opts.set_headless()
        assert opts.headless  # Operating in headless mode
    profile = webdriver.FirefoxProfile()
    # set FF preference to socks proxy
    if proxy:
        print('Setting proxy...')
        proxy = proxy.split(':')
        proxy_host = proxy[0]
        proxy_port = int(proxy[1])
        profile.set_preference("network.proxy.type", 1)  # 1 = manual proxy config
        if not tor_proxy:
            profile.set_preference("network.proxy.http", proxy_host)
            profile.set_preference("network.proxy.http_port", proxy_port)
            profile.set_preference('network.proxy.https', proxy_host)
            # BUG FIX: the port was previously written to 'network.proxy.https',
            # overwriting the host just set; it belongs in '...https_port'.
            profile.set_preference('network.proxy.https_port', proxy_port)
            profile.set_preference('network.proxy.ssl', proxy_host)
            profile.set_preference('network.proxy.ssl_port', proxy_port)
        profile.set_preference("network.proxy.socks", proxy_host)
        profile.set_preference("network.proxy.socks_port", proxy_port)
        profile.set_preference("network.proxy.socks_version", 5)
        # Resolve DNS through the proxy too (avoids DNS leaks over tor).
        profile.set_preference('network.proxy_dns', 'true')
        profile.update_preferences()
    browser = Firefox(options=opts, firefox_profile=profile)
    get_url = 'https://www.cid.ninja/phone-numbers/?query=#' + str(
        phone_number)
    browser.get(get_url)
    title = browser.title
    # The site redirects to its home page when the IP has hit its lookup limit.
    if title == 'Home - CID Ninja':
        print(Fore.RED +
              'Maximum lookups for this IP reached, use a new proxy')
        browser.close()
        raise ValueError('IP Blacklist')
    # Scrape each detail field and log it via printlog (stdout + outfile).
    phone_number = browser.find_element_by_id('details-phone-number').text
    printlog('Phone Number: ' + phone_number, outfile)
    details_location = browser.find_element_by_id('details-location').text
    printlog('Location: ' + details_location, outfile)
    cid_name = browser.find_element_by_id('details-cnam').text
    printlog('CID Name: ' + cid_name, outfile)
    carrier_name = browser.find_element_by_id('details-carrier').text
    printlog('Carrier Name: ' + carrier_name, outfile)
    details_sms = browser.find_element_by_id('details-sms').text
    printlog('SMS Email: ' + details_sms, outfile)
    details_old_carrier = browser.find_element_by_id('details-carrier-o').text
    printlog('Old Carrier:' + details_old_carrier, outfile)
    details_mms = browser.find_element_by_id('details-mms').text
    printlog('MMS Email: ' + details_mms, outfile)
    details_tel_num = browser.find_element_by_id('details-tel-num').text
    printlog('Carrier Help Line: ' + details_tel_num, outfile)
    details_slogan = browser.find_element_by_id('details-slogan').text
    printlog('Carrier Slogan: ' + details_slogan, outfile)
    browser.close()
def run_get_courses():
    """Scrape all courses with Firefox, patch known-bad data, enrich and pickle them."""
    # run browser -- retry until Firefox actually starts
    # NOTE(review): bare except retries forever on a persistent startup failure.
    started = False
    while not started:
        try:
            browser = Firefox()
            started = True
        except:
            pass
    browser.set_page_load_timeout(8)
    browser.set_script_timeout(8)
    # get courses' numbers
    courses_ex = get_all_existing_courses(browser)
    with open('courses_ex.pickle', 'wb') as file:
        pickle.dump(courses_ex, file)
    courses = get_all_courses(browser, courses_ex)
    with open('courses1.pickle', 'wb') as file:
        pickle.dump(courses, file)
    browser.close()
    # with open('courses1.pickle', 'rb') as file:
    #     courses = pickle.load(file)
    print('{} courses loaded'.format(len(courses)))
    with open('faculties.pickle', 'rb') as file:
        faculties = pickle.load(file)
    # Manual override: prerequisite combinations for course 104223, which the
    # scraper does not parse correctly from the site.
    courses[104223].requires = [[104013, 104016, 104131],
                                [104014, 104016, 104131],
                                [104020, 104016, 104131],
                                [104022, 104016, 104131],
                                [104281, 104016, 104131],
                                [104013, 104171, 104131],
                                [104014, 104171, 104131],
                                [104020, 104171, 104131],
                                [104022, 104171, 104131],
                                [104281, 104171, 104131],
                                [104013, 104016, 104135],
                                [104014, 104016, 104135],
                                [104020, 104016, 104135],
                                [104022, 104016, 104135],
                                [104281, 104016, 104135],
                                [104013, 104171, 104135],
                                [104014, 104171, 104135],
                                [104020, 104171, 104135],
                                [104022, 104171, 104135],
                                [104281, 104171, 104135],
                                [104013, 104016, 104285],
                                [104014, 104016, 104285],
                                [104020, 104016, 104285],
                                [104022, 104016, 104285],
                                [104281, 104016, 104285],
                                [104013, 104171, 104285],
                                [104014, 104171, 104285],
                                [104020, 104171, 104285],
                                [104022, 104171, 104285],
                                [104281, 104171, 104285]]
    # Post-processing passes: requirement graphs, faculties, requirement depth.
    courses = course_class.fill_required(courses)
    courses = course_class.fill_all_required(courses)
    courses = course_class.fill_faculties(courses, faculties)
    courses = course_class.fill_requirement_depth(courses)
    with open('courses.pickle', 'wb') as file:
        pickle.dump(courses, file)
    print('Processing finished')
class BaseTestCase(unittest.TestCase):
    """Shared fixture: one Firefox instance per test."""

    def setUp(self):
        self.browser = Firefox()

    def tearDown(self):
        # BUG FIX: quit() instead of close() -- close() only closes the current
        # window, leaving the geckodriver process and temp profile behind
        # after every test.
        self.browser.quit()
def seltabup(dirc, uname, destination):
    """Upload every .zip in `dirc` as an Earth Engine table asset under `destination`.

    Signs in to code.earthengine.google.com with headless Firefox to harvest
    session cookies, posts each zip to the upload endpoint, then ingests it
    via the `earthengine upload table` CLI.

    NOTE(review): `uname` is immediately overwritten from the module-level
    `username`/`password` globals -- confirm whether the parameter is still
    meant to be used.
    """
    ee.Initialize()
    options = Options()
    options.add_argument('-headless')
    authorization_url = "https://code.earthengine.google.com"
    uname = str(username)
    passw = str(password)
    # Pick the geckodriver binary matching the host OS.
    if os.name == "nt":
        driver = Firefox(executable_path=os.path.join(lp, "geckodriver.exe"),
                         firefox_options=options)
    elif os.name == "posix":
        driver = Firefox(executable_path=os.path.join(lp, "geckodriver"),
                         firefox_options=options)
    driver.get(authorization_url)
    time.sleep(5)
    # BUG FIX: this local was previously named `username`, which made
    # `uname = str(username)` above raise UnboundLocalError (the assignment
    # here turned `username` into a local for the whole function).
    username_field = driver.find_element_by_xpath('//*[@id="identifierId"]')
    username_field.send_keys(uname)
    driver.find_element_by_id("identifierNext").click()
    time.sleep(5)
    #print('username')
    passw = driver.find_element_by_name("password").send_keys(passw)
    driver.find_element_by_id("passwordNext").click()
    time.sleep(5)
    #print('password')
    # Approve the OAuth consent screen when it appears (best-effort).
    try:
        driver.find_element_by_xpath(
            "//div[@id='view_container']/form/div[2]/div/div/div/ul/li/div/div[2]/p"
        ).click()
        time.sleep(5)
        driver.find_element_by_xpath(
            "//div[@id='submit_approve_access']/content/span").click()
        time.sleep(5)
    except Exception as e:
        pass
    # Copy the browser session cookies into a requests session.
    cookies = driver.get_cookies()
    s = requests.Session()
    for cookie in cookies:
        s.cookies.set(cookie['name'], cookie['value'])
    driver.close()
    try:
        i = 1
        path, dirs, files = next(os.walk(dirc))
        file_count = len(files)
        #print(file_count)
        for item in os.listdir(dirc):
            if item.endswith('.zip'):
                # Ask EE for a one-shot upload URL, then post the zip to it.
                r = s.get(
                    "https://code.earthengine.google.com/assets/upload/geturl")
                d = ast.literal_eval(r.text)
                upload_url = d['url']
                file_path = os.path.join(dirc, item)
                with open(file_path, 'rb') as f:
                    upload_url = d['url']
                    files = {'file': f}
                    resp = s.post(upload_url, files=files)
                    gsid = resp.json()[0]
                    asset_full_path = destination + '/' + item.split('.')[0]
                    #print(asset_full_path)
                    # Hand the uploaded blob (gsid) to the earthengine CLI.
                    output = subprocess.check_output(
                        'earthengine upload table --asset_id ' +
                        str(asset_full_path) + ' ' + str(gsid),
                        shell=True)
                    print('Ingesting ' + str(i) + ' of ' + str(file_count) +
                          ' ' + str(os.path.basename(asset_full_path)) +
                          ' task ID: ' + str(output).strip())
                    i = i + 1
    except Exception as e:
        print(e)
# Which table we are processing title = table.find_previous_sibling('h3') if title: title = title.string else: title = table.parent.find_previous_sibling('h3') if title: title = title.string else: title = 'No inmediate name' df['region'] = pd.Series(data=[current_region] * len(df.index)) df['description'] = pd.Series(data=[title] * len(df.index)) # Decide in which list to put the extracted table if 'SLES Premium' in df.columns: suse_list.append(df) else: pricing_list.append(df) print('{0}: {1}'.format(title, df.shape)) if not os.path.exists('./azure/data'): os.mkdir('./azure/data') save_df_list(suse_list, './azure/data/azure_pricing_vm_suse.csv') save_df_list(pricing_list, './azure/data/azure_pricing_vm_common.csv') driver.close()
class Site:
    """Base class for scraper sites: loads credentials, starts Firefox, logs in.

    Subclasses must define LOGIN_PAGE and the LOGIN_*_SELECTOR xpaths, and may
    override _parse_configuration().
    """

    def __init__(self, args):
        self.args = args
        self.site_name = type(self).__name__
        # Colorize the displayed name only when attached to a terminal.
        self.site_displayname = BashColor.HEADER + BashColor.BOLD + self.site_name + BashColor.END \
            if sys.stdout.isatty() else self.site_name
        self.config = ConfigParser()
        # Defaults shipped with the repo, then user overrides on top.
        self.__read_config_file('credentials.cfg.orig')
        self.__read_config_file('credentials.cfg')
        self._parse_credentials()
        self._parse_configuration()
        self._init_browser()

    def __read_config_file(self, filename):
        # Config files live next to the package root (one dir above this file).
        self.config.read(os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir, filename)))

    def _parse_credentials(self):
        # Environment variables (SITENAME_USERNAME/_PASSWORD) win over config files.
        if os.environ.get(self.site_name.upper() + '_USERNAME'):
            self.USERNAME = os.environ.get(self.site_name.upper() + '_USERNAME')
        else:
            self.USERNAME = self.config[self.site_name]['USERNAME']
        if os.environ.get(self.site_name.upper() + '_PASSWORD'):
            self.PASSWORD = os.environ.get(self.site_name.upper() + '_PASSWORD')
        else:
            self.PASSWORD = self.config[self.site_name]['PASSWORD']

    def _parse_configuration(self):
        # this method should be overwritten by a site, if there are more configs to parse than just the credentials
        pass

    def _init_browser(self):
        # Run inside a virtual display (Xvfb) unless the user asked to see the browser.
        if self.args and not self.args.show_browser:
            self.display = Xvfb()
            self.display.start()
        # Auto-download CSV/zip exports to EXPORTS_FOLDER without prompts.
        profile = FirefoxProfile()
        profile.set_preference("browser.download.folderList", 2)
        profile.set_preference("browser.download.manager.showWhenStarting", False)
        profile.set_preference("browser.download.dir", EXPORTS_FOLDER)
        profile.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/csv, application/zip")
        profile.set_preference("browser.helperApps.alwaysAsk.force", False)
        profile.set_preference("devtools.jsonview.enabled", False)
        profile.set_preference("media.volume_scale", "0.0")
        # https://github.com/mozilla/geckodriver/issues/858#issuecomment-322512336
        profile.set_preference("dom.file.createInChild", True)
        self.browser = Firefox(firefox_profile=profile)
        # http://stackoverflow.com/questions/42754877/cant-upload-file-using-selenium-with-python-post-post-session-b90ee4c1-ef51-4  # pylint: disable=line-too-long
        self.browser._is_remote = False  # pylint: disable=protected-access
        self.login()
        time.sleep(1)
        self._check_login_successful()

    def login(self):
        sys.stdout.write('===== ' + self.site_displayname + ': performing login')
        sys.stdout.flush()
        self.browser.get(self.LOGIN_PAGE)
        time.sleep(1)
        try:
            self._insert_login_credentials()
            self._click_login_button()
        except NoSuchElementException:
            time.sleep(2)  # wait for page to load and try again
            self._insert_login_credentials()
            self._click_login_button()

    def _check_login_successful(self):
        # If the login form is still fully present, the login failed.
        if len(self.browser.find_elements_by_xpath(self.LOGIN_BUTTON_SELECTOR)) > 0 \
                and len(self.browser.find_elements_by_xpath(self.LOGIN_USERNAME_SELECTOR)) > 0 \
                and len(self.browser.find_elements_by_xpath(self.LOGIN_PASSWORD_SELECTOR)) > 0:
            command_line.error("Login to %s failed." % self.site_name)
            sys.stdout.write("Please check if the credentials are correctly set in your credentials.cfg\r\n")
            sys.stdout.flush()
            self.kill_browser()
            sys.exit(1)

    def _insert_login_credentials(self):
        login_field_user = self.browser.find_element_by_xpath(self.LOGIN_USERNAME_SELECTOR)
        login_field_user.send_keys(self.USERNAME)
        login_field_password = self.browser.find_element_by_xpath(self.LOGIN_PASSWORD_SELECTOR)
        login_field_password.send_keys(self.PASSWORD)

    def _click_login_button(self):
        login_button = self.browser.find_element_by_xpath(self.LOGIN_BUTTON_SELECTOR)
        login_button.click()
        time.sleep(2)  # wait for page to load

    def kill_browser(self):
        # Full teardown: client, window, driver process, then the virtual display.
        self.browser.stop_client()
        self.browser.close()
        try:
            self.browser.quit()
        except WebDriverException:
            pass
        if self.args and not self.args.show_browser:
            self.display.stop()

    def get_json_from_html(self):
        # Firefox wraps raw JSON responses in a <pre> tag; parse its text.
        response = self.browser.find_element_by_tag_name("pre").text.strip()
        return json.loads(response)
def _mirror_repo_group(session, repos, group, destination):
    """Download and unpack the master-branch tarball of every repo in `repos`.

    Archives land in `<destination>/<group>_<YYYY-MM-DD>/`; each tarball is
    unpacked next to itself and then deleted. Existing files are skipped.
    """
    if len(repos) == 0:
        return
    base_path = os.path.join(
        destination, group + "_" + str(pendulum.now()).split("T")[0]
    )
    if not os.path.exists(base_path):
        os.makedirs(base_path)
    for repo_url in repos:
        r = session.get(repo_url + "/+archive/refs/heads/master.tar.gz")
        if r.status_code != 200:
            # BUG FIX: was `"Failed with " + r.status_code`, which raised
            # TypeError (str + int) instead of reporting the HTTP status.
            sys.exit("Failed with " + str(r.status_code))
        filename = (
            r.headers["Content-Disposition"].split("filename=")[1].split("/")[0]
        )
        local_path = os.path.join(base_path, filename + ".tar.gz")
        if os.path.exists(local_path):
            print("File already exists: " + str(local_path))
            continue
        try:
            print("Downloading to: " + str(local_path))
            with open(local_path, "wb") as f:
                for chunk in r.iter_content(chunk_size=512 * 1024):
                    if chunk:
                        f.write(chunk)
            shutil.unpack_archive(local_path, local_path.replace("-refs.tar.gz", ""))
            os.remove(local_path)
        except Exception as e:
            print(e)


def eereposnap(destination, mode):
    """Snapshot all Earth Engine git repos the user can access into `destination`.

    Logs in to code.earthengine.google.com with Firefox, reads the repo lists
    (owner/reader/writer) from the page's initial data, then mirrors each
    group via googlesource.com using the harvested session cookies.

    mode: "active" runs a visible browser; None runs headless.
    Appends to the module-level `owner`, `reader` and `writer` lists.
    """
    import getpass  # local import: only needed for the interactive login

    options = Options()
    if mode == "active":
        print("Trying this in live browser")
    elif mode is None:
        options.add_argument("-headless")
    authorization_url = "https://code.earthengine.google.com/"
    # NOTE(review): the credential prompts below were reconstructed from a
    # redaction-corrupted source; the try/except keeps Python 2 compatibility.
    try:
        uname = str(raw_input("Enter your Username: "))
    except NameError:
        uname = str(input("Enter your Username: "))
    passw = str(getpass.getpass("Enter your password: "))
    if os.name == "nt":
        driver = Firefox(
            executable_path=os.path.join(lp, "geckodriver.exe"), options=options
        )
    else:
        driver = Firefox(
            executable_path=os.path.join(lp, "geckodriver"), options=options
        )
    driver.get(authorization_url)
    username = driver.find_element_by_xpath('//*[@id="identifierId"]')
    username.send_keys(uname)
    driver.find_element_by_id("identifierNext").click()
    time.sleep(5)
    passw = driver.find_element_by_name("password").send_keys(passw)
    driver.find_element_by_id("passwordNext").click()
    time.sleep(5)
    # Approve the consent screen when it appears (best-effort).
    try:
        driver.find_element_by_xpath(
            "//div[@id='view_container']/form/div[2]/div/div/div/ul/li/div/div[2]/p"
        ).click()
        time.sleep(5)
        driver.find_element_by_xpath(
            "//div[@id='submit_approve_access']/content/span"
        ).click()
        time.sleep(10)
    except Exception as e:
        pass
    # The repo lists are embedded in the page as window._ee_flag_initialData.
    source = driver.page_source
    soup = BeautifulSoup(source, "lxml")
    source = soup.find("script", text=re.compile("window._ee_flag_initialData"))
    try:
        json_data = json.loads(
            source.string.replace("window._ee_flag_initialData = ", "")
            .replace(";", "")
            .strip()
        )
        for items in json_data["preferences"]["FAST_REPO_LISTS"]:
            if items["access"] == "owner":
                owner.append("https://earthengine.googlesource.com/" + items["name"])
            if items["access"] == "reader":
                reader.append("https://earthengine.googlesource.com/" + items["name"])
            if items["access"] == "writer":
                writer.append("https://earthengine.googlesource.com/" + items["name"])
    except Exception as e:
        print(e)
    # Sign in to googlesource.com to obtain cookies valid for the archives.
    driver.get("https://earthengine.googlesource.com/")
    driver.find_element_by_link_text("Sign in").click()
    time.sleep(3)
    driver.find_element_by_xpath(
        "//div[@id='view_container']/div/div/div[2]/div/div/div/form/span/section/div/div/div/div/ul/li/div/div/div/div[2]/div"
    ).click()
    time.sleep(10)
    cookies = driver.get_cookies()
    session = requests.Session()
    for cookie in cookies:
        session.cookies.set(cookie["name"], cookie["value"])
    driver.close()
    # The three groups previously shared ~60 duplicated lines; now one helper.
    _mirror_repo_group(session, writer, "writer", destination)
    _mirror_repo_group(session, reader, "reader", destination)
    _mirror_repo_group(session, owner, "owner", destination)
# --- SteamAnalyst price-table scraper (interactive) -------------------------
# Dumps the "pricelist" table of csgo.steamanalyst.com, page by page, as an
# HTML <table> into OUTFILE.  The operator presses Enter between pages.

def click_on(id):
    """Wait up to 10s for the element with the given DOM id, then click it."""
    WebDriverWait(ff, 10).until(lambda x: ff.find_element_by_id(id))
    ff.find_element_by_id(id).click()


def get_price_list():
    """Append the inner HTML of the current price-list <tbody> to `out`."""
    global out
    price_list = ff.find_element_by_id("pricelist")
    tbody = price_list.find_elements_by_xpath("./tbody")[0]
    # `out` is a text-mode file: write the str directly.  The original
    # .encode("utf-8") produced bytes, which raises TypeError on Python 3.
    out.write(tbody.get_property("innerHTML"))


URL = "https://csgo.steamanalyst.com/list"
URL2 = "https://dota2.steamanalyst.com/list"
OUTFILE = "/tmp/table.xls"

out = open(OUTFILE, "w", encoding="utf-8")
ff = Firefox()
# ff = Firefox(capabilities={"marionette":True})
try:
    out.write("<table>")
    ff.get(URL)
    # range/input are the Python 3 spellings of the original xrange/raw_input;
    # the rest of this file already uses Python 3 syntax.
    for _ in range(10):
        input("Press Enter to continue")
        get_price_list()
        ff.find_element_by_id("pricelist_next").find_elements_by_tag_name("a")[0].click()
    out.write("</table>")
finally:
    # Always release the file handle and the browser, even if a page fails.
    out.close()
    ff.close()
class Browser:
    """Thin convenience wrapper around a Firefox WebDriver: navigation,
    cookie persistence via pickle, element lookup with explicit waits,
    typing/clicking on the last-found element, and screenshots."""

    # Maximum seconds to wait in find() for an element to become clickable.
    max_wait = 10

    def __init__(self, name, headless=False):
        # `name` doubles as the cookie-file prefix ('<name>.pkl').
        self.name = name
        self.headless = headless
        self.username = None
        self.start()

    def start(self):
        """Launch Firefox (optionally headless) and reset element state."""
        self.log('starting')
        options = Options()
        if self.headless:
            options.add_argument('--headless')
        self.driver = Firefox(options=options)
        # `elem` holds the most recently found element (see find()).
        self.elem = None
        self.log('started')

    def get(self, url):
        self.driver.get(url)

    def maximize(self):
        self.driver.maximize_window()
        self.log('maximize')

    def js(self, js):
        """Execute a JavaScript snippet and log its return value."""
        out = self.driver.execute_script(js)
        self.log('js', out=out)

    def bottom(self):
        """Scroll to the bottom of the page."""
        self.js('window.scrollTo(0, document.body.scrollHeight);')

    def size(self, width=800, height=600):
        self.driver.set_window_size(width, height)
        self.log(f'width: {width}, height: {height}')

    def user(self):
        # NOTE(review): this region appears credential-scrubbed ('******'
        # placeholders).  It originally most likely prompted for a username
        # and password and defined a save_cookies() method that pickled
        # driver cookies to '<name>.pkl'.  Restore before use — as written
        # this line is not valid Python.
        self.username = input('username: '******'password: '******'{self.name}.pkl', 'wb') as f: pickle.dump(cookies, f) self.log('save loaded')

    def load_cookies(self):
        """Load pickled cookies from '<name>.pkl' into the current session."""
        with open(f'{self.name}.pkl', 'rb') as f:
            cookies = pickle.load(f)
        for cookie in cookies:
            self.driver.add_cookie(cookie)
        self.log('cookies loaded')

    def log(self, message, **kwargs):
        print(f'browser: {message}', kwargs)

    def html(self):
        """Log the current page source."""
        html = self.driver.page_source
        self.log(html)

    def done(self):
        """Drop references to credentials/elements and close the window."""
        self.log('closing')
        self.elem = None
        self.username = None
        self.password = None
        self.driver.close()
        self.log('done')

    def pause(self, seconds):
        self.log('sleep', seconds=seconds)
        time.sleep(seconds)

    def find(self, selector):
        """Wait (up to max_wait) for a clickable CSS match; store it in self.elem."""
        self.log('finding', selector=selector)
        wait = WebDriverWait(self.driver, self.max_wait)
        self.elem = wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, selector)))
        self.log('found', elem=self.elem)

    def type(self, value):
        """Send keys to the last-found element; never log the password itself."""
        self.elem.send_keys(value)
        if value == self.password:
            self.log('type password')
        else:
            self.log(f'type: {value}')

    def click(self):
        self.elem.click()
        self.log('click')

    def enter(self):
        self.type(Keys.ENTER)

    def screenshot(self, name, show=False):
        """Save a PNG screenshot of the last-found element as ./<name>.png."""
        image = Image.open(BytesIO(self.elem.screenshot_as_png))
        fname = f'./{name}.png'
        image.save(fname)
        self.log(fname)
        if show:
            image.show()
class Scraper:
    """ Scrapping instance, scrapes all Orders in the given year range and outputs it into FILE_NAME """

    def __init__(
            self,
            email: str,
            password: Optional[str],
            headless: bool,
            start: int,
            end: int,
            extensive: bool,
            progress_observer_callback: Callable[[float], None] = None) -> None:
        # Basic argument sanity checks (note: asserts are stripped under -O).
        assert email, "no E-Mail provided"
        assert '@' in email and '.' in email, "incorrect email layout"  # Todo replace by regex
        assert start <= end, "start year must be before end year"
        assert end >= 2010, "Amazon order history works only for years after 2009"
        assert end <= datetime.datetime.now(
        ).year, "End year can not be in the future"
        self.logger = logging.getLogger(__name__)
        self.progress_observer_callback: Callable[
            [float], None] = progress_observer_callback
        self.email = email
        # Fall back to the password file when no password was passed in.
        self.password = password if password else file_handler.load_password()
        if not self.password:
            self.logger.error(
                colored("Password not given nor pw.txt found", 'red'))
            raise PasswordFileNotFound
        self.start_date: datetime.date = datetime.date(year=start,
                                                       month=1,
                                                       day=1)
        # The end date is "today" when scraping up to the current year.
        self.end_date: datetime.date = datetime.datetime.now().date() if end == datetime.datetime.now().year \
            else datetime.date(year=end, month=12, day=31)
        # Advanced past start_date when a partial scrape can resume from disk.
        self.start_scraping_date: datetime.date = datetime.date(year=start,
                                                                month=1,
                                                                day=1)
        self.headless = headless
        # extensive=True additionally visits each item page for categories.
        self.extensive = extensive
        self.orders: List[Order] = []
        self.browser: WebDriver
        # Constructing a Scraper performs the whole scrape immediately and
        # writes the result to FILE_NAME before quitting the browser.
        self._setup_scraping()
        self._get_orders()
        file_handler.save_file(
            FILE_NAME, json.dumps([order.to_dict() for order in self.orders]))
        self.browser.quit()

    def _notify_progress_observers(self, progress: float) -> None:
        # Forward progress (0.0 .. 1.0) to the optional observer callback.
        if self.progress_observer_callback:
            self.progress_observer_callback(progress)

    def _setup_scraping(self) -> None:
        """ prepares the WebDriver for scraping the data by:
        - setting up the WebDrive
        - log in the user with the given credentials
        - skipping the adding phone number dialog (should it appear)
        :raise LoginError if not possible to login
        """
        firefox_profile = FirefoxProfile()
        # Disable multi-process tabs — presumably a geckodriver workaround;
        # TODO(review): confirm still needed with current geckodriver.
        firefox_profile.set_preference("browser.tabs.remote.autostart", False)
        firefox_profile.set_preference("browser.tabs.remote.autostart.1",
                                       False)
        firefox_profile.set_preference("browser.tabs.remote.autostart.2",
                                       False)
        opts = Options()
        opts.headless = self.headless
        if opts.headless:
            self.logger.info(colored("Run in headless mode.", 'blue'))
        self.browser = Firefox(options=opts, firefox_profile=firefox_profile)
        self._navigate_to_orders_page()
        self._complete_sign_in_form()
        if not self._signed_in_successful():
            self.logger.error(
                colored(
                    "Couldn't sign in. Maybe your credentials are incorrect?",
                    'red'))
            print(
                colored(
                    "Couldn't sign in. Maybe your credentials are incorrect?",
                    'red'))
            self.browser.quit()
            raise LoginError
        self._skip_adding_phone_number()

    def _navigate_to_orders_page(self) -> None:
        """ navigates to the orders page """
        self.browser.get(
            'https://www.amazon.de/gp/css/order-history?ref_=nav_orders_first')

    def _complete_sign_in_form(self) -> None:
        """ searches for the sign in form enters the credentials and confirms
        if successful amazon redirects the browser to the previous site
        """
        try:
            email_input = self.browser.find_element_by_id('ap_email')
            email_input.send_keys(self.email)
            password_input = self.browser.find_element_by_id('ap_password')
            password_input.send_keys(self.password)
            self.browser.find_element_by_name('rememberMe').click()
            sign_in_input = self.browser.find_element_by_id('signInSubmit')
            sign_in_input.click()
        except NoSuchElementException:
            self.logger.error(
                colored(
                    "Error while trying to sign in, couldn't find all needed form elements",
                    'red'))
            print(
                colored(
                    "Error while trying to sign in, couldn't find all needed form elements",
                    'red'))

    def _signed_in_successful(self) -> bool:
        """ simple check if we are still on the login page """
        return bool(
            self.browser.current_url != "https://www.amazon.de/ap/signin")

    def _skip_adding_phone_number(self) -> None:
        """ find and click the 'skip adding phone number' button if found on the current page """
        try:
            skip_adding_phone_link = self.browser.find_element_by_id(
                'ap-account-fixup-phone-skip-link')
            skip_adding_phone_link.click()
            self.logger.info(colored('skipped adding phone number', 'blue'))
        except NoSuchElementException:
            self.logger.info(
                colored('no need to skip adding phone number', 'blue'))

    def _is_custom_date_range(self) -> bool:
        """ :return: whether a custom user-set range is used instead of the
        maximum range (2010 .. current year) """
        return self.start_date.year != 2010 or self.end_date.year != datetime.datetime.now(
        ).year

    def _are_orders_for_year_available(self) -> bool:
        """ checks if there are any orders in the current selected year
        :return: True if there were orders, False if not
        """
        # German storefront: "keine Bestellungen aufgegeben" = "no orders placed".
        return bool(
            self.browser.page_source.find('keine Bestellungen aufgegeben') ==
            -1)  # No error!

    def _is_next_page_available(self) -> bool:
        """ as long as the next page button exists there is a next page
        :return: True if there is a next page, False if not"""
        pagination_element = self.browser.find_element_by_class_name(
            'a-pagination')
        try:
            # 'Weiter' (German "next") inside a disabled element => last page.
            return 'Weiter' not in pagination_element.find_element_by_class_name(
                'a-disabled').text
        except NoSuchElementException:
            return True

    @staticmethod
    def _is_digital_order(order_id: str) -> bool:
        """ checks if the order is digital (e.g. Amazon Video or Audio Book)
        :param order_id: the id of the order to check
        :return: True if order is digital, False if not
        """
        return order_id[:3] == 'D01'

    def _is_paging_menu_available(self) -> bool:
        """ :returns: whether there are multiple pages for the current year
        by searching for a paging menu """
        try:
            return self.browser.find_element_by_class_name(
                'a-pagination') is not None
        except NoSuchElementException:
            return False

    def _get_orders(self) -> None:
        """ get a list of all orders in the given range (start and end year inclusive)
        to save network capacities it is checked if some orders got already fetched earlier in 'orders.json'
        """
        if self._is_custom_date_range():
            # Cached orders may not match a custom range; start from scratch.
            file_handler.remove_file(FILE_NAME)
        else:
            self.orders = file_handler.load_orders(FILE_NAME)
        if self.orders:
            self._scrape_partial()
        else:
            self._scrape_complete()
        self.orders = sorted(self.orders, key=lambda order: order.date)

    def _get_order_info(
            self, order_info_element: WebElement
    ) -> Tuple[str, float, datetime.date]:
        """ :param order_info_element: the order-info div of one order
        :returns: the OrderID, price and date
        """
        order_info_list: List[str] = [
            info_field.text for info_field in
            order_info_element.find_elements_by_class_name('value')
        ]
        # value tags have only generic class names so a constant order in form of:
        # [date, price, recipient_address, order_id] or if no recipient_address is available
        # [date, recipient_address, order_id]
        # is assumed
        if len(order_info_list) < 4:
            order_id = order_info_list[2]
        else:
            order_id = order_info_list[3]
        # price is usually formatted as 'EUR x,xx' but special cases as 'Audible Guthaben' are possible as well
        order_price_str = order_info_list[1]
        if order_price_str.find('EUR') != -1:
            order_price = self._price_str_to_float(order_price_str)
        else:
            order_price = 0
        date_str = order_info_list[0]
        date = ut.str_to_date(date_str)
        return order_id, order_price, date

    def _scrape_complete(self) -> None:
        """ scrapes all the data without checking for duplicates (when some orders already exist) """
        self.orders = self._scrape_orders()

    def _scrape_partial(self) -> None:
        """ scrape data until finding duplicates, at which point the scraping
        can be canceled since the rest is already there """
        self.orders = sorted(self.orders, key=lambda order: order.date)
        # Resume from the newest order already on disk.
        self.start_scraping_date = self.orders[-1].date
        scraped_orders: List[Order] = self._scrape_orders()
        # check for intersection of fetched orders
        existing_order_ids = list(
            map(lambda order: order.order_id, self.orders))
        new_orders: List[Order] = list(
            filter(lambda order: order.order_id not in existing_order_ids,
                   scraped_orders))
        self.orders.extend(new_orders)

    def _scrape_orders(self) -> List[Order]:
        """ :returns: a list of all orders in between given start year (inclusive) and end year (inclusive) """
        orders: List[Order] = []
        # order filter option 0 and 1 are already contained in option 2 [3months, 6months, currYear, lastYear, ...]
        start_index = 2 + (datetime.datetime.now().year - self.end_date.year)
        end_index = 2 + (datetime.datetime.now().year -
                         self.start_scraping_date.year) + 1
        for order_filter_index in range(start_index, end_index):
            # open the dropdown
            ut.wait_for_element_by_id(self.browser, 'a-autoid-1-announce')
            self.browser.find_element_by_id('a-autoid-1-announce').click()
            # select and click on a order filter
            id_order_filter = f'orderFilter_{order_filter_index}'
            ut.wait_for_element_by_id(self.browser, id_order_filter)
            dropdown_element = self.browser.find_element_by_id(id_order_filter)
            dropdown_element.click()
            pages_remaining = self._are_orders_for_year_available()
            while pages_remaining:
                orders_on_page: List[Order] = self._scrape_page_for_orders()
                orders.extend(orders_on_page)
                # NOTE(review): this indexes [-1] before the emptiness check on
                # the next line — an empty page would raise IndexError here;
                # confirm whether an empty page can occur at this point.
                current_date: datetime.date = orders_on_page[-1].date
                if orders_on_page and self.start_scraping_date > current_date:
                    break
                if self._is_paging_menu_available():
                    pagination_element = self.browser.find_element_by_class_name(
                        'a-pagination')
                else:
                    break
                pages_remaining = self._is_next_page_available()
                if pages_remaining:
                    next_page_link = pagination_element.find_element_by_class_name('a-last') \
                        .find_element_by_css_selector('a').get_attribute('href')
                    self.browser.get(next_page_link)
        return orders

    def _scrape_page_for_orders(self) -> List[Order]:
        """ :returns a list of all orders found on the currently open page """
        orders = []
        for order_element in self.browser.find_elements_by_class_name('order'):
            ut.wait_for_element_by_class_name(order_element,
                                              'order-info',
                                              timeout=3)
            order_info_element = order_element.find_element_by_class_name(
                'order-info')
            order_id, order_price, date = self._get_order_info(
                order_info_element)
            items = []
            # looking in an order there is a 'a-box' for order_info and and 'a-box' for each seller containing detailed
            # items info
            for items_by_seller in order_element.find_elements_by_class_name(
                    'a-box')[1:]:
                for index, item_element in enumerate(
                        items_by_seller.find_elements_by_class_name(
                            'a-fixed-left-grid')):
                    seller = self._get_item_seller(item_element)
                    title, link = self._get_item_title(item_element)
                    # Digital orders carry the price only at order level.
                    item_price = order_price if self._is_digital_order(order_id) else \
                        self._get_item_price(item_element, index, order_element)
                    categories = self._get_item_categories(
                        link) if self.extensive else dict()
                    items.append(
                        Item(item_price, link, title, seller, categories))
            orders.append(Order(order_id, order_price, date, items))
            current_date: datetime.date = orders[-1].date
            progress: float = self._get_progress(current_date=current_date)
            self._notify_progress_observers(progress)
        return orders

    @staticmethod
    def _get_item_seller(item_element: WebElement) -> str:
        """ :param item_element: the item div
        :return: returns the seller
        """
        try:
            # German storefront: seller is prefixed with 'durch: ' ("by: ").
            seller_raw: str = item_element.text.split('durch: ')[1]
            seller: str = seller_raw.split('\n')[0]
            return seller
        except IndexError:
            return 'not available'

    @staticmethod
    def _get_item_title(item_element: WebElement) -> Tuple[str, str]:
        """ :param item_element: the item div
        :return: returns the title and link of an item
        """
        item_elements = item_element.find_element_by_class_name('a-col-right') \
            .find_elements_by_class_name('a-row')
        item_title_element = item_elements[0]
        title = item_title_element.text
        try:
            link = item_title_element.find_element_by_class_name(
                'a-link-normal').get_attribute('href')
        except NoSuchElementException:
            link = 'not available'
        return title, link

    def _get_item_price(self, item_element: WebElement, item_index: int,
                        order_element: WebElement) -> float:
        """ :param item_element: the item div
        :param item_index: the index of the item in the order
        :param order_element: the order div
        :return: returns the price of an item
        """
        try:
            item_price_str = item_element.find_element_by_class_name(
                'a-color-price').text
            item_price = self._price_str_to_float(item_price_str)
        except (NoSuchElementException, ValueError):
            # Fall back to the order details page when the inline price is
            # missing or unparsable.
            item_price = self._get_item_price_through_details_page(
                order_element, item_index)
        return item_price

    def _get_item_price_through_details_page(self, order_element: WebElement,
                                             item_index: int) -> float:
        """ :param order_element: the order div
        :param item_index: the index of the item in the order
        :returns: the item price found on the order details page
        """
        item_price: float = 0
        try:
            order_details_link = order_element.find_element_by_class_name(
                'a-link-normal').get_attribute('href')
            # Open the details page in a second tab; the finally block always
            # closes it and returns to the main tab.
            self.browser.execute_script(
                f'''window.open("{order_details_link}","_blank");''')
            self.browser.switch_to.window(self.browser.window_handles[1])
            if not ut.wait_for_element_by_class_name(self.browser,
                                                     'od-shipments'):
                return item_price
            od_shipments_element = self.browser.find_element_by_class_name(
                'od-shipments')
            price_fields: List[
                WebElement] = od_shipments_element.find_elements_by_class_name(
                    'a-color-price')
            item_price = self._price_str_to_float(
                price_fields[item_index].text)
        except (NoSuchElementException, ValueError):
            item_price = 0
            self.logger.warning(
                colored(
                    f'Could not parse price for order:\n{order_element.text}',
                    'yellow'))
        finally:
            self.browser.close()
            self.browser.switch_to.window(self.browser.window_handles[0])
        return item_price

    def _get_item_categories(self, item_link: str) -> Dict[int, str]:
        """ :param item_link: the link to the item itself
        :returns: a dict with the categories and the importance as key
        """
        categories: Dict[int, str] = dict()
        # Open the item page in a second tab, try the breadcrumb container
        # (normal items) first, then the video meta info (Prime Video).
        self.browser.execute_script(f'''window.open("{item_link}","_blank");''')
        self.browser.switch_to.window(self.browser.window_handles[1])
        if ut.wait_for_element_by_id(self.browser,
                                     'wayfinding-breadcrumbs_container'):
            categories = self._get_item_categories_from_normal()
            self.browser.close()
            self.browser.switch_to.window(self.browser.window_handles[0])
            return categories
        if ut.wait_for_element_by_class_name(self.browser,
                                             'dv-dp-node-meta-info'):
            categories = self._get_item_categories_from_video()
            self.browser.close()
            self.browser.switch_to.window(self.browser.window_handles[0])
            return categories
        self.browser.close()
        self.browser.switch_to.window(self.browser.window_handles[0])
        return categories

    def _get_item_categories_from_normal(self) -> Dict[int, str]:
        """ :return: the categories for a normal ordered item """
        categories = dict()
        categories_element = self.browser.find_element_by_id(
            'wayfinding-breadcrumbs_container')
        for index, category_element in enumerate(
                categories_element.find_elements_by_class_name("a-list-item")):
            # Every second list item is a breadcrumb separator — skip it.
            element_is_separator = index % 2 == 1
            if element_is_separator:
                continue
            depth = int(index // 2 + 1)
            categories[depth] = category_element.text
        return categories

    def _get_item_categories_from_video(self) -> Dict[int, str]:
        """ :return: the genre of a movie as categories """
        categories = dict()
        text: str = self.browser.find_element_by_class_name(
            'dv-dp-node-meta-info').text
        genre = text.split("\n")[0]
        genre_list: List[str] = genre.split(", ")
        # Drop the "Genre" label word before the first genre name.
        genre_list[0] = genre_list[0].split(" ")[1]
        for index, genre in enumerate(genre_list):
            categories[index] = genre
        categories[len(genre_list)] = 'movie'
        return categories

    @staticmethod
    def _price_str_to_float(price_str: str) -> float:
        """ converts the price str to a float value
        :param price_str: the price in string format as it is scraped
        :return: the price as float
        """
        # 'EUR 12,34' -> strip the 4-char currency prefix, comma -> dot.
        return float((price_str[4:]).replace(',', '.'))

    def _get_progress(self, current_date: datetime.date) -> float:
        """ calculates the progress by months
        :returns the progress in percentage
        """
        # Approximates every month as 31 days; only used to form a ratio.
        total_days = self.end_date.day - self.start_scraping_date.day + \
            (self.end_date.month - self.start_scraping_date.month) * 31 + \
            (self.end_date.year - self.start_scraping_date.year) * 12 * 31
        scraped_days = self.end_date.day - current_date.day + \
            (self.end_date.month - current_date.month) * 31 + \
            (self.end_date.year - current_date.year) * 12 * 31
        progress: float = scraped_days / total_days if total_days > 0 else 1.0
        return progress if progress <= 1 else 1.0
class BrowserEngine:
    """Convenience wrapper around a (optionally headless, optionally proxied)
    Firefox WebDriver with explicit-wait helpers for finding, clicking and
    filling elements."""

    def __init__(self, wait=5, proxy=None, headless=True):
        """
        :param wait: explicit-wait timeout in seconds (find_element/is_clickable)
        :param proxy: optional "host:port" string routed for http/ftp/ssl
        :param headless: run Firefox without a visible window
        """
        # Build Options/FirefoxProfile per instance.  The original kept them
        # as class attributes, so every BrowserEngine shared (and mutated)
        # the same objects — one instance's headless/profile settings leaked
        # into all others.
        self.options = Options()
        self.profile = FirefoxProfile()
        # Disable images, Flash and all caching — trades page fidelity for
        # memory/speed.
        self.profile.set_preference("permissions.default.image", 2)
        # Supposed to help with memory issues
        self.profile.set_preference("dom.ipc.plugins.enabled.libflashplayer.so",
                                    False)
        self.profile.set_preference("browser.cache.disk.enable", False)
        self.profile.set_preference("browser.cache.memory.enable", False)
        self.profile.set_preference("browser.cache.offline.enable", False)
        self.profile.set_preference("network.http.use-cache", False)
        self.profile.accept_untrusted_certs = True

        # NOTE: this assignment shadows the proxy() method on the instance;
        # kept as-is for backward compatibility with the original interface.
        self.proxy = None if not proxy else self.proxy(proxy)
        self.options.headless = headless
        self.driver = Firefox(options=self.options,
                              firefox_profile=self.profile,
                              desired_capabilities=self.proxy)
        self.driver.set_window_position(
            0, 0)  # TODO: Not sure if these help or not with optimization
        self.driver.set_window_size(1024, 768)
        self.wait = WebDriverWait(self.driver, wait)

    def proxy(self, proxy):
        """Build Firefox capabilities that route http/ftp/ssl through *proxy*."""
        proxy = Proxy({
            "proxyType": ProxyType.MANUAL,
            "httpProxy": proxy,
            "ftpProxy": proxy,
            "sslProxy": proxy,
            "noProxy": ""
        })
        # .copy() so we do not mutate the shared module-level capability dict
        # (the original wrote into DesiredCapabilities.FIREFOX directly).
        capabilities = DesiredCapabilities.FIREFOX.copy()
        proxy.add_to_capabilities(capabilities)
        return capabilities

    def quit(self):
        """End the WebDriver session entirely."""
        self.driver.quit()

    def close(self):
        """Close only the current window."""
        self.driver.close()

    def refresh(self):
        self.driver.refresh()

    def back(self):
        """Go one step back in browser history."""
        self.driver.execute_script("window.history.go(-1)")

    def clear_cookies(self):
        self.driver.delete_all_cookies()

    def get(self, url):
        self.driver.get(url)

    def find_element(self, type_, value):
        """Find an element by a By attribute name (e.g. 'ID', 'XPATH').

        :return: the element, or False when the wait times out.
        """
        try:
            return self.wait.until(
                lambda driver: driver.find_element(getattr(By, type_), value))
        except TimeoutException:
            return False

    def populate_element(self, element, value):
        """Type *value* into *element*."""
        element.send_keys(value)

    def is_clickable(self, type_, value):
        """Wait until the located element is clickable and return it."""
        return self.wait.until(
            EC.element_to_be_clickable((getattr(By, type_), value)))

    def click(self, button):
        button.click()

    def select_dropdown(self, element, value):
        """Select an <option> by its value attribute."""
        select = Select(element)
        select.select_by_value(value)

    def submit(self, form):
        form.submit()

    def execute_script(self, code):
        self.driver.execute_script(code)

    def screenshot(self, filename):
        self.driver.get_screenshot_as_file(filename)
def webtest():
    """Smoke-test the local Flask app by loading it in a headless Firefox."""
    opts = Options()
    opts.add_argument('-headless')  # no visible browser window
    driver = Firefox(firefox_options=opts)
    driver.get('http://localhost:5000')  # the app under test
    driver.close()
class DriverProperty(object):
    """Wrapper around the selenium webdriver."""

    def __init__(self, base_url=None, headless=False, browser_name='chrome'):
        """
        :param str base_url: URL opened right after the browser starts
        :param bool headless: headless option (the extra arguments below are
            applied for Chrome)
        :param str browser_name: 'chrome', 'ie', 'safari', 'edge' or 'firefox'
        :return:
        """
        self.driver = None
        self.base_url = base_url
        self.options = Options()
        self.options.add_argument("--ignore-certificate-errors")
        self.options.add_argument("--allow-running-insecure-content")
        self.options.add_argument("--disable-web-security")
        if headless:
            self.options.add_argument("--headless")
            self.options.add_argument("--disable-gpu")
            self.options.add_argument("--disable-desktop-notifications")
            self.options.add_argument("--disable-extensions")
        self._open_browser(browser_name=browser_name.lower())

    def set_driver(self, driver=None):
        """Used to take over an existing driver.

        :param selenium.webdriver driver: driver to take over
        :return: DriverProperty self
        """
        self.driver = driver
        return self

    def _open_browser(self, browser_name):
        """Start the browser matching *browser_name* and open base_url.

        :return: webdriver
        """
        if browser_name == 'chrome':
            # Only Chrome receives the options configured in __init__.
            if self.options is not None:
                self.driver = Chrome(chrome_options=self.options)
            else:
                self.driver = Chrome()
        elif browser_name == 'ie':
            self.driver = Ie()
        elif browser_name == 'safari':
            self.driver = Safari()
        elif browser_name == 'edge':
            self.driver = Edge()
        elif browser_name == 'firefox':
            self.driver = Firefox()
        else:
            raise Exception('Faild input browser name')
        self.driver.get(self.base_url)
        return self.driver

    def visit(self, url):
        """Navigate to *url*; quits the driver if navigation fails.

        :param str url:
        :return: self
        """
        if url is None:
            raise Exception('input url.')
        try:
            self.driver.get(url)
        except WebDriverException:
            print('No such a url')
            self.driver.quit()

    def current_url(self):
        """Return the current URL.

        :return: string
        """
        # NOTE(review): selenium's `current_url` is a property, not a method —
        # calling it like this likely raises TypeError; confirm and fix.
        return self.driver.current_url()

    def close(self):
        """Close the current window only; the driver session is not quit.

        :return:
        """
        self.driver.close()

    def refresh(self):
        """Reload the current URL.

        :return:
        """
        self.driver.refresh()

    def authentication(self, user_name, pass_word):
        """Perform login authentication via the browser auth dialog.

        :param str user_name: user name
        :param str pass_word: password
        :return:
        """
        self.driver.switch_to.alert.authenticate(user_name, pass_word)

    def accept(self):
        """Accept an alert dialog, ignoring the case where none is present.

        :return:
        """
        try:
            Alert(self.driver).accept()
        except NoAlertPresentException:
            pass
class BrowserHandler:
    """Owns a Firefox instance — optionally inside a virtual Xvfb display —
    configured to download CSV/zip files silently into EXPORTS_FOLDER."""

    def __init__(self, args):
        self.args = args
        if self.args and not self.args.show_browser:
            # No visible browser requested: render into a virtual display.
            self.display = Xvfb()
            self.display.start()
        level = self._define_log_level(self.args)
        self.browser = Firefox(
            firefox_profile=self._create_browser_profile(),
            capabilities=self._create_browser_capabilities(level),
            firefox_options=self._create_browser_options(level),
            log_path="{timestamp}_geckodriver.log".format(timestamp=TIMESTAMP))
        # https://stackoverflow.com/questions/42754877/cant-upload-file-using-selenium-with-python-post-post-session-b90ee4c1-ef51-4  # pylint: disable=line-too-long
        self.browser._is_remote = False  # pylint: disable=protected-access
        self.browser.maximize_window()

    @staticmethod
    def _define_log_level(args):
        """Translate the -v/-vv/-vvv verbosity count into a geckodriver level."""
        verbosity = args.verbose if args else None
        if not verbosity:
            return 'warn'
        if verbosity >= 3:
            return 'trace'
        if verbosity == 2:
            return 'debug'
        if verbosity == 1:
            return 'info'
        return 'warn'

    @staticmethod
    def _create_browser_capabilities(log_level):
        """Firefox capabilities carrying the chosen geckodriver log level."""
        capabilities = DesiredCapabilities.FIREFOX.copy()
        capabilities["moz:firefoxOptions"] = {
            "log": {
                "level": log_level,
            },
        }
        return capabilities

    @staticmethod
    def _create_browser_options(log_level):
        """Firefox options mirroring the same log level."""
        browser_options = Options()
        browser_options.log.level = log_level
        return browser_options

    @staticmethod
    def _create_browser_profile():
        """Profile that auto-saves CSV/zip downloads into EXPORTS_FOLDER,
        mutes audio and disables the JSON viewer."""
        preferences = {
            "browser.download.folderList": 2,
            "browser.download.manager.showWhenStarting": False,
            "browser.download.dir": EXPORTS_FOLDER,
            "browser.helperApps.neverAsk.saveToDisk":
                "text/csv, application/zip",
            "browser.helperApps.alwaysAsk.force": False,
            "devtools.jsonview.enabled": False,
            "media.volume_scale": "0.0",
            # https://github.com/mozilla/geckodriver/issues/858#issuecomment-322512336
            "dom.file.createInChild": True,
        }
        profile = FirefoxProfile()
        for key, value in preferences.items():
            profile.set_preference(key, value)
        return profile

    def kill(self):
        """Tear everything down, tolerating an already-dead driver."""
        self.browser.stop_client()
        self.browser.close()
        try:
            self.browser.quit()
        except WebDriverException:
            pass
        if self.args and not self.args.show_browser:
            self.display.stop()
def test_selenium_login():
    """Open the app's page in a headless Firefox, then close the window."""
    opts = Options()
    opts.add_argument('-headless')
    browser = Firefox(executable_path="./geckodriver", options=opts)
    browser.get("http://localhost:8080/app.py/")
    browser.close()
class SasCrawler(CrawlerInterface):
    """Headless-Firefox crawler for the classic SAS booking site: fills in
    and submits the flight-search form for a hard-coded route/date pair."""

    def __init__(self):
        self.logger = logging.getLogger(str(self.__class__))
        self.url = 'https://classic.flysas.com/'
        # Hard-coded search parameters: route and travel dates.
        self.flight = {
            'from': 'ARN',
            'to': 'LHR',
            'date_from': '2018-11-05',
            'date_to': '2018-11-11',
        }
        logname = '{0}_{1}-{2}.log'.format(
            datetime.strftime(datetime.now(), '%Y-%m-%d_%H%M%S'),
            self.flight['from'], self.flight['to'])
        # set up logging to file
        logging.basicConfig(level=logging.DEBUG,
                            format='%(asctime)s %(levelname)-8s %(message)s',
                            datefmt='%Y-%m-%d %H:%M:%S',
                            filename='log_files/' + logname,
                            filemode='w')
        opts = Options()
        opts.headless = True
        self.browser = Firefox(options=opts)
        # The whole crawl runs from the constructor.
        self.run()

    def run(self):
        """Load, fill and submit the search form, then close the browser."""
        self.load_search_form()
        self.fill_search_form()
        self.submit_search_form()
        self.browser.close()

    def load_search_form(self):
        """Open the start page and pick the language/market entry."""
        self.browser.get(self.url)
        # select language
        self.browser.find_element_by_xpath(
            '//*[@id="lstMarkets"]/tbody/tr[8]/td[3]').click()

    def fill_search_form(self):
        """Type origin/destination codes and choose outward/return dates."""
        # flight from
        elem = self.browser.find_element_by_name(
            'ctl00$FullRegion$MainRegion$ContentRegion$ContentFullRegion$ContentLeftRegion$CEPGroup1$CEPActive$cepNDPRevBookingArea$predictiveSearch$txtFrom'
        )
        elem.send_keys(self.flight['from'])
        self.browser.implicitly_wait(1.5)
        self.browser.find_element_by_css_selector(
            '#resultFrom .selected').click()
        # flight to
        elem = self.browser.find_element_by_name(
            'ctl00$FullRegion$MainRegion$ContentRegion$ContentFullRegion$ContentLeftRegion$CEPGroup1$CEPActive$cepNDPRevBookingArea$predictiveSearch$txtTo'
        )
        elem.send_keys(self.flight['to'])
        self.browser.implicitly_wait(1.5)
        self.browser.find_element_by_css_selector(
            '#resultTo .selected').click()
        # from (field)
        elem = self.browser.find_element_by_class_name('flOutDate')
        elem.click()
        self.find_outward_date()
        # return (field)
        elem = self.browser.find_element_by_class_name('flInDate')
        elem.click()
        self.find_return_date()

    def find_outward_date(self):
        """Step the datepicker forward to the outward month, click the day."""
        # from (datepicker)
        datetime_object = self.get_datepicker_date()
        date_from = datetime.strptime(self.flight['date_from'], '%Y-%m-%d')
        while datetime_object.month != date_from.month:
            self.browser.find_element_by_class_name(
                'ui-datepicker-month-link').click()
            datetime_object = self.get_datepicker_date()
        for el in self.browser.find_elements_by_css_selector(
                '.ui-datepicker-calendar td'):
            if el.text.strip() == str(date_from.day):
                el.click()
                break

    def find_return_date(self):
        """Step the datepicker forward to the return month, click the day."""
        # return (datepicker)
        datetime_object = self.get_datepicker_date()
        date_to = datetime.strptime(self.flight['date_to'], '%Y-%m-%d')
        while datetime_object.month != date_to.month:
            self.browser.find_element_by_class_name(
                'ui-datepicker-month-link').click()
            datetime_object = self.get_datepicker_date()
        for el in self.browser.find_elements_by_css_selector(
                '.ui-datepicker-calendar td'):
            if el.text.strip() == str(date_to.day):
                el.click()
                break

    def submit_search_form(self):
        """Click the search button to submit the form."""
        # submit form
        elem = self.browser.find_element_by_id(
            'ctl00_FullRegion_MainRegion_ContentRegion_ContentFullRegion_ContentLeftRegion_CEPGroup1_CEPActive_cepNDPRevBookingArea_Searchbtn_ButtonLink'
        )
        elem.click()

    def get_datepicker_date(self):
        """Read the currently selected date out of the jQuery-UI datepicker."""
        cur_day = int(
            self.browser.find_element_by_class_name('ui-state-active').text)
        cur_month = self.browser.find_element_by_class_name(
            'ui-datepicker-month').text
        cur_year = int(
            self.browser.find_element_by_class_name('ui-datepicker-year').text)
        return datetime.strptime(
            '{year} {month} {day}'.format(year=cur_year,
                                          month=cur_month,
                                          day=cur_day), '%Y %B %d')

    def get_data(self):
        # Required by CrawlerInterface; no result extraction implemented yet.
        pass
class RegistrationWebTest(TestCase):
    """ Test all facets of the registration process.

    End-to-end tests driving a real Firefox instance against a live test
    server, backed by a MongoDB user collection. The fixed test accounts
    'UnittestExistingTestUser' / 'UnittestNonExistingTestUser' and the
    masked '******' user are created/removed around each run.
    """

    @classmethod
    def clear_database(cls):
        """ Clear the database before and after use """
        collection = cls.mongo.collection
        for user in ['UnittestExistingTestUser', 'UnittestNonExistingTestUser']:
            test_user = collection.find_one({
                'username': user,
            })
            if test_user:
                collection.remove(test_user)

    @classmethod
    def setUpClass(cls):
        """ Setup test data, browser and server """
        cls.mongo = UserDatabaseConnectivity()
        cls.clear_database()
        # Pre-seed one existing (disabled) account so the "username taken"
        # path can be exercised.
        test_user = {
            'username': '******',
            'salt': '000',
            'password': '******',
            'enabled': False,
        }
        cls.mongo.collection.save(test_user)
        cls.config = dict()
        # prepare_test() fills cls.config (bind_ip/bind_port) and starts the
        # server under test.
        prepare_test(cls)
        cls.base_url = 'http://{:s}:{:d}/static/index.xhtml'.format(cls.config['bind_ip'], cls.config['bind_port'])

    @classmethod
    def tearDownClass(cls):
        """ Disconnect from mongo and cleanup browser, server, etc. """
        cls.clear_database()
        del cls.mongo
        cleanup(cls)

    def setUp(self):
        """ Force a page refresh between tests """
        self.webdriver = Firefox()
        # NOTE(review): implicitly_wait() sets a global lookup timeout; the
        # repeated calls throughout this class just re-apply it.
        self.webdriver.implicitly_wait(10)

    def tearDown(self):
        """ Throw test user out of database """
        self.webdriver.close()
        self.webdriver.quit()
        collection = self.mongo.collection
        test_user = collection.find_one({
            'username': '******',
        })
        if test_user:
            collection.remove(test_user)

    def __util_get_reg_button(self):
        """ Get the registration form button """
        self.webdriver.get(self.base_url)
        self.webdriver.implicitly_wait(10)
        # presumably waiting for client-side JS to render the page — TODO
        # confirm; an explicit wait would be more robust than sleep().
        sleep(3)
        button = self.webdriver.find_element_by_xpath('//xhtml:button[@data-formaction="registrationForm"]')
        return button

    def __util_open_dialog(self):
        """ Open the registration dialog """
        button = self.__util_get_reg_button()
        button.click()
        self.webdriver.implicitly_wait(10)
        sleep(5)

    def test_find_button(self):
        """ Is the button there? """
        self.assertIsNotNone(self.__util_get_reg_button())

    def test_open_dialog(self):
        """ Can we open the dialog? """
        dialog_xpath = '//xhtml:div[contains(@class, "bootstrap-dialog")]'
        # Test that there is no dialog open at the moment
        self.assertRaises(NoSuchElementException, self.webdriver.find_element_by_xpath, dialog_xpath)
        self.__util_open_dialog()
        dialog = self.webdriver.find_element_by_xpath(dialog_xpath)
        self.assertIsNotNone(dialog)

    def __util_get_form_and_username_field(self, reopen=True):
        """ Find the form and the username field """
        if reopen:
            self.__util_open_dialog()
        form = self.webdriver.find_element_by_xpath(
            '//xhtml:div[contains(@class, "bootstrap-dialog") and contains(@class, "modal") and @id]'
            '//xhtml:div[@class="bootstrap-dialog-body"]'
            '//xhtml:form[@id="formlib_registration"]'
        )
        username_field = form.find_element_by_name('username')
        return form, username_field

    def test_enter_existing_username(self):
        """ Test with an already existing username """
        form, username_field = self.__util_get_form_and_username_field()
        username_field.click()
        username_field.send_keys('UnittestExistingTestUser')
        username_field.send_keys(Keys.ENTER)
        self.webdriver.implicitly_wait(5)
        error_msg = form.find_element_by_xpath(
            '//xhtml:div[@data-fieldref="formlib_registration_username" and @role="alert"]'
        )
        self.assertTrue(error_msg.text.endswith('username_not_available'))

    @staticmethod
    def util_get_password_fields(form):
        """ Find the two password fields in the form """
        sleep(3)
        pwd_field_1 = form.find_element_by_name('password1')
        pwd_field_2 = form.find_element_by_name('password2')
        return pwd_field_1, pwd_field_2

    def __util_enter_non_existing_username(self, username_field):
        """ Enter a username that works """
        # Only types when the field is enabled; a reused (cached) form may
        # already have the username filled in.
        if username_field.is_enabled():
            username_field.click()
            username_field.send_keys('UnittestNonExistingTestUser')
            username_field.send_keys(Keys.ENTER)
            self.webdriver.implicitly_wait(5)
            sleep(3)

    def __util_test_single_pwd_error_message(self, form):
        """ Check if there is only a single pwd error message """
        self.webdriver.implicitly_wait(5)
        sleep(1)
        error_message = form.find_element_by_xpath(
            '//xhtml:div[@data-fieldref="formlib_registration_password1" and @role="alert"]'
        )
        self.assertEqual('Password invalid', error_message.text)
        # password2 must NOT show its own alert in this scenario
        self.assertRaises(
            NoSuchElementException,
            form.find_element_by_xpath,
            '//xhtml:div[@data-fieldref="formlib_registration_password2" and @role="alert"]'
        )

    def test_non_exist_uname_pwd1_too_short(self):
        """ Test with a too short password in password 1 """
        form, username_field = self.__util_get_form_and_username_field()
        self.__util_enter_non_existing_username(username_field)
        pwd1, dummy = RegistrationWebTest.util_get_password_fields(form)
        pwd1.click()
        pwd1.send_keys('123')
        pwd1.send_keys(Keys.ENTER)
        self.__util_test_single_pwd_error_message(form)

    def test_non_exist_uname_pwd1_pwd2_too_short_but_eq(self):
        """ Test with two passwords equal, but to short """
        form, username_field = self.__util_get_form_and_username_field()
        self.__util_enter_non_existing_username(username_field)
        pwd1, pwd2 = RegistrationWebTest.util_get_password_fields(form)
        for pwd in [pwd1, pwd2]:
            pwd.click()
            pwd.send_keys('123')
        pwd2.send_keys(Keys.ENTER)
        self.__util_test_single_pwd_error_message(form)

    def test_non_exist_uname_w_val_pwd1_a_inval_repeat(self):
        """ Test with long enough passwords, but not equal """
        form, username_field = self.__util_get_form_and_username_field()
        self.__util_enter_non_existing_username(username_field)
        pwd1, pwd2 = RegistrationWebTest.util_get_password_fields(form)
        pwd1.click()
        pwd1.send_keys('test1234')
        pwd1.send_keys(Keys.ENTER)
        pwd2.send_keys('test1235')
        pwd2.send_keys(Keys.ENTER)
        # here the mismatch alert must be on password2, not password1
        self.assertRaises(
            NoSuchElementException,
            form.find_element_by_xpath,
            '//xhtml:div[@data-fieldref="formlib_registration_password1" and @role="alert"]'
        )
        error_message = form.find_element_by_xpath(
            '//xhtml:div[@data-fieldref="formlib_registration_password2" and @role="alert"]'
        )
        self.assertEqual('Passwords do not match', error_message.text)

    def test_full_registration_flow(self):
        """ Test the registration flow completely """
        form, username_field = self.__util_get_form_and_username_field()
        self.__util_enter_non_existing_username(username_field)
        pwd1, pwd2 = RegistrationWebTest.util_get_password_fields(form)
        for pwd in [pwd1, pwd2]:
            pwd.click()
            pwd.send_keys('test1234')
            pwd.send_keys(Keys.ENTER)
        self.webdriver.implicitly_wait(5)
        # no field-level alerts may remain after a valid submission
        for field in ['password1', 'password2']:
            self.assertRaises(
                NoSuchElementException,
                form.find_element_by_xpath,
                '//xhtml:div[@data-fieldref="formlib_registration_{:s}" and @role="alert"]'.format(field)
            )
        success_message = form.find_element_by_xpath(
            '//xhtml:div[contains(@class, "alert") and contains(@class, "alert-success") and @role="alert"]'
        )
        self.assertEqual('Registration successful', success_message.text)
        # the dialog's close button must be present after success
        form.find_element_by_xpath(
            '//xhtml:button[contains(@class, "btn") and contains(@class, "btn-default")]'
        )
class BasicSpider:
    """Selenium + BeautifulSoup spider for lianjia.com listing pages.

    Crawls district ("level0") and sub-district ("level1") category pages
    and writes one tab-separated row per listing (title, url, price,
    address, source page) to *filename*. Optional Tor/Privoxy support for
    rotating the exit IP.
    """

    total = 0  # running count of scraped listings
    name = 'basic'
    ip_url = "https://icanhazip.com"
    # start_url = 'http://sh.lianjia.com/ershoufang/'

    def __init__(self, start_url, filename):
        self.driver = Firefox()
        self.start_url = start_url
        self.file = open(filename, "w")
        self.csvWriter = csv.writer(self.file, delimiter='\t')
        # initial dist_name urls
        self.lvl1_urls = set()
        self.lvl1_urls_retrived = set()
        self.lvl0_urls = set()
        self.lvl0_urls_retrived = set()
        self.lvl0_urls_retrived.add(self.start_url)
        self._get_start_urls(self.start_url)

    def scrapingAll(self):
        """Crawl every collected sub-district category page."""
        for url in self.lvl1_urls:
            self.parseCategroy(url)

    def parseCategroy(self, url=None):
        """Scrape one category page, then recurse into its "next" page link."""
        if not url:
            url = self.start_url
        self._sleep()
        self.driver.get(url)
        element = WebDriverWait(self.driver, 50).until(
            EC.presence_of_element_located((By.CLASS_NAME, "c-pagination")))
        for item in self.getItems():
            item.append(url)
            self.csvWriter.writerow(item)
            self.file.flush()
        # renamed from `next`, which shadowed the builtin
        next_links = self.driver.find_elements_by_xpath(
            '//*[@class="c-pagination"]/'
            'a[@gahref="results_next_page"]')
        if len(next_links) == 1:
            nexturl = urljoin(url, next_links[0].get_attribute("href"))
            print(nexturl)
            # NOTE(review): recursion depth grows with the page count of a
            # category — acceptable for the expected few hundred pages.
            self.parseCategroy(nexturl)
        else:
            print("in parseCategroy: %d next find, url -> %s" %
                  (len(next_links), url))

    def getItems(self):
        """Parse the current page; return [title, url, price, address] rows."""
        bs = BeautifulSoup(self.driver.page_source)
        rows = []
        for li in bs.find("ul", {"class": "js_fang_list"}).findAll("li"):
            prop_title = li.find("div", {"class": "prop-title"}).find("a")
            title = prop_title.text.replace(" ", "")
            url = urljoin(self.start_url, prop_title["href"])
            infos = li.findAll("div", {"class": "info-row"})
            price = infos[0].find("div", {
                "class": "info-col price-item main"
            }).text.replace("\n", "")
            address = ":".join([tag.text for tag in infos[1].findAll("a")])
            rows.append([title, url, price, address])
        self.total += len(rows)
        print("total current: ", self.total)
        return rows

    def changeIP(self):
        """Ask the local Tor control port for a fresh circuit/exit IP."""
        with Controller.from_port(port=9051) as controller:
            controller.authenticate()
            controller.signal(Signal.NEWNYM)

    def refreshDriver(self):
        """Restart Firefox behind a fresh Tor identity via the local proxy."""
        # make sure Tor and Proxies has installed and configure
        if hasattr(self, "driver"):
            self.driver.close()
        self.changeIP()
        proxy_address = "localhost:8118"
        # (a socks-only Proxy object was previously built here and then
        # immediately overwritten — removed as dead code)
        profile = FirefoxProfile()
        proxy = Proxy({
            'proxyType': ProxyType.MANUAL,
            'httpProxy': proxy_address,
            'httpsProxy': proxy_address,
            'ftpProxy': proxy_address,
            'sslProxy': proxy_address,
            'noProxy': ""
        })
        profile.set_proxy(proxy)
        self.driver = Firefox(firefox_profile=profile)

    def _get_start_urls(self, url):
        """Recursively harvest level0/level1 category URLs starting at *url*."""
        # get first dist_name level
        self._sleep()
        self.driver.get(url)
        self._get_level0_urls()
        if len(self.lvl0_urls):
            new_url = self.lvl0_urls.pop()
            print("new url: %s" % new_url)
            self.lvl0_urls_retrived.add(new_url)
            self._get_level1_urls()
            self._get_start_urls(new_url)

    def _get_level0_urls(self):
        """Collect unseen district-level links from the current page."""
        level0_urls = set()
        districts = self.driver.find_element_by_id("plateList")
        for district in districts.find_elements_by_xpath(
                "//div[@class='level1']/a"):
            dist_url = urljoin(self.start_url, district.get_attribute("href"))
            if dist_url not in self.lvl0_urls_retrived:
                level0_urls.add(dist_url)
        self.lvl0_urls.update(level0_urls)
        print("refresh level0 %s" % level0_urls)
        return level0_urls

    def _get_level1_urls(self):
        """Collect sub-district links, skipping already-known level0 URLs."""
        districts = self.driver.find_element_by_id("plateList")
        # Fixed precedence bug: the original condition was
        # `href not in self.lvl0_urls and self.lvl0_urls_retrived`, where the
        # second operand was only a truthiness check on the set — the
        # evident intent is to exclude members of BOTH sets.
        level1_urls = set(
            href for href in (a.get_attribute("href")
                              for a in districts.find_elements_by_xpath(
                                  "//div[@class='level2-item']/a"))
            if href not in self.lvl0_urls
            and href not in self.lvl0_urls_retrived)
        self.lvl1_urls.update(level1_urls)
        return level1_urls

    def __del__(self):
        # Guarded: __del__ may run after a partially-failed __init__ (no
        # driver/file attribute yet), and exceptions here are only noise.
        try:
            self.driver.close()
        except Exception:
            pass
        try:
            self.file.close()
        except Exception:
            pass

    def _sleep(self):
        """Random politeness delay between requests."""
        time.sleep(random.randint(1, 5))

    def _conect_db(self):
        """Open (or create) the local url-tracking SQLite database."""
        cnx = sqlite3.connect("urls.db")
        self.cnx = cnx
        self.cursor = cnx.cursor()

    def _insert(self, table_name, url):
        """Persist *url* into *table_name* and commit."""
        stmt = "insert into %s(url) values(?) " % table_name
        self.cursor.execute(stmt, [url])
        self.cnx.commit()

    def _retrive(self, table_name):
        """Return the set of all URLs stored in *table_name*."""
        stmt = "select url from %s" % table_name
        self.cursor.execute(stmt)
        return set([item[0] for item in self.cursor.fetchall()])

    def _exist(self, tablename, url):
        """Return True if *url* is already stored in *tablename*."""
        # The URL is now bound as a parameter; the original interpolated it
        # unquoted into the SQL text, which both broke the query and allowed
        # SQL injection. (Table names cannot be parameterized in SQLite, so
        # *tablename* must remain trusted.)
        stmt = "select url from %s where url = ?" % tablename
        self.cursor.execute(stmt, [url])
        return len(self.cursor.fetchall()) > 0
def getHydrawiseData(outFileDir): profile = FirefoxProfile() profile.set_preference('browser.download.folderList', 2) profile.set_preference('browser.download.manager.showWhenStarting', False) profile.set_preference('browser.helperApps.neverAsk.saveToDisk', 'application/vnd.ms-excel') profile.set_preference('browser.download.dir', outFileDir) opts = Options() opts.set_headless() assert opts.headless browser = Firefox(options=opts, firefox_profile=profile) browser.implicitly_wait(10) browser.get('https://app.hydrawise.com/config/login') assert (browser.title == 'Hydrawise') hydraUser = os.environ['HYDRA_USER'] hydraPW = os.environ['HYDRA_PW'] login_form = browser.find_elements_by_class_name('form-control') login_form[0].send_keys(hydraUser) login_form[1].send_keys(hydraPW) login_button = browser.find_element_by_class_name('login-btn') login_button.click() try: element = WebDriverWait(browser, 10).until( EC.title_is('Hydrawise Configuration')) except TimeoutException: print('logged in failed!') assert (browser.title == 'Hydrawise Configuration') # After logging in, short circuit all the button clicking by just loading up where we # want to go browser.get('https://app.hydrawise.com/config/reports') downloadButton = -1 todayButton = -1 dayButton = -1 downloadRe = re.compile(r'>Download<') todayRe = re.compile(r'>today<') dayRe = re.compile(r'>day<') buttons = browser.find_elements_by_tag_name('button') for (i, button) in enumerate(buttons): html = button.get_attribute('outerHTML') if downloadRe.search(html): downloadButton = i if todayRe.search(html): todayButton = i if dayRe.search(html): dayButton = i buttons[dayButton].click() buttons[todayButton].click() buttons[downloadButton].click() browser.close()
class Insta_automate:
    """Headless-browser Instagram automation: login, scrape follower /
    following lists, and follow/unfollow users."""

    def __init__(self, username, passwd):
        self.username, self.passwd = username, passwd
        self.open_instagram()
        print(its_ok + "Opened instagram")
        self.login()
        print(its_ok + "logged in")

    def open_instagram(self):
        """Start a browser (Chrome on Windows with Firefox fallback, Firefox
        elsewhere), open instagram.com and type the credentials."""
        if os.name == "nt":  # if its windows
            try:
                self.brows = Chrome(executable_path=chrome_path,
                                    chrome_options=headless_for_chrome())
            except WebDriverException:
                print(its_not_ok +
                      "Cannot find Chrome binary...\nTrying Firefox")
                self.brows = Firefox(service=Service(firefox_path),
                                     options=headles_for_firefox())
        else:
            try:
                self.brows = Firefox(service=Service(firefox_path),
                                     options=headles_for_firefox())
            except WebDriverException:
                print(its_not_ok + "Cannot find gecko...\nTrying install")
                install_to_os()
                print(
                    its_ok +
                    "Installed Successfully Again do you want to headles or not ?"
                )
                self.brows = Firefox(service=Service(firefox_path),
                                     options=headles_for_firefox())
        self.brows.maximize_window()
        self.brows.implicitly_wait(20)
        self.brows.get("https://www.instagram.com/")
        self.find_username = self.brows.find_element(
            By.NAME, "username").send_keys(self.username)
        self.find_passwd = self.brows.find_element(
            By.NAME, "password").send_keys(self.passwd)

    def login(self):
        """Submit the login form and dismiss the "save login info" prompt."""
        self.login_but = self.brows.find_element(By.XPATH,
                                                 xpaths["login_but"]).click()
        self.dont_save_but = self.brows.find_element(By.CLASS_NAME,
                                                     "cmbtv").click()

    def go_to_my_profile(self):
        """Open the logged-in account's own profile page."""
        # Fixed two bugs: the original referenced a global `username`
        # (NameError) and passed page_source to get_profile_info(), which
        # takes no argument (TypeError).
        self.brows.get("https://www.instagram.com/{}/".format(self.username))
        self.get_profile_info()

    def go_to_user_page(self, name_list):
        """Visit each profile page in *name_list* in turn."""
        for name in name_list:
            self.brows.get("https://www.instagram.com/{}/".format(name))

    def get_followers_and_followings(self, name):
        """Interactively scrape *name*'s follower or following usernames
        into self.data (scrolls the list dialog until exhausted)."""
        self.brows.get("https://www.instagram.com/{}/".format(
            name))  #go to desired userpage
        print(
            f"Scrape {Fore.MAGENTA}{name}{Fore.RESET}'s follower or following users [Default is 1]"
        )
        # one profile scrape instead of four identical page parses
        info = self.get_profile_info()
        print(f"""
    [{Fore.BLUE} {Fore.RESET}] POST       :  {Fore.GREEN} {info["post"]} {Fore.YELLOW}(in maintenance) {Fore.RESET}
    [{Fore.BLUE}1{Fore.RESET}] FOLLOWING  :  {Fore.GREEN} {info["following"]} {Fore.RESET}
    [{Fore.BLUE}2{Fore.RESET}] FOLLOWERS  :  {Fore.GREEN} {info["followers"]} {Fore.RESET}
        """)
        self.which = input(f"\n -->")
        if self.which == "1":
            self.phr = "3"
            self.total_ = info["following"]
        elif self.which == "2":
            self.phr = "2"
            self.total_ = info["followers"]
        else:
            self.phr = "3"
            self.total_ = info["following"]
        self.brows.find_element(By.XPATH, f"//ul/li[{self.phr}]/a/div").click(
        )  #click followers or followed button
        self.data = []
        # rough number of END-key scrolls needed to load the whole list
        if self.total_ < 24:
            self.total_ = 5
        else:
            self.total_ = round(self.total_ / 3)
        self.start_time = time.time()
        for i in range(self.total_):  #page down
            print(f" Loop : {self.total_}/{Fore.CYAN}{i}",
                  end="\r",
                  flush=True)
            self.brows.find_element(By.CLASS_NAME, "isgrP").send_keys(Keys.END)
            # in every loop after page down make new instance for new usernames
            self.soup = BS(self.brows.page_source, "lxml")
            self.all_span = [
                span.text for span in self.soup.find_all(
                    "span", attrs={"class": "Jv7Aj"})
            ][len(self.data):]
            # loop variable renamed: it used to shadow the `name` parameter
            for user in self.all_span:
                if user not in self.data:
                    self.data.append(user)
            print(
                f"\t[Users Scraped : {Fore.MAGENTA}{len(self.data)} {Fore.RESET}] Sec: {Fore.YELLOW} {round(time.time() - self.start_time)}"
                .expandtabs(30),
                end="\r",
                flush=True)
        print(
            f"\t[Users Scraped : {Fore.MAGENTA}{len(self.data)} {Fore.RESET}] | Sec: {Fore.YELLOW} {round(time.time() - self.start_time)}"
            .expandtabs(30))

    def get_profile_info(self):
        """Scrape the post/followers/following counters from the current page.

        Assumes the first three <li> elements of the profile header are the
        post / followers / following counters, in that order — confirm
        against the live page markup.
        """
        self.info = {"post": "", "followers": "", "following": ""}
        self.soup = BS(self.brows.page_source, "lxml")
        for data, key in zip(self.soup.find_all("li"), self.info.keys()):
            checked_str = data.text.split()[0]
            try:
                self.info[key] = int(checked_str)
            except ValueError:
                # abbreviated counters ("1.2m" etc.) go through the helper
                self.info[key] = text_to_num(checked_str)
        return self.info

    def follow(self, username):
        """Open *username*'s profile and click the follow button."""
        self.brows.get("https://www.instagram.com/{}/".format(username))
        self.brows.find_element(By.XPATH, xpaths["follow_but"]).click()
        print(its_ok + f"Followed > {Fore.BLUE}{username}")

    def unfollow(self, username):
        """Open *username*'s profile, unfollow, and confirm the prompt."""
        self.brows.get("https://www.instagram.com/{}/".format(username))
        self.brows.find_element(By.XPATH, xpaths["follow_but"]).click()
        # locator passed positionally: the parameter is named `locator` in
        # selenium 3 but `mark` in selenium 4, so a keyword call breaks on
        # one of the two.
        self.notfy_but = WebDriverWait(self.brows, 10).until(
            EC.element_to_be_clickable((By.XPATH, xpaths["unfollow_notfy"])))
        self.notfy_but.click()
        print(its_ok + f"Unfollowed > {Fore.RED}{username}")

    def hack_with_foll_unfoll(self, user_list: list):
        """Repeatedly follow then unfollow every user in *user_list*."""
        # int(): the range()/delay math below needs integers; default_val
        # appears to hand back the raw prompt string — TODO confirm.
        self.follow_count = int(
            default_val("How much loop? [default 8] : ", "8"))
        self.delay_unfollow = int(
            default_val("Delay for unfollows [second] [default 10 sec] :",
                        "10"))
        self.delay_follow = int(
            default_val("Delay for follows [second] [default 4 sec] :", "4"))
        for i in range(self.follow_count):
            for name in user_list:
                try:
                    self.follow(name)
                except Exception:  # was bare except — no longer eats Ctrl-C
                    pass  # best-effort: skip users that cannot be followed
            # NOTE(review): the "unfollow" delay is applied after the follow
            # pass (and vice versa) in the original — preserved as-is.
            self.delay(self.delay_unfollow)
            for name in user_list:
                try:
                    self.unfollow(name)
                except Exception:
                    pass
            self.delay(self.delay_follow)
            print(f"\nLoop : {i+1}/{self.follow_count}")

    def close_tab(self):
        """Close the current browser window only."""
        self.brows.close()

    def close_all(self):
        """Shut down the whole browser session."""
        self.brows.quit()

    @staticmethod
    def delay(delay):
        """Sleep *delay* seconds with a live countdown on one console line."""
        for i in range(delay, 0, -1):
            time.sleep(1)
            print(f" Waiting  for [{Fore.YELLOW}{i}{Fore.RESET}] ", end="\r")
class FirefoxTestCase(LiveServerTestCase):
    """Selenium/Firefox UI tests: navbar links, grid-card links on each
    model page, and browser history navigation."""

    def create_app(self):
        # Flask-Testing hook: build the app under the "__test__" config.
        app = create_app("__test__")
        print(app.instance_path)
        print(app.root_path)
        print(app.template_folder)
        return app

    def setUp(self):
        options = Options()
        # NOTE(review): '--port=5000' is handed to the Firefox binary here,
        # not to the live server — confirm this argument is needed at all.
        options.add_argument('--port=5000')
        options.log.level = 'debug'
        self.driver = Firefox(options=options)
        self.driver.implicitly_wait(10)
        self.driver.get(self.get_server_url())

    def tearDown(self):
        # NOTE(review): close() only closes the window; quit() would also end
        # the geckodriver session — consider switching to avoid leaks.
        self.driver.close()

    # Test that all navbar links are functional
    def test_navbar_links(self):
        driver = self.driver
        self.assertIn("Let's Get Fit", driver.title)
        foods_link = driver.find_element_by_link_text('Foods')
        foods_link.click()
        self.assertIn("CKC - Foods", driver.title)
        workouts_link = driver.find_element_by_link_text('Workouts')
        workouts_link.click()
        self.assertIn("CKC - Workouts", driver.title)
        gyms_link = driver.find_element_by_link_text('Gyms')
        gyms_link.click()
        self.assertIn("CKC - Gyms", driver.title)
        stores_link = driver.find_element_by_link_text('Stores')
        stores_link.click()
        self.assertIn("CKC - Stores", driver.title)
        about_link = driver.find_element_by_link_text('About')
        about_link.click()
        wait = WebDriverWait(driver, 10)
        element = wait.until(EC.title_is(('About Us')))
        self.assertIn("About Us", driver.title)

    # Test that food grid card title links lead to the correct instance that displays the name properly
    def test_food_grid_links(self):
        driver = self.driver
        self.assertIn("Let's Get Fit", driver.title)
        driver.find_element_by_link_text('Foods').click()
        self.assertIn("CKC - Foods", driver.title)
        # Adding to make it wait
        food_item = driver.find_element_by_class_name('card-title')
        food_name = food_item.text
        # the clickable element differs from the one carrying the name
        food_item = driver.find_element_by_class_name('title-link')
        print(food_name)
        print(driver.title)
        food_item.click()
        wait = WebDriverWait(driver, 10)
        print(driver.title)
        element = wait.until(EC.title_is((food_name)))
        self.assertEqual(food_name, driver.find_element_by_tag_name('h1').text)

    # Test that workout grid card title links lead to the correct instance that displays the name properly
    def test_workout_grid_links(self):
        driver = self.driver
        self.assertIn("Let's Get Fit", driver.title)
        driver.find_element_by_link_text('Workouts').click()
        self.assertIn("CKC - Workouts", driver.title)
        workout_item = driver.find_element_by_class_name('card-title')
        workout_name = workout_item.text
        workout_item.click()
        wait = WebDriverWait(driver, 10)
        element = wait.until(EC.title_is((workout_name)))
        self.assertEqual(workout_name,
                         driver.find_element_by_tag_name('h1').text)

    # Test that gym grid card title links lead to the correct instance that displays the name properly
    def test_gym_grid_links(self):
        driver = self.driver
        self.assertIn("Let's Get Fit", driver.title)
        driver.find_element_by_link_text('Gyms').click()
        self.assertIn("CKC - Gyms", driver.title)
        gym_item = driver.find_element_by_class_name('card-title')
        gym_name = gym_item.text
        gym_item.click()
        wait = WebDriverWait(driver, 10)
        element = wait.until(EC.title_is((gym_name)))
        self.assertEqual(gym_name, driver.find_element_by_tag_name('h1').text)

    # Test that store grid card title links lead to the correct instance that displays the name properly
    def test_store_grid_links(self):
        driver = self.driver
        self.assertIn("Let's Get Fit", driver.title)
        driver.find_element_by_link_text('Stores').click()
        self.assertIn("CKC - Stores", driver.title)
        store_item = driver.find_element_by_class_name('card-title')
        store_name = store_item.text
        # for stores the card image is the clickable element
        store_item = driver.find_element_by_class_name('card-img-top')
        print(store_name)
        print(driver.title)
        store_item.click()
        wait = WebDriverWait(driver, 10)
        print(driver.title)
        element = wait.until(EC.title_is((store_name)))
        self.assertEqual(store_name,
                         driver.find_element_by_tag_name('h1').text)

    # Test that going back and forth in the navigation history doesn't break the website
    def test_navigation_history(self):
        driver = self.driver
        wait = WebDriverWait(driver, 10)
        driver.find_element_by_link_text('About').click()
        element = wait.until(EC.title_is(('About Us')))
        driver.find_element_by_link_text('Foods').click()
        element = wait.until(EC.title_is(('CKC - Foods')))
        driver.find_element_by_link_text('Stores').click()
        # Stores -> back(Foods) -> forward(Stores) -> back(Foods) -> back(About)
        driver.back()
        driver.forward()
        driver.back()
        driver.back()
        wait = WebDriverWait(driver, 10)
        element = wait.until(EC.title_is(('About Us')))
        self.assertEqual("About Calorie Killer Club",
                         driver.find_element_by_tag_name('h2').text)
def execute(
    self,
    webdriver: Firefox,
    browser_params: BrowserParams,
    manager_params: ManagerParams,
    extension_socket: ClientSocket,
) -> None:
    """Dump the Firefox profile to a tar archive — currently disabled.

    The command logs that profile dumping is unsupported and returns
    immediately; everything after the early `return` is the retained
    pre-refactor implementation and is unreachable.
    """
    logger.debug("BROWSER %i: Profile dumping is currently unsupported. "
                 "See: https://github.com/mozilla/OpenWPM/projects/2." %
                 browser_params.browser_id)
    return
    # ---- DEAD CODE below this point (early return above). ----
    # NOTE(review): `tar_location`, `compress` and `close_webdriver` are
    # unbound in this scope — presumably parameters of the old signature;
    # must be reintroduced before this code can be revived.
    browser_profile_folder = browser_params.profile_path
    # ensures that folder paths end with slashes
    if browser_profile_folder[-1] != "/":
        browser_profile_folder = browser_profile_folder + "/"
    if tar_location[-1] != "/":
        tar_location = tar_location + "/"
    if not os.path.exists(tar_location):
        os.makedirs(tar_location)
    if compress:
        tar_name = "profile.tar.gz"
    else:
        tar_name = "profile.tar"
    # see if this file exists first
    # if it does, delete it before we try to save the current session
    if os.path.isfile(tar_location + tar_name):
        os.remove(tar_location + tar_name)
    # if this is a dump on close, close the webdriver and wait for checkpoint
    if close_webdriver:
        webdriver.close()
        sleep_until_sqlite_checkpoint(browser_profile_folder)
    # backup and tar profile
    if compress:
        tar = tarfile.open(tar_location + tar_name, "w:gz", errorlevel=1)
    else:
        tar = tarfile.open(tar_location + tar_name, "w", errorlevel=1)
    logger.debug("BROWSER %i: Backing up full profile from %s to %s" % (
        browser_params.browser_id,
        browser_profile_folder,
        tar_location + tar_name,
    ))
    storage_vector_files = [
        "cookies.sqlite",  # cookies
        "cookies.sqlite-shm",
        "cookies.sqlite-wal",
        "places.sqlite",  # history
        "places.sqlite-shm",
        "places.sqlite-wal",
        "webappsstore.sqlite",  # localStorage
        "webappsstore.sqlite-shm",
        "webappsstore.sqlite-wal",
    ]
    storage_vector_dirs = [
        "webapps",  # related to localStorage?
        "storage",  # directory for IndexedDB
    ]
    for item in storage_vector_files:
        full_path = os.path.join(browser_profile_folder, item)
        if (not os.path.isfile(full_path) and full_path[-3:] != "shm"
                and full_path[-3:] != "wal"):
            logger.critical(
                "BROWSER %i: %s NOT FOUND IN profile folder, skipping." %
                (browser_params.browser_id, full_path))
        elif not os.path.isfile(full_path) and (full_path[-3:] == "shm"
                                                or full_path[-3:] == "wal"):
            continue  # These are just checkpoint files
        # NOTE(review): when the "NOT FOUND" branch fires, control still
        # falls through to tar.add() on a missing file, which would raise —
        # latent bug, moot while the early `return` stands.
        tar.add(full_path, arcname=item)
    for item in storage_vector_dirs:
        full_path = os.path.join(browser_profile_folder, item)
        if not os.path.isdir(full_path):
            logger.warning(
                "BROWSER %i: %s NOT FOUND IN profile folder, skipping." %
                (browser_params.browser_id, full_path))
            continue
        tar.add(full_path, arcname=item)
    tar.close()
return str(company_soup.find('span', { 'class': classItem }).text).strip() except: return None # In[62]: df = pd.read_excel('data.xlsx') counter = 0 for index, row in tqdm(df.iterrows()): if counter == 20: import ctypes ctypes.windll.user32.MessageBoxW(0, "IP blocked", "Error", 1) driver.close() drivertwo.close() break #print(index, row['Company Name']) url = "https://www.dnb.com/business-directory/company-search.html?term=" + row[ 'Company Name'] + "&page=1" driver.get(url) soup = BeautifulSoup(driver.page_source, 'html.parser') try: counter = 0 url = "https://www.dnb.com" + str( soup.find('div', { 'class': 'primary_name' }).find('a')['href']).strip() drivertwo.get(url) company_soup = BeautifulSoup(drivertwo.page_source, 'html.parser')
class GetCompanyInfo(object):
    """
    Scrape company information from tianyancha.com (Tianyancha).
    """
    def __init__(self):
        """
        Initialize the crawler; pages are fetched through headless Firefox.
        """
        self.username = ''
        self.password = ''
        self.options = webdriver.FirefoxOptions()
        self.options.add_argument('-headless')  # headless flag
        self.geckodriver = r'geckodriver'
        self.driver = Firefox(executable_path=self.geckodriver,
                              firefox_options=self.options)
        self.start_url = 'https://www.tianyancha.com'

    def test(self):
        """
        Debugging helper only.
        NOTE(review): relies on a module-level `cookies` mapping that is not
        defined in this chunk — confirm it exists before calling.
        :return:
        """
        start_url = ''
        self.driver.get(start_url)
        for k, v in cookies.items():
            self.driver.add_cookie({'name': k, 'value': v})
        time.sleep(1)
        print(self.driver.page_source)
        self.driver.close()

    def login(self):
        """
        Log in and verify the result by searching the page for the
        masked username pattern.
        :return:
        """
        try:
            self.driver.get(self.start_url)
            print(self.driver.get_cookies())
            username = self.index_login()
            # masked form of the phone number shown on the page when logged in
            username_pattern = username[:3] + ' **** ' + username[-4:]
            print(username_pattern)
            page = self.driver.page_source
            is_login = page.find(username_pattern)
            print(is_login)
            if is_login != -1:
                print('登录成功')
        except Exception as e:
            print(e)

    def index_login(self):
        """
        Login flow driven from the home page.
        :return: the username used for the login attempt
        """
        get_login = self.driver.find_elements_by_xpath(
            '//a[@class="media_port"]')[0]  # the "login/register" entry link
        print(get_login.text)
        # opens the login form
        get_login.click()
        login_by_pwd = self.driver.find_element_by_xpath(
            '//div[@class="bgContent"]/div[2]/div[2]/div')  # switch to phone login
        print(login_by_pwd.text)
        login_by_pwd.click()
        input1 = self.driver.find_element_by_xpath(
            '//div[@class="bgContent"]/div[2]/div/div[2]/input')  # phone number
        input2 = self.driver.find_element_by_xpath(
            '//div[@class="bgContent"]/div[2]/div/div[3]/input')  # password
        print(input1.get_attribute('placeholder'))
        print(input2.get_attribute('placeholder'))
        username, password = self._check_user_pass()
        input1.send_keys(username)
        input2.send_keys(password)
        login_button = self.driver.find_element_by_xpath(
            '//div[@class="bgContent"]/div[2]/div/div[5]')  # the login button
        print(login_button.text)
        time.sleep(1)  # must wait, otherwise the site flags us as a bot
        login_button.click()
        return username

    def _check_user_pass(self):
        """
        Return stored credentials, or prompt for them interactively.
        :return: (username, password)
        """
        if self.username and self.password:
            return self.username, self.password
        else:
            username = input('输入您的手机号码\n')
            password = input('输入您的密码\n')
            return username, password

    def login_page_login(self):
        """
        Login flow for the dedicated page www.tianyancha.com/login.
        NOTE(review): sends empty strings into both fields — looks like an
        unfinished variant; confirm before use.
        :return:
        """
        input1 = self.driver.find_element_by_xpath(
            '//div[contains(@class,"in-block")'
            ' and contains(@class, "vertical-top")'
            ' and contains(@class, "float-right")'
            ' and contains(@class, "right_content")'
            ' and contains(@class, "mt50")'
            ' and contains(@class, "mr5")'
            ' and contains(@class, "mb5")'
            ']/div[2]/div[2]/div[2]/input')
        input2 = self.driver.find_element_by_xpath(
            '//div[contains(@class,"in-block")'
            ' and contains(@class, "vertical-top")'
            ' and contains(@class, "float-right")'
            ' and contains(@class, "right_content")'
            ' and contains(@class, "mt50")'
            ' and contains(@class, "mr5")'
            ' and contains(@class, "mb5")'
            ']/div[2]/div[2]/div[3]/input')
        print(input1.get_attribute('placeholder'))
        input1.send_keys("")
        print(input2.get_attribute('placeholder'))
        input2.send_keys('')
        login_button = self.driver.find_element_by_xpath(
            '//div[contains(@class,"in-block")'
            ' and contains(@class, "vertical-top")'
            ' and contains(@class, "float-right")'
            ' and contains(@class, "right_content")'
            ' and contains(@class, "mt50")'
            ' and contains(@class, "mr5")'
            ' and contains(@class, "mb5")'
            ']/div[2]/div[2]/div[5]')
        print(login_button.text)
        time.sleep(1)
        login_button.click()

    def get_company_info(self, company_name, company_onwer):
        """
        Search for a company and collect the result rows.
        :param company_name: exact company name to match in the results
        :param company_onwer: unused here (kept for interface compatibility)
        :return: a status string ('爬取成功…' on success, '爬取失败' otherwise)
        """
        try:
            time.sleep(1)
            index_input_company = self.driver.find_element_by_xpath(
                '//input[@id="home-main-search"]')  # home-page search box
            index_input_company.send_keys(company_name)
            self.driver.find_element_by_xpath(
                '//div[contains(@class, "input-group-addon")'
                ' and contains(@class, "search_button")'
                ' and contains(@class, " white-btn")'
                ']').click()  # click search
            company_list = self.driver.find_elements_by_xpath(
                '//div[contains(@class, "b-c-white")'
                ' and contains(@class, "search_result_container")'
                ']/div')  # every company <div> on the results page
            company_info = list()
            for each_company in company_list:
                company_name_from_web = each_company.find_element_by_tag_name(
                    'img').get_attribute('alt')
                company_url = each_company.find_element_by_tag_name(
                    'a').get_attribute('href')
                company_reg_money = each_company.\
                    find_element_by_css_selector('div .search_row_new.pt20 div div:nth-child(2) span').text
                company_reg_time = each_company.\
                    find_element_by_css_selector('div .search_row_new.pt20 div div:nth-child(3) span').text
                company_score = each_company.find_element_by_css_selector(
                    '.c9.f20').text
                company_info.append([
                    company_name_from_web, company_url, company_reg_money,
                    company_reg_time, company_score + '分'
                ])
                # collected: name, URL, registered capital, registration
                # date, score
                print(company_info[-1])
            print('当前匹配公司数:', len(company_info))
            if company_info:
                for each_list in company_info:
                    if each_list[0] == company_name:
                        return '爬取成功: ' + str(each_list)
                return '爬取成功'
            else:
                return '爬取失败'
        except Exception as e:
            print(e)

    def main(self):
        """Run the whole flow: log in, scrape one company, close the driver."""
        self.login()
        msg = self.get_company_info('*****软件有限公司', '')
        print(msg)
        print('crawl finish...')
        self.driver.close()
def getBhavData():
    """Download the BSE bhav-copy zip, extract its CSV, and load each row
    into Redis (fields: SC_CODE, SC_NAME, OPEN, CLOSE, LOW, HIGH).

    Side effects only: downloads into ./bhavDownload, writes to a local
    Redis instance via RedisDb, then deletes the downloaded files. Returns
    None; failures are reported on stdout.
    """
    url = ("https://www.bseindia.com/markets/equity/EQReports/BhavCopyDebt.aspx"
           "?expandable=3&utm_campaign=website&utm_source=sendgrid.com&utm_medium=email")
    try:
        opts = Options()
        opts.set_headless()  # NOTE: deprecated in Selenium >= 3.8; `opts.headless = True` is the modern spelling
        browser = Firefox(
            executable_path="/home/prajwal/project/cherryApp/geckodriver",
            options=opts)
        browser.get(url)
    except Exception:
        # Bug fix: the original swallowed this failure and then crashed on an
        # unbound `browser` further down; bail out instead.
        print('failed to connect firefox')
        return
    try:
        # The download link lives inside an iframe; switch into it first.
        iframeElements = browser.find_elements_by_xpath(
            '/html/body/form/div[3]/div/div[3]/div[2]/div/div[2]/div/div/table/tbody/tr/td/iframe'
        )
        browser.switch_to.frame(iframeElements[0])
        html_page = browser.page_source  # page source of the iframe
        soup = BeautifulSoup(html_page, 'html.parser')
        link = soup.find('a', attrs={'id': 'btnhylZip'}, href=True)
        download_link = link['href']
        *_, file_name = download_link.split('/')  # file name from the download URL
        print(file_name)
        download_dir = os.path.join(os.getcwd(), 'bhavDownload')  # download directory
        os.makedirs(download_dir, exist_ok=True)  # creates directory if it does not exist
        # Download the zip from the extracted link.
        wget.download(download_link, out=download_dir)
        download_filename = os.path.join(download_dir, file_name)
        print(download_filename)
        # Unzip the download in place; `with` guarantees the handle is closed.
        with zipfile.ZipFile(download_filename, 'r') as zip_ref:
            zip_ref.extractall(download_dir)
        # Derive the CSV name: '<prefix>_…' zip extracts to '<prefix>.CSV'.
        csv_filename, *_ = file_name.split('_')
        csv_filename = '.'.join((csv_filename, 'CSV'))
        print(csv_filename)
        try:
            csv_filepath = os.path.join(download_dir, csv_filename)
            print(csv_filepath)
            with open(csv_filepath, 'r') as csvFile:
                print('file opened')
                reader = csv.DictReader(csvFile)
                print(reader)
                rdb = RedisDb('localhost', 'eqlist')
                conn = rdb.connect()
                print(conn)
                index_key = 'id'
                rdb.deleteEquityList(conn)  # start from a clean equity list
                for row in reader:
                    row = dict(row)
                    # Keep only the price columns we care about.
                    field = {k: row[k] for k in
                             ('SC_CODE', 'SC_NAME', 'OPEN', 'CLOSE', 'LOW', 'HIGH')}
                    field['HIGH'] = float(field['HIGH'])
                    field['LOW'] = float(field['LOW'])
                    value = rdb.getNewId(index_key, conn)
                    print(value)
                    rdb.setequityListindex(conn, value)
                    rdb.setequityHash(conn, value, field)
                    print(rdb.getequityHash(conn, value))
                print(rdb.getequityListindex(conn))
        except Exception:
            # Narrowed from a bare `except:`; CSV problems are best-effort.
            print('failed to read csv')
        time.sleep(5)
        print('downloaded file deleting started')
        try:
            # Clean up everything we downloaded/extracted.
            for file in os.listdir(download_dir):
                os.remove(os.path.join(download_dir, file))
        except OSError:
            pass
    except Exception as e:
        print('failed to get data:', e)  # was a silent typo'd message with no detail
    finally:
        browser.close()  # bug fix: the original leaked the browser on error
class WeixinSelenium(Base):
    # Crawls Sogou's Weixin search with a Firefox driver: for every query
    # word from MongoDB it pages through results, extracting (url, uid)
    # pairs and handing them to Article for further processing.
    # NOTE(review): relies on module-level names not visible here —
    # storage_word, in_collection, in_client, Article, START_PAGE, END_PAGE,
    # REFER_FIRST, HOST/PORT/DB/COLLECTION — and on `self.logger`, `self.trim`,
    # `self.md5` presumably provided by Base; confirm against the module.

    def __init__(self):
        self.start_page = START_PAGE
        self.end_page = END_PAGE
        self.weixin_url = REFER_FIRST
        self.driver = Firefox()
        self.client = MongoClient(HOST, PORT)
        self.collection = self.client[DB][COLLECTION]
        # Seed the de-duplication set with uids already stored.
        self.all_uids = self.uids

    def open_weixin_browser(self, word):
        """Open the Weixin search page, submit *word*, and extract page one.

        Returns True when anything failed (the word is pushed to
        storage_word and the browser is closed), False on success.
        """
        try:
            self.driver.get(self.weixin_url)
            self.driver.set_page_load_timeout(3)
            self.driver.find_element_by_id('upquery').send_keys(word)
            self.driver.find_element_by_class_name('swz').click()
            self.driver.implicitly_wait(3)
            urls_uids = self.extract_urls_uids(word=word)
            Article(urls_uids=urls_uids, word=word).extract()
        except Exception as e:
            # Record the failed word (page 0) so the crawl can resume later.
            storage_word.append([word, 0])
            self.logger.info('Open weixin error: type <{}>, mag <{}>'.format(e.__class__, e))
            self.close_browser()
            return True
        return False

    def get_total_pages_to_word(self):
        """Read the pagination bar and return the last page number seen.

        Returns the page number preceding the first non-digit token
        ("next page" link), 1 when every token is a digit, or None when the
        bar is missing or pages is empty (IndexError is swallowed below).
        """
        pages = []
        page_id_css = 'pagebar_container'
        try:
            e = self.driver.find_element_by_id(page_id_css)
            for _p in e.text.split():
                _p = _p.strip()
                if not _p.isdigit():
                    # First non-digit token marks the end of the page numbers.
                    return pages[-1]
                else:
                    pages.append(int(_p))
            return 1
        except (NoSuchElementException, NoSuchWindowException, TypeError, IndexError):
            pass

    def get_query_words(self):
        """Collect the de-duplicated query words ('conp' plus each 'rel'
        entry) from MongoDB, preserving _id order, then close the client."""
        query_words = []
        for docs in self.collection.find({}, {'rel': 1, 'conp': 1}).sort([('_id', 1)]):
            w = docs['conp']
            if w not in query_words:
                query_words.append(w)
            for item in docs['rel']:
                if item not in query_words:
                    query_words.append(item)
        self.client.close()
        return query_words

    @property
    def uids(self):
        # Set of uids already persisted.
        # NOTE(review): reads module-level `in_collection`, not
        # self.collection — confirm that is intentional.
        return {docs['uid'] for docs in in_collection.find({}, {'uid': 1}) if 'uid' in docs}

    def extract_urls_uids(self, word):
        """Pair each result link with an uid derived from its timestamp,
        title and *word*; return only pairs not seen before (side effect:
        new uids are added to self.all_uids)."""
        urls_uids = []
        timestamp = [_t.get_attribute('t') for _t in self.driver.find_elements_by_css_selector('div.s-p')]
        urls_tits = [(t.get_attribute('href'), self.trim(t.text)) for t in self.driver.find_elements_by_css_selector('h4 a')]
        # Timestamps and links must line up one-to-one, else bail out empty.
        if len(urls_tits) != len(timestamp):
            return urls_uids
        for index, url_tit in enumerate(urls_tits):
            try:
                uid = self.md5(timestamp[index] + url_tit[1] + word)
                if uid not in self.all_uids:
                    self.all_uids.add(uid)
                    urls_uids.append({'url': url_tit[0], 'uid': uid})
            except (TypeError, IndexError):
                # e.g. a missing 't' attribute (None) makes the concat fail.
                pass
        return urls_uids

    @staticmethod
    def query_index(words, cut_word):
        """Return cut_word's index in words, or 0 when absent."""
        try:
            index = words.index(cut_word)
            return index
        except ValueError:
            pass
        return 0

    @property
    def is_forbidden(self):
        # True when the captcha form is present, i.e. we've been blocked.
        css_id = 'seccodeForm'
        try:
            if self.driver.find_element_by_id(css_id):
                return True
        except NoSuchElementException:
            pass
        return False

    def appear_element(self, by):
        """Wait up to 20s for element id *by*, click it, return success."""
        try:
            # Have `click` function to specified element
            tag = WebDriverWait(self.driver, 20).until(lambda driver: driver.find_element_by_id(by))
            tag.click()
            return True
        except (TimeoutException, NoSuchWindowException, NoSuchElementException):
            pass
        return False

    def crawl(self, word=None, go=0):
        """Crawl every query word from *word* onwards, resuming at page *go*.

        :param word: word to resume from (index 0 when absent)
        :param go: page number to skip to on the first word
        """
        is_go = True
        is_break = False
        go_page = int(go)
        next_page_css = 'sogou_page_%s'
        query_words = self.get_query_words()
        ind = self.query_index(query_words, word)
        for index, word in enumerate(query_words[ind:], 1):
            next_ind = ind + index
            is_break = self.open_weixin_browser(word)
            pages = self.get_total_pages_to_word()
            for page in range(self.start_page + 1, (pages or self.end_page) + 1):
                # Skip pages below go_page only on the resumed word.
                if is_go and page < go_page:
                    continue
                else:
                    is_go = False
                if not self.appear_element(by=next_page_css % page):
                    is_break = True
                    msg = '\tNot appear next page element, will break, new open browser!'
                elif self.is_forbidden:
                    is_break = True
                    msg = '\tSpider was forbidden, crawling again after sleeping a moment!'
                if is_break:
                    # NOTE(review): when open_weixin_browser already set
                    # is_break, `msg` may be unbound here — potential
                    # NameError; confirm and guard upstream.
                    storage_word.append([word, page])
                    self.logger.info(msg)
                    break
                urls_uids = self.extract_urls_uids(word=word)
                Article(urls_uids=urls_uids, word=word).extract()
                # self.driver.find_element_by_id(next_page_css % page).click()
                # Randomized politeness delay; longer every 5th page.
                wt = randint(10, 40) if page % 5 == 0 else randint(5, 18)
                self.logger.info('Index <{}>, Word <{}>, Page <{}> Done, sleeping {}s!'.format(next_ind, word, page, wt))
                self.driver.implicitly_wait(wt)
            if is_break:
                break
        # NOTE(review): module-level `in_client` — confirm it is the Mongo
        # client backing `in_collection` above.
        in_client.close()
        self.close_browser()

    def close_browser(self):
        # Best-effort close: the window may already be gone.
        try:
            self.driver.close()
        except (NoSuchWindowException,):
            pass
class Scraper:
    """A simple scraper example using Selenium.

    Walks paginated ".provider-row" listings, appending one CSV line per
    provider (name, email, website, location) to outfile.csv, and follows
    the ".pager-next a" link until pagination fails.
    """

    def __init__(self, base_url, query_params):
        """Back up any previous results, start a headless browser (Chrome,
        falling back to Firefox), and open base_url + query_params."""
        self.__take_results_backup()
        options = Options()
        options.add_argument("--headless")
        try:
            self.driver = Chrome(options=options)
        except Exception as e:
            # Fall back to Firefox when the Chrome driver is unavailable.
            print(f'Error occured during Chrome driver : {e}')
            self.driver = Firefox()
        self.driver.get(base_url + query_params)
        # Set up the next-page element (also asserts the first page has one;
        # raises NoSuchElementException otherwise, as the original did).
        self.nextpage_element = self.driver.find_element_by_css_selector(
            ".pager-next a")

    def __take_results_backup(self):
        """Rename a leftover outfile.csv to a timestamped backup name."""
        if os.path.exists('outfile.csv'):
            # Bug fix: keep the .csv extension on the backup file (the
            # original produced an extensionless 'outfileMon_...' name).
            stamp = f'outfile{time.asctime().replace(":", "-").replace(" ", "_")}.csv'
            shutil.move('outfile.csv', stamp)

    def __save_info(self, lines):
        """Append collected text to outfile.csv.

        Accepts a single string or an iterable of strings. Bug fix: the
        original iterated its (string) argument and issued one write() per
        character; a single joined write produces the same bytes.
        """
        with open('outfile.csv', 'a') as f:
            f.write(''.join(lines))

    def nextpage(self, css_locator):
        """Click the pagination element matching *css_locator*."""
        self.driver.find_element_by_css_selector(css_locator).click()

    def scrape_page(self):
        """Extract every provider row on the current page into outfile.csv.

        Rows missing an expected element are skipped; any other failure is
        re-raised as ScrapePageError.
        """
        providers = self.driver.find_elements_by_css_selector(".provider-row")
        for provider in providers:
            try:
                name = provider.find_element_by_css_selector(
                    ".provider-base-info h3 a").text
                email = provider.find_element_by_css_selector(
                    ".provider-link-details .icon-mail+a").get_attribute(
                    'href').replace('mailto:', '')
                website = provider.find_element_by_css_selector(
                    ".provider-link-details .website-link a").get_attribute('href')
                location = provider.find_element_by_css_selector(
                    ".provider-info__details div.list-item:nth-of-type(4)").text
                # Commas inside free-text fields would break the CSV columns.
                lineitem = f'{name.replace(",","-")},{email},{website},{location.replace(",","-")}'
                # append the results
                self.__save_info(lineitem + "\n")
            except NoSuchElementException:
                # skip this row's information and continue scraping the page
                continue
            except Exception as e:
                # discontinue in case of unknown error
                raise ScrapePageError(f"Error occured during scrape page : {e}")

    def scrape(self):
        """Scrape page after page until the next-page click fails."""
        while True:
            print("scraping the website... ")
            try:
                self.scrape_page()
                self.nextpage(".pager-next a")
            except ScrapePageError as e:
                # Page-level failure: report it and still try to advance.
                print(e)
                self.nextpage(".pager-next a")
                continue
            except Exception as e:
                # Usually the next-page link is gone: we are done.
                print("Something went wrong: ", e)
                self.driver.close()
                break