def before(self):
    self.driver = init_driver()
    self.page_factory = PageFactory(self.driver)
    yield
    time.sleep(2)
    self.driver.quit()
def before_func(self):
    """Setup operation."""
    self.driver = init_driver()  # get the driver object
    self.page_factory = PageFactory(self.driver)
    yield
    time.sleep(2)
    self.driver.quit()
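# A minimal sketch of how a setup/teardown generator like before_func above is
# typically registered, assuming pytest: the @pytest.fixture decorator is not
# shown in the snippets here, so treat it as an assumption; init_driver and
# PageFactory are the project's own helpers, reused purely for illustration.
import time

import pytest


class TestWithFixture:

    @pytest.fixture(autouse=True)
    def before_func(self):
        """Setup: create the driver and page factory; teardown: wait and quit."""
        self.driver = init_driver()              # project helper (assumed available)
        self.page_factory = PageFactory(self.driver)
        yield                                    # the test body runs here
        time.sleep(2)
        self.driver.quit()

    def test_page_factory_available(self):
        # the autouse fixture has already populated self.driver / self.page_factory
        assert self.page_factory is not None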
def get_user_information(user, driver=None):
    """Get user information if the "from_account" argument is specified."""
    # only start a new driver if one was not passed in
    if driver is None:
        driver = utils.init_driver()
    log_user_page(user, driver)
    if user is not None:
        try:
            following = driver.find_element_by_xpath(
                '//a[contains(@href,"/' + user + '/following")]/span[1]/span[1]').text
            followers = driver.find_element_by_xpath(
                '//a[contains(@href,"/' + user + '/followers")]/span[1]/span[1]').text
        except Exception as e:
            print(e)
            return
        try:
            span1 = driver.find_element_by_xpath(
                '//div[contains(@data-testid,"UserProfileHeader_Items")]//span[1]').text
            span2 = driver.find_element_by_xpath(
                '//div[contains(@data-testid,"UserProfileHeader_Items")]//span[2]').text
            join_date = span2
            location = span1
        except Exception as e:
            # print(e)
            join_date = driver.find_element_by_xpath(
                '//div[contains(@data-testid,"UserProfileHeader_Items")]//span[1]').text
            location = ""
        try:
            element = driver.find_element_by_xpath(
                '//div[contains(@data-testid,"UserProfileHeader_Items")]//a[1]')
            website = element.get_attribute("href")
        except Exception as e:
            print(e)
            website = ""
        try:
            desc = driver.find_element_by_xpath(
                '//div[contains(@data-testid,"UserDescription")]').text
        except Exception as e:
            print(e)
            desc = ""
        return following, followers, join_date, location, website, desc
    else:
        print("You should specify the user.")
        return
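# Hedged usage sketch for the single-user get_user_information above. "jack" is
# a placeholder handle, not a value from the original project; the function
# returns a flat tuple on success and None when the profile cannot be read.
result = get_user_information("jack")
if result is not None:
    following, followers, join_date, location, website, desc = result
    print(followers, "followers /", following, "following")
    print("joined:", join_date, "| location:", location or "n/a")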
def scrap(start_date, max_date, words=None, to_account=None, from_account=None, interval=5,
          navig="chrome", lang=None, headless=True, limit=float("inf"), display_type="Top",
          resume=False, proxy=None, hashtag=None):
    """
    Scrape tweets with a Selenium driver, starting from start_date until max_date.
    The bot runs one search per <interval>-day window (start_date to end_date)
    until it reaches max_date.

    return:
    data : list of all scraped tweets with their associated features.
    Also saves a csv file containing all scraped tweets with their associated features.
    """

    # initiate the driver
    driver = init_driver(navig, headless, proxy)
    data = []
    tweet_ids = set()
    save_dir = "outputs"
    write_mode = 'w'
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    # start scraping from start_date until max_date
    init_date = start_date  # used for naming the output file

    # build the output path from the search criteria
    if words:
        path = save_dir + "/" + words.split("//")[0] + '_' + str(init_date).split(' ')[0] + '_' + \
               str(max_date).split(' ')[0] + '.csv'
    elif from_account:
        path = save_dir + "/" + from_account + '_' + str(init_date).split(' ')[0] + '_' + \
               str(max_date).split(' ')[0] + '.csv'
    elif to_account:
        path = save_dir + "/" + to_account + '_' + str(init_date).split(' ')[0] + '_' + \
               str(max_date).split(' ')[0] + '.csv'
    elif hashtag:
        path = save_dir + "/" + hashtag + '_' + str(init_date).split(' ')[0] + '_' + \
               str(max_date).split(' ')[0] + '.csv'

    if resume:
        start_date = str(get_last_date_from_csv(path))[:10]
        write_mode = 'a'

    # add interval to start_date to get end_date for the first search
    end_date = datetime.datetime.strptime(start_date, '%Y-%m-%d') + datetime.timedelta(days=interval)

    refresh = 0

    # keep searching until max_date
    with open(path, write_mode, newline='', encoding='utf-8') as f:
        header = ['UserScreenName', 'UserName', 'Timestamp', 'Text', 'Emojis',
                  'Comments', 'Likes', 'Retweets', 'Image link', 'Tweet URL']
        writer = csv.writer(f)
        if write_mode == 'w':
            writer.writerow(header)
        while end_date <= datetime.datetime.strptime(max_date, '%Y-%m-%d'):
            # number of scrolls
            scroll = 0

            # log the search page between start_date and end_date
            if type(start_date) != str:
                log_search_page(driver=driver, words=words,
                                start_date=datetime.datetime.strftime(start_date, '%Y-%m-%d'),
                                end_date=datetime.datetime.strftime(end_date, '%Y-%m-%d'),
                                to_account=to_account, from_account=from_account,
                                lang=lang, display_type=display_type, hashtag=hashtag)
            else:
                log_search_page(driver=driver, words=words, start_date=start_date,
                                end_date=datetime.datetime.strftime(end_date, '%Y-%m-%d'),
                                to_account=to_account, from_account=from_account,
                                hashtag=hashtag, lang=lang, display_type=display_type)

            # number of logged pages (one per <interval>-day window)
            refresh += 1
            # number of days crossed so far
            days_passed = refresh * interval

            # last position of the page: used to detect whether we reached the end of the
            # page, so we can move on to the next <start_date> / <end_date> window
            last_position = driver.execute_script("return window.pageYOffset;")

            # should we keep scrolling?
            scrolling = True
            print("looking for tweets between " + str(start_date) + " and " + str(end_date) + " ...")

            # start scrolling and collecting tweets
            tweet_parsed = 0
            driver, data, writer, tweet_ids, scrolling, tweet_parsed, scroll, last_position = \
                keep_scroling(driver, data, writer, tweet_ids, scrolling, tweet_parsed,
                              limit, scroll, last_position)

            # keep updating <start_date> and <end_date> for every search window
            if type(start_date) == str:
                start_date = datetime.datetime.strptime(start_date, '%Y-%m-%d') + \
                             datetime.timedelta(days=interval)
            else:
                start_date = start_date + datetime.timedelta(days=interval)
            end_date = end_date + datetime.timedelta(days=interval)

    # close the web driver
    driver.close()

    return data
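# Hedged usage sketch for scrap() above. Dates are 'YYYY-MM-DD' strings, and the
# search term, date range and limit are placeholders rather than values from the
# original project; the CSV is written under outputs/ by the function itself.
tweets = scrap(start_date="2021-01-01",
               max_date="2021-01-15",
               words="data science",
               interval=5,
               headless=True,
               display_type="Top",
               limit=100)
print(len(tweets), "tweets collected")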
def test_func(self):
    self.driver = init_driver()  # driver instance for the device
    self.factor_page = FactoryPage(self.driver)  # instantiate the page factory class
    yield
    time.sleep(3)
    self.driver.quit()
args = parser.parse_args()
conf = load_config(args.conf)
parameters = conf["parameters"]
credentials = conf["credentials"]

CHROME_PATH = parameters["CHROME_PATH"]
CHROMEDRIVER_PATH = parameters["CHROMEDRIVER_PATH"]
QUERIES = parameters["JOB_QUERIES"]
LINUSERNAME = credentials["LINUSERNAME"]
LINPWD = credentials["LINPWD"]
MONGOUSER = credentials["MONGOUSER"]
MONGOPWD = credentials["MONGOPWD"]
HOST = parameters["HOST"]

client = connect_mongo(HOST, MONGOUSER, MONGOPWD)
db = client["linkedin"]
jobs = db["jobs"]

driver = init_driver(CHROME_PATH, CHROMEDRIVER_PATH)
driver.get("https://www.linkedin.com")
login(driver, LINUSERNAME, LINPWD)

JOB_SEARCH_URL = "https://www.linkedin.com/jobs/search/?keywords="

for query in QUERIES:
    driver.get(JOB_SEARCH_URL + query)
    sleep(0.5)
    scroll_job_panel(driver)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    n_results_element = soup.find(class_="t-12 t-black--light t-normal")
    n_results_string = n_results_element.get_text()
    n_results = int(n_results_string.split()[0].replace(',', ''))
    job_urls = get_job_urls(soup)
    start = 25
    url = JOB_SEARCH_URL + query + "&start=" + str(start)
    while start < n_results:
def test_driver():
    driver = utils.init_driver(browser='chrome', debug=True)
    assert True
def get_user_information(users, driver=None, headless=True):
    """Get user information if the "from_account" argument is specified."""
    # only start a new driver if one was not passed in
    if driver is None:
        driver = utils.init_driver(headless=headless)
    users_info = {}

    for i, user in enumerate(users):
        log_user_page(user, driver)
        if user is not None:
            try:
                following = driver.find_element_by_xpath(
                    '//a[contains(@href,"/following")]/span[1]/span[1]').text
                followers = driver.find_element_by_xpath(
                    '//a[contains(@href,"/followers")]/span[1]/span[1]').text
            except Exception as e:
                # print(e)
                return
            try:
                element = driver.find_element_by_xpath(
                    '//div[contains(@data-testid,"UserProfileHeader_Items")]//a[1]')
                website = element.get_attribute("href")
            except Exception as e:
                # print(e)
                website = ""
            try:
                desc = driver.find_element_by_xpath(
                    '//div[contains(@data-testid,"UserDescription")]').text
            except Exception as e:
                # print(e)
                desc = ""

            # the number of header items varies, so fall back span by span
            try:
                join_date = driver.find_element_by_xpath(
                    '//div[contains(@data-testid,"UserProfileHeader_Items")]/span[3]').text
                birthday = driver.find_element_by_xpath(
                    '//div[contains(@data-testid,"UserProfileHeader_Items")]/span[2]').text
                location = driver.find_element_by_xpath(
                    '//div[contains(@data-testid,"UserProfileHeader_Items")]/span[1]').text
            except Exception as e:
                # print(e)
                try:
                    join_date = driver.find_element_by_xpath(
                        '//div[contains(@data-testid,"UserProfileHeader_Items")]/span[2]').text
                    span1 = driver.find_element_by_xpath(
                        '//div[contains(@data-testid,"UserProfileHeader_Items")]/span[1]').text
                    if hasNumbers(span1):
                        birthday = span1
                        location = ""
                    else:
                        location = span1
                        birthday = ""
                except Exception as e:
                    # print(e)
                    try:
                        join_date = driver.find_element_by_xpath(
                            '//div[contains(@data-testid,"UserProfileHeader_Items")]/span[1]').text
                        birthday = ""
                        location = ""
                    except Exception as e:
                        # print(e)
                        join_date = ""
                        birthday = ""
                        location = ""

            print("--------------- " + user + " information : ---------------")
            print("Following : ", following)
            print("Followers : ", followers)
            print("Location : ", location)
            print("Join date : ", join_date)
            print("Birth date : ", birthday)
            print("Description : ", desc)
            print("Website : ", website)
            users_info[user] = [following, followers, join_date, birthday, location, website, desc]

            if i == len(users) - 1:
                driver.close()
                return users_info
        else:
            print("You must specify the user")
            continue
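# Hedged usage sketch for the multi-user get_user_information above: it takes a
# list of handles (placeholders here) and returns a dict keyed by handle, each
# value being [following, followers, join_date, birthday, location, website, desc].
users_info = get_user_information(["user_a", "user_b"], headless=True)
if users_info:
    for handle, (following, followers, join_date, birthday,
                 location, website, desc) in users_info.items():
        print(handle, "->", followers, "followers,", location or "no location")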
"Poland Visa Application Center-Mogilev", "applicatest": [{ "name": "Bahdan", "last_name": "Kazlouski", "gender": "Male", "date_born": "***********", "nationality": "BELARUS", "passport_number": "*********", "expare_date": "*************", "code_phone": "44", "phone_number": "************", "email": "*******************" }] } driver = init_driver(logger, options) centre, category, sub_category = get_centre_category_sub_category(driver) count = 1 while True: try: logger.warning(f"try to booking {count=}") sleep(randint(3, 5) + random()) centre.send_keys(options['center']) sleep(randint(3, 5) + random()) category.send_keys(options['category']) sleep(randint(5, 8) + random()) sub_category.send_keys(options['sub_category']) sleep(random())