def start_driver(self, key, msg=None, captcha=False):
    if msg:
        self.logger.info(f"{msg} ffprofile[{key}] driver init")
    opts = Options()
    opts.headless = not captcha  # show the browser only when a captcha must be solved
    prof_ff = FirefoxProfile(self._FF_PROF[key])
    driver = Firefox(options=opts, firefox_profile=prof_ff)
    # driver.maximize_window()
    time.sleep(5)
    try:
        if not captcha:
            driver.install_addon(
                "/Users/antoniotorres/projects/comic_getter/myaddon/web-ext-artifacts/myaddon-1.0.zip",
                temporary=True)
            driver.uninstall_addon('@VPNetworksLLC')
    except Exception as e:
        self.logger.warning(f"[start_driver][{key}] {e}")
    _title = driver.title
    driver.get("https://readcomiconline.li")
    WebDriverWait(driver, 30).until_not(ec.title_is(_title))
    # driver.add_cookie({'name': 'rco_quality', 'value': 'hq', 'domain': 'readcomiconline.li', 'path': '/'})
    driver.add_cookie({'name': 'rco_readType', 'value': '1',
                       'domain': 'readcomiconline.li', 'path': '/'})
    driver.refresh()
    self.logger.debug(f"driver[{key}] cookies: {driver.get_cookies()}")
    return driver
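# The snippet above relies on a general Selenium rule worth calling out:
# add_cookie() only accepts cookies for the domain the driver currently has
# loaded, so the pattern is always get() -> add_cookie() -> refresh().
# A minimal sketch of just that pattern (cookie values mirror the snippet):
from selenium.webdriver import Firefox

driver = Firefox()
driver.get("https://readcomiconline.li")   # land on the domain first
driver.add_cookie({'name': 'rco_readType', 'value': '1',
                   'domain': 'readcomiconline.li', 'path': '/'})
driver.refresh()                           # reload so the site sees the cookie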
def load_cookies(victim, driver: webdriver.Firefox):
    sessions = len(victim)
    selection = next(iter(victim.keys()))
    if sessions > 1:
        selection = load_selection_screen(sessions, victim)
    if selection == "s":
        print("[+] Skipping...")
        return False
    for domain in victim[selection]:
        for cookie in domain:
            cookie_obj = {
                "name": cookie.name,
                "value": cookie.value,
                "domain": cookie.domain,
            }
            driver.add_cookie(cookie_obj)
    print("[+] Cookies loaded successfully")
def load_cookie(self, session_id):
    sessions = json.load(open(self.sessions_file))
    cookie_path = sessions[str(session_id)]["session_path"]
    url = sessions[str(session_id)]["web_url"]
    # Set the user agent to the same one the session was saved with
    useragent = sessions[str(session_id)]["useragent"]
    profile = FirefoxProfile()
    profile.set_preference("general.useragent.override", useragent)
    cookies = pickle.load(open(cookie_path, "rb"))
    try:
        browser = Firefox(profile)
    except Exception:
        error("Couldn't open browser to view session!")
        return
    browser.get(url)
    browser.delete_all_cookies()
    browser.execute_script("window.localStorage.clear()")  # clear the current localStorage
    for cookie in cookies:
        browser.add_cookie(cookie)
    status(f"Session {session_id} loaded")
    browser.refresh()
    self.browsers.append(browser)
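# load_cookie() above reads cookies pickled by a save step that is not shown;
# a minimal sketch of that counterpart (the helper name and its arguments are
# assumptions, not from the original):
import pickle

def save_cookie(browser, cookie_path):
    # Dump the live driver's cookies so load_cookie() can restore them later
    with open(cookie_path, "wb") as f:
        pickle.dump(browser.get_cookies(), f)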
class AutoPostBot:
    def __init__(self, pagelink):
        self.display = Display(visible=0, size=(800, 600))
        self.display.start()
        self.driver = Firefox(
            executable_path='/home/friend/TF2alertbot/geckodriver')
        self.driver.get('https://mbasic.facebook.com')
        self.cookies = pickle.load(open("cookies.pl", "rb"))
        for cookie in self.cookies:
            self.driver.add_cookie(cookie)
        self.driver.get(pagelink)

    def post_image(self, file_path, message):
        time.sleep(1)
        self.driver.find_element_by_name('view_photo').click()
        self.driver.find_element_by_name('file1').send_keys(file_path)
        self.driver.find_element_by_name('add_photo_done').click()
        time.sleep(1)
        for msg in message:
            self.driver.find_element_by_name('xc_message').send_keys(msg, Keys.ENTER)
        self.driver.find_element_by_name('view_post').click()
        self.driver.quit()
        self.display.stop()
def process_request(self, request, spider):
    # Called for each request that goes through the downloader middleware.
    # Must either:
    # - return None: continue processing this request
    # - or return a Response object
    # - or return a Request object
    # - or raise IgnoreRequest: process_exception() methods of
    #   installed downloader middleware will be called
    ua = UserAgent().random
    request.headers.setdefault('User-Agent', ua)

    # chrome_options = Options()
    # chrome_options.add_argument('--headless')  # run Chrome in headless mode
    # chrome_options.add_argument('--disable-gpu')
    # chrome_options.add_argument('--no-sandbox')
    # # Explicit path to the Chrome driver binary
    # self.driver = webdriver.Chrome(chrome_options=chrome_options,
    #                                executable_path='D://BDCloundDown//chromedriver')
    # print("request.url:", request.url)
    # print("request.method:", request.method)
    # if request.method != 'POST':
    #     self.driver.get(request.url)
    #     time.sleep(1)
    #     html = self.driver.page_source
    #     self.driver.quit()
    #     return scrapy.http.HtmlResponse(url=request.url, body=html.encode('utf-8'),
    #                                     encoding='utf-8', request=request)
    # from_data = {
    #     'UserNameInput': 'LarvwuAX3KTyBFXtXkCcjcFRLzpSb/Ft6P1r29CxZcOlpn9Le8Q+LCQ3iTeXnW2ZdCKJsmA0tOyn4wF4C92vjs1Tg11lGxaroeAgGmSgZvBqyLQha2UNOM/MDHMroF1m9W5j92oe2jg2QPS4rTsCVRsnMcZCCd3y2iY/2/PBtx0=',
    #     'undefined': '0',
    #     'PasswordInput': ' dAaIbU2BmGOtFUVYm/gEM5yaZojqmtjifUJP2N+gkamNFyBqwec5ETXZFcji8orszLywEZPaJ1fQHOvZidQKhWLNtKDqBObcbrXlwgsQuX7ePqYBtP6qAc5JIQ/tfPcPYT6S0s4cCdAWGzyitt/L0jqf27XCael00UjFFLDswAU=',
    #     'IsKeepLoginState': 'true',
    #     'loginType': 'PC',
    # }
    # return scrapy.FormRequest(url='https://passport.ch.com/zh_cn/Login/DoLogin',
    #                           formdata=from_data,
    #                           callback=ChunqiuSpider.islogin)

    if spider.name == 'chunqiu':
        if 'AirFlights/ActivitiesSecondKill' in request.url:
            options = Options()
            options.add_argument('-headless')
            # geckodriver has to be downloaded manually
            driver = Firefox(executable_path='C:\\Users\\win7\\Desktop\\temp\\geckodriver',
                             firefox_options=options)
            # driver.manage().addCookie(new Cookie("Name", "value", "domain", "/", expiry, false, false));
            driver.get(request.url)
            time.sleep(3)
            # driver.add_cookie(
            #     self.ReadTxtName('./cookie.txt')
            # )
            driver.add_cookie({
                'name': 'limeimei',
                'value': '222',
                'domain': 'ch.com'
            })
            driver.add_cookie({
                'name': 'limeimei',
                'value': '222',
                'domain': 'pages.ch.com'
            })
            print(driver.get_cookies())
            time.sleep(30)
            driver.get(request.url)
            print(driver.get_cookies())
            # searchText = driver.find_element_by_xpath('//div[@class="threadlist_title pull_left j_th_tit "]/a')[0].text
            # search_results = WebDriverWait(driver, 10).until(
            #     lambda e: e.find_elements_by_xpath('//div[contains(@class,"threadlist_title pull_left j_th_tit ")]/a[1]')
            # )
            # for item in search_results:
            #     print(item.text)  # print the titles of the search results
            html = driver.page_source
            driver.quit()
            # Build the response and hand it back to the spider engine
            return HtmlResponse(url=request.url, body=html, request=request,
                                encoding='utf-8')
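# A middleware like process_request() above only runs if it is enabled in the
# Scrapy project's settings.py; the module path, class name, and priority
# below are placeholders, not from the original project:
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.SeleniumMiddleware': 543,
}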
class Instagram():
    def __init__(self):
        self.opts = Options()
        self.opts.set_headless()
        assert self.opts.headless
        self.browser = Firefox(options=self.opts)
        self.url = "https://instagram.com"
        self.username = ""
        self.password = ""
        self.collected_info = {
            'username': [], 'biography': [], 'external_url': [],
            'followers': [], 'followed': [], 'full_name': [],
            'profile_pic': [], 'fbid': [], 'is_private': [],
        }
        self.followers_c = 0
        self.followers_list = []
        self.all_followers = []
        self.followers_count = 0
        self.following_count = 0
        self.following_c = 0
        self.following_list = []
        self.all_following = []
        self.variables = {"first": 50, "id": 0}
        self.graphql_endpoint = "view-source:https://www.instagram.com/graphql/query/"
        self.graphql_followers = (
            self.graphql_endpoint + "?query_hash=37479f2b8209594dde7facb0d904896a")
        self.graphql_following = (
            self.graphql_endpoint + "?query_hash=58712303d941c6855d4e888c5f0cd22f")
        self.login()

    def login(self):
        print("[INFO] Authorization")
        self.browser.get(self.url)
        try:
            for cookie in pickle.load(open("Session.pkl", "rb")):
                self.browser.add_cookie(cookie)
            print("[INFO] Session has been restored")
        except Exception:
            print("[WARNING] Session can't be restored")
            # The two input() prompts and the first explicit wait were redacted
            # in the source; reconstructed from the matching password wait below.
            self.username = input("Please enter username: ")
            self.password = input("Please enter password: ")
            username = WebDriverWait(self.browser, 10).until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, "input[name='username']")))
            password = WebDriverWait(self.browser, 10).until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, "input[name='password']")))
            try:
                try:
                    username.clear()
                    username.send_keys(self.username)
                    print("[INFO] Username successfully entered")
                except Exception:
                    print("[ERROR] Username error")
                try:
                    password.clear()
                    password.send_keys(self.password)
                    print("[INFO] Password successfully entered")
                except Exception:
                    print("[ERROR] Password error")
                try:
                    WebDriverWait(self.browser, 2).until(
                        EC.element_to_be_clickable(
                            (By.CSS_SELECTOR, "button[type='submit']"))).click()
                    print("[INFO] Successfully logged in")
                except Exception:
                    print("[ERROR] Login error")
                try:
                    WebDriverWait(self.browser, 10).until(
                        EC.element_to_be_clickable((
                            By.XPATH,
                            "//button[contains(text(), 'Not Now')]"))).click()
                    print("[INFO] Login info not saved")
                except Exception:
                    print("[ERROR] Login info popup error")
                try:
                    WebDriverWait(self.browser, 10).until(
                        EC.element_to_be_clickable((
                            By.XPATH,
                            "//button[contains(text(), 'Not Now')]"))).click()
                    print("[INFO] Notifications not turned on")
                except Exception:
                    print("[ERROR] Notifications popup error")
                try:
                    pickle.dump(self.browser.get_cookies(),
                                open("Session.pkl", "wb"))
                    print("[INFO] Session has been saved")
                except Exception:
                    print("[WARNING] Can't save session")
            except NoSuchElementException:
                print("[WARNING] Wrong username or password")

    def get_id(self):
        try:
            user_id = self.browser.execute_script(
                "return window.__additionalData[Object.keys(window.__additionalData)[0]].data.graphql.user.id")
        except WebDriverException:
            user_id = self.browser.execute_script(
                "return window._sharedData.entry_data.ProfilePage[0].graphql.user.id")
        return user_id

    def progress(self, count, total, status=''):
        bar_len = 60
        filled_len = int(round(bar_len * count / float(total)))
        percents = round(100.0 * count / float(total), 1)
        bar = '=' * filled_len + '-' * (bar_len - filled_len)
        sys.stdout.write('[%s] %s%s Collecting %s\r' % (bar, percents, '%', status))
        sys.stdout.flush()

    def collect_info(self, username):
        print("[INFO] Collecting information")
        try:
            self.browser.get("view-source:https://www.instagram.com/" +
                             username + "/?__a=1")
        except Exception:
            print("[ERROR] Information error with open page")
        try:
            pre = self.browser.find_element_by_tag_name("pre").text
        except Exception:
            print("[ERROR] with Information find element pre")
        try:
            self.data = json.loads(pre)["graphql"]
        except Exception:
            print("[ERROR]", 'graphql')
        self.collected_info['username'] = username
        # Each field is pulled from the parsed JSON independently, so one
        # missing key does not abort the rest.
        paths = {
            'biography': ("user", "biography"),
            'external_url': ("user", "external_url"),
            'followers': ("user", "edge_followed_by", "count"),
            'followed': ("user", "edge_follow", "count"),
            'full_name': ("user", "full_name"),
            'profile_pic': ("user", "profile_pic_url_hd"),
            'fbid': ("user", "fbid"),
            'is_private': ("user", "is_private"),
        }
        for key, path in paths.items():
            try:
                value = self.data
                for part in path:
                    value = value[part]
                self.collected_info[key] = value
            except Exception:
                print("[ERROR]", key)
        collected_info = self.collected_info
        self.collected_info = []
        # print(collected_info)
        return collected_info

    def next_page_followers(self, id):
        try:
            time.sleep(5)
            pre = self.browser.find_element_by_tag_name("pre").text
            self.data = json.loads(pre)["data"]
            page_info = self.data["user"]["edge_followed_by"]["page_info"]
            self.variables["id"] = id
            self.variables["after"] = page_info["end_cursor"]
            url = "{}&variables={}".format(self.graphql_followers,
                                           str(json.dumps(self.variables)))
            self.browser.get(url)
        except Exception:
            print('[ERROR] With open next page followers')

    def get_followers(self):
        try:
            pre = self.browser.find_element_by_tag_name("pre").text
            data = json.loads(pre)["data"]
            page_info = data["user"]["edge_followed_by"]["page_info"]
            edges = data["user"]["edge_followed_by"]["edges"]
            self.followers_count = data["user"]["edge_followed_by"]["count"]
            all_followers = []
            for user in edges:
                self.followers_c += 1
                all_followers.append(user["node"]["username"])
            return all_followers
        except Exception:
            print('[ERROR] With get followers')

    def collect_followers(self, username):
        try:
            self.browser.get(self.url + '/' + username)
        except Exception:
            print("[ERROR] With opening page of", username)
        try:
            followers_list = []
            all_followers = []
            self.variables["id"] = self.get_id()
            self.i = 0
            self.sc_rolled = 0
            self.variables["first"] = 50
        except Exception:
            print("[ERROR] with set vars of followers")
        print("[INFO] Collecting followers of", username)
        try:
            followers_url = "{}&variables={}".format(
                self.graphql_followers, str(json.dumps(self.variables)))
            self.browser.get(followers_url)
        except Exception:
            print("[ERROR] With opening followers page")
        pre = self.browser.find_element_by_tag_name("pre").text
        self.data = json.loads(pre)["data"]
        self.followers_count = self.data["user"]["edge_followed_by"]["count"]
        self.flag = True
        i = 1
        try:
            print("[INFO] Followers count:", self.followers_count)
            while self.flag:
                users = self.get_followers()
                self.next_page = self.data["user"]["edge_followed_by"][
                    "page_info"]["has_next_page"]
                followers_list.append(users)
                self.progress(self.followers_c, self.followers_count,
                              "Followers, page: " + str(i))
                pages = self.followers_count / self.variables["first"]
                if i < pages:
                    if self.sc_rolled > 10:
                        print("[INFO] Queried too much! ~ sleeping a bit :>")
                        time.sleep(60)
                        self.sc_rolled = 0
                    self.sc_rolled += 1
                    i += 1
                    self.next_page_followers(self.variables["id"])
                else:
                    self.flag = False
        except Exception:
            print("[ERROR] With np")
        print("[INFO] followers count:", self.followers_count)
        print("[INFO] followers saved:", self.followers_c)
        if self.followers_count != self.followers_c:  # compare against the followers counter
            print("[WARNING] Not all followers have been saved")
        for foll in followers_list:
            for f in foll:
                all_followers.append(f)
        self.followers_c = 0
        return all_followers

    def next_page_following(self, id):
        try:
            time.sleep(5)
            pre = self.browser.find_element_by_tag_name("pre").text
            self.data = json.loads(pre)["data"]
            page_info = self.data["user"]["edge_follow"]["page_info"]
            self.variables["id"] = id
            self.variables["after"] = page_info["end_cursor"]
            url = "{}&variables={}".format(self.graphql_following,
                                           str(json.dumps(self.variables)))
            self.browser.get(url)
        except Exception:
            print('[ERROR] With next page following')

    def get_following(self):
        try:
            pre = self.browser.find_element_by_tag_name("pre").text
            data = json.loads(pre)["data"]
            page_info = data["user"]["edge_follow"]["page_info"]
            edges = data["user"]["edge_follow"]["edges"]
            self.following_count = data["user"]["edge_follow"]["count"]
            all_following = []
            for user in edges:
                self.following_c += 1
                all_following.append(user["node"]["username"])
            return all_following
        except Exception:
            print('[ERROR] With get following')

    def collect_following(self, username):
        try:
            self.browser.get(self.url + '/' + username)
        except Exception:
            print("[ERROR] With opening page of", username)
        try:
            following_list = []
            all_following = []
            self.variables["id"] = self.get_id()
            self.i = 0
            self.sc_rolled = 0
        except Exception:
            print("[ERROR] With following vars")
        print("[INFO] Collecting following of", username)
        try:
            following_url = "{}&variables={}".format(
                self.graphql_following, str(json.dumps(self.variables)))
            self.browser.get(following_url)
        except Exception:
            print("[ERROR] with opening following page")
        pre = self.browser.find_element_by_tag_name("pre").text
        self.data = json.loads(pre)["data"]
        self.following_count = self.data["user"]["edge_follow"]["count"]
        self.flag = True
        i = 1
        try:
            print("[INFO] Following by:", self.following_count)
            while self.flag:
                users = self.get_following()
                self.next_page = self.data["user"]["edge_follow"]["page_info"][
                    "has_next_page"]
                following_list.append(users)
                self.progress(self.following_c, self.following_count,
                              "Followings, page: " + str(i))
                pages = self.following_count / self.variables["first"]
                if i < pages:
                    if self.sc_rolled > 10:
                        print("[INFO] Queried too much! ~ sleeping a bit :>")
                        time.sleep(60)
                        self.sc_rolled = 0
                    self.sc_rolled += 1
                    i += 1
                    self.next_page_following(self.variables["id"])
                else:
                    self.flag = False
        except Exception:
            print("[ERROR] with np following")
        print("[INFO] following count:", self.following_count)
        print("[INFO] following saved:", self.following_c)
        if self.following_count != self.following_c:
            print("[WARNING] Not all following have been saved")
        for foll in following_list:
            for f in foll:
                all_following.append(f)
        self.following_c = 0
        return all_following
class GetCompanyInfo(object):
    """
    Scrape company information from Tianyancha.
    """
    def __init__(self):
        """
        Initialize the scraper; uses Firefox for access.
        """
        self.username = ''
        self.password = ''
        self.options = webdriver.FirefoxOptions()
        self.options.add_argument('-headless')  # headless flag
        self.geckodriver = r'geckodriver'
        self.driver = Firefox(executable_path=self.geckodriver,
                              firefox_options=self.options)
        self.start_url = 'https://www.tianyancha.com'

    def test(self):
        """
        For debugging only.
        :return:
        """
        start_url = ''
        self.driver.get(start_url)
        for k, v in cookies.items():
            self.driver.add_cookie({'name': k, 'value': v})
        time.sleep(1)
        print(self.driver.page_source)
        self.driver.close()

    def login(self):
        """
        Log in and check the login status.
        :return:
        """
        try:
            self.driver.get(self.start_url)
            print(self.driver.get_cookies())
            username = self.index_login()
            username_pattern = username[:3] + ' **** ' + username[-4:]
            print(username_pattern)
            page = self.driver.page_source
            is_login = page.find(username_pattern)
            print(is_login)
            if is_login != -1:
                print('Login succeeded')
        except Exception as e:
            print(e)

    def index_login(self):
        """
        Login flow starting from the home page.
        :return:
        """
        get_login = self.driver.find_elements_by_xpath(
            '//a[@class="media_port"]')[0]  # the "log in / sign up" link
        print(get_login.text)
        get_login.click()
        login_by_pwd = self.driver.find_element_by_xpath(
            '//div[@class="bgContent"]/div[2]/div[2]/div')  # switch to phone login
        print(login_by_pwd.text)
        login_by_pwd.click()
        input1 = self.driver.find_element_by_xpath(
            '//div[@class="bgContent"]/div[2]/div/div[2]/input')  # phone number
        input2 = self.driver.find_element_by_xpath(
            '//div[@class="bgContent"]/div[2]/div/div[3]/input')  # password
        print(input1.get_attribute('placeholder'))
        print(input2.get_attribute('placeholder'))
        username, password = self._check_user_pass()
        input1.send_keys(username)
        input2.send_keys(password)
        login_button = self.driver.find_element_by_xpath(
            '//div[@class="bgContent"]/div[2]/div/div[5]')  # the login button
        print(login_button.text)
        time.sleep(1)  # must wait, or the site flags us as a bot
        login_button.click()
        return username

    def _check_user_pass(self):
        """
        Check whether a username and password are configured.
        :return:
        """
        if self.username and self.password:
            return self.username, self.password
        else:
            username = input('Enter your phone number\n')
            password = input('Enter your password\n')
            return username, password

    def login_page_login(self):
        """
        Login flow for the url www.tianyancha.com/login.
        :return:
        """
        input1 = self.driver.find_element_by_xpath(
            '//div[contains(@class,"in-block")'
            ' and contains(@class, "vertical-top")'
            ' and contains(@class, "float-right")'
            ' and contains(@class, "right_content")'
            ' and contains(@class, "mt50")'
            ' and contains(@class, "mr5")'
            ' and contains(@class, "mb5")'
            ']/div[2]/div[2]/div[2]/input')
        input2 = self.driver.find_element_by_xpath(
            '//div[contains(@class,"in-block")'
            ' and contains(@class, "vertical-top")'
            ' and contains(@class, "float-right")'
            ' and contains(@class, "right_content")'
            ' and contains(@class, "mt50")'
            ' and contains(@class, "mr5")'
            ' and contains(@class, "mb5")'
            ']/div[2]/div[2]/div[3]/input')
        print(input1.get_attribute('placeholder'))
        input1.send_keys("")
        print(input2.get_attribute('placeholder'))
        input2.send_keys('')
        login_button = self.driver.find_element_by_xpath(
            '//div[contains(@class,"in-block")'
            ' and contains(@class, "vertical-top")'
            ' and contains(@class, "float-right")'
            ' and contains(@class, "right_content")'
            ' and contains(@class, "mt50")'
            ' and contains(@class, "mr5")'
            ' and contains(@class, "mb5")'
            ']/div[2]/div[2]/div[5]')
        print(login_button.text)
        time.sleep(1)
        login_button.click()

    def get_company_info(self, company_name, company_onwer):
        """
        Fetch the company details we want.
        :param company_name:
        :param company_onwer:
        :return:
        """
        try:
            time.sleep(1)
            index_input_company = self.driver.find_element_by_xpath(
                '//input[@id="home-main-search"]')  # search box on the home page
            index_input_company.send_keys(company_name)
            self.driver.find_element_by_xpath(
                '//div[contains(@class, "input-group-addon")'
                ' and contains(@class, "search_button")'
                ' and contains(@class, " white-btn")'
                ']').click()  # click search
            # button_name = find_company_button.find_element_by_xpath('//span').text
            # the span text should be "天眼一下" (the search button label)
            # print(button_name)
            # time.sleep(1)
            company_list = self.driver.find_elements_by_xpath(
                '//div[contains(@class, "b-c-white")'
                ' and contains(@class, "search_result_container")'
                ']/div')  # every company div on the current results page
            company_info = list()
            for each_company in company_list:
                company_name_from_web = each_company.find_element_by_tag_name(
                    'img').get_attribute('alt')
                company_url = each_company.find_element_by_tag_name(
                    'a').get_attribute('href')
                company_reg_money = each_company.find_element_by_css_selector(
                    'div .search_row_new.pt20 div div:nth-child(2) span').text
                company_reg_time = each_company.find_element_by_css_selector(
                    'div .search_row_new.pt20 div div:nth-child(3) span').text
                company_score = each_company.find_element_by_css_selector(
                    '.c9.f20').text
                company_info.append([
                    company_name_from_web, company_url, company_reg_money,
                    company_reg_time, company_score + ' points'
                ])
                print(company_info[-1])
            print('Companies matched on this page:', len(company_info))
            if company_info:
                for each_list in company_info:
                    if each_list[0] == company_name:
                        return 'Crawl succeeded: ' + str(each_list)
                        # self.driver.get(each_list[1])  # open the company detail page
                        # score = self.driver.find_element_by_class_name('td-score-img').get_attribute('alt')
                        # print(score)
                return 'Crawl succeeded'
            else:
                return 'Crawl failed'
        except Exception as e:
            print(e)

    def main(self):
        self.login()
        msg = self.get_company_info('*****软件有限公司', '')
        print(msg)
        print('crawl finish...')
        self.driver.close()
class SeleniumClass(object):
    def __init__(self, v_bShow=False, v_WorkingPath='', v_UseCookies=False,
                 v_Engine='Chrome', v_EngineDriver=None):
        LOG_FORMAT = ('%(levelname) -5s %(asctime)s %(name) -20s %(funcName) '
                      '-25s %(lineno) -5d: %(message)s')
        logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)
        self.StepTimeout = 15
        self.WaitTimeout = 2
        self.CookiesFile = ''
        self.UseCookies = v_UseCookies
        self.Engine = v_Engine
        self.EngineDriver = v_EngineDriver
        self.EngineDriverPath = ''
        if v_WorkingPath != '':
            LOGGER.info('WorkingPath: ' + os.path.realpath(v_WorkingPath))
            if self.Engine == 'IE':
                self.CookiesFile = os.path.realpath(v_WorkingPath) + '\\cookies.pkl'
            else:
                self.CookiesFile = os.path.realpath(v_WorkingPath) + '/cookies.pkl'
        if self.Engine == 'Chrome':
            self.EngineDriver = '/usr/local/sbin/chromedriver'
            opts = ChOptions()
            opts.add_argument("binary_location = '/usr/bin/'")
        elif self.Engine == 'IE':
            opts = IeOptions()
            opts.add_argument(
                "binary_location = 'C:\\Program Files (x86)\\Internet Explorer'")
        elif self.Engine == 'Firefox':
            self.EngineDriver = '/usr/local/sbin/geckodriver'
            opts = GkOptions()
            opts.add_argument("binary_location = '/usr/bin/'")
        LOGGER.info('Engine: ' + self.Engine)
        LOGGER.info('EngineDriver: ' + self.EngineDriver)
        self.EngineDriverPath = os.path.dirname(
            os.path.abspath(self.EngineDriver))
        sys.path.insert(0, self.EngineDriverPath)
        opts.add_argument("--start-maximized")
        opts.add_argument("--enable-automation")
        opts.add_argument("--log-level=3")
        opts.add_argument("--silent")
        opts.add_argument("--disable-infobars")
        opts.add_argument("--disable-dev-shm-usage")
        opts.add_argument("--disable-browser-side-navigation")
        opts.add_argument("--disable-gpu")
        opts.add_argument("--no-sandbox")
        opts.add_argument("--no-zygote")
        if not v_bShow:
            LOGGER.info('Headless Operation')
            opts.add_argument("--headless")
            opts.add_argument("--disable-setuid-sandbox")
        if self.Engine == 'Chrome':
            self.Browser = ChWebBrowser(self.EngineDriver, options=opts)
        elif self.Engine == 'IE':
            self.Browser = IeWebBrowser(self.EngineDriver, options=opts)
        elif self.Engine == 'Firefox':
            self.Browser = GkWebBrowser(self.EngineDriver, options=opts)
        if self.UseCookies:
            try:
                if self.CookiesFile != '' and os.path.isfile(self.CookiesFile):
                    for cookie in pickle.load(open(self.CookiesFile, "rb")):
                        self.Browser.add_cookie(cookie)
                    LOGGER.info('Cookies Loaded')
            except Exception as Exc:
                LOGGER.info('Could Not Load Cookies ' + self.CookiesFile +
                            ' - (' + str(Exc).strip() + ')')
        self.Browser.set_window_size(1920, 1080)
        self.Browser.set_window_position(0, 0)

    def SwitchContext(self, v_ObjIdentity, v_TypeOfIdentity='ID', v_TimeOut=None):
        if v_TimeOut is None:
            v_TimeOut = self.StepTimeout
        LOGGER.info("ObjIdentity: " + str(v_ObjIdentity) + " (" +
                    str(v_TypeOfIdentity).strip().upper() + ")" +
                    "; TimeOut: " + str(v_TimeOut))
        CtxReturn = False
        try:
            if type(v_ObjIdentity) == int:
                self.Browser.switch_to.frame(v_ObjIdentity)
                CtxReturn = True
            else:
                if str(v_ObjIdentity).strip().upper() in ('DEFAULT', 'MAIN'):
                    self.Browser.switch_to.default_content()
                    CtxReturn = True
                else:
                    frmElm = self.GetElement(v_ObjIdentity, v_TypeOfIdentity,
                                             v_TimeOut)
                    if frmElm is False:
                        LOGGER.info('iFrame "' + v_ObjIdentity + '" not found')
                        CtxReturn = False
                        raise TimeoutException
                    else:
                        self.Browser.switch_to.frame(frmElm)
                        CtxReturn = True
        except Exception as Exc:
            LOGGER.info('Could not switch to iFrame (' + str(Exc).strip() + ')')
            CtxReturn = False
        return CtxReturn

    def FindElement(self, v_ObjIdentity, v_TypeOfIdentity='ID', v_TimeOut=None):
        if v_TimeOut is None:
            v_TimeOut = self.StepTimeout
        identity = v_TypeOfIdentity.strip().upper()
        LOGGER.info("ObjIdentity: " + v_ObjIdentity + " (" + identity + ")" +
                    "; TimeOut: " + str(v_TimeOut))
        wait = WebDriverWait(self.Browser, v_TimeOut)
        bReturn = False
        # JS-based lookups poll until the element appears; locator-based
        # lookups use an explicit wait.
        js_templates = {
            'JSID': "document.getElementById('{0}');",
            'JSXPATH': ("document.evaluate({0}, document, null, "
                        "XPathResult.FIRST_ORDERED_NODE_TYPE, null);"),
            'JSCLASS': "document.getElementsByClassName('{0}')[0];",
            'JSNAME': "document.getElementsByName('{0}')[0];",
            'JSCSS': "document.querySelectorAll('{0}')[0];",
        }
        locators = {
            'ID': (By.ID, self.Browser.find_element_by_id),
            'XPATH': (By.XPATH, self.Browser.find_element_by_xpath),
            'CLASS': (By.CLASS_NAME, self.Browser.find_element_by_class_name),
            'NAME': (By.NAME, self.Browser.find_element_by_name),
            'CSS': (By.CSS_SELECTOR, self.Browser.find_element_by_css_selector),
        }
        if identity in js_templates:
            try:
                for _ in range(v_TimeOut - 1):
                    JsReturn = self.ExecJsScript(
                        js_templates[identity].format(v_ObjIdentity), True)
                    if 'webdriver.remote.webelement.WebElement' in JsReturn:
                        break
                    time.sleep(1)
                bReturn = True
            except (NoSuchElementException, TimeoutException):
                bReturn = False
        elif identity in locators:
            by, finder = locators[identity]
            try:
                wait.until(expected_conditions.visibility_of_element_located(
                    (by, v_ObjIdentity)))
                finder(v_ObjIdentity)
                bReturn = True
            except (NoSuchElementException, TimeoutException):
                bReturn = False
        if bReturn:
            LOGGER.info("ObjIdentity: " + v_ObjIdentity + " (" + identity +
                        "); FOUND")
        else:
            LOGGER.info("ObjIdentity: " + v_ObjIdentity + " (" + identity +
                        "); NOT FOUND")
        return bReturn

    def GetElement(self, v_ObjIdentity, v_TypeOfIdentity='ID', v_TimeOut=None):
        if v_TimeOut is None:
            v_TimeOut = self.StepTimeout
        identity = v_TypeOfIdentity.strip().upper()
        LOGGER.info("ObjIdentity: " + v_ObjIdentity + " (" + identity + "); " +
                    "TimeOut: " + str(v_TimeOut))
        ElmReturn = False
        try:
            if self.FindElement(v_ObjIdentity, v_TypeOfIdentity, v_TimeOut):
                if identity in ('ID', 'JSID'):
                    ElmReturn = self.Browser.find_element_by_id(v_ObjIdentity)
                elif identity == 'XPATH':
                    ElmReturn = self.Browser.find_element_by_xpath(v_ObjIdentity)
                elif identity == 'CLASS':
                    ElmReturn = self.Browser.find_element_by_class_name(v_ObjIdentity)
                elif identity == 'NAME':
                    ElmReturn = self.Browser.find_element_by_name(v_ObjIdentity)
                elif identity == 'CSS':
                    ElmReturn = self.Browser.find_element_by_css_selector(v_ObjIdentity)
                else:
                    ElmReturn = False
        except NoSuchElementException:
            ElmReturn = False
        except TimeoutException:
            ElmReturn = False
        if ElmReturn:
            LOGGER.info("ObjIdentity: " + v_ObjIdentity + " (" + identity +
                        "); ELEMENT OK")
        else:
            LOGGER.info("ObjIdentity: " + v_ObjIdentity + " (" + identity +
                        "); ELEMENT NOT OK")
        return ElmReturn

    def ExecJsScript(self, v_JsScript, v_ReturnValue=True, v_Verbose=False):
        if v_ReturnValue:
            JsScript = 'return ' + v_JsScript
        else:
            JsScript = v_JsScript
        if v_Verbose:
            LOGGER.info("ExecJsScript: " + JsScript)
        JsReturn = str(self.Browser.execute_script(JsScript))
        if v_ReturnValue:
            if v_Verbose:
                LOGGER.info(JsReturn)
            return JsReturn
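# A possible way to drive SeleniumClass (arguments mirror its own defaults;
# the element name 'q' is a placeholder):
bot = SeleniumClass(v_bShow=False, v_Engine='Firefox', v_UseCookies=True,
                    v_WorkingPath='.')
if bot.FindElement('q', 'NAME', v_TimeOut=5):
    elem = bot.GetElement('q', 'NAME')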
class Browser:
    max_wait = 10

    def __init__(self, name, headless=False):
        self.name = name
        self.headless = headless
        self.username = None
        self.password = None  # initialized here so type() can compare before user() runs
        self.start()

    def start(self):
        self.log('starting')
        options = Options()
        if self.headless:
            options.add_argument('--headless')
        self.driver = Firefox(options=options)
        self.elem = None
        self.log('started')

    def get(self, url):
        self.driver.get(url)

    def maximize(self):
        self.driver.maximize_window()
        self.log('maximize')

    def js(self, js):
        out = self.driver.execute_script(js)
        self.log('js', out=out)

    def bottom(self):
        self.js('window.scrollTo(0, document.body.scrollHeight);')

    def size(self, width=800, height=600):
        self.driver.set_window_size(width, height)
        self.log(f'width: {width}, height: {height}')

    def user(self):
        # prompts reconstructed; the original strings were redacted
        self.username = input('username: ')
        self.password = input('password: ')

    def save_cookies(self):  # method name assumed; the original was redacted
        cookies = self.driver.get_cookies()
        with open(f'{self.name}.pkl', 'wb') as f:
            pickle.dump(cookies, f)
        self.log('cookies saved')

    def load_cookies(self):
        with open(f'{self.name}.pkl', 'rb') as f:
            cookies = pickle.load(f)
        for cookie in cookies:
            self.driver.add_cookie(cookie)
        self.log('cookies loaded')

    def log(self, message, **kwargs):
        print(f'browser: {message}', kwargs)

    def html(self):
        html = self.driver.page_source
        self.log(html)

    def done(self):
        self.log('closing')
        self.elem = None
        self.username = None
        self.password = None
        self.driver.close()
        self.log('done')

    def pause(self, seconds):
        self.log('sleep', seconds=seconds)
        time.sleep(seconds)

    def find(self, selector):
        self.log('finding', selector=selector)
        wait = WebDriverWait(self.driver, self.max_wait)
        self.elem = wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, selector)))
        self.log('found', elem=self.elem)

    def type(self, value):
        self.elem.send_keys(value)
        if value == self.password:
            self.log('type password')
        else:
            self.log(f'type: {value}')

    def click(self):
        self.elem.click()
        self.log('click')

    def enter(self):
        self.type(Keys.ENTER)

    def screenshot(self, name, show=False):
        image = Image.open(BytesIO(self.elem.screenshot_as_png))
        fname = f'./{name}.png'
        image.save(fname)
        self.log(fname)
        if show:
            image.show()
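# A minimal usage sketch for the Browser wrapper above (the URL and selector
# are placeholders, not from the original):
b = Browser('example', headless=True)
b.get('https://example.com')
b.find('input[name="q"]')   # assumed selector
b.type('hello')
b.enter()
b.save_cookies()
b.done()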
def copy_cookies(fromd: webdriver.Firefox, tod: webdriver.Firefox, clear=False):
    if clear:
        tod.delete_all_cookies()
    for cookie in fromd.get_cookies():
        tod.add_cookie(cookie)
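# Sketch of how copy_cookies() might be used (the URL is a placeholder).
# Both drivers must already have the cookie's domain loaded, since Selenium
# rejects add_cookie() for a domain the target driver has not visited:
src = webdriver.Firefox()
dst = webdriver.Firefox()
src.get("https://example.com")
dst.get("https://example.com")
copy_cookies(src, dst, clear=True)
dst.refresh()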
class Account():
    def __init__(self, signal, lock, name, people, headless=True, debug=False):
        # Variables
        self.start = time()
        self.convLists = ['Conversation List', 'Λίστα συζητήσεων']  # Add your language here
        self.name = name
        self.people = people
        self.lock = lock
        self.signal = signal
        self.path = dirname(realpath(__file__))
        self.cookies = f'{self.path}/cookies/{self.name}_cookies.pkl'
        self.mediapath = f'{self.path}/media/{self.name}'
        self.logpath = f'{self.path}/media/{self.name}/last.log'
        self.debug = debug
        self.logFile = f'{self.path}/logs/{self.name}_{strftime("%d-%m-%y-%H.%M.%S")}.log'
        self.url_messages = 'https://www.facebook.com/messages/t/'
        self.url_home = 'https://facebook.com'
        self.timeout = 1
        self.iter_L = 60
        self.iter = self.iter_L
        self.maxBack = 30
        self.faults = 0
        self.faultThreshold = 3
        # Initialize the gecko driver
        agent = ('Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                 '(KHTML, like Gecko) Chrome/62.0.3202.9 Safari/537.36')
        profile = FirefoxProfile()
        profile.set_preference('general.useragent.override', agent)
        options = Options()
        options.headless = headless
        self.driver = Firefox(profile, options=options)
        # Make folders
        self.makeFolder(f'{self.path}/logs')
        self.makeFolder(self.mediapath)
        for person in self.people:
            self.makeFolder(f'{self.mediapath}/{person}')

    def getToMessages(self):
        # Point the driver at url_messages
        if self.url_messages != self.driver.current_url:
            self.driver.get(self.url_messages)
            self.waitU('_1enh')

    def getToHome(self):
        # Point the driver at url_home
        self.driver.get(self.url_home)
        self.waitU('facebook')

    def waitU(self, element_id):
        # Wait until the element element_id is available
        try:
            element_present = EC.presence_of_element_located((By.ID, element_id))
            WebDriverWait(self.driver, self.timeout).until(element_present)
        except TimeoutException:
            pass

    def getConversations(self, people):
        # Locate the conversation elements
        self.logIt(f'Get to conversation with list {people}')
        for convList in self.convLists:
            if convList in self.driver.page_source:
                conversations = self.driver.find_elements_by_xpath(
                    f'//ul[@aria-label="{convList}"]//li[@tabindex="-1"]')
                conv = []
                for i in conversations:
                    person = findall(
                        'data-href="https://www.facebook.com/messages/t/(.*?)"',
                        i.get_attribute('innerHTML'))[0]
                    if person in people:
                        conv.append(i)
                return conv

    def getToThread(self, num):
        # Wrapper to download media
        self.logIt(f'Getting to thread, media {num}')
        self.waitU('_3m31')
        threads = self.driver.find_elements_by_xpath('//a[@class="_3m31"]')
        thread = threads[num]
        if self.existsMedia(thread.get_attribute('innerHTML')):
            return
        thread.click()
        self.waitU('_4-od')
        self.getMedia()
        thread.send_keys(Keys.ESCAPE)
        return len(threads)

    def login(self, username, password):
        # Log in to the account, USE WITH CAUTION
        self.logIt('Trying to login, USE WITH CAUTION!! (sleeping for 10 secs)')
        sleep(10)
        driver = self.driver
        self.getToHome()
        elem = driver.find_element_by_id('email')
        elem.send_keys(username)
        elem = driver.find_element_by_id('pass')
        elem.send_keys(password)
        elem.send_keys(Keys.RETURN)
        self.waitU('userNav')
        self.saveCookies()

    def saveCookies(self):
        # Save the cookies
        self.logIt('Saving cookies')
        self.makeFolder(f'{self.path}/cookies/')
        if Path(self.cookies).is_file():
            move(self.cookies, f'{self.cookies}.bak')
        with open(self.cookies, 'wb') as filehandler:
            dump(self.driver.get_cookies(), filehandler)

    def loadCookies(self):
        # Load the cookies
        if exists(self.cookies):
            self.getToHome()
            self.logIt('Loading cookies')
            with open(self.cookies, 'rb') as cookiesfile:
                cookies = load(cookiesfile)
            for cookie in cookies:
                self.driver.add_cookie(cookie)
        else:
            raise ValueError('Cookies file not found!')

    def isRead(self, person, elem):
        # Check whether the conversation is read
        if '_1ht3' in elem.get_attribute('outerHTML'):
            self.logIt(f'Conversation with {person} is not read')
            self.iter = 2
            return False
        else:
            self.logIt(f'Conversation with {person} is read')
            return True

    def getPerson(self, person, override):
        # If the conversation is read (or override is set), open it
        self.person = person
        try:
            elem = self.getConversations(person)[0]
            if self.isRead(person, elem) or override:
                elem.click()
            self.faults = 0
            return True
        except Exception as e:
            self.logIt(f'{person} not reachable ({e})')
            if self.pressEscape():
                self.logIt('Pressed escape!')
            else:
                self.faults += 1
            return False

    def downloadMedia(self, link):
        # Download the media
        self.logIt(f'Downloading media with person {self.person}')
        file = findall(r'/(\d+_\d+_\d+_\w\.\w{3,4})\?', link)[0]
        if not Path(f'{self.mediapath}{file}').is_file():
            call(['curl', '-s', link, '-o',
                  f'{self.mediapath}/{self.person}/{file}'])
            return False
        else:
            self.iter = self.iter_L
            return True

    def getMedia(self):
        # Find the media inside the conversation
        self.logIt('Trying to get media')
        photo = self.driver.find_element_by_class_name('_4-od')
        image = findall('src="(.*?)"', photo.get_attribute('innerHTML'))[0]
        return self.downloadMedia(image.replace('amp;', ''))

    def existsMedia(self, media):
        # Check whether the media already exists
        medianame = findall(r'/(\d+_\d+_\d+_\w\.\w{3,4})\?', media)[0]
        if not Path(self.logpath).is_file():
            with open(self.logpath, 'w') as f:
                f.write('---\n')
        with open(self.logpath, 'r+') as f:
            if medianame not in f.read():
                f.write(f'{medianame}\n')
                self.logIt('Media does not exist, fetching it')
                return False
            else:
                self.logIt('Media exists')
                return True

    def manageThread(self, person, override):
        # Wrapper
        length = 0
        now = 0
        while self.getPerson(person, override):
            try:
                length = self.getToThread(now)
                now += 1
                if now == length or now > self.maxBack:
                    return
            except Exception:
                return

    def manageThreads(self, override):
        # Main wrapper function
        for person in self.people:
            self.logIt(f'Now on person {person}')
            self.manageThread(person, override)
        self.logIt(f'Sleeping for {self.iter} secs')
        sleep(self.iter)
        return True if self.faults < self.faultThreshold else False

    def pressEscape(self):
        # Press escape
        elements = ['_4-od', '_3m31', '_1enh']
        for element in elements:
            try:
                elem = self.driver.find_element_by_class_name(element)
                elem.send_keys(Keys.ESCAPE)
                return True
            except Exception:
                pass
        return False

    def makeFolder(self, path):
        # Make the folder if it doesn't exist
        if not exists(path):
            makedirs(path)
            self.logIt(f'Making a new folder in {path}')

    def logIt(self, message):
        # Log events and print debugging information on screen
        self.signal.set()
        with open(self.logFile, 'a') as log:
            log.write(f'[{self.name}]: {message}, t+{round(time()-self.start)}s\n')
        if self.debug:
            with self.lock:
                print(f'[{self.name}]: {message}, t+{round(time()-self.start)}s')
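# Sketch of how Account appears to be driven; the threading primitives are
# inferred from the signal/lock parameters, and the account name and people
# list are placeholders:
from threading import Event, Lock

acc = Account(Event(), Lock(), 'myaccount', ['friend1', 'friend2'],
              headless=True, debug=True)
acc.loadCookies()
while acc.manageThreads(override=False):
    pass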
class InstagramComments(object):
    """
    Instagram scraper
    """
    def __init__(self):
        """
        Initializes the instance of the webdriver.
        Keeps all newly opened links in one window.
        """
        self.firefox_options = Options()
        self.browser = Firefox(options=self.firefox_options)

    def login(self):
        """
        Login functionality.
        Requires the login credentials to be stored in the
        additional file: login_data
        """
        self.browser.get(
            'https://www.instagram.com/accounts/login/?source=auth_switcher')
        print('Opening the page')
        time.sleep(8)
        self.usernameInput = self.browser.find_elements_by_css_selector(
            'form input')[0]
        self.usernameInput.send_keys(login_data.USERNAME_INSTAGRAM)
        print('Input username')
        time.sleep(3)
        self.passwordInput = self.browser.find_elements_by_css_selector(
            'form input')[1]
        self.passwordInput.send_keys(login_data.PASSWORD_INSTAGRAM)
        print('Input password')
        try:
            self.button_login = self.browser.find_element_by_class_name('button')
        except Exception:
            self.button_login = self.browser.find_element_by_xpath(
                '/html/body/div[1]/section/main/div/article/div/div[1]/div/form/div[4]/button/div')
        time.sleep(4)
        self.button_login.click()
        print('Logged in')
        time.sleep(5)
        self.cookies = pickle.dump(self.browser.get_cookies(),
                                   open('cookies.pkl', 'wb'))
        try:
            self.notnow = self.browser.find_element_by_css_selector(
                'body > div.RnEpo.Yx5HN > div > div > div.mt3GC > button.aOOlW.HoLwm')
            self.notnow.click()
        except Exception:
            pass

    def get_post_links(self, username, post_count):
        """
        Crawler to get a list of links to the posts, starting from the
        chronologically most recent.
        :param username: str, the username of the account
        :param post_count: int, number of links to save
        :return: a list of links to the posts of the given user
        """
        self.post_links = []
        self.browser.get(
            'https://www.instagram.com/accounts/login/?source=auth_switcher')
        self.cookies = pickle.load(open('cookies.pkl', 'rb'))
        for cookie in self.cookies:
            self.browser.add_cookie(cookie)
        self.username = username
        self.url = 'https://www.instagram.com/' + username + '/'
        self.browser.get(self.url)
        time.sleep(10)
        self.post = 'https://www.instagram.com/p/'
        while len(self.post_links) < post_count:
            self.links = [
                a.get_attribute('href')
                for a in self.browser.find_elements_by_tag_name('a')
            ]
            for link in self.links:
                if self.post in link and link not in self.post_links:
                    self.post_links.append(link)
            self.scroll_down = 'window.scrollTo(0, document.body.scrollHeight);'
            self.browser.execute_script(self.scroll_down)
            time.sleep(10)
        else:
            print(self.post_links)
            print(len(self.post_links))
            return self.post_links[:post_count]

    def get_post_details(self, url):
        """
        Saves the information about an Instagram post: number of likes, type
        of the post (photo or video), caption, timestamp with timezone of
        when it was posted, and the username of the author.
        :param url: str, link to the post
        :return: all the elements, sent to the database functionality
        """
        self.browser.get(url)
        try:
            self.likes = self.browser.find_element_by_xpath(
                """//*[@id="react-root"]/section/main/div/div/
                article/div[2]/section[2]/div/div/button/span""").text
            self.post_type = 'photo'
        except Exception:
            self.likes = self.browser.find_element_by_xpath(
                """//*[@id="react-root"]/section/main/div/div/
                article/div[2]/section[2]/div/span""").text.split()[0]
            self.post_type = 'video'
        self.time_posted = self.browser.find_element_by_xpath(
            '//a/time').get_attribute("datetime")
        try:
            self.caption = self.browser.find_element_by_xpath(
                '/html/body/div[1]/section/main/div/div/article/div[2]/div[1]/ul/div/li/div/div/div[2]/span').text
        except NoSuchElementException:
            self.caption = ""
        try:
            return DatabaseFunctionality.execute_insert_post_details(
                url, self.post_type, self.likes, self.time_posted, self.caption)
        except psycopg2.errors.DatabaseError:
            pass
        time.sleep(8)

    def get_comments(self, url):
        """
        Saves the comments of the post: username of the author of the
        comment, the comment itself, and a timestamp with timezone.
        :param url: link to the post
        :return:
        """
        self.browser.get(
            'https://www.instagram.com/accounts/login/?source=auth_switcher')
        self.cookies = pickle.load(open('cookies.pkl', 'rb'))
        for cookie in self.cookies:
            self.browser.add_cookie(cookie)
        self.browser.get(url)
        time.sleep(5)
        try:
            self.load_more_comments = self.browser.find_element_by_class_name(
                'glyphsSpriteCircle_add__outline__24__grey_9 u-__7')
            self.action = ActionChains(self.browser)
            self.action.move_to_element(self.load_more_comments)
            self.load_more_comments.click()
            time.sleep(4)
            self.action.key_down(Keys.SPACE).key_up(Keys.SPACE).perform()
        except Exception:
            pass
        time.sleep(5)
        comment = self.browser.find_elements_by_class_name('gElp9 ')
        for c in comment:
            container = c.find_element_by_class_name('C4VMK')
            name = container.find_element_by_class_name('_6lAjh').text
            content = container.find_element_by_tag_name('span').text
            content = content.replace('\n', ' ').strip()
            time_of_post = self.browser.find_element_by_xpath(
                '//a/time').get_attribute("datetime")
            comment_details = {
                'url_post': url,
                'profile name': name,
                'comment': content,
                'time': time_of_post,
            }
            print(comment_details)
            try:
                return DatabaseFunctionality.execute_insert_comment_details(
                    url, name, content, time_of_post)
            except psycopg2.errors.DatabaseError:
                pass
class crawlerAliexpress():
    def __init__(self, searchName, numberPage=1):
        # Initialize the browser.
        opts = Options()
        # opts.set_headless()
        self.browser = Firefox(options=opts)
        self.searchName = searchName
        self.numberPage = numberPage
        self.listDetailsProducts = []
        currentUrl = (f'https://aliexpress.ru/wholesale'
                      f'?SearchText={self.searchName}&page={self.numberPage}')
        self.browser.get(currentUrl)
        cookie = {
            'name': 'aep_usuc_f',
            'value': 'isfm=y&site=rus&c_tp=RUB&isb=y&region=RU&b_locale=ru_RU',
            'domain': '.aliexpress.ru'
        }
        self.browser.add_cookie(cookie)
        self.browser.get(currentUrl)
        sleep(1)

    def scroll_down_page(self, speed=8):
        current_scroll_position, new_height = 0, 1
        while current_scroll_position <= new_height:
            current_scroll_position += speed
            self.browser.execute_script(
                "window.scrollTo(0, {});".format(current_scroll_position))
            new_height = self.browser.execute_script(
                "return document.body.scrollHeight")

    def getProductsDetail(self, countPage):
        self.countPage = countPage
        self.scroll_down_page()
        title = self.browser.find_elements_by_xpath(
            "//li[@class='list-item']//div[@class='item-title-wrap']//a[@class='item-title']")
        price = self.browser.find_elements_by_xpath(
            "//li[@class='list-item']//div[@class='hover-help']//div[@class='item-price-row']")
        rating = self.browser.find_elements_by_xpath(
            "//li[@class='list-item']//div[@class='hover-help']//span[@class='rating-value']")
        numberOfSales = self.browser.find_elements_by_xpath(
            "//li[@class='list-item']//div[@class='hover-help']//a[@class='sale-value-link']")
        seller = self.browser.find_elements_by_xpath(
            "//li[@class='list-item']//div[@class='hover-help']//a[@class='store-name']")
        for i in range(min(len(title), len(price), len(rating),
                           len(numberOfSales), len(seller))):
            itemProduct = [
                title[i].text, price[i].text, rating[i].text,
                numberOfSales[i].text, seller[i].text
            ]
            self.listDetailsProducts.append(itemProduct)  # must target the instance attribute
        self.paginator(self.countPage)

    def saveInCsv(self, nameFile):
        self.nameFile = nameFile
        with open(self.nameFile, "w", newline='') as out_file:
            writer = csv.writer(out_file)
            writer.writerows(self.listDetailsProducts)

    def check_exists_by_xpath(self, xpath):
        try:
            self.browser.find_element_by_xpath(xpath)
        except NoSuchElementException:
            return False
        return True

    def paginator(self, countPage):
        sleep(2)
        isPaginator = self.check_exists_by_xpath(
            "//button[@class='next-btn next-medium next-btn-normal next-pagination-item next-next' and not(@disabled)]")
        self.numberPage += 1
        currentUrl = (f'https://aliexpress.ru/wholesale'
                      f'?SearchText={self.searchName}&page={self.numberPage}')
        if isPaginator and (countPage > 1):
            self.browser.get(currentUrl)
            self.getProductsDetail(countPage - 1)
        else:
            print('\nAll pages processed')
            self.browser.close()
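# Hypothetical usage of the crawler above (the search term and file name are
# placeholders):
crawler = crawlerAliexpress('phone case')
crawler.getProductsDetail(countPage=3)  # crawl up to three result pages
crawler.saveInCsv('products.csv')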
def set_login_data(driver: webdriver.Firefox, login_data: LoginData):
    """
    Goes to the 404 page of each site in order to set authentication data.
    """
    driver.get(login_data.link)
    for cookie in login_data.cookies:
        driver.add_cookie(cookie)
    set_localstorage(driver, login_data.localstorage)
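# set_localstorage() is called above but not defined in this snippet; a
# minimal sketch of what it might look like, assuming `localstorage` is a
# dict of key/value strings:
def set_localstorage(driver, localstorage):
    for key, value in localstorage.items():
        driver.execute_script(
            "window.localStorage.setItem(arguments[0], arguments[1]);",
            key, value)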
# driver.find_element_by_xpath("//div[@id='u1']//a[text()='登录']").click()
# sleep(1)
# driver.find_element_by_xpath("//p[text()='用户名登录']").click()
# sleep(2)
# # Enter the username
# driver.find_element_by_xpath(
#     "//input[@name='userName']").send_keys('13541781424')
# # Enter the password
# driver.find_element_by_xpath("//input[@name='password']").send_keys('19931025')
# # Click login
# driver.find_element_by_xpath("//input[@id='TANGRAM__PSP_10__submit']").click()
# cookie = driver.get_cookies()
# print(cookie)

cook = {
    'domain': '.baidu.com',
    'name': 'BDUSS',
    'path': '/',
    'value': 'WhSMUpIOXhkOXBrZTF4TmRKMG1YYUkyb0JscGdNRE9kZH40WmpBaUZES35YVGhkSVFBQUFBJCQAAAAAAAAAAAEAAAD4pMmGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAL~QEF2~0BBdQ'
}
# Open the page
driver.get('https://www.baidu.com')
# Add the cookie
driver.add_cookie(cook)
sleep(3)
# Navigate to the logged-in page
driver.get('http://i.baidu.com')