def get_hotel(driver: WebDriver, city: str, n: int) -> None:
    """Scrape page *n* of the hotel listing for *city* and persist new hotels.

    Hotels already present in the database (matched by ``hid``) are skipped.
    Rows whose element id is not purely numeric, or that lack a rating
    element, are ignored.

    Args:
        driver: live Selenium WebDriver session.
        city: city slug used in the listing URL.
        n: 1-based listing page number.
    """
    driver.get('%s/%s/p%d' % (ROOT_URL, city, n))
    driver.implicitly_wait(1)
    hotel_list = driver.find_element_by_id('hotel_list')
    hotels = hotel_list.find_elements_by_class_name('searchresult_list')
    for hotel in hotels:
        hid = str(hotel.get_attribute('id'))
        # Non-numeric ids are not hotel rows (ads/separators) -- skip them.
        if not re.match(r'^\d+$', hid):
            continue
        name = driver.find_element_by_xpath(
            '//*[@id="%s"]/ul/li[2]/h2/a' % hid).get_attribute('title')
        try:
            points = hotel.find_element_by_class_name('hotel_value').text
        except Exception:
            # No rating element: presumably a hotel without reviews -- skip.
            continue
        start_price = hotel.find_element_by_class_name('J_price_lowList').text
        about_points = hotel.find_element_by_class_name('hotel_judgement').text
        # BUGFIX: RE_COMMENT.search() may return None; calling .group() on it
        # raised AttributeError in the original. Skip unparsable rows instead.
        match = RE_COMMENT.search(about_points)
        if match is None:
            continue
        points_count = match.group()
        # Lazy %-args: message is only formatted when INFO logging is enabled.
        logging.info('%s\n%s\n%s\n%s\n%s\n%s\n%s\n',
                     city, hid, name, n, points, start_price, points_count)
        # exists() avoids a full COUNT query just to test membership.
        if not Hotel.objects.filter(hid=hid).exists():
            Hotel.objects.create(city=city, hid=hid, name=name, page=n,
                                 points=points, start_price=start_price,
                                 points_count=points_count)
class RegisterUser_test(LiveServerTestCase):
    """End-to-end test: register a new user through the sign-up form,
    confirm the post-registration element appears, then log back out."""

    def setUp(self):
        self.sel = WebDriver()

    def tearDown(self):
        self.sel.quit()

    def testRegister(self):
        self.doReg()

    def doReg(self):
        """Drive the registration flow; fails the test if the post-signup
        'dropData' element never becomes visible.

        BUGFIX: this was a @staticmethod taking an explicit ``self`` and
        invoked as ``self.doReg(self)``; a plain instance method is the
        correct, equivalent form.
        """
        self.sel.get(self.live_server_url)
        sign_up = WebDriverWait(self.sel, 2).until(
            EC.element_to_be_clickable((By.ID, "signUp")))
        sign_up.click()
        self.sel.save_screenshot("register01.png")
        email_box = self.sel.find_element_by_id('email')
        first_name_box = self.sel.find_element_by_id('name')
        last_name_box = self.sel.find_element_by_id('username')
        password_box = self.sel.find_element_by_id('password')
        password_confirm_box = self.sel.find_element_by_id('password_c')
        submit_btn = self.sel.find_element_by_id('regUserSubmit')
        email_box.send_keys('*****@*****.**')
        first_name_box.send_keys('John')
        last_name_box.send_keys('Doe')
        password_box.send_keys('pw')
        password_confirm_box.send_keys('pw')
        self.sel.save_screenshot("register02.png")
        submit_btn.click()
        try:
            drop_data = WebDriverWait(self.sel, 2).until(
                EC.visibility_of_element_located((By.ID, 'dropData')))
            self.sel.save_screenshot("register03.png")
            drop_data.click()
            # Wait for the logout control before delegating to the shared
            # logout helper (the element itself is not used afterwards).
            WebDriverWait(self.sel, 2).until(
                EC.element_to_be_clickable((By.ID, 'navLogout')))
            AppFuncs.doLogout(self)
        except Exception:
            # BUGFIX: was a bare `except:` (also swallowed SystemExit /
            # KeyboardInterrupt). Best-effort logout, then fail the test.
            AppFuncs.doLogout(self)
            self.fail('Reg failed. Unable to find logout element.')
def test_that_home_view_shows_devices_grid(self):
    """The devices view renders one `.device` tile per mock device and the
    grid text contains every device's IP and MAC address."""
    url = '%s%s' % (self.live_server_url, reverse('devices'))
    driver = WebDriver()
    try:
        driver.get(url)
        waiter = WebDriverWait(driver, WEBDRIVER_MAX_TIMEOUT)
        waiter.until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, 'device')))
        device_grid = driver.find_element_by_id('device-grid')
        devices = device_grid.find_elements_by_class_name('device')
        self.assertEqual(len(MOCK_DEVICES), len(devices))
        for device in MOCK_DEVICES:
            self.assertIn(device.ip_address, device_grid.text)
            self.assertIn(device.mac_address, device_grid.text)
    finally:
        # BUGFIX: the driver was never quit, leaking one browser process
        # per test run (and on every assertion failure).
        driver.quit()
class SeleniumTestCase(LiveServerTestCase):
    """Base test case owning one Selenium WebDriver for its whole lifetime."""

    def setUp(self):
        super(SeleniumTestCase, self).setUp()
        self.selenium = WebDriver()

    def tearDown(self):
        self.selenium.quit()
        super(SeleniumTestCase, self).tearDown()

    def loginAsUser(self):
        """Log in through the login form as the demo 'user' account."""
        login_url = '%s%s' % (self.live_server_url, '/accounts/login/')
        self.selenium.get(login_url)
        # Fill both credential fields in form order.
        for field_name, keys in (("username", 'user'), ("password", 'demo')):
            box = self.selenium.find_element_by_name(field_name)
            box.send_keys(keys)
        self.selenium.find_element_by_xpath('//input[@value="Login"]').click()
def search_google(queries):
    """Run each query in *queries* through Google via headless PhantomJS.

    For each dict in *queries*: types ``item["q"]`` into the search box,
    waits for the results page, stores the parsed result count in
    ``item["response"]`` (via ``get_number``), and saves a screenshot named
    after ``item["pr"]``. On any error a traceback is printed and a
    diagnostic screenshot is taken; the driver is always shut down.

    Args:
        queries: list of dicts with at least keys "q" and "pr";
            mutated in place ("response" is added).
    """
    driver = None
    try:
        driver = WebDriver("./filtering/phantomjs")
        driver.get("http://www.google.com")
        w = wait.WebDriverWait(driver, 5)
        sleep(1)
        w.until(lambda x: driver.execute_script(
            "return document.readyState;") == "complete")
        for keyword in queries:
            # find_element_by_name raises NoSuchElementException instead of
            # the original's IndexError; both land in the handler below.
            elem = driver.find_element_by_name("q")
            elem.click()
            elem.clear()
            elem.send_keys(keyword["q"])
            elem.send_keys(Keys.RETURN)
            sleep(1)
            w.until(lambda x: driver.execute_script(
                "return document.readyState;") == "complete")
            keyword["response"] = get_number(driver)
            driver.save_screenshot("%s.png" % keyword["pr"])
    except Exception:
        # BUGFIX: was a bare `except:`, which also swallowed SystemExit and
        # KeyboardInterrupt.
        traceback.print_exc()
        if driver:
            driver.save_screenshot("test.png")
    finally:
        if driver:
            # BUGFIX: close() only closes the window; quit() terminates the
            # PhantomJS process, preventing zombie processes.
            driver.quit()
def test_url_phantom(self, browser: WebDriver):
    """Open the astrodate landing page directly and capture a screenshot."""
    time.sleep(1)
    # This URL is normally reached via a button click; here we navigate
    # to it directly.
    target_url = "http://astrodate.ru/#/"
    browser.get(target_url)
    time.sleep(5)
    browser.save_screenshot("image.png")
    time.sleep(20)
# Python language bindings for Selenium WebDriver
# https://pypi.python.org/pypi/selenium
from selenium.webdriver.phantomjs.webdriver import WebDriver

# Open Baidu under headless PhantomJS and print the page title.
# BUGFIX: the variable was misspelled `dirver` throughout.
driver = WebDriver(
    executable_path="../example/phantomjs-2.1.1-windows/bin/phantomjs.exe")
driver.get('http://www.baidu.com')
print(driver.title)
# from selenium import webdriver from selenium.common.exceptions import TimeoutException from selenium.webdriver.support.ui import WebDriverWait # available since 2.4.0 from selenium.webdriver.support import expected_conditions as EC # available since 2.26.0 from selenium.webdriver.phantomjs.webdriver import WebDriver # Create a new instance of the Firefox driver driver = WebDriver( executable_path='/opt/phantomjs-2.1.1-linux-x86_64/bin/phantomjs', port=5001) # go to the google home page driver.get("http://www.baidu.com") # the page is ajaxy so the title is originally this: print(driver.title) # find the element that's name attribute is q (the google search box) inputElement = driver.find_element_by_id("kw") # type in the search inputElement.send_keys("cheese!") # submit the form (although google automatically searches now without submitting) inputElement.submit() try: # we have to wait for the page to refresh, the last thing that seems to be updated is the title WebDriverWait(driver, 10).until(EC.title_contains("cheese!")) # You should see "cheese! - Google Search"
class Search:
    """Crawl websites with PhantomJS and detect which third-party
    single-sign-on (SSO) providers their login and signup pages offer.

    For every site the crawler loads the home page, hunts for a login link
    and a signup link, walks each page's DOM and matches element text and
    attributes against a table of known OAuth/OpenID endpoints. Findings
    accumulate in ``self.candidates`` and are written out as JSON.

    NOTE(review): Python 2 code (print statements, implicit str handling).
    """

    def __init__(self, path, websites):
        # path: filesystem path to the PhantomJS executable.
        # websites: iterable of bare domain names (no scheme / "www." prefix).
        self.exec_path = path
        self.driver = PhantomJS(executable_path=self.exec_path)
        self.websites = websites
        self.fileExceptionList = []  # URL-extraction failures -> exceptions.txt
        self.fileLogList = []        # page-load errors        -> errors.txt
        self.candidates = []         # per-site SSO findings   -> log.txt
        # The whole crawl is kicked off from the constructor.
        self.begin_search_for_SSO()

    def begin_search_for_SSO(self):
        """Visit each site, probe its login and signup pages for SSO, then
        dump results and shut the driver down."""
        for site in self.websites:
            print site
            self.driver.set_page_load_timeout(60)
            self.first_url = "https://www."+site
            self.sso_info = {"url" : self.first_url, "loginSSO" : [], "signupSSO" : []}
            try:
                self.driver.get(self.first_url)
            except URLError as u:
                log_obj = { "url" : self.first_url, "reason" : "An exception occurred during first time page load url error" }
                self.fileLogList.append(log_obj)
                continue
            except (TimeoutException, Exception) as t:
                # NOTE(review): (TimeoutException, Exception) is redundant --
                # Exception already covers TimeoutException.
                log_obj = { "url" : self.first_url, "reason" : "An exception occurred during first time page load timeout exception" }
                self.fileLogList.append(log_obj)
                continue
            else:
                # Home page loaded: scan it, follow the login link and scan,
                # then follow the signup link and scan again.
                first_url_doc = self.parse_web_page()
                self.process_page_for_click_candidates(first_url_doc, "login")
                self.search_for_SSO(self.parse_web_page(), "login")
                self.process_page_for_click_candidates(self.parse_web_page(), "signup")
                self.search_for_SSO(self.parse_web_page(), "signup")
                self.candidates.append(self.sso_info)
        print self.candidates
        self.write_to_file()
        self.done()

    def parse_web_page(self):
        """Return the driver's current page parsed with BeautifulSoup/lxml."""
        html = self.driver.page_source
        parsed = BeautifulSoup(html, "lxml")
        return parsed

    def process_page_for_click_candidates(self, document, ptype):
        """Find a login ('login') or signup ('signup') link in *document*
        and navigate the driver to it; log failures."""
        if ptype == 'login':
            found_elems = document.find_all(["a", "button", "span", "div", "img"], string=['Log In', 'LogIn', 'Login', 'Log in', 'Sign in', 'Sign In', 'Signin', 'SignIn', 'SIGNIN', 'SIGN IN', 'LOGIN', 'LOG IN', 'login', 'log in', 'signin', 'sign in'])
        elif ptype == 'signup':
            found_elems = document.find_all(["a", "button", "span", "div", "img"], string=['Sign Up', 'Signup', 'SignUp', 'Sign up', 'SIGN UP', 'SIGNUP', 'sign up', 'signup', 'Create account', 'Create Account', 'CREATE ACCOUNT', 'create account'])
        found_url = self.extract_url(found_elems)
        if(found_url):
            if found_url.startswith("/"):
                # Relative link: resolve against the site root.
                found_url = self.first_url + found_url
            try:
                self.driver.get(found_url)
            except URLError as u:
                log_obj = { "url" : self.first_url, "reason" : "An exception occurred during click candidate process url error" }
                self.fileLogList.append(log_obj)
                return
            except (TimeoutException, Exception) as t:
                log_obj = { "url" : self.first_url, "reason" : "An exception occurred during click candidate process timeout exception" }
                self.fileLogList.append(log_obj)
                return
        else:
            if ptype == 'login':
                exception_obj = { "url" : self.first_url, "reason" : "login url extraction failed" }
            else:
                exception_obj = { "url" : self.first_url, "reason" : "Sign up url extraction failed" }
            self.fileExceptionList.append(exception_obj)

    def extract_url(self, elems):
        """Pop candidate elements until one (or an <a>/<button> ancestor)
        yields an href; return it, or None when the list is exhausted."""
        while(len(elems) > 0):
            each = elems.pop()
            url = each.get('href')
            if url is not None:
                return url
            else:
                # Text may sit in a <span>/<div> nested inside the link.
                parent = each.find_parent('a') or each.find_parent('button')
                if parent is None:
                    continue
                else:
                    p_url = parent.get('href')
                    if p_url is None:
                        continue
                    else:
                        return p_url

    def search_for_SSO(self, document, stype):
        """Traverse the DOM body, feeding each non-script element tag to
        process_node for SSO keyword scanning."""
        stack = []
        if document is not None and document.body is not None:
            stack = document.body.contents
        while(len(stack) > 0):
            current = stack.pop()
            if(not(isinstance(current, NavigableString))):
                children = current.contents
                if len(children) > 0:
                    for child in children:
                        if(not(isinstance(child, NavigableString))):
                            stack.insert(0, child)
                if not(current.name == 'script' or current.attrs == None or current.name == 'embed'):
                    self.process_node(current, stype)

    def process_node(self, node, stype):
        """Concatenate a node's text plus all its attributes into one string
        and scan it for SSO keywords."""
        if self.filter_node_on_type(node):
            attrs = node.attrs
            str_to_check = node.string or ''
            for key in attrs:
                try:
                    str_to_check += key+'='+str(attrs[key])+';'
                except UnicodeError:
                    continue
            self.check_for_keywords(str_to_check, stype)

    def filter_node_on_type(self, node):
        """Return True if *node* is a tag type worth scanning for SSO."""
        if (node.name != "a" and node.name != "div" and node.name != "img" and node.name != "span" and node.name != "input" and node.name != "button"):
            return False
        if (node.name == "input"):
            # NOTE(review): `node.type` on a bs4 Tag looks up a child <type>
            # element, not the `type` attribute -- probably meant
            # node.get('type'); confirm intended behavior.
            if (node.type != "button" and node.type != "img" and node.type != "submit"):
                return False
        if (node.name == "A"):
            # NOTE(review): dead branch -- bs4 lowercases HTML tag names, so
            # node.name is never "A"; and .toLowerCase()/.indexOf() are
            # JavaScript, not Python -- this would raise if ever reached.
            if (node.get('href').toLowerCase().indexOf('mailto:') == 0):
                return False
        return True

    def check_for_keywords(self, inputstr, stype):
        """Match *inputstr* against the known-SSO-provider table and record
        hits into self.sso_info under 'loginSSO' or 'signupSSO'."""
        # Table of providers and their OAuth/OpenID endpoint fingerprints.
        sso = [{"site" : 'google', "url" : ["https://accounts.google.com/o/oauth2/auth"]},
               {"site" : 'yahoo', "url" : ["https://api.login.yahoo.com/oauth2/request_auth"]},
               {"site" : '500px', "url": ["https://api.500px.com/v1/oauth"]},
               {"site" : 'aol', "url" :["https://api.screenname.aol.com/auth"]},
               {"site" : 'twitter', "url" : ["https://api.twitter.com/oauth"]},
               {"site" : 'vk', "url" : ["https://oauth.vk.com/authorize"]},
               {"site" : 'yammer', "url" : ["https://www.yammer.com/oauth2/authorize"]},
               {"site" : 'yandex', "url" : ["https://oauth.yandex.com/authorize"]},
               {"site" : 'zendesk', "url" : [".zendesk.com/oauth/authorizations/new"]},
               {"site" : 'amazon', "url" : ["http://g-ecx.images-amazon.com/images/G/01/lwa/btnLWA", "https://images-na.ssl-images-amazon.com/images/G/01/lwa/btnLWA"]},
               {"site" : 'flickr', "url" : ["https://www.flickr.com/services/oauth"]},
               {"site" : 'bitbucket', "url" : ["https://bitbucket.org/site/oauth2", "https://bitbucket.org/api/1.0/oauth"]},
               {"site" : 'bitly', "url" : ["https://bitly.com/oauth"]},
               {"site" : 'cloud foundry', "url" : ["/uaa/oauth"]},
               {"site" : 'dailymotion', "url" : ["https://www.dailymotion.com/oauth"]},
               {"site" : 'deviantART', "url" : ["https://www.deviantart.com/oauth2"]},
               {"site" : 'discogs', "url" : ["https://api.discogs.com/oauth"]},
               {"site" : 'huddle', "url" : ["https://login.huddle.net/request"]},
               {"site" : 'netflix', "url" : ["https://api-user.netflix.com/oauth"]},
               {"site" : 'openlink data spaces', "url" : ["/OAuth"]},
               {"site" : 'openstreetmap', "url" : ["http://www.openstreetmap.org/oauth"]},
               {"site" : 'opentable', "url" : ["http://www.opentable.com/oauth"]},
               {"site" : 'passport', "url" : ["/dialog/authorize", "oauth2/authorize", "oauth/authorize"]},
               {"site" : 'paypal', "url" : ["paypal.com/v1/oauth2"]},
               {"site" : 'plurk', "url" : ["https://www.plurk.com/OAuth/authorize"]},
               {"site" : 'sina weibo', "url" : ["http://api.t.sina.com.cn/oauth/authorize"]},
               {"site" : 'stack exchange', "url" : ["https://stackexchange.com/oauth"]},
               {"site" : 'statusnet', "url" : ["status.net/api/oauth/authorize"]},
               {"site" : 'ubuntu one', "url" : ["https://login.ubuntu.com/api/1.0/authentications"]},
               {"site" : 'viadeo', "url" : ["https://partners.viadeo.com/oauth/authorize"]},
               {"site" : 'vimeo', "url" : ["https://api.vimeo.com/oauth/authorize"]},
               {"site" : 'withings', "url" : ["https://oauth.withings.com/account/authorize"]},
               {"site" : 'xero', "url" : ["https://api.xero.com/oauth/Authorize"]},
               {"site" : 'xing', "url" : ["https://api.xing.com/v1/authorize"]},
               {"site" : 'goodreads', "url" : ["http://www.goodreads.com/oauth"]},
               {"site" : 'google app engine', "url" : ["https://accounts.google.com/o/oauth2/v2/auth"]},
               {"site" : 'groundspeak', "url" : ["groundspeak.com/oauth"]},
               {"site" : 'intel cloud services', "url" : []},
               {"site" : 'jive', "url" : ["jiveon.com/oauth2"]},
               {"site" : "linkedin", "url" : ["https://www.linkedin.com/oauth/v2/authorization"]},
               {"site" : 'trello', "url" : ["https://trello.com/1/OAuthAuthorizeToken", "https://trello.com/1/authorize"]},
               {"site" : 'tumblr', "url" : ["https://www.tumblr.com/oauth/authorize"]},
               {"site" : 'microsoft', "url" : ["https://login.live.com/oauth20"]},
               {"site" : 'mixi', "url" : ["api.mixi-platform.com/OAuth"]},
               {"site" : 'myspace', "url" : ["api.myspace.com/authorize"]},
               {"site" : 'etsy', "url" : ["https://www.etsy.com/oauth"]},
               {"site" : 'evernote', "url" : ["https://sandbox.evernote.com/OAuth.action"]},
               {"site" : 'yelp', "url" : ["https://api.yelp.com/oauth2"]},
               {"site" : 'facebook', "url" : ["fb-login-button", "https://www.facebook.com/v2.0/dialog/oauth"]},
               {"site" : "dropbox", "url" : ["https://www.dropbox.com/1/oauth2/authorize", "https://www.dropbox.com/1/oauth/authorize"]},
               {"site" : "twitch", "url" : ["https://api.twitch.tv/kraken/oauth2/authorize"]},
               {"site" : "stripe", "url" : ["https://connect.stripe.com/oauth/authorize"]},
               {"site" : 'basecamp', "url" : ["https://launchpad.37signals.com/authorization/new"]},
               {"site" : "box", "url" : ["https://account.box.com/api/oauth2/authorize"]},
               {"site" : "formstack", "url" : ["https://www.formstack.com/api/v2/oauth2/authorize"]},
               {"site" : "github", "url" : ["https://github.com/login/oauth/authorize"]},
               {"site" : "reddit", "url" : ["https://www.reddit.com/api/v1/authorize"]},
               {"site" : "instagram", "url" : ["https://api.instagram.com/oauth/authorize"]},
               {"site" : "foursquare", "url" : ["https://foursquare.com/oauth2/authorize"]},
               {"site" : "fitbit", "url" : ["https://www.fitbit.com/oauth2/authorize"]},
               {"site" : "imgur", "url" : ["https://api.imgur.com/oauth2/authorize"]},
               {"site" : "salesforce", "url" : ["https://login.salesforce.com/services/oauth2/authorize"]},
               {"site" : "strava", "url" : ["https://www.strava.com/oauth/authorize"]},
               {"site" : "battle.net", "url" : ["https://us.battle.net/oauth/authorize"]}]
        # Keyword regexes: k0 oauth, k1 openid, k2 login, k3 signin, k4 signup.
        k0 = re.compile('oauth', re.I)
        k1 = re.compile('openid', re.I)
        k2 = re.compile('log[\-\S]?[io]n', re.I)
        k3 = re.compile('sign[\-\S]?[io]n', re.I)
        k4 = re.compile('Sign\S?up', re.I)
        # NOTE(review): k4 and e0-e3 below are compiled but never used.
        e0 = re.compile('social', re.I)
        e1 = re.compile('subscribe', re.I)
        e2 = re.compile('connect', re.I)
        e3 = re.compile('like', re.I)
        for each in sso:
            compiled = re.compile(each['site'], re.I | re.S)
            if compiled.search(inputstr) is not None:
                if k0.search(inputstr) is not None:
                    if len(each['url']) > 0:
                        # Provider has known endpoint URLs: require one of
                        # them to appear too.
                        for url in each['url']:
                            c_url = re.compile(url, re.I)
                            if c_url.search(inputstr) is not None:
                                if stype == 'login':
                                    if k2.search(inputstr) is not None or k3.search(inputstr) is not None:
                                        if each['site'] not in self.sso_info["loginSSO"]:
                                            self.sso_info["url"] = self.first_url
                                            self.sso_info["loginSSO"].append(each['site'])
                                elif stype == 'signup':
                                    if each['site'] not in self.sso_info["signupSSO"]:
                                        self.sso_info["url"] = self.first_url
                                        self.sso_info["signupSSO"].append(each['site'])
                    else:
                        # No endpoint fingerprints: provider name + 'oauth'
                        # is enough.
                        if stype == 'login':
                            if k2.search(inputstr) is not None or k3.search(inputstr) is not None:
                                if each['site'] not in self.sso_info["loginSSO"]:
                                    self.sso_info["url"] = self.first_url
                                    self.sso_info["loginSSO"].append(each['site'])
                        elif stype == 'signup':
                            if each['site'] not in self.sso_info["signupSSO"]:
                                self.sso_info["url"] = self.first_url
                                self.sso_info["signupSSO"].append(each['site'])
                elif k1.search(inputstr) is not None:
                    # OpenID variant of the same recording logic.
                    if stype == 'login':
                        if k2.search(inputstr) is not None or k3.search(inputstr) is not None:
                            if each['site'] not in self.sso_info["loginSSO"]:
                                self.sso_info["url"] = self.first_url
                                self.sso_info["loginSSO"].append(each['site'])
                    elif stype == 'signup':
                        if each['site'] not in self.sso_info["signupSSO"]:
                            self.sso_info["url"] = self.first_url
                            self.sso_info["signupSSO"].append(each['site'])

    def write_to_file(self):
        """Drop sites with no SSO findings, then dump findings and logs to
        log.txt / exceptions.txt / errors.txt as JSON."""
        # NOTE(review): deleting from self.candidates while iterating it can
        # skip the element that slides into the removed slot, so some empty
        # entries may survive -- confirm before relying on the output.
        for each in self.candidates:
            if not(len(each['loginSSO']) > 0 or len(each['signupSSO']) > 0):
                i = self.candidates.index(each)
                del self.candidates[i]
        file_exception = open("exceptions.txt", "w")
        file_exception.write(json.dumps(self.fileExceptionList))
        file_exception.close()
        logFile = open("log.txt", "w")
        logFile.write(json.dumps(self.candidates))
        logFile.close()
        sys_exceptions = open("errors.txt", "w")
        sys_exceptions.write(json.dumps(self.fileLogList))
        sys_exceptions.close()

    def done(self):
        """Shut down the PhantomJS driver."""
        self.driver.quit()
class HtmlURLUtil:
    """HTML request utility class.  (docstring translated from Chinese)

    urllib: Python core library, used here only for URL manipulation, not
        for issuing requests.
    tld (top level domain): URL domain processing tool -- well, not that
        powerful, but convenient.
    selenium: powerful web automation / testing tool.
    phantomJS: headless WebKit; its requests can crawl search-engine result
        pages (handles AJAX).

    NOTE(review): Python 2 code (iteritems, reload(sys), urllib.splitquery,
    urlparse module).
    """
    # Desktop-Chrome user agent sent with every PhantomJS request.
    __USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) " \
                   "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36"

    def __init__(self, driver=None):
        # driver: optional pre-built WebDriver; getHtml() replaces it anyway.
        self.driver = driver
        # Default HTTP headers injected into PhantomJS page requests.
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
            # 'Accept-Encoding': '*',
            'Cache-Control': 'max-age=0',
            'User-Agent': HtmlURLUtil.__USER_AGENT,
            'Connection': 'keep-alive',
            'Referer': 'https://www.baidu.com/'
        }

    def getHtml(self, url, referer="https://www.baidu.com/"):
        """Fetch *url* with a fresh PhantomJS driver and return the rendered
        page source ("" on any failure). The driver is quit afterwards.

        NOTE(review): if WebDriver construction itself raises, self.driver
        may still be None and the unconditional quit() below will fail.
        """
        _result = ""
        try:
            my_dc = DesiredCapabilities.PHANTOMJS.copy()
            my_dc["browserName"] = "chrome"
            my_dc["platform"] = "mac"
            my_dc["version"] = "63.0.3239.84"
            my_dc["phantomjs.page.settings.loadImages"] = False
            my_dc["phantomjs.page.settings.userAgent"] = HtmlURLUtil.__USER_AGENT
            service_args = ["--load-images=false", "--disk-cache=false", "--ignore-ssl-errors=true"]
            # "--webdriver-logfile=webdriver.log","--webdriver-loglevel=INFO"
            for head, value in self.headers.iteritems():
                my_dc["phantomjs.page.customHeaders.{}".format(head)] = value
            my_dc["phantomjs.page.customHeaders.Referer"] = referer
            self.driver = WebDriver(desired_capabilities=my_dc, service_args=service_args)
            self.driver.set_script_timeout(20)
            self.driver.set_page_load_timeout(30)
            self.driver.implicitly_wait(5)
            self.driver.set_window_size(2560, 1066)
            self.driver.get(url)
            # Save a snapshot image of the page
            # self.driver.save_screenshot(md5_util.md5(url)+".png")
            _result = self.driver.page_source
        except:
            log.getLogger().exception("HtmlURLUtil getHtml error...")
        # self.driver.close()
        self.driver.quit()
        return _result

    def closeWebDriver(self):
        """Quit the current WebDriver session."""
        self.driver.quit()

    def getSortQS(self, url):
        """Return the query string of *url* parsed and sorted, or None when
        there is no query string.
        :param url:
        :return: sorted dict of query parameters, or None
        """
        a = urllib.splitquery(url)
        if len(a) <= 1 or not a[1]:
            return None
        qs = urlparse.parse_qs(a[1])
        # Uses quicksort, O(n log n)
        return sort_util.fastSortDict(qs)

    def getTLD(self, url):
        """Return the top-level-domain object for *url*, or None on failure.
        :param url:
        :return:
        """
        try:
            if not url:
                return None
            web = urllib.splitquery(url)[0]
            return tld.get_tld(web)
        except:
            log.getLogger().exception("getTLD ...%s" % url)
            return None

    def getMd5URL(self, url):
        """MD5-hash *url*: sort its query parameters first, then hash the
        path plus the sorted parameters, so equivalent URLs hash equally.
        :param url:
        :return:
        """
        web = urllib.splitquery(url)[0]
        string = web + str(self.getSortQS(url))
        return md5_util.md5(string)

    def getElementsByTagName(self, elname):
        """Return all elements with tag name *elname* from the current page."""
        return self.driver.find_elements_by_tag_name(elname)

    def writeWebContentToFile(self, webcontent, filepath):
        """Write *webcontent* to *filepath*, creating parent directories.

        NOTE(review): if makedirs/open raises, `f` is unbound and the
        finally clause raises NameError; also reload(sys)/setdefaultencoding
        is a Python-2-only hack.
        """
        if not webcontent:
            return
        reload(sys)
        sys.setdefaultencoding("utf-8")
        try:
            _dir = os.path.dirname(filepath)
            if not os.path.exists(_dir):
                os.makedirs(_dir)
            f = open(filepath, "w")
            f.write(webcontent)
            f.flush()
        except:
            log.getLogger().exception("htmlutil writeWebContentToFile ...")
        finally:
            f.close()

    def getCharset(self, content):
        """Sniff the charset from an HTML document's <meta> tag; defaults to
        "utf-8" when no charset declaration is found."""
        charset = "utf-8"
        m = re.compile('<meta .*(http-equiv="?Content-Type"?.*)?charset="?([a-zA-Z0-9_-]+)"?', re.I)\
            .search(content)
        if m and m.lastindex == 2:
            charset = m.group(2).lower()
        return charset
# Path to the classic-client AddOns interface directory.
TempPath = pathlib.Path(str(os.sep).join(["_classic_", "Interface"]))

a = input("按回车开始自动检测: ")
print("\n开始读取NGA页面,可能需要一点时间……")

# PhantomJS binary name differs between Windows and other platforms.
exename = 'phantomjs.exe' if os.name == 'nt' else 'phantomjs'
phantomjs_path = str(os.sep).join(['selenium', 'bin', exename])
driver = WebDriver(executable_path=phantomjs_path, port=5001)

page_link = "http://nga.178.com/read.php?tid=18302645&_ff=240"
driver.get(page_link)
try:
    # Wait until at least one quote block has rendered on the thread page.
    WebDriverWait(driver, 15).until(
        EC.presence_of_element_located((By.CLASS_NAME, "quote")))
    print("已读取:", driver.title, "\n")
except Exception as e:
    print(e)

quote_list = driver.find_elements_by_class_name("quote")
# First token of the third quote, e.g. 2020/04/03(1.13.3.47)
# NOTE(review): if the wait above timed out, quote_list may be short and
# the index below raises IndexError.
version_list = quote_list[2].text.split()[0]
# print(version_list)
def spider_comments(driver: WebDriver, hid: str, n: int) -> int:
    """Scrape page *n* of reviews for hotel *hid* and persist new comments.

    Returns:
        0   on success, or when the page is already fully scraped;
        403 when the page could not be loaded;
        1   when a scraped comment already exists under a different page
            number (signals the caller that pagination has wrapped).
    """
    # A full page holds 15 comments; if they are all stored, nothing to do.
    if Comment.objects.filter(hotel=hid).filter(page=n).count() == 15:
        return 0
    try:
        driver.get('%s/dianping/%s_p%dt0.html' % (ROOT_URL, hid, n))
        driver.implicitly_wait(0.5)
    except (ConnectionRefusedError, urllib.error.URLError,
            ConnectionResetError, TypeError, AttributeError):
        # Network/driver failure: report an HTTP-403-like status code.
        # (The original also `del driver`ed here -- a no-op on a local name.)
        return 403
    try:
        # The second matching container holds the actual comment list.
        comment_list = driver.find_elements_by_css_selector(
            '#divCtripComment > div.comment_detail_list')[1]
    except IndexError:
        # Page may still be rendering; retry with a longer implicit wait.
        driver.implicitly_wait(5)
        try:
            comment_list = driver.find_elements_by_css_selector(
                '#divCtripComment > div.comment_detail_list')[1]
        except IndexError:
            # Fall back to the single container when only one exists.
            comment_list = driver.find_element_by_css_selector(
                '#divCtripComment > div.comment_detail_list')
    if Hotel.objects.filter(hid=hid).count() == 1:
        hotel = Hotel.objects.get(hid=hid)
        if hotel.comments_count == 0:
            # Best effort: parse the total review count from the tab label.
            try:
                comment_text = driver.find_element_by_css_selector(
                    "#commentTab > a").text
                logging.warning("\n%s\n", comment_text)
                hotel.comments_count = int(
                    RE_COMMENT.search(comment_text).group())
                logging.warning("\n%s\n", hotel.comments_count)
                hotel.save()
            except Exception:
                # Deliberate best-effort: count stays 0 if parsing fails.
                pass
    comments = comment_list.find_elements_by_class_name('comment_block')
    for comment in comments:
        try:
            name = comment.find_element_by_class_name(
                'name').find_element_by_tag_name('span').text
            cid = comment.get_attribute('data-cid')
            points = comment.find_element_by_class_name('n').text
            room_type = comment.find_element_by_class_name('room_link').text
            content = comment.find_element_by_class_name(
                'J_commentDetail').text.strip()
        except Exception:
            # Skip malformed comment blocks.
            continue
        # Lazy %-args: formatted only when INFO logging is enabled.
        logging.info('%s\n%s\n%s\n%s\n%s\n%s\n',
                     hid, name, n, room_type, points, content)
        # exists() avoids full COUNT queries just to test membership.
        if not Comment.objects.filter(cid=cid).exists():
            Comment.objects.create(cid=cid, content=content, hotel=hid,
                                   page=n, points=points,
                                   room_type=room_type, name=name)
        elif Comment.objects.filter(cid=cid).exclude(page=n).exists():
            # Same comment already stored under another page number:
            # pagination has wrapped around -- stop here.
            return 1
    return 0