# ---- Example 1 ----
def get_hotel(driver: WebDriver, city: str, n: int) -> None:
    """Scrape page *n* of the hotel search results for *city* and persist
    every hotel row that is not already stored."""
    driver.get('%s/%s/p%d' % (ROOT_URL, city, n))

    driver.implicitly_wait(1)

    container = driver.find_element_by_id('hotel_list')
    for entry in container.find_elements_by_class_name('searchresult_list'):
        hid = str(entry.get_attribute('id'))
        # Only list items whose id is purely numeric are hotel rows.
        if re.match(r'^\d+$', hid) is None:
            continue
        name = driver.find_element_by_xpath('//*[@id="%s"]/ul/li[2]/h2/a' %
                                            hid).get_attribute('title')
        try:
            points = entry.find_element_by_class_name('hotel_value').text
        except Exception:
            # Rows without a rating element are skipped entirely.
            continue
        start_price = entry.find_element_by_class_name('J_price_lowList').text
        about_points = entry.find_element_by_class_name('hotel_judgement').text
        points_count = RE_COMMENT.search(about_points).group()
        logging.info('%s\n%s\n%s\n%s\n%s\n%s\n%s\n' %
                     (city, hid, name, n, points, start_price, points_count))
        # Insert only when the hotel id has not been stored yet.
        if Hotel.objects.filter(hid=hid).count() == 0:
            Hotel.objects.create(city=city,
                                 hid=hid,
                                 name=name,
                                 page=n,
                                 points=points,
                                 start_price=start_price,
                                 points_count=points_count)
# ---- Example 2 ----
class RegisterUser_test(LiveServerTestCase):
    """Live-server Selenium tests: user registration flow and devices grid."""

    def setUp(self):
        self.sel = WebDriver()

    def tearDown(self):
        self.sel.quit()

    def testRegister(self):
        self.doReg()

    def doReg(self):
        """Fill and submit the sign-up form, then verify logout is reachable.

        Was declared ``@staticmethod`` while still taking ``self`` and being
        invoked as ``self.doReg(self)``; a plain instance method with a normal
        ``self.doReg()`` call is equivalent and far less confusing.
        """
        self.sel.get(self.live_server_url)
        signUp = WebDriverWait(self.sel, 2).until(
                EC.element_to_be_clickable((By.ID, "signUp")))
        signUp.click()

        self.sel.save_screenshot("register01.png")

        emailBox = self.sel.find_element_by_id('email')
        firstNameBox = self.sel.find_element_by_id('name')
        lastNameBox = self.sel.find_element_by_id('username')
        passwordBox = self.sel.find_element_by_id('password')
        passwordConfirmBox = self.sel.find_element_by_id('password_c')
        submitBtn = self.sel.find_element_by_id('regUserSubmit')

        emailBox.send_keys('*****@*****.**')
        firstNameBox.send_keys('John')
        lastNameBox.send_keys('Doe')
        passwordBox.send_keys('pw')
        passwordConfirmBox.send_keys('pw')
        self.sel.save_screenshot("register02.png")
        submitBtn.click()

        try:
            dropData = WebDriverWait(self.sel, 2).until(
                    EC.visibility_of_element_located((By.ID, 'dropData')))
            self.sel.save_screenshot("register03.png")
            dropData.click()
            # Wait (for the side effect only) until the logout link is
            # clickable before attempting to log out.
            WebDriverWait(self.sel, 2).until(
                EC.element_to_be_clickable((By.ID, 'navLogout')))
            AppFuncs.doLogout(self)
        except Exception:  # was a bare ``except:`` — would also hide SystemExit
            AppFuncs.doLogout(self)
            self.assertTrue(
                False,
                'Reg failed. Unable to find logout element.')

    def test_that_home_view_shows_devices_grid(self):
        """The devices page renders one grid cell per mocked device."""
        url = '%s%s' % (self.live_server_url, reverse('devices'))
        driver = WebDriver()
        driver.get(url)

        waiter = WebDriverWait(driver, WEBDRIVER_MAX_TIMEOUT)
        waiter.until(EC.presence_of_all_elements_located((By.CLASS_NAME, 'device')))

        device_grid = driver.find_element_by_id('device-grid')
        devices = device_grid.find_elements_by_class_name('device')
        self.assertEqual(len(MOCK_DEVICES), len(devices))

        # Every mock device's addresses must appear somewhere in the grid text.
        for device in MOCK_DEVICES:
            self.assertIn(device.ip_address, device_grid.text)
            self.assertIn(device.mac_address, device_grid.text)
# ---- Example 4 ----
class SeleniumTestCase(LiveServerTestCase):
    """Live-server test base that owns a Selenium WebDriver instance."""

    def setUp(self):
        super(SeleniumTestCase, self).setUp()
        self.selenium = WebDriver()

    def tearDown(self):
        self.selenium.quit()
        super(SeleniumTestCase, self).tearDown()

    def loginAsUser(self):
        """Log in through the /accounts/login/ form as user/demo."""
        login_url = '%s%s' % (self.live_server_url, '/accounts/login/')
        self.selenium.get(login_url)
        for field, value in (("username", 'user'), ("password", 'demo')):
            box = self.selenium.find_element_by_name(field)
            box.send_keys(value)
        self.selenium.find_element_by_xpath('//input[@value="Login"]').click()
# ---- Example 5 ----
class SeleniumTestCase(LiveServerTestCase):
    """Live-server test case that manages a single Selenium session."""

    def setUp(self):
        super(SeleniumTestCase, self).setUp()
        self.selenium = WebDriver()

    def tearDown(self):
        self.selenium.quit()
        super(SeleniumTestCase, self).tearDown()

    def loginAsUser(self):
        """Authenticate as the demo user via the login form."""
        browser = self.selenium
        browser.get('%s%s' % (self.live_server_url, '/accounts/login/'))
        browser.find_element_by_name("username").send_keys('user')
        browser.find_element_by_name("password").send_keys('demo')
        browser.find_element_by_xpath('//input[@value="Login"]').click()
# ---- Example 6 ----
def search_google(queries):
    """Run each query in *queries* through Google under PhantomJS.

    Each element of *queries* is a dict with at least the keys ``"q"`` (the
    search string) and ``"pr"`` (used to name the per-query screenshot); the
    result of ``get_number(driver)`` is stored back under ``"response"``.
    """
    driver = None
    try:
        driver = WebDriver("./filtering/phantomjs")

        driver.get("http://www.google.com")
        w = wait.WebDriverWait(driver, 5)
        sleep(1)
        w.until(lambda x: driver.execute_script("return document.readyState;") == "complete")

        for keyword in queries:
            elem = driver.find_elements_by_name("q")[0]
            elem.click()
            elem.clear()
            elem.send_keys(keyword["q"])
            elem.send_keys(Keys.RETURN)
            sleep(1)
            w.until(lambda x: driver.execute_script("return document.readyState;") == "complete")
            keyword["response"] = get_number(driver)
            driver.save_screenshot("%s.png" % keyword["pr"])

    except Exception:  # was a bare ``except:`` — would also swallow KeyboardInterrupt
        traceback.print_exc()
        if driver:
            driver.save_screenshot("test.png")

    finally:
        if driver:
            # quit() (not close()) so the PhantomJS process itself is
            # terminated, not just the current window.
            driver.quit()
# ---- Example 7 ----
 def test_url_phantom(self, browser: WebDriver):
     """Open astrodate.ru in *browser* and save a screenshot.

     NOTE(review): the sleeps (1s/5s/20s) are fixed ad-hoc waits; an explicit
     WebDriverWait on a page element would be more reliable — confirm intent.
     """
     time.sleep(1)
     browser.get("http://astrodate.ru/#/") # url associated with button click
     time.sleep(5)
     browser.save_screenshot("image.png")
     time.sleep(20)
# ---- Example 8 ----
# Python language bindings for Selenium WebDriver
# https://pypi.python.org/pypi/selenium
from selenium.webdriver.phantomjs.webdriver import WebDriver

# Launch a headless PhantomJS session, load Baidu's home page and print the
# page title.  (Renamed the misspelled identifier ``dirver`` -> ``driver``.)
driver = WebDriver(
    executable_path="../example/phantomjs-2.1.1-windows/bin/phantomjs.exe")
driver.get('http://www.baidu.com')
print(driver.title)
# ---- Example 9 ----
# from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait  # available since 2.4.0
from selenium.webdriver.support import expected_conditions as EC  # available since 2.26.0
from selenium.webdriver.phantomjs.webdriver import WebDriver

# Create a new instance of the PhantomJS (headless WebKit) driver,
# listening on local port 5001
driver = WebDriver(
    executable_path='/opt/phantomjs-2.1.1-linux-x86_64/bin/phantomjs',
    port=5001)

# go to the Baidu home page
driver.get("http://www.baidu.com")

# print the initial page title
print(driver.title)

# find the search box element whose id attribute is "kw"
inputElement = driver.find_element_by_id("kw")

# type in the search
inputElement.send_keys("cheese!")

# submit the form (Baidu runs the search on submit)
inputElement.submit()
try:
    # we have to wait for the page to refresh, the last thing that seems to be updated is the title
    WebDriverWait(driver, 10).until(EC.title_contains("cheese!"))

    # You should see "cheese! - Google Search"
# ---- Example 10 ----
class Search:
    """Crawl a list of websites with PhantomJS and record which single
    sign-on (SSO / OAuth) providers their login and signup pages reference.

    NOTE(review): this is Python 2 code (``print site`` statements); it will
    not run under Python 3 as written.
    """

    def __init__(self, path, websites):
        # path: PhantomJS executable path; websites: bare domains (no scheme).
        self.exec_path = path
        self.driver = PhantomJS(executable_path=self.exec_path)
        self.websites = websites
        self.fileExceptionList = []  # URL-extraction failures
        self.fileLogList = []        # page-load failures
        self.candidates = []         # per-site SSO findings
        # The crawl starts immediately on construction.
        self.begin_search_for_SSO()

    def begin_search_for_SSO(self):
        """Visit every configured site and probe its login/signup pages."""
        for site in self.websites:
            print site
            self.driver.set_page_load_timeout(60)
            self.first_url = "https://www."+site
            self.sso_info = {"url" : self.first_url, "loginSSO" : [], "signupSSO" : []}
            try:
                self.driver.get(self.first_url)
            except URLError as u:
                log_obj = {
                    "url" : self.first_url,
                    "reason" : "An exception occurred during first time page load url error"
                }
                self.fileLogList.append(log_obj)
                continue
            # NOTE(review): catching (TimeoutException, Exception) is the same
            # as catching Exception alone — every remaining error lands here.
            except (TimeoutException, Exception) as t:
                log_obj = {
                    "url" : self.first_url,
                    "reason" : "An exception occurred during first time page load timeout exception"
                }
                self.fileLogList.append(log_obj)
                continue
            else:
                # Probe the login flow first, then the signup flow, re-parsing
                # the page after each navigation.
                first_url_doc = self.parse_web_page()
                self.process_page_for_click_candidates(first_url_doc, "login")
                self.search_for_SSO(self.parse_web_page(), "login")
                self.process_page_for_click_candidates(self.parse_web_page(), "signup")
                self.search_for_SSO(self.parse_web_page(), "signup")
                self.candidates.append(self.sso_info)
                print self.candidates
        self.write_to_file()
        self.done()

    def parse_web_page(self):
        """Return the current driver page parsed into a BeautifulSoup tree."""
        html = self.driver.page_source
        parsed = BeautifulSoup(html, "lxml")
        return parsed

    def process_page_for_click_candidates(self, document, ptype):
        """Find a login/signup link in *document* and navigate to it.

        ptype is either 'login' or 'signup'; failures are appended to the
        exception/log lists instead of raising.
        """
        if ptype == 'login':
            found_elems = document.find_all(["a", "button", "span", "div", "img"], string=['Log In', 'LogIn', 'Login', 'Log in', 'Sign in', 'Sign In', 'Signin', 'SignIn', 'SIGNIN', 'SIGN IN', 'LOGIN', 'LOG IN', 'login', 'log in', 'signin', 'sign in'])
        elif ptype == 'signup':
            found_elems = document.find_all(["a", "button", "span", "div", "img"], string=['Sign Up', 'Signup', 'SignUp', 'Sign up', 'SIGN UP', 'SIGNUP', 'sign up', 'signup', 'Create account', 'Create Account', 'CREATE ACCOUNT', 'create account'])
        
        found_url = self.extract_url(found_elems)
        
        if(found_url):
            # Resolve site-relative links against the site root.
            if found_url.startswith("/"):
                found_url = self.first_url + found_url
            try:
                self.driver.get(found_url)
            except URLError as u:
                log_obj = {
                    "url" : self.first_url,
                    "reason" : "An exception occurred during click candidate process url error"
                }
                self.fileLogList.append(log_obj)
                return
            except (TimeoutException, Exception) as t:
                log_obj = {
                    "url" : self.first_url,
                    "reason" : "An exception occurred during click candidate process timeout exception"
                }
                self.fileLogList.append(log_obj)
                return
        else:
            if ptype == 'login':
                exception_obj = {
                        "url" : self.first_url,
                        "reason" : "login url extraction failed"
                    }
            else:
                exception_obj = {
                    "url" : self.first_url,
                    "reason" : "Sign up url extraction failed"
                }                     
            self.fileExceptionList.append(exception_obj)
            
    def extract_url(self, elems):
        """Return the first href found on *elems* or their <a>/<button>
        parents; implicitly returns None when nothing has an href."""
        while(len(elems) > 0):
            each = elems.pop()
            url = each.get('href')
            if url is not None:
                return url
            else:
                parent = each.find_parent('a') or each.find_parent('button')
                if parent is None:
                    continue
                else:
                    p_url = parent.get('href')
                    if p_url is None:
                        continue
                    else:
                        return p_url

    def search_for_SSO(self, document, stype):
        """Walk the DOM tree of *document* and run every candidate element
        through process_node for SSO keyword detection."""
        stack = []
        if document is not None and document.body is not None:
            stack = document.body.contents
            while(len(stack) > 0):
                current = stack.pop()
                if(not(isinstance(current, NavigableString))):
                    children = current.contents
                    if len(children) > 0:
                        for child in children:
                            if(not(isinstance(child, NavigableString))):
                                stack.insert(0, child)

                    # Skip script/embed tags and attribute-less nodes.
                    if not(current.name == 'script' or current.attrs == None or current.name == 'embed'):
                        self.process_node(current, stype)



    def process_node(self, node, stype):
        """Flatten a node's text plus its attributes into one string and
        scan it for SSO keywords."""
        if self.filter_node_on_type(node):
            attrs = node.attrs
            str_to_check = node.string or ''
            for key in attrs:
                try:
                    str_to_check += key+'='+str(attrs[key])+';'
                except UnicodeError:
                    continue
                
            self.check_for_keywords(str_to_check, stype)




    def filter_node_on_type(self, node):
        """Return True for clickable-looking tags worth scanning.

        NOTE(review): BeautifulSoup tag names are lowercase, so the
        ``node.name == "A"`` branch never matches — and if it ever did,
        ``.toLowerCase()``/``.indexOf()`` are JavaScript methods, not Python,
        so it would raise AttributeError.  Confirm and fix upstream.
        """
        if (node.name != "a" and node.name != "div" and node.name != "img" and
            node.name != "span" and node.name != "input" and
            node.name != "button"):
            return False
        if (node.name == "input"):
            if (node.type != "button" and node.type != "img" and
            node.type != "submit"):
                return False
        if (node.name == "A"):
            if (node.get('href').toLowerCase().indexOf('mailto:') == 0):
                return False
        return True

            
    def check_for_keywords(self, inputstr, stype):
        """Match *inputstr* against the known SSO provider table and record
        hits in self.sso_info under loginSSO/signupSSO."""
        # Provider table: site name plus the OAuth endpoint URL fragments
        # that identify it inside markup.
        sso = [{"site" : 'google', "url" : ["https://accounts.google.com/o/oauth2/auth"]}, 
        {"site" : 'yahoo', "url" : ["https://api.login.yahoo.com/oauth2/request_auth"]}, 
        {"site" : '500px', "url": ["https://api.500px.com/v1/oauth"]}, 
        {"site" : 'aol', "url" :["https://api.screenname.aol.com/auth"]}, 
        {"site" : 'twitter', "url" : ["https://api.twitter.com/oauth"]}, 
        {"site" : 'vk', "url" : ["https://oauth.vk.com/authorize"]}, 
        {"site" : 'yammer', "url" : ["https://www.yammer.com/oauth2/authorize"]}, 
        {"site" : 'yandex', "url" : ["https://oauth.yandex.com/authorize"]},
        {"site" : 'zendesk', "url" : [".zendesk.com/oauth/authorizations/new"]}, 
        {"site" : 'amazon', "url" : ["http://g-ecx.images-amazon.com/images/G/01/lwa/btnLWA", "https://images-na.ssl-images-amazon.com/images/G/01/lwa/btnLWA"]},
        {"site" : 'flickr', "url" : ["https://www.flickr.com/services/oauth"]}, 
        {"site" : 'bitbucket', "url" : ["https://bitbucket.org/site/oauth2", "https://bitbucket.org/api/1.0/oauth"]}, 
        {"site" : 'bitly', "url" : ["https://bitly.com/oauth"]}, 
        {"site" : 'cloud foundry', "url" : ["/uaa/oauth"]}, 
        {"site" : 'dailymotion', "url" : ["https://www.dailymotion.com/oauth"]}, 
        {"site" : 'deviantART', "url" : ["https://www.deviantart.com/oauth2"]}, 
        {"site" : 'discogs', "url" : ["https://api.discogs.com/oauth"]}, 
        {"site" : 'huddle', "url" : ["https://login.huddle.net/request"]}, 
        {"site" : 'netflix', "url" : ["https://api-user.netflix.com/oauth"]}, 
        {"site" : 'openlink data spaces', "url" : ["/OAuth"]}, 
        {"site" : 'openstreetmap', "url" : ["http://www.openstreetmap.org/oauth"]}, 
        {"site" : 'opentable', "url" : ["http://www.opentable.com/oauth"]}, 
        {"site" : 'passport', "url" : ["/dialog/authorize", "oauth2/authorize", "oauth/authorize"]},
        {"site" : 'paypal', "url" : ["paypal.com/v1/oauth2"]}, 
        {"site" : 'plurk', "url" : ["https://www.plurk.com/OAuth/authorize"]},
        {"site" : 'sina weibo', "url" : ["http://api.t.sina.com.cn/oauth/authorize"]},
        {"site" : 'stack exchange', "url" : ["https://stackexchange.com/oauth"]}, 
        {"site" : 'statusnet', "url" : ["status.net/api/oauth/authorize"]}, 
        {"site" : 'ubuntu one', "url" : ["https://login.ubuntu.com/api/1.0/authentications"]},
        {"site" : 'viadeo', "url" : ["https://partners.viadeo.com/oauth/authorize"]},
        {"site" : 'vimeo', "url" : ["https://api.vimeo.com/oauth/authorize"]}, 
        {"site" : 'withings', "url" : ["https://oauth.withings.com/account/authorize"]},
        {"site" : 'xero', "url" : ["https://api.xero.com/oauth/Authorize"]},
        {"site" : 'xing', "url" : ["https://api.xing.com/v1/authorize"]}, 
        {"site" : 'goodreads', "url" : ["http://www.goodreads.com/oauth"]}, 
        {"site" : 'google app engine', "url" : ["https://accounts.google.com/o/oauth2/v2/auth"]},
        {"site" : 'groundspeak', "url" : ["groundspeak.com/oauth"]}, 
        {"site" : 'intel cloud services', "url" : []}, 
        {"site" : 'jive', "url" : ["jiveon.com/oauth2"]}, 
        {"site" : "linkedin", "url" : ["https://www.linkedin.com/oauth/v2/authorization"]}, 
        {"site" : 'trello', "url" : ["https://trello.com/1/OAuthAuthorizeToken", "https://trello.com/1/authorize"]}, 
        {"site" : 'tumblr', "url" : ["https://www.tumblr.com/oauth/authorize"]}, 
        {"site" : 'microsoft', "url" : ["https://login.live.com/oauth20"]},
        {"site" : 'mixi', "url" : ["api.mixi-platform.com/OAuth"]}, 
        {"site" : 'myspace', "url" : ["api.myspace.com/authorize"]}, 
        {"site" : 'etsy', "url" : ["https://www.etsy.com/oauth"]}, 
        {"site" : 'evernote', "url" : ["https://sandbox.evernote.com/OAuth.action"]},  
        {"site" : 'yelp', "url" : ["https://api.yelp.com/oauth2"]},  
        {"site" : 'facebook', "url" : ["fb-login-button", "https://www.facebook.com/v2.0/dialog/oauth"]},
        {"site" : "dropbox", "url" : ["https://www.dropbox.com/1/oauth2/authorize", "https://www.dropbox.com/1/oauth/authorize"]}, 
        {"site" : "twitch", "url" : ["https://api.twitch.tv/kraken/oauth2/authorize"]},
        {"site" : "stripe", "url" : ["https://connect.stripe.com/oauth/authorize"]},
        {"site" : 'basecamp', "url" : ["https://launchpad.37signals.com/authorization/new"]},
        {"site" : "box", "url" : ["https://account.box.com/api/oauth2/authorize"]},
        {"site" : "formstack", "url" : ["https://www.formstack.com/api/v2/oauth2/authorize"]},
        {"site" : "github", "url" : ["https://github.com/login/oauth/authorize"]},
        {"site" : "reddit", "url" : ["https://www.reddit.com/api/v1/authorize"]},
        {"site" : "instagram", "url" : ["https://api.instagram.com/oauth/authorize"]},
        {"site" : "foursquare", "url" : ["https://foursquare.com/oauth2/authorize"]},
        {"site" : "fitbit", "url" : ["https://www.fitbit.com/oauth2/authorize"]},
        {"site" : "imgur", "url" : ["https://api.imgur.com/oauth2/authorize"]},
        {"site" : "salesforce", "url" : ["https://login.salesforce.com/services/oauth2/authorize"]},
        {"site" : "strava", "url" : ["https://www.strava.com/oauth/authorize"]},
        {"site" : "battle.net", "url" : ["https://us.battle.net/oauth/authorize"]}]
        # Keyword patterns: k* are positive signals, e* are currently unused.
        k0 = re.compile('oauth', re.I)
        k1 = re.compile('openid', re.I)
        k2 = re.compile('log[\-\S]?[io]n', re.I)
        k3 = re.compile('sign[\-\S]?[io]n', re.I)
        k4 = re.compile('Sign\S?up', re.I)
        e0 = re.compile('social', re.I)
        e1 = re.compile('subscribe', re.I)
        e2 = re.compile('connect', re.I)
        e3 = re.compile('like', re.I)

        

        for each in sso:
            compiled = re.compile(each['site'], re.I | re.S)
            if compiled.search(inputstr) is not None:
                # Provider name present plus an 'oauth' mention: confirm via
                # the provider's known endpoint URLs when any are listed.
                if k0.search(inputstr) is not None:
                    if len(each['url']) > 0:
                        for url in each['url']:
                            c_url = re.compile(url, re.I)
                            if c_url.search(inputstr) is not None:
                                if stype == 'login':
                                    if k2.search(inputstr) is not None or k3.search(inputstr) is not None:
                                        if each['site'] not in self.sso_info["loginSSO"]:
                                            self.sso_info["url"] = self.first_url
                                            self.sso_info["loginSSO"].append(each['site'])
                                elif stype == 'signup':
                                    if each['site'] not in self.sso_info["signupSSO"]:
                                        self.sso_info["url"] = self.first_url
                                        self.sso_info["signupSSO"].append(each['site'])
                    else:
                        if stype == 'login':
                            if k2.search(inputstr) is not None or k3.search(inputstr) is not None:
                                if each['site'] not in self.sso_info["loginSSO"]:
                                    self.sso_info["url"] = self.first_url
                                    self.sso_info["loginSSO"].append(each['site'])
                        elif stype == 'signup':
                            if each['site'] not in self.sso_info["signupSSO"]:
                                self.sso_info["url"] = self.first_url
                                self.sso_info["signupSSO"].append(each['site'])
                # 'openid' mention is accepted without URL confirmation.
                elif k1.search(inputstr) is not None:
                    if stype == 'login':
                        if k2.search(inputstr) is not None or k3.search(inputstr) is not None:
                            if each['site'] not in self.sso_info["loginSSO"]:
                                self.sso_info["url"] = self.first_url
                                self.sso_info["loginSSO"].append(each['site'])
                    elif stype == 'signup':
                        if each['site'] not in self.sso_info["signupSSO"]:
                            self.sso_info["url"] = self.first_url
                            self.sso_info["signupSSO"].append(each['site'])
        

    def write_to_file(self):
        """Drop empty results and dump findings/exceptions/logs to disk.

        NOTE(review): deleting from self.candidates while iterating it can
        skip the element following each removal — confirm whether consecutive
        empty entries should both be dropped.
        """
        for each in self.candidates:
            if not(len(each['loginSSO']) > 0 or len(each['signupSSO']) > 0):
                i = self.candidates.index(each)
                del self.candidates[i]
        file_exception = open("exceptions.txt", "w")
        file_exception.write(json.dumps(self.fileExceptionList))
        file_exception.close()
        
        logFile = open("log.txt", "w")
        logFile.write(json.dumps(self.candidates))
        logFile.close()

        sys_exceptions = open("errors.txt", "w")
        sys_exceptions.write(json.dumps(self.fileLogList))
        sys_exceptions.close()

    def done(self):
        # Shut down the PhantomJS process.
        self.driver.quit()
# ---- Example 11 ----
class HtmlURLUtil:
    """
        HTML request utility class.
        urllib: Python core library, used here only for URL handling, not
            for issuing the requests themselves.
        tld (top level domain): convenient URL/domain handling helper.
        selenium: web automation / testing toolkit.
        phantomJS: headless WebKit; issuing requests through it can fetch
            search-engine results rendered via AJAX.

        NOTE(review): Python 2 code (``iteritems``, ``urllib.splitquery``,
        ``reload(sys)``) — will not run under Python 3 as written.
    """
    __USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) " \
                   "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36"

    def __init__(self, driver=None):
        self.driver = driver
        # Default request headers presented by the PhantomJS session.
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
            # 'Accept-Encoding': '*',
            'Cache-Control': 'max-age=0',
            'User-Agent': HtmlURLUtil.__USER_AGENT,
            'Connection': 'keep-alive',
            'Referer': 'https://www.baidu.com/'
        }

    def getHtml(self, url, referer="https://www.baidu.com/"):
        """Fetch *url* through a fresh PhantomJS session and return the
        rendered page source ('' on failure)."""
        _result = ""
        try:
            # Masquerade as desktop Chrome and disable image loading.
            my_dc = DesiredCapabilities.PHANTOMJS.copy()
            my_dc["browserName"] = "chrome"
            my_dc["platform"] = "mac"
            my_dc["version"] = "63.0.3239.84"
            my_dc["phantomjs.page.settings.loadImages"] = False
            my_dc["phantomjs.page.settings.userAgent"] = HtmlURLUtil.__USER_AGENT

            service_args = ["--load-images=false", "--disk-cache=false",
                            "--ignore-ssl-errors=true"]
            # "--webdriver-logfile=webdriver.log","--webdriver-loglevel=INFO"
            for head, value in self.headers.iteritems():
                my_dc["phantomjs.page.customHeaders.{}".format(head)] = value

            my_dc["phantomjs.page.customHeaders.Referer"] = referer
            self.driver = WebDriver(desired_capabilities=my_dc, service_args=service_args)
            self.driver.set_script_timeout(20)
            self.driver.set_page_load_timeout(30)
            self.driver.implicitly_wait(5)
            self.driver.set_window_size(2560, 1066)

            self.driver.get(url)
            # Save a snapshot image of the page
            # self.driver.save_screenshot(md5_util.md5(url)+".png")
            _result = self.driver.page_source
        except:  # NOTE(review): bare except — logs then quits the driver.
            log.getLogger().exception("HtmlURLUtil  getHtml error...")
            # self.driver.close()
            self.driver.quit()
        return _result

    def closeWebDriver(self):
        # Terminate the PhantomJS session.
        self.driver.quit()

    def getSortQS(self, url):
        """
        Return the query string of *url* parsed and sorted, or None when
        there is no query string.
        :param url:
        :return:
        """
        a = urllib.splitquery(url)
        if len(a) <= 1 or not a[1]:
            return None
        qs = urlparse.parse_qs(a[1])
        # Sorted with quicksort, O(n log n)
        return sort_util.fastSortDict(qs)

    def getTLD(self, url):
        """
        Return the top-level-domain object for *url*, or None on failure.
        :param url:
        :return:
        """
        try:
            if not url:
                return None
            web = urllib.splitquery(url)[0]
            return tld.get_tld(web)
        except:  # NOTE(review): bare except — failure is logged, None returned.
            log.getLogger().exception("getTLD ...%s" % url)
        return None

    def getMd5URL(self, url):
        """
        MD5-hash *url*: the query parameters are sorted first so that
        equivalent URLs hash identically.
        :param url:
        :return:
        """
        web = urllib.splitquery(url)[0]
        string = web + str(self.getSortQS(url))
        return md5_util.md5(string)

    def getElementsByTagName(self, elname):
        # Thin wrapper over the driver's tag-name lookup.
        return self.driver.find_elements_by_tag_name(elname)

    def writeWebContentToFile(self, webcontent, filepath):
        """Write *webcontent* to *filepath*, creating parent directories.

        NOTE(review): if makedirs/open raises, ``f`` is unbound and the
        ``finally`` clause itself raises NameError — confirm and guard.
        """
        if not webcontent:
            return
        # Python 2 hack to force utf-8 default encoding for the write.
        reload(sys)
        sys.setdefaultencoding("utf-8")
        try:
            _dir = os.path.dirname(filepath)
            if not os.path.exists(_dir):
                os.makedirs(_dir)
            f = open(filepath, "w")
            f.write(webcontent)
            f.flush()
        except:
            log.getLogger().exception("htmlutil writeWebContentToFile ...")
        finally:
            f.close()

    def getCharset(self, content):
        # Extract the charset declared in an HTML <meta> tag; defaults to
        # utf-8 when no declaration is found.
        charset = "utf-8"
        m = re.compile('<meta .*(http-equiv="?Content-Type"?.*)?charset="?([a-zA-Z0-9_-]+)"?', re.I)\
            .search(content)
        if m and m.lastindex == 2:
            charset = m.group(2).lower()
        return charset
# ---- Example 12 ----
# Relative path to the classic-client Interface directory, built with os.sep
# so it works on both Windows and POSIX.
TempPath = pathlib.Path(str(os.sep).join(["_classic_", "Interface"]))

# Prompt (Chinese): "press Enter to start the automatic check".
a = input("按回车开始自动检测: ")

# Status message (Chinese): "reading the NGA page, this may take a while...".
print("\n开始读取NGA页面,可能需要一点时间……")

# PhantomJS binary name differs between Windows ('nt') and other platforms.
exename = 'phantomjs.exe' if os.name == 'nt' else 'phantomjs'

driver = WebDriver(executable_path=str(os.sep).join(
    ['selenium', 'bin', exename]),
                   port=5001)

page_link = "http://nga.178.com/read.php?tid=18302645&_ff=240"

driver.get(page_link)

try:
    # Wait up to 15s for at least one forum quote block to render.
    WebDriverWait(driver, 15).until(
        EC.presence_of_element_located((By.CLASS_NAME, "quote")))
    print("已读取:", driver.title, "\n")  # "loaded: <page title>"
except Exception as e:
    print(e)

quote_list = driver.find_elements_by_class_name("quote")

# First whitespace-separated token of the third quote block, e.g. a
# date plus client version as shown below.
version_list = quote_list[2].text.split()[0]
#2020/04/03(1.13.3.47)

#print(version_list)
# ---- Example 13 ----
def spider_comments(driver: WebDriver, hid: str, n: int) -> int:
    """Scrape page *n* of the Ctrip reviews for hotel *hid* into the DB.

    Returns 0 on success or no-op, 403 when the page fetch failed (the
    caller should recreate the driver), and 1 when a comment was already
    stored under a different page number (pagination has wrapped).
    """
    # A full page holds 15 comments; if all are stored, skip the fetch.
    if Comment.objects.filter(hotel=hid).filter(page=n).count() == 15:
        return 0

    try:
        driver.get('%s/dianping/%s_p%dt0.html' % (ROOT_URL, hid, n))
        driver.implicitly_wait(0.5)
    except (ConnectionRefusedError, urllib.error.URLError,
            ConnectionResetError, TypeError, AttributeError):
        # Drop the broken driver reference and signal the caller to retry.
        del driver
        return 403
    # The comment list is normally the second matching element; fall back to
    # a longer implicit wait, then to the single-element lookup.
    try:
        comment_list = driver.find_elements_by_css_selector(
            '#divCtripComment > div.comment_detail_list')[1]
    except IndexError:
        driver.implicitly_wait(5)
        try:
            comment_list = driver.find_elements_by_css_selector(
                '#divCtripComment > div.comment_detail_list')[1]
        except IndexError:
            comment_list = driver.find_element_by_css_selector(
                '#divCtripComment > div.comment_detail_list')

    # Backfill the hotel's total comment count the first time it is seen.
    if Hotel.objects.filter(hid=hid).count() == 1:
        hotel = Hotel.objects.get(hid=hid)
        if hotel.comments_count == 0:
            try:
                comment_text = driver.find_element_by_css_selector(
                    "#commentTab > a").text

                logging.warning("\n%s\n" % comment_text)
                hotel.comments_count = int(
                    RE_COMMENT.search(comment_text).group())
                logging.warning("\n%s\n" % hotel.comments_count)
                hotel.save()
            except Exception:
                # Best effort only — the count stays 0 if parsing fails.
                pass

    comments = comment_list.find_elements_by_class_name('comment_block')

    for comment in comments:
        try:
            name = comment.find_element_by_class_name(
                'name').find_element_by_tag_name('span').text
            cid = comment.get_attribute('data-cid')
            points = comment.find_element_by_class_name('n').text
            room_type = comment.find_element_by_class_name('room_link').text
            content = comment.find_element_by_class_name(
                'J_commentDetail').text.strip()
        except Exception:
            # Skip malformed comment blocks.
            continue
        logging.info('%s\n%s\n%s\n%s\n%s\n%s\n' %
                     (hid, name, n, room_type, points, content))

        # with sqlite3.connect('../../db.sqlite3') as conn:
        #     with conn.cursor() as cursor:
        #         cursor.execute("select * from get_data_comment where (cid=?)", (cid,))
        if Comment.objects.filter(cid=cid).count() == 0:
            Comment.objects.create(cid=cid,
                                   content=content,
                                   hotel=hid,
                                   page=n,
                                   points=points,
                                   room_type=room_type,
                                   name=name)

        elif not Comment.objects.filter(cid=cid).exclude(page=n).count() == 0:
            # Same comment already stored under another page number —
            # pagination has looped back, so stop crawling this hotel.
            return 1

    del driver
    return 0