def process_request(self, request, spider):
    """Scrapy downloader-middleware hook: render JS-heavy pages via Ghost.

    For spiders named in WEBKIT_DOWNLOADER, open the URL in a webkit
    session and return the rendered DOM as an HtmlResponse, bypassing the
    normal downloader.  Other spiders fall through (implicit None), which
    tells Scrapy to continue with the default download chain.
    """
    if spider.name in WEBKIT_DOWNLOADER:
        gh = Ghost()
        se = Session(gh, download_images=False)
        se.open(request.url)
        # Session.evaluate returns (result, resources); `result` is the
        # evaluated JS value -- here the rendered markup.  The original
        # code encoded `resources` (a list of HttpResource objects),
        # which yields a repr of that list instead of the page HTML.
        result, resources = se.evaluate(
            'document.documentElement.innerHTML')
        # Keep the live session on the spider so callbacks can reuse it.
        spider.webkit_se = se
        rendered_body = (result or u'').encode('utf8')
        return HtmlResponse(request.url, body=rendered_body)
def run3():
    # Scrape the CNN 2016 Arizona exit-poll page in a visible webkit
    # session.  NOTE(review): `count`, `location`, `html3` and `patten`
    # are assigned but never used here -- this chunk looks truncated;
    # confirm against the full file before refactoring.
    gh = Ghost()
    ss = Session(gh, display=True)
    count = 0
    location = 0
    ss.open('https://edition.cnn.com/election/2016/results/exit-polls/arizona/president')
    # Presumably blocks until the session's wait timeout / page settles
    # -- TODO confirm against the ghost library's Session API.
    ss.wait_timeout()
    html3 = ss.content.encode('utf-8')
    # Matches the opening tag of each exit-poll table cell.
    patten = re.compile(r'<td class="exit-poll__cell">', re.M)
def round_trip(DepartCity, ReturnCity, departDate, returnDate, debug=0):
    """Scrape ctrip.com for the lowest round-trip fare and store it.

    Rotates through `proxypool` until ctrip serves the flight-list page,
    sorts by price, extracts the cheapest fare and writes it to the DB
    via insert_price().  With debug=1 the parsed flight-list nodes are
    returned instead for inspection.

    Fixes: bare `except:` narrowed to `except Exception` (the bare form
    also swallows KeyboardInterrupt/SystemExit), and `== False`/`== True`
    comparisons replaced with direct truthiness tests.
    """
    start_time = datetime.now()
    url = 'http://flights.ctrip.com/international/round-%s-%s-%s-%s?%s&%s&y_s' % (
        DepartCity, ReturnCity, code(DepartCity), code(ReturnCity),
        departDate, returnDate)
    ctrip_access = False
    while not ctrip_access:
        se = Session(Ghost(), wait_timeout=30, wait_callback=None, display=True,
                     viewport_size=(800, 680), download_images=False)
        se.delete_cookies()
        proxy = choice(proxypool)
        se.set_proxy(proxy[0], proxy[1], int(proxy[2]))
        try:
            se.open(url, user_agent=choice(ua_list))
        except Exception:
            # Proxy failed to load the page: blacklist it and retry.
            se.exit()
            del se
            proxypool.remove(proxy)
            print("blacklist %s" % proxy[1])
            continue
        # The sort control only exists when ctrip served the real page.
        ctrip_access = se.exists('li:nth-child(5) > span')
        if not ctrip_access:
            se.exit()
            del se
            proxypool.remove(proxy)
            print("blacklist %s" % proxy[1])
    # Sort by price ascending; a second click reverses, so undo if needed.
    se.click('#sortControls > ul > li:nth-child(5) > span')
    if se.exists('i.icon-reverse'):
        se.click('#sortControls > ul > li:nth-child(5) > span')
    # Wait until the progress bar disappears, i.e. results are loaded.
    se.wait_while_selector('#FI_progBar', timeout=20)
    se.sleep(0.2)
    html = se.content
    soup = BeautifulSoup(html, "html.parser")
    source = soup.select('#flightList > div')
    if debug == 1:
        return source
    # First row after the price sort holds the cheapest itinerary.
    lowest = source[0].select('span.price2')[0].text
    end_time = datetime.now()
    elapsed = (end_time - start_time).seconds
    print('%s-%s往返 %s去 %s回 最低价%s 搜索耗时%s秒' % (DepartCity, ReturnCity,
                                             departDate, returnDate, lowest, elapsed))
    se.exit()
    del se
    # Strip the leading currency symbol before persisting.
    price = lowest[1:]
    insert_price(DepartCity, ReturnCity, departDate, returnDate, price)
def login_qq():
    """Sign in to the QQ mobile portal through a visible webkit session.

    Leaves the authenticated session in the module-level ``se`` so the
    code that runs afterwards can keep driving the browser.
    """
    global se
    # Masquerade as iPhone Safari so the mobile login form is served.
    iphone_ua = 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_1_1 like Mac OS X) AppleWebKit/604.3.5 (KHTML, like Gecko) Version/11.0 Mobile/15B150 Safari/604.1'
    login_url = 'https://ui.ptlogin2.qq.com/cgi-bin/login?style=38&appid=728041403&s_url=https%3A%2F%2Finfoapp.3g.qq.com%2Fg%2Flogin%2Fproxy.jsp%3FsourceUrl%3Dhttps%25253A%25252F%25252Fportal.3g.qq.com%25252F%25253F_r%25253D0.2646472700205946%252526aid%25253Dindex%252526g_f%25253D1283&target=self&low_login=1&low_login_hour=4321&daid=261&islogin=false&uid=-8794356048489038000'
    se = Session(
        Ghost(),
        user_agent=iphone_ua,
        wait_timeout=30,
        wait_callback=None,
        display=True,
        viewport_size=(375, 553),
        download_images=True,
    )
    se.open(login_url)
    # Fill in account and password, then submit and wait for navigation.
    se.set_field_value('#u', '2873723285')
    se.set_field_value('#p', 'tz1006')
    se.click('#go', expect_loading=True)
class YoukuGhostDriver(object): def __init__(self, host, port, timeout): #url = 'http://111.161.35.198:12210/youku_ghost.html' url = 'http://%s:%s/youku_ghost.html' % (host, port) self.ghost = Ghost() self.session = Session(self.ghost, wait_timeout=timeout, plugins_enabled=True) self.session.open(url) def parse(self, vid): try: res = [] self.session.evaluate('window.getPlayUrl("%s")' % vid) success, resources = self.session.wait_for_selector('div[id="ck"]') if success: ck = self.session.evaluate( 'document.getElementById("ck").innerHTML') res = ck[0] except Exception, e: log.app_log.error(traceback.format_exc()) finally:
#!/usr/bin/env python
# coding:utf-8
"""Open Baidu in a visible webkit window and keep it alive for ten seconds."""
from ghost import Ghost, Session
import time

browser = Ghost()
session = Session(browser, display=True)
session.open("https://www.baidu.com/")
# Leave the window up briefly so a human can look at the result.
time.sleep(10)
se.click('tbody > tr:nth-child(%s) > td:nth-child(%s)' % (m, n)) # vcode se.capture_to('s/vcode.png', selector='#ticketImg') image = Image.open('s/vcode.png') vcode = pytesseract.image_to_string(image) se.set_field_value('#ticket', vcode) se.sleep(0.1) se.click('#submit', expect_loading=True) login(username, password) url = 'https://trade.cgws.com/cgi-bin/user/Login' se.open(url) # username se.set_field_value('#fundAccount', username) # password se.fire('#normalpassword', 'focus') se.show() html = se.content soup = BeautifulSoup(html, "html.parser") keys = soup.select('tbody > tr > td') key_list = [] for key in keys: key_list.append(key.text) for i in password: m = (key_list.index(i) // 4) + 1 n = (key_list.index(i) % 4) + 1
class GhostMiddleware(object):
    """Scrapy downloader middleware that renders pages through a Ghost
    (webkit) session so javascript-built content reaches the spider.

    Bug fix: the original chose a random User-Agent in ``__init__`` but
    never handed it to the Session, so every request went out with the
    default webkit UA; it is now passed via ``Session(user_agent=...)``.
    """

    # Pool of desktop User-Agent strings to impersonate.
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/531.21.8 (KHTML, like Gecko) Version/4.0.4 Safari/531.21.10",
        "Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/533.17.8 (KHTML, like Gecko) Version/5.0.1 Safari/533.17.8",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-GB; rv:1.9.1.17) Gecko/20110123 (like Firefox/3.x) SeaMonkey/2.0.12",
        "Mozilla/5.0 (Windows NT 5.2; rv:10.0.1) Gecko/20100101 Firefox/10.0.1 SeaMonkey/2.7.1",
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_8; en-US) AppleWebKit/532.8 (KHTML, like Gecko) Chrome/4.0.302.2 Safari/532.8",
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_4; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.464.0 Safari/534.3",
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_5; en-US) AppleWebKit/534.13 (KHTML, like Gecko) Chrome/9.0.597.15 Safari/534.13",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.186 Safari/535.1",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.54 Safari/535.2",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7",
        "Mozilla/5.0 (Macintosh; U; Mac OS X Mach-O; en-US; rv:2.0a) Gecko/20040614 Firefox/3.0.0 ",
        "Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10.5; en-US; rv:1.9.0.3) Gecko/2008092414 Firefox/3.0.3",
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.5; en-US; rv:1.9.1) Gecko/20090624 Firefox/3.5",
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.6; en-US; rv:1.9.2.14) Gecko/20110218 AlexaToolbar/alxf-2.0 Firefox/3.6.14",
        "Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10.5; en-US; rv:1.9.2.15) Gecko/20110303 Firefox/3.6.15",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1"
    ]

    def __init__(self):
        # Pick one random User-Agent for the lifetime of this session.
        ua = random.choice(self.user_agent_list)
        self.ghost = Ghost()
        self.se = Session(self.ghost, user_agent=ua, display=False,
                          wait_timeout=60, download_images=False)
        super(GhostMiddleware, self).__init__()

    def process_request(self, request, spider):
        """Fetch request.url via Ghost instead of Scrapy's downloader.

        Returning an HtmlResponse short-circuits the download chain and
        hands the rendered page straight to the spider.
        """
        # self.se.set_proxy(type_='https', host='127.0.0.1', port=1083)  # adjust type_ to the target URL scheme
        self.se.open(request.url)
        print("访问:{0}".format(request.url))
        return HtmlResponse(url=request.url, body=self.se.content,
                            encoding="utf-8", request=request)

    def __del__(self):
        # Shut the webkit process down when the middleware is collected.
        self.ghost.exit()
#-*- coding:utf-8 -*-
"""Keep-alive pinger: endlessly cycle through a fixed list of URLs in a
single webkit session, ignoring page-level failures."""
from ghost import Ghost
from ghost import Session
import time

gh = Ghost()
sessin = Session(gh)

# Bug fix: the original used a bare `except:`, which also swallows
# KeyboardInterrupt/SystemExit and makes the infinite loop unstoppable
# with Ctrl-C; catch Exception instead.  The four copy-pasted try blocks
# are folded into one loop over the URL list.
URLS = (
    "http://abcabc.gq",
    "http://abcabc.gq/test.php",
    "http://mxqabc.gq",
    "http://mxqabc.gq/test.php",
)
while True:
    for target in URLS:
        try:
            page, resource = sessin.open(target)
            sessin.wait_for_page_loaded(10000)
        except Exception:
            # Best effort: a failed page is simply skipped.
            pass
"""Add a Supreme item to the cart, open checkout, snapshot the page, and
drop into an interactive shell with the session still live."""
item_url = 'http://www.supremenewyork.com/shop/accessories/oi6nqp83m/hsyw4g52m'
checkout_url = 'https://www.supremenewyork.com/checkout'
##############################
ua = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'
header = {'User-Agent': ua}
gh = Ghost()
se = Session(gh, user_agent=ua, wait_timeout=20, wait_callback=None,
             display=True, viewport_size=(1080, 1680), download_images=True)
##############################
# Click the "add to cart" button via JS, then load the checkout page.
se.open(item_url)
se.evaluate("""document.querySelector('input[name="commit"]').click();""")
se.sleep(0.5)
se.open(checkout_url)

# Save the rendered checkout page to supreme/YYYYMMDD.html.
ISOFORMAT = '%Y%m%d'
today = datetime.today()
filename = today.strftime(ISOFORMAT)
# Bug fix: a context manager guarantees the file is closed even if
# write() raises (the original open/write/close leaked the handle
# on error).
with open('supreme' + '/' + filename + '.html', 'w') as f:
    f.write(se.content)

# Hand control to an interactive prompt with the live session in scope.
import code
code.interact(banner="", local=locals())
#coding=utf-8 from ghost import Ghost,Session import urllib ghost = Ghost() #url = "http://index.baidu.com/?tpl=trend&word=%B1%E4%D0%CE%BD%F0%B8%D5" url = "http://piaofang.maoyan.com/movie/246083?_v_=yes" ###### urllib ###### #def getHtml(url): # page = urllib.urlopen(url) # html = page.read() # return html # #html = getHtml(url) #print html #print page #print "---" * 30 #print extra_resources ###### Ghost.py ###### with ghost.start(): session = Session(ghost) session.wait_timeout = 999 page,resource = session.open(url) print session.content print page.headers, page.url, page.http_status
#-*- coding:utf-8 -*-
"""Endlessly poll a fixed set of URLs in one webkit session (best effort)."""
from ghost import Ghost
from ghost import Session
import time

gh = Ghost()
sessin = Session(gh)

# The four copy-pasted try/except blocks are collapsed into a loop.
# Bug fix: a bare `except:` also traps KeyboardInterrupt/SystemExit,
# making this infinite loop impossible to stop with Ctrl-C; it now
# catches only Exception, preserving the best-effort behaviour.
_TARGETS = [
    "http://abcabc.gq",
    "http://abcabc.gq/test.php",
    "http://mxqabc.gq",
    "http://mxqabc.gq/test.php",
]
while True:
    for _url in _TARGETS:
        try:
            page, resource = sessin.open(_url)
            sessin.wait_for_page_loaded(10000)
        except Exception:
            pass
from ghost import Ghost, Session

"""Log in to mobile Facebook with a webkit session and capture a screenshot."""

ghost = Ghost()
USERAGENT = "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:45.0) Gecko/20100101 Firefox/45.0"

with ghost.start():
    session = Session(ghost, download_images=False, display=True,
                      user_agent=USERAGENT, viewport_size=(800, 600))
    page, rs = session.open("https://m.facebook.com/login.php", timeout=120)
    # Bug fix: `assert` is stripped under `python -O`, silently skipping
    # the check; raise explicitly instead.
    if page.http_status != 200:
        raise RuntimeError("login page returned HTTP %s" % page.http_status)
    # Fill in the credentials, then click the login button and wait for
    # the resulting navigation.
    session.evaluate("""
    document.querySelector('input[name="email"]').value = '*****@*****.**'
    document.querySelector('input[name="pass"]').value = 'wikipedia150101facebook';
    """)
    session.evaluate("""document.querySelector('input[name="login"]').click();""", expect_loading=True)
    # session.save_cookies('fbookie')
    session.capture_to(path='fbookie.png')
    # gracefully clean off to avoid errors
    session.webview.setHtml('')
    session.exit()
file_name = './chap/{0}_{1}-{2}.txt'.format(series_name, lower_bound, upper_bound) content = ''.join([i if ord(i) < 128 else ' ' for i in content]) print(file_name) with open(file_name, 'wt', encoding='utf-8') as file: file.write(content) searching = True story_buffer = '' next_url = initial_url while searching: try: session.open(next_url, timeout=300) lower_bound += 1 session.wait_for_selector(next_button, 60) except: break story_data = session.evaluate( 'document.querySelector("{0}").innerText;'.format(wrapping_div)) text = story_data[0] story_buffer += str(text) if lower_bound > upper_bound: save_story(series_name, lower_bound - chunks, upper_bound, story_buffer) story_buffer = '' lower_bound = upper_bound