def process_request(self, request, spider):
    """Load ``request.url`` in a WebKit view and return the rendered page.

    The view comes from ``self._get_webview()``. After the load is started
    and the signal handlers are attached, ``gtk.main()`` is entered, so this
    call blocks until the GTK main loop exits (presumably one of the
    connected handlers quits it -- verify against ``self.load_finished``).

    Args:
        request: object whose ``url`` attribute is loaded.
        spider: not used by this method.

    Returns:
        An ``HtmlResponse`` whose URL is the page's ``window.location.href``
        and whose body is ``document.documentElement.innerHTML`` encoded as
        UTF-8.
    """
    view = self._get_webview()
    view.load_uri(request.url)
    handlers = (
        ('load-finished', self.load_finished),
        ('document-load-finished', self.document_load_finished),
        ('console-message', self.console_message),
    )
    for signal_name, handler in handlers:
        view.connect(signal_name, handler)

    # Blocks here until the GTK main loop is stopped.
    gtk.main()

    main_frame = view.get_main_frame()
    script_context = jswebkit.JSContext(main_frame.get_global_context())
    final_url = script_context.EvaluateScript('window.location.href')
    markup = script_context.EvaluateScript('document.documentElement.innerHTML')
    return HtmlResponse(final_url, encoding='utf-8', body=markup.encode('utf-8'))
def show_result(view, frame):
    """Print the title, URI and rendered HTML of *frame*.

    Args:
        view: not used by this function.
        frame: frame object providing ``get_title()``, ``get_uri()`` and
            ``get_global_context()``.
    """
    print(frame.get_title())
    print(frame.get_uri())
    script_context = jswebkit.JSContext(frame.get_global_context())
    markup = script_context.EvaluateScript('document.documentElement.innerHTML')
    print(str(markup))
def _load_finished(self, deferred, view, frame):
    """Fire *deferred* with an ``HtmlResponse`` built from the main frame.

    Notifications for any frame other than ``view.get_main_frame()`` are
    ignored.

    Args:
        deferred: object whose ``callback`` is invoked with the response.
        view: view used to identify the main frame.
        frame: the frame that finished loading.
    """
    main_frame = view.get_main_frame()
    if frame != main_frame:
        return

    script_context = jswebkit.JSContext(frame.get_global_context())
    page_url = script_context.EvaluateScript('window.location.href')
    markup = script_context.EvaluateScript('document.documentElement.innerHTML')
    deferred.callback(
        HtmlResponse(page_url, encoding='utf-8', body=markup.encode('utf-8'))
    )
def get(url):
    """Render *url* in a WebKit view, print the body markup and save it.

    A fresh ``webkit.WebView`` loads the URL; the ``load-finished`` handler
    calls ``gtk.main_quit()``, which ends the ``gtk.main()`` call below.
    The markup read from ``document.body.innerHTML`` is then printed and
    written to a file named ``temp`` in the current working directory.

    Args:
        url: address to load.
    """
    webview = webkit.WebView()
    webview.connect('load-finished', lambda v, f: gtk.main_quit())
    webview.load_uri(url)
    gtk.main()

    js = jswebkit.JSContext(webview.get_main_frame().get_global_context())
    markup = js.EvaluateScript('document.body.innerHTML')
    # Encode explicitly: under Python 2, str() on a unicode value uses the
    # ASCII codec and raises UnicodeEncodeError for non-ASCII pages.
    # NOTE(review): assumes EvaluateScript returns unicode text -- confirm.
    renderedBody = markup.encode('utf-8')
    print(renderedBody)
    with open("temp", "wb") as f:
        f.write(renderedBody)
def load_finished(view, frame):
    """Print the page URL and its submit inputs once the main frame loads.

    Does nothing for frames other than ``view.get_main_frame()``.

    Args:
        view: view used to identify the main frame.
        frame: the frame that finished loading.
    """
    main_frame = view.get_main_frame()
    if frame != main_frame:
        return

    script_context = jswebkit.JSContext(frame.get_global_context())
    print(script_context.EvaluateScript('window.location.href'))

    body_markup = script_context.EvaluateScript('document.body.innerHTML')
    parsed = lxml.html.fromstring(body_markup)
    print(parsed.xpath('//input[@type="submit"]'))
def load_finished(self, *args, **kw):
    """Store the rendered body markup and signal ``self.pending``.

    Reads ``document.body.innerHTML`` from the main frame of
    ``self.webview``, stores its ``str()`` form in ``self.rendered_html``
    and then calls ``self.pending.set()``. Any ``Exception`` raised along
    the way is printed and not re-raised; in that case ``pending.set()`` is
    not reached.

    Args:
        *args: ignored.
        **kw: ignored.
    """
    try:
        print('Render.load_finished')
        main_frame = self.webview.get_main_frame()
        script_context = jswebkit.JSContext(main_frame.get_global_context())
        body_markup = script_context.EvaluateScript('document.body.innerHTML')
        self.rendered_html = str(body_markup)
        self.pending.set()
    except Exception as e:
        print(e)
def process_request(self, request, spider):
    """Render a non-form request in WebKit and return the resulting HTML.

    A fresh ``webkit.WebView`` loads ``request.url``; the ``load-finished``
    handler calls ``gtk.main_quit()``, which ends the ``gtk.main()`` call
    below. The page's ``document.documentElement.innerHTML`` becomes the
    response body.

    Args:
        request: the request to render; its ``url`` is loaded.
        spider: not used by this method.

    Returns:
        An ``HtmlResponse`` for ``request.url`` with a UTF-8 encoded body,
        or ``None`` when *request* is exactly a ``FormRequest``.
    """
    # Exact type comparison is kept from the original: subclasses of
    # FormRequest are still rendered.
    if type(request) is FormRequest:
        return None

    webview = webkit.WebView()
    webview.connect('load-finished', lambda v, f: gtk.main_quit())
    webview.load_uri(request.url)
    gtk.main()

    js = jswebkit.JSContext(webview.get_main_frame().get_global_context())
    html = js.EvaluateScript('document.documentElement.innerHTML')
    # Encode explicitly and declare the encoding: under Python 2, str() on
    # a unicode value uses the ASCII codec and raises UnicodeEncodeError
    # for pages with non-ASCII content.
    # NOTE(review): assumes EvaluateScript returns unicode text -- confirm.
    return HtmlResponse(request.url, encoding='utf-8',
                        body=html.encode('utf-8'))
def __init__(self, widget, uri):
    """Initialise the web view, open *uri* and create a JavaScript context.

    Args:
        widget: stored unchanged as ``self._widget_window``.
        uri: address passed to ``self.open``.
    """
    webkit.WebView.__init__(self)
    self._widget_window = widget

    web_settings = self.get_settings()
    web_settings.set_property("enable-developer-extras", True)

    self.set_transparent(True)
    self.connect("script-prompt", self._script_callback)
    self.open(uri)

    main_frame = self.get_main_frame()
    self._ctx = jswebkit.JSContext(main_frame.get_global_context())
def load_finished(self, view, frame):
    """Build an ``HtmlResponse`` from the main frame and hand it to ``self.d``.

    The context is always taken from ``view.get_main_frame()``; the *frame*
    argument is not consulted. After the response is built, ``"finished"``
    is printed, ``self.stop_gtk()`` is called and ``self.d.callback`` is
    invoked with the response, in that order.

    Args:
        view: view whose main frame is read.
        frame: not used by this method.
    """
    main_frame = view.get_main_frame()
    script_context = jswebkit.JSContext(main_frame.get_global_context())
    page_url = script_context.EvaluateScript('window.location.href')
    markup = script_context.EvaluateScript('document.documentElement.innerHTML')
    response = HtmlResponse(
        page_url,
        encoding='utf-8',
        body=markup.encode('utf-8'),
    )
    print("finished")
    self.stop_gtk()
    self.d.callback(response)
def process_request(self, request, spider):
    """Render requests flagged with ``renderjs`` in WebKit.

    Requests without a ``'renderjs'`` key in ``request.meta`` are left
    alone and ``None`` is returned. Otherwise the URL is loaded in the view
    from ``self._get_webview()`` with ``self.stop_gtk`` connected to
    ``load-finished``, and ``gtk.main()`` blocks until the main loop exits.

    Args:
        request: the request to inspect; ``meta`` and ``url`` are read.
        spider: not used by this method.

    Returns:
        An ``HtmlResponse`` whose URL is the page's ``window.location.href``
        and whose body is the UTF-8 encoded
        ``document.documentElement.innerHTML``, or ``None``.
    """
    if 'renderjs' not in request.meta:
        return None

    view = self._get_webview()
    view.connect('load-finished', self.stop_gtk)
    view.load_uri(request.url)
    gtk.main()

    main_frame = view.get_main_frame()
    script_context = jswebkit.JSContext(main_frame.get_global_context())
    final_url = script_context.EvaluateScript('window.location.href')
    markup = script_context.EvaluateScript('document.documentElement.innerHTML')
    return HtmlResponse(final_url, encoding='utf-8', body=markup.encode('utf-8'))
def process_request(self, request, spider):
    """Render a request in WebKit unless it opts out or is a form request.

    A request is skipped (``None`` is returned) when ``request.meta``
    contains the key ``'no_webkit'`` or when *request* is exactly a
    ``FormRequest``. Otherwise a fresh ``webkit.WebView`` loads
    ``request.url``; the ``load-finished`` handler calls
    ``gtk.main_quit()``, which ends the ``gtk.main()`` call below.

    Args:
        request: the request to inspect; ``meta`` and ``url`` are read.
        spider: not used by this method.

    Returns:
        An ``HtmlResponse`` for ``request.url`` whose body is the UTF-8
        encoded ``document.documentElement.innerHTML``, or ``None``.
    """
    # `in` replaces the deprecated dict.has_key(); the exact type comparison
    # is kept from the original, so FormRequest subclasses are still rendered.
    if 'no_webkit' in request.meta or type(request) is FormRequest:
        return None

    webview = webkit.WebView()
    webview.connect('load-finished', lambda v, f: gtk.main_quit())
    webview.load_uri(request.url)
    gtk.main()

    js = jswebkit.JSContext(webview.get_main_frame().get_global_context())
    html = js.EvaluateScript('document.documentElement.innerHTML')
    # Encode explicitly and declare the encoding: under Python 2, str() on
    # a unicode value uses the ASCII codec and raises UnicodeEncodeError
    # for pages with non-ASCII content.
    # NOTE(review): assumes EvaluateScript returns unicode text -- confirm.
    return HtmlResponse(request.url, encoding='utf-8',
                        body=html.encode('utf-8'))
def load_finished_cb(self, view, frame):
    """Print the property names and ``href`` of every ``<a>`` element.

    The JavaScript context is taken from ``self.get_main_frame()``; neither
    *view* nor *frame* is consulted.

    Args:
        view: not used by this method.
        frame: not used by this method.
    """
    print("load_finished")
    main_frame = self.get_main_frame()
    script_context = jswebkit.JSContext(main_frame.get_global_context())

    # The result is not used below; the evaluation is kept so the same
    # script calls are made as before.
    window_object = script_context.EvaluateScript("window")

    page_document = script_context.EvaluateScript("document")
    # The document is passed as the first argument as well as being the
    # receiver, exactly as in the original call.
    anchors = page_document.getElementsByTagName(page_document, "a")
    print(anchors.getPropertyNames())
    for anchor in anchors:
        print(anchor.href)
def _doc_load_finished(self, view, frame):
    """Collect every ``<a>`` href into ``self.links`` and schedule completion.

    After the hrefs are appended, a ``threading.Timer`` is started that
    calls ``self._webview_done`` after 2 seconds.

    Args:
        view: not used by this method.
        frame: frame whose global context is used to reach the document.
    """
    script_context = jswebkit.JSContext(frame.get_global_context())
    page_document = script_context.EvaluateScript("document")

    # The document is passed as the first argument as well as being the
    # receiver, exactly as in the original call.
    for anchor in page_document.getElementsByTagName(page_document, "a"):
        self.links.append(anchor.href)

    done_timer = threading.Timer(2, self._webview_done)
    done_timer.start()
def process_request(self, request, spider):
    """Render a request in WebKit for spiders enabled in the settings.

    Rendering happens only when ``spider.name`` is in
    ``settings.WEBKIT_DOWNLOADER`` and *request* is not exactly a
    ``FormRequest``; otherwise ``None`` is returned. Progress markers, the
    request URL and the rendered body are printed along the way.

    Args:
        request: the request to render; its ``url`` is loaded.
        spider: its ``name`` selects whether rendering is enabled.

    Returns:
        An ``HtmlResponse`` for ``request.url`` whose body is the UTF-8
        encoded ``document.body.innerHTML``, or ``None``.
    """
    print('1111111')
    print(spider.name)
    if spider.name not in settings.WEBKIT_DOWNLOADER:
        return None
    print('2222')
    # Exact type comparison is kept from the original: subclasses of
    # FormRequest are still rendered.
    if type(request) is FormRequest:
        return None
    print('333333')

    webview = webkit.WebView()
    print(request.url)
    webview.connect('load-finished', lambda v, f: gtk.main_quit())
    webview.load_uri(request.url)
    gtk.main()

    js = jswebkit.JSContext(webview.get_main_frame().get_global_context())
    html = js.EvaluateScript('document.body.innerHTML')
    # Encode explicitly and declare the encoding: under Python 2, str() on
    # a unicode value uses the ASCII codec and raises UnicodeEncodeError
    # for pages with non-ASCII content.
    # NOTE(review): assumes EvaluateScript returns unicode text -- confirm.
    renderedBody = html.encode('utf-8')
    print(renderedBody)
    return HtmlResponse(request.url, encoding='utf-8', body=renderedBody)
def get_html(self):
    """Return ``document.body.innerHTML`` of the web view's main frame.

    Returns:
        Whatever ``EvaluateScript`` yields for ``document.body.innerHTML``
        in the main frame of ``self.webview``.
    """
    main_frame = self.webview.get_main_frame()
    script_context = jswebkit.JSContext(main_frame.get_global_context())
    return script_context.EvaluateScript("document.body.innerHTML")