def web_save_transfer(self, url):
    """Promote a saved 'gray' page to the 'counterfeit' area.

    Updates the mongo bookkeeping first, then moves the on-disk
    web_info directory from the gray location to the counterfeit one.
    """
    # Record the type change in mongo before touching the filesystem.
    self.mongo_operate.transfer_web_save(
        url, source_type='gray', goal_type='counterfeit')
    path_helper = WebSavePath()
    src_path, dst_path = path_helper.get_transfer_path(
        url, 'gray', 'counterfeit')
    web_info_transfer(src_path, dst_path)
def __init__(self, mongo_operate=None):
    """Initialise the extractor with empty feature slots.

    mongo_operate: handle used later to persist title/keyword; may be
    None when only local extraction is needed.
    """
    self.mongo_operate = mongo_operate
    self.web_save_path = WebSavePath()
    # Extraction results, filled in by get_keyword().
    self.url = ''
    self.Html = ''
    self.title = ''
    self.keyword = ''
    self.div_num = 0
def page_shot(self): self.read_crawler_config() get_protected_iter = self.get_protected_iter get_gray_iter = self.get_gray_iter get_counterfeit_iter = self.get_counterfeit_iter get_monitor_iter = self.get_monitor_iter url_type = '' while 1: try: url = get_protected_iter.next() url_type = 'protected' except StopIteration: try: url = get_gray_iter.next() url_type = 'gray' except StopIteration: try: url = get_counterfeit_iter.next() url_type = 'counterfeit' except StopIteration: try: url = get_monitor_iter.next() url_type = 'monitor' except StopIteration: break print 'shot: ', url web_save_path = WebSavePath() local_html, local_time = web_save_path.get_html_path_abs( url, url_type) if local_time is None: sys.stderr.write( '%s insert_web_info, web not be saved: %s\n' % (time.ctime(), url)) continue # webpage blockpage webpage_path = local_time + '/webpage.jpeg' img_type = 'webpage' # img name : webpage.jpeg if not os.path.exists(webpage_path): main_html_path = local_time + '/main.html' if not os.path.exists(main_html_path): sys.stderr.write( '%s insert_web_info, main.html not be exist: %s\n' % (time.ctime(), url)) continue call_page_shot = CallPageShot(main_html_path, local_time, img_type) call_page_shot.start() while not os.path.exists(local_time + '/shot_over_sign'): time.sleep(0.5) os.remove(local_time + '/shot_over_sign') print 'shot over'
def page_shot(self): self.read_crawler_config() get_protected_iter = self.get_protected_iter get_gray_iter = self.get_gray_iter get_counterfeit_iter = self.get_counterfeit_iter get_monitor_iter = self.get_monitor_iter url_type = '' while 1: try: url = get_protected_iter.next() url_type = 'protected' except StopIteration: try: url = get_gray_iter.next() url_type = 'gray' except StopIteration: try: url = get_counterfeit_iter.next() url_type = 'counterfeit' except StopIteration: try: url = get_monitor_iter.next() url_type = 'monitor' except StopIteration: break print 'shot: ', url web_save_path = WebSavePath() local_html, local_time = web_save_path.get_html_path_abs( url, url_type) if local_time is None: sys.stderr.write('%s insert_web_info, web not be saved: %s\n' % (time.ctime(), url)) continue # webpage blockpage webpage_path = local_time + '/webpage.jpeg' img_type = 'webpage' # img name : webpage.jpeg if not os.path.exists(webpage_path): main_html_path = local_time + '/main.html' if not os.path.exists(main_html_path): sys.stderr.write('%s insert_web_info, main.html not be exist: %s\n' % (time.ctime(), url)) continue call_page_shot = CallPageShot( main_html_path, local_time, img_type) call_page_shot.start() while not os.path.exists(local_time + '/shot_over_sign'): time.sleep(0.5) os.remove(local_time + '/shot_over_sign') print 'shot over'
def get_web_pic(self, url, url_type='counterfeit', path_type='abs'):
    """Return (webpage, blockpage) screenshot paths for a saved URL.

    Looks up the locally saved web_info directory for `url` under
    `url_type`, renders main.html -> webpage.jpeg and
    block.html -> blockpage.jpeg when the screenshots do not exist
    yet, and returns the two image paths — absolute when path_type is
    'abs', relative when 'rel'.  Returns ('', '') when the page (or a
    required html file) was never saved.
    """
    web_save_path = WebSavePath()
    local_html, local_time = web_save_path.get_html_path_abs(
        url, url_type)
    if local_html is None or local_time is None:
        sys.stderr.write('%s get_web_pic, web not be saved: %s\n'
                         % (time.ctime(), url))
        return '', ''

    def _shot(html_name, img_type, err_fmt):
        # Render <local_time>/<html_name> into <img_type>.jpeg unless
        # the image already exists.  Returns False when the source
        # html file is missing.  (Replaces two near-identical copies
        # of this logic that also used inconsistent poll intervals.)
        img_path = local_time + '/' + img_type + '.jpeg'
        if os.path.exists(img_path):
            return True
        html_path = local_time + '/' + html_name
        if not os.path.exists(html_path):
            sys.stderr.write(err_fmt % (time.ctime(), url))
            return False
        call_page_shot = CallPageShot(html_path, local_time, img_type)
        call_page_shot.start()
        # The shot process drops a sign file when it is done.
        while not os.path.exists(local_time + '/shot_over_sign'):
            time.sleep(0.5)
        os.remove(local_time + '/shot_over_sign')
        return True

    if not _shot('main.html', 'webpage',
                 '%s get_web_pic, main.html not be exist: %s\n'):
        # Without main.html there can be no block.html either.
        return '', ''
    if not _shot('block.html', 'blockpage',
                 '%s get_web_pic, block.html not be exist: %s\n'):
        return '', ''
    if path_type == 'rel':
        local_html, local_time = web_save_path.get_html_path_rel(
            url, url_type)
    webpage_path = local_time + '/webpage.jpeg'
    blockpage_path = local_time + '/blockpage.jpeg'
    return webpage_path, blockpage_path
def view_work(self, url, web_type):
    """Compute and persist visual-block image features for a saved page."""
    self.vtree = []
    path_helper = WebSavePath()
    filename, time_path = path_helper.get_html_path_abs(url, web_type)
    # Guard clause: nothing to do when the page was never saved locally.
    if time_path is None:
        sys.stderr.write(' view_work:no this path')
        return None
    # Fetch the visual-block tree recorded for this url.
    self.vtree = self.mongo_operate.get_web_tree(url, web_type)
    if self.vtree is False or self.vtree == []:
        return None
    block_viewer = ViewPageBlock(time_path, self.vtree, self.current_path,
                                 url, self.mongo_operate, web_type)
    block_viewer.gather_vips_pic()  # gather the page image features
    block_viewer.save_feature()     # normalise features, store in mongo
def view_work(self, url, web_type):
    """Compute and persist visual-block image features for a saved page."""
    self.vtree = []
    path_helper = WebSavePath()
    filename, time_path = path_helper.get_html_path_abs(url, web_type)
    # Guard clause: nothing to do when the page was never saved locally.
    if time_path is None:
        sys.stderr.write(' view_work:no this path')
        return None
    # Fetch the visual-block tree recorded for this url.
    self.vtree = self.mongo_operate.get_web_tree(url, web_type)
    if self.vtree is False or self.vtree == []:
        return None
    block_viewer = ViewPageBlock(time_path, self.vtree, self.current_path,
                                 url, self.mongo_operate, web_type)
    block_viewer.gather_vips_pic()  # gather the page image features
    block_viewer.save_feature()     # normalise features, store in mongo
def __init__(self, task_id, get_protected_iter, get_gray_iter,
             get_counterfeit_iter, get_monitor_iter, mongo_operate,
             update_running_state, update_finish_state, mysql_handle,
             run_start_time):
    # Wire up the QWebView-based browser: store the task context,
    # connect the Qt signals, register the python<->javascript bridge
    # and kick off the first URL load.
    super(Browser, self).__init__()
    self.task_id = task_id
    self.mongo_operate = mongo_operate
    self.update_running_state = update_running_state
    self.update_finish_state = update_finish_state
    self.get_protected_iter = get_protected_iter
    self.get_gray_iter = get_gray_iter
    self.get_counterfeit_iter = get_counterfeit_iter
    self.get_monitor_iter = get_monitor_iter
    self.mysql_handle = mysql_handle
    self.run_start_time = run_start_time
    global _CURRENT_PATH
    global _LIVE_LOG_PATH
    self.current_path = _CURRENT_PATH
    self.live_log_path = _LIVE_LOG_PATH
    self.web_save_path = WebSavePath()
    self.main_page = self.page()  # QWebPage
    # Silence javascript alert() dialogs with a no-op handler.
    self.main_page.javaScriptAlert = self._alert
    self.main_frame = self.main_page.mainFrame()  # QWebFrame
    # Run load_finished once the page has finished loading.
    self.main_frame.loadFinished.connect(self.load_finished)
    self.pjs = PythonJavascript(self, mongo_operate, update_running_state,
                                mysql_handle)
    # Re-register the python object on every page (re)load:
    # javaScriptWindowObjectCleared fires before each page's JS runs,
    # so addpjs must be reconnected through this signal.
    self.main_frame.javaScriptWindowObjectCleared.connect(
        self.addpjs)
    with open(self.current_path + "/script.js", "r") as f:
        self.script = f.read()
    # Anti-hang watchdog file: load_url() rewrites it with the current
    # url / crawl count / pid on every iteration.
    self.qt_live_path = '/tmp/' + str(task_id) + '_qt_callback.txt'
    self.check_qt_alive = open(self.qt_live_path, 'w')
    self.engine_pid = os.getpid()
    self.load_url()
def __init__(self, url_type_list, mongo_operate):
    # url_type_list is (url, url_type),
    # e.g. url = "http://www.outofmemory.cn/11/12.htm"
    self.url = url_type_list[0]
    self.url_type = url_type_list[1]
    self.parts = urlparse(self.url)
    self.host = self.parts.netloc
    # u_pro: "http"  u_s1: "www.outofmemory.cn/11/12.htm"
    self.u_pro, u_s1 = splittype(self.url)
    # u_host: "www.outofmemory.cn"  u_path: "/11/12.htm"
    self.u_host, self.u_path = splithost(u_s1)
    if self.u_host is None:
        # NOTE(review): returning here leaves the instance only
        # partially initialised (attributes below are missing);
        # presumably callers must not use the object when the url has
        # no host part — confirm.
        return None
    self.u_host, port = splitport(self.u_host)
    # Scheme + host, e.g. "http://www.outofmemory.cn"
    self.u_phost = self.u_pro + '://' + self.u_host
    # self.url_md5 = self.get_folder_name_split(url)
    self.gray_create_time = str(
        time.strftime('%Y-%m-%d %H:%M', time.localtime(time.time())))
    # WebSavePath resolves the save path and the identity (md5) values.
    self.web_save_path = WebSavePath()
    self.upath, self.identity_list = self.web_save_path.get_file_save_path(
        self.url, self.url_type)
    # s_path format: domain_path/md5/timestamp
    self.s_path = self.upath + '/' + self.gray_create_time
    # extract_html pulls title/keyword and writes them to the DB.
    self.parse_web_text = extract_html(mongo_operate)
    # Sub-directory names for each resource kind.
    self.css_path = 'css'
    self.js_path = 'js'
    self.html_path = 'html'
    self.pic_path = 'pic'
    # Data root, e.g. ...web_save/web_info
    self.r_path = pjoin(_PATH, 'web_info')
    # Domain-level dir: work_path/web_info/domain_path/md5
    self.upath = pjoin(self.r_path, self.upath)
    self.headers = {}
    self.total_list = []
def __init__(self, task_id, get_protected_iter, get_gray_iter,
             get_counterfeit_iter, get_monitor_iter, mongo_operate,
             update_running_state, update_finish_state, mysql_handle,
             run_start_time):
    # Wire up the QWebView-based browser: store the task context,
    # connect the Qt signals, register the python<->javascript bridge
    # and kick off the first URL load.
    super(Browser, self).__init__()
    self.task_id = task_id
    self.mongo_operate = mongo_operate
    self.update_running_state = update_running_state
    self.update_finish_state = update_finish_state
    self.get_protected_iter = get_protected_iter
    self.get_gray_iter = get_gray_iter
    self.get_counterfeit_iter = get_counterfeit_iter
    self.get_monitor_iter = get_monitor_iter
    self.mysql_handle = mysql_handle
    self.run_start_time = run_start_time
    global _CURRENT_PATH
    global _LIVE_LOG_PATH
    self.current_path = _CURRENT_PATH
    self.live_log_path = _LIVE_LOG_PATH
    self.web_save_path = WebSavePath()
    self.main_page = self.page()  # QWebPage
    # Silence javascript alert() dialogs with a no-op handler.
    self.main_page.javaScriptAlert = self._alert
    self.main_frame = self.main_page.mainFrame()  # QWebFrame
    # Run load_finished once the page has finished loading.
    self.main_frame.loadFinished.connect(
        self.load_finished)
    self.pjs = PythonJavascript(
        self, mongo_operate, update_running_state, mysql_handle)
    # Re-register the python object on every page (re)load:
    # javaScriptWindowObjectCleared fires before each page's JS runs,
    # so addpjs must be reconnected through this signal.
    self.main_frame.javaScriptWindowObjectCleared.connect(
        self.addpjs)
    with open(self.current_path + "/script.js", "r") as f:
        self.script = f.read()
    # Anti-hang watchdog file: load_url() rewrites it with the current
    # url / crawl count / pid on every iteration.
    self.qt_live_path = '/tmp/' + str(task_id) + '_qt_callback.txt'
    self.check_qt_alive = open(self.qt_live_path, 'w')
    self.engine_pid = os.getpid()
    self.load_url()
class extract_html():
    '''Extract title, keyword text and <div>-count features from a saved
    html page and persist them to mongo.'''

    def __init__(self, mongo_operate=None):
        self.mongo_operate = mongo_operate
        self.web_save_path = WebSavePath()
        # Extraction results, filled in by get_keyword().
        self.title = ''
        self.keyword = ''
        self.Html = ''
        self.url = ''
        self.div_num = 0

    def write_mongo(self, goal_url, title, text, url_type):
        # Persist the extracted title and keyword text for the url.
        self.mongo_operate.add_web_title(goal_url, url_type, title)
        self.mongo_operate.add_web_text(goal_url, url_type, text)

    def get_html_file(self, url, url_type):
        # Load the locally saved main html for `url` into self.Html.
        # Returns False when the page was never saved.
        html_path, time_path = self.web_save_path.get_html_path_abs(
            url, url_type)
        if html_path is None or time_path is None:
            return False
        # Context manager so the handle is always closed (the previous
        # version leaked the open file object).
        with open(html_path, 'r') as f:
            self.Html = f.read()

    def get_keyword(self, Html=None):
        '''Extract the title, keyword text and <div> count from Html
        (defaults to self.Html).  Returns [div_num, keyword, title].'''
        if Html is None:
            Html = self.Html
        char = chardet.detect(Html)['encoding']
        page = etree.HTML(Html.decode(char, 'ignore'))
        title = page.xpath('/html/head/title/text()')
        # Guard: pages without a <title> used to raise IndexError here.
        self.title = title[0].strip() if title else ''
        match = re.findall(r"<div.*?>(.*?)</div>", Html)
        self.div_num = len(match)  # get the <div> number
        re_cdata = re.compile('//<!\[CDATA\[[^>]*//\]\]>', re.I)  # CDATA
        s = re_cdata.sub('', Html)
        re_br = re.compile('<br\s*?/?>')  # line breaks
        # Bug fix: substitute on `s`, not `Html`, so the CDATA removal
        # above is no longer silently discarded.
        s = re_br.sub('\n', s)
        blank_line = re.compile('\n+')  # drop redundant blank lines
        s = blank_line.sub('', s)
        re_comment = re.compile('<!--[^>]*-->')  # HTML comments
        s = re_comment.sub('', s)
        re_style = re.compile('<style\s*[^>]*>(.*?)</style\s*>')  # style
        s = re_style.sub('', s)
        re_script = re.compile('<script\s*[^>]*>(.*?)</script>')
        s = re_script.sub('', s)
        re_h = re.compile('</?[^>]*>')  # any remaining html tags
        s = re_h.sub('', s)
        s = replaceCharEntity(s)  # expand character entities
        s = s.replace(" ", "")
        cut_web_text = cut_all(s)
        self.keyword = get_keyword(cut_web_text)
        # Normalise GB2312-encoded results to utf-8 for storage.
        if chardet.detect(self.title)['encoding'] == 'GB2312':
            self.title = unicode(self.title, "gb2312").encode('utf-8')
        if chardet.detect(self.keyword)['encoding'] == 'GB2312':
            self.keyword = unicode(self.keyword, "gb2312").encode('utf-8')
        return [self.div_num, self.keyword, self.title]

    def save_info(self, url, path, url_type):
        # Write the div-count/keyword fingerprint next to the saved page
        # and mirror the title/keyword into mongo.
        with open(pjoin(path, 'id.txt'), 'wb') as f:
            # (the redundant explicit f.close() inside the with-block
            # was removed — the context manager closes the file)
            f.write(str(self.div_num) + ' ' + self.keyword)
        self.write_mongo(url, self.title, self.keyword, url_type)
class HtmlStruct():
    # Parse a saved page's html, rewrite every css/js/img/frame url to a
    # local relative path, and store the page plus its resources under
    # web_info/<domain>/<md5>/<timestamp>/.

    def __init__(self, url_type_list, mongo_operate):
        # url_type_list is (url, url_type),
        # e.g. url = "http://www.outofmemory.cn/11/12.htm"
        self.url = url_type_list[0]
        self.url_type = url_type_list[1]
        self.parts = urlparse(self.url)
        self.host = self.parts.netloc
        # u_pro: "http"  u_s1: "www.outofmemory.cn/11/12.htm"
        self.u_pro, u_s1 = splittype(self.url)
        # u_host: "www.outofmemory.cn"  u_path: "/11/12.htm"
        self.u_host, self.u_path = splithost(u_s1)
        if self.u_host is None:
            # NOTE(review): returning here leaves the instance only
            # partially initialised; presumably callers must not use
            # the object when the url has no host part — confirm.
            return None
        self.u_host, port = splitport(self.u_host)
        # Scheme + host, e.g. "http://www.outofmemory.cn"
        self.u_phost = self.u_pro + '://' + self.u_host
        # self.url_md5 = self.get_folder_name_split(url)
        self.gray_create_time = str(
            time.strftime('%Y-%m-%d %H:%M', time.localtime(time.time())))
        # WebSavePath resolves the save path and identity (md5) values.
        self.web_save_path = WebSavePath()
        self.upath, self.identity_list = self.web_save_path.get_file_save_path(
            self.url, self.url_type)
        # s_path format: domain_path/md5/timestamp
        self.s_path = self.upath + '/' + self.gray_create_time
        # extract_html pulls title/keyword and writes them to the DB.
        self.parse_web_text = extract_html(mongo_operate)
        # Sub-directory names for each resource kind.
        self.css_path = 'css'
        self.js_path = 'js'
        self.html_path = 'html'
        self.pic_path = 'pic'
        # Data root, e.g. ...web_save/web_info
        self.r_path = pjoin(_PATH, 'web_info')
        # Domain-level dir: work_path/web_info/domain_path/md5
        self.upath = pjoin(self.r_path, self.upath)
        self.headers = {}
        self.total_list = []

    def format_css_url(self, path, url):
        # Resolve a (possibly relative) url found inside a css file
        # against `path`, the css file's own location.  Mirrors the
        # cases of format_url but relative to the css path.
        url = url.replace('\\', '')
        url = url.replace('"', '')
        url = url.replace("'", '')
        url = url.strip()
        if url.find('http') != 0:
            if url.find('/') == 0 or url.find('./') == 0:
                # Root- or current-dir-relative reference.
                url = url[url.find('/') + 1:]
                if path.find('./') == 0:
                    return url
                else:
                    path = path[0:path.rfind('/')]
                    url = path + '/' + url
            elif url.find('../') == 0:
                # Parent-relative: climb one path segment per '../'.
                if url.find('www.') != -1:
                    url = url[url.find('www.'):]
                    return url
                else:
                    num = url.count('../')
                    path_list = path.split('/')
                    path_list = path_list[0:len(path_list) - num - 1]
                    if len(path_list) > 0:
                        path = '/'.join(path_list)
                        url = path + '/' + url[num * 3:]
                    else:
                        path = ''
                        url = path + url[num * 3:]
            else:
                # Bare filename: sibling of the css file.
                index = path.rfind('/')
                path = path[:index]
                url = pjoin(path, url).replace('\\', '/')
        return url

    def set_response(self, body):
        # Accept the raw downloaded page body, decode it (the page
        # encoding may not be utf-8, so it is detected and converted),
        # parse the dom and collect all referenced resource urls.
        char = chardet.detect(body)['encoding']
        self.char = char
        body = body.decode(char, 'ignore')
        self.html = body
        self.page = etree.HTML(body)
        self.get_urllist()

    def add_pic(self, css_body, path, filename):
        '''Extract image urls referenced from inside a css file, rewrite
        them to local relative paths in the css body, save the rewritten
        css, and return the download list for the images.'''
        css_url = []
        css_pic_format = []
        char_css = chardet.detect(css_body)['encoding']
        css_pic = re.findall(
            _CSSIMGURLRE,
            css_body.decode(char_css, 'ignore').encode('utf8'), re.S)
        for i in css_pic:
            # Skip empty matches and inline data: urls.
            if i and i.find('data:') != 0:
                css_pic_format.append([])
                re_url = self.format_css_url(path, i)
                css_pic_format[-1] = (self.format_url(re_url), i)
        css_url = _FC(
            (pjoin(self.r_path, self.s_path, self.css_path), self.css_path),
            css_pic_format)
        # Rewrite each original reference to its local save path.
        for url in css_url:
            css_body = css_body.replace(url[0][1], url[2].replace('\\', '/'))
        css_path = pjoin(pjoin(self.r_path, self.s_path), 'css')
        save_to_file(css_body, css_path, filename)
        return css_url

    def get_urllist(self):
        # Collect css / js / picture / sub-page urls from the parsed dom
        # and normalise them into (absolute_url, original_text) pairs.
        css_list = self.page.xpath('//*/link[@rel="stylesheet"]/@href')
        js_list = self.page.xpath('//*/script/@src')
        pic_list = self.page.xpath('//*/link[@rel="shortcut icon"]/@href')
        pic_list += self.page.xpath('//*/link[@rel="Shortcut Icon"]/@href')
        pic_list += self.page.xpath('//*/link[@rel="icon"]/@href')
        pic_list += re.findall(_CSSIMGURLRE, self.html, re.S)
        pic_list += self.page.xpath('//img/@original')
        pic_list += self.page.xpath('//img/@src')
        pic_list += self.page.xpath('//div/@src')
        pic_list += self.page.xpath('//input/@src')
        pic_list += self.page.xpath('//*/@background')
        html_list = self.page.xpath('//iframe/@src')
        html_list += self.page.xpath('//frame/@src')
        # De-duplicate each resource list.
        html_list = list(set(html_list))
        pic_list = list(set(pic_list))
        css_list = list(set(css_list))
        js_list = list(set(js_list))
        # Pair each absolute url with its original text; inline data:
        # urls and empty entries are dropped.
        fc = lambda url_list: [(self.format_url(url).replace(
            '\\', '/'), url) for url in url_list
            if url and url.find('data:') != 0]
        self.pic_list = fc(pic_list)
        self.css_list = fc(css_list)
        self.html_list = fc(html_list)
        self.css_list = _FC(
            (pjoin(self.r_path, self.s_path, self.css_path), self.css_path),
            self.css_list)
        self.js_list = fc(js_list)

    def format_url(self, url):
        # Normalise a raw reference from the page into an absolute url
        # based on this page's scheme, host and path.
        url = url.replace('\\', '')
        url = url.replace("'", '')
        url = url.replace('"', '')
        url = url.strip()
        if url.find('//') == 0:
            # Protocol-relative url: prepend this page's scheme.
            return self.u_pro + ':' + url
        if url.find('www.') == 0:
            return self.u_pro + '://' + url
        if url.find('http') != 0:
            if url.find('./') == 0:
                r_url = url[url.find('/') + 1:]
                path = os.path.join(
                    self.u_path[0:self.u_path.rfind('/')],
                    r_url).replace('\\', '/')
            elif url.find('../') == 0:
                # NOTE(review): only one '../' level is collapsed here,
                # unlike format_css_url which counts them — confirm
                # whether deeper relative paths matter for page urls.
                u_path = '/'.join(self.u_path.split('/')[:-2])
                r_url = url[url.find('/') + 1:]
                path = os.path.join(u_path, r_url).replace('\\', '/')
            elif url.find('/') == 0:
                path = url
            else:
                path = os.path.join(
                    self.u_path[0:self.u_path.rfind('/') + 1],
                    url).replace('\\', '/')
            if path:
                if path[0] == '.':
                    path = os.path.join(self.url, path[path.find('/'):])
                if path[0] != '/':
                    return self.u_phost + '/' + path
                return self.u_phost + path
        return url

    def store(self):
        # Rewrite every resource reference in the html to its local
        # save path, then write main.html (plus the original url) to
        # disk unless an identical page was already saved.
        # Returns (saved_flag, url, url_type) with saved_flag 0 when the
        # page matched a previously saved identity, 1 when written.
        dl_list = []
        dl_list += _FC((pjoin(self.r_path, self.s_path, self.pic_path),
                        self.pic_path), self.pic_list)
        dl_list += _FC((pjoin(self.r_path, self.s_path, self.js_path),
                        self.js_path), self.js_list)
        dl_list += _FC((pjoin(self.r_path, self.s_path, self.html_path),
                        self.html_path), self.html_list)
        self.total_list += dl_list
        dl_list += self.css_list
        self.dl_list = dl_list
        for dl in dl_list:
            if dl[0][1] != ' ':
                self.html = self.html.replace(
                    dl[0][1], pjoin(dl[1][1], dl[2]).replace('\\', '/'))
        # Neutralise <base href> so rewritten relative paths resolve
        # against the saved location instead of the original site.
        self.html = self.html.replace("base href", "cc")
        self.html = self.html.encode('utf-8', 'ignore')
        # s_path format: current_work_dir/web/domain/timestamp/
        s_path = pjoin(self.r_path, self.s_path)
        now_web_info = self.parse_web_text.get_keyword(self.html)
        print 'now_web_info', now_web_info
        if self.identity_list is not None:
            # Same div-count or same keyword as the saved identity:
            # treat the page as unchanged and skip the save.
            if self.identity_list[0] == now_web_info[0] or \
                    self.identity_list[1] == now_web_info[1]:
                return 0, self.url, self.url_type
        save_to_file(self.html, s_path, 'main.html')
        with open(pjoin(s_path, 'url'), 'wb') as f:
            f.write(self.url)
        # Extract title and keywords from the page and store in mongo.
        self.parse_web_text.save_info(self.url, s_path, self.url_type)
        return 1, self.url, self.url_type
class Browser(QWebView):
    '''
    simulation browser, render URL, parse it dom tree and text and webpage
    '''

    def __init__(self, task_id, get_protected_iter, get_gray_iter,
                 get_counterfeit_iter, get_monitor_iter, mongo_operate,
                 update_running_state, update_finish_state, mysql_handle,
                 run_start_time):
        # Wire up the browser: store task context, connect Qt signals,
        # register the python<->javascript bridge, start the first load.
        super(Browser, self).__init__()
        self.task_id = task_id
        self.mongo_operate = mongo_operate
        self.update_running_state = update_running_state
        self.update_finish_state = update_finish_state
        self.get_protected_iter = get_protected_iter
        self.get_gray_iter = get_gray_iter
        self.get_counterfeit_iter = get_counterfeit_iter
        self.get_monitor_iter = get_monitor_iter
        self.mysql_handle = mysql_handle
        self.run_start_time = run_start_time
        global _CURRENT_PATH
        global _LIVE_LOG_PATH
        self.current_path = _CURRENT_PATH
        self.live_log_path = _LIVE_LOG_PATH
        self.web_save_path = WebSavePath()
        self.main_page = self.page()  # QWebPage
        # Silence javascript alert() dialogs with a no-op handler.
        self.main_page.javaScriptAlert = self._alert
        self.main_frame = self.main_page.mainFrame()  # QWebFrame
        # Run load_finished once the page has finished loading.
        self.main_frame.loadFinished.connect(
            self.load_finished)
        self.pjs = PythonJavascript(
            self, mongo_operate, update_running_state, mysql_handle)
        # Re-register the python object on every page (re)load:
        # javaScriptWindowObjectCleared fires before each page's JS runs.
        self.main_frame.javaScriptWindowObjectCleared.connect(
            self.addpjs)
        with open(self.current_path + "/script.js", "r") as f:
            self.script = f.read()
        # Anti-hang watchdog file, rewritten each iteration by load_url().
        self.qt_live_path = '/tmp/' + str(task_id) + '_qt_callback.txt'
        self.check_qt_alive = open(self.qt_live_path, 'w')
        self.engine_pid = os.getpid()
        self.load_url()

    def _alert(self, frame, message):
        # No-op javascript alert handler so pages cannot block the run.
        pass

    def over_handle(self):
        ''' run over, write to log in tmp '''
        with open(self.live_log_path + '/' + str(os.getpid()) + '.txt',
                  'a+') as f:
            f.write('over')
        sys.stdout.write('parse over!\n')

    def load_url(self):
        ''' parse once url, first load it '''
        # Pull the next url from the four iterators in priority order;
        # when all are exhausted, finish up and terminate the process.
        global _CURRENT_URL_TYPE
        try:
            url = self.get_protected_iter.next()
            _CURRENT_URL_TYPE = 'protected'
        except StopIteration:
            try:
                url = self.get_gray_iter.next()
                _CURRENT_URL_TYPE = 'gray'
            except StopIteration:
                try:
                    url = self.get_counterfeit_iter.next()
                    _CURRENT_URL_TYPE = 'counterfeit'
                except StopIteration:
                    try:
                        url = self.get_monitor_iter.next()
                        _CURRENT_URL_TYPE = 'monitor'
                    except StopIteration:
                        self.over_handle()
                        global _CRAWLER_NUM
                        run_time = int(time.time() - self.run_start_time)
                        self.update_finish_state(_CRAWLER_NUM, run_time)
                        os.remove(self.qt_live_path)
                        # Hard exit: the Qt event loop must not continue.
                        os._exit(0)
        # Write the live running state into the anti-hang watchdog file
        # (truncate first so it holds only the latest entry).
        self.check_qt_alive.seek(0)
        self.check_qt_alive.truncate(0)
        self.check_qt_alive.write(url + ' ' + str(_CRAWLER_NUM) + ' ' +
                                  str(self.engine_pid))
        self.check_qt_alive.flush()
        local_html, local_time = self.web_save_path.get_html_path_abs(
            url, _CURRENT_URL_TYPE)
        global _LOCAL_TIME
        _LOCAL_TIME = local_time
        if local_html is None or local_time is None:
            sys.stdout.write(
                'url not be saved: %s, task_id: %d\n' % (url, self.task_id))
            # Skip unsaved urls by recursing to the next one.
            self.load_url()
        else:
            global _CURRENT_URL
            global _CURRENT_TIME_PATH
            _CURRENT_URL = url
            _CURRENT_TIME_PATH = local_time
            # print 'load:', _CURRENT_URL
            self.load(QUrl(local_html))

    def border_webpage(self):
        '''
        get web page border and webpage
        ** this method inaccurate, page will change longer **
        ** give up use **
        '''
        size = self.main_page.mainFrame().contentsSize()
        global _CURRENT_URL
        global _CURRENT_URL_TYPE
        nwe_border_list = [size.width(), size.height()]
        self.mongo_operate.add_web_border(
            _CURRENT_URL, _CURRENT_URL_TYPE, nwe_border_list)
        # Widen the viewport slightly (scrollbar allowance) and render
        # the full frame into an image.
        self.main_page.setViewportSize(
            QSize(size.width() + 16, size.height()))
        img = QImage(size, QImage.Format_ARGB32)
        painter = QPainter(img)
        self.main_page.mainFrame().render(painter)
        painter.end()
        global _CURRENT_TIME_PATH
        img_path = _CURRENT_TIME_PATH + '/webpage.jpeg'
        if not img.save(img_path):
            sys.stderr.write('%s error to save img: %s, path: %s\n'
                             % (time.ctime(), _CURRENT_URL,
                                _CURRENT_TIME_PATH))

    def load_finished(self, finished):
        ''' load_url over, save it text and ... '''
        # Hand the rendered page text to the js bridge, then run the
        # injected extraction script inside the page.
        self.pjs.set_vtree_text(str(self.main_frame.toPlainText()))
        self.main_frame.evaluateJavaScript(self.script)

    def addpjs(self):
        # Register the python bridge object with the page's javascript.
        self.main_frame.addToJavaScriptWindowObject(
            "python", self.pjs)
class Browser(QWebView):
    '''
    simulation browser, render URL, parse it dom tree and text and webpage
    '''

    def __init__(self, task_id, get_protected_iter, get_gray_iter,
                 get_counterfeit_iter, get_monitor_iter, mongo_operate,
                 update_running_state, update_finish_state, mysql_handle,
                 run_start_time):
        # Wire up the browser: store task context, connect Qt signals,
        # register the python<->javascript bridge, start the first load.
        super(Browser, self).__init__()
        self.task_id = task_id
        self.mongo_operate = mongo_operate
        self.update_running_state = update_running_state
        self.update_finish_state = update_finish_state
        self.get_protected_iter = get_protected_iter
        self.get_gray_iter = get_gray_iter
        self.get_counterfeit_iter = get_counterfeit_iter
        self.get_monitor_iter = get_monitor_iter
        self.mysql_handle = mysql_handle
        self.run_start_time = run_start_time
        global _CURRENT_PATH
        global _LIVE_LOG_PATH
        self.current_path = _CURRENT_PATH
        self.live_log_path = _LIVE_LOG_PATH
        self.web_save_path = WebSavePath()
        self.main_page = self.page()  # QWebPage
        # Silence javascript alert() dialogs with a no-op handler.
        self.main_page.javaScriptAlert = self._alert
        self.main_frame = self.main_page.mainFrame()  # QWebFrame
        # Run load_finished once the page has finished loading.
        self.main_frame.loadFinished.connect(self.load_finished)
        self.pjs = PythonJavascript(self, mongo_operate,
                                    update_running_state, mysql_handle)
        # Re-register the python object on every page (re)load:
        # javaScriptWindowObjectCleared fires before each page's JS runs.
        self.main_frame.javaScriptWindowObjectCleared.connect(
            self.addpjs)
        with open(self.current_path + "/script.js", "r") as f:
            self.script = f.read()
        # Anti-hang watchdog file, rewritten each iteration by load_url().
        self.qt_live_path = '/tmp/' + str(task_id) + '_qt_callback.txt'
        self.check_qt_alive = open(self.qt_live_path, 'w')
        self.engine_pid = os.getpid()
        self.load_url()

    def _alert(self, frame, message):
        # No-op javascript alert handler so pages cannot block the run.
        pass

    def over_handle(self):
        ''' run over, write to log in tmp '''
        with open(self.live_log_path + '/' + str(os.getpid()) + '.txt',
                  'a+') as f:
            f.write('over')
        sys.stdout.write('parse over!\n')

    def load_url(self):
        ''' parse once url, first load it '''
        # Pull the next url from the four iterators in priority order;
        # when all are exhausted, finish up and terminate the process.
        global _CURRENT_URL_TYPE
        try:
            url = self.get_protected_iter.next()
            _CURRENT_URL_TYPE = 'protected'
        except StopIteration:
            try:
                url = self.get_gray_iter.next()
                _CURRENT_URL_TYPE = 'gray'
            except StopIteration:
                try:
                    url = self.get_counterfeit_iter.next()
                    _CURRENT_URL_TYPE = 'counterfeit'
                except StopIteration:
                    try:
                        url = self.get_monitor_iter.next()
                        _CURRENT_URL_TYPE = 'monitor'
                    except StopIteration:
                        self.over_handle()
                        global _CRAWLER_NUM
                        run_time = int(time.time() - self.run_start_time)
                        self.update_finish_state(_CRAWLER_NUM, run_time)
                        os.remove(self.qt_live_path)
                        # Hard exit: the Qt event loop must not continue.
                        os._exit(0)
        # Write the live running state into the anti-hang watchdog file
        # (truncate first so it holds only the latest entry).
        self.check_qt_alive.seek(0)
        self.check_qt_alive.truncate(0)
        self.check_qt_alive.write(url + ' ' + str(_CRAWLER_NUM) + ' ' +
                                  str(self.engine_pid))
        self.check_qt_alive.flush()
        local_html, local_time = self.web_save_path.get_html_path_abs(
            url, _CURRENT_URL_TYPE)
        global _LOCAL_TIME
        _LOCAL_TIME = local_time
        if local_html is None or local_time is None:
            sys.stdout.write('url not be saved: %s, task_id: %d\n'
                             % (url, self.task_id))
            # Skip unsaved urls by recursing to the next one.
            self.load_url()
        else:
            global _CURRENT_URL
            global _CURRENT_TIME_PATH
            _CURRENT_URL = url
            _CURRENT_TIME_PATH = local_time
            # print 'load:', _CURRENT_URL
            self.load(QUrl(local_html))

    def border_webpage(self):
        '''
        get web page border and webpage
        ** this method inaccurate, page will change longer **
        ** give up use **
        '''
        size = self.main_page.mainFrame().contentsSize()
        global _CURRENT_URL
        global _CURRENT_URL_TYPE
        nwe_border_list = [size.width(), size.height()]
        self.mongo_operate.add_web_border(_CURRENT_URL, _CURRENT_URL_TYPE,
                                          nwe_border_list)
        # Widen the viewport slightly (scrollbar allowance) and render
        # the full frame into an image.
        self.main_page.setViewportSize(QSize(size.width() + 16,
                                             size.height()))
        img = QImage(size, QImage.Format_ARGB32)
        painter = QPainter(img)
        self.main_page.mainFrame().render(painter)
        painter.end()
        global _CURRENT_TIME_PATH
        img_path = _CURRENT_TIME_PATH + '/webpage.jpeg'
        if not img.save(img_path):
            sys.stderr.write('%s error to save img: %s, path: %s\n'
                             % (time.ctime(), _CURRENT_URL,
                                _CURRENT_TIME_PATH))

    def load_finished(self, finished):
        ''' load_url over, save it text and ... '''
        # Hand the rendered page text to the js bridge, then run the
        # injected extraction script inside the page.
        self.pjs.set_vtree_text(str(self.main_frame.toPlainText()))
        self.main_frame.evaluateJavaScript(self.script)

    def addpjs(self):
        # Register the python bridge object with the page's javascript.
        self.main_frame.addToJavaScriptWindowObject(
            "python", self.pjs)