示例#1
0
 def web_save_transfer(self, url):
     """Reclassify *url* from the 'gray' category to 'counterfeit'.

     The mongo record is updated first, then the saved page files are
     moved from the gray location to the counterfeit location on disk.
     """
     # Flip the record's category in mongo before touching the files.
     self.mongo_operate.transfer_web_save(url,
                                          source_type='gray',
                                          goal_type='counterfeit')
     # Resolve where the saved pages live now and where they should go.
     path_helper = WebSavePath()
     src_path, dst_path = path_helper.get_transfer_path(url, 'gray',
                                                        'counterfeit')
     web_info_transfer(src_path, dst_path)
示例#2
0
文件: getkeyword.py 项目: wyl-hit/job
 def __init__(self, mongo_operate=None):
     """Set up an empty extractor bound to *mongo_operate* (may be None)."""
     # External collaborators.
     self.mongo_operate = mongo_operate
     self.web_save_path = WebSavePath()
     # Per-page state, filled in by later extraction calls.
     self.url = ''
     self.title = ''
     self.keyword = ''
     self.Html = ''
     self.div_num = 0
示例#3
0
    def page_shot(self):
        """Screenshot every saved page that does not yet have a webpage.jpeg.

        Drains the protected, gray, counterfeit and monitor URL iterators in
        priority order; for each url whose saved directory lacks a
        webpage.jpeg, renders main.html via CallPageShot and waits for the
        shooter's completion marker file before moving on.
        """
        self.read_crawler_config()
        # Bind the shared iterator attributes to locals for the loop below.
        get_protected_iter = self.get_protected_iter
        get_gray_iter = self.get_gray_iter
        get_counterfeit_iter = self.get_counterfeit_iter
        get_monitor_iter = self.get_monitor_iter
        url_type = ''
        while 1:
            # Try each source in order; StopIteration cascades to the next
            # source, and the outer loop ends only when all four are empty.
            try:
                url = get_protected_iter.next()
                url_type = 'protected'
            except StopIteration:
                try:
                    url = get_gray_iter.next()
                    url_type = 'gray'
                except StopIteration:
                    try:
                        url = get_counterfeit_iter.next()
                        url_type = 'counterfeit'
                    except StopIteration:
                        try:
                            url = get_monitor_iter.next()
                            url_type = 'monitor'
                        except StopIteration:
                            break
            print 'shot: ', url

            web_save_path = WebSavePath()
            local_html, local_time = web_save_path.get_html_path_abs(
                url, url_type)
            if local_time is None:
                # The page was never saved locally; log and move on.
                sys.stderr.write(
                    '%s  insert_web_info, web not be saved: %s\n' %
                    (time.ctime(), url))
                continue
            # webpage blockpage
            webpage_path = local_time + '/webpage.jpeg'
            img_type = 'webpage'  # img name : webpage.jpeg
            if not os.path.exists(webpage_path):
                main_html_path = local_time + '/main.html'
                if not os.path.exists(main_html_path):
                    sys.stderr.write(
                        '%s  insert_web_info, main.html not be exist: %s\n' %
                        (time.ctime(), url))
                    continue
                call_page_shot = CallPageShot(main_html_path, local_time,
                                              img_type)
                call_page_shot.start()
                # Poll for the marker file CallPageShot drops when finished.
                while not os.path.exists(local_time + '/shot_over_sign'):
                    time.sleep(0.5)
                os.remove(local_time + '/shot_over_sign')
        print 'shot over'
示例#4
0
    def page_shot(self):
        self.read_crawler_config()
        get_protected_iter = self.get_protected_iter
        get_gray_iter = self.get_gray_iter
        get_counterfeit_iter = self.get_counterfeit_iter
        get_monitor_iter = self.get_monitor_iter
        url_type = ''
        while 1:
            try:
                url = get_protected_iter.next()
                url_type = 'protected'
            except StopIteration:
                try:
                    url = get_gray_iter.next()
                    url_type = 'gray'
                except StopIteration:
                    try:
                        url = get_counterfeit_iter.next()
                        url_type = 'counterfeit'
                    except StopIteration:
                        try:
                            url = get_monitor_iter.next()
                            url_type = 'monitor'
                        except StopIteration:
                            break
            print 'shot: ', url

            web_save_path = WebSavePath()
            local_html, local_time = web_save_path.get_html_path_abs(
                url, url_type)
            if local_time is None:
                sys.stderr.write('%s  insert_web_info, web not be saved: %s\n' %
                                 (time.ctime(), url))
                continue
            # webpage blockpage
            webpage_path = local_time + '/webpage.jpeg'
            img_type = 'webpage'  # img name : webpage.jpeg
            if not os.path.exists(webpage_path):
                main_html_path = local_time + '/main.html'
                if not os.path.exists(main_html_path):
                    sys.stderr.write('%s  insert_web_info, main.html not be exist: %s\n' %
                                     (time.ctime(), url))
                    continue
                call_page_shot = CallPageShot(
                    main_html_path, local_time, img_type)
                call_page_shot.start()
                while not os.path.exists(local_time + '/shot_over_sign'):
                    time.sleep(0.5)
                os.remove(local_time + '/shot_over_sign')
        print 'shot over'
示例#5
0
 def get_web_pic(self, url, url_type='counterfeit', path_type='abs'):
     '''
     Look up the page files saved for *url* under the local web_info tree
     (selected by *url_type*), screenshot main.html and block.html when
     their jpegs do not exist yet, and save the images into the same
     directory.

     Returns (webpage_path, blockpage_path): absolute paths when
     *path_type* is 'abs', relative paths when it is 'rel', and ('', '')
     when the page or one of its html files was never saved.
     '''
     web_save_path = WebSavePath()
     local_html, local_time = web_save_path.get_html_path_abs(
         url, url_type)
     if local_html is None or local_time is None:
         sys.stderr.write('%s  get_web_pic, web not be saved: %s\n' %
                          (time.ctime(), url))
         return '', ''
     # webpage blockpage
     webpage_path = local_time + '/webpage.jpeg'
     img_type = 'webpage'
     if not os.path.exists(webpage_path):
         main_html_path = local_time + '/main.html'
         if not os.path.exists(main_html_path):
             sys.stderr.write('%s  get_web_pic, main.html not be exist: %s\n' %
                              (time.ctime(), url))
             # Without main.html there can be no block.html either, so bail.
             return '', ''
         call_page_shot = CallPageShot(main_html_path, local_time, img_type)
         call_page_shot.start()
         # Poll for the marker file the shooter drops when finished.
         while not os.path.exists(local_time + '/shot_over_sign'):
             time.sleep(0.5)
         os.remove(local_time + '/shot_over_sign')
     # insert blockpage
     blockpage_path = local_time + '/blockpage.jpeg'
     img_type = 'blockpage'
     if not os.path.exists(blockpage_path):
         block_html_path = local_time + '/block.html'
         if not os.path.exists(block_html_path):
             sys.stderr.write('%s  get_web_pic, block.html not be exist: %s\n' %
                              (time.ctime(), url))
             return '', ''
         call_page_shot = CallPageShot(
             block_html_path, local_time, img_type)
         call_page_shot.start()
         # NOTE(review): polls at 0.2s here vs 0.5s above -- presumably
         # intentional, but worth confirming.
         while not os.path.exists(local_time + '/shot_over_sign'):
             time.sleep(0.2)
         os.remove(local_time + '/shot_over_sign')
     if path_type == 'rel':
         # Re-resolve as relative paths for callers that store them.
         local_html, local_time = web_save_path.get_html_path_rel(
             url, url_type)
         webpage_path = local_time + '/webpage.jpeg'
         blockpage_path = local_time + '/blockpage.jpeg'
     return webpage_path, blockpage_path
示例#6
0
    def view_work(self, url, web_type):
        """Gather visual-block image features for *url* and persist them.

        Returns None both on failure (page not saved, no visual tree) and
        after a successful feature extraction.
        """
        self.vtree = []
        path_helper = WebSavePath()
        filename, saved_path = path_helper.get_html_path_abs(url, web_type)
        if saved_path is None:
            sys.stderr.write(' view_work:no this path')
            return None
        # Visual-tree blocks for the page, previously stored in mongo.
        self.vtree = self.mongo_operate.get_web_tree(url, web_type)
        if self.vtree is False or self.vtree == []:
            return None
        block_view = ViewPageBlock(saved_path, self.vtree, self.current_path,
                                   url, self.mongo_operate, web_type)
        block_view.gather_vips_pic()  # capture the per-block page images
        block_view.save_feature()  # normalise features and store in mongo
示例#7
0
文件: getkeyword.py 项目: wyl-hit/job
 def __init__(self, mongo_operate=None):
     """Initialise the extractor; *mongo_operate* may be None."""
     self.mongo_operate = mongo_operate  # mongo accessor used for persistence
     self.web_save_path = WebSavePath()  # resolves local save paths for urls
     # Per-page state, populated by later extraction calls.
     self.title = ''
     self.keyword = ''
     self.Html = ''
     self.url = ''
     self.div_num = 0
示例#8
0
    def view_work(self, url, web_type):
        """Gather visual-block image features for *url* and persist them."""
        self.vtree = []
        h = WebSavePath()
        filename, path = h.get_html_path_abs(url, web_type)
        if path is not None:
            self.vtree = self.mongo_operate.get_web_tree(
                url, web_type)  # fetch the page's visual-block tree by url
            if self.vtree is False or self.vtree == []:
                    return None
            p = ViewPageBlock(
                path, self.vtree, self.current_path, url, self.mongo_operate, web_type)
            p.gather_vips_pic()  # capture the per-block page images
            p.save_feature()     # normalise image features and store in mongo
        else:
            sys.stderr.write(' view_work:no this path')
            return None
示例#9
0
    def __init__(self, task_id, get_protected_iter, get_gray_iter,
                 get_counterfeit_iter, get_monitor_iter, mongo_operate,
                 update_running_state, update_finish_state, mysql_handle,
                 run_start_time):
        """Build the browser: wire Qt signals, load the JS shim, start crawling.

        The four *_iter arguments are priority-ordered URL sources; the
        remaining arguments are persistence and state callbacks shared with
        the crawler driver.
        """
        super(Browser, self).__init__()
        self.task_id = task_id
        self.mongo_operate = mongo_operate
        self.update_running_state = update_running_state
        self.update_finish_state = update_finish_state
        self.get_protected_iter = get_protected_iter
        self.get_gray_iter = get_gray_iter
        self.get_counterfeit_iter = get_counterfeit_iter
        self.get_monitor_iter = get_monitor_iter
        self.mysql_handle = mysql_handle
        self.run_start_time = run_start_time

        global _CURRENT_PATH
        global _LIVE_LOG_PATH
        self.current_path = _CURRENT_PATH
        self.live_log_path = _LIVE_LOG_PATH

        self.web_save_path = WebSavePath()
        self.main_page = self.page()  # QWebPage
        self.main_page.javaScriptAlert = self._alert
        self.main_frame = self.main_page.mainFrame()  # QWebFrame
        # Run load_finished() once the page has finished loading.
        self.main_frame.loadFinished.connect(self.load_finished)
        self.pjs = PythonJavascript(self, mongo_operate, update_running_state,
                                    mysql_handle)
        # The JS window object is cleared on every navigation, so re-register
        # the python bridge object via addpjs on each clear.
        self.main_frame.javaScriptWindowObjectCleared.connect(
            self.addpjs)  # register the python bridge object
        with open(self.current_path + "/script.js", "r") as f:
            self.script = f.read()

        # Liveness file polled by the watchdog to detect a hung Qt engine.
        self.qt_live_path = '/tmp/' + str(task_id) + '_qt_callback.txt'
        self.check_qt_alive = open(self.qt_live_path, 'w')
        self.engine_pid = os.getpid()

        self.load_url()
示例#10
0
 def __init__(self, url_type_list, mongo_operate):
     """Prepare save paths and parsers for one (url, url_type) pair.

     *url_type_list* is a two-item sequence: [url, url_type].
     """
     # e.g. url = "http://www.outofmemory.cn/11/12.htm"
     self.url = url_type_list[0]
     self.url_type = url_type_list[1]
     self.parts = urlparse(self.url)
     self.host = self.parts.netloc
     # u_pro: "http"  u_s1: "www.outofmemory.cn/11/12.htm"
     self.u_pro, u_s1 = splittype(self.url)
     # u_host: "www.outofmemory.cn"   u_path: "/11/12.htm"
     self.u_host, self.u_path = splithost(u_s1)
     if self.u_host is None:
         # NOTE(review): returning from __init__ only yields None; the
         # instance is still created half-initialised -- consider raising.
         return None
     self.u_host, port = splitport(self.u_host)
     # Yields "http://www.outofmemory.cn"
     self.u_phost = self.u_pro + '://' + self.u_host
     # self.url_md5 = self.get_folder_name_split(url)
     self.gray_create_time = str(
         time.strftime('%Y-%m-%d %H:%M', time.localtime(time.time())))
     # WebSavePath resolves the save path and MD5 identity for the url.
     self.web_save_path = WebSavePath()
     self.upath, self.identity_list = self.web_save_path.get_file_save_path(
         self.url, self.url_type)
     # s_path layout: <host-path>/<md5>/<timestamp>
     self.s_path = self.upath + '/' + self.gray_create_time
     # extract_html pulls title/keyword from the page, writes them to mongo.
     self.parse_web_text = extract_html(mongo_operate)
     self.css_path = 'css'
     self.js_path = 'js'
     self.html_path = 'html'
     self.pic_path = 'pic'
     # Root directory for saved data: .../web_save/web_info
     self.r_path = pjoin(_PATH, 'web_info')
     # Host-level directory: <working dir>/web_info/<host-path>/<md5>
     self.upath = pjoin(self.r_path, self.upath)
     self.headers = {}
     self.total_list = []
示例#11
0
    def __init__(self, task_id, get_protected_iter, get_gray_iter,
                 get_counterfeit_iter, get_monitor_iter, mongo_operate,
                 update_running_state, update_finish_state, mysql_handle,
                 run_start_time):
        """Build the browser: wire Qt signals, load the JS shim, start crawling.

        The four *_iter arguments are priority-ordered URL sources; the
        remaining arguments are persistence and state callbacks shared with
        the crawler driver.
        """
        super(Browser, self).__init__()
        self.task_id = task_id
        self.mongo_operate = mongo_operate
        self.update_running_state = update_running_state
        self.update_finish_state = update_finish_state
        self.get_protected_iter = get_protected_iter
        self.get_gray_iter = get_gray_iter
        self.get_counterfeit_iter = get_counterfeit_iter
        self.get_monitor_iter = get_monitor_iter
        self.mysql_handle = mysql_handle
        self.run_start_time = run_start_time

        global _CURRENT_PATH
        global _LIVE_LOG_PATH
        self.current_path = _CURRENT_PATH
        self.live_log_path = _LIVE_LOG_PATH

        self.web_save_path = WebSavePath()
        self.main_page = self.page()  # QWebPage
        self.main_page.javaScriptAlert = self._alert
        self.main_frame = self.main_page.mainFrame()  # QWebFrame
        # Run load_finished() once the page has finished loading.
        self.main_frame.loadFinished.connect(
            self.load_finished)
        self.pjs = PythonJavascript(
            self, mongo_operate, update_running_state, mysql_handle)
        # The JS window object is cleared on every navigation, so re-register
        # the python bridge object via addpjs on each clear.
        self.main_frame.javaScriptWindowObjectCleared.connect(
            self.addpjs)  # register the python bridge object
        with open(self.current_path + "/script.js", "r") as f:
            self.script = f.read()

        # Liveness file polled by the watchdog to detect a hung Qt engine.
        self.qt_live_path = '/tmp/' + str(task_id) + '_qt_callback.txt'
        self.check_qt_alive = open(self.qt_live_path, 'w')
        self.engine_pid = os.getpid()

        self.load_url()
示例#12
0
文件: getkeyword.py 项目: wyl-hit/job
class extract_html():
    """Extract title / keyword / div-count from a saved page and persist them.

    Python 2 code: operates on byte strings and uses ``unicode`` for the
    GB2312 -> UTF-8 transcoding of title and keywords.
    """

    def __init__(self, mongo_operate=None):
        """Bind the extractor to *mongo_operate* (may be None for offline use)."""
        self.mongo_operate = mongo_operate
        self.web_save_path = WebSavePath()
        # Per-page state, populated by get_html_file / get_keyword.
        self.title = ''
        self.keyword = ''
        self.Html = ''
        self.url = ''
        self.div_num = 0

    def write_mongo(self, goal_url, title, text, url_type):
        """Store *title* and *text* for *goal_url* under *url_type* in mongo."""
        self.mongo_operate.add_web_title(goal_url, url_type, title)
        self.mongo_operate.add_web_text(goal_url, url_type, text)

    def get_html_file(self, url, url_type):
        """Load the locally saved main html for *url* into self.Html.

        Returns False when the page was never saved; otherwise returns None
        after populating self.Html.
        """
        html_path, time_path = self.web_save_path.get_html_path_abs(
            url, url_type)
        if html_path is None or time_path is None:
            return False
        # Fix: use a context manager so the file handle is always closed
        # (the original leaked the descriptor).
        with open(html_path, 'r') as f:
            self.Html = f.read()

    def get_keyword(self, Html=None):
        """Return [div_num, keyword, title] extracted from *Html*.

        Falls back to self.Html when *Html* is None.  The page bytes are
        decoded with the charset detected by chardet before parsing.
        """
        if Html is None:
            Html = self.Html
        char = chardet.detect(Html)['encoding']
        page = etree.HTML(Html.decode(char, 'ignore'))
        title = page.xpath('/html/head/title/text()')
        self.title = title[0].strip()

        match = re.findall(r"<div.*?>(.*?)</div>", Html)
        self.div_num = len(match)  # number of <div> blocks on the page

        re_cdata = re.compile('//<!\[CDATA\[[^>]*//\]\]>', re.I)  # CDATA sections
        s = re_cdata.sub('', Html)
        re_br = re.compile('<br\s*?/?>')  # line breaks
        # Fix: substitute on *s*, not on the raw Html, so the CDATA
        # stripping above is not silently discarded.
        s = re_br.sub('\n', s)
        blank_line = re.compile('\n+')  # collapse blank lines
        s = blank_line.sub('', s)
        re_comment = re.compile('<!--[^>]*-->')  # HTML comments
        s = re_comment.sub('', s)
        re_style = re.compile('<style\s*[^>]*>(.*?)</style\s*>')  # style blocks
        s = re_style.sub('', s)
        re_script = re.compile('<script\s*[^>]*>(.*?)</script>')  # script blocks
        s = re_script.sub('', s)
        re_h = re.compile('</?[^>]*>')  # any remaining html tags
        s = re_h.sub('', s)
        s = replaceCharEntity(s)  # decode character entities
        s = s.replace(" ", "")
        cut_web_text = cut_all(s)
        self.keyword = get_keyword(cut_web_text)
        # Re-encode GB2312 results so everything stored downstream is UTF-8.
        if chardet.detect(self.title)['encoding'] == 'GB2312':
            self.title = unicode(self.title, "gb2312").encode('utf-8')
        if chardet.detect(self.keyword)['encoding'] == 'GB2312':
            self.keyword = unicode(self.keyword, "gb2312").encode('utf-8')
        return [self.div_num, self.keyword, self.title]

    def save_info(self, url, path, url_type):
        """Write '<div_num> <keyword>' to id.txt under *path* and persist to mongo."""
        with open(pjoin(path, 'id.txt'), 'wb') as f:
            f.write(str(self.div_num) + ' ' + self.keyword)
        # Fix: removed the stray f.close() that followed the with-block --
        # the context manager already closed the file.
        self.write_mongo(url, self.title, self.keyword, url_type)
示例#13
0
class HtmlStruct():
    """Bookkeeping for saving one page locally.

    Resolves save paths, normalises resource urls (css/js/pic/iframe) to
    absolute form, rewrites them to local relative paths in the html, and
    stores the rewritten main.html plus extracted title/keyword.
    """

    def __init__(self, url_type_list, mongo_operate):
        """Prepare save paths and parsers for one (url, url_type) pair.

        *url_type_list* is a two-item sequence: [url, url_type].
        """
        # e.g. url = "http://www.outofmemory.cn/11/12.htm"
        self.url = url_type_list[0]
        self.url_type = url_type_list[1]
        self.parts = urlparse(self.url)
        self.host = self.parts.netloc
        # u_pro: "http"  u_s1: "www.outofmemory.cn/11/12.htm"
        self.u_pro, u_s1 = splittype(self.url)
        # u_host: "www.outofmemory.cn"   u_path: "/11/12.htm"
        self.u_host, self.u_path = splithost(u_s1)
        if self.u_host is None:
            # NOTE(review): returning from __init__ only yields None; the
            # instance is still created half-initialised -- consider raising.
            return None
        self.u_host, port = splitport(self.u_host)
        # Yields "http://www.outofmemory.cn"
        self.u_phost = self.u_pro + '://' + self.u_host
        # self.url_md5 = self.get_folder_name_split(url)
        self.gray_create_time = str(
            time.strftime('%Y-%m-%d %H:%M', time.localtime(time.time())))
        # WebSavePath resolves the save path and MD5 identity for the url.
        self.web_save_path = WebSavePath()
        self.upath, self.identity_list = self.web_save_path.get_file_save_path(
            self.url, self.url_type)
        # s_path layout: <host-path>/<md5>/<timestamp>
        self.s_path = self.upath + '/' + self.gray_create_time
        # extract_html pulls title/keyword from the page, writes them to mongo.
        self.parse_web_text = extract_html(mongo_operate)
        self.css_path = 'css'
        self.js_path = 'js'
        self.html_path = 'html'
        self.pic_path = 'pic'
        # Root directory for saved data: .../web_save/web_info
        self.r_path = pjoin(_PATH, 'web_info')
        # Host-level directory: <working dir>/web_info/<host-path>/<md5>
        self.upath = pjoin(self.r_path, self.upath)
        self.headers = {}
        self.total_list = []

    def format_css_url(self, path, url):
        """Resolve a url found inside a css file (located at *path*), handling
        '/', './' and '../' relative references; absolute http urls pass
        through unchanged."""
        url = url.replace('\\', '')
        url = url.replace('"', '')
        url = url.replace("'", '')
        url = url.strip()
        if url.find('http') != 0:
            if url.find('/') == 0 or url.find('./') == 0:
                url = url[url.find('/') + 1:]
                if path.find('./') == 0:
                    return url
                else:
                    path = path[0:path.rfind('/')]
                    url = path + '/' + url
            elif url.find('../') == 0:
                if url.find('www.') != -1:
                    url = url[url.find('www.'):]
                    return url
                else:
                    # Walk one directory up per '../' prefix.
                    num = url.count('../')
                    path_list = path.split('/')
                    path_list = path_list[0:len(path_list) - num - 1]
                    if len(path_list) > 0:
                        path = '/'.join(path_list)
                        url = path + '/' + url[num * 3:]
                    else:
                        path = ''
                        url = path + url[num * 3:]
            else:
                index = path.rfind('/')
                path = path[:index]
                url = pjoin(path, url).replace('\\', '/')
        return url

    def set_response(self, body):
        """Decode the fetched *body*, parse it, and collect its resource urls."""
        # Pages are not always utf-8: detect the charset and decode with it.
        char = chardet.detect(body)['encoding']
        self.char = char
        body = body.decode(char, 'ignore')
        self.html = body
        self.page = etree.HTML(body)
        self.get_urllist()

    def add_pic(self, css_body, path, filename):
        '''Regex-match the urls inside a css file, add them to the picture
        download list, rewrite the css so the urls point at the local copies,
        save the rewritten css, and return the (remote, local) url pairs.'''
        css_url = []
        css_pic_format = []
        char_css = chardet.detect(css_body)['encoding']
        css_pic = re.findall(
            _CSSIMGURLRE, css_body.decode(char_css, 'ignore').encode('utf8'), re.S)
        for i in css_pic:
            # Skip empty matches and inline data: urls.
            if i and i.find('data:') != 0:
                css_pic_format.append([])
                re_url = self.format_css_url(path, i)
                css_pic_format[-1] = (self.format_url(re_url), i)
        css_url = _FC(
            (pjoin(self.r_path, self.s_path, self.css_path), self.css_path), css_pic_format)
        for url in css_url:
            css_body = css_body.replace(url[0][1], url[2].replace('\\', '/'))

        css_path = pjoin(pjoin(self.r_path, self.s_path), 'css')
        save_to_file(css_body, css_path, filename)
        return css_url

    def get_urllist(self):
        """Collect css/js/picture/frame urls from the parsed page into
        self.css_list / self.js_list / self.pic_list / self.html_list."""
        css_list = self.page.xpath('//*/link[@rel="stylesheet"]/@href')
        js_list = self.page.xpath('//*/script/@src')
        pic_list = self.page.xpath('//*/link[@rel="shortcut icon"]/@href')
        pic_list += self.page.xpath('//*/link[@rel="Shortcut Icon"]/@href')
        pic_list += self.page.xpath('//*/link[@rel="icon"]/@href')
        pic_list += re.findall(_CSSIMGURLRE, self.html, re.S)
        pic_list += self.page.xpath('//img/@original')
        pic_list += self.page.xpath('//img/@src')
        pic_list += self.page.xpath('//div/@src')
        pic_list += self.page.xpath('//input/@src')
        pic_list += self.page.xpath('//*/@background')
        html_list = self.page.xpath('//iframe/@src')
        html_list += self.page.xpath('//frame/@src')
        # De-duplicate each list before formatting.
        html_list = list(set(html_list))
        pic_list = list(set(pic_list))
        css_list = list(set(css_list))
        js_list = list(set(js_list))
        # Pair each usable url with its absolute form, skipping data: urls.
        fc = lambda url_list: [(self.format_url(url).replace(
            '\\', '/'), url) for url in url_list if url and url.find('data:') != 0]
        self.pic_list = fc(pic_list)
        self.css_list = fc(css_list)
        self.html_list = fc(html_list)
        self.css_list = _FC(
            (pjoin(self.r_path, self.s_path, self.css_path), self.css_path), self.css_list)
        self.js_list = fc(js_list)

    def format_url(self, url):
        """Normalise *url* to an absolute url rooted at this page's host,
        resolving '//', 'www.', './', '../' and bare relative forms."""
        url = url.replace('\\', '')
        url = url.replace("'", '')
        url = url.replace('"', '')
        url = url.strip()
        if url.find('//') == 0:
            return self.u_pro + ':' + url
        if url.find('www.') == 0:
            return self.u_pro + '://' + url
        if url.find('http') != 0:
            if url.find('./') == 0:
                r_url = url[url.find('/') + 1:]
                path = os.path.join(
                    self.u_path[0:self.u_path.rfind('/')], r_url).replace('\\', '/')
            elif url.find('../') == 0:
                u_path = '/'.join(self.u_path.split('/')[:-2])
                r_url = url[url.find('/') + 1:]
                path = os.path.join(u_path, r_url).replace('\\', '/')
            elif url.find('/') == 0:
                path = url
            else:
                path = os.path.join(
                    self.u_path[0:self.u_path.rfind('/') + 1], url).replace('\\', '/')
            if path:
                if path[0] == '.':
                    path = os.path.join(self.url, path[path.find('/'):])
                if path[0] != '/':
                    return self.u_phost + '/' + path
            return self.u_phost + path
        return url

    def store(self):
        """Rewrite resource urls in the html to local paths and save the page.

        Returns (saved_flag, url, url_type): saved_flag is 0 when the page
        matches the previously stored identity (skipped), 1 when a new copy
        was written.
        """
        dl_list = []
        dl_list += _FC((pjoin(self.r_path, self.s_path,
                              self.pic_path), self.pic_path), self.pic_list)
        dl_list += _FC((pjoin(self.r_path, self.s_path,
                              self.js_path), self.js_path), self.js_list)
        dl_list += _FC((pjoin(self.r_path, self.s_path,
                              self.html_path), self.html_path), self.html_list)
        self.total_list += dl_list
        dl_list += self.css_list
        self.dl_list = dl_list
        # Point every remote resource reference at its local copy.
        for dl in dl_list:
            if dl[0][1] != ' ':
                self.html = self.html.replace(
                    dl[0][1], pjoin(dl[1][1], dl[2]).replace('\\', '/'))
        # Neutralise <base href> so relative paths resolve locally.
        self.html = self.html.replace("base href", "cc")
        self.html = self.html.encode('utf-8', 'ignore')
        # print 'call back all url', self.url
        # s_path layout: <working dir>/web/<host>/<timestamp>/
        s_path = pjoin(self.r_path, self.s_path)

        now_web_info = self.parse_web_text.get_keyword(self.html)
        print 'now_web_info', now_web_info

        # Unchanged page (same div count or same keyword): skip re-saving.
        if self.identity_list is not None:
            if self.identity_list[0] == now_web_info[0] or self.identity_list[1] == now_web_info[1]:
                return 0, self.url, self.url_type
        # print 'call back not saved, wait save url', self.url
        save_to_file(self.html, s_path, 'main.html')
        with open(pjoin(s_path, 'url'), 'wb') as f:
            f.write(self.url)
        # Extract title/keyword from the page and persist them to mongo.
        self.parse_web_text.save_info(self.url, s_path, self.url_type)
        return 1, self.url, self.url_type
示例#14
0
class Browser(QWebView):

    '''
    simulation browser, render URL, parse it dom tree and text and webpage
    '''

    def __init__(self, task_id, get_protected_iter, get_gray_iter,
                 get_counterfeit_iter, get_monitor_iter, mongo_operate,
                 update_running_state, update_finish_state, mysql_handle,
                 run_start_time):
        """Build the browser: wire Qt signals, load the JS shim, start crawling.

        The four *_iter arguments are priority-ordered URL sources; the
        remaining arguments are persistence and state callbacks shared with
        the crawler driver.
        """
        super(Browser, self).__init__()
        self.task_id = task_id
        self.mongo_operate = mongo_operate
        self.update_running_state = update_running_state
        self.update_finish_state = update_finish_state
        self.get_protected_iter = get_protected_iter
        self.get_gray_iter = get_gray_iter
        self.get_counterfeit_iter = get_counterfeit_iter
        self.get_monitor_iter = get_monitor_iter
        self.mysql_handle = mysql_handle
        self.run_start_time = run_start_time

        global _CURRENT_PATH
        global _LIVE_LOG_PATH
        self.current_path = _CURRENT_PATH
        self.live_log_path = _LIVE_LOG_PATH

        self.web_save_path = WebSavePath()
        self.main_page = self.page()  # QWebPage
        self.main_page.javaScriptAlert = self._alert
        self.main_frame = self.main_page.mainFrame()  # QWebFrame
        # Run load_finished() once the page has finished loading.
        self.main_frame.loadFinished.connect(
            self.load_finished)
        self.pjs = PythonJavascript(
            self, mongo_operate, update_running_state, mysql_handle)
        # The JS window object is cleared on every navigation, so re-register
        # the python bridge object via addpjs on each clear.
        self.main_frame.javaScriptWindowObjectCleared.connect(
            self.addpjs)  # register the python bridge object
        with open(self.current_path + "/script.js", "r") as f:
            self.script = f.read()

        # Liveness file polled by the watchdog to detect a hung Qt engine.
        self.qt_live_path = '/tmp/' + str(task_id) + '_qt_callback.txt'
        self.check_qt_alive = open(self.qt_live_path, 'w')
        self.engine_pid = os.getpid()

        self.load_url()

    def _alert(self, frame, message):
        # Swallow javascript alert() dialogs so rendering never blocks.
        pass

    def over_handle(self):
        '''
        run over, write to log in tmp
        '''
        with open(self.live_log_path + '/' + str(os.getpid()) + '.txt', 'a+') as f:
            f.write('over')
        sys.stdout.write('parse over!\n')

    def load_url(self):
        '''
        parse once url, first load it
        '''
        global _CURRENT_URL_TYPE
        # Drain the four URL sources in priority order; when all are
        # exhausted, record the finish state and terminate the process.
        try:
            url = self.get_protected_iter.next()
            _CURRENT_URL_TYPE = 'protected'
        except StopIteration:
            try:
                url = self.get_gray_iter.next()
                _CURRENT_URL_TYPE = 'gray'
            except StopIteration:
                try:
                    url = self.get_counterfeit_iter.next()
                    _CURRENT_URL_TYPE = 'counterfeit'
                except StopIteration:
                    try:
                        url = self.get_monitor_iter.next()
                        _CURRENT_URL_TYPE = 'monitor'
                    except StopIteration:
                        self.over_handle()
                        global _CRAWLER_NUM
                        run_time = int(time.time() - self.run_start_time)
                        self.update_finish_state(_CRAWLER_NUM, run_time)
                        os.remove(self.qt_live_path)
                        os._exit(0)
        # Write the live running state into the anti-hang watchdog file.
        self.check_qt_alive.seek(0)  # empty the file first
        self.check_qt_alive.truncate(0)
        self.check_qt_alive.write(url + ' ' + str(_CRAWLER_NUM) + ' ' + str(self.engine_pid))
        self.check_qt_alive.flush()
        local_html, local_time = self.web_save_path.get_html_path_abs(
            url, _CURRENT_URL_TYPE)
        global _LOCAL_TIME
        _LOCAL_TIME = local_time
        if local_html is None or local_time is None:
            sys.stdout.write(
                'url not be saved: %s, task_id: %d\n' % (url, self.task_id))
            # Skip unsaved urls and recurse straight on to the next one.
            self.load_url()
        else:
            global _CURRENT_URL
            global _CURRENT_TIME_PATH
            _CURRENT_URL = url
            _CURRENT_TIME_PATH = local_time
            #print 'load:', _CURRENT_URL
            self.load(QUrl(local_html))

    def border_webpage(self):
        '''
        get web page border and webpage
        ** this method inaccurate, page will change longer **
        ** give up use **
        '''
        size = self.main_page.mainFrame().contentsSize()
        global _CURRENT_URL
        global _CURRENT_URL_TYPE
        nwe_border_list = [size.width(), size.height()]
        self.mongo_operate.add_web_border(
            _CURRENT_URL, _CURRENT_URL_TYPE, nwe_border_list)
        # (debug) print page width / height
        self.main_page.setViewportSize(
            QSize(size.width() + 16, size.height()))
        img = QImage(size, QImage.Format_ARGB32)
        painter = QPainter(img)
        self.main_page.mainFrame().render(painter)
        painter.end()
        global _CURRENT_TIME_PATH
        img_path = _CURRENT_TIME_PATH + '/webpage.jpeg'
        if not img.save(img_path):
            sys.stderr.write('%s  error to save img: %s,  path: %s\n' %
                             (time.ctime(), _CURRENT_URL,
                              _CURRENT_TIME_PATH))

    def load_finished(self, finished):
        '''
        load_url over, save it text and ...
        '''
        #global _CURRENT_URL
        # print 'finished:', finished, _CURRENT_URL
        self.pjs.set_vtree_text(str(self.main_frame.toPlainText()))
        self.main_frame.evaluateJavaScript(self.script)

    def addpjs(self):
        self.main_frame.addToJavaScriptWindowObject(
            "python", self.pjs)  # register the Python bridge object with JS
示例#15
0
class Browser(QWebView):
    '''
    Simulated browser: renders locally saved pages one URL at a time and,
    through an injected JavaScript bridge, extracts each page's DOM tree,
    text and screenshot.
    '''
    def __init__(self, task_id, get_protected_iter, get_gray_iter,
                 get_counterfeit_iter, get_monitor_iter, mongo_operate,
                 update_running_state, update_finish_state, mysql_handle,
                 run_start_time):
        '''
        Wire up the QWebPage/QWebFrame, register the Python<->JS bridge,
        and start crawling by loading the first URL.

        task_id              -- numeric id of this crawl task
        get_*_iter           -- iterators yielding URLs per category
        mongo_operate        -- MongoDB helper used here and by the JS bridge
        update_running_state -- progress-report callback
        update_finish_state  -- completion-report callback
        mysql_handle         -- MySQL helper handed to the JS bridge
        run_start_time       -- epoch seconds when the run started
        '''
        super(Browser, self).__init__()
        self.task_id = task_id
        self.mongo_operate = mongo_operate
        self.update_running_state = update_running_state
        self.update_finish_state = update_finish_state
        self.get_protected_iter = get_protected_iter
        self.get_gray_iter = get_gray_iter
        self.get_counterfeit_iter = get_counterfeit_iter
        self.get_monitor_iter = get_monitor_iter
        self.mysql_handle = mysql_handle
        self.run_start_time = run_start_time

        # Snapshot module-level paths that were set up before construction.
        global _CURRENT_PATH
        global _LIVE_LOG_PATH
        self.current_path = _CURRENT_PATH
        self.live_log_path = _LIVE_LOG_PATH

        self.web_save_path = WebSavePath()
        self.main_page = self.page()  # QWebPage
        self.main_page.javaScriptAlert = self._alert  # swallow JS alert() popups
        self.main_frame = self.main_page.mainFrame()  # QWebFrame
        # Connect the local QWebView's loadFinished signal so that
        # load_finished() runs after each page finishes loading.
        self.main_frame.loadFinished.connect(self.load_finished)
        self.pjs = PythonJavascript(self, mongo_operate, update_running_state,
                                    mysql_handle)
        # addToJavaScriptWindowObject must be re-run on every page (re)load;
        # javaScriptWindowObjectCleared fires each time the JS window object
        # is reset, so connect it to addpjs().
        self.main_frame.javaScriptWindowObjectCleared.connect(
            self.addpjs)  # bind re-registration of the Python bridge object
        with open(self.current_path + "/script.js", "r") as f:
            self.script = f.read()

        # Liveness file polled by an external watchdog to detect a hung
        # Qt engine. NOTE(review): this handle is never closed; it lives
        # for the process lifetime (the process exits via os._exit below).
        self.qt_live_path = '/tmp/' + str(task_id) + '_qt_callback.txt'
        self.check_qt_alive = open(self.qt_live_path, 'w')
        self.engine_pid = os.getpid()

        self.load_url()

    def _alert(self, frame, message):
        # Ignore JavaScript alert dialogs so rendering never blocks.
        pass

    def over_handle(self):
        '''
        Run is over: append an 'over' marker to this process's live-log
        file under live_log_path and report on stdout.
        '''
        with open(self.live_log_path + '/' + str(os.getpid()) + '.txt',
                  'a+') as f:
            f.write('over')
        sys.stdout.write('parse over!\n')

    def load_url(self):
        '''
        Pull the next URL (priority: protected > gray > counterfeit >
        monitor) and load its locally saved HTML into the web view.
        Exits the whole process via os._exit(0) once every iterator is
        exhausted.
        '''
        global _CURRENT_URL_TYPE
        try:
            url = self.get_protected_iter.next()
            _CURRENT_URL_TYPE = 'protected'
        except StopIteration:
            try:
                url = self.get_gray_iter.next()
                _CURRENT_URL_TYPE = 'gray'
            except StopIteration:
                try:
                    url = self.get_counterfeit_iter.next()
                    _CURRENT_URL_TYPE = 'counterfeit'
                except StopIteration:
                    try:
                        url = self.get_monitor_iter.next()
                        _CURRENT_URL_TYPE = 'monitor'
                    except StopIteration:
                        # Nothing left to crawl: record completion,
                        # remove the watchdog file and terminate.
                        self.over_handle()
                        global _CRAWLER_NUM
                        run_time = int(time.time() - self.run_start_time)
                        self.update_finish_state(_CRAWLER_NUM, run_time)
                        os.remove(self.qt_live_path)
                        os._exit(0)
        # Write the current running state into the anti-hang watchdog file.
        self.check_qt_alive.seek(0)  # rewind before truncating the file
        self.check_qt_alive.truncate(0)
        self.check_qt_alive.write(url + ' ' + str(_CRAWLER_NUM) + ' ' +
                                  str(self.engine_pid))
        self.check_qt_alive.flush()
        local_html, local_time = self.web_save_path.get_html_path_abs(
            url, _CURRENT_URL_TYPE)
        global _LOCAL_TIME
        _LOCAL_TIME = local_time
        if local_html is None or local_time is None:
            sys.stdout.write('url not be saved: %s, task_id: %d\n' %
                             (url, self.task_id))
            # Skip unsaved URLs by recursing. NOTE(review): recursion depth
            # equals the run of consecutive unsaved URLs — could hit the
            # recursion limit on a long run of missing pages.
            self.load_url()
        else:
            global _CURRENT_URL
            global _CURRENT_TIME_PATH
            _CURRENT_URL = url
            _CURRENT_TIME_PATH = local_time
            #print 'load:', _CURRENT_URL
            self.load(QUrl(local_html))

    def border_webpage(self):
        '''
        Record the rendered page size (border) in Mongo and save a
        full-page screenshot as webpage.jpeg.

        ** inaccurate: the page size can keep growing after load **
        ** deprecated -- no longer used **
        '''
        size = self.main_page.mainFrame().contentsSize()
        global _CURRENT_URL
        global _CURRENT_URL_TYPE
        nwe_border_list = [size.width(), size.height()]
        self.mongo_operate.add_web_border(_CURRENT_URL, _CURRENT_URL_TYPE,
                                          nwe_border_list)
        # print u"页面宽:%d,页面高:%d" % (size.width(), size.height())
        # +16 px of extra width — presumably scrollbar allowance; TODO confirm.
        self.main_page.setViewportSize(QSize(size.width() + 16, size.height()))
        img = QImage(size, QImage.Format_ARGB32)
        painter = QPainter(img)
        self.main_page.mainFrame().render(painter)
        painter.end()
        global _CURRENT_TIME_PATH
        img_path = _CURRENT_TIME_PATH + '/webpage.jpeg'
        if not img.save(img_path):
            sys.stderr.write('%s  error to save img: %s,  path: %s\n' %
                             (time.ctime(), _CURRENT_URL, _CURRENT_TIME_PATH))

    def load_finished(self, finished):
        '''
        Slot run when a page finishes loading: hand the page's plain text
        to the JS bridge, then execute the crawler script in the page.
        '''
        #global _CURRENT_URL
        # print 'finished:', finished, _CURRENT_URL
        self.pjs.set_vtree_text(str(self.main_frame.toPlainText()))
        self.main_frame.evaluateJavaScript(self.script)

    def addpjs(self):
        '''Expose self.pjs to page JavaScript as the global object "python".'''
        self.main_frame.addToJavaScriptWindowObject(
            "python", self.pjs)  # register the Python object with Javascript
Example #16
0
File: getkeyword.py  Project: wyl-hit/job
class extract_html():
    '''
    Extract the title, keywords and <div> count from a locally saved HTML
    page, and persist the results (MongoDB plus a per-page id.txt file).
    '''

    def __init__(self, mongo_operate=None):
        '''
        mongo_operate -- MongoDB helper used by write_mongo; may be None
        when only parsing (get_keyword) is needed.
        '''
        self.mongo_operate = mongo_operate
        self.web_save_path = WebSavePath()
        self.title = ''
        self.keyword = ''
        self.Html = ''
        self.url = ''
        self.div_num = 0

    def write_mongo(self, goal_url, title, text, url_type):
        '''Store the extracted title and keyword text for goal_url.'''
        self.mongo_operate.add_web_title(goal_url, url_type, title)
        self.mongo_operate.add_web_text(goal_url, url_type, text)

    def get_html_file(self, url, url_type):
        '''
        Load the locally saved HTML for url into self.Html.

        Returns False when the page was never saved locally; otherwise
        reads the file and returns None (unchanged historical behavior).
        '''
        html_path, time_path = self.web_save_path.get_html_path_abs(
            url, url_type)
        if html_path is None or time_path is None:
            return False
        # Fix: use a context manager so the handle is always closed
        # (the original leaked the open file object).
        with open(html_path, 'r') as f:
            self.Html = f.read()

    def get_keyword(self, Html=None):
        '''
        Parse Html (default: self.Html): extract the <title>, count <div>
        tags, strip markup, and derive keywords from the remaining text.

        Returns [div_num, keyword, title].
        '''
        if Html is None:
            Html = self.Html
        char = chardet.detect(Html)['encoding']
        page = etree.HTML(Html.decode(char, 'ignore'))
        title = page.xpath('/html/head/title/text()')
        # Fix: guard against pages with no <title> (empty xpath result
        # used to raise IndexError).
        self.title = title[0].strip() if title else ''

        match = re.findall(r"<div.*?>(.*?)</div>", Html)
        self.div_num = len(match)   # get the <div> number

        re_cdata = re.compile('//<!\[CDATA\[[^>]*//\]\]>', re.I)  # CDATA sections
        s = re_cdata.sub('', Html)
        re_br = re.compile('<br\s*?/?>')  # line breaks
        # Fix: substitute on `s`, not `Html` — the original discarded the
        # CDATA removal above by restarting from the raw input.
        s = re_br.sub('\n', s)
        blank_line = re.compile('\n+')  # collapse newlines
        s = blank_line.sub('', s)
        re_comment = re.compile('<!--[^>]*-->')  # HTML comments
        s = re_comment.sub('', s)
        re_style = re.compile('<style\s*[^>]*>(.*?)</style\s*>')  # style blocks
        s = re_style.sub('', s)
        re_script = re.compile('<script\s*[^>]*>(.*?)</script>')  # script blocks
        s = re_script.sub('', s)
        re_h = re.compile('</?[^>]*>')  # remaining HTML tags
        s = re_h.sub('', s)
        s = replaceCharEntity(s)  # replace HTML character entities
        s = s.replace(" ", "")
        cut_web_text = cut_all(s)
        # Calls the module-level get_keyword() helper, not this method.
        self.keyword = get_keyword(cut_web_text)
        # Normalize GB2312-encoded results to UTF-8.
        if chardet.detect(self.title)['encoding'] == 'GB2312':
            self.title = unicode(self.title, "gb2312").encode('utf-8')
        if chardet.detect(self.keyword)['encoding'] == 'GB2312':
            self.keyword = unicode(self.keyword, "gb2312").encode('utf-8')
        return [self.div_num, self.keyword, self.title]

    def save_info(self, url, path, url_type):
        '''Write "<div_num> <keyword>" to <path>/id.txt and persist to Mongo.'''
        with open(pjoin(path, 'id.txt'), 'wb') as f:
            f.write(str(self.div_num) + ' ' + self.keyword)
        # (Removed a stray f.close(): the `with` block already closed f.)
        self.write_mongo(url, self.title, self.keyword, url_type)