Example #1
    def fetch_html(self, url, encoding='', errors='strict'):
        if not errors:
            errors = 'strict'

        bytes_data, bytes_encoding = self.fetch_bytes_encoding(url)

        # get encoding
        if not encoding:
            # encoding reported by the server (Content-Type header)
            encoding = bytes_encoding

            if not encoding:
                # chardet detection
                encoding = Fetcher.d(url, bytes_data)

                if not encoding:
                    # default: utf-8
                    print('fetcher:打开%s时,使用utf-8作默认' % url)
                    encoding = 'utf-8'

        # decode
        try:
            return bytes_data.decode(encoding, errors)
        except Exception:
            print('下载器<解文本编码>失败')

            s = '字节长度:%d.使用编码:%s.网址:%s' % \
                (len(bytes_data), encoding, url)
            raise c_worker_exception('解文本编码(decode)时出现异常', url, s)
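
For reference, here is a minimal usage sketch of fetch_html. It assumes only what the do_process example further down shows: a no-argument Fetcher() constructor, with failures reported via c_worker_exception as above.

f = Fetcher()

# let the server header / chardet / the utf-8 fallback pick the encoding
html = f.fetch_html('https://example.com/')

# or force an encoding and replace undecodable bytes instead of raising
html = f.fetch_html('https://example.com/', encoding='gbk', errors='replace')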
Example #2
def pattern_error(blocknum, isblock=True):
    if isblock:
        s = '第%d个block的blockre编译失败' % (blocknum + 1)
    else:
        s = '第%d个block的itemre编译失败' % (blocknum + 1)

    raise c_worker_exception('正则表达式编译失败',
                             '',
                             s)
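
pattern_error fires when the red.d helper returns None, i.e. when a regular expression fails to compile (see the parse_html examples below). The red module itself is not part of this listing; a minimal sketch of what it presumably looks like, assuming it is just a thin wrapper over the standard re module that returns None instead of raising on a bad pattern:

# Hypothetical sketch of the red helper module (not part of the original source).
import re

# re-export the flag names the examples use (red.DOTALL, red.IGNORECASE, red.A)
DOTALL = re.DOTALL
IGNORECASE = re.IGNORECASE
A = re.A

def d(pattern, flags=0):
    '''Compile pattern and return the compiled object, or None if it fails.'''
    try:
        return re.compile(pattern, flags)
    except re.error:
        return None

def sub(pattern, repl, string, count=0, flags=0):
    '''Thin pass-through to re.sub, matching the call in parse_xml below.'''
    return re.sub(pattern, repl, string, count=count, flags=flags)

Returning None instead of raising lets each caller attach its own context, which is exactly what pattern_error does above.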
Example #3
def do_process(data_dict, worker_dict):
    package = data_dict['package']

    url = 'https://pypi.python.org/pypi/%s/' % package

    f = Fetcher()
    string = f.fetch_html(url)

    if not string:
        raise c_worker_exception('无法下载url', url)

    # single page
    single_re = (r'<span class="breadcrumb-separator">.*?'
                 r'<span class="breadcrumb-separator">.*?'
                 r'<a href="(/pypi/([^/]+)/([^"]+))">.*?'
                 r'class="odd".*?'
                 r'<td>(\d{4}-\d{1,2}-\d{1,2})</td>'
                 )

    prog = red.d(single_re, red.DOTALL)
    m = prog.search(string)
    if m:
        info = c_info()
        info.title = m.group(2) + ' ' + m.group(3)
        info.url = url
        info.pub_date = m.group(4)
        info.suid = info.title

        return [info]

    # table page
    table_re = (r'<tr class="(?:odd|even)">.*?'
                r'<a href="(/pypi/([^/]+)/([^"]+))">')
    prog = red.d(table_re, red.DOTALL)
    miter = prog.finditer(string)

    lst = list()
    for m in miter:
        info = c_info()
        info.title = m.group(2) + ' ' + m.group(3)
        info.url = 'https://pypi.python.org' + m.group(1)
        info.suid = info.title

        lst.append(info)
    return lst
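
The c_info container filled in above is not shown in this listing either. A minimal sketch covering just the fields the examples assign, assuming a plain attribute bag with empty-string defaults:

# Hypothetical sketch of c_info (the real class may carry more fields or logic).
class c_info:
    def __init__(self):
        self.title = ''     # item title
        self.url = ''       # item link
        self.summary = ''   # short description / body excerpt
        self.author = ''    # author name
        self.pub_date = ''  # publication date string
        self.suid = ''      # stable unique id used for de-duplication
        self.temp = ''      # scratch field filled by some map rules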
Example #4
def parse_xml(data_dict, xml):
    if not xml:
        raise c_worker_exception('xml为空字符串', data_dict['url'], '')

    r = red.d(r'^\s*$')
    if r.match(xml) is not None:
        raise c_worker_exception('xml只有空白', data_dict['url'], '')

    # strip the namespace declaration from the Atom <feed> tag
    xml = red.sub(r'''<feed\s+(?:"[^"]*"|'[^']*'|[^"'>])*>''',
                  r'<feed>',
                  xml,
                  count=1,
                  flags=red.IGNORECASE | red.A)

    # ElementTree
    try:
        doc = ET.fromstring(xml)
    except Exception as e:
        doc = None
        if not etree:
            raise c_worker_exception('解析XML失败,可以尝试安装lxml模块', data_dict['url'],
                                     str(e))

    # lxml
    if doc is None:
        try:
            parser = etree.XMLParser(recover=True, encoding='utf-8')
            doc = etree.fromstring(xml.encode('utf-8'), parser=parser)
            print('使用lxml解析%s' % data_dict['url'])

            if doc is None:
                raise Exception('lxml模块也无法解析此XML')

        except Exception as e:
            raise c_worker_exception('lxml解析XML失败', data_dict['url'], str(e))

    # get type of the feed
    if doc.tag == 'rss' and doc.get('version', '') == '1.0':
        feedtype = 0
    elif doc.tag == 'rss' and doc.get('version', '') == '2.0':
        feedtype = 1
    elif doc.tag == 'feed':
        feedtype = 2
    else:
        raise c_worker_exception('无法识别XML的feed类型', data_dict['url'], '')

    # get feed author
    if 'use_feed_author' in data_dict:
        f_author = de_html_char(doc.findtext(tagnames['f_author'][feedtype]))

    item_iter = doc.findall(tagnames['f_items'][feedtype])
    if item_iter is None:
        return []

    ret = []
    for item in item_iter:
        # ------- info -------
        one = c_info()

        # title
        one.title = de_html_char(item.findtext(tagnames['title'][feedtype]))

        # url
        if feedtype < 2:
            url = de_html_char(item.findtext(tagnames['url'][feedtype]))
        else:
            url = ''
            link_iter = item.findall('link')
            if link_iter is not None:
                for tag_link in link_iter:
                    if tag_link.get('rel') == 'alternate':
                        if tag_link.get('type') == 'text/html':
                            url = tag_link.get('href')
                            break
                        url = url or tag_link.get('href')
        one.url = url

        # author, summary, pub_date
        if 'use_feed_author' in data_dict:
            one.author = de_html_char(
                item.findtext(tagnames['author'][feedtype], f_author))
        one.summary = de_html_char(item.findtext(
            tagnames['summary'][feedtype]))
        one.pub_date = de_html_char(
            item.findtext(tagnames['pub_date'][feedtype]))

        # suid
        guid = item.findtext(tagnames['guid'][feedtype])
        one.suid = guid or one.url or one.title

        ret.append(one)

    return ret
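
parse_xml reads every tag path from the tagnames table, which is defined elsewhere. Its shape is clear from the lookups above (one list per logical field, indexed by feedtype), but the concrete tag paths below are an assumption for illustration only:

# Hypothetical sketch of tagnames; the real paths may differ per feed dialect.
tagnames = {
    # index 0: rss 1.0, index 1: rss 2.0, index 2: atom
    'f_items':  ['channel/item',   'channel/item',   'entry'],
    'f_author': ['channel/author', 'channel/author', 'author/name'],
    'title':    ['title',          'title',          'title'],
    'url':      ['link',           'link',           'link'],
    'author':   ['author',         'author',         'author/name'],
    'summary':  ['description',    'description',    'summary'],
    'pub_date': ['pubDate',        'pubDate',        'updated'],
    'guid':     ['guid',           'guid',           'id'],
}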
Example #5
    def fetch_bytes_encoding(self, url):
        '''return (byte data, encoding)'''
        def get_encoding(r):
            contenttype = r.getheader('Content-Type', '')
            if not contenttype:
                return ''

            matcher = re_contenttype.search(contenttype)
            if matcher:
                return Fetcher.lookup_encoding(matcher.group(1))
            else:
                return ''

        # -------------- main body --------------

        # build the Request object
        req = urllib.request.Request(url)
        req.add_header('User-Agent', self.info.ua)

        e = None
        # retry loop
        for i in range(self.info.retry_count):
            try:
                # r is an HTTPResponse object
                r = self.opener.open(req,
                                     timeout=self.info.open_timeout
                                     )
                ret_data = r.read()
                encoding = get_encoding(r)

                # decompress
                contentenc = r.getheader('Content-Encoding', '')
                if contentenc:
                    contentenc = contentenc.lower()
                    if 'gzip' in contentenc:
                        ret_data = gzip.decompress(ret_data)
                    elif 'deflate' in contentenc:
                        try:
                            # first try: zlib
                            ret_data = zlib.decompress(ret_data, 15)
                        except Exception:
                            # second try: raw deflate
                            ret_data = zlib.decompress(ret_data, -15)

                # get encoding from bytes content
                if not encoding:
                    matcher = re_meta.search(ret_data)
                    if matcher:
                        try:
                            extract = matcher.group(1).decode('ascii')
                        except Exception:
                            encoding = ''
                        else:
                            encoding = Fetcher.lookup_encoding(extract)

                return ret_data, encoding

            except Exception as ee:
                e = ee

            if i < self.info.retry_count - 1:
                time.sleep(self.info.retry_interval)
        else:  # runs only when the loop exhausts all retries without returning
            print('fetcher:异常,下载%s失败' % url, '\n异常信息:', e)

            s = '%s (下载%s失败,重试了%d次,连接超时限制%d秒)' % \
                (str(e), url, self.info.retry_count, self.info.open_timeout)
            raise c_worker_exception('下载url失败', url, s)
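
fetch_bytes_encoding depends on two module-level patterns defined elsewhere in the project. Rough sketches of what they presumably look like, with the exact expressions being an assumption: re_contenttype pulls the charset out of the Content-Type header string, and re_meta is a bytes pattern that looks for a charset declaration in the raw HTML.

# Hypothetical sketches of the module-level patterns used above.
import re

# 'text/html; charset=utf-8' -> 'utf-8' (group 1)
re_contenttype = re.compile(r'charset\s*=\s*["\']?([\w-]+)', re.IGNORECASE)

# bytes pattern, so it can be searched against the undecoded response body
re_meta = re.compile(rb'<meta[^>]+charset\s*=\s*["\']?([\w-]+)', re.IGNORECASE)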
Example #6
def parse_html(data_dict, base_url, html):
    if not html:
        raise c_worker_exception('html为空字符串', data_dict['url'], '')

    r = red.d(r'^\s*$')
    if r.match(html) is not None:
        raise c_worker_exception('html只有空白', data_dict['url'], '')

    # extract json string
    prog = red.d(data_dict['re_pattern'], data_dict['re_flags'])
    if prog is None:
        raise c_worker_exception('正则表达式编译失败', '', '用于提取json字符串的正则表达式编译失败')

    m = prog.search(html)
    if m is None:
        raise c_worker_exception('无法用re(正则表达式)提取json字符', data_dict['url'], '')
    json_str = m.group(1)

    # replace
    if 'repl' in data_dict:
        r = red.d(data_dict['repl_pattern'], data_dict['repl_flags'])
        if r is None:
            raise c_worker_exception('replace正则表达式编译失败')

        json_str = r.sub(data_dict['repl'], json_str)

    # parse json
    try:
        json_obj = json.loads(json_str)
    except Exception as e:
        raise c_worker_exception('解析json时出错', data_dict['url'], str(e))

    # blocks
    json_lst = data_dict['blocks_list']
    ret = list()

    for i, block in enumerate(json_lst):

        # walk the block path down into the json object
        path = block[0]
        block_j = json_obj

        # path is always a tuple
        for ii, path_item in enumerate(path):
            try:
                block_j = block_j[path_item]
            except Exception:
                s = '第%d个block, block_path的第%d个路径元素%s无效'
                raise c_worker_exception(
                    s % (i + 1, ii + 1, str(path_item)), data_dict['url'],
                    'path:%s 可能是网站改变了json的设计结构' % str(path))

        # extract
        if isinstance(block_j, list):
            pass
        elif isinstance(block_j, dict):
            block_j = block_j.values()
        else:
            s = '第%d个block, block_path找到的不是列表或字典'
            raise c_worker_exception(s % (i + 1), data_dict['url'],
                                     'path:%s 可能是网站改变了json的设计结构' % str(path))

        for block_item_j in block_j:
            info = c_info()

            for key, sub_path in block[1].items():

                temp_jj = block_item_j
                for sub_path_item in sub_path:
                    try:
                        temp_jj = temp_jj[sub_path_item]
                    except Exception as e:
                        print('异常:', e)
                        s1 = '处理第%d个block的映射时异常' % (i + 1)
                        s2 = 'path:%s,key:%s,map:%s,无法找到指定元素%s.' % \
                             (str(path), key, str(sub_path),
                              str(sub_path_item))
                        raise c_worker_exception(s1, '', s2)
                ss = item_process(temp_jj)

                if key == 'title':
                    info.title = ss
                elif key == 'url':
                    info.url = ss
                elif key == 'summary':
                    info.summary = ss
                elif key == 'author':
                    info.author = ss
                elif key == 'pub_date':
                    info.pub_date = ss
                elif key == 'urljoin':
                    info.url = urljoin(base_url, ss)
                elif key == 'suid':
                    info.suid = ss
                elif key == 'temp':
                    info.temp = ss
                else:
                    print('无法处理map_rule', key, sub_path)

                if not info.suid:
                    info.suid = info.url

            ret.append(info)

    return ret
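
The data_dict that drives this JSON variant is built elsewhere; the entry below is purely illustrative, and every key name, pattern, and path in it is hypothetical:

# Hypothetical configuration for the JSON-based parse_html above.
data_dict_example = {
    'url': 'https://example.com/list',
    're_pattern': r'window\.__DATA__\s*=\s*(\{.*?\})\s*;',  # group 1 is the json text
    're_flags': red.DOTALL,
    'blocks_list': [
        (
            ('data', 'articles'),               # block path into the parsed json
            {
                'title':    ('title',),         # sub-path inside each article
                'urljoin':  ('relative_url',),  # joined against base_url
                'pub_date': ('published',),
            },
        ),
    ],
}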
Example #7
def parse_html(data_dict, base_url, html):
    if not html:
        raise c_worker_exception('html为空字符串', data_dict['url'], '')

    r = red.d(r'^\s*$')
    if r.match(html) is not None:
        raise c_worker_exception('html只有空白', data_dict['url'], '')

    re_lst = data_dict['blocks_list']
    ret = list()

    for i, block in enumerate(re_lst):

        # block re
        block_prog = red.d(block[0][0], block[0][1])
        if block_prog is None:
            pattern_error(i)

        itr = block_prog.finditer(html)
        matches = list(itr)
        if len(matches) != 1:
            s = '第%d个block的block_re找到的结果为%d,应为1' % \
                (i + 1, len(matches))
            raise c_worker_exception(s, '',
                                     '可能是网页改版、服务器显示错误信息')
        subhtml = matches[0].group(1)

        # item re
        item_prog = red.d(block[1][0], block[1][1])
        if item_prog is None:
            pattern_error(i, False)

        itr = item_prog.finditer(subhtml)
        matches = list(itr)
        if not matches:
            s = '第%d个block的item_re找到的结果为0,应大于0' % (i + 1)
            raise c_worker_exception(s, '', '可能是网页改版')

        for m in matches:
            info = c_info()

            for k, v in block[2].items():
                try:
                    ss = map_attrs(m, v)
                except Exception as e:
                    s1 = '处理第%d个block的map_rule时异常' % (i + 1)
                    s2 = '赋值%s给%s时出错,%s' % (str(v), str(k), str(e))
                    raise c_worker_exception(s1, '', s2)

                if k == 'title':
                    info.title = ss
                elif k == 'url':
                    info.url = ss
                elif k == 'urljoin':
                    info.url = urljoin(base_url, ss)
                elif k == 'summary':
                    info.summary = ss
                elif k == 'author':
                    info.author = ss
                elif k == 'pub_date':
                    info.pub_date = ss
                elif k == 'suid':
                    info.suid = ss
                elif k == 'temp':
                    info.temp = ss
                else:
                    print('无法处理map_rule', k, v)

                if not info.suid:
                    info.suid = info.url

            ret.append(info)

    return ret
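
The regex variant expects a different blocks_list layout: each block is (block_re spec, item_re spec, map_rules). The entry below is purely illustrative; the patterns are hypothetical, and the map-rule values are assumed here to be match-group indices consumed by map_attrs:

# Hypothetical configuration for the regex-based parse_html above.
data_dict_example = {
    'url': 'https://example.com/news',
    'blocks_list': [
        (
            (r'<ul class="news">(.*?)</ul>', red.DOTALL),         # block_re: must match exactly once
            (r'<a href="([^"]+)"[^>]*>([^<]+)</a>', red.DOTALL),  # item_re: one match per item
            {
                'urljoin': 1,  # group 1 -> info.url (joined with base_url)
                'title': 2,    # group 2 -> info.title
            },
        ),
    ],
}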