def fetch_html(self, url, encoding='', errors='strict'):
    if not errors:
        errors = 'strict'
    bytes_data, bytes_encoding = self.fetch_bytes_encoding(url)
    # get encoding
    if not encoding:  # encoding reported by the server
        encoding = bytes_encoding
    if not encoding:  # chardet detection
        encoding = Fetcher.d(url, bytes_data)
    if not encoding:  # default: utf-8
        print('fetcher:打开%s时,使用utf-8作默认' % url)
        encoding = 'utf-8'
    # decode
    try:
        return bytes_data.decode(encoding, errors)
    except Exception:
        print('下载器<解文本编码>失败')
        s = '字节长度:%d.使用编码:%s.网址:%s' % \
            (len(bytes_data), encoding, url)
        raise c_worker_exception('解文本编码(decode)时出现异常', url, s)
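# A minimal, self-contained sketch of the decode-with-fallback step used by
# fetch_html above. The function name and error type here are illustrative
# only, not part of this project's API.
def _decode_with_report(bytes_data, encoding='utf-8', errors='strict'):
    '''Decode bytes_data, raising a descriptive error on failure.'''
    try:
        return bytes_data.decode(encoding, errors)
    except (UnicodeDecodeError, LookupError) as e:
        raise ValueError('decode failed: %d bytes, encoding %s (%s)'
                         % (len(bytes_data), encoding, e))

# _decode_with_report('你好'.encode('gbk'), 'gbk')      -> '你好'
# _decode_with_report(b'\xff\xfe', 'utf-8', 'replace')  -> '\ufffd\ufffd'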
def pattern_error(blocknum, isblock=True):
    if isblock:
        s = '第%d个block的blockre编译失败' % (blocknum + 1)
    else:
        s = '第%d个block的itemre编译失败' % (blocknum + 1)
    raise c_worker_exception('正则表达式编译失败', '', s)
def do_process(data_dict, worker_dict):
    package = data_dict['package']
    url = 'https://pypi.python.org/pypi/%s/' % package
    f = Fetcher()
    string = f.fetch_html(url)
    if not string:
        raise c_worker_exception('无法下载url', url)
    # single page
    single_re = (r'<span class="breadcrumb-separator">.*?'
                 r'<span class="breadcrumb-separator">.*?'
                 r'<a href="(/pypi/([^/]+)/([^"]+))">.*?'
                 r'class="odd".*?'
                 r'<td>(\d{4}-\d{1,2}-\d{1,2})</td>'
                 )
    prog = red.d(single_re, red.DOTALL)
    m = prog.search(string)
    if m:
        info = c_info()
        info.title = m.group(2) + ' ' + m.group(3)
        info.url = url
        info.pub_date = m.group(4)
        info.suid = info.title
        return [info]
    # table page
    table_re = (r'<tr class="(?:odd|even)">.*?'
                r'<a href="(/pypi/([^/]+)/([^"]+))">'
                )
    prog = red.d(table_re, red.DOTALL)
    miter = prog.finditer(string)
    lst = list()
    for m in miter:
        info = c_info()
        info.title = m.group(2) + ' ' + m.group(3)
        info.url = 'https://pypi.python.org' + m.group(1)
        info.suid = info.title
        lst.append(info)
    return lst
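# A self-contained sketch of the table_re idea in do_process above: run
# finditer over the row markup and build one (title, url) record per match.
# The HTML snippet and function name are illustrative only.
import re

def _demo_table_scrape():
    sample = ('<tr class="odd"><a href="/pypi/foo/1.0">foo 1.0</a></tr>'
              '<tr class="even"><a href="/pypi/bar/2.3">bar 2.3</a></tr>')
    prog = re.compile(r'<tr class="(?:odd|even)">.*?'
                      r'<a href="(/pypi/([^/]+)/([^"]+))">', re.DOTALL)
    return [(m.group(2) + ' ' + m.group(3),
             'https://pypi.python.org' + m.group(1))
            for m in prog.finditer(sample)]

# _demo_table_scrape() -> [('foo 1.0', 'https://pypi.python.org/pypi/foo/1.0'),
#                          ('bar 2.3', 'https://pypi.python.org/pypi/bar/2.3')]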
def parse_xml(data_dict, xml):
    if not xml:
        raise c_worker_exception('xml为空字符串', data_dict['url'], '')
    r = red.d(r'^\s*$')
    if r.match(xml) is not None:
        raise c_worker_exception('xml只有空白', data_dict['url'], '')
    # remove namespace attributes from atom's <feed> tag
    xml = red.sub(r'''<feed\s+(?:"[^"]*"|'[^']*'|[^"'>])*>''',
                  r'<feed>', xml, count=1,
                  flags=red.IGNORECASE | red.A)
    # first try: standard ElementTree
    try:
        doc = ET.fromstring(xml)
    except Exception as e:
        doc = None
        if not etree:
            raise c_worker_exception('解析XML失败,可以尝试安装lxml模块',
                                     data_dict['url'], str(e))
    # second try: lxml in recovery mode
    if doc is None:
        try:
            parser = etree.XMLParser(recover=True, encoding='utf-8')
            doc = etree.fromstring(xml.encode('utf-8'), parser=parser)
            print('使用lxml解析%s' % data_dict['url'])
            if doc is None:
                raise Exception('lxml模块也无法解析此XML')
        except Exception as e:
            raise c_worker_exception('lxml解析XML失败',
                                     data_dict['url'], str(e))
    # get type of the feed
    if doc.tag == 'rss' and doc.get('version', '') == '1.0':
        feedtype = 0
    elif doc.tag == 'rss' and doc.get('version', '') == '2.0':
        feedtype = 1
    elif doc.tag == 'feed':
        feedtype = 2
    else:
        raise c_worker_exception('无法识别XML的feed类型',
                                 data_dict['url'], '')
    # get feed author
    if 'use_feed_author' in data_dict:
        f_author = de_html_char(doc.findtext(tagnames['f_author'][feedtype]))
    item_iter = doc.findall(tagnames['f_items'][feedtype])
    if item_iter is None:
        return []
    ret = []
    for item in item_iter:
        # ------- info -------
        one = c_info()
        # title
        one.title = de_html_char(item.findtext(tagnames['title'][feedtype]))
        # url
        if feedtype < 2:
            url = de_html_char(item.findtext(tagnames['url'][feedtype]))
        else:
            url = ''
            link_iter = item.findall('link')
            if link_iter is not None:
                for tag_link in link_iter:
                    if (tag_link.get('rel') == 'alternate'
                            and tag_link.get('type') == 'text/html'):
                        url = tag_link.get('href')
                        break
                    url = url or tag_link.get('href')
        one.url = url
        # author, summary, pub_date
        if 'use_feed_author' in data_dict:
            one.author = de_html_char(
                item.findtext(tagnames['author'][feedtype], f_author))
        one.summary = de_html_char(item.findtext(
            tagnames['summary'][feedtype]))
        one.pub_date = de_html_char(
            item.findtext(tagnames['pub_date'][feedtype]))
        # suid
        guid = item.findtext(tagnames['guid'][feedtype])
        one.suid = guid or one.url or one.title
        ret.append(one)
    return ret
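# A tiny, self-contained illustration of the findtext() pattern parse_xml
# relies on (the RSS 2.0 case, feedtype == 1). The tag paths below mirror
# what the project's tagnames table would typically hold; the feed itself is
# made up for illustration.
import xml.etree.ElementTree as ET

def _demo_rss2_items():
    sample = ('<rss version="2.0"><channel><item>'
              '<title>Hello</title>'
              '<link>http://example.com/1</link>'
              '<guid>id-1</guid>'
              '</item></channel></rss>')
    doc = ET.fromstring(sample)
    items = []
    for item in doc.findall('channel/item'):
        title = item.findtext('title')                  # 'Hello'
        url = item.findtext('link')                     # 'http://example.com/1'
        suid = item.findtext('guid') or url or title    # same fallback as above
        items.append((title, url, suid))
    return items

# _demo_rss2_items() -> [('Hello', 'http://example.com/1', 'id-1')]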
def fetch_bytes_encoding(self, url):
    '''return (byte data, encoding)'''
    def get_encoding(r):
        contenttype = r.getheader('Content-Type', '')
        if not contenttype:
            return ''
        matcher = re_contenttype.search(contenttype)
        if matcher:
            return Fetcher.lookup_encoding(matcher.group(1))
        else:
            return ''
    # -------------- main body --------------
    # build the Request object
    req = urllib.request.Request(url)
    req.add_header('User-Agent', self.info.ua)
    e = None
    # retry loop
    for i in range(self.info.retry_count):
        try:
            # r is an HTTPResponse object
            r = self.opener.open(req, timeout=self.info.open_timeout)
            ret_data = r.read()
            encoding = get_encoding(r)
            # decompress
            contentenc = r.getheader('Content-Encoding', '')
            if contentenc:
                contentenc = contentenc.lower()
                if 'gzip' in contentenc:
                    ret_data = gzip.decompress(ret_data)
                elif 'deflate' in contentenc:
                    try:
                        # first try: zlib-wrapped deflate
                        ret_data = zlib.decompress(ret_data, 15)
                    except zlib.error:
                        # second try: raw deflate
                        ret_data = zlib.decompress(ret_data, -15)
            # get encoding from the bytes content (<meta> tag)
            if not encoding:
                matcher = re_meta.search(ret_data)
                if matcher:
                    try:
                        extract = matcher.group(1).decode('ascii')
                    except UnicodeDecodeError:
                        encoding = ''
                    else:
                        encoding = Fetcher.lookup_encoding(extract)
            return ret_data, encoding
        except Exception as ee:
            e = ee
            if i < self.info.retry_count - 1:
                time.sleep(self.info.retry_interval)
            else:
                print('fetcher:异常,下载%s失败' % url, '\n异常信息:', e)
                s = '%s (下载%s失败,重试了%d次,连接超时限制%d秒)' % \
                    (str(e), url, self.info.retry_count,
                     self.info.open_timeout)
                raise c_worker_exception('下载url失败', url, s)
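# A self-contained sketch of the Content-Encoding handling above: servers
# that claim "deflate" may send either a zlib-wrapped stream or a raw deflate
# stream, so a second attempt with wbits=-15 is used as a fallback. The
# function name is illustrative only.
import gzip
import zlib

def _decompress_body(data, content_encoding):
    enc = (content_encoding or '').lower()
    if 'gzip' in enc:
        return gzip.decompress(data)
    if 'deflate' in enc:
        try:
            return zlib.decompress(data, 15)    # zlib-wrapped deflate
        except zlib.error:
            return zlib.decompress(data, -15)   # raw deflate
    return data

# _decompress_body(gzip.compress(b'hi'), 'gzip')    -> b'hi'
# _decompress_body(zlib.compress(b'hi'), 'deflate') -> b'hi'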
def parse_html(data_dict, base_url, html):
    if not html:
        raise c_worker_exception('html为空字符串', data_dict['url'], '')
    r = red.d(r'^\s*$')
    if r.match(html) is not None:
        raise c_worker_exception('html只有空白', data_dict['url'], '')
    # extract the json string
    re = red.d(data_dict['re_pattern'], data_dict['re_flags'])
    if re is None:
        raise c_worker_exception('正则表达式编译失败', '',
                                 '用于提取json字符串的正则表达式编译失败')
    m = re.search(html)
    if m is None:
        raise c_worker_exception('无法用re(正则表达式)提取json字符',
                                 data_dict['url'], '')
    json_str = m.group(1)
    # replace
    if 'repl' in data_dict:
        r = red.d(data_dict['repl_pattern'], data_dict['repl_flags'])
        if r is None:
            raise c_worker_exception('replace正则表达式编译失败')
        json_str = r.sub(data_dict['repl'], json_str)
    # parse json
    try:
        json_obj = json.loads(json_str)
    except Exception as e:
        raise c_worker_exception('解析json时出错', data_dict['url'], str(e))
    # blocks
    json_lst = data_dict['blocks_list']
    ret = list()
    for i, block in enumerate(json_lst):
        # walk the block path (path is always a tuple)
        path = block[0]
        block_j = json_obj
        for ii, path_item in enumerate(path):
            try:
                block_j = block_j[path_item]
            except Exception:
                s = '第%d个block, block_path的第%d个路径元素%s无效'
                raise c_worker_exception(
                    s % (i + 1, ii + 1, str(path_item)),
                    data_dict['url'],
                    'path:%s 可能是网站改变了json的设计结构' % str(path))
        # extract: the block must be a list or a dict of items
        if isinstance(block_j, list):
            pass
        elif isinstance(block_j, dict):
            block_j = block_j.values()
        else:
            s = '第%d个block, block_path找到的不是列表或字典'
            raise c_worker_exception(
                s % (i + 1), data_dict['url'],
                'path:%s 可能是网站改变了json的设计结构' % str(path))
        for block_item_j in block_j:
            info = c_info()
            for key, sub_path in block[1].items():
                temp_jj = block_item_j
                for sub_path_item in sub_path:
                    try:
                        temp_jj = temp_jj[sub_path_item]
                    except Exception as e:
                        print('异常:', e)
                        s1 = '处理第%d个block的映射时异常' % (i + 1)
                        s2 = 'path:%s,key:%s,map:%s,无法找到指定元素%s.' % \
                            (str(path), key, str(sub_path),
                             str(sub_path_item))
                        raise c_worker_exception(s1, '', s2)
                ss = item_process(temp_jj)
                if key == 'title':
                    info.title = ss
                elif key == 'url':
                    info.url = ss
                elif key == 'summary':
                    info.summary = ss
                elif key == 'author':
                    info.author = ss
                elif key == 'pub_date':
                    info.pub_date = ss
                elif key == 'urljoin':
                    info.url = urljoin(base_url, ss)
                elif key == 'suid':
                    info.suid = ss
                elif key == 'temp':
                    info.temp = ss
                else:
                    print('无法处理map_rule', key, sub_path)
            if not info.suid:
                info.suid = info.url
            ret.append(info)
    return ret
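# A self-contained sketch of the block_path walk used by parse_html above:
# follow a tuple of keys/indices into a decoded JSON object and report which
# path element failed. The sample JSON and helper name are illustrative only.
import json

def _walk_path(obj, path):
    for ii, path_item in enumerate(path):
        try:
            obj = obj[path_item]
        except (KeyError, IndexError, TypeError):
            raise ValueError('path element %d (%r) not found, path=%r'
                             % (ii + 1, path_item, path))
    return obj

_doc = json.loads('{"data": {"items": [{"title": "a"}, {"title": "b"}]}}')
# _walk_path(_doc, ('data', 'items'))              -> list of two item dicts
# _walk_path(_doc, ('data', 'items', 0, 'title'))  -> 'a'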
def parse_html(data_dict, base_url, html):
    if not html:
        raise c_worker_exception('html为空字符串', data_dict['url'], '')
    r = red.d(r'^\s*$')
    if r.match(html) is not None:
        raise c_worker_exception('html只有空白', data_dict['url'], '')
    re_lst = data_dict['blocks_list']
    ret = list()
    for i, block in enumerate(re_lst):
        # block re
        block_prog = red.d(block[0][0], block[0][1])
        if block_prog is None:
            pattern_error(i)
        itr = block_prog.finditer(html)
        matches = list(itr)
        if len(matches) != 1:
            s = '第%d个block的block_re找到的结果为%d,应为1' % \
                (i + 1, len(matches))
            raise c_worker_exception(s, '', '可能是网页改版、服务器显示错误信息')
        subhtml = matches[0].group(1)
        # item re
        item_prog = red.d(block[1][0], block[1][1])
        if item_prog is None:
            pattern_error(i, False)
        itr = item_prog.finditer(subhtml)
        matches = list(itr)
        if not matches:
            s = '第%d个block的item_re找到的结果为0,应大于0' % (i + 1)
            raise c_worker_exception(s, '', '可能是网页改版')
        for m in matches:
            info = c_info()
            for k, v in block[2].items():
                try:
                    ss = map_attrs(m, v)
                except Exception as e:
                    s1 = '处理第%d个block的map_rule时异常' % (i + 1)
                    s2 = '赋值%s给%s时出错,%s' % (str(v), str(k), str(e))
                    raise c_worker_exception(s1, '', s2)
                if k == 'title':
                    info.title = ss
                elif k == 'url':
                    info.url = ss
                elif k == 'urljoin':
                    info.url = urljoin(base_url, ss)
                elif k == 'summary':
                    info.summary = ss
                elif k == 'author':
                    info.author = ss
                elif k == 'pub_date':
                    info.pub_date = ss
                elif k == 'suid':
                    info.suid = ss
                elif k == 'temp':
                    info.temp = ss
                else:
                    print('无法处理map_rule', k, v)
            if not info.suid:
                info.suid = info.url
            ret.append(info)
    return ret
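# A self-contained sketch of the two-stage block_re / item_re extraction
# above: the block regex isolates exactly one region of the page, then the
# item regex runs only inside that region. The markup and patterns here are
# made up for illustration.
import re

def _demo_block_item():
    html = ('<ul id="news"><li><a href="/a">A</a></li>'
            '<li><a href="/b">B</a></li></ul>'
            '<ul id="other"></ul>')
    block_prog = re.compile(r'<ul id="news">(.*?)</ul>', re.DOTALL)
    item_prog = re.compile(r'<a href="([^"]+)">([^<]+)</a>')
    blocks = list(block_prog.finditer(html))
    assert len(blocks) == 1           # same "exactly one block" check as above
    subhtml = blocks[0].group(1)
    return [(m.group(2), m.group(1)) for m in item_prog.finditer(subhtml)]

# _demo_block_item() -> [('A', '/a'), ('B', '/b')]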