def parseDomVideo(self, base, url):
    try:
        soup = self.fetchUrlWithBase(base + url, header)
        div = soup.first("div", {'class': 'players'})
        if div != None:
            script = div.first('script')
            if script != None:
                text = unquote(script.text.replace("\"", "").replace("\/", "/"))
                texts = text.split(",")
                for item in texts:
                    match = regVideo.search(item)
                    if match != None:
                        videoUrl = match.group(1)
                        return "%s%s%s" % ("http", videoUrl, 'm3u8')
        print 'mp4 not found'
        return None
    except Exception as e:
        print common.format_exception(e)
        return None

def fetchUrl(self, url):
    count = 0
    while count < maxCount:
        try:
            header = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
                "Referer": url
            }
            req = urllib2.Request(url, headers=header)
            content = urllib2.urlopen(req, timeout=3000).read()
            soup = BeautifulSoup(str(content))
            return soup
        except Exception as e:
            print common.format_exception(e)
            print 'failed to open page, retrying', url, 'attempt', count
            count = count + 1
    print 'page still failing after 3 retries', url
    return BeautifulSoup('')

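# These parser snippets come from several sibling modules and lean on shared
# module-level names (maxCount, header, regVideo, baseurl, ...) defined in
# each file. A minimal sketch of plausible definitions follows; the concrete
# values, and the regVideo pattern in particular, are assumptions rather than
# the repo's originals:
import re
import json
import zlib
import datetime
import urllib2
from urllib import unquote
from BeautifulSoup import BeautifulSoup  # BeautifulSoup 3, which provides soup.first()

maxCount = 3  # retry limit shared by the fetch helpers
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36'
}
# The parsers rebuild links as "http" + match.group(1) + "m3u8", so regVideo
# presumably captures whatever sits between those two markers:
regVideo = re.compile(r'http(.*?)m3u8')
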
def fetchImgItemsData(self, url, channel):
    try:
        trs = self.fetchDataHead(url)
        print url, ";itemsLen=", len(trs)
        objs = []
        sortType = dateutil.y_m_d()
        for item in trs:
            ahrefs = item.findAll("a")
            if ahrefs == None:
                continue
            for ahref in ahrefs:
                match = img_channel_title.search(ahref.text)
                if match == None:
                    continue
                obj = {}
                match = img_channel_date.search(ahref.text)
                if match != None:
                    obj['fileDate'] = match.group(0)
                else:
                    obj['fileDate'] = ''
                name = ahref.text.replace(obj['fileDate'], '')
                obj['name'] = name
                obj['url'] = ahref.get('href')
                obj['baseurl'] = baseurl
                obj['channel'] = channel
                obj['updateTime'] = datetime.datetime.now()
                pics = self.fetchImgs(ahref.get('href'))
                if len(pics) == 0:
                    print 'no image files--', ahref, '---', url
                    continue
                obj['picList'] = pics
                obj['showType'] = 3
                obj['pics'] = len(pics)
                obj['sortType'] = sortType
                print 'url=', obj['url'], 'filedate=', obj['fileDate'], ' image count=', len(pics)
                objs.append(obj)
        return objs
    except Exception as e:
        print common.format_exception(e)

def parseDomVideo(self, url):
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
        "Referer": url
    }
    try:
        match = regVideoid.search(url)
        if match != None:
            id = match.group(1)
            vurl = "/vod/%s/play.htm?%s-0-1" % (id, id)
            soup = self.fetchUrl(vurl)
            play_video = soup.first('div', {'class': 'body mab5 pat5'})
            if play_video != None:
                script = play_video.first('script')
                if script != None:
                    content = unquote(str(script.text))
                    match = regVideo.search(content)
                    if match != None:
                        obj = json.loads(match.group(1))
                        data = obj.get('Data', [])
                        urlData = []
                        for item in data:
                            itemData = item.get('playurls', [])
                            for itemUrl in itemData:
                                for itemurlOne in itemUrl:
                                    if itemurlOne.count('http') > 0:
                                        urlData.append(itemurlOne)
                        # prefer m3u8 streams, then /share/ links, then anything else
                        for item in urlData:
                            if item.count('m3u8'):
                                return item
                        for item in urlData:
                            if item.count('/share/'):
                                return item
                        if len(urlData) > 0:
                            return urlData[0]
        print 'mp4 not found'
        return None
    except Exception as e:
        print common.format_exception(e)
        return None

def run(self):
    try:
        dbVPN = db.DbVPN()
        ops = db_ops.DbOps(dbVPN)
        sortType = dateutil.y_m_d()
        # sortType = "2017-07-12"
        for i in range(0, 20000):
            # ret = ops.getTextChannelItems(self.t_item["url"], i)
            ret = ops.getTextChannelItemsById(i, sortType)
            if len(ret) == 0:
                print 'writing finished'
                break
            print 'start writing channel :', self.t_item["url"],
            for item in ret:
                # path = filePATH + str(item['id']) + ".txt"
                # if os.path.exists(path) == False:
                #     output = open(path, 'w')
                #     output.write(item['file'])
                #     output.close()
                #     print 'wrote file:' + path
                # path = filePATHWeb + str(item['id']) + ".txt"
                # if os.path.exists(path) == False:
                #     output = open(path, 'w')
                #     output.write(html_parse.filter_tags(item['file']))
                #     output.close()
                #     print 'wrote file:' + path
                path = filePATHHtml + str(item['id']) + ".html"
                # if os.path.exists(path) == False:
                output = open(path, 'w')
                output.write(html_parse.txtToHtml(html_parse.filter_tags(item['file'])))
                output.close()
                print 'wrote file:' + path
            print 'finished page', i
        print 'channel :', self.t_item["url"], 'sync finished len=', len(ret)
        dbVPN.close()
    except Exception as e:
        print common.format_exception(e)

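# run() assumes the output directory filePATHHtml (a module-level path set
# elsewhere in this repo) already exists. A small guard like this sketch,
# placed at module import time, avoids an IOError on a fresh checkout:
import os

if os.path.exists(filePATHHtml) == False:
    os.makedirs(filePATHHtml)
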
def parseDomVideo(self, url):
    try:
        soup = self.fetchUrl(url)
        divs = soup.findAll("ul", {'class': 'play-list play-list-long'})
        for div in divs:
            ahref = div.first("a")
            if ahref == None:
                continue
            soup = self.fetchContentUrl(ahref.get("href"))
            # scripts = soup.findAll('script')
            # scripts.reverse()
            # for script in scripts:
            #     text = unquote(script.text.replace("\"", "").replace("\/", "/"))
            #     print text
            match = regVideo.search(soup)
            if match != None:
                videoUrl = match.group(1)
                return "%s%s%s" % ("http", videoUrl, 'm3u8')
        print 'mp4 not found'
        return None
    except Exception as e:
        print common.format_exception(e)
        return None

def parseDomVideo(self, url):
    try:
        soup = self.fetchUrl(url)
        iframe = soup.first("iframe")
        if iframe != None:
            ahref = iframe.get("src")
            if ahref != None:
                soup = self.fetchUrl(ahref)
                scripts = soup.findAll("script")
                for script in scripts:
                    if script.text != None:
                        content = unquote(str(script.text))
                        match = regVideo.search(content)
                        if match != None:
                            return "http" + match.group(1) + 'm3u8'
        print 'mp4 not found'
        return None
    except Exception as e:
        print common.format_exception(e)
        return None

def parseDomVideo(self, url): try: soup = self.fetchUrl(url) div = soup.first("div", {"class": "vodplaybox"}) if div != None: aherfs = div.findAll("a") if len(aherfs) > 0: aherf = aherfs[len(aherfs) - 1] if aherf != None: content = self.fetchContentUrlWithBase( aherf.get('href')) content = unquote(str(content)) match = regVideo.search(content) if match != None: return 'http' + match.group(1) print '没找到mp4' return None except Exception as e: print common.format_exception(e) return None
def fetchUrl(self, url, aheader=h_headers):
    count = 0
    while count < maxCount:
        try:
            req = urllib2.Request(baseurl + url, headers=aheader)
            response = urllib2.urlopen(req, timeout=300)
            gzipped = response.headers.get('Content-Encoding')  # check whether the server gzip-compressed the response
            content = response.read()
            if gzipped:
                content = zlib.decompress(content, 16 + zlib.MAX_WBITS)  # decompress to get the page source
            content = content.decode('utf8', errors='replace')
            soup = BeautifulSoup(content)
            return soup
        except Exception as e:
            print common.format_exception(e)
            print 'failed to open page, retrying', baseurl + url, 'attempt', count
            count = count + 1
    print 'page still failing after 3 retries', url
    return BeautifulSoup('')

def parseDomVideo(self, url): try: soup = self.fetchUrl(url, header) div = soup.first('div',{"class":"stab_list"}) if div!=None: ahref = div.first('a') if ahref!=None: soup = self.fetchUrl(ahref.get('href'), header) player = soup.first('div',{"class":"player"}) if player!=None: content = unquote(str(player.text)).split("$") for item in content: match = regVideo.search(item) if match!=None: return "http"+match.group(1)+'.m3u8' print url,'没有mp4' return None except Exception as e: print common.format_exception(e) return None
def execute(self, query, args=None):
    try:
        if args != None and isinstance(args, list):
            typeutil.listReplace(args, None, -1)
            # guard: the %s placeholder count must match the argument count
            if query.count('%s') != 0 and query.count('%s') != len(args):
                print 'error: sql error[%s][%s]' % (query, args)
                raise ValueError('error: sql error[%s][%s]' % (query, args))
        if self.__level == True:
            if args != None:
                print '%s:[%s]' % (query, args)
            else:
                print query
        return self.cur.execute(query, args)
    except Exception as e:
        error = common.format_exception(e)
        if error.count("Duplicate entry") > 0:
            print query
            return None
        print error, query
        return None

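# A minimal standalone sketch of the placeholder guard that execute()
# enforces: the number of %s markers must match len(args). The query and
# args below are illustrative assumptions, not repo data.
query = "INSERT INTO t_text_item (name, url) VALUES (%s, %s)"
args = ['demo', '/book/1.html']
assert query.count('%s') == len(args)  # a mismatch makes execute() raise ValueError
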
def parseDomVideo(self, url): try: soup = self.fetchUrl(url, header) div = soup.first("iframe", {'name': 'iFrame1'}) if div != None: soup = self.fetchUrl(div.get("src")) scripts = soup.findAll("script") for script in scripts: text = unquote( script.text.replace("\"", "").replace("\/", "/")) texts = text.split(",") for item in texts: match = regVideo.search(item) if match != None: videoUrl = match.group(1) return "%s%s%s" % ("http", videoUrl, 'm3u8') print '没找到mp4' return None except Exception as e: print common.format_exception(e) return None
def parseDomVideo(self, url):
    try:
        soup = self.fetchUrl(baseurl2 + url, header)
        iframe = soup.first("iframe")
        if iframe != None:
            text = self.fetchContentUrl(iframe.get("src"), header)
            match = regVideoM3.search(text)
            if match != None:
                videoUrl = match.group(1)
                return "%s%s%s" % ("http", videoUrl, 'm3u8')
        else:
            video = soup.first("table", {"class": "plhin nthread_firstpost"})
            if video != None:
                match = regVideoM3.search(video.text)
                if match != None:
                    videoUrl = match.group(1)
                    return "%s%s%s" % ("http", videoUrl, 'm3u8')
        print 'mp4 not found'
        return None
    except Exception as e:
        print common.format_exception(e)
        return None

def parseDomVideo(self, url):
    try:
        match = videoId3.search(url)
        if match != None:
            videoId = match.group(1)
            videoUrlId = "/index.php?m=vod-play-id-%s-src-1-num-1.html" % (videoId)
            soup = self.fetchUrl(baseurl3 + videoUrlId, header3)
            DIV = soup.first("div", {"class": "dyplayer"})
            if DIV != None:
                text = unquote(str(DIV.text))
                texts = text.split(",")
                for item in texts:
                    match = regVideoM3.search(item)
                    if match != None:
                        videoUrl = match.group(1)
                        return "%s%s%s" % ("http", videoUrl, 'm3u8')
        print 'mp4 not found'
        return None
    except Exception as e:
        print common.format_exception(e)
        return None

def parseDomVideo(self, url):
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
        "Referer": url
    }
    try:
        soup = self.fetchUrl(url, header)
        adiv = soup.first("div", {"class": "film_bar clearfix"})
        if adiv != None:
            ahref = adiv.first("a")
            if ahref != None:
                soup = self.fetchUrl(ahref.get("href"), header)
                div = soup.first("div", {'class': 'player_l'})
                if div != None:
                    script = div.first('script')
                    if script != None:
                        text = unquote(str(script.text))
                        texts = text.split("$")
                        # try a direct m3u8 link first, then a /share/ style link
                        for item in texts:
                            match = regVideo.search(item)
                            if match != None:
                                videoUrl = match.group(1)
                                return "%s%s%s" % ("http", videoUrl, 'm3u8')
                        for item in texts:
                            match = shareVideo.search(item)
                            if match != None:
                                videoUrl = "%s%s%s%s" % ("http", match.group(1), "/share/", match.group(2))
                                return videoUrl
        print 'mp4 not found'
        return None
    except Exception as e:
        print common.format_exception(e)
        return None

def fetchTextData(self, url, channel):
    try:
        soup = self.fetchUrl(url)
        div = soup.first("div", {"class": "box list channel"})
        if div == None:
            print 'no data', url
            return []
        datalist = div.findAll("li")
        objs = []
        sortType = dateutil.y_m_d()
        for item in datalist:
            ahref = item.first("a")
            if ahref != None:
                try:
                    obj = {}
                    span = ahref.first('span')
                    if span != None:
                        obj['fileDate'] = span.text
                    else:
                        obj['fileDate'] = ''
                    name = ahref.text.replace(obj['fileDate'], '')
                    obj['name'] = name.replace("【完】", "")  # strip the "finished" marker from scraped titles
                    print name
                    obj['url'] = ahref.get('href')
                    obj['baseurl'] = baseurl
                    obj['channel'] = channel
                    obj['updateTime'] = datetime.datetime.now()
                    ret = self.fetchText(ahref.get('href'))
                    if ret == None:
                        print 'no article data', ahref.get('href')
                        continue
                    obj['sortType'] = sortType
                    objs.append(obj)
                except Exception as e:
                    print common.format_exception(e)
        return objs
    except Exception as e:
        print common.format_exception(e)

def parseDomVideo(self, url): try: soup = self.fetchUrl(url) div = soup.first("div",{"class":"details-con2-body"}) if div!=None: ahref = div.first("a") if ahref!=None: soup = self.fetchUrl(ahref.get("href")) player = soup.first("div",{"class":"player-box details-body"}) if player!=None: script = player.first("script") if script!=None: content = unquote(str(script.text)) match = regVideo.search(content) if match!=None: obj = json.loads(match.group(1)) data = obj.get('Data',[]) urlData = [] for item in data: itemData = item.get('playurls',[]) for itemUrl in itemData: for itemurlOne in itemUrl: if itemurlOne.count('http')>0: urlData.append(itemurlOne) for item in urlData: if item.count('m3u8'): return item for item in urlData: if item.count('/share/'): return item if len(urlData)>0: return urlData[0] print '没找到mp4' return None except Exception as e: print common.format_exception(e) return None
def parseDomVideo(self, url): try: soup = self.fetchUrl(url) iframe = soup.first("iframe") if iframe != None: v = iframe.get("src").replace(".", ".") match = video_iframe.search(v) if match != None: id = v.replace("https://baiduyunbo.com/?id=", "") return video_m3u8 % (id) else: soup = self.fetchUrlWithBase(v) scripts = soup.findAll("script") for script in scripts: match = video_mp4.search(script.text) if match != None: return "%s%s%s" % ("http", match.group(1), "mp4") print url, '没有找到mp4' return None except Exception as e: print common.format_exception(e) return None
def parseDomVideo(self, base, url):
    try:
        soup = self.fetchUrl(url, header)
        divs = soup.findAll("div")
        urls = []
        for div in divs:
            ahref = div.first("a")
            divTitle = div.first("div")
            if ahref != None and divTitle != None and ahref.get("rel") != None:
                h5 = div.first("h5")
                name = divTitle.text
                if h5 != None:
                    name = h5.text
                obj = {}
                obj['name'] = name
                obj['url'] = ahref.get("href")
                urls.append(obj)
        return urls
    except Exception as e:
        print common.format_exception(e)
        return None

def fetchUrl(self, url, aheader=header):
    count = 0
    while count < maxCount:
        try:
            req = urllib2.Request(
                baseurl + url,
                headers={
                    'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.2.13) Gecko/20101203 Firefox/3.6.13',
                    "Referer": baseurl
                })
            response = urllib2.urlopen(req, timeout=300)
            gzipped = response.headers.get('Content-Encoding')  # check whether the server gzip-compressed the response
            content = response.read()
            if gzipped:
                content = zlib.decompress(content, 16 + zlib.MAX_WBITS)  # decompress to get the page source
            soup = BeautifulSoup(content.decode('gbk', errors='replace'))
            return soup
        except Exception as e:
            print common.format_exception(e)
            print 'failed to open page, retrying', baseurl + url, 'attempt', count
            count = count + 1
    print 'page still failing after 3 retries', url
    return BeautifulSoup('')

def parseDomVideo(self, url):
    try:
        ID = url.replace(".html", "").replace("/klav-video/", "")
        vediourl = "/klav-play/%s-1-1.html" % (ID)
        soup = self.fetchUrl(baseurl7 + vediourl, header)
        div = soup.first("div", {"class": "pages"})
        if div != None:
            texts = div.text.split(";")
            for text in texts:
                # check for an m3u8 stream first, then fall back to mp4
                match = regVideoM3.search(text)
                if match != None:
                    videoUrl = match.group(1)
                    return "%s%s%s" % ("http", videoUrl, 'm3u8')
                match = regVideoMp4.search(text)
                if match != None:
                    videoUrl = match.group(1)
                    return "%s%s%s" % ("http", videoUrl, 'mp4')
        print 'mp4 not found'
        return None
    except Exception as e:
        print common.format_exception(e)
        return None

def fetchUrlWithBase(self, url, aheader=header):
    count = 0
    while count < maxCount:
        try:
            req = urllib2.Request(
                url,
                headers={
                    'Cookie': "td_cookie=18446744069599001696; UM_distinctid=16267a77486203-0a34f7eb9f837-454c092b-1fa400-16267a7748726f; CNZZDATA4033785=cnzz_eid%3D1967344694-1522153663-null%26ntime%3D1522153663; CNZZDATA1263493226=2025093065-1522155903-null%7C1522155903; PHPSESSID=cqppj1tg9v8tf27j95ogqogjs1; td_cookie=18446744069599206493; WSKY=6c172; jiathis_rdc=%7B%22http%3A//www.zxdy.cc/vod/22266.html%22%3A1739039602%2C%22http%3A//www.zxdy.cc/play/22266-0-1.html%22%3A1739044927%2C%22http%3A//www.zxdy.cc/Uploads/https%3A//tupian.tupianzy.com/pic/upload/vod/2018-03-03/201803031520062617.jpg%22%3A1739118415%2C%22http%3A//www.zxdy.cc/list/1-p-3-0.html%22%3A1739129605%2C%22http%3A//www.zxdy.cc/list/1-p-1-0.html%22%3A1739216767%2C%22http%3A//www.zxdy.cc/list/9-p-1-0.html%22%3A1739358031%2C%22http%3A//www.zxdy.cc/list/9-p-2-0.html%22%3A1739371664%2C%22http%3A//www.zxdy.cc/Uploads/https%3A//wx3.sinaimg.cn/mw690/005w5c6ogy1fjuo496v5uj30tu15ok3k.jpg%22%3A1739577535%2C%22http%3A//www.zxdy.cc/Uploads/https%3A//img.alicdn.com/imgextra/i4/2264228004/TB2UynHnQqvpuFjSZFhXXaOgXXa_%21%212264228004.jpg%22%3A1739585958%2C%22http%3A//www.zxdy.cc/%22%3A1739586271%2C%22http%3A//www.zxdy.cc/vod/5128.html%22%3A1739763188%2C%22http%3A//www.zxdy.cc/vod/1.html%22%3A1739772004%2C%22http%3A//www.zxdy.cc/play/1-0-1.html%22%3A1739777508%2C%22http%3A//www.zxdy.cc/vod/4063.html%22%3A1739811363%2C%22http%3A//www.zxdy.cc/play/4063-0-2.html%22%3A1739820736%2C%22http%3A//www.zxdy.cc/list/11-p-1-0.html%22%3A1739855919%2C%22http%3A//www.zxdy.cc/vod/22236.html%22%3A0%7C1522158279843%2C%22http%3A//www.zxdy.cc/play/22236-0-1.html%22%3A%220%7C1522158307282%22%7D",
                    'User-Agent': 'Mozilla/5.0 (compatible; Baiduspider-render/2.0; +http://www.baidu.com/search/spider.html, Windows; U; Windows NT 5.1; zh-CN; rv:1.9.2.13) Gecko/20101203 Firefox/3.6.13',
                    "Referer": url
                })
            response = urllib2.urlopen(req, timeout=300)
            gzipped = response.headers.get('Content-Encoding')  # check whether the server gzip-compressed the response
            content = response.read()
            if gzipped:
                content = zlib.decompress(content, -zlib.MAX_WBITS)  # raw deflate stream; decompress to get the page source
            content = content.decode('utf8', errors='replace')
            soup = BeautifulSoup(content)
            return soup
        except Exception as e:
            print common.format_exception(e)
            print 'failed to open page, retrying', url, 'attempt', count
            count = count + 1
    print 'page still failing after 3 retries', url
    return BeautifulSoup('')

def fetchUrl(self, url): count = 0 while count < maxCount: try: req = urllib2.Request( url, headers={ "Cookie": "zenid=b227c2098ac37d540e4579fb024e9ba9; __utma=62982011.325664695.1514618636.1514618636.1514618636.1; __utmc=62982011; __utmz=62982011.1514618636.1.1.utmcsr=seqing.one|utmccn=(referral)|utmcmd=referral|utmcct=/2059.html; __atuvc=7%7C52; __utmb=62982011.35.10.1514618636", "Upgrade-Insecure-Requests": "1", 'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.2.13) Gecko/20101203 Firefox/3.6.13', "Referer": "http://www.eroti-cart.com" }) req.encoding = 'utf-8' opener = urllib2.build_opener() opener.add_handler(SmartRedirectHandler()) urllib2.install_opener(opener) response = urllib2.urlopen(req, timeout=300) gzipped = response.headers.get( 'Content-Encoding') # 查看是否服务器是否支持gzip content = response.read().decode('utf8', errors='replace') if gzipped: content = zlib.decompress(content, 16 + zlib.MAX_WBITS) # 解压缩,得到网页源码 # cmd = ("wget %s" % (url)) # textlist = os.popen(cmd).readlines() soup = BeautifulSoup(content) return soup except Exception as e: print common.format_exception(e) print '打开页面错误,重试', url, '次数', count count = count + 1 print '打开页面错误,重试3次还是错误', url return BeautifulSoup('')
def fetchTextData(self, url, channel):
    try:
        soup = self.fetchUrl(baseurl + url)
        div = soup.first("div", {"class": "novelList"})
        if div == None:
            print 'no data', url
            return []
        datalist = div.findAll("a")
        objs = []
        sortType = dateutil.y_m_d()
        for item in datalist:
            try:
                obj = {}
                span = item.first('div', {"class": "pull-right date "})
                if span != None:
                    obj['fileDate'] = span.text
                else:
                    obj['fileDate'] = ''
                name = item.first("div", {"class": "pull-left"}).text
                obj['name'] = name.replace("【完】", "")  # strip the "finished" marker from scraped titles
                print name
                obj['url'] = item.get('href')
                obj['baseurl'] = baseurl
                obj['channel'] = channel
                obj['updateTime'] = datetime.datetime.now()
                ret = self.fetchText(item.get('href'))
                if ret == None:
                    print 'no article data', item.get('href')
                    continue
                obj['sortType'] = sortType
                objs.append(obj)
            except Exception as e:
                print common.format_exception(e)
        return objs
    except Exception as e:
        print common.format_exception(e)

def parseDomVideo(self, url):
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
        "Referer": url
    }
    obj = {}
    try:
        soup = self.fetchUrl(url, header)
        playtool = soup.first("div", {'class': 'play-wapper'})
        if playtool != None:
            obj['pic'] = playtool.first('img').get('src')
            ahrefs = playtool.findAll('a')
            for ahref in ahrefs:
                match = regVideo.search(ahref.text)
                if match != None:
                    videoUrl = match.group(1)
                    obj['mp4'] = "%s%s%s" % ("http", videoUrl, 'm3u8')
                    return obj
        print 'mp4 not found'
        return None
    except Exception as e:
        print common.format_exception(e)
        return None

def fetchImgItemsData(self, url, channel):
    soup = self.fetchUrl(baseurl4 + url)
    div = soup.first("div", {"class": "zxlist"})
    if div == None:
        print 'no data', url
        return []
    datalist = div.findAll("ul")
    objs = []
    sortType = dateutil.y_m_d()
    for item in datalist:
        ahref = item.first("a")
        if ahref != None:
            try:
                obj = {}
                obj['fileDate'] = item.first('li', {"class": "zxsyd"}).text
                name = ahref.text
                obj['name'] = name
                obj['url'] = ahref.get('href')
                obj['baseurl'] = baseurl4
                obj['channel'] = channel
                obj['updateTime'] = datetime.datetime.now()
                pics = self.fetchImgs(obj['url'])
                if len(pics) == 0:
                    print 'no image files--', obj['url'], '---', url
                    continue
                obj['picList'] = pics
                obj['showType'] = 3
                obj['pics'] = len(pics)
                obj['sortType'] = sortType
                print name, pics[0], ' url=', obj['url'], ' image count=', len(pics)
                objs.append(obj)
            except Exception as e:
                print common.format_exception(e)
    return objs

def parseDomVideo(self, url): try: soup = self.fetchUrl(url, header) adiv = soup.first("div",{"class":"playBar"}) if adiv!=None: ahref = adiv.first('a') if ahref!=None: soup = self.fetchUrl(ahref.get("href"), header) style = soup.first("ul",{"style":"text-align:center;;"}) if style!=None: script = style.first("script") if script!=None: text = unquote(str(self.fetchUrl(script.get("src")))) texts = text.split("$") for item in texts: match = regVideo.search(item) if match!=None: videoUrl =match.group(1) return "%s%s%s"%("http",videoUrl,'m3u8') print '没找到mp4' return None except Exception as e: print common.format_exception(e) return None
def run(self):
    dbVPN = db.DbVPN()
    ops = db_ops.DbOps(dbVPN)
    ops.inertTextChannel(self.t_obj)
    dbVPN.commit()
    print self.t_obj
    try:
        channel = self.t_obj['url']
        for i in range(1, maxTextPage):
            url = self.t_obj['url'].replace(".html", "-") + str(i) + ".html"
            count = self.update(url, ops, channel)
            dbVPN.commit()
            if count == 0:
                break
        dbVPN.close()
    except Exception as e:
        print common.format_exception(e)
        dbVPN.commit()
        dbVPN.close()

def fetchHeadChannel(self):
    try:
        soup = self.fetchUrl("/")
        menu = soup.first("div", {"id": "nav"})
        if menu == None:
            print 'channel menu not found ', baseurl
            return None
        lis = menu.findAll("a")
        ret = []
        for a in lis:
            print a
            # skip the "首页" (home) link; everything else is a channel
            if a != None and a.text.find('首页') == -1:
                row = {}
                row['name'] = a.text
                row['baseurl'] = baseurl
                row['url'] = a.get('href')
                row['channelType'] = 'normal'
                row['updateTime'] = datetime.datetime.now()
                row['channel'] = baseurl.replace("http://", "").replace("https://", "") + channel_pre + a.get('href')
                ret.append(row)
        return ret
    except Exception as e:
        print common.format_exception(e)

def fetchContentUrlWithBase(self, url):
    count = 0
    while count < maxCount:
        try:
            req = urllib2.Request(
                baseurl + url,
                headers={
                    'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.2.13) Gecko/20101203 Firefox/3.6.13',
                    "Referer": baseurl
                })
            response = urllib2.urlopen(req, timeout=300)
            content = response.read()
            gzipped = response.headers.get('Content-Encoding')  # check whether the server gzip-compressed the response
            if gzipped:
                content = zlib.decompress(content, 16 + zlib.MAX_WBITS)  # decompress to get the page source
            return content
        except Exception as e:
            print common.format_exception(e)
            print 'failed to open page, retrying', baseurl + url, 'attempt', count
            count = count + 1
    print 'page still failing after 3 retries', baseurl + url
    return ''

# p = BaseParse()
# print p.fetchContentUrlWithBase("/list/?37-1.html", header)
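
# The fetch helpers above all repeat the same Content-Encoding handling; the
# safe order is always read bytes -> decompress -> decode. A shared helper
# like this sketch (an assumed refactor, not part of the original) would keep
# that logic in one place:
def readResponse(response, charset='utf8', wbits=16 + zlib.MAX_WBITS):
    content = response.read()
    if response.headers.get('Content-Encoding'):  # gzip (use wbits=-zlib.MAX_WBITS for raw deflate)
        content = zlib.decompress(content, wbits)
    return content.decode(charset, errors='replace')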