def craw(url, page):
    html1 = urllib.request.urlopen(url).read()
    html1 = str(html1)
    print(html1)
    pat1 = '<div id="plist".+? <div class="page clearfix">'
    result1 = retest.compile(pat1).findall(html1)
    print("=============")
    print(result1)
    result1 = result1[0]
    print("=============")
    print(result1)
    pat2 = '<img width="220" height="220" data-img="1" src="//(.+?\.jpg)">'
    imagelist = retest.compile(pat2).findall(result1)
    print("=============")
    print(imagelist)
    x = 1
    for imageurl in imagelist:
        imagename = "E:/learn/GitHub/pythoncrawl/img1/" + str(page) + str(x) + '.jpg'
        imageurl = "http://" + imageurl
        try:
            urllib.request.urlretrieve(imageurl, filename=imagename)
        except urllib.error.URLError as e:
            if hasattr(e, "code"):
                x += 1
            if hasattr(e, "reason"):
                x += 1
        x += 1
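# Usage sketch for craw() (assumptions): these snippets rely on the imports
# below, with the standard re module used under the alias retest. The JD
# list-page URL and the page range are illustrative placeholders, not taken
# from the original code.
import re as retest
import urllib.request
import urllib.error

for i in range(1, 4):
    url = "http://list.jd.com/list.html?cat=9987,653,655&page=" + str(i)
    craw(url, i)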
def parse(self, response):
    item = QtpjtItem()
    paturl = "(http://pic.qiantucdn.com/58pic/.*?).jpg"
    item["picurl"] = retest.compile(paturl).findall(str(response.body))
    patlocal = "http://pic.qiantucdn.com/58pic/.*?/.*?/.*?/(.*?).jpg"
    item["picid"] = retest.compile(patlocal).findall(str(response.body))
    yield item
    for i in range(1, 201):
        nexturl = "http://www.58pic.com/tb/id-" + str(i) + ".html"
        yield Request(nexturl, callback=self.parse)
def parse(self, response):
    item = HexunpjtItem()
    item['name'] = response.xpath(
        "//span[@class='ArticleTitleText']/a/text()").extract()
    item["url"] = response.xpath(
        "//span[@class='ArticleTitleText']/a/@href").extract()
    pat1 = '<script type="text/javascript" src="(http://click.tool.hexun.com/.*?)">'
    hcurl = retest.compile(pat1).findall(str(response.body))[0]
    headers2 = (
        "User-Agent",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36 SE 2.X MetaSr 1.0"
    )
    opener = urllib.request.build_opener()
    opener.addheaders = [headers2]
    # Install the opener globally
    urllib.request.install_opener(opener)
    # data holds the hit and comment counts for all posts on this list page
    data = urllib.request.urlopen(hcurl).read()
    # pat2 extracts the read (hit) count of each post
    pat2 = "click\d*?','(\d*?)'"
    # pat3 extracts the comment count of each post
    pat3 = "comment\d*?','(\d*?)'"
    # Extract the counts and store them in the item's hits and comment fields
    item["hits"] = retest.compile(pat2).findall(str(data))
    item["comment"] = retest.compile(pat3).findall(str(data))
    yield item
    # Extract the total number of list pages
    pat4 = "blog.hexun.com/p(.*?)/"
    # The regex returns a list; the second-to-last element is the total page count
    data2 = retest.compile(pat4).findall(str(response.body))
    if len(data2) >= 2:
        totalurl = data2[-2]
    else:
        totalurl = 1
    # The next print is useful while debugging and can stay commented out in normal runs
    # print("a total of " + str(totalurl) + " pages")
    # Crawl each remaining blog list page in turn
    for i in range(2, int(totalurl) + 1):
        # Build the URL of the next list page to crawl
        nexturl = "http://" + str(
            self.uid) + ".blog.hexun.com/p" + str(i) + "/default.html"
        # Issue the next request, again posing as a browser
        yield Request(
            nexturl,
            callback=self.parse,
            headers={
                'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36 SE 2.X MetaSr 1.0"
            })
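# The item class filled in by parse() above is defined elsewhere in the
# project; a minimal sketch consistent with the fields it uses (name, url,
# hits, comment) could look like this. The spider is also assumed to import
# Request from scrapy.http. This is an assumption, not the original definition.
import scrapy

class HexunpjtItem(scrapy.Item):
    name = scrapy.Field()     # post title
    url = scrapy.Field()      # post URL
    hits = scrapy.Field()     # read count
    comment = scrapy.Field()  # comment count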
def run(self):
    page = self.pagestart
    keycode = urllib.request.quote(self.key)  # assumes the search keyword was stored on the instance
    pagecode = urllib.request.quote("&page")
    for page in range(self.pagestart, self.pageend + 1):
        url = "http://weixin.sogou.com/weixin?type=2&query=" + keycode + pagecode + str(page)
        data1 = use_proxy(self.proxy, url)
        listurlpat = '<div class="txt-box">.*?(http://.*?)"'
        listurl.append(retest.compile(listurlpat, retest.S).findall(data1))
    print("got " + str(len(listurl)) + " pages")
    for i in range(0, len(listurl)):
        time.sleep(7)
        for j in range(0, len(listurl[i])):
            try:
                url = listurl[i][j]
                url = url.replace("amp;", "")
                print("enqueueing page " + str(i) + ", item " + str(j))
                self.urlqueue.put(url)
                self.urlqueue.task_done()
            except urllib.error.URLError as e:
                if hasattr(e, "code"):
                    print(e.code)
                if hasattr(e, "reason"):
                    print(e.reason)
                time.sleep(10)
            except Exception as e:
                print("exception: " + str(e))
                time.sleep(1)
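# use_proxy() is called above and in the functions below but is not shown in
# this section; a minimal sketch of such a helper, assuming it fetches a URL
# through an HTTP proxy ("host:port") with urllib and returns the decoded page:
import urllib.request

def use_proxy(proxy_addr, url):
    # Route the request through the given proxy
    proxy = urllib.request.ProxyHandler({"http": proxy_addr})
    opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
    urllib.request.install_opener(opener)
    data = urllib.request.urlopen(url).read().decode("utf-8", errors="ignore")
    return data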
def getcontent(listurl, proxy):
    i = 0
    html1 = '''<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<title>WeChat article pages</title>
</head>
<body>'''
    fh = open("../1.html", "wb")
    fh.write(html1.encode("utf-8"))  # write the page header first
    fh.close()
    fh = open("../1.html", "ab")
    for i in range(0, len(listurl)):
        for j in range(0, len(listurl[i])):
            try:
                url = listurl[i][j]
                url = url.replace("amp;", "")
                data = use_proxy(proxy, url)
                titlepat = "<title>(.*?)</title>"
                contentpat = 'id="js_content">(.*?)id="js_sg_bar"'
                title = retest.compile(titlepat).findall(data)
                content = retest.compile(contentpat, retest.S).findall(data)
                thistitle = "not retrieved this time"
                thiscontent = "not retrieved this time"
                if title != []:
                    thistitle = title[0]
                if content != []:
                    thiscontent = content[0]
                dataall = "<p>Title: " + thistitle + "</p><p>Content: " + thiscontent + "</p><br>"
                fh.write(dataall.encode("utf-8"))
                print("processed page " + str(i) + ", item " + str(j))  # helps with debugging
            except urllib.error.URLError as e:
                if hasattr(e, "code"):
                    print(e.code)
                if hasattr(e, "reason"):
                    print(e.reason)
            except Exception as e:
                print("exception: " + str(e))
                time.sleep(1)
    fh.close()
    html2 = '''</body>
</html>
'''
    fh = open("../1.html", "ab")
    fh.write(html2.encode("utf-8"))
    fh.close()
def run(self):
    html1 = '''<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<title>WeChat article pages</title>
</head>
<body>'''
    fh = open("../2.html", "wb")
    fh.write(html1.encode("utf-8"))
    fh.close()
    fh = open("../2.html", "ab")
    i = 1
    while True:
        try:
            url = self.urlqueue.get()
            data = use_proxy(self.proxy, url)
            titlepat = "<title>(.*?)</title>"
            contentpat = 'id="js_content">(.*?)id="js_sg_bar"'
            title = retest.compile(titlepat).findall(data)
            content = retest.compile(contentpat, retest.S).findall(data)
            thistitle = "not this time"
            thiscontent = "not this time"
            if title != []:
                thistitle = title[0]
            if content != []:
                thiscontent = content[0]
            dataall = "<p>title:" + thistitle + "</p><p>content:" + thiscontent + "</p><br>"
            fh.write(dataall.encode("utf-8"))
            print("page " + str(i) + "...")
            i += 1
        except urllib.error.URLError as e:
            if hasattr(e, "code"):
                print(e.code)
            if hasattr(e, "reason"):
                print(e.reason)
            time.sleep(10)
        except Exception as e:
            print("exception: " + str(e))
            time.sleep(1)
    fh.close()
    html2 = '''</body>
</html>
'''
    fh = open("../2.html", "ab")
    fh.write(html2.encode("utf-8"))
    fh.close()
def getlink(url):
    headers = ("user-agent",
               "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36")
    opener = urllib.request.build_opener()
    opener.addheaders = [headers]
    urllib.request.install_opener(opener)
    file = urllib.request.urlopen(url)
    data = str(file.read())
    # Match absolute http/https links in the page source
    pat = '(https?://[^\s)";]+\.(\w|/)*)'
    link = retest.compile(pat).findall(data)
    # Deduplicate
    link = list(set(link))
    return link
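# Usage sketch for getlink() (assumptions): the seed URL is only an example.
# Because the pattern contains two capture groups, findall() returns
# (full_url, last_group) tuples, so the link itself is the first element.
seedurl = "http://blog.csdn.net/"
linklist = getlink(seedurl)
for link in linklist:
    print(link[0])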
def getcontent(url, page):
    headers = ("user-agent",
               "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36")
    opener = urllib.request.build_opener()
    opener.addheaders = [headers]
    urllib.request.install_opener(opener)
    # Decoding sometimes fails here, so ignore decode errors for now
    data = urllib.request.urlopen(url).read().decode("utf-8", errors='ignore')
    data = str(data)
    # print(data)
    # Sample of the source HTML the pattern targets:
    # <a class="recmd-content" href="/article/121217166" target="_blank"
    #  onclick="_hmt.push(['_trackEvent','web-list-user','chick'])">(sample post text)</a>
    # userpat = 'target="_blank" title="(.*?)">'
    userpat = "'web-list-user','chick'])\" >(.*?)</a>\""
    # contentpat = '<div class="content">(.*?)</div>'
    userlist = retest.compile(userpat, retest.S).findall(data)
    print(userlist)
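# Usage sketch for getcontent() above (assumptions): the list-page URL format
# is a placeholder for a Qiushibaike text-list page; note that the page
# argument is currently unused inside the function.
for i in range(1, 3):
    url = "https://www.qiushibaike.com/text/page/" + str(i) + "/"
    getcontent(url, i)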
def parse_one_page(html):
    pattern = retest.compile(
        '<dd>.*?board-index.*?>(\d+)</i>.*?<a.*?src="(.*?)">.*?name"><a'
        + '.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
        + '.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>', retest.S)
    items = retest.findall(pattern, html)
    print(items)
    for item in items:
        yield {
            'index': item[0],
            'image': item[1],
            'title': item[2].strip(),
            'actor': item[3].strip()[3:] if len(item[3]) > 3 else '',
            'time': item[4].strip()[5:] if len(item[4]) > 5 else '',
            'score': item[5].strip() + item[6].strip()
        }
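# Usage sketch for parse_one_page() (assumptions): the board URL and the
# User-Agent string are placeholders. The function is a generator, so iterate
# over it to get one dict per film.
import json
import urllib.request

req = urllib.request.Request(
    "http://maoyan.com/board/4",
    headers={"User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36"})
html = urllib.request.urlopen(req).read().decode("utf-8", errors="ignore")
for movie in parse_one_page(html):
    print(json.dumps(movie, ensure_ascii=False))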
def getlisturl(key, pagestart, pageend, proxy):
    try:
        page = pagestart
        keycode = urllib.request.quote(key)
        pagecode = urllib.request.quote("&page")
        for page in range(pagestart, pageend + 1):
            url = "http://weixin.sogou.com/weixin?type=2&query=" + keycode + pagecode + str(page)
            data1 = use_proxy(proxy, url)
            listurlpat = '<div class="txt-box">.*?(http://.*?)"'
            listurl.append(retest.compile(listurlpat, retest.S).findall(data1))
        print("total " + str(len(listurl)))
        return listurl
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
        time.sleep(0)
    except Exception as e:
        print("exception: " + str(e))
        time.sleep(1)
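# Driver sketch for getlisturl() and getcontent() above (assumptions): the
# keyword and proxy address are placeholders, and listurl is the module-level
# list that getlisturl() appends to.
listurl = []
key = "python"
proxy = "127.0.0.1:8888"  # hypothetical HTTP proxy, host:port
listurl = getlisturl(key, 1, 2, proxy)
getcontent(listurl, proxy)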
print(result2)
print("----------")
string = "apythonhellomypythonhispythonourpythonend"
pattern = ".python."
result = retest.match(pattern, string)
result2 = retest.match(pattern, string).span()
print(result)
print(result2)
print("----------")
string = "hellomypythonhispythonourpythonend"
pattern = ".python."
result = retest.match(pattern, string)
result2 = retest.search(pattern, string)
print(result)
print(result2)
print("----------")
string = "hellomypythonhispythonourpythonend"
pattern = retest.compile(".python.")  # precompile the pattern
result = pattern.findall(string)      # find all matches of the pattern
print(result)
result = retest.compile(".python.").findall(string)
print(result)
print("----------")
pattern = "python."
result1 = retest.sub(pattern, "php", string)     # replace every match
result2 = retest.sub(pattern, "php", string, 2)  # replace at most two matches
headall.append(item)
opener.addheaders = headall
urllib.request.install_opener(opener)

# Custom function craw(vid, comid): fetches the comment page for the given
# video id and comment id and returns the raw response data
def craw(vid, comid):
    url = "http://coral.qq.com/article/" + vid + "/comment?commentid=" + comid + "&reqnum=20"
    data = urllib.request.urlopen(url).read().decode("utf-8")
    return data

idpat = '"id":"(.*?)"'
userpat = '"nick":"(.*?)",'
conpat = '"content":"(.*?)",'
# Outer loop: each iteration crawls one page of comments
for i in range(1, 10):
    print("------------------------------------")
    print("Comments on page " + str(i))
    data = craw(vid, comid)
    # Inner loop: extract and print each of the 20 comments on this page
    for j in range(0, 20):
        idlist = retest.compile(idpat, retest.S).findall(data)
        userlist = retest.compile(userpat, retest.S).findall(data)
        conlist = retest.compile(conpat, retest.S).findall(data)
        print("User:    " + eval('u"' + userlist[j] + '"'))
        print("Comment: " + eval('u"' + conlist[j] + '"'))
        print("\n")
    # Set comid to the id of the last comment on this page so the next request
    # keeps loading further comments automatically
    comid = idlist[19]
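# The snippet above picks up mid-setup: it assumes vid, comid, opener, headall,
# and the (header, value) tuple named item were built beforehand. A minimal
# sketch of that setup with placeholder values (assumptions, not the original
# code); its last three lines correspond to the start of the snippet above.
import urllib.request
import re as retest

vid = "VIDEO_ID"            # placeholder video id
comid = "START_COMMENT_ID"  # placeholder id of the comment to start from
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36",
}
opener = urllib.request.build_opener()
headall = []
for key, value in headers.items():
    item = (key, value)
    headall.append(item)
opener.addheaders = headall
urllib.request.install_opener(opener)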