def parse_item(self, response):
    """Parse one Hexun blog article page.

    Extracts the article title and URL, then fetches the companion
    click-counter script (click.tool.hexun.com) embedded in the page to
    obtain the click and comment counts.

    Returns a populated HexunItem; count fields are None when the counter
    script cannot be located or parsed (the original crashed with
    IndexError in that case).
    """
    item = HexunItem()
    item["title"] = response.xpath("//span[@class='ArticleTitleText']/a/text()").extract()
    item["link"] = response.url

    # URL of the click/comment counter script embedded in the page body.
    pat_link = r'(http://click.tool.hexun.com/click.aspx\?articleid=.*?)"'
    links = re.findall(pat_link, str(response.body), re.S)
    if not links:
        # Page layout changed / counter script missing: return partial item.
        item["click"] = None
        item["comment"] = None
        return item
    click_link = links[0]

    headers = {
        'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
        'Referer': response.url,
    }
    # Attach headers to this one request instead of installing a
    # process-global opener (the original install_opener call mutated
    # urllib state for every other caller in the process).
    req = urllib.request.Request(click_link, headers=headers)
    click_data = urllib.request.urlopen(req).read().decode("utf-8", 'ignore')

    # The counter script assigns the counts via DOM innerHTML updates.
    pat_click = r'"articleClickCount"\).innerHTML = (.*?);'
    pat_comment = r'"articleCommentCount"\).innerHTML = (.*?);'
    clicks = re.findall(pat_click, click_data, re.S)
    comments = re.findall(pat_comment, click_data, re.S)
    item["click"] = clicks[0] if clicks else None
    item["comment"] = comments[0] if comments else None  # fixes 'commemt' typo
    return item
def detail_parse(self, response):
    """Parse a JSONP article-list response and yield the article URLs.

    The body looks like ``hx_json11587781686930( {...} )`` — the callback
    wrapper is stripped positionally (23 leading chars, 4 trailing chars)
    before JSON-decoding, matching the original slicing.
    """
    item = HexunItem()
    # Drop the JSONP callback wrapper around the JSON payload.
    payload = response.text[23:-4]
    json_obj = json.loads(payload)
    # One 'entityurl' per result entry.
    item['url'] = [str(entry['entityurl']) for entry in json_obj['result']]
    yield item
def data(self, response):
    """Build a HexunItem for one article detail page.

    The title and stock code are carried over from the scheduling request
    via ``response.meta``; time, author and body text come from the page.
    """
    def first(query):
        # First match of an XPath query on this response.
        return response.xpath(query).extract()[0]

    meta = response.meta
    item = HexunItem()
    item['url'] = response.url
    item['title'] = meta['title']
    item['stock'] = meta['stock']
    item['time'] = first("//span[@class='pr20']/text()")
    item['author'] = first("//*[@rel='nofollow']/text()")
    paragraphs = response.xpath("//div[@class='art_contextBox']/p/text()").extract()
    item['text'] = ''.join(paragraphs)
    yield item
def parse(self, response):
    """Parse the bank exchange-rate table under #BankNameList.

    Each <td class="fir_td"> holds a "FROM/TO" currency pair in its <div>
    and the rate in an <em> inside the element with class "pere". Yields
    one HexunItem per table cell.
    """
    soup = BeautifulSoup(response.body, "html.parser")
    td_list = soup.body.find(id='BankNameList').find_all('td', class_='fir_td')
    for td in td_list:
        # Hoist the two DOM lookups — the original computed each twice
        # (once only to print them for debugging).
        pair = td.div.string.strip()                      # e.g. "USD/CNY"
        rate = td.find(class_='pere').em.string.strip()
        result = pair.split('/')
        item = HexunItem()
        item['from_currency'] = result[0]
        item['to_currency'] = result[1]
        item['rate'] = rate
        yield item
def parseItem(self, response):
    """Yield one HexunItem per data row of a JSONP futures-quote response.

    The body is stripped of its trailing ';' and surrounding parentheses
    before JSON-decoding; each row of Data[0] is positional:
    [dateTime, price, amount, volumn, avePrice, openInterest].
    """
    body = response.body_as_unicode().strip(';').strip('(').strip(')')
    payload = json.loads(body)
    contract_code = self.getContractName(response)
    for row in payload['Data'][0]:
        record = HexunItem()
        record['product'] = contract_code
        record['dateTime'], record['price'], record['amount'] = row[0], row[1], row[2]
        # NOTE: 'volumn' is the item's declared field name (sic) — keep it.
        record['volumn'], record['avePrice'], record['openInterest'] = row[3], row[4], row[5]
        yield record
def parse(self, response):
    """Parse one blog list page.

    Extracts every post's title and link, fetches the companion
    click/comment counter script for the page, yields the collected item,
    then discovers the total page count and schedules the remaining list
    pages (which re-enter this callback).
    """
    item = HexunItem()
    # Post titles and links from the article list.
    item['name'] = response.xpath("//span[@class='ArticleTitleText']/a/text()").extract()
    item["url"] = response.xpath("//span[@class='ArticleTitleText']/a/@href").extract()

    # The per-post click/comment counts live in a separate script whose URL
    # is embedded in the page; if the page structure changed this extracts
    # nothing.
    pat1 = '<script type="text/javascript" src="(http://click.tool.hexun.com/.*?)">'
    urls = re.findall(pat1, str(response.body))
    hcurl = urls[0] if urls else ""

    # Single UA constant — the original duplicated this string twice.
    ua = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36 SE 2.X MetaSr 1.0"

    if hcurl:
        # Per-request headers instead of installing a process-global opener
        # (install_opener mutated urllib state for the whole process).
        req = urllib.request.Request(hcurl, headers={"User-Agent": ua})
        data = urllib.request.urlopen(req).read()
        # Click ("hits") and comment counts for every post on the page.
        item["hits"] = re.findall(r"click\d*?','(\d*?)'", str(data))
        item["comment"] = re.findall(r"comment\d*?','(\d*?)'", str(data))
    else:
        # BUG FIX: the original printed a warning but still called
        # urlopen("") and crashed. Yield a partial item instead.
        print("Extracted nothing!")
        item["hits"] = []
        item["comment"] = []
    yield item

    # Total number of list pages: the second-to-last ".../pN/" match.
    pat4 = "blog.hexun.com/p(.*?)/"
    data2 = re.findall(pat4, str(response.body))
    totalurl = data2[-2] if len(data2) >= 2 else 1

    # Schedule pages 2..total (page 1 is the current response).
    for i in range(2, int(totalurl) + 1):
        nexturl = "http://" + str(self.uid) + ".blog.hexun.com/p" + str(i) + "/default.html"
        yield Request(nexturl, callback=self.parse, headers={'User-Agent': ua})
def parse_item(self, response):
    """Parse a blog article page and enrich it with click/comment counts
    fetched from the click.tool.hexun.com counter endpoint.

    Returns a populated HexunItem; the count fields are None when the
    counter response does not match (the original crashed with IndexError
    in that case).
    """
    item = HexunItem()
    item['title'] = response.xpath("//span[@class='ArticleTitleText']/a/text()").extract()[0]
    item['link'] = response.url
    # blog_id / article_id are needed to build the counter URL.
    # NOTE: Scrapy's SelectorList.re() has no flags parameter — the
    # original's trailing re.S landed in replace_entities (truthy, same as
    # its default True), so dropping it preserves behavior.
    item['blog_id'] = response.xpath("//script").re(r'ARecommend.aspx\?blogid=(.*?)&')[0]
    item['article_id'] = response.xpath("//span[@class='ArticleTitleText']/a/@href").re(r'blog.hexun.com/(.*?)_d.html')[0]

    cc_url = "http://click.tool.hexun.com/click.aspx?articleid=%s&blogid=%s" % (item['article_id'], item['blog_id'])
    headers = {
        'Referer': response.url,
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
    }
    data = requests.get(cc_url, headers=headers).text

    comments = re.findall(r'articleCommentCount.*?= (.*?);', data, re.S)
    clicks = re.findall(r'articleClickCount.*?= (.*?);', data, re.S)
    item['comment'] = comments[0] if comments else None
    item['click'] = clicks[0] if clicks else None
    return item