def parse(self, response):
    """Scrape the article links off one listing page, follow each one,
    then paginate.

    For every article link found on the page a detail request is yielded
    (handled by ``parse_detail``); afterwards the "next page" link is
    followed, capped at a few pages.

    :param response: listing-page response
    :yields: ``Request`` objects for article detail pages and the next
        listing page
    """
    # All article links on the current listing page.
    article_link_list = response.css(
        '#archive .floated-thumb a.archive-title::attr(href)').extract()
    # extract_first() returns None instead of raising IndexError when the
    # last page has no "next" link (original used extract()[0], which
    # crashed there).
    next_page_link = response.css(
        '.next.page-numbers::attr(href)').extract_first()
    # Page number is assumed to be the second-to-last character of the
    # next-page URL — TODO confirm this holds for multi-digit pages.
    meta = {
        'page_num': next_page_link[-2:-1] if next_page_link else ''
    }
    for url in article_link_list:
        print('url = ', url)
        yield Request(url=url, callback=self.parse_detail, meta=meta)
    # Pagination cap. The original used a function-local counter that was
    # reset to 0 on every callback invocation, so its "stop after a few
    # pages" check could never fire; keep the count on the spider instance
    # instead so it survives across callbacks.
    self._listing_pages_followed = getattr(
        self, '_listing_pages_followed', 0) + 1
    if next_page_link and self._listing_pages_followed <= 4:
        yield Request(url=next_page_link, callback=self.parse)
def parse_detial(self, response):
    """Pull the post owner's username out of a post's JSON payload and
    follow the owner's profile JSON page.

    NOTE(review): the method name 'detial' is a typo for 'detail', but
    other callbacks reference it by this exact name, so it is left
    unchanged.

    :param response: response whose body is the post JSON
    :yields: ``Request`` for the owner's profile page, carrying the
        username in ``meta``
    """
    payload = json.loads(response.text)
    owner = (payload.get('graphql')
                    .get('shortcode_media')
                    .get('owner'))
    username = owner.get('username')
    # Assemble the JSON endpoint of the owner's personal home page.
    personal_url = 'https://www.instagram.com/{}/?__a=1'.format(username)
    yield Request(url=personal_url,
                  meta={'username': username},
                  callback=self.parse_person)
def parse(self, response):
    """Parse one listing page of the "安全牛" security-news site.

    Builds an info dict per article row (title, source, url, abstract,
    author, release time, full content, image URLs), yields it wrapped
    in ``IbugItem``, then follows the "previous posts" pagination link.

    :param response: listing-page response
    :yields: ``IbugItem`` per article, plus a ``Request`` for the next page
    """
    # print(response.text)
    rownodes = Selector(
        response=response).xpath('//div[@class="row post"]')
    # Analyze each news row on the page.
    for rowpost in rownodes:
        inforesult = {}
        colnode = rowpost.xpath('.//div[@class="col-md-7 col-sm-6"]')[0]
        inforesult['i_title'] = colnode.xpath(
            './h4/a/text()').extract_first()
        print(inforesult['i_title'])
        # Fixed classification labels (kept in Chinese — they are stored
        # values, not comments): "威胁情报" = threat intelligence,
        # "安全牛" = the source site's name.
        inforesult['i_type'] = "威胁情报"
        inforesult['i_sourcesite'] = "安全牛"
        inforesult['i_url'] = colnode.xpath('./h4/a/@href').extract_first()
        # The uuid is an MD5 of the article URL; only computed when a URL
        # was actually found.
        if inforesult['i_url']:
            inforesult['i_uuid'] = MD5.get_md5(inforesult['i_url'])
        inforesult['i_abstract'] = colnode.xpath(
            './p/text()').extract_first()
        author = colnode.xpath(
            './/span[@class="author"]/a/text()').extract_first()
        authorurl = colnode.xpath(
            './/span[@class="author"]/a/@href').extract_first()
        # Author name + profile link stored as a JSON string.
        authordict = {'author': author, 'authorurl': authorurl}
        inforesult['i_author'] = json.dumps(authordict, ensure_ascii=False)
        # Date text looks like: 星期三, 四月 18, 2018 (Wed, April 18, 2018).
        timestr = colnode.xpath(
            './/span[@class="date"]/text()').extract_first()
        inforesult['i_releasetime'] = self.get_info_releasetime(timestr)
        # Fetches and extracts the article body from the detail page.
        inforesult['i_content'] = self.get_info_content(
            inforesult['i_url'])
        inforesult['i_imagesurls'] = rowpost.xpath(
            './/div[@class="thumb"]/a/img/@src').extract()
        yield IbugItem(inforesult)
    # Visit the next page ("nav-previous" holds the older-posts link).
    nextpage = Selector(response=response).xpath(
        '//div[@class="navigation"]/div[@class="nav-previous"]/a/@href'
    ).extract_first()
    if nextpage:
        print('--------------------------------------访问下一页:' + nextpage)
        # dont_filter: pagination URLs may repeat; skip the dupe filter.
        yield Request(nextpage, callback=self.parse, dont_filter=True)
def get_response_object(self, url):
    """Build an ``HtmlResponse`` from a file on the local filesystem.

    Strips ``FILE_SYSTEM_PREFIX`` off *url* to get the local path, reads
    the file's raw bytes, and wraps them in a 200 ``HtmlResponse`` that
    carries the headers from ``generate_response_headers()``.

    :param url: pseudo-URL pointing into the local filesystem
    :return: ``HtmlResponse`` with the file's content as body
    """
    path_to_file = url.replace(FILE_SYSTEM_PREFIX, '')
    # with-statement guarantees the handle is closed even if read()
    # raises (the original leaked the handle on error).
    with open(path_to_file, 'rb') as f:
        bytess = f.read()
    return HtmlResponse(url, 200, self.generate_response_headers(),
                        bytess, None, Request(url), encoding='utf-8')
def parse(self, response):
    """Walk the discover-media feed JSON and open the first post.

    Follows only the first post's JSON page (second-level page, from
    which the poster's profile link can be reached).

    :param response: response whose body is the discover-feed JSON
    :yields: a single ``Request`` for the first post's JSON endpoint
    """
    payload = json.loads(response.text)
    media = (payload.get('data')
                    .get('user')
                    .get('edge_web_discover_media'))
    for edge in media.get('edges'):
        shortcode = edge.get('node').get('shortcode')
        # JSON endpoint of the individual post.
        url = 'https://www.instagram.com/p/{}/?__a=1'.format(shortcode)
        yield Request(url=url, callback=self.parse_detial)
        # Deliberately stop after the first edge — only one post is
        # needed as an entry point.
        break
def parse_fans(self, response):
    """Follow the profile page of every follower in the response.

    Parses the followers JSON and yields one profile request per
    follower found under ``data.user.edge_followed_by.edges``.

    :param response: response whose body is the followers JSON
    :yields: ``Request`` objects for each follower's profile page
    """
    payload = json.loads(response.text)
    # Follower data lives under edge_followed_by.edges.
    follower_edges = payload.get('data')['user']['edge_followed_by']['edges']
    for edge in follower_edges:
        follower_name = edge['node']['username']
        # Assemble the JSON endpoint of this follower's home page.
        personal_url = 'https://www.instagram.com/{}/?__a=1'.format(
            follower_name)
        yield Request(url=personal_url, callback=self.parse_person)
def parse_person(self, response):
    """Open the JSON page of every post on a user's timeline.

    :param response: response whose body is the profile JSON; the
        username travels in ``response.meta``
    :yields: ``Request`` objects for each post's JSON endpoint, with the
        username forwarded in ``meta``
    """
    username = response.meta.get('username')
    profile = json.loads(response.text)
    user = profile.get('graphql').get('user')
    # Disabled: request that would fetch the follower list.
    # id = user.get('id')
    # fans_url = 'https://www.instagram.com/graphql/query/?query_hash=56066f031e6239f35a904ac20c9f37d9&variables={"id":"{}","include_reel":true,"fetch_mutual":false,"first":24}'.format(id)
    # yield Request(
    #     url=fans_url,
    #     callback=self.parse_fans
    # )
    # Request link for every post on the timeline.
    timeline = user.get('edge_owner_to_timeline_media')
    for edge in timeline.get('edges'):
        shortcode = edge.get('node').get('shortcode')
        post_url = 'https://www.instagram.com/p/{}/?__a=1'.format(
            shortcode)
        # NOTE(review): 'prase_post' looks like a typo for 'parse_post',
        # but the target method is defined elsewhere and may really use
        # this name — confirm before renaming.
        yield Request(url=post_url,
                      meta={'username': username},
                      callback=self.prase_post)