def parse(self, response):
    """Parse a thepaper.cn article page: read the comment count, then either
    follow the vote-count JSON endpoint or yield the item with zero favourites.

    Yields either a scrapy.Request (meta carries the partially-filled item to
    ``get_fav_num``) or the finished item when no content id is found in the URL.
    """
    item = SomenewItem()
    url = response.url
    item['article_id'] = get_md5(url)
    com_num = response.xpath(
        "//h2[@id='comm_span']/span/text()").extract_first()
    if com_num is None:
        com_num = 0
    else:
        # strip the "(123)" decoration and embedded whitespace around the count
        com_num = com_num.replace("(", '').replace(")", '').replace(
            "\n", '').replace("\t", '')
    # counts above 1000 are rendered like "1.2k" — expand them to an integer
    com_int = re.match(r"(.*)k$", str(com_num))
    if com_int is not None:
        item['comm_num'] = int(float(com_int.group(1)) * 1000)
    else:
        item['comm_num'] = com_num
    item['read_num'] = '0'
    item['env_num'] = '0'
    # first run of digits in the URL is the content id — TODO confirm URL scheme
    conid = re.match(r'.*?(\d+)', url)
    if conid:
        fav_url = ('https://www.thepaper.cn/cont_vote_json.jsp?contid='
                   + conid.group(1))
        yield scrapy.Request(fav_url,
                             callback=self.get_fav_num,
                             meta={"item": item})
    else:
        item['fav_num'] = 0
        item['hot_value'] = int(item['fav_num']) + int(item['comm_num'])
        yield item
def parse(self, response):
    """Read the vote widget's data-id from a bjnews page and request the
    up-vote count API, passing the partially-built item along in meta."""
    item = SomenewItem()
    item['article_id'] = get_md5(response.url)
    data_id = response.xpath(
        "//div[@class='attitude']/span/@data-id").extract_first()
    # NOTE(review): data_id may be None here, producing "...getupnum?id=None" — confirm pages always carry the attribute
    upnum_url = f'http://www.bjnews.com.cn/webapi/getupnum?id={data_id}'
    yield scrapy.Request(upnum_url,
                         callback=self.get_comment_num,
                         meta={'item': item})
def parse(self, response):
    """Pull the protocol-relative comment endpoint out of the page's inline
    JS (``getcomments:'//...'``) and follow it with the item in meta."""
    item = SomenewItem()
    item['article_id'] = get_md5(response.url)
    # NOTE(review): raises AttributeError if the marker is absent — confirm every page embeds it
    found = re.search("getcomments:'(.*?)'", response.text, re.S)
    comment_url = 'http:' + found.group(1)
    yield scrapy.Request(comment_url,
                         callback=self.get_comment_num,
                         meta={'item': item})
def parse(self, response):
    """Build the huanqiu comment-API URL from the page's contentid meta tag,
    headline and URL, then request it with the item in meta."""
    item = SomenewItem()
    page_url = str(response.url)
    item['article_id'] = get_md5(page_url)
    title = response.xpath("//div[@class='l_a']/h1/text()").extract_first()
    sourceid = response.xpath(
        "//meta[@name='contentid']/@content").extract_first()
    # NOTE(review): sourceid/title may be None, which would raise on concatenation — confirm pages always provide both
    com_url = ('https://commentn.huanqiu.com/api/v2/async?a=comment'
               '&m=source_info&appid=e8fcff106c8f&sourceid=' + sourceid
               + '&url=' + page_url + '&title=' + title)
    yield scrapy.Request(com_url,
                         callback=self.get_com_num,
                         meta={'item': item})
def parse(self, response):
    """Extract the NetEase comment ``productKey`` and ``docId`` embedded in the
    page's inline JS and request the comment-thread API.

    Security fix: the previous version rebuilt a dict literal from page text and
    ran it through ``eval()`` — executing attacker-controllable content. The two
    values are now pulled out with targeted regexes instead.
    """
    item = SomenewItem()
    item['article_id'] = get_md5(response.url)
    # page embeds e.g. "productKey":"a2869674571f77b5a0867c3d71db5856","docId":"E9UT79BB0001875P"
    key_match = re.search(r'"productKey"\s*:\s*"([^"]+)"', response.text)
    doc_match = re.search(r'"docId"\s*:\s*"([^"]+)"', response.text)
    comment_url = ('https://comment.api.163.com/api/v1/products/{productKey}'
                   '/threads/{docId}?ibc=jssdk').format(
                       productKey=key_match.group(1),
                       docId=doc_match.group(1))
    yield scrapy.Request(url=comment_url,
                         callback=self.get_comment_num,
                         meta={'item': item})
def parse(self, response):
    """Parse the comment count out of the raw page body and yield a finished
    item (this source exposes no read/fav/repost metrics, so they stay 0).

    Fixes: raw-string regex (the old ``'...\\: '`` literal triggered an
    invalid-escape warning) and ``comm_num`` is now consistently an int —
    previously it was a digit *string* on match but int 0 otherwise, making
    ``hot_value`` type-unstable.
    """
    full_text = response.body.decode()
    item = SomenewItem()
    comm_re = re.search(r'comments_count: (\d+)', full_text, re.S)
    item['comm_num'] = int(comm_re.group(1)) if comm_re else 0
    item['read_num'] = 0
    item['fav_num'] = 0
    item['env_num'] = 0
    # only comments are available here, so they are the whole hot value
    item['hot_value'] = item['comm_num']
    item['article_id'] = get_md5(response.url)
    yield item
def parse(self, response):
    """Read sina's ``sudameta`` comment parameters, build the comment-info API
    URL from ``comment_channel``/``comment_id`` and request it.

    Fix: each ``key:value`` pair is now split on the FIRST colon only — the old
    ``i.split(':')[1]`` truncated any value that itself contained a colon.
    """
    item = SomenewItem()
    url = response.url
    item['article_id'] = get_md5(url)
    com_parm = response.xpath(
        "//meta[@name='sudameta'][2]/@content").extract_first()
    # some pages put a placeholder in slot 2; fall back to the first meta tag
    if com_parm == 'sinaog:0':
        com_parm = response.xpath(
            "//meta[@name='sudameta'][1]/@content").extract_first()
    com_parm_dic = {}
    for pair in com_parm.split(';'):
        key, _, value = pair.partition(':')
        com_parm_dic[key] = value
    com_url = ('http://comment5.news.sina.com.cn/page/info?version=1'
               '&format=json&channel=' + com_parm_dic['comment_channel']
               + '&newsid=' + com_parm_dic['comment_id']
               + '&group=undefined&compress=0&ie=utf-8')
    yield scrapy.Request(com_url,
                         callback=self.get_com_num,
                         meta={"item": item})
def parse(self, response):
    """Parse comment/repost/attitude counts from the embedded JSON in the page
    body, derive the hot value, and yield the finished item.

    Fixes: the three copy-pasted regex-or-zero branches are folded into one
    helper, and the patterns are raw strings (the old ``'\\"...\\: '`` literals
    triggered invalid-escape warnings).
    """
    html = response.body.decode()

    def count_of(field):
        # each metric appears as `"<field>": <digits>`; default 0 when absent
        found = re.search(r'"%s": (\d+)' % field, html, re.S)
        return found.group(1) if found else 0

    item = SomenewItem()
    item['comm_num'] = count_of('comments_count')
    item['env_num'] = count_of('reposts_count')
    item['fav_num'] = count_of('attitudes_count')
    item['read_num'] = 0
    item['hot_value'] = int(item['comm_num']) + int(item['env_num']) + int(
        item['fav_num'])
    item['article_id'] = get_md5(response.url)
    yield item
def parse(self, response):
    """Find the QQ comment id (``cmt_id``) in the article's inline script and
    request the coral comment-count API; yield a zero-count item otherwise.

    Fix: the old code called ``re.match(...).group(1)`` unconditionally, so a
    page whose script block exists but carries no ``cmt_id`` raised
    AttributeError instead of falling through to the zero-count branch.
    """
    item = SomenewItem()
    url = response.url
    item['article_id'] = get_md5(url)
    html = response.xpath(
        "//*[@id='Main-Article-QQ']/div/div[1]/div[2]/script/text()"
    ).extract_first()
    cmt_id = None
    if html:
        html = html.replace("\n", '').replace(' ', '')
        id_match = re.match(r'.*?cmt_id=(\d+).*', html)
        if id_match:
            cmt_id = id_match.group(1)
    if cmt_id:
        com_url = 'https://coral.qq.com/article/' + cmt_id + '/commentnum'
        yield scrapy.Request(com_url,
                             callback=self.get_comm_num,
                             dont_filter=True,
                             meta={'item': item})
    else:
        # no cmt_id found means the article has no comment thread: comm_num = 0
        item['comm_num'] = 0
        item['fav_num'] = '0'
        item['read_num'] = '0'
        item['env_num'] = '0'
        item['hot_value'] = 0
        yield item