def fetchOldArticleList(self, channel, articleList, articleCount=100):
    '''
    From the global article table, fetch the articles that are not yet extinct
    and that are not among the records brought back by the current crawl.
    '''
    channel = int(channel)
    # Count query, used to work out how many pages of old articles exist.
    selectSql_count = 'SELECT COUNT(*) FROM %s where extinct="N" and channel_id=%d '
    sql2 = selectSql_count % (Constants.TABLE_SA_ARTICLE, channel)
    # Query that fetches the old articles themselves.
    selectSql = ('SELECT TID, title, publish_datetime, url, meta_info, '
                 'like_count, reply_count, forward_count '
                 'FROM %s where extinct="N" and channel_id=%d ')
    sql = selectSql % (Constants.TABLE_SA_ARTICLE, channel)
    if len(articleList) > 0:
        # Exclude every article that this crawl has already brought back.
        whereClauseList = map(lambda article: ' tid<>"%s" ' % (article.tid),
                              articleList)
        whereClauseList = ' and '.join(whereClauseList)
        sql += ' and (%s)' % (whereClauseList)
        sql2 += ' and (%s)' % (whereClauseList)
    sql2 += ' order by add_datetime desc;'
    self.dbProxy.execute(sql2)
    resultList2 = self.dbProxy.fetchall()  # e.g. ((53,),)
    totalCount = int(resultList2[0][0]) if resultList2 else 0
    if totalCount > int(articleCount):
        # Pick a random page so repeated calls sample different slices of the old articles.
        pageCount = int(math.ceil(float(totalCount) / articleCount))
        randpage = random.randint(0, pageCount - 1)
    else:
        randpage = 0
    # Turn the random page number into a LIMIT offset.
    sql += ' order by add_datetime desc limit %d,%d' % (randpage * articleCount,
                                                        articleCount)
    self.dbProxy.execute(sql)
    resultList = self.dbProxy.fetchall()
    oldArticleList = []
    for item in resultList:
        result = Article(item[0], channel,
                         title=item[1],
                         publish_datetime=item[2],
                         url=item[3],
                         meta_info=item[4])
        result.statistics = ArticleStatistics(item[0], channel,
                                              like_count=item[5],
                                              reply_count=item[6],
                                              forward_count=item[7])
        oldArticleList.append(result)
    return oldArticleList
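# --- Illustrative sketch (not part of the original class) --------------------
# A minimal, standalone example of the random-page sampling performed by
# fetchOldArticleList above: given a total row count and a page size, pick a
# random page and turn it into a MySQL LIMIT clause. The helper name
# `random_limit_clause` is hypothetical and used only for illustration.
import math
import random


def random_limit_clause(total_rows, page_size=100):
    """Return a 'limit offset,count' clause that samples one random page."""
    if total_rows <= page_size:
        return 'limit 0,%d' % page_size
    page_count = int(math.ceil(float(total_rows) / page_size))
    page = random.randint(0, page_count - 1)
    return 'limit %d,%d' % (page * page_size, page_size)

# Example: 253 rows with a page size of 100 gives pages 0-2, so the clause is
# one of 'limit 0,100', 'limit 100,100' or 'limit 200,100'.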
def parse_info(self, response):
    weibo_list = response.xpath("//div[@class='c' and @id]")
    for weibo in weibo_list:
        item = Weibospider1Item()
        div = weibo.xpath("./div")
        if len(div) == 1:
            # Post type: original post without images.
            item["category"] = "无图原创"
            item["author"] = weibo.xpath(
                "./div/a[@class='nk']/text()").extract_first()
            item['author_id'] = weibo.xpath(
                "./div[1]/a[@class='nk']/@href").extract_first()
            item["content"] = weibo.xpath(
                "./div/span[@class='ctt']").xpath('string(.)').extract()
            img = weibo.xpath("./div/span[@class='ctt']/img/@src")
            if len(img) == 1:
                item["content"] = weibo.xpath(
                    "./div/text()|./div/span[@class='ctt']//text()").extract()
            item["dianzan"] = weibo.xpath("./div/a/text()").extract()[-4]
            item["relay"] = weibo.xpath("./div/a/text()").extract()[-3]
            item["comment"] = weibo.xpath(
                "./div/a[@class='cc']/text()").extract_first()
            item["comment_url"] = weibo.xpath(
                "./div/a[@class='cc']/@href").extract_first()
            item["send_time"] = weibo.xpath(
                "./div/span[@class='ct']/text()").extract_first()
            item["reason"] = None
            item["img_url"] = None
            item['reason_name'] = None
            item['reason_id'] = None
        elif len(div) == 2:
            item["category"] = ""
            item["content"] = weibo.xpath(
                "./div[1]/span[@class='ctt']").xpath('string(.)').extract()
            img = weibo.xpath("./div/span[@class='ctt']/img/@src")
            if len(img) == 1:
                item["content"] = weibo.xpath(
                    "./div[1]/text()|./div[1]/span[@class='ctt']//text()").extract()
            item["relay"] = weibo.xpath("./div[2]/a/text()").extract()[-3]
            item["comment"] = weibo.xpath(
                "./div[2]/a[@class='cc']/text()").extract_first()
            item["reason"] = None
            img = weibo.xpath("./div[2]//img[@class='ib']/@src")
            if len(img) == 0:
                # Repost without images.
                item['category'] = "无图转发"
                item["author"] = weibo.xpath(
                    "./div/span[@class = 'cmt']/a/text()").extract_first()
                item['author_id'] = weibo.xpath(
                    "./div[1]/a[@class='nk']/@href").extract_first()
                item['reason_name'] = weibo.xpath(
                    "./div[1]/span[@class = 'cmt']/a/text()").extract_first()
                item['reason_id'] = weibo.xpath(
                    "./div[1]/span[@class = 'cmt']/a/@href").extract_first()
                item["dianzan"] = weibo.xpath("./div[2]/a/text()").extract()[-4]
                item["reason"] = weibo.xpath(
                    "./div[2]/text()|./div[2]//span[@class='kt']/text()").extract()
                item["comment_url"] = weibo.xpath(
                    "./div[2]/a[@class='cc']/@href").extract_first()
                item["img_url"] = None
                item["send_time"] = weibo.xpath(
                    "./div[2]/span[@class='ct']/text()").extract_first()
            else:
                # Original post with images.
                item['category'] = "有图原创"
                item["author"] = weibo.xpath(
                    "./div/a[@class='nk']/text()").extract_first()
                item['author_id'] = weibo.xpath(
                    "./div[1]/a[@class='nk']/@href").extract_first()
                item['reason_name'] = None
                item['reason_id'] = None
                item["dianzan"] = weibo.xpath("./div[2]/a/text()").extract()[-4]
                item["img_url"] = weibo.xpath(
                    "./div[2]//img[@class='ib']/@src").extract_first()
                item["comment_url"] = weibo.xpath(
                    "./div[2]/a[@class='cc']/@href").extract_first()
                item["send_time"] = weibo.xpath(
                    "./div[2]/span[@class='ct']/text()").extract_first()
        else:  # len(div) == 3
            # Repost with images.
            item["category"] = "带图片转发"
            item["author"] = weibo.xpath(
                "./div[1]/a[@class='nk']/text()").extract_first()
            item['author_id'] = weibo.xpath(
                "./div[1]/a[@class='nk']/@href").extract_first()
            item['reason_name'] = weibo.xpath(
                "./div[1]/span[@class = 'cmt']/a/text()").extract_first()
            item['reason_id'] = weibo.xpath(
                "./div[1]/span[@class = 'cmt']/a/@href").extract_first()
            item["content"] = weibo.xpath(
                "./div[1]/span[@class = 'ctt']").xpath('string(.)').extract()
            img = weibo.xpath("./div[1]/span[@class='ctt']/img/@src")
            if len(img) == 1:
                item["content"] = weibo.xpath(
                    "./div[1]/text()|./div[1]/span[@class='ctt']//text()").extract()
            item["send_time"] = weibo.xpath(
                "./div[3]/span[@class='ct']/text()").extract_first()
            item["dianzan"] = weibo.xpath("./div[3]/a/text()").extract()[-4]
            item["relay"] = weibo.xpath("./div[3]/a/text()").extract()[-3]
            item["comment"] = weibo.xpath(
                "./div[3]/a[@class='cc']/text()").extract_first()
            item["comment_url"] = weibo.xpath(
                "./div[3]/a[@class='cc']/@href").extract_first()
            item["img_url"] = weibo.xpath(
                "./div[2]//img[@class='ib']/@src").extract_first()
            item["reason"] = weibo.xpath(
                "./div[3]/text()|./div[3]//span[@class='kt']/text()").extract()
        item['relay_url'] = ''
        item['TID'] = re.findall(r'uid=.{1,}&', item["comment_url"])[0][4:-1]
        # Note: "//a" is an absolute XPath, so this collects every author
        # profile link on the whole page, not just the current post's.
        a = weibo.xpath("//a[@class='nk']/@href").extract()
        yield item
        article = Article(tid=item['TID'],
                          channel_id=9,
                          content=item['content'],
                          publish_datetime=item['send_time'],
                          url=item['comment_url'],
                          title=item['content'][0:100],
                          author_id=item['author_id'],
                          author_name=item['author'])
        article.statistics = ArticleStatistics(
            tid=item['TID'],
            channel_id=9,
            reply_count=item['comment'],
            forward_count=item['relay'],
            like_count=item['dianzan'],
        )
        if int(item['relay']) > 0:
            self.relay_url_list.append(item['relay_url'])
        self.r.append(article)
        self.name_url_list.append(a)

    # Pagination: read "current page / total pages" from the page list form.
    num_page = response.xpath(
        "//div[@id='pagelist']/form/div/text()").extract()
    num_page = [i.replace(u"\xa0", "") for i in num_page]
    num_page = [i for i in num_page if len(i) > 0][0]
    num_page = re.findall(r'\d+', num_page)
    print('Crawling page', num_page[0], 'of', num_page[1])
    max_page = NUM_PAGE
    if max_page is None:
        max_page = int(num_page[1])
    if int(num_page[0]) == max_page:
        # Flatten the per-post lists of author profile links into one list.
        author_urls = []
        for href_list in self.name_url_list:
            author_urls += href_list
        # Append every collected author link to the URL file (opened once).
        with open(os_file.a + '\\crawler_url.txt', 'a', encoding='utf-8') as f:
            for url_1 in author_urls:
                f.write(url_1 + "\n")
        print('Reached the page limit; finished crawling the search result pages')
        print('Crawler finished; starting popularity analysis')
        SARunner().article_List(self.r)
        print('Number of weibo posts crawled:', len(self.r))
        # Crawl author detail pages: avatar, id, following and follower counts.
        with open(os_file.a + '\\crawler_url.txt', 'r', encoding='utf-8') as f:
            urls = f.readlines()
        # Deduplicate the author URLs while keeping their order, then count them.
        L2 = {}.fromkeys(urls).keys()
        self.L2 = len(L2)
        print('Starting to crawl user detail pages;', self.L2, 'unique users in total')
        for url in L2:
            yield scrapy.FormRequest(url=url,
                                     callback=self.parse_info_detail,
                                     dont_filter=True)
    else:
        next_url = response.xpath("//a[text() = '下页']/@href").extract_first()
        next_url = urllib.parse.urljoin(response.url, next_url)
        yield scrapy.Request(next_url, callback=self.parse_info, dont_filter=True)