Example #1
    def fetchOldArticleList(self, channel, articleList, articleCount=100):
        '''
        Fetch articles from the global article table that have not yet
        expired (extinct="N") and that are not among the records crawled
        back in this run.
        '''
        channel = int(channel)
        # query the total number of matching rows (used to compute the page count)
        selectSql_count = 'SELECT COUNT(*) FROM %s where extinct="N" and channel_id=%d '
        sql2 = selectSql_count % (Constants.TABLE_SA_ARTICLE, channel)
        # SQL to fetch the old articles themselves
        selectSql = 'SELECT TID,title, publish_datetime,url, meta_info,like_count,reply_count,forward_count FROM %s where extinct="N" and channel_id=%d '
        sql = selectSql % (Constants.TABLE_SA_ARTICLE, channel)

        if len(articleList) > 0:
            whereClauseList = map(
                lambda article: ' tid<>"%s" ' % (article.tid), articleList)
            whereClauseList = ' and '.join(whereClauseList)
            sql += ' and (%s)' % (whereClauseList)
            sql2 += ' and (%s)' % (whereClauseList)
        sql2 += ' order by add_datetime desc;'
        self.dbProxy.execute(sql2)
        resultList2 = self.dbProxy.fetchall()
        # fetchall() returns something like ((53,),); pull the count out of
        # its string representation
        resultList2 = re.findall(r'\d+', str(resultList2))  # list of digit strings
        if int(resultList2[0]) > int(articleCount):
            # pick a random 0-based page index within the available pages
            randpage = random.randint(
                0, int(math.ceil(float(resultList2[0]) / articleCount)) - 1)
        else:
            randpage = 0  # only one page of data, start from the beginning

        # LIMIT expects a row offset, so convert the page index to an offset
        sql += ' order by add_datetime desc limit %d,%d' % (
            randpage * articleCount, articleCount)
        self.dbProxy.execute(sql)
        resultList = self.dbProxy.fetchall()

        L1 = []  # wrap each row in an Article with its ArticleStatistics
        for item in resultList:
            result = Article(item[0],
                             channel,
                             title=item[1],
                             publish_datetime=item[2],
                             url=item[3],
                             meta_info=item[4])
            result.statistics = ArticleStatistics(item[0],
                                                  channel,
                                                  like_count=item[5],
                                                  reply_count=item[6],
                                                  forward_count=item[7])
            L1.append(result)

        return L1
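
The count query plus the random LIMIT offset above amounts to sampling one random page of old articles. A minimal, self-contained sketch of that paging logic is shown below; random_page_offset, total_rows and page_size are illustrative names and not part of the original code.

import math
import random

def random_page_offset(total_rows, page_size=100):
    """Pick a random page and return the row offset for LIMIT offset, size."""
    if total_rows <= page_size:
        return 0  # everything fits on a single page
    last_page = int(math.ceil(float(total_rows) / page_size)) - 1  # 0-based pages
    page = random.randint(0, last_page)
    return page * page_size

# With 253 rows and a page size of 100, the offset is one of 0, 100 or 200.
print(random_page_offset(253))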
Example #2
    def parse_info(self, response):
        # each Weibo post on the search page is a <div class="c" id="...">
        weibo_list = response.xpath("//div[@class='c' and @id]")
        for weibo in weibo_list:
            item = Weibospider1Item()
            # the number of child <div>s distinguishes the post layout
            div = weibo.xpath("./div")
            if len(div) == 1:
                # post type: original post without images
                item["category"] = "无图原创"
                item["author"] = weibo.xpath(
                    "./div/a[@class='nk']/text()").extract_first()
                item['author_id'] = weibo.xpath(
                    "./div[1]/a[@class='nk']/@href").extract_first()
                item["content"] = weibo.xpath(
                    "./div/span[@class='ctt']").xpath('string(.)').extract()
                img = weibo.xpath("./div/span[@class='ctt']/img/@src")
                if len(img) == 1:
                    item["content"] = weibo.xpath(
                        "./div/text()|./div/span[@class='ctt']//text()"
                    ).extract()
                item["dianzan"] = weibo.xpath("./div/a/text()").extract()[-4]
                item["relay"] = weibo.xpath("./div/a/text()").extract()[-3]
                item["comment"] = weibo.xpath(
                    "./div/a[@class='cc']/text()").extract_first()
                item["comment_url"] = weibo.xpath(
                    "./div/a[@class='cc']/@href").extract_first()
                item["send_time"] = weibo.xpath(
                    "./div/span[@class='ct']/text()").extract_first()
                item["reason"] = None
                item["img_url"] = None
                item['reason_name'] = None
                item['reason_id'] = None

            elif len(div) == 2:
                item["category"] = ""
                item["content"] = weibo.xpath("./div[1]/span[@class='ctt']"
                                              ).xpath('string(.)').extract()
                img = weibo.xpath("./div/span[@class='ctt']/img/@src")
                if len(img) == 1:
                    item["content"] = weibo.xpath(
                        "./div[1]/text()|./div[1]/span[@class='ctt']//text()"
                    ).extract()
                item["relay"] = weibo.xpath("./div[2]/a/text()").extract()[-3]
                item["comment"] = weibo.xpath(
                    "./div[2]/a[@class='cc']/text()").extract_first()
                item["reason"] = None
                img = weibo.xpath("./div[2]//img[@class='ib']/@src")
                if len(img) == 0:
                    # repost without images
                    item['category'] = "无图转发"
                    item["author"] = weibo.xpath(
                        "./div/span[@class = 'cmt']/a/text()").extract_first()
                    item['author_id'] = weibo.xpath(
                        "./div[1]/a[@class='nk']/@href").extract_first()
                    item['reason_name'] = weibo.xpath(
                        "./div[1]/span[@class = 'cmt']/a/text()"
                    ).extract_first()
                    item['reason_id'] = weibo.xpath(
                        "./div[1]/span[@class = 'cmt']/a/@href").extract_first(
                        )
                    item["dianzan"] = weibo.xpath(
                        "./div[2]/a/text()").extract()[-4]
                    item["reason"] = weibo.xpath(
                        "./div[2]/text()|./div[2]//span[@class='kt']/text()"
                    ).extract()
                    item["comment_url"] = weibo.xpath(
                        "./div[2]/a[@class='cc']/@href").extract_first()
                    item["img_url"] = None
                    item["send_time"] = weibo.xpath(
                        "./div[2]/span[@class='ct']/text()").extract_first()

                else:
                    # original post with images
                    item['category'] = "有图原创"
                    item["author"] = weibo.xpath(
                        "./div/a[@class='nk']/text()").extract_first()
                    item['author_id'] = weibo.xpath(
                        "./div[1]/a[@class='nk']/@href").extract_first()
                    item['reason_name'] = None
                    item['reason_id'] = None
                    item["dianzan"] = weibo.xpath(
                        "./div[2]/a/text()").extract()[-4]
                    item["img_url"] = weibo.xpath(
                        "./div[2]//img[@class='ib']/@src").extract_first()
                    item["comment_url"] = weibo.xpath(
                        "./div[2]/a[@class='cc']/@href").extract_first()
                    item["send_time"] = weibo.xpath(
                        "./div[2]/span[@class='ct']/text()").extract_first()

            else:
                # len(div) == 3: repost with images
                item["category"] = "带图片转发"
                item["author"] = weibo.xpath(
                    "./div[1]/a[@class='nk']/text()").extract_first()
                item['author_id'] = weibo.xpath(
                    "./div[1]/a[@class='nk']/@href").extract_first()
                item['reason_name'] = weibo.xpath(
                    "./div[1]/span[@class = 'cmt']/a/text()").extract_first()
                item['reason_id'] = weibo.xpath(
                    "./div[1]/span[@class = 'cmt']/a/@href").extract_first()
                item["content"] = weibo.xpath("./div[1]/span[@class = 'ctt']"
                                              ).xpath('string(.)').extract()
                img = weibo.xpath("./div[1]/span[@class='ctt']/img/@src")
                if len(img) == 1:
                    item["content"] = weibo.xpath(
                        "./div[1]/text()|./div[1]/span[@class='ctt']//text()"
                    ).extract()
                item["send_time"] = weibo.xpath(
                    "./div[3]/span[@class='ct']/text()").extract_first()
                item["dianzan"] = weibo.xpath(
                    "./div[3]/a/text()").extract()[-4]
                item["relay"] = weibo.xpath("./div[3]/a/text()").extract()[-3]
                item["comment"] = weibo.xpath(
                    "./div[3]/a[@class='cc']/text()").extract_first()
                item["comment_url"] = weibo.xpath(
                    "./div[3]/a[@class='cc']/@href").extract_first()
                item["img_url"] = weibo.xpath(
                    "./div[2]//img[@class='ib']/@src").extract_first()
                item["reason"] = weibo.xpath(
                    "./div[3]/text()|./div[3]//span[@class='kt']/text()"
                ).extract()
            item['relay_url'] = ''

            # extract the author uid from the comment URL and use it as the TID
            item['TID'] = re.findall(r'uid=.{1,}&',
                                     item["comment_url"])[0][4:-1]
            # note: "//a" is an absolute XPath, so this collects every author
            # link on the page rather than just the ones inside this post
            a = weibo.xpath("//a[@class='nk']/@href").extract()
            yield item
            article = Article(tid=item['TID'],
                              channel_id=9,
                              content=item['content'],
                              publish_datetime=item['send_time'],
                              url=item['comment_url'],
                              title=item['content'][0:100],
                              author_id=item['author_id'],
                              author_name=item['author'])
            article.statistics = ArticleStatistics(
                tid=item['TID'],
                channel_id=9,
                reply_count=item['comment'],
                forward_count=item['relay'],
                like_count=item['dianzan'],
            )
            if int(item['relay']) > 0:
                self.relay_url_list.append(item['relay_url'])

            self.r.append(article)
            self.name_url_list.append(a)

        # pager text containing the current and the total page number
        num_page = response.xpath(
            "//div[@id='pagelist']/form/div/text()").extract()
        num_page = [i.replace(u"\xa0", "") for i in num_page]
        num_page = [i for i in num_page if len(i) > 0][0]
        num_page = re.findall(r'\d+', num_page)

        print('Crawling page', num_page[0], 'of', num_page[1])
        # stop at the configured NUM_PAGE cap, or at the site's last page
        max_page = NUM_PAGE
        if max_page is None:
            max_page = int(num_page[1])
        if int(num_page[0]) == max_page:
            L = []
            for L1 in self.name_url_list:
                L += L1
            for url_1 in L:
                with open(os_file.a + '\\crawler_url.txt',
                          'a',
                          encoding='utf-8') as f:
                    f.write(url_1 + "\n")

            print('Reached the page limit; finished crawling the search pages')
            print('Crawler finished, starting popularity analysis')
            SARunner().article_List(self.r)

            print("爬取微博数:", len(self.r))
            # print('开始爬取用户详情页数据,一共有', self.L2, '个非重复用户')
            # 爬取作者头像 id 关注 粉丝
            with open(os_file.a + '\\crawler_url.txt', 'r',
                      encoding='utf-8') as f:
                urls = f.readlines()
                # deduplicate the URLs to get the number of users left to crawl
                L2 = {}.fromkeys(urls).keys()
                self.L2 = len(L2)
                print('Starting to crawl user detail pages,', self.L2,
                      'unique users in total')
                for url in L2:
                    yield scrapy.FormRequest(url=url,
                                             callback=self.parse_info_detail,
                                             dont_filter=True)
        else:
            # follow the '下页' ("next page") link to the next search page
            next_url = response.xpath(
                "//a[text() = '下页']/@href").extract_first()
            next_url = urllib.parse.urljoin(response.url, next_url)
            yield scrapy.Request(next_url,
                                 callback=self.parse_info,
                                 dont_filter=True)
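
parse_info above assigns a fixed set of keys on Weibospider1Item. A minimal sketch of an item definition with one Field per key, inferred from those assignments (the project's actual items.py may differ), would be:

import scrapy

class Weibospider1Item(scrapy.Item):
    # fields inferred from the keys assigned in parse_info
    category = scrapy.Field()      # post type, e.g. "无图原创"
    author = scrapy.Field()        # author display name
    author_id = scrapy.Field()     # author profile URL
    content = scrapy.Field()       # post text
    dianzan = scrapy.Field()       # like counter text
    relay = scrapy.Field()         # repost counter text
    comment = scrapy.Field()       # comment counter text
    comment_url = scrapy.Field()   # URL of the post's comment page
    send_time = scrapy.Field()     # publish time text
    reason = scrapy.Field()        # repost commentary, if any
    img_url = scrapy.Field()       # attached image URL, if any
    reason_name = scrapy.Field()   # original author's name (reposts only)
    reason_id = scrapy.Field()     # original author's profile URL (reposts only)
    relay_url = scrapy.Field()     # repost list URL
    TID = scrapy.Field()           # uid extracted from the comment URL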