Example #1
0
class WeiboCrawler(object):
    '''
    classdocs
    '''
    def __init__(self, channel, logger=None):
        '''
        Constructor
        '''
        if logger is None:
            self.logger = Logging.getLogger(Logging.LOGGER_NAME_DEFAULT)
        else:
            self.logger = logger

        self.channel = channel
        self.entityId = 'SYSU'
        # self.user_name = "*****@*****.**"
        # self.session = SessionCrawler(sleepRange=[3,8])
        # self.pass_word = "810214bee810214"
        # self.user_name = "15088137907"
        # self.pass_word = "4p2yhynrb7"
        self.user_name_password = self.get_username_password()
        self.user_name = self.user_name_password[0]
        self.pass_word = self.user_name_password[1]
        self.logger.info('username: %s' % self.user_name)

        self.email = SendEmail()
        self.db = InsertDB()
        self.monitor_title = '微博爬虫监控'
        self.proxies = ''

        self.session = SessionCrawler(sleepRange=[3, 8])

        # user_name_password3 = '15767199023:j980216'
        # user_name_password1 = '13427287354:4ova7zixzj'
        # user_name_password2 = '13532011721:1emr41761u'
        # user_name_password3 = '13640792755:1eek9uuym4'
        # user_name_password4 = '13697726577:7hviv4old0'####
        # user_name_password5 = '13794342903:6imuw2cdya'

        # Weibo accounts on 197
        # user_name_password1 = '17825769929:4ms7e2v3zx'
        # user_name_password2 = '18211493432:7fagvqyi9p'
        # user_name_password3 = '17827278983:0nenzag325'
        # user_name_password4 = '13922771190:5aqa10wvwf'
        # user_name_password5 = '15999916968:2i45j5b49y'

        # 15119820746 - ---0htkvsq5h6
        # 15986585396 - ---5gsmhx3e8k
        # 13430915912 - ---8s1nif2d50
        # 15012471375 - ---3qwlffw8vv
        # 17880567972 - ---6jrlzr2fqe
        # 17876156948 - ---5g5w4i43f3
        # 15915132451 - ---2rl2v9hy9t
        # 13543985544 - ---8x0pqi3as7
        # 13717382951 - ---5p2d39l19r
        # 13640695490 - ---6nxc4vou4o
    def change_cookie(self):
        '''
        Randomly pick a username whose locally cached cookie will be used
        :return: the chosen username
        '''
        # usename_list = [
        #             '18814095644','13432881156','*****@*****.**','15018377821','*****@*****.**',
        #             '15767199023','13427287354','13532011721','13640792755','13794342903',
        #             '17825769929','18211493432','17827278983','13922771190','15999916968',
        #             '15119820746','15986585396','13430915912','15012471375','17880567972',
        #             '17876156948','15915132451','13543985544','13717382951','13640695490',
        #             '15711707673','13680181412','13414759320','17820956139','18476072534',
        #             '17806699214','13418852766','17827181603','15919354070','15088137907'
        #                ]
        username_list = [
            '18814095644',
            '13432881156',
            '*****@*****.**',
            '15018377821',
            '*****@*****.**',
        ]
        username = random.choice(username_list)

        return username

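    # Note on change_cookie() above (assumed behaviour, not stated in the
    # original): the returned username is used as the key of a per-user local
    # cookie file, i.e. it is passed to __load_cookies_from_lwp(username_cookie)
    # in __searchByPage() below.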
    def get_username_password(self):
        '''
        Randomly pick a username/password pair
        :return: [username, password]
        '''
        user_name_password1 = '18814095644:ljda.18814095644'
        user_name_password2 = '13432881156:liang452035397'
        user_name_password3 = '[email protected]:810214bee810214'
        user_name_password4 = '15018377821:zzm15331411'
        user_name_password5 = '15767199023:j980216'
        user_name_password6 = '[email protected]:uwinvip'

        user_list = [
            user_name_password1, user_name_password2, user_name_password3,
            user_name_password4, user_name_password5, user_name_password6
        ]

        user_choice = random.choice(user_list)
        user_name_password = user_choice.split(':')
        return user_name_password

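    # Illustrative sketch (not part of the original crawler): the credentials
    # could equally be read from environment variables instead of the
    # hardcoded list above. WEIBO_USERNAME / WEIBO_PASSWORD are assumed
    # variable names used only for this example.
    def get_username_password_from_env(self):
        import os
        user = os.environ.get('WEIBO_USERNAME', '')
        password = os.environ.get('WEIBO_PASSWORD', '')
        return [user, password]
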
    def searchArticle(self, keywordList, endTime):
        '''
        Search articles matching the keyword list within a time window ending at endTime
        @param keywordList: list of keywords
        @param endTime: end of the search time range
        '''
        run_msg = '微博爬虫开始运行'
        self.db.Insert(self.channel.channel_id, self.entityId, run_msg)
        startTime = endTime - datetime.timedelta(hours=2)
        # startTime=datetime.datetime(2017,11,20,23)
        page = 1
        articleList = list()
        hasnext = True
        while hasnext:
            data = self.__searchByPage(keywordList, startTime, endTime, page)
            if data is None:  # login failed inside __searchByPage; stop paging
                break
            (articleListInPage, hasnext) = self.__parseSearchPage(data)

            articleList.extend(articleListInPage)
            page += 1
        if not articleList:  # the list is never None; treat an empty result as "no data"
            article_msg = '微博没有爬取到数据'
            self.email.send(self.monitor_title, article_msg)
            self.db.Insert(self.channel.channel_id, self.entityId, article_msg)
        end_msg = '微博爬虫结束'
        self.db.Insert(self.channel.channel_id, self.entityId, end_msg)
        return articleList

    def __searchByPage(self, keywordList, startTime, endTime, page):
        query = urllib.quote(' '.join(keywordList).encode('utf-8'))
        params = {
            'typeall': '1',
            'suball': '1',  # include everything
            # Weibo search only accepts a day-granularity range, not %Y-%m-%d-%H
            'timescope': 'custom:%s:%s' % (startTime.strftime("%Y-%m-%d"),
                                           endTime.strftime("%Y-%m-%d")),
            'Refer': 'SWeibo_box',
            'page': page
        }
        user_agent = random.choice(AGENTS)
        headers = {
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            # 'Host': 's.weibo.com',
            # 'Referer': 'http://s.weibo.com/',
            'User-Agent': user_agent
        }
        index_url = 'http://s.weibo.com/weibo/' + query  # search entry page

        username_cookie = self.change_cookie()
        self.logger.debug('Use Cookie %s' % username_cookie)
        try:
            cookies = self.__load_cookies_from_lwp(username_cookie)
            html = self.session.get(index_url,
                                    params=params,
                                    headers=headers,
                                    cookies=cookies)  # load the local cookies
            lindex = html.find(
                '<script>STK && STK.pageletM && STK.pageletM.view({"pid":"pl_weibo_direct"'
            )
            rindex = html[lindex:].find('</script>')
            rindex = lindex + rindex - 1

            lindex = lindex + len(
                '<script>STK && STK.pageletM && STK.pageletM.view(')
            jo = json.loads(html[lindex:rindex])
            data = jo['html']  # real-time weibo results page
            self.logger.debug('Get data')
            return data
        except Exception as e:
            self.logger.debug('ERROR %s' % e)
            loginFlag = self.__login()
            self.logger.debug('Use username: %s' % self.user_name)
            if loginFlag is False:
                self.logger.error('Fail to logon')
                login_msg = '微博登录失败'
                self.email.send(self.monitor_title, login_msg)
                self.db.Insert(self.channel.channel_id, self.entityId,
                               login_msg)
                return

            cookies = self.__load_cookies_from_lwp(self.user_name)
            self.logger.debug('Get a new Cookie: %s' % cookies)
            html = self.session.get(index_url,
                                    params=params,
                                    headers=headers,
                                    cookies=cookies)  # load the local cookies
            lindex = html.find(
                '<script>STK && STK.pageletM && STK.pageletM.view({"pid":"pl_weibo_direct"'
            )
            rindex = html[lindex:].find('</script>')
            rindex = lindex + rindex - 1
            lindex = lindex + len(
                '<script>STK && STK.pageletM && STK.pageletM.view(')
            jo = json.loads(html[lindex:rindex])
            data = jo['html']  # real-time weibo results page
            return data

            # self.logger.warning('Crawler failed: %s' % e)
            # msg = '没有获取到json数据,说明微博爬虫挂了'
            # self.email.send(self.monitor_title,msg)
            # self.db.Insert(self.channel.channel_id,self.entityId,msg)

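    # Illustrative sketch (assumed helper, not in the original code): the
    # search results are embedded as
    # <script>STK && STK.pageletM && STK.pageletM.view({...})</script>,
    # so the JSON payload can be sliced out between "view(" and the closing
    # "</script>", exactly as __searchByPage() does inline above. Relies on
    # the module-level json import used elsewhere in this file.
    @staticmethod
    def _extract_pagelet_html(html):
        marker = '<script>STK && STK.pageletM && STK.pageletM.view('
        lindex = html.find(marker + '{"pid":"pl_weibo_direct"')
        if lindex < 0:
            return None
        rindex = lindex + html[lindex:].find('</script>') - 1
        lindex = lindex + len(marker)
        return json.loads(html[lindex:rindex])['html']
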
    def __parseSearchPage(self, data):
        '''
        @return: (articleList,hasnext)
        '''
        articleList = list()
        hasnext = False
        soup = BeautifulSoup(data, "lxml")
        self.logger.info(soup)
        # check if no result
        noResultDivList = soup.findAll('div', {'class': 'pl_noresult'})
        if len(noResultDivList) > 0:
            hasnext = False
            self.logger.info('No result')
            return (articleList, hasnext)

        # find page bar to check if more

        pageDivList = soup.findAll('div', {'class': 'W_pages'})
        if len(pageDivList) > 0:
            pageDiv = pageDivList[0]
            if len(pageDiv.findAll('a',
                                   {'class': 'page next S_txt1 S_line1'})) > 0:
                hasnext = True
        if hasnext is False:
            self.logger.info('The last page')

        root_1 = soup.findAll('div', {"action-type": "feed_list_item"})
        # self.logger.debug(root_1)
        for r in root_1:
            root_2 = r.find('div', {'class': "content clearfix"})
            mid = r.attrs['mid']
            article_url = root_2.find('div', {
                'class': "feed_from W_textb"
            }).findNext('a').attrs['href']
            self.logger.debug('1  %s', article_url)
            if not article_url.startswith('http:'):
                article_url = 'http:' + article_url
            # self.logger.debug(article_url)
            root_content = root_2.find('p', {'class': "comment_txt"})

            long_content = root_content.find('a', {'action-type': "fl_unfold"})
            try:
                link_content = root_content.find('a').attrs['href']
                link_content = '  原文链接: ' + link_content
            except:
                link_content = ''
            if long_content:
                content_url = 'http://s.weibo.com/ajax/direct/morethan140?' + long_content.attrs[
                    'action-data']
                self.session.randomSleep()
                response = self.session.get(content_url, textRspOnly=False)
                try:
                    content_html = response.json()['data']['html']
                    content = BeautifulSoup(
                        content_html, 'html.parser').text.strip().replace(
                            "'", "''").replace("%", "\%").replace(":", "\:")
                except Exception as e:
                    self.logger.debug('Exception: %s' % e)
                    continue
            else:
                content = root_content.text.strip().replace("'", "''").replace(
                    "%", "\%").replace(":", "\:")
                content = content + link_content
                # self.logger.error(content)

            title = content[:30].replace("'", "''").replace("%", "\%").replace(
                ":", "\:") + ' '
            author_id = r.attrs['tbinfo']
            author_id = re.findall(r'ouid=(\d+)', author_id)[0]
            author_name = root_2.find('a').attrs['nick-name']
            publish_datetime = root_2.find('a', {
                'class': "W_textb"
            }).attrs['date']

            try:
                publish_datetime = time.strftime(
                    '%Y-%m-%d %H:%M:%S',
                    time.localtime(float(publish_datetime) / 1000))
            except:
                continue
            article = Article(mid,
                              self.channel.channel_id,
                              title,
                              content,
                              publish_datetime,
                              url=article_url,
                              author_id=author_id,
                              author_name=author_name)

            # fetch statistics
            root_3 = r.find('div', {'class': "feed_action clearfix"})
            soup_li_list = root_3.findAll('li')
            self.__parseStatistics(article, soup_li_list)
            # print mid, article_url, add_datetime, channeltype, channel, title, content, author_id, author_name, \
            #     publish_datetime, reply_count, read_count, like_count, collect_count, forward_count
            if article not in articleList:
                articleList.append(article)
        return (articleList, hasnext)
Example #2
0
class WeiboCrawler(object):
    '''
    classdocs
    '''
    def __init__(self, channel=None, logger=None):
        '''
        Constructor
        '''
        # if logger is None:
        #     self.logger = Logging.getLogger(Logging.LOGGER_NAME_DEFAULT)
        # else:
        #     self.logger = logger
        self.logger = logging.getLogger()

        self.channel = channel
        self.user_name_password = self.get_username_password()
        self.user_name = self.user_name_password[0]
        self.pass_word = self.user_name_password[1]
        # self.logger.info('username: %s' % self.user_name)
        print 'username: %s' % self.user_name

        self.session = SessionCrawler(sleepRange=[3, 8])

    def get_username_password(self):
        '''
        Randomly pick a username/password pair
        :return: [username, password]
        '''
        # Weibo accounts for supercomputer 1
        # user_name_password1 = '18814095644:ljda.18814095644'
        # user_name_password2 = '13432881156:liang452035397'
        # user_name_password3 = '[email protected]:810214bee810214'
        # user_name_password4 = '15018377821:zzm15331411'
        # user_name_password5 = '[email protected]:uwinvip'

        # Weibo accounts for supercomputer 2
        # user_name_password1 = '13432881156:liang452035397'
        # user_name_password2 = '[email protected]:uwinvip'
        # user_name_password3 = '15767199023:j980216'
        # user_name_password1 = '13427287354:4ova7zixzj'
        # user_name_password2 = '13532011721:1emr41761u'
        # user_name_password3 = '13640792755:1eek9uuym4'
        # user_name_password4 = '13697726577:7hviv4old0'
        # user_name_password5 = '13794342903:6imuw2cdya'

        # Weibo accounts on 197
        user_name_password1 = '17825769929:4ms7e2v3zx'
        user_name_password2 = '18211493432:7fagvqyi9p'
        user_name_password3 = '17827278983:0nenzag325'
        user_name_password4 = '13922771190:5aqa10wvwf'
        user_name_password5 = '15999916968:2i45j5b49y'

        user_list = [
            user_name_password1, user_name_password2, user_name_password3,
            user_name_password4, user_name_password5
        ]

        # user_list = [user_name_password1, user_name_password2, user_name_password3]

        user_choice = random.choice(user_list)
        user_name_password = user_choice.split(':')
        return user_name_password

    # Randomly pick a User-Agent
    def get_random_agent(self):
        user_agent = [
            "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.110 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36 Core/1.53.2595.400 QQBrowser/9.6.10872.400",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/602.2.14 (KHTML, like Gecko) Version/10.0.1 Safari/602.2.14",
            "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:49.0) Gecko/20100101 Firefox/49.0",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0;  Trident/5.0)",
        ]
        random_header = random.choice(user_agent)
        return random_header

    def searchArticle(self, keywordList, endTime):
        '''
        Search articles matching the keyword list within a time window ending at endTime
        @param keywordList: list of keywords
        @param endTime: end of the search time range
        '''
        self.__login()
        # startTime = endTime - datetime.timedelta(hours=2)
        # # startTime=datetime.datetime(2017,11,20,23)
        # page = 1
        # articleList = list()
        # hasnext = True
        # while hasnext:
        #     data = self.__searchByPage(keywordList, startTime,endTime, page)
        #     (articleListInPage,hasnext) = self.__parseSearchPage(data)
        #
        #     articleList.extend(articleListInPage)
        #     page+=1
        # return articleList

    def crawlStatistics(self, article):
        '''
        Crawl statistics for an article
        @return: nothing; the statistics are written into the article instance
        '''
        # return
        try:
            (data, check) = self.__fetchSingleArticle(article)

            if check == '0':
                soup = BeautifulSoup(data, 'lxml')
                ulList = soup.findAll(
                    'ul', {'class': 'WB_row_line WB_row_r4 clearfix S_line2'})
                li_list = ulList[0].findAll('li')
                self.__parseStatistics(article, li_list)
            elif check == '1':
                self.logger.warning(u'要访问的网页404了:%s', article.url)
                return
            else:
                self.logger.warning(u'抱歉,你访问的页面地址有误,或者该页面不存在:%s', article.url)
                return
        except:
            self.logger.error('Fail to fetch statistics for:%s, %s',
                              article.url, traceback.format_exc())
            return

    def __save_cookies_lwp(self, cookiejar):
        """
        保存cookies到本地
        """
        filename = 'sinaweibocookies'
        lwp_cookiejar = cookielib.LWPCookieJar()
        for c in cookiejar:
            args = dict(vars(c).items())
            args['rest'] = args['_rest']
            del args['_rest']
            c = cookielib.Cookie(**args)
            lwp_cookiejar.set_cookie(c)
        lwp_cookiejar.save(filename, ignore_discard=True)

    def __load_cookies_from_lwp(self):
        """
        读取本地cookies
        """
        filename = 'sinaweibocookies'
        lwp_cookiejar = cookielib.LWPCookieJar()
        lwp_cookiejar.load(filename, ignore_discard=True)
        return lwp_cookiejar

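    # Illustrative usage (assumed flow, not shown in this snippet): after a
    # successful login the live cookie jar would be persisted with
    # __save_cookies_lwp(), and later requests reload it so the crawler can
    # reuse the authenticated session, e.g. (assuming SessionCrawler exposes
    # the underlying cookie jar):
    #
    #     self.__save_cookies_lwp(cookiejar)
    #     cookies = self.__load_cookies_from_lwp()
    #     html = self.session.get(url, headers=headers, cookies=cookies)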
    def __parseStatistics(self, article, soup_li_list):
        # new page layout
        collect_count = soup_li_list[0].find('span').text
        collect_count = re.findall(r'\d+', collect_count)
        if len(collect_count) > 0:
            collect_count = int(collect_count[0])
        else:
            collect_count = 0
        forward_count = soup_li_list[1].find('span').text

        forward_count = re.findall(r'\d+', forward_count)
        if len(forward_count) > 0:
            forward_count = int(forward_count[0])
        else:
            forward_count = 0

        reply_count = soup_li_list[2].find('span').text
        reply_count = re.findall(r'\d+', reply_count)
        if len(reply_count) > 0:
            reply_count = int(reply_count[0])
        else:
            reply_count = 0

        like_count = soup_li_list[3].find('span').text
        like_count = re.findall(r'\d+', like_count)
        if len(like_count) > 0:
            like_count = int(like_count[0])
        else:
            like_count = 0
        article.statistics.reply_count = reply_count
        article.statistics.like_count = like_count
        article.statistics.collect_count = collect_count
        article.statistics.forward_count = forward_count

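    # Illustrative refactor sketch (assumed helper, not in the original):
    # every counter in __parseStatistics() follows the same pattern -- take
    # the <span> text, pull out the first run of digits, and default to 0
    # when none is found.
    @staticmethod
    def _count_from_li(soup_li):
        digits = re.findall(r'\d+', soup_li.find('span').text)
        return int(digits[0]) if digits else 0
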
    def __parseSearchPage(self, data):
        '''
        @return: (articleList,hasnext)
        '''
        articleList = list()
        hasnext = False
        soup = BeautifulSoup(data, "lxml")
        # self.logger.info(soup)
        #check if no result
        noResultDivList = soup.findAll('div', {'class': 'pl_noresult'})
        if len(noResultDivList) > 0:
            hasnext = False
            self.logger.info('No result')
            return (articleList, hasnext)

        #find page bar to check if more

        pageDivList = soup.findAll('div', {'class': 'W_pages'})
        if len(pageDivList) > 0:
            pageDiv = pageDivList[0]
            if len(pageDiv.findAll('a',
                                   {'class': 'page next S_txt1 S_line1'})) > 0:
                hasnext = True
        if hasnext is False:
            self.logger.info('The last page')

        root_1 = soup.findAll('div', {"action-type": "feed_list_item"})
        # self.logger.debug(root_1)
        for r in root_1:
            root_2 = r.find('div', {'class': "content clearfix"})
            mid = r.attrs['mid']
            article_url = root_2.find('div', {
                'class': "feed_from W_textb"
            }).findNext('a').attrs['href']
            self.logger.debug('1  %s', article_url)
            if not article_url.startswith('http:'):
                article_url = 'http:' + article_url
            # self.logger.debug(article_url)
            root_content = root_2.find('p', {'class': "comment_txt"})

            long_content = root_content.find('a', {'action-type': "fl_unfold"})
            try:
                link_content = root_content.find('a').attrs['href']
                link_content = '  原文链接: ' + link_content
            except:
                link_content = ''
            if long_content:
                content_url = 'http://s.weibo.com/ajax/direct/morethan140?' + long_content.attrs[
                    'action-data']
                self.session.randomSleep()
                response = self.session.get(content_url, textRspOnly=False)
                try:
                    content_html = response.json()['data']['html']
                    content = BeautifulSoup(
                        content_html, 'html.parser').text.strip().replace(
                            "'", "''").replace("%", "\%").replace(":", "\:")
                except Exception as e:
                    self.logger.debug('Exception: %s' % e)
                    continue
            else:
                content = root_content.text.strip().replace("'", "''").replace(
                    "%", "\%").replace(":", "\:")
                content = content + link_content
                # self.logger.error(content)

            title = content[:30].replace("'", "''").replace("%", "\%").replace(
                ":", "\:") + ' '
            author_id = r.attrs['tbinfo']
            author_id = re.findall(r'ouid=(\d+)', author_id)[0]
            author_name = root_2.find('a').attrs['nick-name']
            publish_datetime = root_2.find('a', {
                'class': "W_textb"
            }).attrs['date']

            try:
                publish_datetime = time.strftime(
                    '%Y-%m-%d %H:%M:%S',
                    time.localtime(float(publish_datetime) / 1000))
            except:
                continue
            article = Article(mid,
                              self.channel.channel_id,
                              title,
                              content,
                              publish_datetime,
                              url=article_url,
                              author_id=author_id,
                              author_name=author_name)

            #fetch statistics
            root_3 = r.find('div', {'class': "feed_action clearfix"})
            soup_li_list = root_3.findAll('li')
            self.__parseStatistics(article, soup_li_list)

            if article not in articleList:
                articleList.append(article)
        return (articleList, hasnext)
Example #3
0
class WechatCrawler(object):
    '''
    classdocs
    '''
    def __init__(self, channel, logger=None):
        '''
        Constructor
        '''
        if logger is None:
            self.logger = Logging.getLogger(Logging.LOGGER_NAME_DEFAULT)
        else:
            self.logger = logger

        self.session = SessionCrawler(logger=self.logger)
        self.session_public = SessionCrawler(logger=self.logger)
        self.session_public_article = SessionCrawler(logger=self.logger)
        self.channel = channel
        self.entityId = 'SYSU'

        self.ip_list = None
        self.proxies = None
        self.monitor_title = '微信爬虫监控'
        self.email = SendEmail()
        self.db = InsertDB()

    def searchArticle(self, keywordList, endTime):
        '''
        Crawl articles matching the keyword list within the configured search range
        @param keywordList: list of keywords
        @param endTime: end of the search time range
        '''
        run_msg = '微信爬虫开始运行'
        self.db.Insert(self.channel.channel_id, self.entityId, run_msg)
        startTime = endTime - datetime.timedelta(
            days=self.channel.search_ranges)
        startTimeStr = startTime.strftime('%Y-%m-%d')
        endTimeStr = endTime.strftime('%Y-%m-%d')

        # startTime = endTime - datetime.timedelta(days=1)
        # startTimeStr = startTime.strftime('%Y-%m-%d')
        # endTimeStr=startTime.strftime('%Y-%m-%d')

        self.logger.debug('startTime:%s', startTimeStr)
        self.logger.debug('endTime:%s', endTimeStr)

        # randomly pick a proxy
        # proxy_crawler = ProxyCrawler()
        # proxies = proxy_crawler.get_random_proxy()

        # publicList = self.getPublic()
        # articleList = self.searchPublic(publicList)
        articleList = list()
        urlList = list()

        for keyword in keywordList:
            # skip the first request; it does not carry a time range
            pageUrl = (SOUGO_WEIXIN_URL_INIT % (urllib.quote(
                keyword.encode('utf-8')), int(time.time() * 1000))).replace(
                    '#', '%')
            self.logger.debug('pageUrl:%s', pageUrl)
            self.session.randomSleep()
            lastPageUrl = pageUrl
            # crawl WeChat results within the time window
            pageUrl = SOUGO_WEIXIN_URL_SUB_PAGE % (urllib.quote(
                keyword.encode('utf-8')), startTimeStr, endTimeStr)
            self.logger.debug('pageUrl:%s', pageUrl)
            # fetch the proxy IP queue
            self.ip_list = getIp()
            ip = self.ip_list.dequeue()
            self.proxies = {"http": "http://" + ip}

            while True:
                # proxies = {"http": "http://" + ip}
                headers = HEADERS_SOGOU.copy()
                headers['Referer'] = lastPageUrl
                try:
                    response = self.session.get(pageUrl,
                                                allow_redirects=False,
                                                headers=headers,
                                                proxies=self.proxies)
                    soup = BeautifulSoup(response, 'lxml')
                    main = soup.find('ul', {'class': "news-list"})
                    while True:
                        if main is None:
                            # self.logger.error('Fail to parse: ip被封,更新ip')
                            content = 'ip被封,更新ip'
                            # self.email.send(self.monitor_title, content)
                            self.db.Insert(self.channel.channel_id,
                                           self.entityId, content)
                            temp = self.ip_list.dequeue()
                            if self.ip_list.isempty():
                                self.ip_list = getIp()
                            self.proxies = {"http": "http://" + temp}
                            # while True:
                            #     try:
                            response = self.session.get(pageUrl,
                                                        allow_redirects=False,
                                                        headers=headers,
                                                        proxies=self.proxies)
                            soup = BeautifulSoup(response, 'lxml')
                            main = soup.find('ul', {'class': "news-list"})
                            #     break
                            # except:
                            #     ip_unuseful_content = '此ip是不合格的ip,更新ip'
                            #     # self.email.send(self.monitor_title, ip_unuseful_content)
                            #     self.db.Insert(self.channel.channel_id,self.entityId,ip_unuseful_content)
                            #     tmp = self.ip_list.dequeue()
                            #     if self.ip_list.isempty():
                            #         self.ip_list = getIp()
                            #     self.proxies = {"http": "http://" + tmp}
                        else:
                            break

                    li_list = main.findAll('li')
                    # li_list may be empty even though more pages exist
                    for li in li_list:
                        a_list = li.findAll('a')
                        try:
                            publish_datetime = li.select_one('.s-p').get('t')
                            publish_datetime = time.strftime(
                                '%Y-%m-%d %H:%M:%S',
                                time.localtime(int(publish_datetime)))
                        except Exception as e:
                            self.logger.debug(
                                'Publish_datetime crawl failed, use now time')
                            publish_datetime = datetime.datetime.now(
                            ).strftime('%Y-%m-%d %H:%M:%S')
                        print publish_datetime

                        for a in a_list:
                            if a['uigs'].startswith('article_title'):
                                # self.logger.debug('Article title:%s',a.text)
                                urlList.append((a['href'], pageUrl, a.text,
                                                publish_datetime))
                                break
                    pageBarList = soup.findAll('div',
                                               {'id': 'pagebar_container'})
                    if len(pageBarList) == 0:
                        # no pagination bar, stop
                        break
                    pageBar = pageBarList[0]
                    aList = pageBar.findAll('a')
                    foundNextPage = False
                    for a in aList:
                        if a['uigs'] == 'page_next':
                            foundNextPage = True
                            lastPageUrl = pageUrl
                            pageUrl = SOGOU_URL + a['href']
                            self.logger.debug('Found next page:%s', a.text)
                            break
                    if foundNextPage is False:
                        break
                except:
                    ip_unuseful_content = '此ip是不合格的ip,更新ip'
                    # self.email.send(self.monitor_title,ip_unuseful_content)
                    self.db.Insert(self.channel.channel_id, self.entityId,
                                   ip_unuseful_content)
                    tmp = self.ip_list.dequeue()
                    if self.ip_list.isempty():
                        self.ip_list = getIp()
                    self.proxies = {"http": "http://" + tmp}
        for item in urlList:
            article = self.crawlArticle(item[0],
                                        referer=item[1],
                                        title=item[2],
                                        publish_datetime=item[3],
                                        flag=0)

            if article is not None:
                if article not in articleList:
                    # the same article may appear several times in the results; like on Baidu, the URLs can differ yet point to the same article, so deduplicate
                    articleList.append(article)
        if not articleList:  # the list is never None; treat an empty result as "no data"
            monitor_content = '微信没有数据,或者微信爬虫挂了'
            self.email.send(self.monitor_title, monitor_content)
            self.db.Insert(self.channel.channel_id, self.entityId,
                           monitor_content)
        end_msg = '微信爬虫结束'
        self.db.Insert(self.channel.channel_id, self.entityId, end_msg)
        return articleList

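    # Illustrative sketch (assumed helper, not in the original code): the
    # proxy-rotation pattern used in searchArticle() above -- dequeue the
    # next IP and refill the queue via getIp() once it runs empty -- could
    # be factored out like this:
    def _next_proxy(self):
        ip = self.ip_list.dequeue()
        if self.ip_list.isempty():
            self.ip_list = getIp()
        return {"http": "http://" + ip}
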
    def searchPublic(self, keywordList):
        '''
        Search official accounts matching the keyword list
        :param keywordList: list of keyword tuples; the account name is at index 0
        :return: list of articles crawled from the matching accounts
        '''
        articleList = list()

        for keyword in keywordList:
            self.logger.debug(keyword)
            pageUrl = SOUGO_WECHARTPUBLIC_URL_INIT % (keyword[0])
            self.logger.info('pageUrl:%s', pageUrl)
            self.session.randomSleep()
            response = self.session.get(pageUrl,
                                        textRspOnly=False,
                                        headers=HEADERS_SOGOU)
            soup = BeautifulSoup(response.text, 'lxml')
            main = soup.find('ul', {'class': "news-list2"})

            if main is None:
                self.logger.error('Fail to parse:%s', response.text)
            try:
                li_list = main.findAll('li')
            except Exception as e:
                print e
                continue

            for li in li_list:
                a_title = li.find('p', {'class': "tit"})
                if a_title is not None:
                    title = str(a_title.text.strip())
                    if title == keyword[0]:
                        self.logger.debug(title)
                        a_href = a_title.find('a')['href']
                        sub_articleList = self.crawlWetchartpublic(a_href)
                        for article in sub_articleList:
                            articleList.append(article)
        return articleList
Example #4
0
class WeiXinCrawlerByTopic(object):
    def __init__(self, sleepRange, logger):
        '''
        Constructor
        '''
        self.logger = logger
        self.session = SessionCrawler(None, sleepRange, self.logger)

    def __querySogou(self, sougoUrl):
        '''
        Query the Sogou search page and crawl every article found in the
        news list, parsing each one and fetching its comments
        '''
        self.logger.debug('Query sougo %s', sougoUrl)
        content = self.session.get(sougoUrl, SOUGO_HEADER)
        soup = BeautifulSoup(content)
        ul = soup.findAll('ul',{'class':'news-list'})[0]
        liList = ul.findAll('li')
        for li in liList:
            aList = li.findAll('a')
            articleUrl = None
            for a in aList:
                if a['uigs'].startswith('article_title'):
                    articleUrl = a['href']
                    break
            if articleUrl is not None:
                header = WX_HEADER.copy()
                header['Referer'] = sougoUrl
                self.session.randomSleep()
                content = self.session.get(articleUrl, header)
                article = self.parseArticle(content)
                if article is None:  # banned or empty post, skip it
                    continue
                article.contentUrl = articleUrl
                queryComment(self.session, articleUrl, article)

    def parseArticle(self, content):
        soup = BeautifulSoup(content)
        article = Article()
        #content
        div = soup.findAll('div',{'class':'rich_media_content'})
        if len(div)==0:
            #it may be due to that this post has been banned
            self.logger.warn('No content')
            return None
            #raise Exception('No content for %s'% article.title)
        article.content = div[0].text

        #title in <title> ... </title>
        title = soup.findNext('title')
        
        article.title = title.text
        article.wid = hash(article.title)
        
        #find meta list
        divMeta = soup.findAll('div',{'class':'rich_media_meta_list'})[0]
        
        #post date
        emPostdate = divMeta.findAll('em',{'id':'post-date'})[0]
        article.publishDateTime = time.mktime(datetime.datetime.strptime(emPostdate.text, '%Y-%m-%d').timetuple())
        
        #author
        emAuthorList = divMeta.findAll('em',{'class':'rich_media_meta rich_media_meta_text'})
        for em in emAuthorList:
            if 'id' not in em.attrs:
                article.author = em.text
                break
        
        #profile
        divProfile = divMeta.findAll('div',{'class':'profile_inner'})[0]
        ##nickname
        strong = divProfile.findAll('strong',{'class':'profile_nickname'})[0]
        article.userName = strong.text
        ##userid
        article.userId = strong.findNext('span').text
        
        return article
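    # Illustrative usage (assumed, not in the original code): parseArticle()
    # takes the raw HTML of a WeChat article page and returns an Article with
    # content, title, publishDateTime, author and account fields filled in,
    # or None when the rich_media_content div is missing (e.g. a banned post):
    #
    #     content = self.session.get(articleUrl, header)
    #     article = self.parseArticle(content)
    #     if article is not None:
    #         article.contentUrl = articleUrl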
Example #5
0
class WeiboCrawler(object):
    '''
    classdocs
    '''
    def __init__(self, channel, logger=None):
        '''
        Constructor
        '''
        if logger is None:
            self.logger = Logging.getLogger(Logging.LOGGER_NAME_DEFAULT)
        else:
            self.logger = logger

        self.channel = channel
        # self.user_name = "*****@*****.**"
        # self.session = SessionCrawler(sleepRange=[3,8])
        # self.pass_word = "810214bee810214"
        # self.user_name = "15018377821"
        # self.pass_word = "zzm15331411"
        self.user_name_password = self.get_username_password()
        self.user_name = self.user_name_password[0]
        self.pass_word = self.user_name_password[1]
        self.logger.info('username: %s' % self.user_name)

        self.session = SessionCrawler(sleepRange=[3, 8])

    def get_username_password(self):
        '''
        Randomly pick a username/password pair
        :return: [username, password]
        '''
        user_name_password1 = '18814095644:ljda.18814095644'
        user_name_password2 = '13432881156:liang452035397'
        user_name_password3 = '[email protected]:810214bee810214'
        user_name_password4 = '15018377821:zzm15331411'
        user_name_password5 = '[email protected]:uwinvip'
        user_name_password6 = '15767199023:j980216'

        user_list = [
            user_name_password1, user_name_password2, user_name_password3,
            user_name_password4, user_name_password5, user_name_password6
        ]

        user_choice = random.choice(user_list)
        user_name_password = user_choice.split(':')
        return user_name_password

    def searchArticle(self, keywordList, endTime):
        '''
        Search articles matching the keyword list within a time window ending at endTime
        @param keywordList: list of keywords
        @param endTime: end of the search time range
        '''

        # time_now = time.time()
        # date = time.strftime('%Y-%m-%d', time.localtime(time.time()))
        # com_time_low = time.mktime(time.strptime(date + ' 07:00:00', '%Y-%m-%d %H:%M:%S'))
        # com_time_hig = time.mktime(time.strptime(date + ' 07:59:58', '%Y-%m-%d %H:%M:%S'))
        # if time_now > com_time_low and time_now < com_time_hig:
        # loginFlag = self.__login()
        # if loginFlag is False:
        #     self.logger.error('Fail to logon')
        #     return

        startTime = endTime - datetime.timedelta(hours=2)
        # startTime=datetime.datetime(2017,11,20,23)
        page = 1
        articleList = list()
        hasnext = True
        while hasnext:
            data = self.__searchByPage(keywordList, startTime, endTime, page)
            (articleListInPage, hasnext) = self.__parseSearchPage(data)

            articleList.extend(articleListInPage)
            page += 1
        return articleList

    def crawlStatistics(self, article):
        '''
        Crawl statistics for an article
        @return: nothing; the statistics are written into the article instance
        '''
        # return
        try:
            (data, check) = self.__fetchSingleArticle(article)

            if check == '0':
                soup = BeautifulSoup(data, 'lxml')
                ulList = soup.findAll(
                    'ul', {'class': 'WB_row_line WB_row_r4 clearfix S_line2'})
                li_list = ulList[0].findAll('li')
                self.__parseStatistics(article, li_list)
            elif check == '1':
                self.logger.warning(u'要访问的网页404了:%s', article.url)
                return
            else:
                self.logger.warning(u'抱歉,你访问的页面地址有误,或者该页面不存在:%s', article.url)
                return
        except:
            self.logger.error('Fail to fetch statistics for:%s, %s',
                              article.url, traceback.format_exc())
            return

    def __save_cookies_lwp(self, cookiejar):
        """
        保存cookies到本地
        """
        filename = 'sinaweibocookies'
        lwp_cookiejar = cookielib.LWPCookieJar()
        for c in cookiejar:
            args = dict(vars(c).items())
            args['rest'] = args['_rest']
            del args['_rest']
            c = cookielib.Cookie(**args)
            lwp_cookiejar.set_cookie(c)
        lwp_cookiejar.save(filename, ignore_discard=True)

    def __load_cookies_from_lwp(self):
        """
        读取本地cookies
        """
        filename = 'sinaweibocookies'
        lwp_cookiejar = cookielib.LWPCookieJar()
        lwp_cookiejar.load(filename, ignore_discard=True)
        # self.logger.debug(lwp_cookiejar)
        return lwp_cookiejar

    def __parseStatistics(self, article, soup_li_list):
        # new page layout
        collect_count = soup_li_list[0].find('span').text
        collect_count = re.findall(r'\d+', collect_count)
        if len(collect_count) > 0:
            collect_count = int(collect_count[0])
        else:
            collect_count = 0
        forward_count = soup_li_list[1].find('span').text

        forward_count = re.findall(r'\d+', forward_count)
        if len(forward_count) > 0:
            forward_count = int(forward_count[0])
        else:
            forward_count = 0

        reply_count = soup_li_list[2].find('span').text
        reply_count = re.findall(r'\d+', reply_count)
        if len(reply_count) > 0:
            reply_count = int(reply_count[0])
        else:
            reply_count = 0

        like_count = soup_li_list[3].find('span').text
        like_count = re.findall(r'\d+', like_count)
        if len(like_count) > 0:
            like_count = int(like_count[0])
        else:
            like_count = 0
        article.statistics.reply_count = reply_count
        article.statistics.like_count = like_count
        article.statistics.collect_count = collect_count
        article.statistics.forward_count = forward_count

    def __parseSearchPage(self, data):
        '''
        @return: (articleList,hasnext)
        '''
        articleList = list()
        hasnext = False
        soup = BeautifulSoup(data, "lxml")
        # self.logger.info(soup)
        #check if no result
        noResultDivList = soup.findAll('div', {'class': 'pl_noresult'})
        if len(noResultDivList) > 0:
            hasnext = False
            self.logger.info('No result')
            return (articleList, hasnext)

        #find page bar to check if more

        pageDivList = soup.findAll('div', {'class': 'W_pages'})
        if len(pageDivList) > 0:
            pageDiv = pageDivList[0]
            if len(pageDiv.findAll('a',
                                   {'class': 'page next S_txt1 S_line1'})) > 0:
                hasnext = True
        if hasnext is False:
            self.logger.info('The last page')

        root_1 = soup.findAll('div', {"action-type": "feed_list_item"})
        # self.logger.debug(root_1)
        for r in root_1:
            root_2 = r.find('div', {'class': "content clearfix"})
            mid = r.attrs['mid']
            article_url = root_2.find('div', {
                'class': "feed_from W_textb"
            }).findNext('a').attrs['href']
            self.logger.debug('1  %s', article_url)
            if not article_url.startswith('http:'):
                article_url = 'http:' + article_url
            # self.logger.debug(article_url)
            root_content = root_2.find('p', {'class': "comment_txt"})

            long_content = root_content.find('a', {'action-type': "fl_unfold"})
            try:
                link_content = root_content.find('a').attrs['href']
                link_content = '  原文链接: ' + link_content
            except:
                link_content = ''
            if long_content:
                content_url = 'http://s.weibo.com/ajax/direct/morethan140?' + long_content.attrs[
                    'action-data']
                self.session.randomSleep()
                response = self.session.get(content_url, textRspOnly=False)
                try:
                    content_html = response.json()['data']['html']
                    content = BeautifulSoup(
                        content_html, 'html.parser').text.strip().replace(
                            "'", "''").replace("%", "\%").replace(":", "\:")
                except Exception as e:
                    self.logger.debug('Exception: %s' % e)
                    continue
            else:
                content = root_content.text.strip().replace("'", "''").replace(
                    "%", "\%").replace(":", "\:")
                content = content + link_content
                # self.logger.error(content)

            title = content[:30].replace("'", "''").replace("%", "\%").replace(
                ":", "\:") + ' '
            author_id = r.attrs['tbinfo']
            author_id = re.findall(r'ouid=(\d+)', author_id)[0]
            author_name = root_2.find('a').attrs['nick-name']
            publish_datetime = root_2.find('a', {
                'class': "W_textb"
            }).attrs['date']

            try:
                publish_datetime = time.strftime(
                    '%Y-%m-%d %H:%M:%S',
                    time.localtime(float(publish_datetime) / 1000))
            except:
                continue
            article = Article(mid,
                              self.channel.channel_id,
                              title,
                              content,
                              publish_datetime,
                              url=article_url,
                              author_id=author_id,
                              author_name=author_name)

            #fetch statistics
            root_3 = r.find('div', {'class': "feed_action clearfix"})
            soup_li_list = root_3.findAll('li')
            self.__parseStatistics(article, soup_li_list)
            # print mid, article_url, add_datetime, channeltype, channel, title, content, author_id, author_name, \
            #     publish_datetime, reply_count, read_count, like_count, collect_count, forward_count
            if article not in articleList:
                articleList.append(article)
        return (articleList, hasnext)
Example #6
0
class SinanewsBlogCrawler(object):
    '''
    classdocs
    '''
    def __init__(self, channel, logger=None):
        '''
        Constructor
        '''

        if logger is None:
            self.logger = Logging.getLogger(Logging.LOGGER_NAME_DEFAULT)
        else:
            self.logger = logger
        self.session = SessionCrawler(sleepRange=[3, 8])
        self.channel = channel
        self.articleList = list()
        self.headers = {
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': 'search.sina.com.cn',
            'Upgrade-Insecure-Requests': '1'
        }

    def searchArticle(self, keywordList, endTime):
        '''
        Search articles matching the keyword list within a time window ending at endTime
        @param keywordList: list of keywords
        @param endTime: end of the search time range
        '''
        startTime = endTime - datetime.timedelta(hours=1)
        # startTime=datetime.datetime(2017,11,20,23)
        page = 0

        hasnext = True
        while hasnext:
            hasnext = self.__searchByPage(keywordList, startTime, endTime,
                                          page)
            page += 1
            self.logger.error(u'articlelength:%d', len(self.articleList))
        return self.articleList

    def __searchByPage(self, keywordList, startTime, endTime, page):
        # search via Sina's blog site search
        page = str(page)  # the URL expects page as a string
        query = urllib.quote(' '.join(keywordList).encode('utf-8'))
        params = {
            'c': 'blog',
            'range': 'article',  # include everything
            'by': 'all',
            'sort': 'time',
            'col': '',
            'source': '',
            'from': '',
            'country': '',
            'size': '',
            'time': '',
            'a': '',
            'isown': 'false',
            'page': page,
            'dpc': '',
            'q': query
        }
        search_url = "http://search.sina.com.cn/"

        url_page = self.session.get(search_url,
                                    params=params,
                                    headers=self.headers)
        soup = BeautifulSoup(url_page, "lxml")
        # self.logger.debug(soup)
        main_wrap = soup.find('div', attrs={'class': "result-boxes"})

        if main_wrap is None:  # retry once in case a transient network error returned no results
            self.logger.debug(u'第一次查找没有结果再找一遍中')
            url_page = self.session.get(search_url,
                                        params=params,
                                        headers=self.headers)
            soup = BeautifulSoup(url_page, "lxml")
            main_wrap = soup.find('div', attrs={'class': "result-boxes"})
            if main_wrap is None:  # only treat it as "no result" if the retry also finds nothing
                self.logger.debug(u'第二次查找没有结果,该关键词没有结果')
                return False  # no results for this keyword; the caller expects a bare hasnext flag

        main_article_list = main_wrap.findAll(
            'div', attrs={'class': "box-result clearfix"})

        hasnext = True
        if main_article_list is not None:
            print '1'
            for title in main_article_list:
                print '2'
                article_publishtime_wrap = title.findAll('p')[1]
                article_publishtime = article_publishtime_wrap.find(
                    'span', attrs={
                        'class': "fgray_time"
                    }).text.strip()
                self.logger.error(article_publishtime)
                urlTime = time.strptime(article_publishtime,
                                        "%Y-%m-%d %H:%M:%S")

                Y, M, D, H = urlTime[0:4]
                urlTime2 = datetime.datetime(Y, M, D, H)
                urlTime2 = time.mktime(urlTime2.timetuple())
                # compare as float timestamps
                startTime = endTime - datetime.timedelta(days=5)
                startTimeIntSecond = time.mktime(startTime.timetuple())
                endTimeIntSecond = time.mktime(endTime.timetuple())

                # crawl only if within the time window
                if urlTime2 >= startTimeIntSecond and urlTime2 <= endTimeIntSecond:
                    print '3'
                    title_wrap = title.find('h2')
                    article_url_parent = title_wrap.find('a').attrs['href']
                    article_url_index = article_url_parent.rfind('/')
                    article_url_child = article_url_parent[article_url_index +
                                                           1:]
                    article_url = 'http://blog.sina.com.cn/s/blog_' + article_url_child + '.html'
                    Ttitle = title_wrap.find('a').text.strip()
                    self.logger.debug(article_url)
                    meta_info = article_url_child
                    (content, reply_count, like_count, read_count,
                     collect_count, forward_count) = self.crawlArticle(
                         article_url, meta_info)
                    self.logger.debug(like_count)

                    if content is None:  # the post was deleted or uses an unsupported format
                        print '756236'
                        continue
                    tid = 't_' + article_url_child
                    author_name = article_publishtime_wrap.find(
                        'a', attrs={
                            'class': "rib-author"
                        }).text
                    article = Article(tid,
                                      self.channel.channel_id,
                                      Ttitle,
                                      content,
                                      article_publishtime,
                                      article_url,
                                      None,
                                      author_name,
                                      meta_info=meta_info)

                    # self.crawlComment(article)

                    article.statistics.reply_count = reply_count
                    article.statistics.like_count = like_count
                    article.statistics.read_count = read_count
                    article.statistics.collect_count = collect_count
                    article.statistics.forward_count = forward_count
                    # self.logger.debug(article)
                    if article not in self.articleList:
                        self.articleList.append(article)

                else:
                    print '78956'
                    print len(self.articleList)
                    hasnext = False
                    break

            return hasnext

    def crawlStatistics(self, article):  # hit the statistics API directly instead of re-crawling the whole page
        '''
        Crawl statistics for an article
        @return: nothing; the statistics are written into the article instance
        '''
        headers2 = {
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': 'comet.blog.sina.com.cn',
            'Upgrade-Insecure-Requests': '1'
        }
        try:
            # request the statistics endpoint
            params = {
                'maintype': 'num',
                'aids': article.meta_info[-6:],
                'uid': article.meta_info[:-8]
            }
            data_url = 'http://comet.blog.sina.com.cn/api'
            data = self.session.get(data_url, params=params, headers=headers2)
            data2 = json.loads(data[data.find(':') + 1:-3])
            like_count = data2['d']
            read_count = data2['r']
            reply_count = data2['c']
            collect_count = data2['f']
            forward_count = data2['z']
            article.statistics.reply_count = reply_count
            article.statistics.like_count = like_count
            article.statistics.read_count = read_count
            article.statistics.collect_count = collect_count
            article.statistics.forward_count = forward_count
        except:
            self.logger.error(u'该话题已被删除或有其他格式')

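    # Illustrative sketch (assumed helper, not in the original code): the
    # comet.blog.sina.com.cn/api response is JSONP-like, so the JSON body is
    # sliced out after the first ':' and before the trailing wrapper (last
    # three characters); its keys map to like (d), read (r), reply (c),
    # collect (f) and forward (z) counts.
    @staticmethod
    def _parse_stat_response(raw):
        payload = json.loads(raw[raw.find(':') + 1:-3])
        return {
            'like': payload['d'],
            'read': payload['r'],
            'reply': payload['c'],
            'collect': payload['f'],
            'forward': payload['z'],
        }
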
    def crawlArticle(self, url, meta_info):
        '''
        Crawl the article content and statistics from the url
        @return: a (content, reply_count, like_count, read_count, collect_count, forward_count) tuple
        '''
        headers = {
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': 'blog.sina.com.cn',
            'Upgrade-Insecure-Requests': '1'
        }
        headers2 = {
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': 'comet.blog.sina.com.cn',
            'Upgrade-Insecure-Requests': '1'
        }

        html = self.session.get(url, headers=headers)

        if html:
            print '111'
            soup = BeautifulSoup(html, 'lxml')  # 'html.parser' would also work as the parser
            main = soup.find('div', attrs={'id': "articlebody"})
            if main is not None:
                print '222'
                Tcontent = main.find('div',
                                     attrs={
                                         'id': "sina_keyword_ad_area2"
                                     }).text.strip()
                # request the statistics endpoint
                params = {
                    'maintype': 'num',
                    'aids': meta_info[-6:],
                    'uid': meta_info[:-8]
                }
                data_url = 'http://comet.blog.sina.com.cn/api'
                data = self.session.get(data_url,
                                        params=params,
                                        headers=headers2)
                data2 = json.loads(data[data.find(':') + 1:-3])
                like_count = data2['d']
                read_count = data2['r']
                reply_count = data2['c']
                collect_count = data2['f']
                forward_count = data2['z']
                return (Tcontent, reply_count, like_count, read_count,
                        collect_count, forward_count)
            else:
                self.logger.error(u'该话题已被删除或存在其他格式')
                return (None, None, None, None, None, None)
        # no response at all; keep the same 6-tuple shape so the caller can unpack it
        return (None, None, None, None, None, None)

    def refreshSearch(self):
        '''
        Reset the search state
        '''
        pass

    def refreshCommentCrawler(self):
        '''
        Reset comment crawling
        '''
        self.lastCommentId = None

    def crawlComment(self, article):
        '''
        Crawl the comments of an article
        @return: a (commentList, hasnext) tuple; commentList is a list of Comment instances and hasnext tells whether more comments remain to crawl
        '''
        #self.logger.debug('Article:%s', article)
        comment_url = 'http://comment5.news.sina.com.cn/page/info?channel=blog&newsid='
        comment_url = comment_url + article.meta_info
        self.logger.debug(comment_url)
        html = self.session.get(comment_url)
        add_datetime = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        commentList = list()
        main_comment = json.loads(html)['result']['cmntlist']
        if len(main_comment) == 0:
            return (commentList, False)

        for comment in main_comment:
            cid = comment['mid']
            publish_datetime = comment['time']
            user_id = comment['uid']
            user_name = comment['nick']
            content = comment['content']
            location = ','.join(jieba.cut(comment['area']))
            location_region = location[:location.find(',')]
            location_city = location[location.find(',') + 1:]
            location_country = 'CN'

            commentList.append(
                Comment(article.tid,
                        self.channel.channel_id,
                        cid,
                        add_datetime,
                        publish_datetime,
                        None,
                        location_country,
                        location_region,
                        location_city,
                        user_id,
                        user_name,
                        content,
                        None,
                        None,
                        None,
                        None,
                        dislike_count=None))

        return (commentList, False)  # during testing, article[0][222].content retrieves the content of the 222nd comment
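A minimal usage sketch for the blog crawler above, assuming it exposes the same searchArticle/crawlComment interface as the other crawler examples in this list; the keyword list, the crawler wiring, and the Article.title attribute are assumptions rather than part of the code shown here.

# -*- coding: utf-8 -*-
# Usage sketch (assumed wiring): one search pass, then fetch the comments of each article.
import datetime

def run_blog_crawler(crawler, keyword_list):
    end_time = datetime.datetime.now()
    article_list = crawler.searchArticle(keyword_list, end_time)
    for article in article_list:
        crawler.refreshCommentCrawler()
        comment_list, hasnext = crawler.crawlComment(article)
        print '%s -> %d comments' % (article.title, len(comment_list))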
示例#7
0
class WeiXinCrawler(object):
    '''
    classdocs
    '''


    def __init__(self, sleepRange, logger):
        '''
        Constructor
        '''
        self.logger = logger
        self.session = SessionCrawler(None, sleepRange, self.logger)

    def __querySogou(self, sougoUrl):
        '''
        Given the official account id, we expect to uniquely find one and only one item
        Return the url to the official account
        '''
        self.logger.debug('Query sougo %s', sougoUrl)
        content = self.session.get(sougoUrl, SOUGO_HEADER)
        soup = BeautifulSoup(content)
        item = soup.findAll('a',{'uigs':'account_name_0'})[0]
        return item['href']

    def __queryArticleList(self, sougoUrl, officialAccountUrl):
        self.logger.debug('Query ariticle list for %s', officialAccountUrl)
        header = WX_HEADER.copy()
        header['Referer'] = sougoUrl
        self.session.randomSleep()
        content = self.session.get(officialAccountUrl, header)
        lindex = content.find(WX_ARTICLE_LIST_PREFIX)+len(WX_ARTICLE_LIST_PREFIX)
        rindex = content.find(WX_ARTICLE_LIST_SUFFIX)
        rindex = lindex + content[lindex:rindex].rfind(';')
        js = content[lindex:rindex]
        jo = json.loads(js)
        aList = jo['list']
        articleList = list()
        for item in aList:
            app_msg_ext_info = item['app_msg_ext_info']
            comm_msg_info = item['comm_msg_info']            
            article = self.__fetchArticle(app_msg_ext_info)
            article.publishDateTime = comm_msg_info['datetime']
            articleList.append(article)
            if 'multi_app_msg_item_list' in item:
                for embedItem in item['multi_app_msg_item_list']:
                    article = self.__fetchArticle(embedItem)
                    article.publishDateTime = comm_msg_info['datetime']
                    articleList.append(article)
        return articleList
    
    def __fetchArticle(self, item):
        article = Article()
        article.title = item['title']
        article.wid = str(hash(article.title))
        article.author = item['author']
        article.contentUrl = item['content_url']
        article.digest = item['digest']
        article.fileid = item['fileid']
        article.sourceUrl = item['source_url']
        #print article.title,":",article.contentUrl,'\n'
        return article
    
    def __queryComment(self, articleList, referer):
        mainPageHeader = WX_HEADER.copy()
        mainPageHeader['Referer'] = referer
        for article in articleList:
            self.logger.debug('Query comment for %s', article.title)
            #find the signature
            lindex = article.contentUrl.find('signature=')
            rindex = article.contentUrl[lindex:].find('&')
            if rindex > 0:
                # rindex is relative to the slice starting at lindex, so offset it
                signature = article.contentUrl[lindex:lindex + rindex]
            else:
                signature = article.contentUrl[lindex:]
            #find the timestamp
            lindex= article.contentUrl.find('timestamp=')+len('timestamp=')
            timestamp = int(article.contentUrl[lindex:lindex+10])
            self.session.randomSleep()
            #query main page
            mainUrl = 'http://mp.weixin.qq.com'+article.contentUrl.replace('&amp;','&')
            self.session.randomSleep()
            content = self.session.get(mainUrl, mainPageHeader)
            soup = BeautifulSoup(content)
            div = soup.findAll('div',{'class':'rich_media_content'})
            if len(div)==0:
                #it may be due to that this post has been banned
                self.logger.warn('No content for %s', article.title)
                continue
                #raise Exception('No content for %s'% article.title)
            article.content = div[0].text
            #query comment page
            currentTime = int(time.time())
            url = WX_COMMENT_URL % (timestamp, signature)
            #print url
            header = WX_COMMENT_HEADER.copy()
            header['Referer'] = mainUrl
            self.session.randomSleep()
            content = self.session.get(url, header)
            jo = json.loads(content)
            #print jo.keys()
            article.readCount = jo['read_num']
            article.likeCount = jo['like_num']
            commentList = jo['comment']
            for item in commentList:
                comment = Comment()
                comment.commenterNickName = item['nick_name']
                comment.likeCount = item['like_num']
                comment.content = item['content']
                comment.contentId = item['content_id']
                comment.createTime = item['create_time']
                for replyItem in item['reply']['reply_list']:
                    reply = Reply()
                    reply.content = replyItem['content']
                    reply.createTime = replyItem['create_time']
                    reply.uin = replyItem['uin']
                    reply.toUin = replyItem['to_uin']
                    reply.replyId = replyItem['reply_id']
                    comment.replyList.append(reply)
                article.commentList.append(comment)

    def crawl(self, officialAccountId):
        
        sougoUrl = SOUGO_QUERY_URL % (1, quote(officialAccountId))
        officialAccountUrl = self.__querySogou(sougoUrl)
        articleList = self.__queryArticleList(sougoUrl, officialAccountUrl)
        self.__queryComment(articleList, officialAccountUrl)
        return articleList
        #self.__writeCsv(officialAccountId+'.csv', articleList)

    def writeDb(self, dbConf, officialAccountId, articleList):
        dbProxy = MySqlProxy(host=dbConf['dbHost'], 
                             port=3306, user=dbConf['dbUser'], 
                             passwd=dbConf['dbPasswd'], db=dbConf['dbName'])
        weixinSql = 'INSERT INTO T_WEIXIN (pid, wid, author, title, digest, content, publish_datetime, read_count, like_count) values '
        commentSql = 'INSERT INTO T_WEIXIN_COMMENT(pid, cid, wid, content, publisher_name, publish_datetime,like_count) values '
        replySql = 'INSERT INTO T_WEIXIN_REPLY (rid, cid, content, publish_datetime, uin, touin) values '
        weixinValueList = list()
        commentValueList = list()
        replyValueList = list()
        widSet = set()
        for article in articleList:
            weixinValueList.append('("%s","%s","%s","%s","%s","%s","%s",%d,%d)'%(
                                    officialAccountId,
                                    str(article.wid),
                                    article.author.replace('"','\\"'),
                                    article.title.replace('"','\\"'),
                                    article.digest.replace('"','\\"'),
                                    article.content.replace('"','\\"'),
                                    time.strftime(ISOTIMEFORMAT, time.localtime(article.publishDateTime)),
                                    article.readCount,
                                    article.likeCount
                                        ))
            widSet.add(article.fileid)
            for comment in article.commentList:
                commentValueList.append('("%s","%s","%s","%s","%s","%s",%d)'%(
                                            officialAccountId,
                                            str(comment.contentId),
                                            str(article.wid),
                                            comment.content.replace('"','\\"'),
                                            comment.commenterNickName.replace('"','\\"'),
                                            time.strftime(ISOTIMEFORMAT, time.localtime(comment.createTime)),
                                            comment.likeCount
                                        ))
                for reply in comment.replyList:
                    replyValueList.append('("%s","%s","%s","%s","%s","%s")'%(
                                            str(reply.replyId),
                                            str(comment.contentId),
                                            reply.content.replace('"','\\"'),
                                            time.strftime(ISOTIMEFORMAT, time.localtime(reply.createTime)),
                                            reply.uin,
                                            reply.toUin
                                        ))

        #clear the db firstly
        sql = 'delete from T_WEIXIN where wid in (%s) and pid="%s"' % (','.join(map(lambda x: '"'+str(x)+'"', widSet)), officialAccountId)
        dbProxy.execute(sql)
        sql = 'delete from T_WEIXIN_REPLY where cid in (select cid from T_WEIXIN_COMMENT where wid in (%s) and pid="%s")' % (','.join(map(lambda x: '"'+str(x)+'"', widSet)), officialAccountId)
        dbProxy.execute(sql)
        sql = 'delete from T_WEIXIN_COMMENT where wid in (%s) and pid="%s"' % (','.join(map(lambda x: '"'+str(x)+'"', widSet)), officialAccountId)
        dbProxy.execute(sql)

        #insert to db
        if len(weixinValueList)>0:
            self.logger.info('Insert %d records to weixin', len(weixinValueList))
            dbProxy.execute(weixinSql +','.join(weixinValueList))
        if len(commentValueList)>0:
            self.logger.info('Insert %d records to comment', len(commentValueList))
            dbProxy.execute(commentSql +','.join(commentValueList))
        if len(replyValueList)>0:
            self.logger.info('Insert %d records to reply', len(replyValueList))
            dbProxy.execute(replySql +','.join(replyValueList))
        
    def __writeCsv(self, fileName, articleList):
        #f = codecs.open(fileName,"w","utf-8")
        csvfile = open(fileName, 'w')
        csvfile.write(codecs.BOM_UTF8)
        writer = csv.writer(csvfile)
        header = ['Title', 'Digest', 'Author', 'readCount', 'likeCount', 'publishDateTime', 'Comment-NickName', 'Comment-Content', 'Comment-likeCount', 'Comment-CreateTime']
        writer.writerow(header)
        for article in articleList:
            writer.writerow(
                            (
                              article.title.encode('utf8'),
                              article.digest.encode('utf8'),
                              article.author.encode('utf8'),
                              article.readCount,
                              article.likeCount,
                              time.strftime(ISOTIMEFORMAT, time.localtime(article.publishDateTime)),
                              '',
                              '',
                              '',
                              ''
                             )
                            )
            for comment in article.commentList:
                writer.writerow(
                                (
                                    '',
                                    '',
                                    '',
                                    '',
                                    '',
                                    '',
                                    comment.commenterNickName.encode('utf8'),
                                    comment.content.encode('utf8'),
                                    comment.likeCount,
                                    time.strftime(ISOTIMEFORMAT, time.localtime(comment.createTime))
                                 )
                                )
        csvfile.close()
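writeDb above builds its INSERT statements by %-formatting strings and escaping double quotes by hand. The sketch below shows the same article insert done with parameterized queries, assuming direct access to a DB-API connection (e.g. one created by MySQLdb.connect) rather than the project's MySqlProxy; the column list mirrors the T_WEIXIN statement above, everything else is illustrative.

# Sketch: parameterized insert for the article rows.
# `conn` is assumed to be a DB-API connection (for example MySQLdb.connect(...)).
import time

def insert_articles(conn, official_account_id, article_list):
    sql = ('INSERT INTO T_WEIXIN '
           '(pid, wid, author, title, digest, content, publish_datetime, read_count, like_count) '
           'VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)')
    rows = [(official_account_id, str(a.wid), a.author, a.title, a.digest, a.content,
             time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(a.publishDateTime)),
             a.readCount, a.likeCount) for a in article_list]
    cursor = conn.cursor()
    cursor.executemany(sql, rows)  # the driver quotes the values, so no manual escaping
    conn.commit()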
示例#8
0
class DouBanCrawler(object):
    '''
    classdocs
    '''

    def __init__(self,channel,logger=None):
        '''
        Constructor
        '''

        if logger is None:
            self.logger = Logging.getLogger(Logging.LOGGER_NAME_DEFAULT)
        else:
            self.logger = logger
        self.session = SessionCrawler(sleepRange=[3, 8])
        self.channel = channel
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': 'www.douban.com',
            'Upgrade-Insecure-Requests': '1'
        }

    def searchArticle(self, keywordList, endTime):
        '''
        Search articles by keyword list within the time range ending at endTime
        @param keywordList: list of keywords
        @param endTime: end of the search time range
        '''
        startTime = endTime - datetime.timedelta(hours=1)
        # startTime=datetime.datetime(2017,11,20,23)
        page = 0
        articleList = list()
        hasnext = True
        while hasnext:
            (articleListInPage, hasnext) = self.__searchByPage(keywordList, startTime, endTime, page)
            articleList.extend(articleListInPage)
            page += 50
        return articleList

    def __searchByPage(self,keywordList,startTime,endTime,page):
        # search via Douban's internal group search box
        page = str(page)  # the url expects the start offset as a string
        search_url = "https://www.douban.com/group/search?start="+page+"&cat=1013&sort=time&q="
        # cat: search by topic  sort: newest first  q: search keyword
        query = urllib.quote(' '.join(keywordList).encode('utf-8'))
        search_url = search_url+str(query)

        url_page = self.session.get(search_url,headers=self.headers)
        soup = BeautifulSoup(url_page, "lxml")
        # self.logger.debug(soup)
        main_wrap = soup.find('div', attrs={'class': "article"})
        main_article_list = main_wrap.find('div',attrs={'class':"topics"})
        articleList = list()
        hasnext = True
        if main_article_list is not None:
            title_list = main_article_list.findAll('tr', {'class': 'pl'})
            for title in title_list:
                article_publishtime = title.find('td', attrs={'class': "td-time"}).attrs['title']

                urlTime = time.strptime(article_publishtime, "%Y-%m-%d %H:%M:%S")

                Y, M, D, H = urlTime[0:4]
                urlTime2 = datetime.datetime(Y, M, D, H)
                urlTime2 = time.mktime(urlTime2.timetuple())
                # compare as float timestamps; note this fixes the window to the
                # last 2 days regardless of the startTime passed in
                startTime = endTime - datetime.timedelta(days=2)
                startTimeIntSecond = time.mktime(startTime.timetuple())
                endTimeIntSecond = time.mktime(endTime.timetuple())

                # crawl only posts inside the time window
                if urlTime2 >= startTimeIntSecond and urlTime2 <= endTimeIntSecond:
                    article_url = title.find('a').attrs['href']
                    self.logger.debug(article_url)
                    (content, author_name, tid,like_count) = self.crawlArticle(article_url)
                    if content is None:  # the topic has been deleted or uses an unexpected format
                        continue

                    article_title = title.find('td', attrs={'class': "td-subject"}).text
                    article = Article(tid,  self.channel.channel_id, article_title, content, article_publishtime, article_url, None,
                                      author_name)
                    self.crawlComment(article)
                    reply_count = title.find('td', attrs={'class': "td-reply"}).text.strip()
                    reply_count = re.sub(u'回应', '', reply_count)  # strip the Chinese word for "replies", keeping only the number
                    article.statistics.reply_count = reply_count
                    article.statistics.like_count = like_count
                    if article not in articleList:
                        articleList.append(article)

                else:
                    self.logger.debug('Collected %d articles so far', len(articleList))
                    hasnext = False
                    break

            return (articleList, hasnext)
        # no topic list was found on this page: stop paging
        return (articleList, False)


    def crawlStatistics(self, article):  # TODO: fetch just the statistics fields next time instead of re-crawling the whole page
        '''
        Crawl statistics
        @return: nothing; statistics are written into the article instance
        '''
        try:
            article_main = self.session.get(article.url, headers=self.headers)
            soup = BeautifulSoup(article_main, 'lxml')
            comment_list = soup.find('ul', attrs={'id': "comments"})  # other statistics can be added here if needed
            comment_list_li = comment_list.findAll('li')
            article.statistics.reply_count = len(comment_list_li)
            # like_count page
            url = article.url + '?type=like'
            article_main = self.session.get(url, headers=self.headers)
            soup = BeautifulSoup(article_main, 'lxml')
            main = soup.find('div', attrs={'class': "article"})
            like_count_wrap = main.find('div', attrs={'class': "list topic-fav-list"})
            like_count = like_count_wrap.findAll('li')
            article.statistics.like_count = len(like_count)
        except:
            self.logger.error(u'The topic has been deleted or uses an unexpected format')

    def crawlArticle(self, url):
        '''
        Crawl the article content and statistics for the given url
        @return: a (content, author_name, tid, like_count) tuple
        '''
        url = url+'?type=like'
        html = self.session.get(url,headers=self.headers)

        if html:
            soup = BeautifulSoup(html, 'lxml')  # 'html.parser' would also work here
            main = soup.find('div', attrs={'class': "article"})
            if main is not None:
                Tauthor = main.find('span', attrs={'class': "from"})
                if Tauthor is not None:
                    Tauthor = Tauthor.find('a').text.strip()
                else:
                    Tauthor = 'None'
                Tcontent = main.find('div', attrs={'class': "topic-content"}).text.strip()
                Tid_wrap = main.find('div', attrs={'class': "sns-bar"})
                Tid = Tid_wrap.find('a').attrs['data-tid']
                try:
                    like_count_wrap = main.find('div', attrs={'class': "list topic-fav-list"})
                    like_count = like_count_wrap.findAll('li')
                    like_count = len(like_count)
                except:
                    like_count = 0
                return (Tcontent, Tauthor, Tid, like_count)
            else:
                self.logger.error(u'The topic has been deleted or uses an unexpected format')
                return (None,None,None,None)

    def refreshSearch(self):
        '''
        Reset the search
        '''
        pass

    def refreshCommentCrawler(self):
        '''
        Reset the comment crawler
        '''
        self.lastCommentId = None

    def crawlComment(self, article):
        '''
        Crawl the comments of the given article and return them
        @return: (commentList, hasnext) tuple; commentList is a list of Comment instances, hasnext tells whether more pages remain
        '''
        #self.logger.debug('Article:%s', article)

        html = self.session.get(article.url, headers=self.headers)
        add_datetime = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        commentList = list()
        main_comment = BeautifulSoup(html,'lxml')
        try:
            commentList_html = main_comment.find('ul',attrs={'id':"comments"})
            commentList_html_li = commentList_html.findAll('li')
        except:
            self.logger.error(u'The page has been deleted or uses an unexpected format: %s', article.url)
            return (commentList, False)

        for comment in commentList_html_li:
            cid = comment.attrs['data-cid']
            comment_main_wrap = comment.find('div',attrs={'class':"reply-doc content"})
            user_id = comment_main_wrap.find('div',attrs={'class':"operation_div"}).attrs['id']
            user_info_wrap = comment_main_wrap.find('div',attrs={'class':"bg-img-green"})
            user_name = user_info_wrap.find('a').text.strip()
            publish_datetime = user_info_wrap.find('span').text.strip()
            content = comment_main_wrap.find('p').text.strip()
            reply_user_wrap = comment_main_wrap.find('div',attrs={'class':"reply-quote"})
            if reply_user_wrap:
                reply_userid_wrap=reply_user_wrap.find('span',attrs={'class':"pubdate"})
                reply_userid = reply_userid_wrap.find('a').attrs['href']
                reply_userid = re.sub(r'\D', "", reply_userid)
            else:
                reply_userid = ''
            # like_count_wrap = comment_main_wrap.find('div',attrs={'class':"operation_div"})
            # like_count = like_count_wrap.findAll('a')[1].text
            # like_count = re.sub(r'\D', "", like_count)  # the like count is hard to obtain; it is not embedded in the page
            like_count = None
            reply_count = None
            location_country = 'CN'

            commentList.append(Comment(article.tid, self.channel.channel_id, cid,
                                       add_datetime, publish_datetime,
                                       None, location_country, None, None,
                                       user_id, user_name, content, reply_userid,
                                       None, like_count, reply_count, dislike_count=None))

        return (commentList, False)  # during testing, article[0][222].content retrieves the content of the 222nd comment
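The page filter in __searchByPage above compares times by converting both ends of the window and each post's publish time to Unix timestamps. The same check, isolated as a small standard-library helper (the date format matches the one parsed above; the helper name is illustrative):

import time
import datetime

def in_window(publish_str, end_time, days=2):
    # publish_str like '2017-11-20 23:05:00'; window is [end_time - days, end_time]
    start_time = end_time - datetime.timedelta(days=days)
    publish_ts = time.mktime(time.strptime(publish_str, '%Y-%m-%d %H:%M:%S'))
    return time.mktime(start_time.timetuple()) <= publish_ts <= time.mktime(end_time.timetuple())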
示例#9
0
class WetchartpublicCrawler(object):
    '''
    classdocs
    '''
    def __init__(self, channel, logger=None):
        '''
        Constructor
        :param channel:
        :param logger:
        '''
        if logger is None:
            self.logger = Logging.getLogger(Logging.LOGGER_NAME_DEFAULT)
        else:
            self.logger = logger

        self.session = SessionCrawler(logger=self.logger)
        self.channel = channel

    def searchArticle(self, keywordList, endTime):
        '''
        Search articles by keyword list within the time range ending at endTime
        :param keywordList: list of keywords
        :param endTime: end of the search time range
        :return:
        '''
        for keyword in keywordList:
            pageUrl = SOUGO_WECHARTPUBLIC_URL_INIT % (keyword)
            self.logger.debug('pageUrl:%s', pageUrl)
            response = self.session.get(pageUrl,
                                        textRspOnly=False,
                                        headers=HEADERS_SOGOU)
            soup = BeautifulSoup(response.text)
            main = soup.find('ul', {'class': "news-list2"})
            if main is None:
                self.logger.error('Fail to parse:%s', response.text)
                continue
            li_list = main.findAll('li')

            for li in li_list:
                a_title = li.find('p', {'class': "tit"})
                if a_title is not None:
                    title = str(a_title.text.strip())
                    if title == keyword:
                        a_href = a_title.find('a')['href']
                        self.logger.debug(a_href)
                        self.crawlWetchartpublic(a_href)

    def crawlWetchartpublic(self, url):
        response = self.session.get(url,
                                    textRspOnly=False,
                                    headers=HEADERS_SOGOU_PUBLIC)
        soup = BeautifulSoup(response.text)
        script_list = soup.findAll('script')
        for li in script_list:
            li_str = str(li.text)
            sub_str1 = "msgList = "
            sub_str2 = '}]};'
            if li_str.find(sub_str1) != -1:
                index1 = li_str.find(sub_str1)
                index2 = li_str.find(sub_str2)
                main = str(li.text)[index1 + len(sub_str1):index2 + 3]
                articleJson = json.loads(main)
                articlelist = articleJson['list']
                for item in articlelist:
                    mes_info = item['app_msg_ext_info']
                    url = 'https://mp.weixin.qq.com' + mes_info['content_url']
                    url = url.replace('amp;', '')
                    self.crawlArticle(url)
                    multi_item_list = mes_info['multi_app_msg_item_list']
                    for multi_item in multi_item_list:
                        multi_url = 'https://mp.weixin.qq.com' + multi_item[
                            'content_url']
                        multi_url = multi_url.replace('amp;', '')
                        self.crawlArticle(multi_url)

    def crawlArticle(self, url):
        '''
        Crawl the article content and statistics for the given url
        :return: an Article instance (not implemented yet; currently only fetches the page)
        '''
        self.session.randomSleep()
        response = self.session.get(url, textRspOnly=False)

    def crawlStatistics(self, article):
        '''
        Crawl statistics
        :param article:
        :return: nothing; statistics are written into the article instance
        '''
        pass

    def refreshSearch(self):
        '''
        Reset the search
        '''
        pass

    def refreshCommentCrawler(self):
        '''
        Reset the comment crawler
        '''
        pass

    def crawlComment(self, article):
        '''
        Crawl the comments of the given article and return them
        :param article:
        :return: (commentList, hasnext) tuple; commentList is a list of Comment instances, hasnext tells whether more pages remain
        '''
        pass
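crawlWetchartpublic above recovers the article list by slicing the msgList assignment out of an inline <script> block. A standalone sketch of that extraction step; the wrapper shape ('msgList = {"list": [...]};') is an assumption about the page source, mirrored from the markers used above.

import json

def extract_msg_list(script_text):
    # assumption: the script contains something like: var msgList = {"list":[{...}]};
    start_marker = 'msgList = '
    end_marker = '}]};'
    i = script_text.find(start_marker)
    if i == -1:
        return []
    j = script_text.find(end_marker)
    blob = script_text[i + len(start_marker):j + 3]  # keep the closing '}]}', drop the ';'
    return json.loads(blob)['list']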
示例#10
0
class ZhihuCrawler(object):
    '''
    classdocs
    '''
    def __init__(self, channel, logger=None):
        '''
        Constructor
        '''
        if logger is None:
            self.logger = Logging.getLogger(Logging.LOGGER_NAME_DEFAULT)
        else:
            self.logger = logger

        self.baiduCrawler = BaiduCrawler(self.logger)
        self.session = SessionCrawler()
        self.channel = channel
        self.nextCommentUrl = None

    def searchArticle(self, keywordList, endTime):
        '''
        根据关键字数组,开始时间和结束时间范围搜索文章
        @param keywordList: 关键字数组
        @param endTime: 搜索时间范围结束
        '''
        startTime = endTime - datetime.timedelta(
            days=self.channel.search_ranges)
        startTimeIntSecond = time.mktime(startTime.timetuple())
        endTimeIntSecond = time.mktime(endTime.timetuple())
        urls = self.baiduCrawler.search(self.channel.url, keywordList,
                                        startTimeIntSecond, endTimeIntSecond)
        articleList = list()
        for baiduUrl in urls:
            url = self.__fetchRealUrlFromBaiduUrl(baiduUrl)
            article = self.crawlArticle(url)
            if article is not None and article not in articleList:
                # the same article can appear several times in the results; Baidu result urls are unique but may point to the same article, so deduplicate
                articleList.append(article)
        return articleList

    def __fetchRealUrlFromBaiduUrl(self, baiduUrl):
        '''
        Follow the Baidu redirect (302) and return the real target url
        '''
        response = self.session.session.get(baiduUrl, allow_redirects=False)
        if response.status_code == 302:
            return response.headers['Location']

    def crawlArticle(self, url):
        '''
        Crawl the article content and statistics for the given url
        @return: an Article instance
        '''
        # check the url format: Baidu results are not always questions, and only questions are crawled for now
        if url.find('question') < 0:
            self.logger.warn('Question supported only:%s', url)
            return None
        article_id = re.findall(r'question/(\d+)', url)[0]
        self.session.randomSleep()
        response = self.session.get(url, headers=CRAWL_ARTICLE_HEADERS)
        soup = BeautifulSoup(response)
        main = soup.find('div', attrs={'id': "data"}).attrs['data-state']
        articleJson = json.loads(main)
        questionJson = articleJson['entities']['questions'][article_id]
        title = questionJson['title']
        contentSoup = BeautifulSoup(questionJson['editableDetail'])
        content = contentSoup.text
        author_id = questionJson['author']['id']
        author_name = questionJson['author']['name']
        createTimeInFloat = questionJson['created']
        publish_datetime = time.strftime('%Y-%m-%d %H:%M:%S',
                                         time.localtime(createTimeInFloat))
        reply_count = questionJson['commentCount']
        read_count = questionJson['visitCount']
        collect_count = questionJson['followerCount']
        article = Article(article_id, self.channel.channel_id, title, content,
                          publish_datetime, url, author_id, author_name)
        article.statistics.reply_count = reply_count
        article.statistics.read_count = read_count
        article.statistics.collect_count = collect_count
        return article

    def crawlStatistics(self, article):
        '''
        Crawl statistics
        @return: nothing; statistics are written into the article instance
        '''
        articleCopy = self.crawlArticle(article.url)
        article.statistics.reply_count = articleCopy.statistics.reply_count
        article.statistics.read_count = articleCopy.statistics.read_count
        article.statistics.collect_count = articleCopy.statistics.collect_count

    def refreshSearch(self):
        '''
        Reset the search
        '''
        pass

    def refreshCommentCrawler(self):
        '''
        Reset the comment crawler
        '''
        self.nextCommentUrl = None

    def crawlComment(self, article):
        '''
        Crawl the comments of the given article and return them
        @return: (commentList, hasnext) tuple; commentList is a list of Comment instances, hasnext tells whether more pages remain
        '''
        if self.nextCommentUrl is None:
            curl = COMMENT_URL_TEMPLATE % (article.tid, COMMENT_PAGE_SIZE, 0)
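            # the template presumably stores '%' as '#' so the %-formatting above does not clash; restored on the next line (assumption)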
            curl = curl.replace('#', '%')
        else:
            curl = self.nextCommentUrl
        self.session.randomSleep()
        result = self.session.get(curl, headers=CRAWL_COMMENT_HEADERS)
        jo = json.loads(result)
        paging = jo['paging']
        hasnext = not paging['is_end']
        self.nextCommentUrl = paging['next']
        dataList = jo['data']
        add_datetime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        commentList = list()
        for data in dataList:
            #self.logger.debug('[Zhihu]Comment data keys:%s', data.keys())
            #self.logger.debug('[ZHIHU]Comment url for %s:%s', article.title, data['url'])
            publish_datetime = time.strftime(
                '%Y-%m-%d %H:%M:%S', time.localtime(data['updated_time']))
            comment = Comment(article.tid,
                              article.channel_id,
                              data['id'],
                              add_datetime,
                              publish_datetime,
                              ip_address=None,
                              location_country=None,
                              location_region=None,
                              location_city=None,
                              author_id=data['author']['id'],
                              author_name=data['author']['name'],
                              content=data['content'],
                              reply_author_id=None,
                              read_count=None,
                              like_count=data['voteup_count'],
                              reply_count=data['comment_count'],
                              dislike_count=None)
            commentList.append(comment)
        return (commentList, hasnext)
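crawlComment above walks the Zhihu comment API by following paging['next'] until paging['is_end'] is true. A generic sketch of that cursor-following loop, using requests in place of the project's SessionCrawler (an assumption) so it can run on its own:

import json
import requests

def fetch_all_pages(first_url, headers=None):
    # follow the API's own 'next' cursor until it reports is_end
    url, items = first_url, []
    while url:
        jo = json.loads(requests.get(url, headers=headers).text)
        items.extend(jo['data'])
        url = None if jo['paging']['is_end'] else jo['paging']['next']
    return items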
示例#11
0
class WechatCrawler(object):
    '''
    classdocs
    '''
    def __init__(self, channel, logger=None):
        '''
        Constructor
        '''
        if logger is None:
            self.logger = Logging.getLogger(Logging.LOGGER_NAME_DEFAULT)
        else:
            self.logger = logger

        self.session = SessionCrawler(logger=self.logger)
        self.session_public = SessionCrawler(logger=self.logger)
        self.session_public_article = SessionCrawler(logger=self.logger)
        self.channel = channel
        self.entityId = 'SYSU'

    def searchArticle(self, keywordList, endTime):
        '''
        Search articles by keyword list within the time range ending at endTime
        @param keywordList: list of keywords
        @param endTime: end of the search time range
        '''
        startTime = endTime - datetime.timedelta(
            days=self.channel.search_ranges)
        startTimeStr = startTime.strftime('%Y-%m-%d')
        endTimeStr = endTime.strftime('%Y-%m-%d')
        self.logger.debug('startTime:%s', startTimeStr)
        self.logger.debug('endTime:%s', endTimeStr)
        urlList = list()
        publicList = self.getPublic()
        articleList = self.searchPublic(publicList, endTime)
        for keyword in keywordList:
            # ignore the first request; it does not carry the time range
            pageUrl = (SOUGO_WEIXIN_URL_INIT % (urllib.quote(
                keyword.encode('utf-8')), int(time.time() * 1000))).replace(
                    '#', '%')
            self.logger.debug('pageUrl:%s', pageUrl)
            self.session.randomSleep()
            response = self.session.get(pageUrl,
                                        textRspOnly=False,
                                        headers=HEADERS_SOGOU)
            lastPageUrl = pageUrl
            pageUrl = SOUGO_WEIXIN_URL_SUB_PAGE % (urllib.quote(
                keyword.encode('utf-8')), startTimeStr, endTimeStr)
            self.logger.debug('pageUrl:%s', pageUrl)
            while True:
                # 用上一次url作为这次的referer
                headers = HEADERS_SOGOU.copy()
                headers['Referer'] = lastPageUrl
                self.session.randomSleep()
                response = self.session.get(pageUrl,
                                            textRspOnly=False,
                                            headers=headers)
                soup = BeautifulSoup(response.text, 'lxml')
                main = soup.find('ul', {'class': "news-list"})
                if main is None:
                    self.logger.error('Fail to parse:%s', response.text)
                    return []
                li_list = main.findAll('li')
                # li_list may be empty even though there are still more pages
                for li in li_list:
                    a_list = li.findAll('a')
                    for a in a_list:
                        if a['uigs'].startswith('article_title'):
                            #self.logger.debug('Article title:%s',a.text)
                            urlList.append((a['href'], pageUrl, a.text))
                            break
                pageBarList = soup.findAll('div', {'id': 'pagebar_container'})
                if len(pageBarList) == 0:
                    # no pagination bar, exit
                    break
                pageBar = pageBarList[0]
                aList = pageBar.findAll('a')
                foundNextPage = False
                for a in aList:
                    if a['uigs'] == 'page_next':
                        foundNextPage = True
                        lastPageUrl = pageUrl
                        pageUrl = SOGOU_URL + a['href']
                        self.logger.debug('Found next page:%s', a.text)
                        break
                if foundNextPage is False:
                    break
        for item in urlList:
            article = self.crawlArticle(item[0],
                                        referer=item[1],
                                        title=item[2],
                                        flag=0)
            if article is not None:
                if article not in articleList:
                    # the same article can appear several times in the results; Baidu result urls are unique but may point to the same article, so deduplicate
                    articleList.append(article)
        return articleList

    def searchPublic(self, keywordList, endTime):
        '''
        Search official accounts by keyword list within the time range ending at endTime
        :param keywordList:
        :param endTime:
        :return:
        '''
        articleList = list()
        for keyword in keywordList:
            self.logger.debug(keyword)
            pageUrl = SOUGO_WECHARTPUBLIC_URL_INIT % (keyword[0])
            self.logger.info('pageUrl:%s', pageUrl)
            self.session.randomSleep()
            response = self.session.get(pageUrl,
                                        textRspOnly=False,
                                        headers=HEADERS_SOGOU)
            soup = BeautifulSoup(response.text, 'lxml')
            main = soup.find('ul', {'class': "news-list2"})

            if main is None:
                self.logger.error('Fail to parse:%s', response.text)
                continue
            li_list = main.findAll('li')

            for li in li_list:
                a_title = li.find('p', {'class': "tit"})
                if a_title is not None:
                    title = str(a_title.text.strip())
                    if title == keyword[0]:
                        self.logger.debug(title)
                        a_href = a_title.find('a')['href']
                        sub_articleList = self.crawlWetchartpublic(a_href)
                        for article in sub_articleList:
                            articleList.append(article)
        return articleList

    def crawlWetchartpublic(self, url):
        '''
        Crawl the articles of an official account
        :param url:
        :return:
        '''
        self.logger.debug(url)
        self.session_public.randomSleep()
        response = self.session_public.get(url,
                                           textRspOnly=False,
                                           headers=HEADERS_ARTICLE)
        soup = BeautifulSoup(response.text, 'lxml')
        self.logger.debug(soup)
        script_list = soup.findAll('script')
        # if len(script_list) == 0:
        # script_list = self.isCrawlerPublic(url)

        articleList = list()
        for li in script_list:
            li_str = str(li.text)
            sub_str1 = "msgList = "
            sub_str2 = '}]};'
            if li_str.find(sub_str1) != -1:
                index1 = li_str.find(sub_str1)
                index2 = li_str.find(sub_str2)
                main = str(li.text)[index1 + len(sub_str1):index2 + 3]
                articleJson = json.loads(main)
                articlelistJson = articleJson['list']
                for item in articlelistJson:
                    mes_info = item['app_msg_ext_info']
                    url = 'https://mp.weixin.qq.com' + mes_info['content_url']
                    url = url.replace('amp;', '')
                    self.logger.debug('article_url:' + url)
                    article = self.crawlArticle(url, flag=1)
                    articleList.append(article)
                    multi_item_list = mes_info['multi_app_msg_item_list']
                    for multi_item in multi_item_list:
                        multi_url = 'https://mp.weixin.qq.com' + multi_item[
                            'content_url']
                        multi_url = multi_url.replace('amp;', '')
                        self.logger.debug('article_url:' + multi_url)
                        article = self.crawlArticle(multi_url, flag=1)
                        if article is not None:
                            articleList.append(article)
        return articleList

    def crawlStatistics(self, article):
        '''
        Crawl statistics
        @return: nothing; statistics are written into the article instance
        '''
        pass

    def crawlArticle(self, url, **kwargs):
        '''
        Crawl the article content and statistics for the given url
        @return: an Article instance
        '''
        # TBD: forwarded/reposted articles are not handled yet
        searchTitle = ''
        referer = None
        if (kwargs['flag'] == 0):
            if 'referer' not in kwargs:
                return None
            if 'title' in kwargs:
                searchTitle = kwargs['title']
            else:
                searchTitle = ''
        try:
            self.session_public_article.randomSleep()
            if (kwargs['flag'] == 0):
                referer = kwargs['referer']
                headers = HEADERS_ARTICLE.copy()
                headers['Referer'] = referer
                response = self.session_public_article.get(
                    url, textRspOnly=False, headers=headers)
            else:
                response = self.session_public_article.get(
                    url, textRspOnly=False, headers=HEADER_PUBLIC_ARTICLE)
            mid = re.findall(r'var mid = .*"(\d+)";',
                             response.text)[0] + '-' + re.findall(
                                 r'var idx = .*"(\d+)";', response.text)[0]
            soup = BeautifulSoup(response.text, 'lxml')
            main = soup.find('div', {'id': "img-content"})
            title = main.find('h2').text.strip()
            content = main.find('div', {'id': "js_content"}).text.strip()
            profile = main.find('div', {'class': "profile_inner"})
            author_id = profile.find('span').text.strip()
            author_name = profile.find('strong').text.strip()
            try:
                publish_datetime = main.find('em', {
                    'id': "post-date"
                }).text.strip() + ' 00:00:00'
                publish_datetime = datetime.datetime.strptime(
                    publish_datetime, '%Y-%m-%d %H:%M:%S')
            except:
                self.logger.warn(
                    'Fail to parse publish_datetime, falling back to the current time')
                publish_datetime = datetime.datetime.now().strftime(
                    '%Y-%m-%d %H:%M:%S')
            article = Article(mid,
                              self.channel.channel_id,
                              title,
                              content,
                              publish_datetime=publish_datetime,
                              url=url,
                              author_id=author_id,
                              author_name=author_name,
                              meta_info='{refer="%s"}' % referer)
            self.logger.debug('Successfully parse article:%s', title)

            return article
        except:
            self.logger.error('Fail to get article for %s: %s due to %s', url,
                              searchTitle, traceback.format_exc())
            return None

    def refreshSearch(self):
        '''
        Reset the search
        '''
        pass

    def refreshCommentCrawler(self):
        '''
        Reset the comment crawler
        '''
        pass

    def crawlComment(self, article):
        '''
        Crawl the comments of the given article and return them
        @return: (commentList, hasnext) tuple; commentList is a list of Comment instances, hasnext tells whether more pages remain
        '''
        return (list(), False)

    def getPublic(self):
        self.conf = self.__readDbConf()
        publictablename = Constants.TABLE_SA_WETCHARTPUBLIC + Constants.TABLE_NAME_DELIMITER + self.entityId
        sql = '''
                SELECT public_name FROM %s
              ''' % (publictablename)
        dbProxy = MySqlProxy(self.conf[CONF_FILE_DBCONF][CONF_FILE_DBHOST],
                             3306,
                             self.conf[CONF_FILE_DBCONF][CONF_FILE_DBUSER],
                             self.conf[CONF_FILE_DBCONF][CONF_FILE_DBPASS],
                             self.conf[CONF_FILE_DBCONF][CONF_FILE_DBNAME])
        # dbProxy = MySqlProxy('localhost', 3306, 'root', 'zzm15331411', 'sentiment_re')
        # dbProxy = MySqlProxy('112.124.47.197', 3306, 'test', 'test', 'sa2')
        dbProxy.execute(sql)
        resultList = dbProxy.fetchall()
        # resultList = [(u'今日中大',),]
        return resultList

    def __readDbConf(self):
        fileName = os.path.join('conf', CONF_FILE_NAME)
        c = Configuration(fileName)
        return c.readConfig()

    def getProxyPageList(self, url):
        url_list = []
        for i in range(1, 100):
            url_new = url + str(i)
            url_list.append(url_new)

        return url_list

    def getProxyIpList(self, url):
        response = self.session.get(url,
                                    textRspOnly=False,
                                    headers=PROXY_IP_URL_HEADER,
                                    timeout=2)
        host_list = etree.HTML(response.text).xpath(
            '//table[contains(@id,"ip_list")]/tr/td[2]/text()')
        port_list = etree.HTML(response.text).xpath(
            '//table[contains(@id,"ip_list")]/tr/td[3]/text()')
        ip_list = list()
        for i in range(0, len(host_list)):
            ip = host_list[i] + r':' + port_list[i]
            ip_list.append(ip)
        return ip_list

    def verifyIp(self, ip):
        proxy = {'http': 'http://' + ip}
        proxy_handler = urllib2.ProxyHandler(proxy)
        opener = urllib2.build_opener(proxy_handler)
        urllib2.install_opener(opener)

        test_url = 'http://www.baidu.com'
        req = urllib2.Request(url=test_url, headers=PROXY_IP_URL_HEADER)
        try:
            res = urllib2.urlopen(req)
            content = res.read()
            if content:
                self.logger.debug(r'https://' + ip + ' is OK')
            else:
                self.logger.debug(r'https://' + ip + ' is BAD')
                ip = ""
            return ip
        except urllib2.URLError as e:
            self.logger.debug(r'https://' + ip + ' ' + str(e.reason))
            return ""
        except:
            self.logger.debug(r'https://' + ip + ' Other Error')
            return ""

    def getVaildIp(self):
        page_url_list = self.getProxyPageList(PROXY_IP_URL)
        ip_list = list()
        for page_url in page_url_list:
            page_ip_list = self.getProxyIpList(page_url)
            for ip in page_ip_list:
                ip = self.verifyIp(ip)
                if ip != "":
                    ip_list.append(ip)
        return ip_list

    def isCrawlerPublic(self, url):
        data_script_list = list()
        page_url_list = self.getProxyPageList(PROXY_IP_URL)
        flag = 0
        for page_url in page_url_list:
            page_ip_list = self.getProxyIpList(page_url)
            flag = 0
            for ip in page_ip_list:
                flag = 0
                ip = self.verifyIp(ip)
                if ip != "":
                    # self.session_public.randomSleep()
                    proxy = {'http': r'http://' + ip}
                    try:
                        response = self.session_public.get(
                            url,
                            textRspOnly=True,
                            headers=HEADERS_ARTICLE,
                            proxies=proxy)
                        soup = BeautifulSoup(response.text, 'lxml')
                        self.logger.debug(response.text)
                        script_list = soup.findAll('script')
                        if len(script_list) != 0:
                            flag = 0
                            for li in script_list:
                                li_str = str(li.text)
                                sub_str1 = "msgList = "
                                if li_str.find(sub_str1) != -1:
                                    data_script_list.append(li)
                                    flag = 1
                                    break
                            if flag == 1:
                                break
                    except:
                        self.logger.debug(
                            'This ip cannot be used to crawl the official account page')
            if flag == 1:
                break
        if (flag == 0):
            self.logger.debug('No usable proxy ip was found for crawling the official account page')

        return data_script_list
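getVaildIp above checks each scraped proxy by routing a request to a test page through urllib2 with a ProxyHandler. The same check expressed with the requests library, as a sketch; the test URL is carried over from the code above and the timeout value is an assumption.

import requests

def verify_ip(ip):
    # ip is 'host:port'; return it if a test request succeeds through the proxy, else ''
    proxies = {'http': 'http://' + ip}
    try:
        r = requests.get('http://www.baidu.com', proxies=proxies, timeout=5)
        return ip if r.status_code == 200 else ''
    except requests.RequestException:
        return ''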
示例#12
0
class WeiboCrawler(object):
    '''
    classdocs
    '''
    def __init__(self, channel, logger=None):
        '''
        Constructor
        '''
        if logger is None:
            self.logger = Logging.getLogger(Logging.LOGGER_NAME_DEFAULT)
        else:
            self.logger = logger

        self.channel = channel
        self.entityId = 'SYSU'
        self.user_name = None
        self.pass_word = None

        self.email = SendEmail()
        self.db = InsertDB()
        self.monitor_title = 'Weibo crawler monitor'
        self.HTTP = 'https:'

        self.session = SessionCrawler(sleepRange=[3, 8])

    def change_cookie(self, user_name=None):
        '''
        Randomly pick a cookie (by user name)
        '''
        # usename_list = [
        #             '18814095644','13432881156','*****@*****.**','15018377821','*****@*****.**',
        #             '15767199023','13427287354','13532011721','13640792755','13794342903',
        #             '17825769929','18211493432','17827278983','13922771190','15999916968',
        #             '15119820746','15986585396','13430915912','15012471375','17880567972',
        #             '17876156948','15915132451','13543985544','13717382951','13640695490',
        #             '15711707673','13680181412','13414759320','17820956139','18476072534',
        #             '17806699214','13418852766','17827181603','15919354070','15088137907'
        #                ]
        user_name_list = [
            '13432881156', '*****@*****.**', '15018377821',
            '*****@*****.**', '15767199023', '18814095644'
        ]
        if user_name is None:
            user_name = random.choice(user_name_list)

        self.user_name = user_name

    def get_username_password(self, user_name=None):
        '''
        Randomly assign a user name and password
        :return:
        '''
        # user_name_password1 = '18814095644:ljda.18814095644'
        # user_name_password2 = '13432881156:liang452035397'
        # user_name_password3 = '[email protected]:810214bee810214'
        # user_name_password4 = '15018377821:zzm15331411'
        # user_name_password5 = '15767199023:j980216'
        # user_name_password6 = '[email protected]:uwinvip'

        user_dict = {
            '18814095644': 'ljda.18814095644',
            '13432881156': 'liang452035397',
            '*****@*****.**': '810214bee810214',
            '15018377821': 'zzm15331411',
            '15767199023': 'j980216',
            '*****@*****.**': 'uwinvip'
        }

        user_name_list = [
            '13432881156', '*****@*****.**', '15018377821',
            '*****@*****.**', '15767199023', '18814095644'
        ]
        # user_list = [user_name_password1, user_name_password2,
        #              user_name_password3, user_name_password4,
        #              user_name_password5, user_name_password6]

        # use the given user_name if provided, otherwise pick one at random
        if user_name is None:
            user_name = random.choice(user_name_list)

        user_password = user_dict[user_name]

        self.user_name = user_name
        self.pass_word = user_password

    def searchArticle(self, keyword_list, end_time):
        '''
        Search articles by keyword list within the time range ending at end_time
        @param keyword_list: list of keywords
        @param end_time: end of the search time range
        '''
        # manually refresh the cookie
        # self.__login()

        run_msg = 'The Weibo crawler has started running'
        self.db.Insert(self.channel.channel_id, self.entityId, run_msg)

        # search time window
        start_time = end_time - datetime.timedelta(hours=2.5)

        page = 1
        count_page = 0
        article_list = list()
        page_url_list = self.__search_page_url(keyword_list, start_time,
                                               end_time, page)

        # if nothing was returned, log in with this user_name to obtain a fresh cookie
        if len(page_url_list) == 0:
            self.__login(self.user_name)
            page_url_list = self.__search_page_url(keyword_list, start_time,
                                                   end_time, page)

        for page_url in page_url_list:
            time.sleep(1.5)
            count_page += 1
            self.logger.debug('Start to crawl: %s' % page_url)
            article_list_page = self.__parse_page_url(page_url, count_page)
            article_list.extend(article_list_page)

        if len(article_list) == 0:
            article_msg = 'The Weibo crawler fetched no data'
            self.email.send(self.monitor_title, article_msg)
            self.db.Insert(self.channel.channel_id, self.entityId, article_msg)

        return article_list

    def __search_page_url(self, keywordList, startTime, endTime, page):
        '''
        :param keywordList: keywords
        :param startTime: search start time
        :param endTime: search end time
        :param page: page number
        :return: the urls of the Weibo result-list pages
        '''
        query = urllib.quote(' '.join(keywordList).encode('utf-8'))
        params = {
            'typeall': '1',
            'suball': '1',  # include all types
            # custom time range; Weibo's search expects the %Y-%m-%d-%H format here
            'timescope': 'custom:%s:%s' % (startTime.strftime("%Y-%m-%d-%H"),
                                           endTime.strftime("%Y-%m-%d-%H")),
            'Refer': 'g',
            'page': page
        }
        user_agent = random.choice(AGENTS)
        headers = {
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Host': 's.weibo.com',
            'Referer': 'http://s.weibo.com/',
            'User-Agent': user_agent
        }

        index_url = 'http://s.weibo.com/weibo/' + query  # search base url plus the keyword query

        # pick a cookie at random
        self.change_cookie()
        self.logger.debug('Use Cookie %s' % self.user_name)
        # if the loaded cookie is missing or invalid, log in again
        try:
            cookies = self.__load_cookies_from_lwp(self.user_name)
            html = self.session.get(index_url,
                                    params=params,
                                    headers=headers,
                                    cookies=cookies)  # use locally stored cookies
            response = etree.HTML(html)
            # collect the urls of all result pages
            page_urls = response.xpath(
                '//div[@class="m-page"]//span[@class="list"]//li/a/@href')
            page_url_list = list()
            for page in page_urls:
                url = self.HTTP + page
                page_url_list.append(url)
                self.logger.debug(url)
            return page_url_list

        except Exception as e:
            loginFlag = self.__login(self.user_name)
            self.logger.debug('Use username: %s' % self.user_name)
            if loginFlag is False:
                self.logger.error('Fail to login')
                login_msg = 'Weibo login failed! Username is %s' % self.user_name
                self.email.send(self.monitor_title, login_msg)
                self.db.Insert(self.channel.channel_id, self.entityId,
                               login_msg)
                return

            cookies = self.__load_cookies_from_lwp(self.user_name)
            self.logger.debug('Get a new Cookie: %s' % cookies)
            try:
                html = self.session.get(index_url,
                                        params=params,
                                        headers=headers,
                                        cookies=cookies)  # use locally stored cookies
                response = etree.HTML(html)
                # collect the urls of all result pages
                page_urls = response.xpath(
                    '//div[@class="m-page"]//span[@class="list"]//li/a/@href')
                page_url_list = list()
                for page in page_urls:
                    url = self.HTTP + page
                    page_url_list.append(url)
                return page_url_list
            except Exception as e:
                self.logger.warning('Crawler failed: %s' % e)
                msg = 'No return data, because of %s' % e
                self.email.send(self.monitor_title, msg)
                self.db.Insert(self.channel.channel_id, self.entityId, msg)

    def __parse_page_url(self, page_url, count_page=0):
        '''
        Parse the Weibo posts on one result page
        @return: article_list
        '''
        user_agent = random.choice(AGENTS)
        headers = {
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Host': 's.weibo.com',
            'Referer': 'http://s.weibo.com/',
            'User-Agent': user_agent
        }

        self.change_cookie()
        cookies = self.__load_cookies_from_lwp(self.user_name)
        response = self.session.get(url=page_url,
                                    headers=headers,
                                    cookies=cookies)

        # If this cookie is invalid, log in again to obtain a fresh one
        if re.match(r'File not .*', response) is not None:
            self.__login(self.user_name)
            cookies = self.__load_cookies_from_lwp(self.user_name)
            response = self.session.get(url=page_url,
                                        headers=headers,
                                        cookies=cookies)

        html = etree.HTML(response)
        # List of result cards
        card_wrap_list = html.xpath(
            '//div[@class="m-wrap"]/div[@id="pl_feedlist_index"]//div[@class="card-wrap"]'
        )
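        # Each card-wrap div is one search result; its "mid" attribute is the weibo
        # message id. Cards without a mid are skipped in the loop below.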
        article_list = list()
        for card_wrap in card_wrap_list:
            try:
                mid = card_wrap.xpath('./@mid')[0]
            except Exception as e:
                continue
            info = card_wrap.xpath('.//div[@class="content"]')[0]
            author_name = info.xpath('.//a[@class="name"]/text()')[0]
            author_id = re.search(
                r'/(\d+).?refer_flag',
                info.xpath('.//a[@class="name"]/@href')[0]).group(1)

            txt = info.xpath('.//p[@class="txt"]')[0]
            content = txt.xpath('string(.)').strip()
            title = content[:30].replace("'", "''").replace("%", "\\%").replace(
                ":", "\\:") + ' '

            article_info = info.xpath('.//p[@class="from"]/a')[0]
            article_url = self.HTTP + article_info.xpath('./@href')[0]

            statistics = card_wrap.xpath(
                './/div[@class="card-act"]//li/a/text()')

            collect_count = re.search(r'(\d+)', statistics[0])
            forward_count = re.search(r'(\d+)', statistics[1])
            reply_count = re.search(r'(\d+)', statistics[2])
            like_count = card_wrap.xpath(
                './/div[@class="card-act"]//li/a/em/text()')
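            # The action bar exposes favourite/forward/reply counts as link text (digits
            # optional) and the like count as an <em> element; anything missing or empty
            # falls back to 0 below.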

            if collect_count:
                collect_count = collect_count.group(1).strip()
            else:
                collect_count = 0
            if forward_count:
                forward_count = forward_count.group(1).strip()
            else:
                forward_count = 0
            if reply_count:
                reply_count = reply_count.group(1).strip()
            else:
                reply_count = 0
            if len(like_count):
                like_count = like_count[0]
            else:
                like_count = 0

            publish_datetime = self.__parse_time(article_url, cookies)

            article = Article(mid,
                              self.channel.channel_id,
                              title,
                              content,
                              url=article_url,
                              publish_datetime=publish_datetime,
                              author_id=author_id,
                              author_name=author_name)

            article.statistics.collect_count = collect_count
            article.statistics.forward_count = forward_count
            article.statistics.reply_count = reply_count
            article.statistics.like_count = like_count

            if article not in article_list:
                article_list.append(article)

        self.logger.debug('page %d: crawled %d articles' %
                          (count_page, len(article_list)))
        return article_list

    def __parse_time(self, article_url, cookies):
        time.sleep(1.5)
        url = '%s&type=comment' % article_url
        user_agent = random.choice(AGENTS)
        headers = {
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'User-Agent': user_agent
        }

        response = self.session.get(url, headers=headers, cookies=cookies)
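        # The comment view of the post appears to embed the creation time as an escaped
        # attribute such as date=\"1530000000000\" (epoch milliseconds); the first ten
        # digits are used as epoch seconds. On any failure the current time is used.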
        try:
            date_str = re.search(r'date=\\"(\d+)\\"', response).group(1)
            localtime = time.localtime(int(date_str[0:10]))
            publish_datetime = time.strftime('%Y-%m-%d %H:%M:%S', localtime)
        except Exception as e:
            publish_datetime = datetime.datetime.now().strftime(
                '%Y-%m-%d %H:%M:%S')

        return publish_datetime

    def crawlStatistics(self, article):
        '''
        Crawl interaction statistics for the article
        @return: nothing; the statistics are written into the article instance
        '''
        # return
        try:
            (data, check) = self.__fetchSingleArticle(article)
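            # __fetchSingleArticle returns (page_html, '0') on success, ({}, '1') when
            # the embedded detail script is missing or unparsable, and ({}, '2') when
            # Weibo serves its "page error" response.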

            if check == '0':
                soup = BeautifulSoup(data, 'lxml')
                ulList = soup.findAll(
                    'ul', {'class': 'WB_row_line WB_row_r4 clearfix S_line2'})
                li_list = ulList[0].findAll('li')
                self.__parseStatistics(article, li_list)
            elif check == '1':
                self.logger.warning(u'The requested page returned 404: %s', article.url)
                return
            else:
                self.logger.warning(u'The page address is invalid or the page does not exist: %s', article.url)
                return
        except:
            self.logger.error('Fail to fetch statistics for:%s, %s',
                              article.url, traceback.format_exc())
            return

    def __parseStatistics(self, article, soup_li_list):
        # New-style stats bar: favourites / forwards / replies / likes
        collect_count = soup_li_list[0].find('span').text
        collect_count = re.findall(r'\d+', collect_count)
        if len(collect_count) > 0:
            collect_count = int(collect_count[0])
        else:
            collect_count = 0
        forward_count = soup_li_list[1].find('span').text

        forward_count = re.findall(r'\d+', forward_count)
        if len(forward_count) > 0:
            forward_count = int(forward_count[0])
        else:
            forward_count = 0

        reply_count = soup_li_list[2].find('span').text
        reply_count = re.findall(r'\d+', reply_count)
        if len(reply_count) > 0:
            reply_count = int(reply_count[0])
        else:
            reply_count = 0

        like_count = soup_li_list[3].find('span').text
        like_count = re.findall(r'\d+', like_count)
        if len(like_count) > 0:
            like_count = int(like_count[0])
        else:
            like_count = 0
        article.statistics.reply_count = reply_count
        article.statistics.like_count = like_count
        article.statistics.collect_count = collect_count
        article.statistics.forward_count = forward_count

    def __fetchSingleArticle(self, article):
        '''
        Fetch the article detail page for the given article url
        '''
        user_agent = [
            "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.110 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36 Core/1.53.2595.400 QQBrowser/9.6.10872.400",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/602.2.14 (KHTML, like Gecko) Version/10.0.1 Safari/602.2.14",
            "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:49.0) Gecko/20100101 Firefox/49.0",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0;  Trident/5.0)",
        ]
        random_header = random.choice(user_agent)
        headers = {
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': 'weibo.com',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': random_header
        }

        data2 = {'type': 'comment'}
        print(article.url)
        time.sleep(2)
        self.change_cookie()
        cookies = self.__load_cookies_from_lwp(self.user_name)
        html = self.session.get(article.url,
                                headers=headers,
                                params=data2,
                                cookies=cookies)
        # If the cookie is invalid, log in again to get a new one
        if re.match(r'File not .*', html) is not None:
            self.__login(self.user_name)
            cookies = self.__load_cookies_from_lwp(self.user_name)
            html = self.session.get(article.url,
                                    headers=headers,
                                    params=data2,
                                    cookies=cookies)
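        # A logged-in weibo.com detail page embeds its content in
        # <script>FM.view({"ns":"pl.content.weiboDetail.index", ..., "html": "..."})</script>;
        # the JSON between "FM.view(" and the closing ")" is parsed and its "html" field
        # is returned as the page body.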

        if html.find('<div class="page_error">') == -1:
            lindex = html.find(
                '<script>FM.view({"ns":"pl.content.weiboDetail.index"')
            rindex = html[lindex:].find('</script>')
            rindex = lindex + rindex - 1
            lindex = lindex + len('<script>FM.view(')
            # self.logger.debug(html[lindex:rindex])
            try:
                jo = json.loads(html[lindex:rindex])
                data = jo['html']  # the rendered weibo detail page
                return (data, '0')
            except:
                return ({}, '1')
        else:
            return ({}, '2')

    def __get_username(self, user_name):
        """
        get legal username
        """
        username_quote = urllib.quote(user_name)
        username_base64 = base64.b64encode(username_quote.encode("utf-8"))
        return username_base64.decode("utf-8")

    def __get_password(self, servertime, nonce, pubkey):
        """
        get legal password
        """
        string = (str(servertime) + "\t" + str(nonce) + "\n" +
                  str(self.pass_word)).encode("utf-8")
        public_key = rsa.PublicKey(int(pubkey, 16), int("10001", 16))
        password = rsa.encrypt(string, public_key)
        password = binascii.b2a_hex(password)
        return password.decode()

    def __get_json_data(self, su_value):
        """
        get the value of "servertime", "nonce", "pubkey", "rsakv" and "showpin", etc
        """
        params = {
            "entry": "weibo",
            "callback": "sinaSSOController.preloginCallBack",
            "rsakt": "mod",
            "checkpin": "1",
            "client": "ssologin.js(v1.4.18)",
            "su": su_value,
            "_": int(time.time() * 1000),
        }
        try:
            response = self.session.get(
                "http://login.sina.com.cn/sso/prelogin.php", params=params)
            json_data = json.loads(
                re.search(r"\((?P<data>.*)\)", response).group("data"))
        except Exception:
            json_data = {}
            self.logger.error("WeiBoLogin get_json_data error: %s",
                              traceback.format_exc())

        self.logger.debug("WeiBoLogin get_json_data: %s", json_data)
        return json_data

    def __login(self, user_name=None):

        self.user_uniqueid = None
        self.user_nick = None

        # If no username was supplied, pick a username/password pair at random
        if user_name is None:
            self.user_name, self.pass_word = self.get_username_password()[:2]

        # get json data
        s_user_name = self.__get_username(self.user_name)

        # user_name
        self.logger.debug('login weibo use %s' % self.user_name)

        json_data = self.__get_json_data(su_value=s_user_name)
        if not json_data:
            return False
        s_pass_word = self.__get_password(json_data["servertime"],
                                          json_data["nonce"],
                                          json_data["pubkey"])

        # make post_data
        post_data = {
            "entry": "weibo",
            "gateway": "1",
            "from": "",
            "savestate": "7",
            "userticket": "1",
            "vsnf": "1",
            "service": "miniblog",
            "encoding": "UTF-8",
            "pwencode": "rsa2",
            "sr": "1280*800",
            "prelt": "529",
            "url":
            "http://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack",
            "rsakv": json_data["rsakv"],
            "servertime": json_data["servertime"],
            "nonce": json_data["nonce"],
            "su": s_user_name,
            "sp": s_pass_word,
            "returntype": "TEXT",
        }

        # get captcha code
        if json_data["showpin"] == 1:

            # captcha_msg = '微博爬虫进入验证码页面,虫子已被反爬'
            # self.email.send(self.monitor_title,captcha_msg)
            # self.db.Insert(self.channel.channel_id,self.entityId,captcha_msg)

            url = "http://login.sina.com.cn/cgi/pin.php?r=%d&s=0&p=%s" % (int(
                time.time()), json_data["pcid"])
            print(url)
            with open("captcha.jpg", "wb") as file_out:
                file_out.write(
                    self.session.get(url, textRspOnly=False).content)

            code = raw_input("Enter the captcha code shown in captcha.jpg: ")
            post_data["pcid"] = json_data["pcid"]
            post_data["door"] = code

        # login weibo.com
        login_url_1 = "http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.18)&_=%d" % int(
            time.time())
        json_data_1 = self.session.post(login_url_1,
                                        data=post_data,
                                        textRspOnly=False).json()
        if json_data_1["retcode"] == "0":
            params = {
                "callback": "sinaSSOController.callbackLoginStatus",
                "client": "ssologin.js(v1.4.18)",
                "ticket": json_data_1["ticket"],
                "ssosavestate": int(time.time()),
                "_": int(time.time() * 1000),
            }
            response = self.session.get(
                "https://passport.weibo.com/wbsso/login",
                textRspOnly=False,
                params=params)
            json_data_2 = json.loads(
                re.search(r"\((?P<result>.*)\)",
                          response.text).group("result"))
            if json_data_2["result"] is True:
                self.user_uniqueid = json_data_2["userinfo"]["uniqueid"]
                self.user_nick = json_data_2["userinfo"]["displayname"]
                self.logger.info("WeiBoLogin succeed!")
                self.__save_cookies_lwp(response.cookies,
                                        self.user_name)  # persist cookies locally
            else:
                self.logger.warning("WeiBoLogin failed: %s", json_data_2)
        else:
            self.logger.warning("WeiBoLogin failed: %s", json_data_1)
        return True if self.user_uniqueid and self.user_nick else False

    def __save_cookies_lwp(self, cookiejar, usename):
        """
        Save cookies to a local per-account file in LWP format
        """
        filename = 'sina_cookie_pool/sinaweibocookies_%s' % usename
        lwp_cookiejar = cookielib.LWPCookieJar()
        for c in cookiejar:
            args = dict(vars(c).items())
            args['rest'] = args['_rest']
            del args['_rest']
            c = cookielib.Cookie(**args)
            lwp_cookiejar.set_cookie(c)
        lwp_cookiejar.save(filename, ignore_discard=True)

    def __load_cookies_from_lwp(self, usename):
        """
        Load locally cached cookies for the given account
        """
        lwp_cookiejar = None
        try:
            filename = 'sina_cookie_pool/sinaweibocookies_%s' % usename
            self.logger.debug('Loading cookies for account %s' % usename)
            lwp_cookiejar = cookielib.LWPCookieJar()
            lwp_cookiejar.load(filename, ignore_discard=True)
        except:
            self.__login()
            filename = 'sina_cookie_pool/sinaweibocookies_%s' % '*****@*****.**'
            self.logger.debug('cookies_filename: %s' % filename)
            lwp_cookiejar = cookielib.LWPCookieJar()
            lwp_cookiejar.load(filename, ignore_discard=True)
        return lwp_cookiejar

    def refreshSearch(self):
        '''
        Reset the search state
        '''
        pass

    def refreshCommentCrawler(self):
        '''
        Reset the comment crawler state
        '''
        self.lastCommentId = None

    def crawlComment(self, article):
        '''
        Crawl the comments of the given article and return them as a list
        @return: a (commentList, hasnext) tuple; commentList is a list of Comment
                 instances and hasnext indicates whether more comments remain to crawl
        '''
        # return (list(), False)
        commentList = list()

        add_datetime = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        headers = {
            'Accept': '*/*',
            'Accept-Encoding': 'gzip, deflate, sdch, br',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Content-Type': 'application/x-www-form-urlencoded',
            'Host': 'weibo.com',
            'Referer': '%s' % article.url[:article.url.find('?') + 1],
            'refer_flag': '1001030103_&type=comment',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0'
        }
        data1 = {'ajwvr': '6', 'id': '%s' % article.tid, 'from': 'singleWeiBo'}

        re_url = 'https://weibo.com/aj/v6/comment/big'
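        # Comments come from weibo.com's AJAX comment endpoint, which returns JSON whose
        # data.html field holds the rendered comment-list markup parsed below.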

        html = self.session.get(re_url,
                                params=data1,
                                headers=headers,
                                cookies=self.__load_cookies_from_lwp(
                                    self.user_name))

        jo = json.loads(html)
        data = jo['data']['html']
        # self.logger.error( data)
        soup = BeautifulSoup(data, 'lxml')
        if soup.find('div', {'class': "WB_empty"}) is None:

            commentbox = soup.find('div', {'class': "list_box"})
            root_commentlist = commentbox.find('div',
                                               {'node-type': "comment_list"})
            root_commentlist2 = root_commentlist.find_all(
                'div', {'node-type': "root_comment"})
            count_comment = 1
            self.logger.debug('root_commentlist:%d   %s',
                              len(root_commentlist2), article.url)
            for root_comment in root_commentlist2:

                if count_comment > 20:
                    break  # only crawl the first 20 top-level comments
                self.logger.debug('count_comment:%d', count_comment)
                comment_id = root_comment.attrs['comment_id']  # id of the top-level comment
                list_con = root_comment.find('div', {'class': "list_con"})
                firstcomentwrap = list_con.find('div', {'class': "WB_text"})
                firstcoment = firstcomentwrap.text.strip()  # text of the top-level comment

                useridwrap = firstcomentwrap.find('a')
                user_id = useridwrap.attrs['usercard']  # commenter's usercard attribute (contains "id=<uid>")
                user_id = re.findall(r'id=(\d+)', user_id)[0]

                user_name = useridwrap.text.strip()  # display name of the commenter
                if not user_name:
                    user_name = ' '
                publish_timediv = list_con.find('div',
                                                {'class': "WB_func clearfix"})
                try:
                    publish_time = self.parseDateTime(
                        publish_timediv.findAll('div')[1].text)  # publish time of the top-level comment
                except:
                    continue
                # self.logger.error(publish_time)
                # if publish_time < datetime.datetime.now()-datetime.timedelta(hours=48):
                #     break

                like_count_div = publish_timediv.findAll('div')[0]
                try:
                    like_count_li = like_count_div.findAll('li')[3]
                    like_count = like_count_li.findAll('em')[1].text
                except:
                    like_count = 0
                if u'赞' == like_count:
                    like_count = 0

                commentList.append(
                    Comment(article.tid,
                            self.channel.channel_id,
                            comment_id,
                            add_datetime,
                            publish_time,
                            None,
                            None,
                            None,
                            None,
                            user_id,
                            user_name,
                            firstcoment,
                            None,
                            None,
                            like_count,
                            None,
                            dislike_count=None))
                count_comment += 1  # number of comments collected so far

        return (commentList, False)

    def parseDateTime(self, datetimeStr):
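        # Converts Weibo's relative timestamps into datetime objects. Illustrative
        # inputs (values made up):
        #   u'32秒前'        -> now minus 32 seconds
        #   u'5分钟前'       -> now minus 5 minutes
        #   u'今天 12:30'    -> today at 12:30
        #   u'3月15日 12:30' -> March 15 of the current year at 12:30
        #   anything else    -> parsed directly with '%Y-%m-%d %H:%M'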
        if datetimeStr.find(u'\u79d2\u524d') > 0:
            secondsDelta = float(datetimeStr.replace(u'\u79d2\u524d', ''))
            return datetime.datetime.now() - datetime.timedelta(
                seconds=secondsDelta)
        if datetimeStr.find(u'\u5206\u949f\u524d') > 0:
            secondsDelta = float(datetimeStr.replace(u'\u5206\u949f\u524d',
                                                     '')) * 60
            return datetime.datetime.now() - datetime.timedelta(
                seconds=secondsDelta)
        if datetimeStr.find(u'\u4eca\u5929') >= 0:
            datetimeStr = datetime.datetime.today().strftime(
                '%Y-%m-%d') + datetimeStr.replace(u'\u4eca\u5929', '')
            return datetime.datetime.strptime(datetimeStr, '%Y-%m-%d %H:%M')

        if datetimeStr.find(u'\u6708') >= 0:
            datetimeStr = str(datetime.datetime.today().year) + '-' + \
                        datetimeStr.replace(u'\u6708', '-').replace(u'\u65e5','')
            return datetime.datetime.strptime(datetimeStr, '%Y-%m-%d %H:%M')
        return datetime.datetime.strptime(datetimeStr, '%Y-%m-%d %H:%M')