Example #1
class JianShuCrawler(object):
    def __init__(self, channel, logger=None):
        '''
        :param channel: channel (media) entity describing what to crawl
        :param logger: optional logger; defaults to the project default logger
        '''
        if logger is None:
            self.logger = Logging.getLogger(Logging.LOGGER_NAME_DEFAULT)
        else:
            self.logger = logger
        self.channel = channel  # media/channel entity
        # Request headers and proxy settings
        self.headers = {
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding':
            'gzip, deflate, br',
            'Accept-Language':
            'zh-CN,zh;q=0.9',
            'Host':
            'www.jianshu.com',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
        }
        self.headers_1 = {
            'Host': 'www.jianshu.com',
            'Connection': 'keep-alive',
            'Content-Length': '0',
            'Accept': 'application/json',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Accept-Encoding': 'gzip,deflate,br',
            'Accept-Language': 'zh-CN,zh;q=0.9'
        }
        # Account credentials
        self.usename = '13432881156'
        self.password = '******'
        # Log in with cookies
        self.session = SessionCrawler()
        self.session.cookies = cookielib.LWPCookieJar(
            filename='cookie')  # cookie file on disk
        try:
            self.session.cookies.load(ignore_discard=True)
        except:
            print('Failed to load cookies')
        if self.islogin():
            print('Already logged in to Jianshu')
        else:
            self.login(self.usename, self.password)

    def searchArticle(self, keywordList, endTime):
        '''
        Search articles by a list of keywords within the time window ending at endTime.
        :param keywordList: list of keywords
        :param endTime: end of the search time range
        :return: list of Article instances, or None if not logged in
        '''
        if self.islogin():
            page = 0  # page counter
            articleList = list()
            hasnext = True  # whether there are more pages to crawl
            while hasnext:
                page += 1
                if page == 5:
                    break
                (articleListInPage,
                 hasnext) = self._searchByPage(keywordList, endTime, page)
                is_article = list()
                for article in articleListInPage:
                    if article not in articleList:
                        is_article.append(article)
                articleList.extend(is_article)
                print(len(articleList))

            self.logger.debug('Total articles crawled: %s' % len(articleList))
            return articleList
        else:
            self.logger.error(u'Not logged in to Jianshu; please log in first')
            return None

    def _searchByPage(self, keywordList, endTime, page):
        hasnext = True
        articleList = list()
        page = str(page)  # search via Jianshu's internal search box
        search_url_1 = "https://www.jianshu.com/search/do?q="
        search_url_2 = "&type=note&page=" + page + "&order_by=published_at&time_range=a_day"
        # q: keyword query, page: page number, order_by: sort order, time_range: restrict to the past day
        query = urllib.quote(' '.join(keywordList).encode('utf-8'))
        search_url = search_url_1 + str(query) + search_url_2  # URL of the page to crawl
        csrf = self.getCsrf(keywordList, page)
        post_data = {'X-CSRF-Token': csrf}
        # Fetch the search result page
        url_page = self.session.post(search_url,
                                     headers=self.headers_1,
                                     data=post_data,
                                     textRspOnly=False)
        if url_page.status_code == 200:
            self.logger.debug(u'Fetched the search result page')
        else:
            self.logger.debug(u'No more search results')
            return (articleList, False)  # reached the last page of results

        # Parse the search result page
        articleJson = None
        try:
            articleJson = json.loads(url_page.text)
        except Exception as e:
            self.logger.error(e)

        if articleJson is None:
            self.logger.error('articleJson is None')
            return (articleList, False)

        allInfo = articleJson['entries']  # list of article dicts
        for info in allInfo:
            pre_publishedTime = info["first_shared_at"]
            publishedTime_1 = ''.join(
                re.findall('[^A-Za-z]', pre_publishedTime[0:-5]))  # publish-time digits
            publishedTime = publishedTime_1[0:10] + ' ' + publishedTime_1[10:18]
            #print(publishedTime)
            urlTime = time.strptime(publishedTime, '%Y-%m-%d %H:%M:%S')
            Y, M, D, H = urlTime[0:4]
            urlTime2 = datetime.datetime(Y, M, D, H)
            # Convert to a float timestamp for comparison
            urlTime2 = time.mktime(urlTime2.timetuple())  # publish-time timestamp
            startTime = endTime - datetime.timedelta(
                days=self.channel.search_ranges)
            startTimeIntSecond = time.mktime(startTime.timetuple())  # start of the crawl window
            endTimeIntSecond = time.mktime(endTime.timetuple())  # end of the crawl window
            if urlTime2 >= startTimeIntSecond and urlTime2 <= endTimeIntSecond:
                # Build the article URL and crawl its content
                pre_article_url = info['slug']
                articleUrl = "https://www.jianshu.com/p/" + pre_article_url  # article URL
                (content, authorName,
                 title) = self.crawlArticle(articleUrl)  # crawl the article body
                #print(title)
                if content is None:  # the post was deleted or has an unsupported format
                    print('Failed to crawl the article')
                    continue
                authorId = info["user"]["id"]  # author id
                likeCount = info["likes_count"]  # like count
                readCount = info["views_count"]  # read count
                replyCount = info["public_comments_count"]  # comment count
                tid = info["id"]  # article id
                article = Article(tid, self.channel.channel_id, title, content,
                                  publishedTime, articleUrl, authorId,
                                  authorName)
                #self.crawlComment(article)
                article.statistics.reply_count = replyCount
                article.statistics.like_count = likeCount
                article.statistics.read_count = readCount
                if (article is not None) and (article not in articleList):
                    articleList.append(article)
            else:
                print('Reached the end of the time window; stop crawling')
                hasnext = False
                break
        return (articleList, hasnext)
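
A minimal, hypothetical usage sketch for Example #1 (not part of the original listing): it assumes the project-internal Channel, Logging, SessionCrawler, Article and crawlArticle/getCsrf helpers are importable and behave as used above, and that the channel entity exposes channel_id and search_ranges (days to look back), as the code requires.

# Hypothetical driver for JianShuCrawler -- illustration only.
import datetime

class DemoChannel(object):          # stand-in for the real channel entity
    channel_id = 'jianshu'
    search_ranges = 1               # look back one day from endTime

crawler = JianShuCrawler(DemoChannel())
articles = crawler.searchArticle([u'中山大学'], datetime.datetime.now())
if articles is not None:
    print('Fetched %d articles' % len(articles))
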
Example #2
class WeiboCrawler(object):
    '''
    classdocs
    '''
    def __init__(self, channel, logger=None):
        '''
        Constructor
        '''
        if logger is None:
            self.logger = Logging.getLogger(Logging.LOGGER_NAME_DEFAULT)
        else:
            self.logger = logger

        self.channel = channel
        self.entityId = 'SYSU'
        self.user_name = None
        self.pass_word = None

        self.email = SendEmail()
        self.db = InsertDB()
        self.monitor_title = 'Weibo crawler monitor'
        self.HTTP = 'https:'

        self.session = SessionCrawler(sleepRange=[3, 8])

    def change_cookie(self, user_name=None):
        '''
        Pick a crawler account (user name) at random for its cookie
        '''
        # usename_list = [
        #             '18814095644','13432881156','*****@*****.**','15018377821','*****@*****.**',
        #             '15767199023','13427287354','13532011721','13640792755','13794342903',
        #             '17825769929','18211493432','17827278983','13922771190','15999916968',
        #             '15119820746','15986585396','13430915912','15012471375','17880567972',
        #             '17876156948','15915132451','13543985544','13717382951','13640695490',
        #             '15711707673','13680181412','13414759320','17820956139','18476072534',
        #             '17806699214','13418852766','17827181603','15919354070','15088137907'
        #                ]
        user_name_list = [
            '13432881156', '*****@*****.**', '15018377821',
            '*****@*****.**', '15767199023', '18814095644'
        ]
        if user_name is None:
            user_name = random.choice(user_name_list)

        self.user_name = user_name

    def get_username_password(self, user_name=None):
        '''
        Pick a user name and its password at random (or use the given one)
        :return:
        '''
        # user_name_password1 = '18814095644:ljda.18814095644'
        # user_name_password2 = '13432881156:liang452035397'
        # user_name_password3 = '[email protected]:810214bee810214'
        # user_name_password4 = '15018377821:zzm15331411'
        # user_name_password5 = '15767199023:j980216'
        # user_name_password6 = '[email protected]:uwinvip'

        user_dict = {
            '18814095644': 'ljda.18814095644',
            '13432881156': 'liang452035397',
            '*****@*****.**': '810214bee810214',
            '15018377821': 'zzm15331411',
            '15767199023': 'j980216',
            '*****@*****.**': 'uwinvip'
        }

        user_name_list = [
            '13432881156', '*****@*****.**', '15018377821',
            '*****@*****.**', '15767199023', '18814095644'
        ]
        # user_list = [user_name_password1, user_name_password2,
        #              user_name_password3, user_name_password4,
        #              user_name_password5, user_name_password6]

        # Use the given user_name if provided, otherwise pick one at random
        if user_name is None:
            user_name = random.choice(user_name_list)

        user_password = user_dict[user_name]

        self.user_name = user_name
        self.pass_word = user_password

    def searchArticle(self, keyword_list, end_time):
        '''
        Search posts by a list of keywords within the time window ending at end_time.
        @param keyword_list: list of keywords
        @param end_time: end of the search time range
        '''
        # Manually refresh the cookie
        # self.__login()

        run_msg = 'Weibo crawler started'
        self.db.Insert(self.channel.channel_id, self.entityId, run_msg)

        # Search window
        start_time = end_time - datetime.timedelta(hours=2.5)

        page = 1
        count_page = 0
        article_list = list()
        page_url_list = self.__search_page_url(keyword_list, start_time,
                                               end_time, page)

        # If nothing was returned, log in with this user_name to refresh the cookie
        if len(page_url_list) == 0:
            self.__login(self.user_name)
            page_url_list = self.__search_page_url(keyword_list, start_time,
                                                   end_time, page)

        for page_url in page_url_list:
            time.sleep(1.5)
            count_page += 1
            self.logger.debug('Start to crawl: %s' % page_url)
            article_list_page = self.__parse_page_url(page_url, count_page)
            article_list.extend(article_list_page)

        if len(article_list) == 0:
            article_msg = 'The Weibo crawler fetched no data'
            self.email.send(self.monitor_title, article_msg)
            self.db.Insert(self.channel.channel_id, self.entityId, article_msg)

        return article_list

    def __search_page_url(self, keywordList, startTime, endTime, page):
        '''
        :param keywordList: list of keywords
        :param startTime: search start time
        :param endTime: search end time
        :param page: page number
        :return: list of result-page URLs
        '''
        query = urllib.quote(' '.join(keywordList).encode('utf-8'))
        params = {
            'typeall': '1',
            'suball': '1',  # include all posts
            # time window; Weibo expects custom:<start>:<end> with hour granularity
            'timescope': 'custom:%s:%s' % (startTime.strftime("%Y-%m-%d-%H"),
                                           endTime.strftime("%Y-%m-%d-%H")),
            'Refer': 'g',
            'page': page
        }
        user_agent = random.choice(AGENTS)
        headers = {
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Host': 's.weibo.com',
            'Referer': 'http://s.weibo.com/',
            'User-Agent': user_agent
        }

        index_url = 'http://s.weibo.com/weibo/' + query  # search index URL

        # Pick a cookie account at random
        self.change_cookie()
        self.logger.debug('Use Cookie %s' % self.user_name)
        # If the cached cookie cannot be loaded, log in again
        try:
            cookies = self.__load_cookies_from_lwp(self.user_name)
            html = self.session.get(index_url,
                                    params=params,
                                    headers=headers,
                                    cookies=cookies)  # use locally cached cookies
            # print(html)
            response = etree.HTML(html)
            # Collect the pager links (one per result page)
            page_urls = response.xpath(
                '//div[@class="m-page"]//span[@class="list"]//li/a/@href')
            page_url_list = list()
            for page in page_urls:
                url = self.HTTP + page
                page_url_list.append(url)
                print(url)
            return page_url_list

        except Exception as e:
            loginFlag = self.__login(self.user_name)
            self.logger.debug('Use username: %s' % self.user_name)
            if loginFlag is False:
                self.logger.error('Fail to login')
                login_msg = 'Weibo login failed! Username is %s' % self.user_name
                self.email.send(self.monitor_title, login_msg)
                self.db.Insert(self.channel.channel_id, self.entityId,
                               login_msg)
                return list()

            cookies = self.__load_cookies_from_lwp(self.user_name)
            self.logger.debug('Get a new Cookie: %s' % cookies)
            try:
                html = self.session.get(index_url,
                                        params=params,
                                        headers=headers,
                                        cookies=cookies)  # use locally cached cookies
                response = etree.HTML(html)
                # Collect the pager links (one per result page)
                page_urls = response.xpath(
                    '//div[@class="m-page"]//span[@class="list"]//li/a/@href')
                page_url_list = list()
                for page in page_urls:
                    url = self.HTTP + page
                    page_url_list.append(url)
                return page_url_list
            except Exception as e:
                self.logger.warning('Crawler failed: %s' % e)
                msg = 'No return data, because of %s' % e
                self.email.send(self.monitor_title, msg)
                self.db.Insert(self.channel.channel_id, self.entityId, msg)
                return list()

    def __parse_page_url(self, page_url, count_page=0):
        '''
        Parse one search result page and extract its weibo posts.
        @return: article_list
        '''
        user_agent = random.choice(AGENTS)
        headers = {
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Host': 's.weibo.com',
            'Referer': 'http://s.weibo.com/',
            'User-Agent': user_agent
        }

        self.change_cookie()
        cookies = self.__load_cookies_from_lwp(self.user_name)
        response = self.session.get(url=page_url,
                                    headers=headers,
                                    cookies=cookies)

        # If this cookie is invalid, log in again to get a fresh one
        if re.match(r'File not .*', response) is not None:
            self.__login(self.user_name)
            cookies = self.__load_cookies_from_lwp(self.user_name)
            response = self.session.get(url=page_url,
                                        headers=headers,
                                        cookies=cookies)

        html = etree.HTML(response)
        # list of post cards on the page
        card_wrap_list = html.xpath(
            '//div[@class="m-wrap"]/div[@id="pl_feedlist_index"]//div[@class="card-wrap"]'
        )
        article_list = list()
        for card_wrap in card_wrap_list:
            try:
                mid = card_wrap.xpath('./@mid')[0]
            except Exception as e:
                continue
            info = card_wrap.xpath('.//div[@class="content"]')[0]
            author_name = info.xpath('.//a[@class="name"]/text()')[0]
            author_id = re.search(
                r'/(\d+).?refer_flag',
                info.xpath('.//a[@class="name"]/@href')[0]).group(1)

            txt = info.xpath('.//p[@class="txt"]')[0]
            content = txt.xpath('string(.)').strip()
            title = content[:30].replace("'", "''").replace("%", "\%").replace(
                ":", "\:") + ' '

            article_info = info.xpath('.//p[@class="from"]/a')[0]
            article_url = self.HTTP + article_info.xpath('./@href')[0]

            statistics = card_wrap.xpath(
                './/div[@class="card-act"]//li/a/text()')

            collect_count = re.search(r'(\d+)', statistics[0])
            forward_count = re.search(r'(\d+)', statistics[1])
            reply_count = re.search(r'(\d+)', statistics[2])
            like_count = card_wrap.xpath(
                './/div[@class="card-act"]//li/a/em/text()')

            if collect_count:
                collect_count = collect_count.group(1).strip()
            else:
                collect_count = 0
            if forward_count:
                forward_count = forward_count.group(1).strip()
            else:
                forward_count = 0
            if reply_count:
                reply_count = reply_count.group(1).strip()
            else:
                reply_count = 0
            if len(like_count):
                like_count = like_count[0]
            else:
                like_count = 0

            publish_datetime = self.__parse_time(article_url, cookies)

            article = Article(mid,
                              self.channel.channel_id,
                              title,
                              content,
                              url=article_url,
                              publish_datetime=publish_datetime,
                              author_id=author_id,
                              author_name=author_name)

            article.statistics.collect_count = collect_count
            article.statistics.forward_count = forward_count
            article.statistics.reply_count = reply_count
            article.statistics.like_count = like_count

            if article not in article_list:
                article_list.append(article)

        self.logger.debug('%d page crawl article is %s' %
                          (count_page, len(article_list)))
        return article_list

    def __parse_time(self, article_url, cookies):
        time.sleep(1.5)
        url = '%s&type=comment' % article_url
        user_agent = random.choice(AGENTS)
        headers = {
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'User-Agent': user_agent
        }

        response = self.session.get(url, headers=headers, cookies=cookies)
        try:
            date_str = re.search(r'date=\\"(\d+)\\"', response).group(1)
            localtime = time.localtime(int(date_str[0:10]))
            publish_datetime = time.strftime('%Y-%m-%d %H:%M:%S', localtime)
        except Exception as e:
            publish_datetime = datetime.datetime.now().strftime(
                '%Y-%m-%d %H:%M:%S')

        return publish_datetime

    def crawlStatistics(self, article):
        '''
        Crawl statistics for an article.
        @return: nothing; statistics are written into the article instance
        '''
        # return
        try:
            (data, check) = self.__fetchSingleArticle(article)

            if check == '0':
                soup = BeautifulSoup(data, 'lxml')
                ulList = soup.findAll(
                    'ul', {'class': 'WB_row_line WB_row_r4 clearfix S_line2'})
                li_list = ulList[0].findAll('li')
                self.__parseStatistics(article, li_list)
            elif check == '1':
                self.logger.warning(u'The requested page returned 404: %s', article.url)
                return
            else:
                self.logger.warning(u'The page address is wrong or the page does not exist: %s', article.url)
                return
        except:
            self.logger.error('Fail to fetch statistics for:%s, %s',
                              article.url, traceback.format_exc())
            return

    def __parseStatistics(self, article, soup_li_list):
        # New-style page layout
        collect_count = soup_li_list[0].find('span').text
        collect_count = re.findall(r'\d+', collect_count)
        if len(collect_count) > 0:
            collect_count = int(collect_count[0])
        else:
            collect_count = 0
        forward_count = soup_li_list[1].find('span').text

        forward_count = re.findall(r'\d+', forward_count)
        if len(forward_count) > 0:
            forward_count = int(forward_count[0])
        else:
            forward_count = 0

        reply_count = soup_li_list[2].find('span').text
        reply_count = re.findall(r'\d+', reply_count)
        if len(reply_count) > 0:
            reply_count = int(reply_count[0])
        else:
            reply_count = 0

        like_count = soup_li_list[3].find('span').text
        like_count = re.findall(r'\d+', like_count)
        if len(like_count) > 0:
            like_count = int(like_count[0])
        else:
            like_count = 0
        article.statistics.reply_count = reply_count
        article.statistics.like_count = like_count
        article.statistics.collect_count = collect_count
        article.statistics.forward_count = forward_count

    def __fetchSingleArticle(self, article):
        '''
        Fetch the article page by its URL.
        '''
        user_agent = [
            "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.110 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36 Core/1.53.2595.400 QQBrowser/9.6.10872.400",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/602.2.14 (KHTML, like Gecko) Version/10.0.1 Safari/602.2.14",
            "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:49.0) Gecko/20100101 Firefox/49.0",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0;  Trident/5.0)",
        ]
        randdom_header = random.choice(user_agent)
        headers = {
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': 'weibo.com',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': '%s' % randdom_header
        }

        data2 = {'type': 'comment'}
        print(article.url)
        time.sleep(2)
        self.change_cookie()
        cookies = self.__load_cookies_from_lwp(self.user_name)
        html = self.session.get(article.url,
                                headers=headers,
                                params=data2,
                                cookies=cookies)
        # If the cookie is invalid, log in again to get a fresh one
        if re.match(r'File not .*', html) is not None:
            self.__login(self.user_name)
            cookies = self.__load_cookies_from_lwp(self.user_name)
            html = self.session.get(article.url,
                                    headers=headers,
                                    params=data2,
                                    cookies=cookies)

        if html.find('<div class="page_error">') == -1:
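            # The detail page embeds its HTML as the JSON argument of
            # FM.view({...}) inside a <script> tag; extract that JSON below.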
            lindex = html.find(
                '<script>FM.view({"ns":"pl.content.weiboDetail.index"')
            rindex = html[lindex:].find('</script>')
            rindex = lindex + rindex - 1
            lindex = lindex + len('<script>FM.view(')
            # self.logger.debug(html[lindex:rindex])
            try:
                jo = json.loads(html[lindex:rindex])
                data = jo['html']  # rendered weibo detail page
                return (data, '0')
            except:
                return ({}, '1')
        else:
            return ({}, '2')

    def __get_username(self, user_name):
        """
        get legal username
        """
        username_quote = urllib.quote(user_name)
        username_base64 = base64.b64encode(username_quote.encode("utf-8"))
        return username_base64.decode("utf-8")

    def __get_password(self, servertime, nonce, pubkey):
        """
        get legal password
        """
        string = (str(servertime) + "\t" + str(nonce) + "\n" +
                  str(self.pass_word)).encode("utf-8")
        public_key = rsa.PublicKey(int(pubkey, 16), int("10001", 16))
        password = rsa.encrypt(string, public_key)
        password = binascii.b2a_hex(password)
        return password.decode()

    def __get_json_data(self, su_value):
        """
        get the value of "servertime", "nonce", "pubkey", "rsakv" and "showpin", etc
        """
        params = {
            "entry": "weibo",
            "callback": "sinaSSOController.preloginCallBack",
            "rsakt": "mod",
            "checkpin": "1",
            "client": "ssologin.js(v1.4.18)",
            "su": su_value,
            "_": int(time.time() * 1000),
        }
        try:
            response = self.session.get(
                "http://login.sina.com.cn/sso/prelogin.php", params=params)
            json_data = json.loads(
                re.search(r"\((?P<data>.*)\)", response).group("data"))
        except Exception:
            json_data = {}
            self.logger.error("WeiBoLogin get_json_data error: %s",
                              traceback.format_exc())

        self.logger.debug("WeiBoLogin get_json_data: %s", json_data)
        return json_data

    def __login(self, user_name=None):

        self.user_uniqueid = None
        self.user_nick = None

        # Resolve the user name and password from the given user_name
        self.get_username_password(user_name)

        # get json data
        s_user_name = self.__get_username(self.user_name)

        # user_name
        self.logger.debug('login weibo use %s' % self.user_name)

        json_data = self.__get_json_data(su_value=s_user_name)
        if not json_data:
            return False
        s_pass_word = self.__get_password(json_data["servertime"],
                                          json_data["nonce"],
                                          json_data["pubkey"])

        # make post_data
        post_data = {
            "entry": "weibo",
            "gateway": "1",
            "from": "",
            "savestate": "7",
            "userticket": "1",
            "vsnf": "1",
            "service": "miniblog",
            "encoding": "UTF-8",
            "pwencode": "rsa2",
            "sr": "1280*800",
            "prelt": "529",
            "url":
            "http://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack",
            "rsakv": json_data["rsakv"],
            "servertime": json_data["servertime"],
            "nonce": json_data["nonce"],
            "su": s_user_name,
            "sp": s_pass_word,
            "returntype": "TEXT",
        }

        # get captcha code
        if json_data["showpin"] == 1:

            # captcha_msg = 'The Weibo crawler hit a captcha page; it has been blocked by anti-crawling'
            # self.email.send(self.monitor_title,captcha_msg)
            # self.db.Insert(self.channel.channel_id,self.entityId,captcha_msg)

            url = "http://login.sina.com.cn/cgi/pin.php?r=%d&s=0&p=%s" % (int(
                time.time()), json_data["pcid"])
            print(url)
            with open("captcha.jpg", "wb") as file_out:
                file_out.write(
                    self.session.get(url, textRspOnly=False).content)

            print(self.session.get(url, textRspOnly=False).content)
            code = raw_input('Please enter the captcha code: ')
            post_data["pcid"] = json_data["pcid"]
            post_data["door"] = code

        # login weibo.com
        login_url_1 = "http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.18)&_=%d" % int(
            time.time())
        json_data_1 = self.session.post(login_url_1,
                                        data=post_data,
                                        textRspOnly=False).json()
        if json_data_1["retcode"] == "0":
            params = {
                "callback": "sinaSSOController.callbackLoginStatus",
                "client": "ssologin.js(v1.4.18)",
                "ticket": json_data_1["ticket"],
                "ssosavestate": int(time.time()),
                "_": int(time.time() * 1000),
            }
            response = self.session.get(
                "https://passport.weibo.com/wbsso/login",
                textRspOnly=False,
                params=params)
            json_data_2 = json.loads(
                re.search(r"\((?P<result>.*)\)",
                          response.text).group("result"))
            if json_data_2["result"] is True:
                self.user_uniqueid = json_data_2["userinfo"]["uniqueid"]
                self.user_nick = json_data_2["userinfo"]["displayname"]
                self.logger.info("WeiBoLogin succeed!")
                self.__save_cookies_lwp(response.cookies,
                                        self.user_name)  # persist cookies locally
            else:
                self.logger.warning("WeiBoLogin failed: %s", json_data_2)
        else:
            self.logger.warning("WeiBoLogin failed: %s", json_data_1)
        return True if self.user_uniqueid and self.user_nick else False

    def __save_cookies_lwp(self, cookiejar, usename):
        """
        Save cookies to a local file.
        """
        filename = 'sina_cookie_pool/sinaweibocookies_%s' % usename
        lwp_cookiejar = cookielib.LWPCookieJar()
        for c in cookiejar:
            args = dict(vars(c).items())
            args['rest'] = args['_rest']
            del args['_rest']
            c = cookielib.Cookie(**args)
            lwp_cookiejar.set_cookie(c)
        lwp_cookiejar.save(filename, ignore_discard=True)

    def __load_cookies_from_lwp(self, usename):
        """
        Load cookies from the local file.
        """
        lwp_cookiejar = None
        try:
            filename = 'sina_cookie_pool/sinaweibocookies_%s' % usename
            self.logger.debug('Use cookie filename is %s' % usename)
            lwp_cookiejar = cookielib.LWPCookieJar()
            lwp_cookiejar.load(filename, ignore_discard=True)
        except:
            self.__login()
            filename = 'sina_cookie_pool/sinaweibocookies_%s' % '*****@*****.**'
            self.logger.debug('cookies_filename: %s' % filename)
            lwp_cookiejar = cookielib.LWPCookieJar()
            lwp_cookiejar.load(filename, ignore_discard=True)
        return lwp_cookiejar

    def refreshSearch(self):
        '''
        Reset the search state.
        '''
        pass

    def refreshCommentCrawler(self):
        '''
        Reset the comment crawler state.
        '''
        self.lastCommentId = None

    def crawlComment(self, article):
        '''
        Crawl the comments of an article.
        @return: (commentList, hasnext) tuple; commentList is a list of Comment instances and hasnext indicates whether more remain to be crawled
        '''
        # return (list(), False)
        commentList = list()

        add_datetime = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        headers = {
            'Accept': '*/*',
            'Accept-Encoding': 'gzip, deflate, sdch, br',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Content-Type': 'application/x-www-form-urlencoded',
            'Host': 'weibo.com',
            'Referer': '%s' % article.url[:article.url.find('?') + 1],
            'refer_flag': '1001030103_&type=comment',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0'
        }
        data1 = {'ajwvr': '6', 'id': '%s' % article.tid, 'from': 'singleWeiBo'}

        re_url = 'https://weibo.com/aj/v6/comment/big'

        html = self.session.get(re_url,
                                params=data1,
                                headers=headers,
                                cookies=self.__load_cookies_from_lwp(
                                    self.user_name))

        jo = json.loads(html)
        data = jo['data']['html']
        # self.logger.error( data)
        soup = BeautifulSoup(data, 'lxml')
        if soup.find('div', {'class': "WB_empty"}) is None:

            commentbox = soup.find('div', {'class': "list_box"})
            root_commentlist = commentbox.find('div',
                                               {'node-type': "comment_list"})
            root_commentlist2 = root_commentlist.find_all(
                'div', {'node-type': "root_comment"})
            count_comment = 1
            self.logger.debug('root_commentlist:%d   %s',
                              len(root_commentlist2), article.url)
            for root_comment in root_commentlist2:

                if count_comment > 20:
                    break  # only crawl the first 20 top-level comments
                self.logger.debug('count_comment:%d', count_comment)
                comment_id = root_comment.attrs['comment_id']  # top-level comment id
                list_con = root_comment.find('div', {'class': "list_con"})
                firstcomentwrap = list_con.find('div', {'class': "WB_text"})
                firstcoment = firstcomentwrap.text.strip()  # top-level comment text

                useridwrap = firstcomentwrap.find('a')
                user_id = useridwrap.attrs['usercard']  # commenter id
                user_id = re.findall(r'id=(\d+)', user_id)[0]

                user_name = useridwrap.text  # commenter name
                if user_name is None:
                    user_name = ' '
                publish_timediv = list_con.find('div',
                                                {'class': "WB_func clearfix"})
                try:
                    publish_time = self.parseDateTime(
                        publish_timediv.findAll('div')[1].text)  # comment publish time
                except:
                    continue
                # self.logger.error(publish_time)
                # if publish_time < datetime.datetime.now()-datetime.timedelta(hours=48):
                #     break

                like_count_div = publish_timediv.findAll('div')[0]
                try:
                    like_count_li = like_count_div.findAll('li')[3]
                    like_count = like_count_li.findAll('em')[1].text
                except:
                    like_count = 0
                if u'赞' == like_count:
                    like_count = 0

                commentList.append(
                    Comment(article.tid,
                            self.channel.channel_id,
                            comment_id,
                            add_datetime,
                            publish_time,
                            None,
                            None,
                            None,
                            None,
                            user_id,
                            user_name,
                            firstcoment,
                            None,
                            None,
                            like_count,
                            None,
                            dislike_count=None))
                count_comment += 1  # comment counter

        return (commentList, False)

    def parseDateTime(self, datetimeStr):
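        # Weibo returns relative timestamps: "N秒前" (seconds ago), "N分钟前"
        # (minutes ago), "今天 HH:MM" (today), "M月D日 HH:MM" (month/day),
        # otherwise an absolute "%Y-%m-%d %H:%M" string.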
        if datetimeStr.find(u'\u79d2\u524d') > 0:
            secondsDelta = float(datetimeStr.replace(u'\u79d2\u524d', ''))
            return datetime.datetime.now() - datetime.timedelta(
                seconds=secondsDelta)
        if datetimeStr.find(u'\u5206\u949f\u524d') > 0:
            secondsDelta = float(datetimeStr.replace(u'\u5206\u949f\u524d',
                                                     '')) * 60
            return datetime.datetime.now() - datetime.timedelta(
                seconds=secondsDelta)
        if datetimeStr.find(u'\u4eca\u5929') >= 0:
            datetimeStr = datetime.datetime.today().strftime(
                '%Y-%m-%d') + datetimeStr.replace(u'\u4eca\u5929', '')
            return datetime.datetime.strptime(datetimeStr, '%Y-%m-%d %H:%M')

        if datetimeStr.find(u'\u6708') >= 0:
            datetimeStr = str(datetime.datetime.today().year) + '-' + \
                        datetimeStr.replace(u'\u6708', '-').replace(u'\u65e5','')
            return datetime.datetime.strptime(datetimeStr, '%Y-%m-%d %H:%M')
        return datetime.datetime.strptime(datetimeStr, '%Y-%m-%d %H:%M')
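
A minimal, hypothetical usage sketch for Example #2 (not part of the original listing): it assumes the project-internal Channel, SendEmail, InsertDB, SessionCrawler, Article and Comment classes are importable and behave as used above; only channel_id is needed on the channel entity here.

# Hypothetical driver for WeiboCrawler -- illustration only.
import datetime

class DemoChannel(object):        # stand-in for the real channel entity
    channel_id = 'weibo'

crawler = WeiboCrawler(DemoChannel())
end_time = datetime.datetime.now()
articles = crawler.searchArticle([u'中山大学'], end_time)
print('Fetched %d weibo posts' % len(articles))
for article in articles:
    crawler.crawlStatistics(article)             # fills article.statistics in place
    comments, _ = crawler.crawlComment(article)  # first 20 top-level comments
    print('%s: %d comments' % (article.tid, len(comments)))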