Example #1
 def crawlArticle(self, url):
     '''
     Crawl the article content and statistics for the given url
     @return: an Article instance
     '''
     # Check the url format: Baidu search results are not always questions, and only questions are crawled for now
     if url.find('question') < 0:
         self.logger.warn('Question supported only:%s', url)
         return None
     article_id = re.findall(r'question/(\d+)', url)[0]
     self.session.randomSleep()
     response = self.session.get(url, headers=CRAWL_ARTICLE_HEADERS)
     soup = BeautifulSoup(response)
     main = soup.find('div', attrs={'id': "data"}).attrs['data-state']
     articleJson = json.loads(main)
     questionJson = articleJson['entities']['questions'][article_id]
     title = questionJson['title']
     contentSoup = BeautifulSoup(questionJson['editableDetail'])
     content = contentSoup.text
     author_id = questionJson['author']['id']
     author_name = questionJson['author']['name']
     createTimeInFloat = questionJson['created']
     publish_datetime = time.strftime('%Y-%m-%d %H:%M:%S',
                                      time.localtime(createTimeInFloat))
     reply_count = questionJson['commentCount']
     read_count = questionJson['visitCount']
     collect_count = questionJson['followerCount']
     article = Article(article_id, self.channel.channel_id, title, content,
                       publish_datetime, url, author_id, author_name)
     article.statistics.reply_count = reply_count
     article.statistics.read_count = read_count
     article.statistics.collect_count = collect_count
     return article
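
A standalone illustration of the question-id extraction used above; the URL below is a made-up placeholder for a question-style url:

import re

url = 'https://www.zhihu.com/question/123456789'  # placeholder question url
article_id = re.findall(r'question/(\d+)', url)[0]  # -> '123456789'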
Example #2
    def __filterRemovedArticle(self, articleList, entityId, eventId=None):
        '''
        Filter articles by comparing against the remove table
        Return the list of articles that are not in the remove table
        '''
        if len(articleList) == 0:
            return []
        if eventId is not None:
            tableName = Constants.TABLE_SA_EVENT_ARTICLE_REMOVE + Constants.TABLE_NAME_DELIMITER + entityId
            eventCondition = ' event_id=%d and ' % eventId
        else:
            tableName = Constants.TABLE_SA_ARTICLE_REMOVE + Constants.TABLE_NAME_DELIMITER + entityId
            eventCondition = ''
        # Look up the articles in the remove table
        selectSql = '''
            SELECT TID, CHANNEL_ID FROM %s where %s (%s)
        '''
        whereClauseList = map(
            lambda article: '(TID="%s" and CHANNEL_ID=%d)' %
            (article.tid, article.channel_id), articleList)

        self.dbProxy.execute(
            selectSql %
            (tableName, eventCondition, ' or '.join(whereClauseList)))
        resultList = self.dbProxy.fetchall()  # result set of the query
        removedArticleList = map(lambda x: Article(x[0], x[1]), resultList)
        filteredArticle = filter(lambda x: x not in removedArticleList,
                                 articleList)
        #self.logger.debug('originalList:%s', map(lambda article: article.tid, articleList))
        #self.logger.debug('removedArticleList:%s', map(lambda article: article.tid, removedArticleList))
        #self.logger.debug('filteredArticle:%s', map(lambda article: article.tid, filteredArticle))
        return filteredArticle
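
For illustration, a standalone sketch of how the WHERE clause above is assembled; the table name and the (tid, channel_id) pairs are made-up placeholders:

articles = [('abc', 1), ('def', 2)]  # placeholder (tid, channel_id) pairs
whereClauseList = map(lambda a: '(TID="%s" and CHANNEL_ID=%d)' % a, articles)
selectSql = 'SELECT TID, CHANNEL_ID FROM %s where %s (%s)'
print selectSql % ('sa_article_remove_entity1', '', ' or '.join(whereClauseList))
# SELECT TID, CHANNEL_ID FROM sa_article_remove_entity1 where  ((TID="abc" and CHANNEL_ID=1) or (TID="def" and CHANNEL_ID=2))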
Example #3
    def __searchByPage(self, keywordList, startTime, endTime, page):
        # Search via Douban's internal search box
        page = str(page)  # the url expects a str
        search_url = "https://www.douban.com/group/search?start=" + page + "&cat=1013&sort=time&q="
        # cat: search by topic  sort: order by latest publish time  q: search keyword
        query = urllib.quote(' '.join(keywordList).encode('utf-8'))
        search_url = search_url + str(query)

        url_page = self.session.get(search_url,headers=self.headers)
        soup = BeautifulSoup(url_page, "lxml")
        # self.logger.debug(soup)
        main_wrap = soup.find('div', attrs={'class': "article"})
        main_article_list = main_wrap.find('div',attrs={'class':"topics"})
        articleList = list()
        hasnext = True
        if main_article_list is not None:
            title_list = main_article_list.findAll('tr', {'class': 'pl'})
            for title in title_list:
                article_publishtime = title.find('td', attrs={'class': "td-time"}).attrs['title']

                urlTime = time.strptime(article_publishtime, "%Y-%m-%d %H:%M:%S")

                Y, M, D, H = urlTime[0:4]
                urlTime2 = datetime.datetime(Y, M, D, H)
                urlTime2 = time.mktime(urlTime2.timetuple())
                # convert to timestamps (float) for comparison
                startTime = endTime - datetime.timedelta(days=2)
                startTimeIntSecond = time.mktime(startTime.timetuple())
                endTimeIntSecond = time.mktime(endTime.timetuple())

                # crawl only if the publish time falls within the range
                if urlTime2 >= startTimeIntSecond and urlTime2 <= endTimeIntSecond:
                    article_url = title.find('a').attrs['href']
                    self.logger.debug(article_url)
                    (content, author_name, tid, like_count) = self.crawlArticle(article_url)
                    if content is None:  # the topic has been deleted or uses another layout
                        continue

                    article_title = title.find('td', attrs={'class': "td-subject"}).text
                    article = Article(tid,  self.channel.channel_id, article_title, content, article_publishtime, article_url, None,
                                      author_name)
                    self.crawlComment(article)
                    reply_count = title.find('td', attrs={'class': "td-reply"}).text.strip()
                    reply_count = re.sub(u'回应', '', reply_count)  # strip the Chinese word for 'replies', keeping only the digits
                    article.statistics.reply_count = reply_count
                    article.statistics.like_count = like_count
                    if article not in articleList:
                        articleList.append(article)

                else:
                    print len(articleList)
                    hasnext = False
                    break

            return (articleList, hasnext)
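
A standalone sketch of the time-window check used above, with made-up values for the publish time and the end of the search range:

import time
import datetime

article_publishtime = '2018-07-01 12:30:00'  # placeholder publish time
urlTime = time.strptime(article_publishtime, "%Y-%m-%d %H:%M:%S")
Y, M, D, H = urlTime[0:4]
urlTime2 = time.mktime(datetime.datetime(Y, M, D, H).timetuple())  # publish time rounded to the hour, as a float timestamp
endTime = datetime.datetime(2018, 7, 2)  # placeholder end of the window
startTime = endTime - datetime.timedelta(days=2)
in_range = time.mktime(startTime.timetuple()) <= urlTime2 <= time.mktime(endTime.timetuple())  # True here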
Example #4
    def crawlArticle(self, url, **kwargs):
        '''
        Crawl the article content and statistics for the given url
        @return: an Article instance
        '''
        # TBD: reposts are not handled for now
        searchTitle = ''
        referer = None
        if (kwargs['flag'] == 0):
            if 'referer' not in kwargs:
                return None
            if 'title' in kwargs:
                searchTitle = kwargs['title']
            else:
                searchTitle = ''
        try:
            self.session_public_article.randomSleep()
            if kwargs['flag'] == 0:
                referer = kwargs['referer']
                headers = HEADERS_ARTICLE.copy()
                headers['Referer'] = referer
                response = self.session_public_article.get(
                    url, textRspOnly=False, headers=headers)
            else:
                response = self.session_public_article.get(
                    url, textRspOnly=False, headers=HEADER_PUBLIC_ARTICLE)
            mid = re.findall(r'var mid = .*"(\d+)";',
                             response.text)[0] + '-' + re.findall(
                                 r'var idx = .*"(\d+)";', response.text)[0]
            soup = BeautifulSoup(response.text, 'lxml')
            main = soup.find('div', {'id': "img-content"})
            title = main.find('h2').text.strip()
            content = main.find('div', {'id': "js_content"}).text.strip()
            profile = main.find('div', {'class': "profile_inner"})
            author_id = profile.find('span').text.strip()
            author_name = profile.find('strong').text.strip()
            publish_datetime = kwargs['publish_datetime']

            article = Article(mid,
                              self.channel.channel_id,
                              title,
                              content,
                              publish_datetime=publish_datetime,
                              url=url,
                              author_id=author_id,
                              author_name=author_name,
                              meta_info='{refer="%s"}' % referer)
            self.logger.debug('Successfully parse article:%s', title)
            return article
        except:
            self.logger.error('Fail to get article for %s: %s due to %s', url,
                              searchTitle, traceback.format_exc())
            return None
Example #5
    def __fetchOldArticleList(self, articleList, articleCount=100):
        '''
        From the global article table, fetch the ids of articles that are not yet extinct
        and that are not among the records brought back by this crawl
        '''
        # used to query the total count (for paging)
        selectSql_count = 'SELECT COUNT(*) FROM %s where extinct="N" and channel_id=%d '
        sql2 = selectSql_count % (Constants.TABLE_SA_ARTICLE,
                                  self.channel.channel_id)
        # sql for fetching the old articles
        selectSql = 'SELECT TID,title, publish_datetime,url, meta_info FROM %s where extinct="N" and channel_id=%d '
        sql = selectSql % (Constants.TABLE_SA_ARTICLE, self.channel.channel_id)

        if len(articleList) > 0:
            whereClauseList = map(
                lambda article: ' tid<>"%s" ' % (article.tid), articleList)
            sql += ' and (%s)' % (' and '.join(whereClauseList))
            sql2 += ' and (%s)' % (' and '.join(whereClauseList))

        sql2 += ' order by add_datetime desc'

        self.dbProxy.execute(sql2)
        resultList2 = self.dbProxy.fetchall()
        # print '12456789sssssssssssssssssss'
        # print resultList2 #((53,),)
        resultList2 = re.findall(r'\d+', str(resultList2))  # returns a list
        # print resultList2[0]
        if int(resultList2[0]) > int(articleCount):
            randpage = random.randint(
                0, int(math.ceil(float(resultList2[0]) / articleCount)))
        else:
            randpage = 0  # randpage picks a random page from the database

        sql += ' order by add_datetime desc limit %d,%d' % (randpage,
                                                            articleCount)
        self.dbProxy.execute(sql)
        resultList = self.dbProxy.fetchall()

        return map(
            lambda item: Article(item[0],
                                 self.channel.channel_id,
                                 title=item[1],
                                 publish_datetime=item[2],
                                 url=item[3],
                                 meta_info=item[4]), resultList)
Example #6
    def __filterRemovedArticle(self, articleList, entityId, eventId=None):
        '''
        Filter articles by comparing against the remove table
        Return the list of articles that are not in the remove table
        '''
        if len(articleList) == 0:
            return []
        if eventId is not None:
            tableName = Constants.TABLE_SA_EVENT_ARTICLE_REMOVE + Constants.TABLE_NAME_DELIMITER + entityId
            eventCondition = ' event_id=%d and ' % eventId

            start_datetime, end_datetime = self.__fetchEventTime(
                entityId, eventId)

            # Filter out articles whose publish time is not between the event's start and end time
            article_new_list = list()
            for article in articleList:
                if (str(article.publish_datetime) > str(start_datetime)) and (
                        str(article.publish_datetime) < str(end_datetime)):
                    article_new_list.append(article)

            articleList = article_new_list

        else:
            tableName = Constants.TABLE_SA_ARTICLE_REMOVE + Constants.TABLE_NAME_DELIMITER + entityId
            eventCondition = ''
        # Look up the articles in the remove table
        selectSql = '''
            SELECT TID, CHANNEL_ID FROM %s where %s (%s)
        '''
        whereClauseList = map(
            lambda article: '(TID="%s" and CHANNEL_ID=%d)' %
            (article.tid, article.channel_id), articleList)

        self.dbProxy.execute(
            selectSql %
            (tableName, eventCondition, ' or '.join(whereClauseList)))
        resultList = self.dbProxy.fetchall()  # result set of the query
        removedArticleList = map(lambda x: Article(x[0], x[1]), resultList)
        filteredArticle = filter(lambda x: x not in removedArticleList,
                                 articleList)

        return filteredArticle
Example #7
    def __seperateNewOldArticles(self, articleList, entityId=None):
        '''
        Query the global article table and separate new articles from existing ones
        '''
        if len(articleList) == 0:
            return ([], [])
        if entityId is None:
            selectSql = 'select tid, channel_id from %s where ' % Constants.TABLE_SA_ARTICLE
        else:
            selectSql = 'select tid, channel_id from %s where ' % (
                Constants.TABLE_SA_ARTICLE + Constants.TABLE_NAME_DELIMITER +
                entityId)
        whereClauseList = map(
            lambda article: '(tid="%s" and channel_id=%d)' %
            (article.tid, article.channel_id), articleList)
        self.dbProxy.execute(selectSql + ' or '.join(whereClauseList))
        resultList = map(lambda x: Article(x[0], x[1]),
                         self.dbProxy.fetchall())

        existingArticleList = filter(lambda x: x in resultList, articleList)
        newArticleList = filter(lambda x: x not in resultList, articleList)
        return (existingArticleList, newArticleList)
Example #8
    def crawlArticle(self, url):
        '''
        Crawl the article content and statistics for the given url
        :param url:
        :return: an Article instance
        '''
        cookies = None
        html = self.session.download(url=url,
                                     encoding='utf-8',
                                     data=None,
                                     timeout=10,
                                     retry=3,
                                     addr=True,
                                     cookies=cookies)
        if html:
            article_url = html['url']
            article_url = re.findall(
                r'.*?\.html|.*?\.htm|.*?\.shtml|.*?\.shtm', article_url)[0]
            article_id = re.findall(r'c_\d+', article_url)[0]
            article_id = article_id[2:]
            soup = BeautifulSoup(html['html'], "html.parser")
            main1 = soup.find('div', attrs={'class': "widthMain main"})
            main2 = soup.find('div', attrs={'class': "main pagewidth"})
            main3 = soup.find('body', attrs={'class': "streamline-page"})
            main4 = soup.find('div', attrs={'class': "h-title"})
            main5 = soup.find('div', attrs={'id': "article"})
            main6 = soup.find('div', attrs={'id': "Title"})
            main7 = soup.find('div', attrs={'class': "article"})

            if main1 is not None:
                self.logger.debug("main1")
                Ttitle, Ttime, Tauthor, Tcontent = self.crawlMain1(main1)
                if (Ttitle != None and Ttime != None and Tcontent != None):
                    article = Article(tid=article_id,
                                      channel_id=self.channel.channel_id,
                                      title=Ttitle,
                                      author_name=Tauthor,
                                      publish_datetime=Ttime,
                                      content=Tcontent,
                                      url=article_url)
                    return article

            if main2 is not None:
                self.logger.debug("main2")
                Ttitle, Ttime, Tauthor, Tcontent = self.crawlMain2(main2)
                if (Ttitle != None and Ttime != None and Tcontent != None):
                    article = Article(tid=article_id,
                                      channel_id=self.channel.channel_id,
                                      title=Ttitle,
                                      author_name=Tauthor,
                                      publish_datetime=Ttime,
                                      content=Tcontent,
                                      url=article_url)
                    return article

            if main3 is not None:
                self.logger.debug("main3")
                Ttitle, Ttime, Tauthor, Tcontent = self.crawlMain3(main3)
                if (Ttitle != None and Ttime != None and Tcontent != None):
                    article = Article(tid=article_id,
                                      channel_id=self.channel.channel_id,
                                      title=Ttitle,
                                      author_name=Tauthor,
                                      publish_datetime=Ttime,
                                      content=Tcontent,
                                      url=article_url)
                    return article

            if main3 is None and main4 is not None:
                self.logger.debug("main4")
                Ttitle, Ttime, Tauthor, Tcontent = self.crawlMain4(soup)
                if (Ttitle != None and Ttime != None and Tcontent != None):
                    article = Article(tid=article_id,
                                      channel_id=self.channel.channel_id,
                                      title=Ttitle,
                                      author_name=Tauthor,
                                      publish_datetime=Ttime,
                                      content=Tcontent,
                                      url=article_url)
                    return article

            if main5 is not None:
                self.logger.debug("main5")
                Ttitle, Ttime, Tauthor, Tcontent = self.crawlMain5(main5)
                if (Ttitle != None and Ttime != None and Tcontent != None):
                    article = Article(tid=article_id,
                                      channel_id=self.channel.channel_id,
                                      title=Ttitle,
                                      author_name=Tauthor,
                                      publish_datetime=Ttime,
                                      content=Tcontent,
                                      url=article_url)
                    return article

            if main6 is not None:
                self.logger.debug("main6")
                Ttitle, Ttime, Tauthor, Tcontent = self.crawlMain6(soup)
                if (Ttitle != None and Ttime != None and Tcontent != None):
                    article = Article(tid=article_id,
                                      channel_id=self.channel.channel_id,
                                      title=Ttitle,
                                      author_name=Tauthor,
                                      publish_datetime=Ttime,
                                      content=Tcontent,
                                      url=article_url)
                    return article

            if main7 is not None:
                self.logger.debug("main7")
                Ttitle, Ttime, Tauthor, Tcontent = self.crawlMain7(soup)
                if (Ttitle != None and Ttime != None and Tcontent != None):
                    article = Article(tid=article_id,
                                      channel_id=self.channel.channel_id,
                                      title=Ttitle,
                                      author_name=Tauthor,
                                      publish_datetime=Ttime,
                                      content=Tcontent,
                                      url=article_url)
                    return article

            if (main1 is None and main2 is None and main3 is None
                    and main4 is None and main5 is None and main6 is None
                    and main7 is None):
                self.logger.error(u"存在另外一种html格式::%s", url)
                return
Example #9
    def crawlArticle(self, url):
        '''
        Crawl the article content and statistics for the given url
        @return: an Article instance
        '''
        article = None
        html = self.session.download(url,
                                     encoding='utf-8',
                                     data=None,
                                     isJson=False,
                                     timeout=10,
                                     retry=3,
                                     addr=True)
        article_url = html['url']

        # if article_url.find(self.channel.url)<0:
        #     self.logger.warn('Unrelated url found:%s',url)
        #     continue

        # self.logger.debug(article_url)
        soup = BeautifulSoup(html['html'], 'html.parser')

        noResultDivList = soup.findAll('div', {'class': 'pl_noresult'})
        if len(noResultDivList) > 0:
            hasnext = False
            self.logger.info('No result')
            return article

        main = soup.find('div', attrs={'class': "left_section"})

        if main:
            Ttitle = main.find('div', attrs={
                'id': "j_core_title_wrap"
            }).find('h1')
            Ttitle1 = main.find('div', attrs={
                'id': "j_core_title_wrap"
            }).find('h3')
            if Ttitle:
                Ttitle = Ttitle.text.strip()
            elif Ttitle1:
                Ttitle = Ttitle1.text.strip()
            else:
                Ttitle = ''

            # self.logger.debug(u'标题%s',Ttitle)

            data_field = main.find('div', attrs={
                'id': "j_p_postlist"
            }).find('div').attrs['data-field'].strip()
            data_field = json.loads(data_field)
            publish_datetime = data_field['content']
            if 'date' in publish_datetime.keys():
                publish_datetime = publish_datetime['date']
            else:
                publish_datetime = main.find('div',
                                             attrs={
                                                 'id': "j_p_postlist"
                                             }).find('div').find_all(
                                                 'span',
                                                 attrs={'class': "tail-info"
                                                        })[-1].text.strip()

            publish_datetime = re.findall(
                r'(\d{4})\D(\d{2})\D(\d{2})\D(\d{2}:\d{2}(:\d{2})?)',
                publish_datetime)[0]

            if len(publish_datetime[4]) > 1:
                publish_datetime = publish_datetime[
                    0] + '-' + publish_datetime[1] + '-' + publish_datetime[
                        2] + ' ' + publish_datetime[3] + publish_datetime[4]
            else:
                publish_datetime = publish_datetime[
                    0] + '-' + publish_datetime[1] + '-' + publish_datetime[
                        2] + ' ' + publish_datetime[3] + ':00'

            Tid = data_field['author']['user_id']
            Tauthor = data_field['author']['user_name']
            Treply = soup.find('li', attrs={
                'class': "l_reply_num"
            }).find('span').text.strip()  # total reply count
            Tcontent = main.find('div', attrs={
                'id': "j_p_postlist"
            }).find('div').find('cc').text.strip()

            article = Article(Tid,
                              self.channel.channel_id,
                              Ttitle,
                              Tcontent,
                              publish_datetime,
                              url=article_url,
                              author_id=None,
                              author_name=Tauthor,
                              meta_info=None)

            article.statistics.reply_count = Treply

        else:
            self.logger.warn(u'很抱歉,该贴已被删除。%s', article_url)

        return article
Example #10
    def crawlArticle(self, url):
        '''
        Crawl the article content and statistics for the given url
        @return: an Article instance
        '''
        html = self.session.download(url,
                                     encoding='utf-8',
                                     data=None,
                                     isJson=False,
                                     timeout=10,
                                     retry=3,
                                     addr=True)
        article_url = html['url']
        self.logger.debug(article_url)
        soup = BeautifulSoup(html['html'], 'html.parser')

        main = soup.find('div', attrs={'id': "bd"})
        main1 = soup.find('div', attrs={
            'class': "wd-question"
        })  # forum Q&A post, e.g. http://bbs.tianya.cn/post-730-5795-1-1.shtml
        article = None
        if main:
            Ttitle = main.find('h1').find('span').text
            Ttime = main.find('div', attrs={
                'class': "atl-info"
            }).find_all('span')[1].text.strip()
            Ttime = re.sub(u'[\u4e00-\u9fa5]+:', '', Ttime)
            Tid = main.find('div', attrs={
                'class': "atl-info"
            }).find_all('span')[0].find('a').attrs['uid'].strip()
            Tauthor = main.find('div', attrs={
                'class': "atl-info"
            }).find_all('span')[0].find('a').attrs['uname'].strip()
            Tclick = main.find('div', attrs={
                'class': "atl-info"
            }).find_all('span')[2].text.strip()
            Tclick = re.sub(u'[\u4e00-\u9fa5]+:', '', Tclick)
            Tclick = int(Tclick)
            Treply = main.find('div', attrs={
                'class': "atl-info"
            }).find_all('span')[3].text.strip()
            Treply = re.sub(u'[\u4e00-\u9fa5]+:', '', Treply)
            Treply = int(Treply)
            Tlike = main.find('a', attrs={'class': "zantop"}).attrs['_count']
            Tcontent = main.find('div',
                                 attrs={
                                     'class': "bbs-content clearfix"
                                 }).text.strip()

            article = Article(Tid,
                              self.channel.channel_id,
                              Ttitle,
                              Tcontent,
                              Ttime,
                              url=article_url,
                              author_id=None,
                              author_name=Tauthor)

            article.statistics.reply_count = Treply
            article.statistics.click_count = Tclick
            article.statistics.like_count = Tlike

        elif main1:
            Ttitle = main1.find('h1').find('span').text
            Ttime = main1.find('div').attrs['js_replytime']
            Ttime = re.findall(r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})',
                               Ttime)[0]
            Tid = main1.find('div').attrs['_host']
            Tauthor = main1.find('div', attrs={
                'class': "q-info"
            }).find('a').text
            Tclick = main1.find('div').attrs['js_clickcount']
            Treply = main1.find('div').attrs['js_powerreply']
            Tcontent = main1.find('div', attrs={'class': "q-content atl-item"})
            if Tcontent:
                Tcontent = Tcontent.find('div', attrs={
                    'class': "text"
                }).text.strip()
            else:
                Tcontent = ''

            article = Article(Tid,
                              self.channel.channel_id,
                              Ttitle,
                              Tcontent,
                              Ttime,
                              url=article_url,
                              author_id=None,
                              author_name=Tauthor)

            article.statistics.reply_count = Treply
            article.statistics.click_count = Tclick
        return article
Example #11
    def crawlArticle(self, req_data):

        hasnext = False

        req_url = 'http://120.55.151.61/Treehole/V4/Cave/getList.action'
        data_r = self.s.post(url=req_url, data=req_data, headers=self.headers)
        data_r = data_r.text

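        # The response body is JSON-like text; true/false are defined so that eval()
        # can turn it into Python objects before it is re-serialised and parsed below.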
        true = True
        false = False

        data_j = eval(data_r)
        data_js = json.dumps(data_j)
        data_dict = simplejson.loads(data_js)

        data = data_dict['data']
        timestamp = data['timestampLong']
        timestampLong = data['timestampLong'] / 1000
        timestampLong_struct = datetime.datetime.fromtimestamp(timestampLong)
        timestampLong_str = timestampLong_struct.strftime('%Y-%m-%d %H:%M:%S')
        messageBO = data['messageBOs']

        for each in messageBO:  # 20 records come back per batch
            if 'studentBO' in each:
                content = each.get('content', False)
                if content:
                    publish_timeStamp = each['issueTime'] / 1000
                    datetime_struct = datetime.datetime.fromtimestamp(
                        publish_timeStamp)
                    publish_time = datetime_struct.strftime(
                        '%Y-%m-%d %H:%M:%S')
                    if datetime.datetime.strptime(
                            publish_time,
                            '%Y-%m-%d %H:%M:%S') < self.startTime:
                        hasnext = False
                        break

                    schoolName = each['schoolName']
                    readCount = each['readCount']
                    relayedCount = each['relayedCount']
                    reply_count = each['comments']
                    content = each['content']
                    if len(content) > 8:
                        title = content[:8] + '...'
                    elif len(content) <= 0:
                        title = u'内容为图片'
                    else:
                        title = content

                    studentBO = each['studentBO']
                    studentBO_id = studentBO['studentId']
                    studentBO_name = studentBO['nickName']
                    likeCount = each['likeCount']
                    article_id = each['messageId']
                    publish_method = each['source']
                    publish_method = publish_method[publish_method.find('_') +
                                                    1:]

                    data2 = 'platform=1&phoneBrand=HONOR&phoneVersion=24&versionNumber=9.1.1&phoneModel=BLN-AL10&sharePlatform=4&channel=huaweiMarket&plateId=1&messageId=' + str(
                        article_id) + '&'
                    article_urlPOST = 'http://120.55.151.61/Treehole/V4/Message/preShare.action'
                    data_urlPOST = self.s.post(url=article_urlPOST,
                                               data=data2,
                                               headers=self.headers)

                    data3 = data_urlPOST.text
                    data_j3 = eval(data3)
                    data_js3 = json.dumps(data_j3)
                    data_dict3 = simplejson.loads(data_js3)

                    data4 = data_dict3['data']

                    article_url = data4['shareUrl']

                    article = Article(article_id,
                                      self.channel.channel_id,
                                      title,
                                      content,
                                      publish_time,
                                      article_url,
                                      author_id=studentBO_id,
                                      author_name=studentBO_name,
                                      publish_method=publish_method,
                                      entity=schoolName)
                    article.statistics.reply_count = reply_count
                    article.statistics.like_count = likeCount
                    article.statistics.read_count = readCount
                    article.statistics.forward_count = relayedCount
                    # print article.__str__()

                    if article is not None and article not in self.articleList:
                        self.articleList.append(article)

        # print datetime.datetime.strptime(timestampLong_str,'%Y-%m-%d %H:%M:%S')
        # print '1111111'
        if datetime.datetime.strptime(timestampLong_str,
                                      '%Y-%m-%d %H:%M:%S') < self.startTime:
            hasnext = False
            return ('', hasnext)
        else:
            hasnext = True
            res_data = 'timestamp=' + str(
                timestamp
            ) + '&preMoodTimestap=' + str(
                timestamp
            ) + '&platform=1&phoneBrand=HONOR&phoneVersion=24&versionNumber=9.1.1&phoneModel=BLN-AL10&type=1&channel=huaweiMarket&'
            return (res_data, hasnext)
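
A hedged sketch of how the (res_data, hasnext) tuple returned above could drive a paging loop; the crawler instance and the first request payload are assumptions, not part of the original code:

def crawlAllPages(crawler, first_req_data):
    # keep requesting the next page until crawlArticle reports nothing newer than startTime
    req_data, hasnext = first_req_data, True
    while hasnext:
        req_data, hasnext = crawler.crawlArticle(req_data)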
Example #12
    def __searchByPage(self, keywordList, startTime, endTime, page):
        # Search via Sina blog's internal search
        page = str(page)  # the url expects a str
        query = urllib.quote(' '.join(keywordList).encode('utf-8'))
        params = {
            'c': 'blog',
            'range': 'article',  # include everything
            'by': 'all',
            'sort': 'time',
            'col': '',
            'source': '',
            'from': '',
            'country': '',
            'size': '',
            'time': '',
            'a': '',
            'isown': 'false',
            'page': page,
            'dpc': '',
            'q': query
        }
        search_url = "http://search.sina.com.cn/"

        url_page = self.session.get(search_url,
                                    params=params,
                                    headers=self.headers)
        soup = BeautifulSoup(url_page, "lxml")
        # self.logger.debug(soup)
        main_wrap = soup.find('div', attrs={'class': "result-boxes"})

        if main_wrap is None:  # guard against a miss caused by a transient network problem
            self.logger.debug(u'第一次查找没有结果再找一遍中')
            url_page = self.session.get(search_url,
                                        params=params,
                                        headers=self.headers)
            soup = BeautifulSoup(url_page, "lxml")
            main_wrap = soup.find('div', attrs={'class': "result-boxes"})
            if main_wrap is None:  # only if the retry also finds nothing do we treat it as no result
                self.logger.debug(u'第二次查找没有结果,该关键词没有结果')
                return (list(), False)

        main_article_list = main_wrap.findAll(
            'div', attrs={'class': "box-result clearfix"})

        hasnext = True
        if main_article_list is not None:
            print '1'
            for title in main_article_list:
                print '2'
                article_publishtime_wrap = title.findAll('p')[1]
                article_publishtime = article_publishtime_wrap.find(
                    'span', attrs={
                        'class': "fgray_time"
                    }).text.strip()
                self.logger.error(article_publishtime)
                urlTime = time.strptime(article_publishtime,
                                        "%Y-%m-%d %H:%M:%S")

                Y, M, D, H = urlTime[0:4]
                urlTime2 = datetime.datetime(Y, M, D, H)
                urlTime2 = time.mktime(urlTime2.timetuple())
                # convert to timestamps (float) for comparison
                startTime = endTime - datetime.timedelta(days=5)
                startTimeIntSecond = time.mktime(startTime.timetuple())
                endTimeIntSecond = time.mktime(endTime.timetuple())

                # crawl only if the publish time falls within the range
                if urlTime2 >= startTimeIntSecond and urlTime2 <= endTimeIntSecond:
                    print '3'
                    title_wrap = title.find('h2')
                    article_url_parent = title_wrap.find('a').attrs['href']
                    article_url_index = article_url_parent.rfind('/')
                    article_url_child = article_url_parent[article_url_index +
                                                           1:]
                    article_url = 'http://blog.sina.com.cn/s/blog_' + article_url_child + '.html'
                    Ttitle = title_wrap.find('a').text.strip()
                    self.logger.debug(article_url)
                    meta_info = article_url_child
                    (content, reply_count, like_count, read_count,
                     collect_count, forward_count) = self.crawlArticle(
                         article_url, meta_info)
                    self.logger.debug(like_count)

                    if content is None:  # the topic has been deleted or uses another layout
                        print '756236'
                        continue
                    tid = 't_' + article_url_child
                    author_name = article_publishtime_wrap.find(
                        'a', attrs={
                            'class': "rib-author"
                        }).text
                    article = Article(tid,
                                      self.channel.channel_id,
                                      Ttitle,
                                      content,
                                      article_publishtime,
                                      article_url,
                                      None,
                                      author_name,
                                      meta_info=meta_info)

                    # self.crawlComment(article)

                    article.statistics.reply_count = reply_count
                    article.statistics.like_count = like_count
                    article.statistics.read_count = read_count
                    article.statistics.collect_count = collect_count
                    article.statistics.forward_count = forward_count
                    # self.logger.debug(article)
                    if article not in self.articleList:
                        self.articleList.append(article)

                else:
                    print '78956'
                    print len(self.articleList)
                    hasnext = False
                    break

            return hasnext
Example #13
class JianShuCrawler(object):
    def __init__(self, channel, logger=None):
        '''

        :param channel:
        :param logger:
        '''
        if logger is None:
            self.logger = Logging.getLogger(Logging.LOGGER_NAME_DEFAULT)
        else:
            self.logger = logger
        self.channel = channel  # the media/channel entity
        # set request headers and proxy
        self.headers = {
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding':
            'gzip, deflate, br',
            'Accept-Language':
            'zh-CN,zh;q=0.9',
            'Host':
            'www.jianshu.com',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
        }
        self.headers_1 = {
            'Host': 'www.jianshu.com',
            'Connection': 'keep-alive',
            'Connection-Length': '0',
            'Accept': 'application/json',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Accept-Encoding': 'gzip,deflate,br',
            'Accept-Language': 'zh-CN,zh;q=0.9'
        }
        # user credentials
        self.usename = '13432881156'
        self.password = '******'
        # log in with a cookie
        self.session = SessionCrawler()
        self.session.cookies = cookielib.LWPCookieJar(
            filename='cookie')  # load the cookie from file
        try:
            self.session.cookies.load(ignore_discard=True)
        except:
            print('未能加载cookie')
        if self.islogin():
            print('已经登录简书')
        else:
            self.login(self.usename, self.password)

    def searchArticle(self, keywordList, endTime):
        '''
            Search for articles matching a list of keywords within a time range
            :param keywordList: list of keywords
            :param endTime: end of the search time range
            :return:
        '''
        if self.islogin():
            page = 0  # page counter
            articleList = list()
            hasnext = True  # start crawling
            while hasnext:
                page += 1
                if page == 5:
                    break
                (articleListInPage,
                 hasnext) = self._searchByPage(keywordList, endTime, page)
                is_article = list()
                for article in articleListInPage:
                    if article not in articleList:
                        is_article.append(article)
                articleList.extend(is_article)
                print(len(articleList))

            self.logger.debug('总共抓取文章有:%s' % len(articleList))
            return articleList
        else:
            self.logger.error(u'未登录简书,请确认登录')
            return None

    def _searchByPage(self, keywordList, endTime, page):
        hasnext = True
        articleList = list()
        page = str(page)  # search via Jianshu's internal search box
        search_url_1 = "https://www.jianshu.com/search/do?q="
        search_url_2 = "&type=note&page=" + page + "&order_by=published_at&time_range=a_day"
        # q: keyword query  page: page number  order_by: sort field  time_range: restrict to the past day
        query = urllib.quote(' '.join(keywordList).encode('utf-8'))
        search_url = search_url_1 + str(query) + search_url_2  # url of the page to crawl
        csrf = self.getCsrf(keywordList, page)
        post_data = {'X-CSRF-Token': csrf}
        # fetch the page
        url_page = self.session.post(search_url,
                                     headers=self.headers_1,
                                     data=post_data,
                                     textRspOnly=False)
        if url_page.status_code == 200:
            self.logger.debug(u'已经获取中大新闻页面')
        else:
            self.logger.debug(u'中大新闻获取完毕')
            return (articleList, False)  # the last page of results has been reached

        # parse the returned result page below
        articleJson = None
        try:
            articleJson = json.loads(url_page.text)
        except Exception, e:
            self.logger.error(e)

        if articleJson is None:
            self.logger.error('articleJson is None')
            return

        allInfo = articleJson['entries']  # allInfo is the list of entry dicts: 0:{}, 1:{}, ...
        for info in allInfo:
            pre_publishedTime = info["first_shared_at"]
            publishedTime_1 = ''.join(
                re.findall('[^A-Za-z]', pre_publishedTime[0:-5]))  # article publish time
            publishedTime = publishedTime_1[0:10] + ' ' + publishedTime_1[10:18]
            #print(publishedTime)
            urlTime = time.strptime(publishedTime, '%Y-%m-%d %H:%M:%S')
            Y, M, D, H = urlTime[0:4]
            urlTime2 = datetime.datetime(Y, M, D, H)
            # convert to a timestamp (float) for comparison
            urlTime2 = time.mktime(urlTime2.timetuple())  # timestamp of the publish time
            startTime = endTime - datetime.timedelta(
                days=self.channel.search_ranges)
            startTimeIntSecond = time.mktime(startTime.timetuple())  # timestamp of the range start
            endTimeIntSecond = time.mktime(endTime.timetuple())  # timestamp of the range end
            if urlTime2 >= startTimeIntSecond and urlTime2 <= endTimeIntSecond:
                # get the article link and crawl its content
                pre_article_url = info['slug']
                articleUrl = "https://www.jianshu.com/p/" + pre_article_url  # link to the article
                (content, authorName,
                 title) = self.crawlArticle(articleUrl)  # crawl the article content
                #print(title)
                if content is None:  # the article has been deleted or uses another layout
                    print("没有爬到文章")
                    continue
                authorId = info["user"]["id"]  # author id
                likeCount = info["likes_count"]  # like count
                readCount = info["views_count"]  # read count
                replyCount = info["public_comments_count"]  # comment count
                tid = info["id"]  # article id
                article = Article(tid, self.channel.channel_id, title, content,
                                  publishedTime, articleUrl, authorId,
                                  authorName)
                #self.crawlComment(article)
                article.statistics.reply_count = replyCount
                article.statistics.like_count = likeCount
                article.statistics.read_count = readCount
                if (article is not None) and (article not in articleList):
                    articleList.append(article)
            else:
                print('结束爬虫')
                hasnext = False
                break
        return (articleList, hasnext)
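
A standalone illustration of the first_shared_at normalisation used in _searchByPage above; the raw value below is an assumed example of the ISO-style timestamp the slicing implies:

import re

pre_publishedTime = '2018-07-20T10:30:00.000+08:00'  # assumed raw value
publishedTime_1 = ''.join(re.findall('[^A-Za-z]', pre_publishedTime[0:-5]))  # drop the last 5 chars and every letter
publishedTime = publishedTime_1[0:10] + ' ' + publishedTime_1[10:18]  # -> '2018-07-20 10:30:00'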
Example #14
    def crawlArticle(self, url):
        '''
        Crawl the article content and statistics for the given url
        @return: an Article instance
        '''
        cookies = 'U_TRS1=000000fa.9fe376b4.58573ebc.bde2f2c3; UOR=,vip.stock.finance.sina.com.cn,; vjuids=3923fcfb8.15914cd122a.0.e347599b65a6; SINAGLOBAL=183.63.92.250_1482112700.861930; SUB=_2AkMvC7H0f8NhqwJRmP4WzWzrb4xwzgnEieLBAH7sJRMyHRl-yD83qlNetRBAqqE4nv4pjjxQaUfLZo_Os-Bxsw..; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9WFZzJ6nbHTRfVEqOXp-S.5z; SGUID=1482721389362_efec0e8d; vjlast=1488765553.1489054965.10; bdshare_firstime=1492414283526; _ct_uid=58f46f61.537a7929; lxlrtst=1492423120_o; rotatecount=2; Apache=59.42.29.149_1492670298.869113; ULV=1492670299361:18:6:6:59.42.29.149_1492670298.869113:1492670298484; afpCT=1; CNZZDATA1252916811=1442218969-1492654141-http%253A%252F%252Fnews.sina.com.cn%252F%7C1492664941; UM_distinctid=15b8a154522e79-0a3f79bddc9d05-4e45042e-100200-15b8a154523a49; CNZZDATA5399792=cnzz_eid%3D349789736-1492650802-http%253A%252F%252Fnews.sina.com.cn%252F%26ntime%3D1492667002; U_TRS2=00000095.1c285e96.58f85761.e07aa962; lxlrttp=1492423120'
        html = self.session.download(url, encoding='utf-8', data=None, timeout=10, retry=3, addr=True, cookies=cookies)
        if html:
            article_url = html['url']
            article_url = re.findall(r'.*?\.html|.*?\.htm|.*?\.shtml|.*?\.shtm', article_url)[0]
            self.logger.info('[SinaNews]'+article_url)
            # get the publish date from the url
            date = re.findall(r'/(\d{4}-\d{2}-\d{2})/', article_url)
            if len(date) == 0:
                return None
            # if date[0] < '2015-07-01':
            #     html = self.session.download(url, encoding='gbk', data=None, timeout=10, retry=3, addr=True)

            soup = BeautifulSoup(html['html'], 'lxml')
            main = soup.find('div', attrs={'class': "wrap-inner"})
            main1 = soup.find('div', attrs={'class': "Main clearfix"})
            main2 = soup.find('div', attrs ={'class': "main-content w1240"})

            # first page layout
            if main is not None:
                self.logger.debug('走第一种格式')
                # get the title
                Ttitle = main.find('h1', attrs={'id': "artibodyTitle"})
                if Ttitle is None:
                    self.logger.error('[SinaNews]' + '缺少标题,无法构成文章,可能已被修改格式,本文停止爬取')
                    return
                else:
                    Ttitle = Ttitle.text.strip()

                # get the publish time
                Ttime = main.find('span', attrs={'class': 'time-source'})
                if Ttime is not None:
                    Ttime = Ttime.text.strip()
                    Ttime = re.findall(r'(\d{4})\D(\d{2})\D(\d{2})\D(\d{2}:\d{2}).*', Ttime)[0]
                    Ttime = Ttime[0] + '-' + Ttime[1] + '-' + Ttime[2] + ' ' + Ttime[3]
                else:
                    self.logger.error('[SinaNews]' + '缺少文章发布时间,无法构成文章,可能已被修改格式,本文停止爬取')
                    return
                if len(Ttime) == 16:
                    Ttime = Ttime + ':00'

                # get the author
                Tauthor = soup.find('span', attrs={'class': "time-source"})
                if Tauthor is not None:
                    Tauthor = Tauthor.find('a')
                    if Tauthor is not None:
                        Tauthor = Tauthor.text.strip()
                    else:
                        Tauthor = None
                else:
                    Tauthor = None

                # get the content
                Tcontent = main.find('div', attrs={'id': "artibody"})
                if Tcontent is not None:
                    Tcontent = Tcontent.text.strip()
                    Tcontent = re.sub(r'\n|\t', '', Tcontent)
                else:
                    self.logger.error('[SinaNews]' + '缺少文章内容,无法构成文章,可能已被修改格式,本文停止爬取')
                    return

                # get the comment count
                try:
                    channel = re.findall(r"channel: '(.*)',", html['html'])[0]
                    newsid = re.findall(r"newsid: '(.*)',", html['html'])[0]
                    data = {
                        'format': 'js',
                        'channel': channel,
                        'newsid': newsid,
                        'group': '',
                        'compress': '1',
                        'ie': 'gbk',
                        'oe': 'gbk',
                        'page': '1',
                        'page_size': '20'
                    }
                    re_url = 'http://comment5.news.sina.com.cn/page/info'
                    html1 = self.session.download(re_url, encoding='utf-8', data=data, isJson=False, timeout=10, retry=3, addr=False)
                    html1 = re.sub(r'(.*=)\{', '{', html1)
                    html1 = json.loads(html1)
                    totalcount = html1['result']['count']['show']
                    Treply = totalcount
                except:
                    Treply = None

                # get the article id
                articleid = re.findall(r'([a-z]{8}\d{7})', article_url)[0]

                article = Article(tid=articleid, channel_id=self.channel.channel_id, title=Ttitle, publish_datetime=Ttime, content=Tcontent, url=article_url, author_name=Tauthor)
                article.statistics.reply_count = Treply
                return article

            # second page layout
            elif main1 is not None:
                self.logger.debug('走第二种格式')
                # get the title
                Ttitle = main1.find('h1', attrs={'id': "artibodyTitle"})
                if Ttitle is None:
                    self.logger.error('[SinaNews]' + '缺少标题,无法构成文章,可能已被修改格式,本文停止爬取')
                    return
                else:
                    Ttitle = Ttitle.text.strip()

                # get the publish time
                Ttime = main1.find('span', attrs={'id': "pub_date"})
                if Ttime is not None:
                    Ttime = Ttime.text.strip()
                    Ttime = re.findall(r'(\d{4})\D(\d{2})\D(\d{2})\D(\d{2}:\d{2})', Ttime)[0]
                    Ttime = Ttime[0] + '-' + Ttime[1] + '-' + Ttime[2] + ' ' + Ttime[3]
                else:
                    self.logger.error('[SinaNews]' + '缺少文章发布时间,无法构成文章,可能已被修改格式,本文停止爬取')
                    return
                if len(Ttime) == 16:
                    Ttime = Ttime + ':00'

                # get the publisher
                Tauthor = main1.find('span', attrs={'id': "media_name"})
                if Tauthor is not None:
                    Tauthor = Tauthor.find('a').text.strip()
                else:
                    Tauthor = None

                # get the content
                Tcontent = main1.find('div', attrs={'id': "artibody"})
                if Tcontent is not None:
                    Tcontent = Tcontent.text.strip()
                    Tcontent = re.sub(r'\n|\t', '', Tcontent)
                else:
                    self.logger.error('[SinaNews]' + '缺少文章内容,无法构成文章,可能已被修改格式,本文停止爬取')
                    return

                try:
                    channel = re.findall(r"channel: '(.*)',", html['html'])[0]
                    newsid = re.findall(r"newsid: '(.*)',", html['html'])[0]
                    data = {
                        'format': 'js',
                        'channel': channel,
                        'newsid': newsid,
                        'group': '',
                        'compress': '1',
                        'ie': 'gbk',
                        'oe': 'gbk',
                        'page': '1',
                        'page_size': '20'
                    }
                    re_url = 'http://comment5.news.sina.com.cn/page/info'
                    html1 = self.session.download(re_url, encoding='utf-8', data=data, isJson=False, timeout=10, retry=3, addr=False)
                    html1 = re.sub(r'(.*=)\{', '{', html1)
                    html1 = json.loads(html1)
                    totalcount = html1['result']['count']['show']
                    Treply = totalcount
                except:
                    Treply = None

                # get the article id
                articleid = re.findall(r'([a-z]{8}\d{7})', article_url)[0]

                article = Article(tid=articleid, channel_id=self.channel.channel_id,title=Ttitle, content=Tcontent, publish_datetime=Ttime, url=article_url, author_name=Tauthor)
                article.statistics.reply_count = Treply
                return article

            # third page layout
            elif main2 is not None:
                self.logger.debug(u'第三种格式')
                # get the title
                Ttitle = main2.find('div', attrs={'class': "second-title"})

                if Ttitle is None:
                    self.logger.error('[SinaNews]' + '缺少标题,无法构成文章,可能已被修改格式,本文停止爬取')
                    return
                else:
                    Ttitle = Ttitle.text.strip()

                # get the publish time
                Ttime = main2.find('span', attrs={'class': "date"})
                if Ttime is not None:
                    Ttime = Ttime.text.strip()
                    Ttime = re.findall(r'(\d{4})\D(\d{2})\D(\d{2})\D ' '(\d{2}:\d{2}).*', Ttime)[0]
                    Ttime = Ttime[0] + '-' + Ttime[1] + '-' + Ttime[2] + ' ' + Ttime[3]
                else:
                    self.logger.error('[SinaNews]' + '缺少文章发布时间,无法构成文章,可能已被修改格式,本文停止爬取')
                    return
                if len(Ttime) == 16:
                    Ttime = Ttime + ':00'

                # get the publisher
                Tauthor = main2.find('a', attrs={'class': "source"})
                if Tauthor is not None:
                    Tauthor = Tauthor.text.strip()
                else:
                    Tauthor = None

                # get the content
                Tcontent = main2.find('div', attrs={'id': "article"})
                if Tcontent is not None:
                    Tcontent = Tcontent.text.strip()
                    Tcontent = re.sub(r'\n|\t', '', Tcontent)
                else:
                    self.logger.error('[SinaNews]' + '缺少文章内容,无法构成文章,可能已被修改格式,本文停止爬取')
                    return

                # get the comment count
                try:
                    channel = re.findall(r"channel: '(.*)',", html['html'])[0]
                    newsid = re.findall(r"newsid: '(.*)',", html['html'])[0]
                    data = {
                        'format': 'js',
                        'channel': channel,
                        'newsid': newsid,
                        'group': '',
                        'compress': '1',
                        'ie': 'gbk',
                        'oe': 'gbk',
                        'page': '1',
                        'page_size': '20'
                    }
                    re_url = 'http://comment5.news.sina.com.cn/page/info'
                    html1 = self.session.download(re_url, encoding='utf-8', data=data, isJson=False, timeout=10, retry=3, addr=False)
                    html1 = re.sub(r'(.*=)\{', '{', html1)
                    html1 = json.loads(html1)
                    totalcount = html1['result']['count']['show']
                    Treply = totalcount
                except:
                    Treply = None

                # get the article id
                articleid = re.findall(r'([a-z]{8}\d{7})', article_url)[0]

                article = Article(tid=articleid, channel_id=self.channel.channel_id, title=Ttitle, publish_datetime=Ttime, content=Tcontent, url=article_url, author_name=Tauthor)
                article.statistics.reply_count = Treply
                return article
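
A standalone sketch of the comment-count unwrapping used in each branch above; the raw payload is a made-up miniature of the jsonp-style response body:

import re
import json

raw = 'var data={"result": {"count": {"show": 42}}}'  # made-up miniature payload
cleaned = re.sub(r'(.*=)\{', '{', raw)  # strip the leading assignment, keeping the JSON object
totalcount = json.loads(cleaned)['result']['count']['show']  # -> 42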
Example #15
    def crawlArticle(self, url):
        '''
        Crawl the article content and statistics for the given url
        @return: an Article instance
        '''
        cookies = 'Province=020; City=020; usertrack=c+5+hljsm+B+cg5MA7YDAg==; vjuids=-7517fab.15b5c40e631.0.a042d54907b81; _ntes_nnid=5e90ea8f4ef321150e3b5d43f68870c8,1491901408828; _ntes_nuid=5e90ea8f4ef321150e3b5d43f68870c8; UM_distinctid=15b5c41b7836eb-0fd2f7e510ef22-4e45042e-100200-15b5c41b78461b; __gads=ID=18c804c9f3ead780:T=1491901995:S=ALNI_MYWNxLkcHVgXyExP9eeFcD-mj7SiQ; afpCT=1; CNZZDATA1256734798=337963631-1491900970-http%253A%252F%252Fnews.163.com%252F%7C1492767097; CNZZDATA1256336326=1559830613-1491900088-http%253A%252F%252Fnews.163.com%252F%7C1492765460; vjlast=1491901409.1492754596.11; ne_analysis_trace_id=1492768109053; vinfo_n_f_l_n3=09c375e3d4394d15.1.13.1491901408836.1492766182939.1492768266676; s_n_f_l_n3=09c375e3d4394d151492768109056'
        html = self.session.download(url,
                                     encoding='gbk',
                                     data=None,
                                     timeout=10,
                                     retry=3,
                                     addr=True,
                                     cookies=cookies)
        if html:
            article_url = html['url']
            article_url = re.findall(
                r'.*?\.html|.*?\.htm|.*?\.shtml|.*?\.shtm', article_url)[0]
            self.logger.info(article_url)
            soup = BeautifulSoup(html['html'], 'html.parser')
            main = soup.find('div', attrs={'class': "post_content_main"})
            main1 = soup.find('div', attrs={'class': "ep-content-main"})

            # first page layout
            if main is not None:

                # get the title
                Ttitle = main.find('h1')
                if Ttitle is None:
                    self.logger.error('[NeteaseNews]' +
                                      '缺少标题,无法构成文章,可能已被修改格式,本文停止爬取')
                    return
                else:
                    Ttitle = Ttitle.text.strip()

                #获取发布时间
                Ttime = main.find('div', attrs={'class': "post_time_source"})
                if Ttime is not None:
                    Ttime = Ttime.text.strip()
                    Ttime = re.findall(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}',
                                       Ttime)[0]
                else:
                    self.logger.error('[NeteaseNews]' +
                                      '缺少文章发布时间,无法构成文章,可能已被修改格式,本文停止爬取')
                    return

                if len(Ttime) == 16:
                    Ttime = Ttime + ':00'

                #获取发布作者
                Tauthor = main.find('div', attrs={'class': "post_time_source"})
                if Tauthor is not None:
                    Tauthor = Tauthor.find('a')
                    if Tauthor is not None:
                        Tauthor = Tauthor.text.strip()
                    else:
                        Tauthor = None

                #获取发布内容
                Tcontent = main.find('div', attrs={'class': "post_text"})
                if Tcontent is not None:
                    Tcontent = Tcontent.text.strip()
                    Tcontent = re.sub(r'\n|\t', '', Tcontent)
                    dr = re.compile(r'<[^>]+>', re.S)
                    Tcontent = dr.sub('', Tcontent)
                else:
                    self.logger.error('[NeteaseNews]' +
                                      '缺少文章内容,无法构成文章,可能已被修改格式,本文停止爬取')
                    return

                # 获取评论数
                articleid = ""
                try:
                    articleid = re.findall(r'"docId" : "(.*)",',
                                           html['html'])[0]
                    re_url = 'http://sdk.comment.163.com/api/v1/products/a2869674571f77b5a0867c3d71db5856/threads/' + articleid
                    html1 = self.session.download(url=re_url,
                                                  encoding='utf-8',
                                                  data=None,
                                                  isJson=True,
                                                  timeout=10,
                                                  retry=3)
                    Treply = html1["tcount"]
                except:
                    Treply = None
                    self.logger.error('[NeteaseComment]url:%s, tid:%s, %s',
                                      article_url, articleid,
                                      traceback.format_exc())
                finally:
                    article = Article(tid=articleid,
                                      channel_id=self.channel.channel_id,
                                      title=Ttitle,
                                      content=Tcontent,
                                      publish_datetime=Ttime,
                                      url=article_url,
                                      author_name=Tauthor)
                    article.statistics.reply_count = Treply
                    return article

            #第二种网页格式
            elif main1 is not None:

                #标题
                Ttitle = main1.find('h1')
                if Ttitle is None:
                    self.logger.error('[NeteaseNews]' +
                                      '缺少标题,无法构成文章,可能已被修改格式,本文停止爬取')
                    return
                else:
                    Ttitle = Ttitle.text.strip()

                #发布的时间
                Ttime = main1.find('div',
                                   attrs={'class': "ep-time-source cDGray"})
                Ttime1 = main1.find('div', attrs={'class': "ep-info cDGray"})
                if Ttime is not None:
                    Ttime = Ttime.text.strip()
                    Ttime = re.findall(
                        r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}', Ttime)[0]
                elif Ttime1 is not None:
                    Ttime = Ttime1.text.strip()
                    Ttime = re.findall(
                        r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}', Ttime)[0]
                else:
                    self.logger.error('[NeteaseNews]' +
                                      '缺少文章发布时间,无法构成文章,可能已被修改格式,本文停止爬取')
                    return
                if len(Ttime) == 16:
                    Ttime = Ttime + ':00'

                #获取作者信息
                Tauthor = main1.find('div',
                                     attrs={'class': "ep-time-soure cDGray"})
                Tauthor1 = main1.find('div',
                                      attrs={'class': "ep-source cDGray"})
                if Tauthor is not None:
                    Tauthor = Tauthor.find('a')
                    if Tauthor is not None:
                        Tauthor = Tauthor.text.strip()
                    else:
                        Tauthor = None
                elif Tauthor1 is not None:
                    Tauthor = Tauthor1.find('span')
                    if Tauthor is not None:
                        Tauthor = Tauthor.text.strip()
                        self.logger.debug(Tauthor)
                        Tauthor = re.findall(r'来源:(.*)"', Tauthor)[0]
                    else:
                        Tauthor = None
                else:
                    Tauthor = None

                #获取内容
                Tcontent = main1.find('div', attrs={'id': "endText"})
                if Tcontent is not None:
                    Tcontent = Tcontent.text.strip()
                    Tcontent = re.sub(r'\n|\t', '', Tcontent)
                    dr = re.compile(r'<[^>]+>', re.S)
                    Tcontent = dr.sub('', Tcontent)
                else:
                    self.logger.error('[NeteaseNews]' +
                                      '缺少文章内容,无法构成文章,可能已被修改格式,本文停止爬取')
                    return

                #获取评论数
                try:
                    articleid = re.findall(r'"docId" : "(.*)",',
                                           html['html'])[0]
                    re_url = 'http://sdk.comment.163.com/api/v1/products/a2869674571f77b5a0867c3d71db5856/threads/' + articleid
                    html1 = self.session.download(re_url,
                                                  encoding='utf-8',
                                                  data=None,
                                                  isJson=True,
                                                  timeout=10,
                                                  retry=3)
                    Treply = html1['tcount']
                except:
                    Treply = None
                    self.logger.error('[NeteaseComment]url:%s, tid:%s, %s',
                                      article_url, articleid,
                                      traceback.format_exc())
                finally:
                    article = Article(tid=articleid,
                                      channel_id=self.channel.channel_id,
                                      title=Ttitle,
                                      content=Tcontent,
                                      publish_datetime=Ttime,
                                      url=article_url,
                                      author_name=Tauthor)
                    self.logger.debug("[crawlArticle]" + article.tid)
                    article.statistics.reply_count = Treply
                    return article
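
The comment lookup in both branches above boils down to one GET against the sdk.comment.163.com threads resource; a hedged sketch with plain requests (the product key is copied from the snippet, everything else is illustrative):

import requests

NETEASE_PRODUCT = 'a2869674571f77b5a0867c3d71db5856'

def fetch_netease_comment_count(doc_id):
    # doc_id is the "docId" value scraped from the article page; the threads
    # resource returns JSON whose 'tcount' field becomes reply_count.
    url = ('http://sdk.comment.163.com/api/v1/products/%s/threads/%s'
           % (NETEASE_PRODUCT, doc_id))
    return requests.get(url, timeout=10).json().get('tcount')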
Пример #16
0
    def crawlArticle(self, url):
        '''
        根据url爬取文章内容和统计信息
        :return:返回一个article实例
        '''
        article_id = re.findall(r'\d+', url)[0]

        html = self.session.get(url=url, headers=CRAWL_ARTICLE_HEADERS)

        soup = BeautifulSoup(html.text, 'lxml')

        if (url.find('zhuanlan') != -1):
            main = soup.find('textarea', attrs={'id': "preloadedState"})
            try:
                if main is None:
                    return main
                sub = re.findall(r'new Date\(\".*\"\)', main.text)[0]
                sub = re.findall(r'".*"', sub)[0]
                maintext = re.subn(r'new Date\(\".*\"\)', sub, main.text)[0]
                try:
                    articleJson = json.loads(
                        maintext)["database"]["Post"][article_id]
                except:
                    return None
                #获取标题
                title = articleJson["title"]
                self.logger.info(title)
                authorName = articleJson["author"]
                contentSoup = BeautifulSoup(articleJson["content"])
                content = contentSoup.text
                commentCount = articleJson["commentCount"]
                collapsedCount = articleJson["collapsedCount"]
                likeCount = articleJson["likeCount"]
                publishedTime = articleJson["publishedTime"][0:19]
                timeArray = time.strptime(publishedTime, "%Y-%m-%dT%H:%M:%S")
                publishedTime = time.strftime("%Y-%m-%d %H:%M:%S", timeArray)
                article = Article(article_id, self.channel.channel_id, title,
                                  content, publishedTime, url, None,
                                  authorName)
                article.statistics.reply_count = commentCount
                article.statistics.like_count = likeCount
                article.statistics.collect_count = collapsedCount
                self.session.close()
                return article
            except:
                return None
        if (url.find('question') != -1):
            # 获取标题
            try:  # 针对404页面
                main = soup.find('div', attrs={
                    'id': "data"
                }).attrs['data-state']
                articleJson = json.loads(main)
                questionJson = articleJson['entities']['questions'][article_id]
                title = questionJson['title']
                self.logger.info(title)
                contentSoup = BeautifulSoup(questionJson['editableDetail'],
                                            'lxml')
                content = contentSoup.text
                author_id = questionJson['author']['id']
                author_name = questionJson['author']['name']
                createTimeInFloat = questionJson['created']
                publish_datetime = time.strftime(
                    '%Y-%m-%d %H:%M:%S', time.localtime(createTimeInFloat))
                reply_count = questionJson['commentCount']
                read_count = questionJson['visitCount']
                collect_count = questionJson['followerCount']
                article = Article(article_id, self.channel.channel_id, title,
                                  content, publish_datetime, url, author_id,
                                  author_name)
                article.statistics.reply_count = reply_count
                article.statistics.read_count = read_count
                article.statistics.collect_count = collect_count
                return article
            except:
                return None
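
For the zhuanlan branch, the preloaded state is JSON-like text with JavaScript Date literals mixed in; a small sketch of the cleanup and timestamp handling, assuming the 'database'/'Post' layout shown above (the example timestamp format is inferred from the strptime pattern in the snippet):

import json
import re
import time

def parse_zhuanlan_state(preloaded_text, article_id):
    # Collapse every new Date("...") to its quoted argument so the blob becomes
    # valid JSON, the same idea as the re.findall/re.subn pair above.
    cleaned = re.sub(r'new Date\((\"[^\"]*\")\)', r'\1', preloaded_text)
    post = json.loads(cleaned)['database']['Post'][article_id]
    # publishedTime looks like "2017-10-11T08:30:15+08:00"; the first 19
    # characters carry the timestamp used for publish_datetime.
    parsed = time.strptime(post['publishedTime'][:19], '%Y-%m-%dT%H:%M:%S')
    return post['title'], time.strftime('%Y-%m-%d %H:%M:%S', parsed)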
Пример #17
0
    def crawlArticle(self, url):
        '''
        根据url爬取文章内容和统计信息
        @return: 返回一个Article实例
        '''
        html = self.session.download(url,
                                     encoding='utf-8',
                                     data=None,
                                     timeout=10,
                                     retry=3,
                                     addr=True)

        if html:
            article_url = html['url']
            if article_url.find(self.channel.url) < 0:
                self.logger.warn('Unrelated url found:%s', url)
                return None
            article_url = re.findall(
                r'.*?\.html|.*?\.htm|.*?\.shtml|.*?\.shtm', article_url)[0]
            self.logger.debug('[iFengnews]' + article_url)
            soup = BeautifulSoup(html['html'], 'lxml')  # 'html.parser' 解析器
            main = soup.find('div', attrs={'class': "main"})
            main1 = soup.find('div', attrs={'class': "yc_main"})

            if main is not None:
                self.logger.debug(u'走第一种格式')
                Ttitle = main.find('h1')  # 标题
                if Ttitle is None:
                    self.logger.error(u'缺少标题,无法构成文章,可能已被修改格式,本文停止爬取::该网站为 %s ',
                                      url)
                    return
                else:
                    Ttitle = Ttitle.text.strip()
                    Tpublishtime = main.find('span', attrs={
                        'class': "ss01"
                    }).text.strip()
                    if Tpublishtime is None:
                        self.logger.error(
                            u'缺少文章发布时间,无法构成文章,可能已被修改格式,本文停止爬取::该网站为 %s ', url)
                        return
                    else:
                        Tpublishtime = Tpublishtime.replace(u'年', '-').replace(
                            u'月', '-').replace(u'日', '')

                        Tauthor = main.find('a', attrs={'target': "_blank"})
                        if Tauthor is not None:
                            Tauthor = Tauthor.text.strip()
                        else:
                            Tauthor = 'None'
                        Tcontent = main.find('div',
                                             attrs={'id': "main_content"})
                        # print Tcontent
                        # Tcontent = Tcontent.find('p')
                        if Tcontent is not None:
                            Tcontent = Tcontent.text.strip()
                            Tcontent = re.sub(r'\n|\t', '', Tcontent)
                        else:
                            self.logger.error(
                                u'缺少文章内容,无法构成文章,可能已被修改格式,本文停止爬取::该网站为 %s ',
                                url)
                            return

                        doc_url = re.findall(r'"commentUrl":"(.*)",',
                                             html['html'])
                        if doc_url:
                            doc_url = doc_url[0]
                        else:
                            doc_url = url

                        data1 = {
                            'callback': 'newCommentListCallBack',
                            # 'doc_url': 'http://gd.ifeng.com/a/20171010/6053241_0.shtml',
                            'doc_url': doc_url,
                            'job': '1'
                        }
                        re_url = 'http://comment.ifeng.com/get.php'
                        html1 = self.session.download(re_url,
                                                      encoding='gbk',
                                                      data=data1,
                                                      timeout=10,
                                                      retry=3,
                                                      addr=False,
                                                      isJson=True)
                        # html1 = json.loads( html1[html1.find('=') + 1:html1.rfind(';c')] )

                        Treply = html1['count']
                        if len(html1['comments']) != 0:
                            articleid = html1['comments'][0]['article_id']
                        else:
                            articleid = article_url
                            articleid = articleid[articleid.find('a/') + 2:
                                                  -6]  # 由于数据库字段长度有限,所以截取部分作为ID
                            self.logger.warn(u'地址 %s 没有评论因此以url地址部分字段作为tid',
                                             article_url)

                        meta_info = doc_url

                        article = Article(articleid,
                                          self.channel.channel_id,
                                          Ttitle,
                                          Tcontent,
                                          Tpublishtime,
                                          article_url,
                                          None,
                                          Tauthor,
                                          meta_info=meta_info)
                        article.statistics.reply_count = Treply

                        self.logger.info(article)
                        return article

            ##对第二种格式的爬取
            if main1 is not None:
                self.logger.debug(u'走第二种格式')
                Ttitle = main1.find('h1')
                if Ttitle is None:
                    self.logger.error(u'缺少标题,无法构成文章,可能已被修改格式,本文停止爬取::该网站为 %s ',
                                      url)
                    return
                else:
                    Ttitle = Ttitle.text.strip()
                    Tpublishtime = main1.find('span').text.strip()
                    if Tpublishtime is None:
                        self.logger.error(
                            u'缺少文章发布时间,无法构成文章,可能已被修改格式,本文停止爬取::该网站为 %s ', url)
                        return

                # return Tpublishtime
                Tauthor = main1.find('a', attrs={'target': "_blank"})
                if Tauthor is not None:
                    Tauthor = Tauthor.text.strip()
                else:
                    Tauthor = 'None'
                Tcontent = main1.find('div', attrs={'class': "yc_con_txt"})

                if Tcontent is not None:
                    Tcontent = Tcontent.text.strip()
                    Tcontent = re.sub(r'\n|\t', '', Tcontent)
                else:
                    self.logger.warn(
                        u'缺少文章内容,无法构成文章,可能已被修改格式,本文停止爬取::该网站为 %s ', url)
                    return

                doc_url = re.findall(r'"commentUrl":"(.*)",', html['html'])
                if doc_url:
                    doc_url = doc_url[0]
                else:
                    doc_url = url

                data1 = {
                    'callback': 'newCommentListCallBack',
                    # 'doc_url': 'http://gd.ifeng.com/a/20171010/6053241_0.shtml',
                    'doc_url': doc_url,
                    'job': '1'
                }
                re_url = 'http://comment.ifeng.com/get.php'
                html1 = self.session.download(re_url,
                                              encoding='gbk',
                                              data=data1,
                                              timeout=10,
                                              retry=3,
                                              addr=False,
                                              isJson=True)
                # html1 = json.loads( html1[html1.find('=') + 1:html1.rfind(';c')] )
                try:
                    Treply = html1['count']
                except:
                    Treply = None

                if len(html1['comments']) != 0:
                    articleid = html1['comments'][0]['article_id']
                else:
                    articleid = url.strip()
                    articleid = articleid[articleid.find('a/') +
                                          2:-6]  # 由于数据库字段长度有限,所以截取部分作为ID
                    self.logger.warn(u'地址 %s 没有评论因此以url地址部分字段作为tid',
                                     article_url)

                meta_info = doc_url
                article = Article(articleid,
                                  self.channel.channel_id,
                                  Ttitle,
                                  Tcontent,
                                  Tpublishtime,
                                  article_url,
                                  None,
                                  Tauthor,
                                  meta_info=meta_info)
                article.statistics.reply_count = Treply
                self.logger.info(article)
                return article

            if (main is None) and (main1 is None):
                self.logger.warn(u"存在另外一种html格式::%s", url)
                return
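
Both branches above fetch the comment count from comment.ifeng.com/get.php with the article's commentUrl; a hedged sketch with plain requests (the parameters come from the snippet, the JSONP stripping is an assumption for deployments that wrap the response, as the commented-out json.loads suggests):

import json
import re
import requests

def fetch_ifeng_comment_info(doc_url):
    params = {'callback': 'newCommentListCallBack', 'doc_url': doc_url, 'job': '1'}
    raw = requests.get('http://comment.ifeng.com/get.php',
                       params=params, timeout=10).text
    # Cut the outermost {...} out of a possible callback wrapper before decoding.
    match = re.search(r'(\{.*\})', raw, re.S)
    data = json.loads(match.group(1)) if match else {}
    comments = data.get('comments') or []
    article_id = comments[0]['article_id'] if comments else None
    return data.get('count'), article_id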
Пример #18
0
    def __parseSearchPage(self, data):
        '''
        @return: (articleList,hasnext)
        '''
        articleList = list()
        hasnext = False
        soup = BeautifulSoup(data, "lxml")
        self.logger.info(soup)
        # check if no result
        noResultDivList = soup.findAll('div', {'class': 'pl_noresult'})
        if len(noResultDivList) > 0:
            hasnext = False
            self.logger.info('No result')
            return (articleList, hasnext)

        # find page bar to check if more

        pageDivList = soup.findAll('div', {'class': 'W_pages'})
        if len(pageDivList) > 0:
            pageDiv = pageDivList[0]
            if len(pageDiv.findAll('a',
                                   {'class': 'page next S_txt1 S_line1'})) > 0:
                hasnext = True
        if hasnext is False:
            self.logger.info('The last page')

        root_1 = soup.findAll('div', {"action-type": "feed_list_item"})
        # self.logger.debug(root_1)
        for r in root_1:
            root_2 = r.find('div', {'class': "content clearfix"})
            mid = r.attrs['mid']
            article_url = root_2.find('div', {
                'class': "feed_from W_textb"
            }).findNext('a').attrs['href']
            self.logger.debug('1  %s', article_url)
            if not article_url.startswith('http:'):
                article_url = 'http:' + article_url
            # self.logger.debug(article_url)
            root_content = root_2.find('p', {'class': "comment_txt"})

            long_content = root_content.find('a', {'action-type': "fl_unfold"})
            try:
                link_content = root_content.find('a').attrs['href']
                link_content = '  原文链接: ' + link_content
            except:
                link_content = ''
            if long_content:
                content_url = 'http://s.weibo.com/ajax/direct/morethan140?' + long_content.attrs[
                    'action-data']
                self.session.randomSleep()
                response = self.session.get(content_url, textRspOnly=False)
                try:
                    content_html = response.json()['data']['html']
                    content = BeautifulSoup(
                        content_html, 'html.parser').text.strip().replace(
                            "'", "''").replace("%", "\%").replace(":", "\:")
                except Exception, e:
                    self.logger.debug('Exception: %s' % e)
                    continue
            else:
                content = root_content.text.strip().replace("'", "''").replace(
                    "%", "\%").replace(":", "\:")
                content = content + link_content
                # self.logger.error(content)

            title = content[:30].replace("'", "''").replace("%", "\%").replace(
                ":", "\:") + ' '
            author_id = r.attrs['tbinfo']
            author_id = re.findall(r'ouid=(\d+)', author_id)[0]
            author_name = root_2.find('a').attrs['nick-name']
            publish_datetime = root_2.find('a', {
                'class': "W_textb"
            }).attrs['date']

            try:
                publish_datetime = time.strftime(
                    '%Y-%m-%d %H:%M:%S',
                    time.localtime(float(publish_datetime) / 1000))
            except:
                continue
            article = Article(mid,
                              self.channel.channel_id,
                              title,
                              content,
                              publish_datetime,
                              url=article_url,
                              author_id=author_id,
                              author_name=author_name)

            # fetch statistics
            root_3 = r.find('div', {'class': "feed_action clearfix"})
            soup_li_list = root_3.findAll('li')
            self.__parseStatistics(article, soup_li_list)
            # print mid, article_url, add_datetime, channeltype, channel, title, content, author_id, author_name, \
            #     publish_datetime, reply_count, read_count, like_count, collect_count, forward_count
            if article not in articleList:
                articleList.append(article)
        return (articleList, hasnext)
Пример #19
0
    def crawlNewArticle(self, url):
        '''
        爬取url前缀为new.qq.com的文章
        :param url:
        :return:
        '''
        html = self.session.download(url,
                                     encoding='gbk',
                                     data=None,
                                     timeout=10,
                                     retry=3,
                                     addr=True)
        if html:
            article_url = html['url']
            if article_url.find('new.qq.com/omn') < 0:
                self.logger.warn('Unrelated url found:%s', url)
                return

            article_url = re.findall(
                r'.*?\.html|.*?\.htm|.*?\.shtml|.*?\.shtm', article_url)[0]
            self.logger.debug('[TencentNew]' + article_url)
            soup = BeautifulSoup(html['html'], 'html.parser')

            script_tags = soup.head.find_all('script')
            data = dict()
            for tag in script_tags:
                text = re.search(r'window.DATA = (.*)', tag.text, re.S)
                if text:
                    data = json.loads(text.group(1))

            tid = data['article_id']
            title = data['title']
            author_name = data['media']
            author_id = data['media_id']
            publish_datetime = data['pubtime']
            comment_id = data['comment_id']

            main = soup.find('div', attrs={'class': 'qq_conent clearfix'})
            t_content = ''
            if main is not None:
                contents = main.find_all('p', {'class': 'one-p'})
                for content in contents:
                    if content.string is None:
                        continue
                    t_content += str(content.get_text().strip())

            get_comment_count_url = 'https://coral.qq.com/article/%s/commentnum?callback=_article%scommentnum' % (
                comment_id, comment_id)
            comment_data = self.session.download(get_comment_count_url)
            comment_data = re.search(
                r'_article%scommentnum\((.*)\)' % comment_id, comment_data)

            comment_dict = eval(comment_data.group(1))
            reply_count = comment_dict['data']['commentnum']
            meta_info = '{"commentid":"%s"}' % comment_id

            article = Article(tid=tid,
                              channel_id=self.channel.channel_id,
                              title=title,
                              content=t_content,
                              publish_datetime=publish_datetime,
                              url=article_url,
                              author_id=author_id,
                              author_name=author_name,
                              meta_info=meta_info)
            article.statistics.reply_count = reply_count
            return article
        return None
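
The coral.qq.com commentnum call above returns the JSON wrapped in the requested callback; a short sketch of the same lookup with plain requests, using json.loads instead of the eval() in the snippet:

import json
import re
import requests

def fetch_tencent_comment_count(comment_id):
    url = ('https://coral.qq.com/article/%s/commentnum?callback=_article%scommentnum'
           % (comment_id, comment_id))
    raw = requests.get(url, timeout=10).text
    # The body arrives as _article<id>commentnum({...}); pull out the JSON part.
    body = re.search(r'\((\{.*\})\)', raw, re.S)
    return json.loads(body.group(1))['data']['commentnum'] if body else None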
Пример #20
0
    def crawlNewsArticle(self, url):
        '''
        爬取url前缀为news.qq.com和gd.qq.com的文章
        :param url:
        :return:
        '''
        html = self.session.download(url,
                                     encoding='gbk',
                                     data=None,
                                     timeout=10,
                                     retry=3,
                                     addr=True)
        if html:
            article_url = html['url']
            if article_url.find('news.qq.com') < 0 and article_url.find(
                    'gd.qq.com') < 0:
                self.logger.warn('Unrelated url found:%s', url)
                return None
            article_url = re.findall(
                r'.*?\.html|.*?\.htm|.*?\.shtml|.*?\.shtm', article_url)[0]
            self.logger.debug('[TencentNews]' + article_url)
            soup = BeautifulSoup(html['html'], 'html.parser')
            main = soup.find('div', attrs={'id': "Main-Article-QQ"})
            main1 = soup.find('div', attrs={'id': "Main-P-QQ"})
            if main is not None:
                Ttitle = main.find('h1').text.strip()  #标题
                Ttime = main.find('span', attrs={'class':
                                                 "article-time"})  #发布时间
                Ttime1 = main.find('span', attrs={'class': "a_time"})
                Ttime2 = main.find('span', attrs={'class': "pubTime"})
                if Ttime is not None:
                    Ttime = Ttime.text.strip()
                elif Ttime1 is not None:
                    Ttime1 = Ttime1.text.strip()
                    Ttime = Ttime1
                elif Ttime2 is not None:
                    Ttime2 = Ttime2.text.strip()
                    Ttime = Ttime2
                else:
                    Ttime = datetime.datetime.now().strftime(
                        '%Y-%m-%d %H:%M:%S')
                if len(Ttime) == 16:
                    Ttime = Ttime + ':00'

                Tauthor = main.find('span', attrs={'class': "a_source"})
                Tauthor1 = main.find('span', attrs={'class': "color-a-1"})
                if Tauthor is not None:
                    #Tauthor = Tauthor.find('a').text.strip()
                    Tauthor = Tauthor.text.strip()
                elif Tauthor1 is not None:
                    #Tauthor1 = Tauthor1.find('a').text.strip()
                    Tauthor1 = Tauthor1.text.strip()
                    Tauthor = Tauthor1
                else:
                    Tauthor = None
                Tcontent = main.find('div',
                                     attrs={'id': "Cnt-Main-Article-QQ"})
                if Tcontent is not None:
                    Tcontent = Tcontent.text.strip()
                    Tcontent = re.sub(r'\n|\t', '', Tcontent)
                else:
                    Tcontent = None
                articleid = re.findall(r'id:\'(\d+)\',', html['html'])[0]
                try:
                    commentid = re.findall(r'cmt_id = (\d+);', html['html'])[0]
                    meta_info = '{"commentid":"%s"}' % commentid
                except:
                    commentid = None
                    meta_info = None
                article = Article(articleid,
                                  self.channel.channel_id,
                                  Ttitle,
                                  Tcontent,
                                  Ttime,
                                  article_url,
                                  None,
                                  Tauthor,
                                  meta_info=meta_info)
                if commentid is not None:
                    try:
                        re_url = 'http://coral.qq.com/article/' + commentid + '/commentnum'
                        html1 = json.loads(
                            self.session.download(re_url,
                                                  encoding='utf-8',
                                                  data=None,
                                                  timeout=10,
                                                  retry=3))
                        Treply = int(html1['data']['commentnum'])
                    except Exception:
                        traceInfo = traceback.format_exc()
                        self.logger.error(
                            'Failed to parse comment for %s (cid=%s):%s',
                            articleid, commentid, traceInfo)
                        Treply = None
                    article.statistics.reply_count = Treply
                return article
            elif main1 is not None:
                Ttitle = soup.find('meta', attrs={
                    'name': "Description"
                }).attrs['content']  # 标题
                Ttime = re.findall(
                    r"pubtime\D+(\d{4})\D(\d{2})\D(\d{2})\D(\d{2}:\d{2})\',",
                    html['html'])
                if Ttime:
                    Ttime = Ttime[0]
                    Ttime = Ttime[0] + '-' + Ttime[1] + '-' + Ttime[
                        2] + ' ' + Ttime[3]
                else:
                    Ttime = datetime.datetime.now().strftime(
                        '%Y-%m-%d %H:%M:%S')
                if len(Ttime) == 16:
                    Ttime = Ttime + ':00'
                Tauthor = re.findall(r'para = {\s+name: \"(.*)\",',
                                     html['html'])
                if Tauthor:
                    Tauthor = Tauthor[0]
                else:
                    Tauthor = None
                con_url = re.sub(r'\.htm\?.*', '.hdBigPic.js', article_url)
                con_html = self.session.download(con_url,
                                                 encoding='gbk',
                                                 data=None,
                                                 timeout=10,
                                                 retry=3)
                con_list = re.findall(r'<p>(.*?)</p>', con_html)
                if con_list:
                    TT = []
                    for i in con_list:
                        if i.strip() not in TT:
                            TT.append(i)
                    Tcontent = ''.join(TT)
                else:
                    Tcontent = None
                articleid = re.findall(r'id:\'(\d+)\',', html['html'])[0]
                try:
                    commentid = re.findall(r'aid\D+(\d+)\",', html['html'])[0]
                    meta_info = '{"commentid":"%s"}' % commentid
                except:
                    commentid = None
                    meta_info = None
                article = Article(articleid,
                                  self.channel.channel_id,
                                  Ttitle,
                                  Tcontent,
                                  Ttime,
                                  article_url,
                                  None,
                                  Tauthor,
                                  meta_info=meta_info)
                try:
                    if commentid is not None:
                        re_url = 'http://coral.qq.com/article/batchcommentnum'
                        data1 = {'targetid': articleid}
                        html1 = json.loads(
                            self.session.download(re_url,
                                                  encoding='utf-8',
                                                  data=data1,
                                                  timeout=10,
                                                  retry=3))
                        Treply = int(html1['data'][0]['commentnum'])
                    else:
                        Treply = None
                except:
                    Treply = None
                article.statistics.reply_count = Treply
                return article
        return None
Пример #21
0
    def crawlArticle(self, url):
        '''
        根据url爬取文章内容和统计信息
        @return: 返回一个Article实例
        '''
        cookies = '_HY_lvt_c5928ed57343443aa175434a09ea2804=1492582419784; _HY_CTK_c5928ed57343443aa175434a09ea2804=0acc9a79b115c4ca1931c41e303bec28; BAIDU_SSP_lcr=https://www.baidu.com/link?url=NszYD2w_HgkPWqrzDQ3WKApYldw_9MpUVun9r-R09M7r0dh09MUwTHzG087WaJrhBwMCY-7pDfds4xjtWArRf2xh01DHOWWWd9DpBnHwZ03&wd=&eqid=84d5edfe0003dbdd0000000658f6c280; ALLYESID4=0DB901C6E627D980; sso_c=0; wdcid=5838509dcecc0a53; _people_ip_new_code=510000; UM_distinctid=15b802cdbd364d-02e7f218c26ae6-4e45042e-100200-15b802cdbd49ce; wdses=62d3f0f698d07532; sfr=1; CNZZDATA1260954200=1457761991-1492499618-null%7C1492580619; CNZZDATA1260954203=33096124-1492503288-null%7C1492578888; CNZZDATA1256327855=1768205365-1492503342-null%7C1492578947; wdlast=1492582420'
        html = self.session.download(url,
                                     data=None,
                                     isJson=False,
                                     timeout=10,
                                     retry=3,
                                     addr=True,
                                     cookies=cookies)

        soup = BeautifulSoup(html['html'], 'html.parser')  # 'html.parser' 解析器
        try:
            meta = soup.find('meta').attrs['content']
        except:
            self.logger.warn(u'找不到meta里的content')
            return
        # self.logger.error('%s',meta)

        if "GB2312" in meta:
            encoding1 = 'GB2312'
        elif "UTF-8" in meta:
            encoding1 = 'UTF-8'
        elif "utf-8" in meta:
            encoding1 = 'utf-8'
        else:
            encoding1 = 'gbk'

        html = self.session.download(url,
                                     encoding=encoding1,
                                     data=None,
                                     isJson=False,
                                     timeout=10,
                                     retry=3,
                                     addr=True,
                                     cookies=cookies)
        # 不同网页编码格式让其重新下载一遍
        soup = BeautifulSoup(html['html'], 'html.parser')  # 'html.parser' 解析器
        main = soup.find('body')

        if html:
            article_url = html['url']
            # self.logger.debug(article_url)
            if article_url.find(self.channel.url) < 0:
                self.logger.warn('Unrelated url found:%s', url)
                return None

            # if '.html' not in article_url:
            #     self.logger.error(u'非文章类型网址:%s ',article_url)
            #     return

            try:
                article_url = re.findall(
                    r'.*?\.html|.*?\.htm|.*?\.shtml|.*?\.shtm', article_url)[0]
            except:
                self.logger.error(u'网址后缀不符合:%s ', article_url)
                return

            self.logger.debug('[peoplenews]' + article_url)
            articleid = article_url
            articleid = articleid[articleid.find('cn/') +
                                  3:-5]  # 由于数据库字段长度有限,所以截取部分作为ID
            self.logger.warn(u'地址 %s 以url地址部分字段作为tid', article_url)

            if 'bbs1' not in article_url:

                main1 = soup.find(
                    'div', attrs={'class': "i_cont"}
                )  #http://health.people.com.cn/n1/2017/1011/c14739-29579836.html
                main2 = soup.find(
                    'div', attrs={'class': "text_c"}
                )  #http://rencai.people.com.cn/n/2014/0721/c244800-25311391.html

                if (main1 is None) and (main2 is None):
                    self.logger.debug(u'走main')
                    try:
                        Ttitle = main.find('h1').text.strip()  # 标题
                    except:
                        self.logger.error(u'Ttitle存在走了main部分却不满足其他格式的的url::%s',
                                          article_url)
                        return

                    if Ttitle is None:
                        self.logger.error(
                            u'缺少标题,无法构成文章,可能已被修改格式,本文停止爬取::该网站为 %s ',
                            article_url)
                        return
                    else:
                        Ttitle_crawl = soup.find('div',
                                                 attrs={'class':
                                                        "box01"})  #对应一种格式
                        if Ttitle_crawl is None:
                            self.logger.error(
                                u'Ttitle_crawl存在走了main部分却不满足其他格式的的url::%s',
                                article_url)
                            return
                        try:
                            Tpublishtime = Ttitle_crawl.find('div',
                                                             attrs={
                                                                 'class': "fl"
                                                             }).text.strip()
                        except:
                            self.logger.error(u'main中发布时间不匹配')
                            return

                        if Tpublishtime is None:
                            self.logger.error(
                                u'缺少文章发布时间,无法构成文章,可能已被修改格式,本文停止爬取::该网站为 %s ',
                                article_url)
                            return
                        else:
                            # self.logger.error(Tpublishtime)
                            Tpublishtime = re.findall(
                                r'(\d{4})\D(\d{2})\D(\d{2})\D(\d{2}:\d{2}(:\d{2})?)',
                                Tpublishtime)[0]

                            # Tpublishtime = Tpublishtime[:18]
                            if len(Tpublishtime[4]) > 1:
                                Tpublishtime = Tpublishtime[0] + '-' + Tpublishtime[1] + '-' + Tpublishtime[2] + ' ' + \
                                               Tpublishtime[3]
                            else:
                                Tpublishtime = Tpublishtime[0] + '-' + Tpublishtime[1] + '-' + Tpublishtime[2] + ' ' + \
                                               Tpublishtime[3] + ':00'

                            # Tpublishtime = Tpublishtime.replace(u'年', '-').replace(u'月', '-').replace(u'日', '')

                            Tauthor = Ttitle_crawl.find(
                                'a', attrs={'target': "_blank"})
                            if Tauthor is not None:
                                Tauthor = Tauthor.text.strip()
                            else:
                                Tauthor = 'None'
                            Tcontent = soup.find('div',
                                                 attrs={'class': "box_con"})
                            if Tcontent is not None:
                                Tcontent = Tcontent.text.strip()
                                Tcontent = re.sub(r'\n|\t', '', Tcontent)
                            else:
                                self.logger.error(
                                    u'缺少文章内容,无法构成文章,可能已被修改格式,本文停止爬取::该网站为 %s ',
                                    article_url)
                                return

                            Treply = None  #这种格式下没有这些统计可以获取
                            meta_info = None

                            article = Article(articleid,
                                              self.channel.channel_id,
                                              Ttitle,
                                              Tcontent,
                                              Tpublishtime,
                                              article_url,
                                              None,
                                              Tauthor,
                                              meta_info=meta_info)
                            article.statistics.reply_count = Treply
                            #self.logger.info(article)
                            return article

                elif (main1 is not None):
                    self.logger.debug(u'走main1')
                    Ttitle = main1.find('h2')  # 标题

                    if Ttitle is None:
                        self.logger.error(
                            u'缺少标题,无法构成文章,可能已被修改格式,本文停止爬取::该网站为 %s ',
                            article_url)
                        return
                    else:
                        Ttitle = Ttitle.text.strip()

                    try:
                        Tpublishtime = main1.find('div',
                                                  attrs={
                                                      'class': "artOri"
                                                  }).text.strip()
                    except:
                        self.logger.error(u'main1中发布时间不匹配')
                        return

                    if Tpublishtime is None:
                        self.logger.error(
                            u'缺少文章发布时间,无法构成文章,可能已被修改格式,本文停止爬取::该网站为 %s ',
                            article_url)
                        return
                    else:
                        Tpublishtime = re.findall(
                            r'(\d{4})\D(\d{2})\D(\d{2})\D(\d{2}:\d{2}(:\d{2})?)',
                            Tpublishtime)[0]
                        # self.logger.error(Tpublishtime)
                        # Tpublishtime = Tpublishtime[:18]
                        if len(Tpublishtime[4]) > 1:
                            Tpublishtime = Tpublishtime[0] + '-' + Tpublishtime[1] + '-' + Tpublishtime[2] + ' ' + \
                                           Tpublishtime[3]
                        else:
                            Tpublishtime = Tpublishtime[0] + '-' + Tpublishtime[1] + '-' + Tpublishtime[2] + ' ' + \
                                           Tpublishtime[3] + ':00'

                        Tauthor = main1.find('div', attrs={
                            'class': "artOri"
                        }).find('a', attrs={'target': "_blank"})
                        # self.logger.debug(u"作者:%s",Tauthor)
                        if Tauthor is not None:
                            Tauthor = Tauthor.text.strip()
                        else:
                            Tauthor = 'None'

                        Tcontent = main1.find('div', attrs={'class': "artDet"})
                        if Tcontent is not None:
                            Tcontent = Tcontent.text.strip()
                            Tcontent = re.sub(r'\n|\t', '', Tcontent)
                        else:
                            self.logger.error(
                                u'缺少文章内容,无法构成文章,可能已被修改格式,本文停止爬取::该网站为 %s ',
                                article_url)
                            return

                        Treply = None  # 这种格式下没有这些统计可以获取
                        meta_info = None

                        article = Article(articleid,
                                          self.channel.channel_id,
                                          Ttitle,
                                          Tcontent,
                                          Tpublishtime,
                                          article_url,
                                          None,
                                          Tauthor,
                                          meta_info=meta_info)
                        article.statistics.reply_count = Treply
                        self.logger.info(article)
                        return article

                elif (main2 is not None):
                    self.logger.debug(u'走main2')
                    Ttitle = main2.find('h2', attrs={'class': "one"})  # 标题

                    if Ttitle is None:
                        self.logger.error(
                            u'缺少标题,无法构成文章,可能已被修改格式,本文停止爬取::该网站为 %s ',
                            article_url)
                        return
                    else:
                        Ttitle = Ttitle.text.strip()
                    try:
                        Tpublishtime = main2.find('span',
                                                  attrs={
                                                      'id': "p_publishtime"
                                                  }).text.strip()
                    except:
                        self.logger.error(u'main2中发布时间不匹配')
                        return

                    if Tpublishtime is None:
                        self.logger.error(
                            u'缺少文章发布时间,无法构成文章,可能已被修改格式,本文停止爬取::该网站为 %s ',
                            article_url)
                        return
                    else:
                        Tpublishtime = re.findall(
                            r'(\d{4})\D(\d{2})\D(\d{2})\D(\d{2}:\d{2}(:\d{2})?)',
                            Tpublishtime)[0]
                        # self.logger.error(Tpublishtime)
                        # Tpublishtime = Tpublishtime[:18]
                        if len(Tpublishtime[4]) > 1:
                            Tpublishtime = Tpublishtime[0] + '-' + Tpublishtime[1] + '-' + Tpublishtime[2] + ' ' + \
                                           Tpublishtime[3]
                        else:
                            Tpublishtime = Tpublishtime[0] + '-' + Tpublishtime[1] + '-' + Tpublishtime[2] + ' ' + \
                                           Tpublishtime[3] + ':00'

                        Tauthor = main2.find('span', attrs={
                            'id': "p_origin"
                        }).find('a', attrs={'target': "_blank"})
                        # self.logger.debug(u"作者:%s",Tauthor)
                        if Tauthor is not None:
                            Tauthor = Tauthor.text.strip()
                        else:
                            Tauthor = 'None'

                        Tcontent = main2.find('div',
                                              attrs={'class': "show_text"})
                        if Tcontent is not None:
                            Tcontent = Tcontent.text.strip()
                            Tcontent = re.sub(r'\n|\t', '', Tcontent)
                        else:
                            self.logger.error(
                                u'缺少文章内容,无法构成文章,可能已被修改格式,本文停止爬取::该网站为 %s ',
                                article_url)
                            return

                        Treply = None  # 这种格式下没有这些统计可以获取
                        meta_info = None

                        article = Article(articleid,
                                          self.channel.channel_id,
                                          Ttitle,
                                          Tcontent,
                                          Tpublishtime,
                                          article_url,
                                          None,
                                          Tauthor,
                                          meta_info=meta_info)
                        article.statistics.reply_count = Treply
                        self.logger.info(article)
                        return article

                else:
                    self.logger.warn(u'存在另外一种html格式 %s', article_url)
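
The three format branches above repeat the same publish-time normalization; a small helper sketch of that logic (the slightly looser \D+ separator is the only deviation from the original pattern):

import re

def normalize_publish_time(raw):
    # Group 4 already carries the seconds when they are present, so it is used
    # as-is; otherwise ":00" is appended, yielding "YYYY-MM-DD HH:MM:SS".
    m = re.search(r'(\d{4})\D(\d{2})\D(\d{2})\D+(\d{2}:\d{2}(:\d{2})?)', raw)
    if m is None:
        return None
    year, month, day, hm, sec = m.groups()
    return '%s-%s-%s %s' % (year, month, day, hm if sec else hm + ':00')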
Пример #22
0
 def __updateOldArticleToArticleHistoryTable(self,
                                             articleList,
                                             currentTableName,
                                             historyTableName,
                                             isEventTable=False):
     '''
     更新到文章历史表
     @param currentTableName: 当前文章表:全局文章表、实体文章表或者实体事件文章表
     @param historyTableName: 历史文章表:全局文章表、实体文章表或者实体事件文章表
     @param eventId: 如果更新到实体事件文章表,则需要提供事件id,否则为None
     '''
     if len(articleList) > 0:
         if isEventTable is False:
             eventIdFieldName = ''
         else:
             eventIdFieldName = ',EVENT_ID'
         #找寻老文章
         selectSql = '''
         SELECT TID, CHANNEL_ID %s FROM %s where %s
         '''
         whereClauseList = map(
             lambda article: '(TID="%s" and CHANNEL_ID=%d)' %
             (article.tid, article.channel_id), articleList)
         self.dbProxy.execute(selectSql %
                              (eventIdFieldName, currentTableName,
                               ' or '.join(whereClauseList)))
         resultList = self.dbProxy.fetchall()
         if isEventTable:
             existingArticleList = map(
                 lambda item: Article(item[0], item[1], eventId=item[2]),
                 resultList)
         else:
             existingArticleList = map(
                 lambda item: Article(item[0], item[1]), resultList)
         toBeUpdateArticleList = list()
         for item in existingArticleList:
             index = articleList.index(item)
             obj = copy.copy(articleList[index])
             obj.eventId = item.eventId
             toBeUpdateArticleList.append(obj)
         if len(toBeUpdateArticleList) > 0:
             n = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
             if isEventTable is False:
                 eventIdFieldName = ''
             else:
                 eventIdFieldName = 'EVENT_ID,'
             insertSql = '''
             INSERT INTO %s (TID, %s CHANNEL_ID,
                 READ_COUNT,LIKE_COUNT, REPLY_COUNT,
                 FORWARD_COUNT, COLLECT_COUNT, HEAT, ADD_DATETIME)
             VALUES %s 
             '''
             valueList = list()
             for article in toBeUpdateArticleList:
                 statistics = article.statistics
                 if isEventTable is False:
                     eventIdFieldValue = ''
                 else:
                     eventIdFieldValue = str(article.eventId) + ','
                 valueList.append(
                     '("%s", %s %d, %s, %s, %s, %s, %s, %s, "%s")' %
                     (article.tid, eventIdFieldValue, article.channel_id,
                      statistics.read_count if statistics.read_count
                      is not None else Constants.DEFAULT_NUM,
                      statistics.like_count if statistics.like_count
                      is not None else Constants.DEFAULT_NUM,
                      statistics.reply_count if statistics.reply_count
                      is not None else Constants.DEFAULT_NUM,
                      statistics.forward_count if statistics.forward_count
                      is not None else Constants.DEFAULT_NUM,
                      statistics.collect_count if statistics.collect_count
                      is not None else Constants.DEFAULT_NUM,
                      statistics.heat if statistics.heat is not None else
                      Constants.DEFAULT_NUM, n))
             if len(valueList) > 0:
                 self.dbProxy.execute(insertSql %
                                      (historyTableName, eventIdFieldName,
                                       ','.join(valueList)))
                 self.dbProxy.commit()
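
A sketch of how one row of the VALUES list above is assembled, assuming Constants.DEFAULT_NUM is just a numeric placeholder (0 here) and reusing the Article/statistics attributes from the snippet:

import datetime

DEFAULT_NUM = 0  # assumption: stand-in for Constants.DEFAULT_NUM

def build_history_row(article, event_id=None):
    stats = article.statistics
    counts = [stats.read_count, stats.like_count, stats.reply_count,
              stats.forward_count, stats.collect_count, stats.heat]
    # Substitute the placeholder for any statistic the crawler could not collect.
    counts = [c if c is not None else DEFAULT_NUM for c in counts]
    event_part = '%s,' % event_id if event_id is not None else ''
    now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    return '("%s", %s%d, %s, %s, %s, %s, %s, %s, "%s")' % tuple(
        [article.tid, event_part, article.channel_id] + counts + [now])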
Пример #23
0
    def __parse_page_url(self, page_url, count_page=0):
        '''
        获取每一页的微博信息
        @return: article_list
        '''
        user_agent = random.choice(AGENTS)
        headers = {
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Host': 's.weibo.com',
            'Referer': 'http://s.weibo.com/',
            'User-Agent': user_agent
        }

        self.change_cookie()
        cookies = self.__load_cookies_from_lwp(self.user_name)
        response = self.session.get(url=page_url,
                                    headers=headers,
                                    cookies=cookies)

        # 若此cookie无效,则重新登录获取新的cookie
        if re.match(r'File not .*', response) is not None:
            self.__login(self.user_name)
            cookies = self.__load_cookies_from_lwp(self.user_name)
            response = self.session.get(url=page_url,
                                        headers=headers,
                                        cookies=cookies)

        html = etree.HTML(response)
        # 内容list
        card_wrap_list = html.xpath(
            '//div[@class="m-wrap"]/div[@id="pl_feedlist_index"]//div[@class="card-wrap"]'
        )
        article_list = list()
        for card_wrap in card_wrap_list:
            try:
                mid = card_wrap.xpath('./@mid')[0]
            except Exception as e:
                continue
            info = card_wrap.xpath('.//div[@class="content"]')[0]
            author_name = info.xpath('.//a[@class="name"]/text()')[0]
            author_id = re.search(
                r'/(\d+).?refer_flag',
                info.xpath('.//a[@class="name"]/@href')[0]).group(1)

            txt = info.xpath('.//p[@class="txt"]')[0]
            content = txt.xpath('string(.)').strip()
            title = content[:30].replace("'", "''").replace("%", "\%").replace(
                ":", "\:") + ' '

            article_info = info.xpath('.//p[@class="from"]/a')[0]
            article_url = self.HTTP + article_info.xpath('./@href')[0]

            statistics = card_wrap.xpath(
                './/div[@class="card-act"]//li/a/text()')

            collect_count = re.search(r'(\d+)', statistics[0])
            forward_count = re.search(r'(\d+)', statistics[1])
            reply_count = re.search(r'(\d+)', statistics[2])
            like_count = card_wrap.xpath(
                './/div[@class="card-act"]//li/a/em/text()')

            if collect_count:
                collect_count = collect_count.group(1).strip()
            else:
                collect_count = 0
            if forward_count:
                forward_count = forward_count.group(1).strip()
            else:
                forward_count = 0
            if reply_count:
                reply_count = reply_count.group(1).strip()
            else:
                reply_count = 0
            if len(like_count):
                like_count = like_count[0]
            else:
                like_count = 0

            publish_datetime = self.__parse_time(article_url, cookies)

            article = Article(mid,
                              self.channel.channel_id,
                              title,
                              content,
                              url=article_url,
                              publish_datetime=publish_datetime,
                              author_id=author_id,
                              author_name=author_name)

            article.statistics.collect_count = collect_count
            article.statistics.forward_count = forward_count
            article.statistics.reply_count = reply_count
            article.statistics.like_count = like_count

            if article not in article_list:
                article_list.append(article)

        self.logger.debug('page %d crawled %d articles' %
                          (count_page, len(article_list)))
        return article_list
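
The collect/forward/reply counters above share one regex-with-fallback pattern; a tiny illustrative helper (not part of the original code) that captures it:

import re

def first_int(text, default=0):
    # Action-bar labels such as "收藏 12" yield 12; bare labels with no number
    # fall back to the default, matching the if/else chains above.
    m = re.search(r'(\d+)', text or '')
    return int(m.group(1)) if m else default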