Example #1
 def crawlArticle(self, url):
     '''
     Crawl the article content and statistics for the given url
     @return: an Article instance
     '''
     # Check the url format: Baidu search results are not always questions, and only questions are crawled for now
     if url.find('question') < 0:
         self.logger.warn('Question supported only:%s', url)
         return None
     article_id = re.findall(r'question/(\d+)', url)[0]
     self.session.randomSleep()
     response = self.session.get(url, headers=CRAWL_ARTICLE_HEADERS)
     soup = BeautifulSoup(response)
     main = soup.find('div', attrs={'id': "data"}).attrs['data-state']
     articleJson = json.loads(main)
     questionJson = articleJson['entities']['questions'][article_id]
     title = questionJson['title']
     contentSoup = BeautifulSoup(questionJson['editableDetail'])
     content = contentSoup.text
     author_id = questionJson['author']['id']
     author_name = questionJson['author']['name']
     createTimeInFloat = questionJson['created']
     publish_datetime = time.strftime('%Y-%m-%d %H:%M:%S',
                                      time.localtime(createTimeInFloat))
     reply_count = questionJson['commentCount']
     read_count = questionJson['visitCount']
     collect_count = questionJson['followerCount']
     article = Article(article_id, self.channel.channel_id, title, content,
                       publish_datetime, url, author_id, author_name)
     article.statistics.reply_count = reply_count
     article.statistics.read_count = read_count
     article.statistics.collect_count = collect_count
     return article
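
A standalone illustration of the question-id extraction used above; the URL below is a made-up placeholder for a question-style url:

import re

url = 'https://www.zhihu.com/question/123456789'  # placeholder question url
article_id = re.findall(r'question/(\d+)', url)[0]  # -> '123456789'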
Example #2
    def __filterRemovedArticle(self, articleList, entityId, eventId=None):
        '''
        Filter articles by comparing against the remove table
        Return the list of articles that are not in the remove table
        '''
        if len(articleList) == 0:
            return []
        if eventId is not None:
            tableName = Constants.TABLE_SA_EVENT_ARTICLE_REMOVE + Constants.TABLE_NAME_DELIMITER + entityId
            eventCondition = ' event_id=%d and ' % eventId
        else:
            tableName = Constants.TABLE_SA_ARTICLE_REMOVE + Constants.TABLE_NAME_DELIMITER + entityId
            eventCondition = ''
        # Look up the articles in the remove table
        selectSql = '''
            SELECT TID, CHANNEL_ID FROM %s where %s (%s)
        '''
        whereClauseList = map(
            lambda article: '(TID="%s" and CHANNEL_ID=%d)' %
            (article.tid, article.channel_id), articleList)

        self.dbProxy.execute(
            selectSql %
            (tableName, eventCondition, ' or '.join(whereClauseList)))
        resultList = self.dbProxy.fetchall()  # result set of the query
        removedArticleList = map(lambda x: Article(x[0], x[1]), resultList)
        filteredArticle = filter(lambda x: x not in removedArticleList,
                                 articleList)
        #self.logger.debug('originalList:%s', map(lambda article: article.tid, articleList))
        #self.logger.debug('removedArticleList:%s', map(lambda article: article.tid, removedArticleList))
        #self.logger.debug('filteredArticle:%s', map(lambda article: article.tid, filteredArticle))
        return filteredArticle
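
For illustration, a standalone sketch of how the WHERE clause above is assembled; the table name and the (tid, channel_id) pairs are made-up placeholders:

articles = [('abc', 1), ('def', 2)]  # placeholder (tid, channel_id) pairs
whereClauseList = map(lambda a: '(TID="%s" and CHANNEL_ID=%d)' % a, articles)
selectSql = 'SELECT TID, CHANNEL_ID FROM %s where %s (%s)'
print selectSql % ('sa_article_remove_entity1', '', ' or '.join(whereClauseList))
# SELECT TID, CHANNEL_ID FROM sa_article_remove_entity1 where  ((TID="abc" and CHANNEL_ID=1) or (TID="def" and CHANNEL_ID=2))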
Example #3
    def __searchByPage(self, keywordList, startTime, endTime, page):
        # Search via Douban's internal search box
        page = str(page)  # the url expects a str
        search_url = "https://www.douban.com/group/search?start=" + page + "&cat=1013&sort=time&q="
        # cat: search by topic  sort: order by latest publish time  q: search keyword
        query = urllib.quote(' '.join(keywordList).encode('utf-8'))
        search_url = search_url + str(query)

        url_page = self.session.get(search_url,headers=self.headers)
        soup = BeautifulSoup(url_page, "lxml")
        # self.logger.debug(soup)
        main_wrap = soup.find('div', attrs={'class': "article"})
        main_article_list = main_wrap.find('div',attrs={'class':"topics"})
        articleList = list()
        hasnext = True
        if main_article_list is not None:
            title_list = main_article_list.findAll('tr', {'class': 'pl'})
            for title in title_list:
                article_publishtime = title.find('td', attrs={'class': "td-time"}).attrs['title']

                urlTime = time.strptime(article_publishtime, "%Y-%m-%d %H:%M:%S")

                Y, M, D, H = urlTime[0:4]
                urlTime2 = datetime.datetime(Y, M, D, H)
                urlTime2 = time.mktime(urlTime2.timetuple())
                # convert to timestamps (float) for comparison
                startTime = endTime - datetime.timedelta(days=2)
                startTimeIntSecond = time.mktime(startTime.timetuple())
                endTimeIntSecond = time.mktime(endTime.timetuple())

                # crawl only if the publish time falls within the range
                if urlTime2 >= startTimeIntSecond and urlTime2 <= endTimeIntSecond:
                    article_url = title.find('a').attrs['href']
                    self.logger.debug(article_url)
                    (content, author_name, tid, like_count) = self.crawlArticle(article_url)
                    if content is None:  # the topic has been deleted or uses another layout
                        continue

                    article_title = title.find('td', attrs={'class': "td-subject"}).text
                    article = Article(tid,  self.channel.channel_id, article_title, content, article_publishtime, article_url, None,
                                      author_name)
                    self.crawlComment(article)
                    reply_count = title.find('td', attrs={'class': "td-reply"}).text.strip()
                    reply_count = re.sub(u'回应', '', reply_count)  # strip the Chinese word for 'replies', keeping only the digits
                    article.statistics.reply_count = reply_count
                    article.statistics.like_count = like_count
                    if article not in articleList:
                        articleList.append(article)

                else:
                    print len(articleList)
                    hasnext = False
                    break

            return (articleList, hasnext)
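
A standalone sketch of the time-window check used above, with made-up values for the publish time and the end of the search range:

import time
import datetime

article_publishtime = '2018-07-01 12:30:00'  # placeholder publish time
urlTime = time.strptime(article_publishtime, "%Y-%m-%d %H:%M:%S")
Y, M, D, H = urlTime[0:4]
urlTime2 = time.mktime(datetime.datetime(Y, M, D, H).timetuple())  # publish time rounded to the hour, as a float timestamp
endTime = datetime.datetime(2018, 7, 2)  # placeholder end of the window
startTime = endTime - datetime.timedelta(days=2)
in_range = time.mktime(startTime.timetuple()) <= urlTime2 <= time.mktime(endTime.timetuple())  # True here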
Example #4
    def crawlArticle(self, url, **kwargs):
        '''
        Crawl the article content and statistics for the given url
        @return: an Article instance
        '''
        # TBD: reposts are not handled for now
        searchTitle = ''
        referer = None
        if (kwargs['flag'] == 0):
            if 'referer' not in kwargs:
                return None
            if 'title' in kwargs:
                searchTitle = kwargs['title']
            else:
                searchTitle = ''
        try:
            self.session_public_article.randomSleep()
            if kwargs['flag'] == 0:
                referer = kwargs['referer']
                headers = HEADERS_ARTICLE.copy()
                headers['Referer'] = referer
                response = self.session_public_article.get(
                    url, textRspOnly=False, headers=headers)
            else:
                response = self.session_public_article.get(
                    url, textRspOnly=False, headers=HEADER_PUBLIC_ARTICLE)
            mid = re.findall(r'var mid = .*"(\d+)";',
                             response.text)[0] + '-' + re.findall(
                                 r'var idx = .*"(\d+)";', response.text)[0]
            soup = BeautifulSoup(response.text, 'lxml')
            main = soup.find('div', {'id': "img-content"})
            title = main.find('h2').text.strip()
            content = main.find('div', {'id': "js_content"}).text.strip()
            profile = main.find('div', {'class': "profile_inner"})
            author_id = profile.find('span').text.strip()
            author_name = profile.find('strong').text.strip()
            publish_datetime = kwargs['publish_datetime']

            article = Article(mid,
                              self.channel.channel_id,
                              title,
                              content,
                              publish_datetime=publish_datetime,
                              url=url,
                              author_id=author_id,
                              author_name=author_name,
                              meta_info='{refer="%s"}' % referer)
            self.logger.debug('Successfully parse article:%s', title)
            return article
        except:
            self.logger.error('Fail to get article for %s: %s due to %s', url,
                              searchTitle, traceback.format_exc())
            return None
Example #5
    def __fetchOldArticleList(self, articleList, articleCount=100):
        '''
        From the global article table, fetch the ids of articles that are not yet extinct
        and that are not among the records brought back by this crawl
        '''
        # used to query the total count (for paging)
        selectSql_count = 'SELECT COUNT(*) FROM %s where extinct="N" and channel_id=%d '
        sql2 = selectSql_count % (Constants.TABLE_SA_ARTICLE,
                                  self.channel.channel_id)
        # sql for fetching the old articles
        selectSql = 'SELECT TID,title, publish_datetime,url, meta_info FROM %s where extinct="N" and channel_id=%d '
        sql = selectSql % (Constants.TABLE_SA_ARTICLE, self.channel.channel_id)

        if len(articleList) > 0:
            whereClauseList = map(
                lambda article: ' tid<>"%s" ' % (article.tid), articleList)
            sql += ' and (%s)' % (' and '.join(whereClauseList))
            sql2 += ' and (%s)' % (' and '.join(whereClauseList))

        sql2 += ' order by add_datetime desc'

        self.dbProxy.execute(sql2)
        resultList2 = self.dbProxy.fetchall()
        # print '12456789sssssssssssssssssss'
        # print resultList2 #((53,),)
        resultList2 = re.findall(r'\d+', str(resultList2))  # returns a list
        # print resultList2[0]
        if int(resultList2[0]) > int(articleCount):
            randpage = random.randint(
                0, int(math.ceil(float(resultList2[0]) / articleCount)))
        else:
            randpage = 0  # randpage picks a random page from the database

        sql += ' order by add_datetime desc limit %d,%d' % (randpage,
                                                            articleCount)
        self.dbProxy.execute(sql)
        resultList = self.dbProxy.fetchall()

        return map(
            lambda item: Article(item[0],
                                 self.channel.channel_id,
                                 title=item[1],
                                 publish_datetime=item[2],
                                 url=item[3],
                                 meta_info=item[4]), resultList)
Example #6
    def __filterRemovedArticle(self, articleList, entityId, eventId=None):
        '''
        Filter articles by comparing against the remove table
        Return the list of articles that are not in the remove table
        '''
        if len(articleList) == 0:
            return []
        if eventId is not None:
            tableName = Constants.TABLE_SA_EVENT_ARTICLE_REMOVE + Constants.TABLE_NAME_DELIMITER + entityId
            eventCondition = ' event_id=%d and ' % eventId

            start_datetime, end_datetime = self.__fetchEventTime(
                entityId, eventId)

            # Filter out articles whose publish time is not between the event's start and end time
            article_new_list = list()
            for article in articleList:
                if (str(article.publish_datetime) > str(start_datetime)) and (
                        str(article.publish_datetime) < str(end_datetime)):
                    article_new_list.append(article)

            articleList = article_new_list

        else:
            tableName = Constants.TABLE_SA_ARTICLE_REMOVE + Constants.TABLE_NAME_DELIMITER + entityId
            eventCondition = ''
        # Look up the articles in the remove table
        selectSql = '''
            SELECT TID, CHANNEL_ID FROM %s where %s (%s)
        '''
        whereClauseList = map(
            lambda article: '(TID="%s" and CHANNEL_ID=%d)' %
            (article.tid, article.channel_id), articleList)

        self.dbProxy.execute(
            selectSql %
            (tableName, eventCondition, ' or '.join(whereClauseList)))
        resultList = self.dbProxy.fetchall()  # result set of the query
        removedArticleList = map(lambda x: Article(x[0], x[1]), resultList)
        filteredArticle = filter(lambda x: x not in removedArticleList,
                                 articleList)

        return filteredArticle
Example #7
    def __seperateNewOldArticles(self, articleList, entityId=None):
        '''
        Query the global article table and separate new articles from existing ones
        '''
        if len(articleList) == 0:
            return ([], [])
        if entityId is None:
            selectSql = 'select tid, channel_id from %s where ' % Constants.TABLE_SA_ARTICLE
        else:
            selectSql = 'select tid, channel_id from %s where ' % (
                Constants.TABLE_SA_ARTICLE + Constants.TABLE_NAME_DELIMITER +
                entityId)
        whereClauseList = map(
            lambda article: '(tid="%s" and channel_id=%d)' %
            (article.tid, article.channel_id), articleList)
        self.dbProxy.execute(selectSql + ' or '.join(whereClauseList))
        resultList = map(lambda x: Article(x[0], x[1]),
                         self.dbProxy.fetchall())

        existingArticleList = filter(lambda x: x in resultList, articleList)
        newArticleList = filter(lambda x: x not in resultList, articleList)
        return (existingArticleList, newArticleList)
Example #8
    def crawlArticle(self, url):
        '''
        Crawl the article content and statistics for the given url
        :param url:
        :return: an Article instance
        '''
        cookies = None
        html = self.session.download(url=url,
                                     encoding='utf-8',
                                     data=None,
                                     timeout=10,
                                     retry=3,
                                     addr=True,
                                     cookies=cookies)
        if html:
            article_url = html['url']
            article_url = re.findall(
                r'.*?\.html|.*?\.htm|.*?\.shtml|.*?\.shtm', article_url)[0]
            article_id = re.findall(r'c_\d+', article_url)[0]
            article_id = article_id[2:]
            soup = BeautifulSoup(html['html'], "html.parser")
            main1 = soup.find('div', attrs={'class': "widthMain main"})
            main2 = soup.find('div', attrs={'class': "main pagewidth"})
            main3 = soup.find('body', attrs={'class': "streamline-page"})
            main4 = soup.find('div', attrs={'class': "h-title"})
            main5 = soup.find('div', attrs={'id': "article"})
            main6 = soup.find('div', attrs={'id': "Title"})
            main7 = soup.find('div', attrs={'class': "article"})

            if main1 is not None:
                self.logger.debug("main1")
                Ttitle, Ttime, Tauthor, Tcontent = self.crawlMain1(main1)
                if (Ttitle != None and Ttime != None and Tcontent != None):
                    article = Article(tid=article_id,
                                      channel_id=self.channel.channel_id,
                                      title=Ttitle,
                                      author_name=Tauthor,
                                      publish_datetime=Ttime,
                                      content=Tcontent,
                                      url=article_url)
                    return article

            if main2 is not None:
                self.logger.debug("main2")
                Ttitle, Ttime, Tauthor, Tcontent = self.crawlMain2(main2)
                if (Ttitle != None and Ttime != None and Tcontent != None):
                    article = Article(tid=article_id,
                                      channel_id=self.channel.channel_id,
                                      title=Ttitle,
                                      author_name=Tauthor,
                                      publish_datetime=Ttime,
                                      content=Tcontent,
                                      url=article_url)
                    return article

            if main3 is not None:
                self.logger.debug("main3")
                Ttitle, Ttime, Tauthor, Tcontent = self.crawlMain3(main3)
                if (Ttitle != None and Ttime != None and Tcontent != None):
                    article = Article(tid=article_id,
                                      channel_id=self.channel.channel_id,
                                      title=Ttitle,
                                      author_name=Tauthor,
                                      publish_datetime=Ttime,
                                      content=Tcontent,
                                      url=article_url)
                    return article

            if main3 is None and main4 is not None:
                self.logger.debug("main4")
                Ttitle, Ttime, Tauthor, Tcontent = self.crawlMain4(soup)
                if (Ttitle != None and Ttime != None and Tcontent != None):
                    article = Article(tid=article_id,
                                      channel_id=self.channel.channel_id,
                                      title=Ttitle,
                                      author_name=Tauthor,
                                      publish_datetime=Ttime,
                                      content=Tcontent,
                                      url=article_url)
                    return article

            if main5 is not None:
                self.logger.debug("main5")
                Ttitle, Ttime, Tauthor, Tcontent = self.crawlMain5(main5)
                if (Ttitle != None and Ttime != None and Tcontent != None):
                    article = Article(tid=article_id,
                                      channel_id=self.channel.channel_id,
                                      title=Ttitle,
                                      author_name=Tauthor,
                                      publish_datetime=Ttime,
                                      content=Tcontent,
                                      url=article_url)
                    return article

            if main6 is not None:
                self.logger.debug("main6")
                Ttitle, Ttime, Tauthor, Tcontent = self.crawlMain6(soup)
                if (Ttitle != None and Ttime != None and Tcontent != None):
                    article = Article(tid=article_id,
                                      channel_id=self.channel.channel_id,
                                      title=Ttitle,
                                      author_name=Tauthor,
                                      publish_datetime=Ttime,
                                      content=Tcontent,
                                      url=article_url)
                    return article

            if main7 is not None:
                self.logger.debug("main7")
                Ttitle, Ttime, Tauthor, Tcontent = self.crawlMain7(soup)
                if (Ttitle != None and Ttime != None and Tcontent != None):
                    article = Article(tid=article_id,
                                      channel_id=self.channel.channel_id,
                                      title=Ttitle,
                                      author_name=Tauthor,
                                      publish_datetime=Ttime,
                                      content=Tcontent,
                                      url=article_url)
                    return article

            if (main1 is None and main2 is None and main3 is None
                    and main4 is None and main5 is None and main6 is None
                    and main7 is None):
                self.logger.error(u"存在另外一种html格式::%s", url)
                return
Example #9
    def crawlArticle(self, url):
        '''
        Crawl the article content and statistics for the given url
        @return: an Article instance
        '''
        article = None
        html = self.session.download(url,
                                     encoding='utf-8',
                                     data=None,
                                     isJson=False,
                                     timeout=10,
                                     retry=3,
                                     addr=True)
        article_url = html['url']

        # if article_url.find(self.channel.url)<0:
        #     self.logger.warn('Unrelated url found:%s',url)
        #     continue

        # self.logger.debug(article_url)
        soup = BeautifulSoup(html['html'], 'html.parser')

        noResultDivList = soup.findAll('div', {'class': 'pl_noresult'})
        if len(noResultDivList) > 0:
            hasnext = False
            self.logger.info('No result')
            return article

        main = soup.find('div', attrs={'class': "left_section"})

        if main:
            Ttitle = main.find('div', attrs={
                'id': "j_core_title_wrap"
            }).find('h1')
            Ttitle1 = main.find('div', attrs={
                'id': "j_core_title_wrap"
            }).find('h3')
            if Ttitle:
                Ttitle = Ttitle.text.strip()
            elif Ttitle1:
                Ttitle = Ttitle1.text.strip()
            else:
                Ttitle = ''

            # self.logger.debug(u'标题%s',Ttitle)

            data_field = main.find('div', attrs={
                'id': "j_p_postlist"
            }).find('div').attrs['data-field'].strip()
            data_field = json.loads(data_field)
            publish_datetime = data_field['content']
            if 'date' in publish_datetime.keys():
                publish_datetime = publish_datetime['date']
            else:
                publish_datetime = main.find('div',
                                             attrs={
                                                 'id': "j_p_postlist"
                                             }).find('div').find_all(
                                                 'span',
                                                 attrs={'class': "tail-info"
                                                        })[-1].text.strip()

            publish_datetime = re.findall(
                r'(\d{4})\D(\d{2})\D(\d{2})\D(\d{2}:\d{2}(:\d{2})?)',
                publish_datetime)[0]

            if len(publish_datetime[4]) > 1:
                publish_datetime = publish_datetime[
                    0] + '-' + publish_datetime[1] + '-' + publish_datetime[
                        2] + ' ' + publish_datetime[3] + publish_datetime[4]
            else:
                publish_datetime = publish_datetime[
                    0] + '-' + publish_datetime[1] + '-' + publish_datetime[
                        2] + ' ' + publish_datetime[3] + ':00'

            Tid = data_field['author']['user_id']
            Tauthor = data_field['author']['user_name']
            Treply = soup.find('li', attrs={
                'class': "l_reply_num"
            }).find('span').text.strip()  # total reply count
            Tcontent = main.find('div', attrs={
                'id': "j_p_postlist"
            }).find('div').find('cc').text.strip()

            article = Article(Tid,
                              self.channel.channel_id,
                              Ttitle,
                              Tcontent,
                              publish_datetime,
                              url=article_url,
                              author_id=None,
                              author_name=Tauthor,
                              meta_info=None)

            article.statistics.reply_count = Treply

        else:
            self.logger.warn(u'很抱歉,该贴已被删除。%s', article_url)

        return article
Example #10
    def crawlArticle(self, url):
        '''
        Crawl the article content and statistics for the given url
        @return: an Article instance
        '''
        html = self.session.download(url,
                                     encoding='utf-8',
                                     data=None,
                                     isJson=False,
                                     timeout=10,
                                     retry=3,
                                     addr=True)
        article_url = html['url']
        self.logger.debug(article_url)
        soup = BeautifulSoup(html['html'], 'html.parser')

        main = soup.find('div', attrs={'id': "bd"})
        main1 = soup.find('div', attrs={
            'class': "wd-question"
        })  # forum Q&A post, e.g. http://bbs.tianya.cn/post-730-5795-1-1.shtml
        article = None
        if main:
            Ttitle = main.find('h1').find('span').text
            Ttime = main.find('div', attrs={
                'class': "atl-info"
            }).find_all('span')[1].text.strip()
            Ttime = re.sub(u'[\u4e00-\u9fa5]+:', '', Ttime)
            Tid = main.find('div', attrs={
                'class': "atl-info"
            }).find_all('span')[0].find('a').attrs['uid'].strip()
            Tauthor = main.find('div', attrs={
                'class': "atl-info"
            }).find_all('span')[0].find('a').attrs['uname'].strip()
            Tclick = main.find('div', attrs={
                'class': "atl-info"
            }).find_all('span')[2].text.strip()
            Tclick = re.sub(u'[\u4e00-\u9fa5]+:', '', Tclick)
            Tclick = int(Tclick)
            Treply = main.find('div', attrs={
                'class': "atl-info"
            }).find_all('span')[3].text.strip()
            Treply = re.sub(u'[\u4e00-\u9fa5]+:', '', Treply)
            Treply = int(Treply)
            Tlike = main.find('a', attrs={'class': "zantop"}).attrs['_count']
            Tcontent = main.find('div',
                                 attrs={
                                     'class': "bbs-content clearfix"
                                 }).text.strip()

            article = Article(Tid,
                              self.channel.channel_id,
                              Ttitle,
                              Tcontent,
                              Ttime,
                              url=article_url,
                              author_id=None,
                              author_name=Tauthor)

            article.statistics.reply_count = Treply
            article.statistics.click_count = Tclick
            article.statistics.like_count = Tlike

        elif main1:
            Ttitle = main1.find('h1').find('span').text
            Ttime = main1.find('div').attrs['js_replytime']
            Ttime = re.findall(r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})',
                               Ttime)[0]
            Tid = main1.find('div').attrs['_host']
            Tauthor = main1.find('div', attrs={
                'class': "q-info"
            }).find('a').text
            Tclick = main1.find('div').attrs['js_clickcount']
            Treply = main1.find('div').attrs['js_powerreply']
            Tcontent = main1.find('div', attrs={'class': "q-content atl-item"})
            if Tcontent:
                Tcontent = Tcontent.find('div', attrs={
                    'class': "text"
                }).text.strip()
            else:
                Tcontent = ''

            article = Article(Tid,
                              self.channel.channel_id,
                              Ttitle,
                              Tcontent,
                              Ttime,
                              url=article_url,
                              author_id=None,
                              author_name=Tauthor)

            article.statistics.reply_count = Treply
            article.statistics.click_count = Tclick
        return article
Example #11
    def crawlArticle(self, req_data):

        hasnext = False

        req_url = 'http://120.55.151.61/Treehole/V4/Cave/getList.action'
        data_r = self.s.post(url=req_url, data=req_data, headers=self.headers)
        data_r = data_r.text

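        # The response body is JSON-like text; true/false are defined so that eval()
        # can turn it into Python objects before it is re-serialised and parsed below.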
        true = True
        false = False

        data_j = eval(data_r)
        data_js = json.dumps(data_j)
        data_dict = simplejson.loads(data_js)

        data = data_dict['data']
        timestamp = data['timestampLong']
        timestampLong = data['timestampLong'] / 1000
        timestampLong_struct = datetime.datetime.fromtimestamp(timestampLong)
        timestampLong_str = timestampLong_struct.strftime('%Y-%m-%d %H:%M:%S')
        messageBO = data['messageBOs']

        for each in messageBO:  # 20 records come back per batch
            if 'studentBO' in each:
                content = each.get('content', False)
                if content:
                    publish_timeStamp = each['issueTime'] / 1000
                    datetime_struct = datetime.datetime.fromtimestamp(
                        publish_timeStamp)
                    publish_time = datetime_struct.strftime(
                        '%Y-%m-%d %H:%M:%S')
                    if datetime.datetime.strptime(
                            publish_time,
                            '%Y-%m-%d %H:%M:%S') < self.startTime:
                        hasnext = False
                        break

                    schoolName = each['schoolName']
                    readCount = each['readCount']
                    relayedCount = each['relayedCount']
                    reply_count = each['comments']
                    content = each['content']
                    if len(content) > 8:
                        title = content[:8] + '...'
                    elif len(content) <= 0:
                        title = u'内容为图片'
                    else:
                        title = content

                    studentBO = each['studentBO']
                    studentBO_id = studentBO['studentId']
                    studentBO_name = studentBO['nickName']
                    likeCount = each['likeCount']
                    article_id = each['messageId']
                    publish_method = each['source']
                    publish_method = publish_method[publish_method.find('_') +
                                                    1:]

                    data2 = 'platform=1&phoneBrand=HONOR&phoneVersion=24&versionNumber=9.1.1&phoneModel=BLN-AL10&sharePlatform=4&channel=huaweiMarket&plateId=1&messageId=' + str(
                        article_id) + '&'
                    article_urlPOST = 'http://120.55.151.61/Treehole/V4/Message/preShare.action'
                    data_urlPOST = self.s.post(url=article_urlPOST,
                                               data=data2,
                                               headers=self.headers)

                    data3 = data_urlPOST.text
                    data_j3 = eval(data3)
                    data_js3 = json.dumps(data_j3)
                    data_dict3 = simplejson.loads(data_js3)

                    data4 = data_dict3['data']

                    article_url = data4['shareUrl']

                    article = Article(article_id,
                                      self.channel.channel_id,
                                      title,
                                      content,
                                      publish_time,
                                      article_url,
                                      author_id=studentBO_id,
                                      author_name=studentBO_name,
                                      publish_method=publish_method,
                                      entity=schoolName)
                    article.statistics.reply_count = reply_count
                    article.statistics.like_count = likeCount
                    article.statistics.read_count = readCount
                    article.statistics.forward_count = relayedCount
                    # print article.__str__()

                    if article is not None and article not in self.articleList:
                        self.articleList.append(article)

        # print datetime.datetime.strptime(timestampLong_str,'%Y-%m-%d %H:%M:%S')
        # print '1111111'
        if datetime.datetime.strptime(timestampLong_str,
                                      '%Y-%m-%d %H:%M:%S') < self.startTime:
            hasnext = False
            return ('', hasnext)
        else:
            hasnext = True
            res_data = 'timestamp=' + str(
                timestamp
            ) + '&preMoodTimestap=' + str(
                timestamp
            ) + '&platform=1&phoneBrand=HONOR&phoneVersion=24&versionNumber=9.1.1&phoneModel=BLN-AL10&type=1&channel=huaweiMarket&'
            return (res_data, hasnext)
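
A hedged sketch of how the (res_data, hasnext) tuple returned above could drive a paging loop; the crawler instance and the first request payload are assumptions, not part of the original code:

def crawlAllPages(crawler, first_req_data):
    # keep requesting the next page until crawlArticle reports nothing newer than startTime
    req_data, hasnext = first_req_data, True
    while hasnext:
        req_data, hasnext = crawler.crawlArticle(req_data)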
Example #12
    def __searchByPage(self, keywordList, startTime, endTime, page):
        # Search via Sina blog's internal search
        page = str(page)  # the url expects a str
        query = urllib.quote(' '.join(keywordList).encode('utf-8'))
        params = {
            'c': 'blog',
            'range': 'article',  # include everything
            'by': 'all',
            'sort': 'time',
            'col': '',
            'source': '',
            'from': '',
            'country': '',
            'size': '',
            'time': '',
            'a': '',
            'isown': 'false',
            'page': page,
            'dpc': '',
            'q': query
        }
        search_url = "http://search.sina.com.cn/"

        url_page = self.session.get(search_url,
                                    params=params,
                                    headers=self.headers)
        soup = BeautifulSoup(url_page, "lxml")
        # self.logger.debug(soup)
        main_wrap = soup.find('div', attrs={'class': "result-boxes"})

        if main_wrap is None:  # guard against a miss caused by a transient network problem
            self.logger.debug(u'第一次查找没有结果再找一遍中')
            url_page = self.session.get(search_url,
                                        params=params,
                                        headers=self.headers)
            soup = BeautifulSoup(url_page, "lxml")
            main_wrap = soup.find('div', attrs={'class': "result-boxes"})
            if main_wrap is None:  # only if the retry also finds nothing do we treat it as no result
                self.logger.debug(u'第二次查找没有结果,该关键词没有结果')
                return (list(), False)

        main_article_list = main_wrap.findAll(
            'div', attrs={'class': "box-result clearfix"})

        hasnext = True
        if main_article_list is not None:
            print '1'
            for title in main_article_list:
                print '2'
                article_publishtime_wrap = title.findAll('p')[1]
                article_publishtime = article_publishtime_wrap.find(
                    'span', attrs={
                        'class': "fgray_time"
                    }).text.strip()
                self.logger.error(article_publishtime)
                urlTime = time.strptime(article_publishtime,
                                        "%Y-%m-%d %H:%M:%S")

                Y, M, D, H = urlTime[0:4]
                urlTime2 = datetime.datetime(Y, M, D, H)
                urlTime2 = time.mktime(urlTime2.timetuple())
                # convert to timestamps (float) for comparison
                startTime = endTime - datetime.timedelta(days=5)
                startTimeIntSecond = time.mktime(startTime.timetuple())
                endTimeIntSecond = time.mktime(endTime.timetuple())

                # crawl only if the publish time falls within the range
                if urlTime2 >= startTimeIntSecond and urlTime2 <= endTimeIntSecond:
                    print '3'
                    title_wrap = title.find('h2')
                    article_url_parent = title_wrap.find('a').attrs['href']
                    article_url_index = article_url_parent.rfind('/')
                    article_url_child = article_url_parent[article_url_index +
                                                           1:]
                    article_url = 'http://blog.sina.com.cn/s/blog_' + article_url_child + '.html'
                    Ttitle = title_wrap.find('a').text.strip()
                    self.logger.debug(article_url)
                    meta_info = article_url_child
                    (content, reply_count, like_count, read_count,
                     collect_count, forward_count) = self.crawlArticle(
                         article_url, meta_info)
                    self.logger.debug(like_count)

                    if content is None:  # the topic has been deleted or uses another layout
                        print '756236'
                        continue
                    tid = 't_' + article_url_child
                    author_name = article_publishtime_wrap.find(
                        'a', attrs={
                            'class': "rib-author"
                        }).text
                    article = Article(tid,
                                      self.channel.channel_id,
                                      Ttitle,
                                      content,
                                      article_publishtime,
                                      article_url,
                                      None,
                                      author_name,
                                      meta_info=meta_info)

                    # self.crawlComment(article)

                    article.statistics.reply_count = reply_count
                    article.statistics.like_count = like_count
                    article.statistics.read_count = read_count
                    article.statistics.collect_count = collect_count
                    article.statistics.forward_count = forward_count
                    # self.logger.debug(article)
                    if article not in self.articleList:
                        self.articleList.append(article)

                else:
                    print '78956'
                    print len(self.articleList)
                    hasnext = False
                    break

            return hasnext
Example #13
class JianShuCrawler(object):
    def __init__(self, channel, logger=None):
        '''

        :param channel:
        :param logger:
        '''
        if logger is None:
            self.logger = Logging.getLogger(Logging.LOGGER_NAME_DEFAULT)
        else:
            self.logger = logger
        self.channel = channel  # the media/channel entity
        # set request headers and proxy
        self.headers = {
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding':
            'gzip, deflate, br',
            'Accept-Language':
            'zh-CN,zh;q=0.9',
            'Host':
            'www.jianshu.com',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
        }
        self.headers_1 = {
            'Host': 'www.jianshu.com',
            'Connection': 'keep-alive',
            'Connection-Length': '0',
            'Accept': 'application/json',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Accept-Encoding': 'gzip,deflate,br',
            'Accept-Language': 'zh-CN,zh;q=0.9'
        }
        # user credentials
        self.usename = '13432881156'
        self.password = '******'
        # log in with a cookie
        self.session = SessionCrawler()
        self.session.cookies = cookielib.LWPCookieJar(
            filename='cookie')  # load the cookie from file
        try:
            self.session.cookies.load(ignore_discard=True)
        except:
            print('未能加载cookie')
        if self.islogin():
            print('已经登录简书')
        else:
            self.login(self.usename, self.password)

    def searchArticle(self, keywordList, endTime):
        '''
            Search for articles matching a list of keywords within a time range
            :param keywordList: list of keywords
            :param endTime: end of the search time range
            :return:
        '''
        if self.islogin():
            page = 0  # page counter
            articleList = list()
            hasnext = True  # start crawling
            while hasnext:
                page += 1
                if page == 5:
                    break
                (articleListInPage,
                 hasnext) = self._searchByPage(keywordList, endTime, page)
                is_article = list()
                for article in articleListInPage:
                    if article not in articleList:
                        is_article.append(article)
                articleList.extend(is_article)
                print(len(articleList))

            self.logger.debug('总共抓取文章有:%s' % len(articleList))
            return articleList
        else:
            self.logger.error(u'未登录简书,请确认登录')
            return None

    def _searchByPage(self, keywordList, endTime, page):
        hasnext = True
        articleList = list()
        page = str(page)  # search via Jianshu's internal search box
        search_url_1 = "https://www.jianshu.com/search/do?q="
        search_url_2 = "&type=note&page=" + page + "&order_by=published_at&time_range=a_day"
        # q: keyword query  page: page number  order_by: sort field  time_range: restrict to the past day
        query = urllib.quote(' '.join(keywordList).encode('utf-8'))
        search_url = search_url_1 + str(query) + search_url_2  # url of the page to crawl
        csrf = self.getCsrf(keywordList, page)
        post_data = {'X-CSRF-Token': csrf}
        # fetch the page
        url_page = self.session.post(search_url,
                                     headers=self.headers_1,
                                     data=post_data,
                                     textRspOnly=False)
        if url_page.status_code == 200:
            self.logger.debug(u'已经获取中大新闻页面')
        else:
            self.logger.debug(u'中大新闻获取完毕')
            return (articleList, False)  # the last page of results has been reached

        # parse the returned result page below
        articleJson = None
        try:
            articleJson = json.loads(url_page.text)
        except Exception, e:
            self.logger.error(e)

        if articleJson is None:
            self.logger.error('articleJson is None')
            return

        allInfo = articleJson['entries']  # allInfo is the list of entry dicts: 0:{}, 1:{}, ...
        for info in allInfo:
            pre_publishedTime = info["first_shared_at"]
            publishedTime_1 = ''.join(
                re.findall('[^A-Za-z]', pre_publishedTime[0:-5]))  # article publish time
            publishedTime = publishedTime_1[0:10] + ' ' + publishedTime_1[10:18]
            #print(publishedTime)
            urlTime = time.strptime(publishedTime, '%Y-%m-%d %H:%M:%S')
            Y, M, D, H = urlTime[0:4]
            urlTime2 = datetime.datetime(Y, M, D, H)
            # convert to a timestamp (float) for comparison
            urlTime2 = time.mktime(urlTime2.timetuple())  # timestamp of the publish time
            startTime = endTime - datetime.timedelta(
                days=self.channel.search_ranges)
            startTimeIntSecond = time.mktime(startTime.timetuple())  # timestamp of the range start
            endTimeIntSecond = time.mktime(endTime.timetuple())  # timestamp of the range end
            if urlTime2 >= startTimeIntSecond and urlTime2 <= endTimeIntSecond:
                # get the article link and crawl its content
                pre_article_url = info['slug']
                articleUrl = "https://www.jianshu.com/p/" + pre_article_url  # link to the article
                (content, authorName,
                 title) = self.crawlArticle(articleUrl)  # crawl the article content
                #print(title)
                if content is None:  # the article has been deleted or uses another layout
                    print("没有爬到文章")
                    continue
                authorId = info["user"]["id"]  # author id
                likeCount = info["likes_count"]  # like count
                readCount = info["views_count"]  # read count
                replyCount = info["public_comments_count"]  # comment count
                tid = info["id"]  # article id
                article = Article(tid, self.channel.channel_id, title, content,
                                  publishedTime, articleUrl, authorId,
                                  authorName)
                #self.crawlComment(article)
                article.statistics.reply_count = replyCount
                article.statistics.like_count = likeCount
                article.statistics.read_count = readCount
                if (article is not None) and (article not in articleList):
                    articleList.append(article)
            else:
                print('结束爬虫')
                hasnext = False
                break
        return (articleList, hasnext)
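
A standalone illustration of the first_shared_at normalisation used in _searchByPage above; the raw value below is an assumed example of the ISO-style timestamp the slicing implies:

import re

pre_publishedTime = '2018-07-20T10:30:00.000+08:00'  # assumed raw value
publishedTime_1 = ''.join(re.findall('[^A-Za-z]', pre_publishedTime[0:-5]))  # drop the last 5 chars and every letter
publishedTime = publishedTime_1[0:10] + ' ' + publishedTime_1[10:18]  # -> '2018-07-20 10:30:00'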
Example #14
    def crawlArticle(self, url):
        '''
        Crawl the article content and statistics for the given url
        @return: an Article instance
        '''
        cookies = 'U_TRS1=000000fa.9fe376b4.58573ebc.bde2f2c3; UOR=,vip.stock.finance.sina.com.cn,; vjuids=3923fcfb8.15914cd122a.0.e347599b65a6; SINAGLOBAL=183.63.92.250_1482112700.861930; SUB=_2AkMvC7H0f8NhqwJRmP4WzWzrb4xwzgnEieLBAH7sJRMyHRl-yD83qlNetRBAqqE4nv4pjjxQaUfLZo_Os-Bxsw..; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9WFZzJ6nbHTRfVEqOXp-S.5z; SGUID=1482721389362_efec0e8d; vjlast=1488765553.1489054965.10; bdshare_firstime=1492414283526; _ct_uid=58f46f61.537a7929; lxlrtst=1492423120_o; rotatecount=2; Apache=59.42.29.149_1492670298.869113; ULV=1492670299361:18:6:6:59.42.29.149_1492670298.869113:1492670298484; afpCT=1; CNZZDATA1252916811=1442218969-1492654141-http%253A%252F%252Fnews.sina.com.cn%252F%7C1492664941; UM_distinctid=15b8a154522e79-0a3f79bddc9d05-4e45042e-100200-15b8a154523a49; CNZZDATA5399792=cnzz_eid%3D349789736-1492650802-http%253A%252F%252Fnews.sina.com.cn%252F%26ntime%3D1492667002; U_TRS2=00000095.1c285e96.58f85761.e07aa962; lxlrttp=1492423120'
        html = self.session.download(url, encoding='utf-8', data=None, timeout=10, retry=3, addr=True, cookies=cookies)
        if html:
            article_url = html['url']
            article_url = re.findall(r'.*?\.html|.*?\.htm|.*?\.shtml|.*?\.shtm', article_url)[0]
            self.logger.info('[SinaNews]'+article_url)
            # get the publish date from the url
            date = re.findall(r'/(\d{4}-\d{2}-\d{2})/', article_url)
            if len(date) == 0:
                return None
            # if date[0] < '2015-07-01':
            #     html = self.session.download(url, encoding='gbk', data=None, timeout=10, retry=3, addr=True)

            soup = BeautifulSoup(html['html'], 'lxml')
            main = soup.find('div', attrs={'class': "wrap-inner"})
            main1 = soup.find('div', attrs={'class': "Main clearfix"})
            main2 = soup.find('div', attrs ={'class': "main-content w1240"})

            # first page layout
            if main is not None:
                self.logger.debug('走第一种格式')
                # get the title
                Ttitle = main.find('h1', attrs={'id': "artibodyTitle"})
                if Ttitle is None:
                    self.logger.error('[SinaNews]' + '缺少标题,无法构成文章,可能已被修改格式,本文停止爬取')
                    return
                else:
                    Ttitle = Ttitle.text.strip()

                # get the publish time
                Ttime = main.find('span', attrs={'class': 'time-source'})
                if Ttime is not None:
                    Ttime = Ttime.text.strip()
                    Ttime = re.findall(r'(\d{4})\D(\d{2})\D(\d{2})\D(\d{2}:\d{2}).*', Ttime)[0]
                    Ttime = Ttime[0] + '-' + Ttime[1] + '-' + Ttime[2] + ' ' + Ttime[3]
                else:
                    self.logger.error('[SinaNews]' + '缺少文章发布时间,无法构成文章,可能已被修改格式,本文停止爬取')
                    return
                if len(Ttime) == 16:
                    Ttime = Ttime + ':00'

                # get the author
                Tauthor = soup.find('span', attrs={'class': "time-source"})
                if Tauthor is not None:
                    Tauthor = Tauthor.find('a')
                    if Tauthor is not None:
                        Tauthor = Tauthor.text.strip()
                    else:
                        Tauthor = None
                else:
                    Tauthor = None

                # get the content
                Tcontent = main.find('div', attrs={'id': "artibody"})
                if Tcontent is not None:
                    Tcontent = Tcontent.text.strip()
                    Tcontent = re.sub(r'\n|\t', '', Tcontent)
                else:
                    self.logger.error('[SinaNews]' + '缺少文章内容,无法构成文章,可能已被修改格式,本文停止爬取')
                    return

                # get the comment count
                try:
                    channel = re.findall(r"channel: '(.*)',", html['html'])[0]
                    newsid = re.findall(r"newsid: '(.*)',", html['html'])[0]
                    data = {
                        'format': 'js',
                        'channel': channel,
                        'newsid': newsid,
                        'group': '',
                        'compress': '1',
                        'ie': 'gbk',
                        'oe': 'gbk',
                        'page': '1',
                        'page_size': '20'
                    }
                    re_url = 'http://comment5.news.sina.com.cn/page/info'
                    html1 = self.session.download(re_url, encoding='utf-8', data=data, isJson=False, timeout=10, retry=3, addr=False)
                    html1 = re.sub(r'(.*=)\{', '{', html1)
                    html1 = json.loads(html1)
                    totalcount = html1['result']['count']['show']
                    Treply = totalcount
                except:
                    Treply = None

                # get the article id
                articleid = re.findall(r'([a-z]{8}\d{7})', article_url)[0]

                article = Article(tid=articleid, channel_id=self.channel.channel_id, title=Ttitle, publish_datetime=Ttime, content=Tcontent, url=article_url, author_name=Tauthor)
                article.statistics.reply_count = Treply
                return article

            # second page layout
            elif main1 is not None:
                self.logger.debug('走第二种格式')
                # get the title
                Ttitle = main1.find('h1', attrs={'id': "artibodyTitle"})
                if Ttitle is None:
                    self.logger.error('[SinaNews]' + '缺少标题,无法构成文章,可能已被修改格式,本文停止爬取')
                    return
                else:
                    Ttitle = Ttitle.text.strip()

                # get the publish time
                Ttime = main1.find('span', attrs={'id': "pub_date"})
                if Ttime is not None:
                    Ttime = Ttime.text.strip()
                    Ttime = re.findall(r'(\d{4})\D(\d{2})\D(\d{2})\D(\d{2}:\d{2})', Ttime)[0]
                    Ttime = Ttime[0] + '-' + Ttime[1] + '-' + Ttime[2] + ' ' + Ttime[3]
                else:
                    self.logger.error('[SinaNews]' + '缺少文章发布时间,无法构成文章,可能已被修改格式,本文停止爬取')
                    return
                if len(Ttime) == 16:
                    Ttime = Ttime + ':00'

                # get the publisher
                Tauthor = main1.find('span', attrs={'id': "media_name"})
                if Tauthor is not None:
                    Tauthor = Tauthor.find('a').text.strip()
                else:
                    Tauthor = None

                # get the content
                Tcontent = main1.find('div', attrs={'id': "artibody"})
                if Tcontent is not None:
                    Tcontent = Tcontent.text.strip()
                    Tcontent = re.sub(r'\n|\t', '', Tcontent)
                else:
                    self.logger.error('[SinaNews]' + '缺少文章内容,无法构成文章,可能已被修改格式,本文停止爬取')
                    return

                try:
                    channel = re.findall(r"channel: '(.*)',", html['html'])[0]
                    newsid = re.findall(r"newsid: '(.*)',", html['html'])[0]
                    data = {
                        'format': 'js',
                        'channel': channel,
                        'newsid': newsid,
                        'group': '',
                        'compress': '1',
                        'ie': 'gbk',
                        'oe': 'gbk',
                        'page': '1',
                        'page_size': '20'
                    }
                    re_url = 'http://comment5.news.sina.com.cn/page/info'
                    html1 = self.session.download(re_url, encoding='utf-8', data=data, isJson=False, timeout=10, retry=3, addr=False)
                    html1 = re.sub(r'(.*=)\{', '{', html1)
                    html1 = json.loads(html1)
                    totalcount = html1['result']['count']['show']
                    Treply = totalcount
                except:
                    Treply = None

                # get the article id
                articleid = re.findall(r'([a-z]{8}\d{7})', article_url)[0]

                article = Article(tid=articleid, channel_id=self.channel.channel_id,title=Ttitle, content=Tcontent, publish_datetime=Ttime, url=article_url, author_name=Tauthor)
                article.statistics.reply_count = Treply
                return article

            # third page layout
            elif main2 is not None:
                self.logger.debug(u'第三种格式')
                # get the title
                Ttitle = main2.find('div', attrs={'class': "second-title"})

                if Ttitle is None:
                    self.logger.error('[SinaNews]' + '缺少标题,无法构成文章,可能已被修改格式,本文停止爬取')
                    return
                else:
                    Ttitle = Ttitle.text.strip()

                # get the publish time
                Ttime = main2.find('span', attrs={'class': "date"})
                if Ttime is not None:
                    Ttime = Ttime.text.strip()
                    Ttime = re.findall(r'(\d{4})\D(\d{2})\D(\d{2})\D ' '(\d{2}:\d{2}).*', Ttime)[0]
                    Ttime = Ttime[0] + '-' + Ttime[1] + '-' + Ttime[2] + ' ' + Ttime[3]
                else:
                    self.logger.error('[SinaNews]' + '缺少文章发布时间,无法构成文章,可能已被修改格式,本文停止爬取')
                    return
                if len(Ttime) == 16:
                    Ttime = Ttime + ':00'

                # get the publisher
                Tauthor = main2.find('a', attrs={'class': "source"})
                if Tauthor is not None:
                    Tauthor = Tauthor.text.strip()
                else:
                    Tauthor = None

                # get the content
                Tcontent = main2.find('div', attrs={'id': "article"})
                if Tcontent is not None:
                    Tcontent = Tcontent.text.strip()
                    Tcontent = re.sub(r'\n|\t', '', Tcontent)
                else:
                    self.logger.error('[SinaNews]' + '缺少文章内容,无法构成文章,可能已被修改格式,本文停止爬取')
                    return

                # get the comment count
                try:
                    channel = re.findall(r"channel: '(.*)',", html['html'])[0]
                    newsid = re.findall(r"newsid: '(.*)',", html['html'])[0]
                    data = {
                        'format': 'js',
                        'channel': channel,
                        'newsid': newsid,
                        'group': '',
                        'compress': '1',
                        'ie': 'gbk',
                        'oe': 'gbk',
                        'page': '1',
                        'page_size': '20'
                    }
                    re_url = 'http://comment5.news.sina.com.cn/page/info'
                    html1 = self.session.download(re_url, encoding='utf-8', data=data, isJson=False, timeout=10, retry=3, addr=False)
                    html1 = re.sub(r'(.*=)\{', '{', html1)
                    html1 = json.loads(html1)
                    totalcount = html1['result']['count']['show']
                    Treply = totalcount
                except:
                    Treply = None

                # get the article id
                articleid = re.findall(r'([a-z]{8}\d{7})', article_url)[0]

                article = Article(tid=articleid, channel_id=self.channel.channel_id, title=Ttitle, publish_datetime=Ttime, content=Tcontent, url=article_url, author_name=Tauthor)
                article.statistics.reply_count = Treply
                return article
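
A standalone sketch of the comment-count unwrapping used in each branch above; the raw payload is a made-up miniature of the jsonp-style response body:

import re
import json

raw = 'var data={"result": {"count": {"show": 42}}}'  # made-up miniature payload
cleaned = re.sub(r'(.*=)\{', '{', raw)  # strip the leading assignment, keeping the JSON object
totalcount = json.loads(cleaned)['result']['count']['show']  # -> 42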
Example #15
    def crawlArticle(self, url):
        '''
        Crawl the article content and statistics for the given url
        @return: an Article instance
        '''
        cookies = 'Province=020; City=020; usertrack=c+5+hljsm+B+cg5MA7YDAg==; vjuids=-7517fab.15b5c40e631.0.a042d54907b81; _ntes_nnid=5e90ea8f4ef321150e3b5d43f68870c8,1491901408828; _ntes_nuid=5e90ea8f4ef321150e3b5d43f68870c8; UM_distinctid=15b5c41b7836eb-0fd2f7e510ef22-4e45042e-100200-15b5c41b78461b; __gads=ID=18c804c9f3ead780:T=1491901995:S=ALNI_MYWNxLkcHVgXyExP9eeFcD-mj7SiQ; afpCT=1; CNZZDATA1256734798=337963631-1491900970-http%253A%252F%252Fnews.163.com%252F%7C1492767097; CNZZDATA1256336326=1559830613-1491900088-http%253A%252F%252Fnews.163.com%252F%7C1492765460; vjlast=1491901409.1492754596.11; ne_analysis_trace_id=1492768109053; vinfo_n_f_l_n3=09c375e3d4394d15.1.13.1491901408836.1492766182939.1492768266676; s_n_f_l_n3=09c375e3d4394d151492768109056'
        html = self.session.download(url,
                                     encoding='gbk',
                                     data=None,
                                     timeout=10,
                                     retry=3,
                                     addr=True,
                                     cookies=cookies)
        if html:
            article_url = html['url']
            article_url = re.findall(
                r'.*?\.html|.*?\.htm|.*?\.shtml|.*?\.shtm', article_url)[0]
            self.logger.info(article_url)
            soup = BeautifulSoup(html['html'], 'html.parser')
            main = soup.find('div', attrs={'class': "post_content_main"})
            main1 = soup.find('div', attrs={'class': "ep-content-main"})

            # first page layout
            if main is not None:

                # get the title
                Ttitle = main.find('h1')
                if Ttitle is None:
                    self.logger.error('[NeteaseNews]' +
                                      '缺少标题,无法构成文章,可能已被修改格式,本文停止爬取')
                    return
                else:
                    Ttitle = Ttitle.text.strip()

                #获取发布时间
                Ttime = main.find('div', attrs={'class': "post_time_source"})
                if Ttime is not None:
                    Ttime = Ttime.text.strip()
                    Ttime = re.findall(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}',
                                       Ttime)[0]
                else:
                    self.logger.error('[NeteaseNews]' +
                                      '缺少文章发布时间,无法构成文章,可能已被修改格式,本文停止爬取')
                    return

                if len(Ttime) == 16:
                    Ttime = Ttime + ':00'

                #获取发布作者
                Tauthor = main.find('div', attrs={'class': "post_time_source"})
                if Tauthor is not None:
                    Tauthor = Tauthor.find('a')
                    if Tauthor is not None:
                        Tauthor = Tauthor.text.strip()
                    else:
                        Tauthor = None

                #获取发布内容
                Tcontent = main.find('div', attrs={'class': "post_text"})
                if Tcontent is not None:
                    Tcontent = Tcontent.text.strip()
                    Tcontent = re.sub(r'\n|\t', '', Tcontent)
                    dr = re.compile(r'<[^>]+>', re.S)
                    Tcontent = dr.sub('', Tcontent)
                else:
                    self.logger.error('[NeteaseNews]' +
                                      '缺少文章内容,无法构成文章,可能已被修改格式,本文停止爬取')
                    return

                # 获取评论数
                articleid = ""
                try:
                    articleid = re.findall(r'"docId" : "(.*)",',
                                           html['html'])[0]
                    re_url = 'http://sdk.comment.163.com/api/v1/products/a2869674571f77b5a0867c3d71db5856/threads/' + articleid
                    html1 = self.session.download(url=re_url,
                                                  encoding='utf-8',
                                                  data=None,
                                                  isJson=True,
                                                  timeout=10,
                                                  retry=3)
                    Treply = html1["tcount"]
                except:
                    Treply = None
                    self.logger.error('[NeteaseComment]url:%s, tid:%s, %s',
                                      article_url, articleid,
                                      traceback.format_exc())
                finally:
                    article = Article(tid=articleid,
                                      channel_id=self.channel.channel_id,
                                      title=Ttitle,
                                      content=Tcontent,
                                      publish_datetime=Ttime,
                                      url=article_url,
                                      author_name=Tauthor)
                    article.statistics.reply_count = Treply
                    return article

            #第二种网页格式
            elif main1 is not None:

                #标题
                Ttitle = main1.find('h1')
                if Ttitle is None:
                    self.logger.error('[NeteaseNews]' +
                                      '缺少标题,无法构成文章,可能已被修改格式,本文停止爬取')
                    return
                else:
                    Ttitle = Ttitle.text.strip()

                #发布的时间
                Ttime = main1.find('div',
                                   attrs={'class': "ep-time-source cDGray"})
                Ttime1 = main1.find('div', attrs={'class': "ep-info cDGray"})
                if Ttime is not None:
                    Ttime = Ttime.text.strip()
                    Ttime = re.findall(
                        r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}', Ttime)[0]
                elif Ttime1 is not None:
                    Ttime = Ttime1.text.strip()
                    Ttime = re.findall(
                        r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}', Ttime)[0]
                else:
                    self.logger.error('[NeteaseNews]' +
                                      '缺少文章发布时间,无法构成文章,可能已被修改格式,本文停止爬取')
                    return
                if len(Ttime) == 16:
                    Ttime = Ttime + ':00'

                #获取作者信息
                Tauthor = main1.find('div',
                                     attrs={'class': "ep-time-soure cDGray"})
                Tauthor1 = main1.find('div',
                                      attrs={'class': "ep-source cDGray"})
                if Tauthor is not None:
                    Tauthor = Tauthor.find('a')
                    if Tauthor is not None:
                        Tauthor = Tauthor.text.strip()
                    else:
                        Tauthor = None
                elif Tauthor1 is not None:
                    Tauthor = Tauthor1.find('span')
                    if Tauthor is not None:
                        Tauthor = Tauthor.text.strip()
                        self.logger.debug(Tauthor)
                        Tauthor = re.findall(r'来源:(.*)"', Tauthor)[0]
                    else:
                        Tauthor = None
                else:
                    Tauthor = None

                #获取内容
                Tcontent = main1.find('div', attrs={'id': "endText"})
                if Tcontent is not None:
                    Tcontent = Tcontent.text.strip()
                    Tcontent = re.sub(r'\n|\t', '', Tcontent)
                    dr = re.compile(r'<[^>]+>', re.S)
                    Tcontent = dr.sub('', Tcontent)
                else:
                    self.logger.error('[NeteaseNews]' +
                                      '缺少文章内容,无法构成文章,可能已被修改格式,本文停止爬取')
                    return

                #获取评论数
                try:
                    articleid = re.findall(r'"docId" : "(.*)",',
                                           html['html'])[0]
                    re_url = 'http://sdk.comment.163.com/api/v1/products/a2869674571f77b5a0867c3d71db5856/threads/' + articleid
                    html1 = self.session.download(re_url,
                                                  encoding='utf-8',
                                                  data=None,
                                                  isJson=True,
                                                  timeout=10,
                                                  retry=3)
                    Treply = html1['tcount']
                except:
                    Treply = None
                    self.logger.error('[NeteaseComment]url:%s, tid:%s, %s',
                                      article_url, articleid,
                                      traceback.format_exc())
                finally:
                    article = Article(tid=articleid,
                                      channel_id=self.channel.channel_id,
                                      title=Ttitle,
                                      content=Tcontent,
                                      publish_datetime=Ttime,
                                      url=article_url,
                                      author_name=Tauthor)
                    self.logger.debug("[crawlArticle]" + article.tid)
                    article.statistics.reply_count = Treply
                    return article
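
The comment lookup in both branches above boils down to one GET against the sdk.comment.163.com threads resource; a hedged sketch with plain requests (the product key is copied from the snippet, everything else is illustrative):

import requests

NETEASE_PRODUCT = 'a2869674571f77b5a0867c3d71db5856'

def fetch_netease_comment_count(doc_id):
    # doc_id is the "docId" value scraped from the article page; the threads
    # resource returns JSON whose 'tcount' field becomes reply_count.
    url = ('http://sdk.comment.163.com/api/v1/products/%s/threads/%s'
           % (NETEASE_PRODUCT, doc_id))
    return requests.get(url, timeout=10).json().get('tcount')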
Пример #16
0
    def crawlArticle(self, url):
        '''
        根据url爬取文章内容和统计信息
        :return:返回一个article实例
        '''
        article_id = re.findall(r'\d+', url)[0]

        html = self.session.get(url=url, headers=CRAWL_ARTICLE_HEADERS)

        soup = BeautifulSoup(html.text, 'lxml')

        if (url.find('zhuanlan') != -1):
            main = soup.find('textarea', attrs={'id': "preloadedState"})
            try:
                if main is None:
                    return main
                sub = re.findall(r'new Date\(\".*\"\)', main.text)[0]
                sub = re.findall(r'".*"', sub)[0]
                maintext = re.subn(r'new Date\(\".*\"\)', sub, main.text)[0]
                try:
                    articleJson = json.loads(
                        maintext)["database"]["Post"][article_id]
                except:
                    return None
                #获取标题
                title = articleJson["title"]
                self.logger.info(title)
                authorName = articleJson["author"]
                contentSoup = BeautifulSoup(articleJson["content"])
                content = contentSoup.text
                commentCount = articleJson["commentCount"]
                collapsedCount = articleJson["collapsedCount"]
                likeCount = articleJson["likeCount"]
                publishedTime = articleJson["publishedTime"][0:19]
                timeArray = time.strptime(publishedTime, "%Y-%m-%dT%H:%M:%S")
                publishedTime = time.strftime("%Y-%m-%d %H:%M:%S", timeArray)
                article = Article(article_id, self.channel.channel_id, title,
                                  content, publishedTime, url, None,
                                  authorName)
                article.statistics.reply_count = commentCount
                article.statistics.like_count = likeCount
                article.statistics.collect_count = collapsedCount
                self.session.close()
                return article
            except:
                return None
        if (url.find('question') != -1):
            # 获取标题
            try:  # 针对404页面
                main = soup.find('div', attrs={
                    'id': "data"
                }).attrs['data-state']
                articleJson = json.loads(main)
                questionJson = articleJson['entities']['questions'][article_id]
                title = questionJson['title']
                self.logger.info(title)
                contentSoup = BeautifulSoup(questionJson['editableDetail'],
                                            'lxml')
                content = contentSoup.text
                author_id = questionJson['author']['id']
                author_name = questionJson['author']['name']
                createTimeInFloat = questionJson['created']
                publish_datetime = time.strftime(
                    '%Y-%m-%d %H:%M:%S', time.localtime(createTimeInFloat))
                reply_count = questionJson['commentCount']
                read_count = questionJson['visitCount']
                collect_count = questionJson['followerCount']
                article = Article(article_id, self.channel.channel_id, title,
                                  content, publish_datetime, url, author_id,
                                  author_name)
                article.statistics.reply_count = reply_count
                article.statistics.read_count = read_count
                article.statistics.collect_count = collect_count
                return article
            except:
                return None
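
For the zhuanlan branch, the preloaded state is JSON-like text with JavaScript Date literals mixed in; a small sketch of the cleanup and timestamp handling, assuming the 'database'/'Post' layout shown above (the example timestamp format is inferred from the strptime pattern in the snippet):

import json
import re
import time

def parse_zhuanlan_state(preloaded_text, article_id):
    # Collapse every new Date("...") to its quoted argument so the blob becomes
    # valid JSON, the same idea as the re.findall/re.subn pair above.
    cleaned = re.sub(r'new Date\((\"[^\"]*\")\)', r'\1', preloaded_text)
    post = json.loads(cleaned)['database']['Post'][article_id]
    # publishedTime looks like "2017-10-11T08:30:15+08:00"; the first 19
    # characters carry the timestamp used for publish_datetime.
    parsed = time.strptime(post['publishedTime'][:19], '%Y-%m-%dT%H:%M:%S')
    return post['title'], time.strftime('%Y-%m-%d %H:%M:%S', parsed)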
Пример #17
0
    def crawlArticle(self, url):
        '''
        根据url爬取文章内容和统计信息
        @return: 返回一个Article实例
        '''
        html = self.session.download(url,
                                     encoding='utf-8',
                                     data=None,
                                     timeout=10,
                                     retry=3,
                                     addr=True)

        if html:
            article_url = html['url']
            if article_url.find(self.channel.url) < 0:
                self.logger.warn('Unrelated url found:%s', url)
                return None
            article_url = re.findall(
                r'.*?\.html|.*?\.htm|.*?\.shtml|.*?\.shtm', article_url)[0]
            self.logger.debug('[iFengnews]' + article_url)
            soup = BeautifulSoup(html['html'], 'lxml')  # 'html.parser' 解析器
            main = soup.find('div', attrs={'class': "main"})
            main1 = soup.find('div', attrs={'class': "yc_main"})

            if main is not None:
                self.logger.debug(u'走第一种格式')
                Ttitle = main.find('h1')  # 标题
                if Ttitle is None:
                    self.logger.error(u'缺少标题,无法构成文章,可能已被修改格式,本文停止爬取::该网站为 %s ',
                                      url)
                    return
                else:
                    Ttitle = Ttitle.text.strip()
                    Tpublishtime = main.find('span', attrs={
                        'class': "ss01"
                    }).text.strip()
                    if Tpublishtime is None:
                        self.logger.error(
                            u'缺少文章发布时间,无法构成文章,可能已被修改格式,本文停止爬取::该网站为 %s ', url)
                        return
                    else:
                        Tpublishtime = Tpublishtime.replace(u'年', '-').replace(
                            u'月', '-').replace(u'日', '')

                        Tauthor = main.find('a', attrs={'target': "_blank"})
                        if Tauthor is not None:
                            Tauthor = Tauthor.text.strip()
                        else:
                            Tauthor = 'None'
                        Tcontent = main.find('div',
                                             attrs={'id': "main_content"})
                        # print Tcontent
                        # Tcontent = Tcontent.find('p')
                        if Tcontent is not None:
                            Tcontent = Tcontent.text.strip()
                            Tcontent = re.sub(r'\n|\t', '', Tcontent)
                        else:
                            self.logger.error(
                                u'缺少文章内容,无法构成文章,可能已被修改格式,本文停止爬取::该网站为 %s ',
                                url)
                            return

                        doc_url = re.findall(r'"commentUrl":"(.*)",',
                                             html['html'])
                        if doc_url:
                            doc_url = doc_url[0]
                        else:
                            doc_url = url

                        data1 = {
                            'callback': 'newCommentListCallBack',
                            # 'doc_url': 'http://gd.ifeng.com/a/20171010/6053241_0.shtml',
                            'doc_url': doc_url,
                            'job': '1'
                        }
                        re_url = 'http://comment.ifeng.com/get.php'
                        html1 = self.session.download(re_url,
                                                      encoding='gbk',
                                                      data=data1,
                                                      timeout=10,
                                                      retry=3,
                                                      addr=False,
                                                      isJson=True)
                        # html1 = json.loads( html1[html1.find('=') + 1:html1.rfind(';c')] )

                        Treply = html1['count']
                        if len(html1['comments']) != 0:
                            articleid = html1['comments'][0]['article_id']
                        else:
                            articleid = article_url
                            articleid = articleid[articleid.find('a/') + 2:
                                                  -6]  # 由于数据库字段长度有限,所以截取部分作为ID
                            self.logger.warn(u'地址 %s 没有评论因此以url地址部分字段作为tid',
                                             article_url)

                        meta_info = doc_url

                        article = Article(articleid,
                                          self.channel.channel_id,
                                          Ttitle,
                                          Tcontent,
                                          Tpublishtime,
                                          article_url,
                                          None,
                                          Tauthor,
                                          meta_info=meta_info)
                        article.statistics.reply_count = Treply

                        self.logger.info(article)
                        return article

            ##对第二种格式的爬取
            if main1 is not None:
                self.logger.debug(u'走第二种格式')
                Ttitle = main1.find('h1')
                if Ttitle is None:
                    self.logger.error(u'缺少标题,无法构成文章,可能已被修改格式,本文停止爬取::该网站为 %s ',
                                      url)
                    return
                else:
                    Ttitle = Ttitle.text.strip()
                    Tpublishtime = main1.find('span').text.strip()
                    if Tpublishtime is None:
                        self.logger.error(
                            u'缺少文章发布时间,无法构成文章,可能已被修改格式,本文停止爬取::该网站为 %s ', url)
                        return

                # return Tpublishtime
                Tauthor = main1.find('a', attrs={'target': "_blank"})
                if Tauthor is not None:
                    Tauthor = Tauthor.text.strip()
                else:
                    Tauthor = 'None'
                Tcontent = main1.find('div', attrs={'class': "yc_con_txt"})

                if Tcontent is not None:
                    Tcontent = Tcontent.text.strip()
                    Tcontent = re.sub(r'\n|\t', '', Tcontent)
                else:
                    self.logger.warn(
                        u'缺少文章内容,无法构成文章,可能已被修改格式,本文停止爬取::该网站为 %s ', url)
                    return

                doc_url = re.findall(r'"commentUrl":"(.*)",', html['html'])
                if doc_url:
                    doc_url = doc_url[0]
                else:
                    doc_url = url

                data1 = {
                    'callback': 'newCommentListCallBack',
                    # 'doc_url': 'http://gd.ifeng.com/a/20171010/6053241_0.shtml',
                    'doc_url': doc_url,
                    'job': '1'
                }
                re_url = 'http://comment.ifeng.com/get.php'
                html1 = self.session.download(re_url,
                                              encoding='gbk',
                                              data=data1,
                                              timeout=10,
                                              retry=3,
                                              addr=False,
                                              isJson=True)
                # html1 = json.loads( html1[html1.find('=') + 1:html1.rfind(';c')] )
                try:
                    Treply = html1['count']
                except:
                    Treply = None

                if len(html1['comments']) != 0:
                    articleid = html1['comments'][0]['article_id']
                else:
                    articleid = url.strip()
                    articleid = articleid[articleid.find('a/') +
                                          2:-6]  # 由于数据库字段长度有限,所以截取部分作为ID
                    self.logger.warn(u'地址 %s 没有评论因此以url地址部分字段作为tid',
                                     article_url)

                meta_info = doc_url
                article = Article(articleid,
                                  self.channel.channel_id,
                                  Ttitle,
                                  Tcontent,
                                  Tpublishtime,
                                  article_url,
                                  None,
                                  Tauthor,
                                  meta_info=meta_info)
                article.statistics.reply_count = Treply
                self.logger.info(article)
                return article

            if (main is None) and (main1 is None):
                self.logger.warn(u"存在另外一种html格式::%s", url)
                return
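
Both branches above fetch the comment count from comment.ifeng.com/get.php with the article's commentUrl; a hedged sketch with plain requests (the parameters come from the snippet, the JSONP stripping is an assumption for deployments that wrap the response, as the commented-out json.loads suggests):

import json
import re
import requests

def fetch_ifeng_comment_info(doc_url):
    params = {'callback': 'newCommentListCallBack', 'doc_url': doc_url, 'job': '1'}
    raw = requests.get('http://comment.ifeng.com/get.php',
                       params=params, timeout=10).text
    # Cut the outermost {...} out of a possible callback wrapper before decoding.
    match = re.search(r'(\{.*\})', raw, re.S)
    data = json.loads(match.group(1)) if match else {}
    comments = data.get('comments') or []
    article_id = comments[0]['article_id'] if comments else None
    return data.get('count'), article_id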
Пример #18
0
    def __parseSearchPage(self, data):
        '''
        @return: (articleList,hasnext)
        '''
        articleList = list()
        hasnext = False
        soup = BeautifulSoup(data, "lxml")
        self.logger.info(soup)
        # check if no result
        noResultDivList = soup.findAll('div', {'class': 'pl_noresult'})
        if len(noResultDivList) > 0:
            hasnext = False
            self.logger.info('No result')
            return (articleList, hasnext)

        # find page bar to check if more

        pageDivList = soup.findAll('div', {'class': 'W_pages'})
        if len(pageDivList) > 0:
            pageDiv = pageDivList[0]
            if len(pageDiv.findAll('a',
                                   {'class': 'page next S_txt1 S_line1'})) > 0:
                hasnext = True
        if hasnext is False:
            self.logger.info('The last page')

        root_1 = soup.findAll('div', {"action-type": "feed_list_item"})
        # self.logger.debug(root_1)
        for r in root_1:
            root_2 = r.find('div', {'class': "content clearfix"})
            mid = r.attrs['mid']
            article_url = root_2.find('div', {
                'class': "feed_from W_textb"
            }).findNext('a').attrs['href']
            self.logger.debug('1  %s', article_url)
            if not article_url.startswith('http:'):
                article_url = 'http:' + article_url
            # self.logger.debug(article_url)
            root_content = root_2.find('p', {'class': "comment_txt"})

            long_content = root_content.find('a', {'action-type': "fl_unfold"})
            try:
                link_content = root_content.find('a').attrs['href']
                link_content = '  原文链接: ' + link_content
            except:
                link_content = ''
            if long_content:
                content_url = 'http://s.weibo.com/ajax/direct/morethan140?' + long_content.attrs[
                    'action-data']
                self.session.randomSleep()
                response = self.session.get(content_url, textRspOnly=False)
                try:
                    content_html = response.json()['data']['html']
                    content = BeautifulSoup(
                        content_html, 'html.parser').text.strip().replace(
                            "'", "''").replace("%", "\%").replace(":", "\:")
                except Exception, e:
                    self.logger.debug('Exception: %s' % e)
                    continue
            else:
                content = root_content.text.strip().replace("'", "''").replace(
                    "%", "\%").replace(":", "\:")
                content = content + link_content
                # self.logger.error(content)

            title = content[:30].replace("'", "''").replace("%", "\%").replace(
                ":", "\:") + ' '
            author_id = r.attrs['tbinfo']
            author_id = re.findall(r'ouid=(\d+)', author_id)[0]
            author_name = root_2.find('a').attrs['nick-name']
            publish_datetime = root_2.find('a', {
                'class': "W_textb"
            }).attrs['date']

            try:
                publish_datetime = time.strftime(
                    '%Y-%m-%d %H:%M:%S',
                    time.localtime(float(publish_datetime) / 1000))
            except:
                continue
            article = Article(mid,
                              self.channel.channel_id,
                              title,
                              content,
                              publish_datetime,
                              url=article_url,
                              author_id=author_id,
                              author_name=author_name)

            # fetch statistics
            root_3 = r.find('div', {'class': "feed_action clearfix"})
            soup_li_list = root_3.findAll('li')
            self.__parseStatistics(article, soup_li_list)
            # print mid, article_url, add_datetime, channeltype, channel, title, content, author_id, author_name, \
            #     publish_datetime, reply_count, read_count, like_count, collect_count, forward_count
            if article not in articleList:
                articleList.append(article)
        return (articleList, hasnext)
Пример #19
0
    def crawlNewArticle(self, url):
        '''
        爬取url前缀为new.qq.com的文章
        :param url:
        :return:
        '''
        html = self.session.download(url,
                                     encoding='gbk',
                                     data=None,
                                     timeout=10,
                                     retry=3,
                                     addr=True)
        if html:
            article_url = html['url']
            if article_url.find('new.qq.com/omn') < 0:
                self.logger.warn('Unrelated url found:%s', url)
                return

            article_url = re.findall(
                r'.*?\.html|.*?\.htm|.*?\.shtml|.*?\.shtm', article_url)[0]
            self.logger.debug('[TencentNew]' + article_url)
            soup = BeautifulSoup(html['html'], 'html.parser')

            script_tags = soup.head.find_all('script')
            data = dict()
            for tag in script_tags:
                text = re.search(r'window.DATA = (.*)', tag.text, re.S)
                if text:
                    data = json.loads(text.group(1))

            tid = data['article_id']
            title = data['title']
            author_name = data['media']
            author_id = data['media_id']
            publish_datetime = data['pubtime']
            comment_id = data['comment_id']

            main = soup.find('div', attrs={'class': 'qq_conent clearfix'})
            t_content = ''
            if main is not None:
                contents = main.find_all('p', {'class': 'one-p'})
                for content in contents:
                    if content.string is None:
                        continue
                    t_content += str(content.get_text().strip())

            get_comment_count_url = 'https://coral.qq.com/article/%s/commentnum?callback=_article%scommentnum' % (
                comment_id, comment_id)
            comment_data = self.session.download(get_comment_count_url)
            comment_data = re.search(
                r'_article%scommentnum\((.*)\)' % comment_id, comment_data)

            comment_dict = eval(comment_data.group(1))
            reply_count = comment_dict['data']['commentnum']
            meta_info = '{"commentid":"%s"}' % comment_id

            article = Article(tid=tid,
                              channel_id=self.channel.channel_id,
                              title=title,
                              content=t_content,
                              publish_datetime=publish_datetime,
                              url=article_url,
                              author_id=author_id,
                              author_name=author_name,
                              meta_info=meta_info)
            article.statistics.reply_count = reply_count
            return article
        return None
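
The coral.qq.com commentnum call above returns the JSON wrapped in the requested callback; a short sketch of the same lookup with plain requests, using json.loads instead of the eval() in the snippet:

import json
import re
import requests

def fetch_tencent_comment_count(comment_id):
    url = ('https://coral.qq.com/article/%s/commentnum?callback=_article%scommentnum'
           % (comment_id, comment_id))
    raw = requests.get(url, timeout=10).text
    # The body arrives as _article<id>commentnum({...}); pull out the JSON part.
    body = re.search(r'\((\{.*\})\)', raw, re.S)
    return json.loads(body.group(1))['data']['commentnum'] if body else None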
Пример #20
0
    def crawlNewsArticle(self, url):
        '''
        爬取url前缀为news.qq.com和gd.qq.com的文章
        :param url:
        :return:
        '''
        html = self.session.download(url,
                                     encoding='gbk',
                                     data=None,
                                     timeout=10,
                                     retry=3,
                                     addr=True)
        if html:
            article_url = html['url']
            if article_url.find('news.qq.com') < 0 and article_url.find(
                    'gd.qq.com') < 0:
                self.logger.warn('Unrelated url found:%s', url)
                return None
            article_url = re.findall(
                r'.*?\.html|.*?\.htm|.*?\.shtml|.*?\.shtm', article_url)[0]
            self.logger.debug('[TencentNews]' + article_url)
            soup = BeautifulSoup(html['html'], 'html.parser')
            main = soup.find('div', attrs={'id': "Main-Article-QQ"})
            main1 = soup.find('div', attrs={'id': "Main-P-QQ"})
            if main is not None:
                Ttitle = main.find('h1').text.strip()  #标题
                Ttime = main.find('span', attrs={'class':
                                                 "article-time"})  #发布时间
                Ttime1 = main.find('span', attrs={'class': "a_time"})
                Ttime2 = main.find('span', attrs={'class': "pubTime"})
                if Ttime is not None:
                    Ttime = Ttime.text.strip()
                elif Ttime1 is not None:
                    Ttime1 = Ttime1.text.strip()
                    Ttime = Ttime1
                elif Ttime2 is not None:
                    Ttime2 = Ttime2.text.strip()
                    Ttime = Ttime2
                else:
                    Ttime = datetime.datetime.now().strftime(
                        '%Y-%m-%d %H:%M:%S')
                if len(Ttime) == 16:
                    Ttime = Ttime + ':00'

                Tauthor = main.find('span', attrs={'class': "a_source"})
                Tauthor1 = main.find('span', attrs={'class': "color-a-1"})
                if Tauthor is not None:
                    #Tauthor = Tauthor.find('a').text.strip()
                    Tauthor = Tauthor.text.strip()
                elif Tauthor1 is not None:
                    #Tauthor1 = Tauthor1.find('a').text.strip()
                    Tauthor1 = Tauthor1.text.strip()
                    Tauthor = Tauthor1
                else:
                    Tauthor = None
                Tcontent = main.find('div',
                                     attrs={'id': "Cnt-Main-Article-QQ"})
                if Tcontent is not None:
                    Tcontent = Tcontent.text.strip()
                    Tcontent = re.sub(r'\n|\t', '', Tcontent)
                else:
                    Tcontent = None
                articleid = re.findall(r'id:\'(\d+)\',', html['html'])[0]
                try:
                    commentid = re.findall(r'cmt_id = (\d+);', html['html'])[0]
                    meta_info = '{"commentid":"%s"}' % commentid
                except:
                    commentid = None
                    meta_info = None
                article = Article(articleid,
                                  self.channel.channel_id,
                                  Ttitle,
                                  Tcontent,
                                  Ttime,
                                  article_url,
                                  None,
                                  Tauthor,
                                  meta_info=meta_info)
                if commentid is not None:
                    try:
                        re_url = 'http://coral.qq.com/article/' + commentid + '/commentnum'
                        html1 = json.loads(
                            self.session.download(re_url,
                                                  encoding='utf-8',
                                                  data=None,
                                                  timeout=10,
                                                  retry=3))
                        Treply = int(html1['data']['commentnum'])
                    except Exception:
                        traceInfo = traceback.format_exc()
                        self.logger.error(
                            'Failed to parse comment for %s (cid=%s):%s',
                            articleid, commentid, traceInfo)
                        Treply = None
                    article.statistics.reply_count = Treply
                return article
            elif main1 is not None:
                Ttitle = soup.find('meta', attrs={
                    'name': "Description"
                }).attrs['content']  # 标题
                Ttime = re.findall(
                    r"pubtime\D+(\d{4})\D(\d{2})\D(\d{2})\D(\d{2}:\d{2})\',",
                    html['html'])
                if Ttime:
                    Ttime = Ttime[0]
                    Ttime = Ttime[0] + '-' + Ttime[1] + '-' + Ttime[
                        2] + ' ' + Ttime[3]
                else:
                    Ttime = datetime.datetime.now().strftime(
                        '%Y-%m-%d %H:%M:%S')
                if len(Ttime) == 16:
                    Ttime = Ttime + ':00'
                Tauthor = re.findall(r'para = {\s+name: \"(.*)\",',
                                     html['html'])
                if Tauthor:
                    Tauthor = Tauthor[0]
                else:
                    Tauthor = None
                con_url = re.sub(r'\.htm\?.*', '.hdBigPic.js', article_url)
                con_html = self.session.download(con_url,
                                                 encoding='gbk',
                                                 data=None,
                                                 timeout=10,
                                                 retry=3)
                con_list = re.findall(r'<p>(.*?)</p>', con_html)
                if con_list:
                    TT = []
                    for i in con_list:
                        if i.strip() not in TT:
                            TT.append(i)
                    Tcontent = ''.join(TT)
                else:
                    Tcontent = None
                articleid = re.findall(r'id:\'(\d+)\',', html['html'])[0]
                try:
                    commentid = re.findall(r'aid\D+(\d+)\",', html['html'])[0]
                    meta_info = '{"commentid":"%s"}' % commentid
                except:
                    commentid = None
                    meta_info = None
                article = Article(articleid,
                                  self.channel.channel_id,
                                  Ttitle,
                                  Tcontent,
                                  Ttime,
                                  article_url,
                                  None,
                                  Tauthor,
                                  meta_info=meta_info)
                try:
                    if commentid is not None:
                        re_url = 'http://coral.qq.com/article/batchcommentnum'
                        data1 = {'targetid': articleid}
                        html1 = json.loads(
                            self.session.download(re_url,
                                                  encoding='utf-8',
                                                  data=data1,
                                                  timeout=10,
                                                  retry=3))
                        Treply = int(html1['data'][0]['commentnum'])
                    else:
                        Treply = None
                except:
                    Treply = None
                article.statistics.reply_count = Treply
                return article
        return None
Пример #21
0
    def crawlArticle(self, url):
        '''
        根据url爬取文章内容和统计信息
        @return: 返回一个Article实例
        '''
        cookies = '_HY_lvt_c5928ed57343443aa175434a09ea2804=1492582419784; _HY_CTK_c5928ed57343443aa175434a09ea2804=0acc9a79b115c4ca1931c41e303bec28; BAIDU_SSP_lcr=https://www.baidu.com/link?url=NszYD2w_HgkPWqrzDQ3WKApYldw_9MpUVun9r-R09M7r0dh09MUwTHzG087WaJrhBwMCY-7pDfds4xjtWArRf2xh01DHOWWWd9DpBnHwZ03&wd=&eqid=84d5edfe0003dbdd0000000658f6c280; ALLYESID4=0DB901C6E627D980; sso_c=0; wdcid=5838509dcecc0a53; _people_ip_new_code=510000; UM_distinctid=15b802cdbd364d-02e7f218c26ae6-4e45042e-100200-15b802cdbd49ce; wdses=62d3f0f698d07532; sfr=1; CNZZDATA1260954200=1457761991-1492499618-null%7C1492580619; CNZZDATA1260954203=33096124-1492503288-null%7C1492578888; CNZZDATA1256327855=1768205365-1492503342-null%7C1492578947; wdlast=1492582420'
        html = self.session.download(url,
                                     data=None,
                                     isJson=False,
                                     timeout=10,
                                     retry=3,
                                     addr=True,
                                     cookies=cookies)

        soup = BeautifulSoup(html['html'], 'html.parser')  # 'html.parser' 解析器
        try:
            meta = soup.find('meta').attrs['content']
        except:
            self.logger.warn(u'找不到meta里的content')
            return
        # self.logger.error('%s',meta)

        if "GB2312" in meta:
            encoding1 = 'GB2312'
        elif "UTF-8" in meta:
            encoding1 = 'UTF-8'
        elif "utf-8" in meta:
            encoding1 = 'utf-8'
        else:
            encoding1 = 'gbk'

        html = self.session.download(url,
                                     encoding=encoding1,
                                     data=None,
                                     isJson=False,
                                     timeout=10,
                                     retry=3,
                                     addr=True,
                                     cookies=cookies)
        # 不同网页编码格式让其重新下载一遍
        soup = BeautifulSoup(html['html'], 'html.parser')  # 'html.parser' 解析器
        main = soup.find('body')

        if html:
            article_url = html['url']
            # self.logger.debug(article_url)
            if article_url.find(self.channel.url) < 0:
                self.logger.warn('Unrelated url found:%s', url)
                return None

            # if '.html' not in article_url:
            #     self.logger.error(u'非文章类型网址:%s ',article_url)
            #     return

            try:
                article_url = re.findall(
                    r'.*?\.html|.*?\.htm|.*?\.shtml|.*?\.shtm', article_url)[0]
            except:
                self.logger.error(u'网址后缀不符合:%s ', article_url)
                return

            self.logger.debug('[peoplenews]' + article_url)
            articleid = article_url
            articleid = articleid[articleid.find('cn/') +
                                  3:-5]  # 由于数据库字段长度有限,所以截取部分作为ID
            self.logger.warn(u'地址 %s 以url地址部分字段作为tid', article_url)

            if 'bbs1' not in article_url:

                main1 = soup.find(
                    'div', attrs={'class': "i_cont"}
                )  #http://health.people.com.cn/n1/2017/1011/c14739-29579836.html
                main2 = soup.find(
                    'div', attrs={'class': "text_c"}
                )  #http://rencai.people.com.cn/n/2014/0721/c244800-25311391.html

                if (main1 is None) and (main2 is None):
                    self.logger.debug(u'走main')
                    try:
                        Ttitle = main.find('h1').text.strip()  # 标题
                    except:
                        self.logger.error(u'Ttitle存在走了main部分却不满足其他格式的的url::%s',
                                          article_url)
                        return

                    if Ttitle is None:
                        self.logger.error(
                            u'缺少标题,无法构成文章,可能已被修改格式,本文停止爬取::该网站为 %s ',
                            article_url)
                        return
                    else:
                        Ttitle_crawl = soup.find('div',
                                                 attrs={'class':
                                                        "box01"})  #对应一种格式
                        if Ttitle_crawl is None:
                            self.logger.error(
                                u'Ttitle_crawl存在走了main部分却不满足其他格式的的url::%s',
                                article_url)
                            return
                        try:
                            Tpublishtime = Ttitle_crawl.find('div',
                                                             attrs={
                                                                 'class': "fl"
                                                             }).text.strip()
                        except:
                            self.logger.error(u'main中发布时间不匹配')
                            return

                        if Tpublishtime is None:
                            self.logger.error(
                                u'缺少文章发布时间,无法构成文章,可能已被修改格式,本文停止爬取::该网站为 %s ',
                                article_url)
                            return
                        else:
                            # self.logger.error(Tpublishtime)
                            Tpublishtime = re.findall(
                                r'(\d{4})\D(\d{2})\D(\d{2})\D(\d{2}:\d{2}(:\d{2})?)',
                                Tpublishtime)[0]

                            # Tpublishtime = Tpublishtime[:18]
                            if len(Tpublishtime[4]) > 1:
                                Tpublishtime = Tpublishtime[0] + '-' + Tpublishtime[1] + '-' + Tpublishtime[2] + ' ' + \
                                               Tpublishtime[3]
                            else:
                                Tpublishtime = Tpublishtime[0] + '-' + Tpublishtime[1] + '-' + Tpublishtime[2] + ' ' + \
                                               Tpublishtime[3] + ':00'

                            # Tpublishtime = Tpublishtime.replace(u'年', '-').replace(u'月', '-').replace(u'日', '')

                            Tauthor = Ttitle_crawl.find(
                                'a', attrs={'target': "_blank"})
                            if Tauthor is not None:
                                Tauthor = Tauthor.text.strip()
                            else:
                                Tauthor = 'None'
                            Tcontent = soup.find('div',
                                                 attrs={'class': "box_con"})
                            if Tcontent is not None:
                                Tcontent = Tcontent.text.strip()
                                Tcontent = re.sub(r'\n|\t', '', Tcontent)
                            else:
                                self.logger.error(
                                    u'缺少文章内容,无法构成文章,可能已被修改格式,本文停止爬取::该网站为 %s ',
                                    article_url)
                                return

                            Treply = None  #这种格式下没有这些统计可以获取
                            meta_info = None

                            article = Article(articleid,
                                              self.channel.channel_id,
                                              Ttitle,
                                              Tcontent,
                                              Tpublishtime,
                                              article_url,
                                              None,
                                              Tauthor,
                                              meta_info=meta_info)
                            article.statistics.reply_count = Treply
                            #self.logger.info(article)
                            return article

                elif (main1 is not None):
                    self.logger.debug(u'走main1')
                    Ttitle = main1.find('h2')  # 标题

                    if Ttitle is None:
                        self.logger.error(
                            u'缺少标题,无法构成文章,可能已被修改格式,本文停止爬取::该网站为 %s ',
                            article_url)
                        return
                    else:
                        Ttitle = Ttitle.text.strip()

                    try:
                        Tpublishtime = main1.find('div',
                                                  attrs={
                                                      'class': "artOri"
                                                  }).text.strip()
                    except:
                        self.logger.error(u'main1中发布时间不匹配')
                        return

                    if Tpublishtime is None:
                        self.logger.error(
                            u'缺少文章发布时间,无法构成文章,可能已被修改格式,本文停止爬取::该网站为 %s ',
                            article_url)
                        return
                    else:
                        Tpublishtime = re.findall(
                            r'(\d{4})\D(\d{2})\D(\d{2})\D(\d{2}:\d{2}(:\d{2})?)',
                            Tpublishtime)[0]
                        # self.logger.error(Tpublishtime)
                        # Tpublishtime = Tpublishtime[:18]
                        if len(Tpublishtime[4]) > 1:
                            Tpublishtime = Tpublishtime[0] + '-' + Tpublishtime[1] + '-' + Tpublishtime[2] + ' ' + \
                                           Tpublishtime[3]
                        else:
                            Tpublishtime = Tpublishtime[0] + '-' + Tpublishtime[1] + '-' + Tpublishtime[2] + ' ' + \
                                           Tpublishtime[3] + ':00'

                        Tauthor = main1.find('div', attrs={
                            'class': "artOri"
                        }).find('a', attrs={'target': "_blank"})
                        # self.logger.debug(u"作者:%s",Tauthor)
                        if Tauthor is not None:
                            Tauthor = Tauthor.text.strip()
                        else:
                            Tauthor = 'None'

                        Tcontent = main1.find('div', attrs={'class': "artDet"})
                        if Tcontent is not None:
                            Tcontent = Tcontent.text.strip()
                            Tcontent = re.sub(r'\n|\t', '', Tcontent)
                        else:
                            self.logger.error(
                                u'缺少文章内容,无法构成文章,可能已被修改格式,本文停止爬取::该网站为 %s ',
                                article_url)
                            return

                        Treply = None  # 这种格式下没有这些统计可以获取
                        meta_info = None

                        article = Article(articleid,
                                          self.channel.channel_id,
                                          Ttitle,
                                          Tcontent,
                                          Tpublishtime,
                                          article_url,
                                          None,
                                          Tauthor,
                                          meta_info=meta_info)
                        article.statistics.reply_count = Treply
                        self.logger.info(article)
                        return article

                elif (main2 is not None):
                    self.logger.debug(u'走main2')
                    Ttitle = main2.find('h2', attrs={'class': "one"})  # 标题

                    if Ttitle is None:
                        self.logger.error(
                            u'缺少标题,无法构成文章,可能已被修改格式,本文停止爬取::该网站为 %s ',
                            article_url)
                        return
                    else:
                        Ttitle = Ttitle.text.strip()
                    try:
                        Tpublishtime = main2.find('span',
                                                  attrs={
                                                      'id': "p_publishtime"
                                                  }).text.strip()
                    except:
                        self.logger.error(u'main2中发布时间不匹配')
                        return

                    if Tpublishtime is None:
                        self.logger.error(
                            u'缺少文章发布时间,无法构成文章,可能已被修改格式,本文停止爬取::该网站为 %s ',
                            article_url)
                        return
                    else:
                        Tpublishtime = re.findall(
                            r'(\d{4})\D(\d{2})\D(\d{2})\D(\d{2}:\d{2}(:\d{2})?)',
                            Tpublishtime)[0]
                        # self.logger.error(Tpublishtime)
                        # Tpublishtime = Tpublishtime[:18]
                        if len(Tpublishtime[4]) > 1:
                            Tpublishtime = Tpublishtime[0] + '-' + Tpublishtime[1] + '-' + Tpublishtime[2] + ' ' + \
                                           Tpublishtime[3]
                        else:
                            Tpublishtime = Tpublishtime[0] + '-' + Tpublishtime[1] + '-' + Tpublishtime[2] + ' ' + \
                                           Tpublishtime[3] + ':00'

                        Tauthor = main2.find('span', attrs={
                            'id': "p_origin"
                        }).find('a', attrs={'target': "_blank"})
                        # self.logger.debug(u"作者:%s",Tauthor)
                        if Tauthor is not None:
                            Tauthor = Tauthor.text.strip()
                        else:
                            Tauthor = 'None'

                        Tcontent = main2.find('div',
                                              attrs={'class': "show_text"})
                        if Tcontent is not None:
                            Tcontent = Tcontent.text.strip()
                            Tcontent = re.sub(r'\n|\t', '', Tcontent)
                        else:
                            self.logger.error(
                                u'缺少文章内容,无法构成文章,可能已被修改格式,本文停止爬取::该网站为 %s ',
                                article_url)
                            return

                        Treply = None  # 这种格式下没有这些统计可以获取
                        meta_info = None

                        article = Article(articleid,
                                          self.channel.channel_id,
                                          Ttitle,
                                          Tcontent,
                                          Tpublishtime,
                                          article_url,
                                          None,
                                          Tauthor,
                                          meta_info=meta_info)
                        article.statistics.reply_count = Treply
                        self.logger.info(article)
                        return article

                else:
                    self.logger.warn(u'存在另外一种html格式 %s', article_url)
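
The three format branches above repeat the same publish-time normalization; a small helper sketch of that logic (the slightly looser \D+ separator is the only deviation from the original pattern):

import re

def normalize_publish_time(raw):
    # Group 4 already carries the seconds when they are present, so it is used
    # as-is; otherwise ":00" is appended, yielding "YYYY-MM-DD HH:MM:SS".
    m = re.search(r'(\d{4})\D(\d{2})\D(\d{2})\D+(\d{2}:\d{2}(:\d{2})?)', raw)
    if m is None:
        return None
    year, month, day, hm, sec = m.groups()
    return '%s-%s-%s %s' % (year, month, day, hm if sec else hm + ':00')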
Пример #22
0
 def __updateOldArticleToArticleHistoryTable(self,
                                             articleList,
                                             currentTableName,
                                             historyTableName,
                                             isEventTable=False):
     '''
     更新到文章历史表
     @param currentTableName: 当前文章表:全局文章表、实体文章表或者实体事件文章表
     @param historyTableName: 历史文章表:全局文章表、实体文章表或者实体事件文章表
     @param eventId: 如果更新到实体事件文章表,则需要提供事件id,否则为None
     '''
     if len(articleList) > 0:
         if isEventTable is False:
             eventIdFieldName = ''
         else:
             eventIdFieldName = ',EVENT_ID'
         #找寻老文章
         selectSql = '''
         SELECT TID, CHANNEL_ID %s FROM %s where %s
         '''
         whereClauseList = map(
             lambda article: '(TID="%s" and CHANNEL_ID=%d)' %
             (article.tid, article.channel_id), articleList)
         self.dbProxy.execute(selectSql %
                              (eventIdFieldName, currentTableName,
                               ' or '.join(whereClauseList)))
         resultList = self.dbProxy.fetchall()
         if isEventTable:
             existingArticleList = map(
                 lambda item: Article(item[0], item[1], eventId=item[2]),
                 resultList)
         else:
             existingArticleList = map(
                 lambda item: Article(item[0], item[1]), resultList)
         toBeUpdateArticleList = list()
         for item in existingArticleList:
             index = articleList.index(item)
             obj = copy.copy(articleList[index])
             obj.eventId = item.eventId
             toBeUpdateArticleList.append(obj)
         if len(toBeUpdateArticleList) > 0:
             n = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
             if isEventTable is False:
                 eventIdFieldName = ''
             else:
                 eventIdFieldName = 'EVENT_ID,'
             insertSql = '''
             INSERT INTO %s (TID, %s CHANNEL_ID,
                 READ_COUNT,LIKE_COUNT, REPLY_COUNT,
                 FORWARD_COUNT, COLLECT_COUNT, HEAT, ADD_DATETIME)
             VALUES %s 
             '''
             valueList = list()
             for article in toBeUpdateArticleList:
                 statistics = article.statistics
                 if isEventTable is False:
                     eventIdFieldValue = ''
                 else:
                     eventIdFieldValue = str(article.eventId) + ','
                 valueList.append(
                     '("%s", %s %d, %s, %s, %s, %s, %s, %s, "%s")' %
                     (article.tid, eventIdFieldValue, article.channel_id,
                      statistics.read_count if statistics.read_count
                      is not None else Constants.DEFAULT_NUM,
                      statistics.like_count if statistics.like_count
                      is not None else Constants.DEFAULT_NUM,
                      statistics.reply_count if statistics.reply_count
                      is not None else Constants.DEFAULT_NUM,
                      statistics.forward_count if statistics.forward_count
                      is not None else Constants.DEFAULT_NUM,
                      statistics.collect_count if statistics.collect_count
                      is not None else Constants.DEFAULT_NUM,
                      statistics.heat if statistics.heat is not None else
                      Constants.DEFAULT_NUM, n))
             if len(valueList) > 0:
                 self.dbProxy.execute(insertSql %
                                      (historyTableName, eventIdFieldName,
                                       ','.join(valueList)))
                 self.dbProxy.commit()
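
A sketch of how one row of the VALUES list above is assembled, assuming Constants.DEFAULT_NUM is just a numeric placeholder (0 here) and reusing the Article/statistics attributes from the snippet:

import datetime

DEFAULT_NUM = 0  # assumption: stand-in for Constants.DEFAULT_NUM

def build_history_row(article, event_id=None):
    stats = article.statistics
    counts = [stats.read_count, stats.like_count, stats.reply_count,
              stats.forward_count, stats.collect_count, stats.heat]
    # Substitute the placeholder for any statistic the crawler could not collect.
    counts = [c if c is not None else DEFAULT_NUM for c in counts]
    event_part = '%s,' % event_id if event_id is not None else ''
    now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    return '("%s", %s%d, %s, %s, %s, %s, %s, %s, "%s")' % tuple(
        [article.tid, event_part, article.channel_id] + counts + [now])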
Пример #23
0
    def __parse_page_url(self, page_url, count_page=0):
        '''
        获取每一页的微博信息
        @return: article_list
        '''
        user_agent = random.choice(AGENTS)
        headers = {
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Host': 's.weibo.com',
            'Referer': 'http://s.weibo.com/',
            'User-Agent': user_agent
        }

        self.change_cookie()
        cookies = self.__load_cookies_from_lwp(self.user_name)
        response = self.session.get(url=page_url,
                                    headers=headers,
                                    cookies=cookies)

        # 若此cookie无效,则重新登录获取新的cookie
        if re.match(r'File not .*', response) is not None:
            self.__login(self.user_name)
            cookies = self.__load_cookies_from_lwp(self.user_name)
            response = self.session.get(url=page_url,
                                        headers=headers,
                                        cookies=cookies)

        html = etree.HTML(response)
        # 内容list
        card_wrap_list = html.xpath(
            '//div[@class="m-wrap"]/div[@id="pl_feedlist_index"]//div[@class="card-wrap"]'
        )
        article_list = list()
        for card_wrap in card_wrap_list:
            try:
                mid = card_wrap.xpath('./@mid')[0]
            except Exception as e:
                continue
            info = card_wrap.xpath('.//div[@class="content"]')[0]
            author_name = info.xpath('.//a[@class="name"]/text()')[0]
            author_id = re.search(
                r'/(\d+).?refer_flag',
                info.xpath('.//a[@class="name"]/@href')[0]).group(1)

            txt = info.xpath('.//p[@class="txt"]')[0]
            content = txt.xpath('string(.)').strip()
            title = content[:30].replace("'", "''").replace("%", "\%").replace(
                ":", "\:") + ' '

            article_info = info.xpath('.//p[@class="from"]/a')[0]
            article_url = self.HTTP + article_info.xpath('./@href')[0]

            statistics = card_wrap.xpath(
                './/div[@class="card-act"]//li/a/text()')

            collect_count = re.search(r'(\d+)', statistics[0])
            forward_count = re.search(r'(\d+)', statistics[1])
            reply_count = re.search(r'(\d+)', statistics[2])
            like_count = card_wrap.xpath(
                './/div[@class="card-act"]//li/a/em/text()')

            if collect_count:
                collect_count = collect_count.group(1).strip()
            else:
                collect_count = 0
            if forward_count:
                forward_count = forward_count.group(1).strip()
            else:
                forward_count = 0
            if reply_count:
                reply_count = reply_count.group(1).strip()
            else:
                reply_count = 0
            if len(like_count):
                like_count = like_count[0]
            else:
                like_count = 0

            publish_datetime = self.__parse_time(article_url, cookies)

            article = Article(mid,
                              self.channel.channel_id,
                              title,
                              content,
                              url=article_url,
                              publish_datetime=publish_datetime,
                              author_id=author_id,
                              author_name=author_name)

            article.statistics.collect_count = collect_count
            article.statistics.forward_count = forward_count
            article.statistics.reply_count = reply_count
            article.statistics.like_count = like_count

            if article not in article_list:
                article_list.append(article)

        self.logger.debug('page %d crawled %d articles' %
                          (count_page, len(article_list)))
        return article_list
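
The collect/forward/reply counters above share one regex-with-fallback pattern; a tiny illustrative helper (not part of the original code) that captures it:

import re

def first_int(text, default=0):
    # Action-bar labels such as "收藏 12" yield 12; bare labels with no number
    # fall back to the default, matching the if/else chains above.
    m = re.search(r'(\d+)', text or '')
    return int(m.group(1)) if m else default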