Example #1
    def fetch_ori_page(self, page_url):
        """拿到单个文章页面,在文章url里加上参数f=json可以直接得到
        json格式的数据。
        """
        # First resolve the Sogou redirect to get the WeChat article URL
        pre_r = get(page_url, headers=self.headers)
        wechat_url = pre_r.url.split('#')[0] + '&f=json'

        if 'mp.weixin' not in wechat_url:
            return

        r = get(wechat_url, headers=self.headers)
        self.logger.info(wechat_url)
        if self.col.find_one(dict(nick_name=self.name, url=wechat_url)):
            raise DocumentExistsException("article exist")
        if r.status_code != 200:
            return

        o = json.loads(r.text)
        self.col.update(
            {
                '_id': gid(),
                'nick_name': self.name,
                'url': wechat_url,
            }, {'$set': {
                'json': o
            }},
            upsert=True)
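
The useful trick in `fetch_ori_page` is that a WeChat article URL answers with JSON once `f=json` is appended to its query string, after following the Sogou redirect. Below is a minimal standalone sketch of just that fetch, assuming the `requests` library and a plain headers dict as stand-ins for the `get` helper and `self.headers` used above:

import requests

HEADERS = {'User-Agent': 'Mozilla/5.0'}  # hypothetical browser-like headers

def fetch_article_json(sogou_url):
    # Follow the Sogou redirect; the final URL should point at mp.weixin.qq.com.
    pre_r = requests.get(sogou_url, headers=HEADERS)
    wechat_url = pre_r.url.split('#')[0] + '&f=json'

    if 'mp.weixin' not in wechat_url:
        return None

    # With f=json appended, the article endpoint returns JSON instead of HTML.
    r = requests.get(wechat_url, headers=HEADERS)
    if r.status_code != 200:
        return None
    return r.json()
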
Example #2
    def save_article_dict_list(self, nick_name, article_dict_list):
        # First delete the oldest article once the per-account limit is exceeded
        if self.col.find(dict(nick_name=self.name)).count() > self.limit:
            oldest_doc = list(self.col.find(dict(nick_name=self.name)).
                              sort([('ori_create_time', 1)]).limit(1))[0]
            oldest_doc_id = oldest_doc.get('_id')
            self.col.remove({'_id': oldest_doc_id})
            self.logger.info(
                "%s:删除:%s : %s\n" %
                (
                    self.name,
                    oldest_doc.get('title'),
                    datestr_from_stamp(
                        oldest_doc.get('ori_create_time'), '%Y-%m-%d'
                    )
                )
            )
        for o in article_dict_list:
            if o['title']:
                o_date = datestr_from_stamp(
                    o.get('ori_create_time'), '%Y-%m-%d'
                )
                self.logger.info(
                    '%s - saving article title: %s %s\n',
                    self.name, o['title'], o_date
                )

                o['tag_id'] = self.tag_id
                self.col.update({'_id': gid()}, {'$set': o}, True)
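
The first half of `save_article_dict_list` keeps each account's collection bounded: once the document count for the account exceeds `self.limit`, the entry with the smallest `ori_create_time` is deleted. A minimal sketch of the same pruning step, written against current pymongo names (`count_documents`, `delete_one`) rather than the legacy `find().count()` / `remove()` calls above; the collection and limit names here are illustrative:

from pymongo import MongoClient, ASCENDING

def prune_oldest(col, nick_name, limit):
    # Remove the oldest article for this account once the count exceeds the limit.
    if col.count_documents({'nick_name': nick_name}) > limit:
        oldest = col.find_one({'nick_name': nick_name},
                              sort=[('ori_create_time', ASCENDING)])
        if oldest is not None:
            col.delete_one({'_id': oldest['_id']})
            return oldest
    return None

# Usage (hypothetical database/collection names):
# col = MongoClient()['wechat']['articles']
# prune_oldest(col, 'some_account', limit=100)
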
Example #3
    def fetch_page(self, page_url):
        """拿到单个文章页面,在文章url里加上参数f=json可以直接得到json格式
        的数据,处理json拿到需要的字段。
        """
        if self.col.find(dict(nick_name=self.name)).count() > self.limit:
            oldest_doc = list(
                self.col.find(dict(nick_name=self.name)).sort([
                    ('ori_create_time', 1)
                ]).limit(1))[0]
            oldest_doc_id = oldest_doc.get('_id')
            self.col.remove({'_id': oldest_doc_id})
            self.logger.info(
                "%s:删除:%s : %s\n" %
                (self.name, oldest_doc.get('title'),
                 datestr_from_stamp(oldest_doc.get('ori_create_time'),
                                    '%Y-%m-%d')))

        # First resolve the Sogou redirect to get the WeChat article URL
        pre_r = get(page_url, headers=self.headers)
        wechat_url = pre_r.url.split('#')[0] + '&f=json'

        if 'mp.weixin' not in wechat_url:
            return

        r = get(wechat_url, headers=self.headers)
        self.logger.info(wechat_url)
        if self.col.find_one(dict(nick_name=self.name, url=wechat_url)):
            raise DocumentExistsException("article exist")

        if r.status_code != 200:
            return

        o = json.loads(r.text)
        if o.get('title') is None:  # articles taken down after a complaint lack this field; skip
            return

        fields = {
            'cdn_url', 'nick_name', 'title', 'content', 'desc', 'link',
            'ori_create_time'
        }
        media_fields = {'round_head_img', 'nick_name', 'signature'}
        media_dict = {k: o.get(k) for k in media_fields}
        article_dict = {k: o.get(k) for k in fields}

        if self.col.find_one(dict(nick_name=self.name, title=o['title'])):
            raise DocumentExistsException("article exist")

        too_old_days = 10
        if days_from_now(o['ori_create_time']) > too_old_days:  # skip articles older than 10 days
            self.logger.info('%s - skipping article older than %d days, title: %s\n',
                             self.name, too_old_days, o['title'])
            raise DocumentExpireException("expire")

        if o['title'] and o['content']:
            o_date = datestr_from_stamp(o.get('ori_create_time'), '%Y-%m-%d')
            self.logger.info('%s - saving article title: %s %s\n', self.name,
                             o['title'], o_date)

            article_dict['nick_name'] = self.name
            article_dict['url'] = wechat_url
            article_dict['tag_id'] = self.tag_id
            del article_dict['content']  # the full content is not persisted, only metadata
            self.col.update({'_id': gid()}, {'$set': article_dict}, True)

        # http://mp.weixin.qq.com/s?__biz=MjM5NjAxMDc4MA==&mid=404900944&idx=1&sn=fe2d53ce562ee51e7163a60d4c95484a#rd
        biz = extract('__biz=', '==', article_dict['link'])
        self.media_col.update({'_id': biz}, {'$set': media_dict}, True)
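
The `extract` helper on the last line is not shown in these examples. Judging by the call site and the sample link in the comment, it returns the substring between the two markers, i.e. the `__biz` value that keys the media document. A plausible implementation under that assumption (the real helper may behave differently):

def extract(start_marker, end_marker, text):
    # Return the substring between start_marker and end_marker, or None if absent.
    start = text.find(start_marker)
    if start == -1:
        return None
    start += len(start_marker)
    end = text.find(end_marker, start)
    return text[start:end] if end != -1 else None

# With the sample URL from the comment above:
# extract('__biz=', '==', 'http://mp.weixin.qq.com/s?__biz=MjM5NjAxMDc4MA==&mid=404900944&idx=1&sn=fe2d53ce562ee51e7163a60d4c95484a#rd')
# -> 'MjM5NjAxMDc4MA'
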