Example #1
 def prepare_docs(self, num=None):
     """
     :param num: if given, prepare only the num most recently published articles
     :return: the full data of all (or the first num) articles of the official
         account identified by its nickname; missing fields get a default value
     """
     from pymongo import DESCENDING
     doc_list = []
     # Fetch the article list from the database
     col = CollectionOperation(self.nickname_raw)
     if num:
         db_docs = col.table.find().sort("p_date", DESCENDING).limit(num)
     else:
         db_docs = col.get()
     begin_time = time.time()
     # Build the doc list from the keys defined in doc_schema
     for doc in db_docs:
         item = {}
         # Use the article URL as the document id
         doc['id'] = doc['content_url']
         for key in doc_schema:
             if key in doc:
                 item[key] = doc[key]
             # Fill with -2 when the field is missing from the database
             else:
                 item[key] = -2
         doc_list.append(item)
     logger.info('Parsing article text took %.3f s' % (time.time() - begin_time))
     return doc_list
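The corrected query relies on pymongo's cursor chaining: sort, then limit. A minimal self-contained sketch of the same pattern, assuming a local MongoDB instance (database, collection, and field names are placeholders, not the project's):

    from pymongo import MongoClient, DESCENDING

    client = MongoClient()                   # assumed local MongoDB
    col = client['wcplus']['some_nickname']  # placeholder names

    # The 10 most recently published articles, newest first
    recent = list(col.find().sort('p_date', DESCENDING).limit(10))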
Example #2
 def delete_like(article_info):
     """
     :param article_info: {nickname, content_url}
     :return: remove the article from favorites
     """
     # Remove the article from the likes collection
     col_like.delete(content_url=article_info['content_url'])
     # Flip the like flag back to False in the original database record
     col_data = CollectionOperation(article_info['nickname'])
     article_data = col_data.get(content_url=article_info['content_url'])[0]
     article_data['like_folder'] = False
     col_data.insert(key='content_url', data=article_data)
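Here CollectionOperation.insert(key=..., data=...) evidently upserts a record keyed on the given field. A minimal sketch of that behavior in plain pymongo (the helper name and collection are illustrative, not the project's actual implementation):

    from pymongo import MongoClient

    def upsert_by_key(collection, key, data):
        # Replace the record with the same key field, or insert it if absent
        collection.replace_one({key: data[key]}, data, upsert=True)

    col = MongoClient()['wcplus']['some_nickname']  # placeholder names
    upsert_by_key(col, 'content_url',
                  {'content_url': 'http://mp.weixin.qq.com/s?...', 'like_folder': False})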
Example #3
 def run(self, command, cmd_args):
     from app.api.gzh import Finished
     from cmp.db.mongo import CollectionOperation
     for gzh in Finished().get()['finished']:
         col = CollectionOperation(gzh['nickname'])
         articles_buffer = []
         for a in col.get():
             # Normalize mov values stored as strings into integers
             if isinstance(a['mov'], str):
                 a['mov'] = int(a['mov'])
                 articles_buffer.append(a)
         col.insert('id', articles_buffer)
         logger.info('Conversion finished %d %s' % (len(articles_buffer), gzh['nickname']))
         command_log('Conversion finished %d %s' % (len(articles_buffer), gzh['nickname']))
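One caveat: if any mov value is a non-numeric string, int() raises ValueError and aborts the whole conversion. A defensive variant of the inner loop (a sketch; the -1 sentinel is an assumption, not project behavior):

    for a in col.get():
        if isinstance(a['mov'], str):
            try:
                a['mov'] = int(a['mov'])
            except ValueError:
                a['mov'] = -1  # assumed sentinel for unparseable values
            articles_buffer.append(a)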
Example #4
 def add_like(article_info):
     """
     :param article_info: {nickname, content_url}
     :return: add the article to favorites
     """
     # Fetch the original record from the database by nickname and content_url
     col_data = CollectionOperation(article_info['nickname'])
     article_data = col_data.get(content_url=article_info['content_url'])[0]
     # Mark the record as favorited in the original database
     article_data['like_folder'] = True
     col_data.insert(key='content_url', data=article_data)
     # Record when the article was favorited
     article_data['like_time'] = datetime.now()
     # Insert into the "微搜收藏" (favorites) collection
     res = col_like.insert(key='content_url', data=article_data)
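Together, add_like and delete_like toggle an article in and out of the favorites collection while keeping the like_folder flag on the original record in sync. A hypothetical call (nickname and URL are placeholders):

    article_info = {
        'nickname': '某公众号',                          # placeholder account nickname
        'content_url': 'http://mp.weixin.qq.com/s?...',  # placeholder article URL
    }
    add_like(article_info)     # favorite: set the flag and copy into the favorites collection
    delete_like(article_info)  # undo: clear the flag and remove the copy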
Example #5
File: gzh.py Project: zmjaction/WCplus
    def get_article_list(self, page_info):
        """
        :param page_info: {'nickname','start','end'}
        :return: 返回一个公众号的全部文章列表
        """
        col_data = CollectionOperation(page_info['nickname'])
        info_data = []
        cnt = 1
        articles = col_data.get()[int(page_info['start']):int(page_info['end'])]
        for a in articles:
            item = {}
            item['id'] = cnt
            item['mov'] = a['mov']
            if 'read_num' in a:
                item['read'] = a['read_num']
            else:
                item['read'] = '-'
            if 'like_num' in a:
                item['like'] = a['like_num']
            else:
                item['like'] = '-'
            if 'reward_num' in a:
                item['reward'] = a['reward_num']
            else:
                item['reward'] = '-'
            if 'comment_num' in a:
                item['comment'] = a['comment_num']
            else:
                item['comment'] = '-'
            item['date'] = a['p_date'].timestamp()
            item['title'] = a['title']
            item['url'] = a['content_url']
            item['md5'] = get_md5(a['content_url'])
            cnt += 1
            info_data.append(item)

        return info_data
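The four if/else blocks all fall back to '-' when a count is missing; dict.get expresses the same defaulting more compactly. An equivalent loop body (a sketch, same behavior):

    for a in articles:
        item = {
            'id': cnt,
            'mov': a['mov'],
            'read': a.get('read_num', '-'),
            'like': a.get('like_num', '-'),
            'reward': a.get('reward_num', '-'),
            'comment': a.get('comment_num', '-'),
            'date': a['p_date'].timestamp(),
            'title': a['title'],
            'url': a['content_url'],
            'md5': get_md5(a['content_url']),
        }
        cnt += 1
        info_data.append(item)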
Example #6
class ReadingData:
    def __init__(self):
        self.wx_req_data_list = rd.tidy()
        self.nickname = self.wx_req_data_list[0]['nickname']
        self.every_delay = 3.0
        self.wx_num = len(self.wx_req_data_list)
        self.delay = round(self.every_delay / self.wx_num, 3)
        self.articles = []
        self.col_data = CollectionOperation(self.nickname)
        self.pre_crawl_time = time.time()

    def get_all_reading_data(self, filter=None, process=None):
        """
        :param filter: not used in this snippet
        :param process: progress reporter, notified after each article
        :return: rotate through the WeChat parameters in wx_req_data_list
            and crawl the reading data of every article
        """
        if 'getappmsgext' in self.wx_req_data_list[0]:
            raw_articles = self.col_data.get(read_num={'$exists': False})
            cnt = 0
            for a in raw_articles:
                if 'mp.weixin.qq.com' in a['content_url']:
                    if 'comment_id' not in a:
                        a['comment_id'] = 0
                    self.articles.append(
                        [cnt, a['content_url'], a['comment_id']])
                    cnt += 1

            for item in self.articles:
                # Throttle: wait until self.delay seconds have passed since
                # the previous request
                while time.time() - self.pre_crawl_time <= self.delay:
                    time.sleep(0.05)

                self.pre_crawl_time = time.time()
                reading_data = Crawler(
                    item[1], item[2],
                    self.wx_req_data_list[item[0] % self.wx_num]).run()
                reading_data = self.check(reading_data, item)
                reading_data['id'] = get_md5(item[1])
                self.col_data.insert('id', reading_data)
                process.new_reading_data(item[0] + 1, len(self.articles),
                                         self.delay)

        else:
            logger.warning('Open any article of this official account on the phone until its read count appears')

    def save(self, reading_data):
        """
        :param reading_data:
        :return: save the data
        """
        pass

    def prepare_task(self):
        """
        :return: prepare tasks for the multithreaded mode
        """
        for item in self.articles:
            yield {'index': item[0], 'url': item[1]}

    def task_handler(self, task):
        """
        :return: task handler for the multithreaded mode
        """
        Crawler(task['url'],
                self.wx_req_data_list[task['index'] % self.wx_num]).run()

    def check(self, reading_data, item):
        """
        :return: 带着本次请求的参数和结果一起过安检
        请求失败导致安检不通过 安检提醒人重新操作手机 操作完之后再次发起请求
        不排除还是会失败  继续调用自己 反正想办法让其获得成功的请求  最后返回成功的请求
        """
        if reading_data != 'req_data_error':
            stop_and_start.check({'crawler': 'reading data', 'msg': 'success'})
        else:
            stop_and_start.check({'crawler': 'reading data', 'msg': 'req_data_error'})
            self.wx_req_data_list = rd.tidy()
            while len(self.wx_req_data_list) == 0:
                self.wx_req_data_list = rd.tidy()
                from utils.front import notification
                notification('No parameters found', 'Parameter error', _type='error')
                time.sleep(3)

            reading_data = Crawler(item[1], item[2],
                                   self.wx_req_data_list[0]).run()
            # Keep the retried result; without this assignment the recursive
            # call's successful value would be discarded
            reading_data = self.check(reading_data, item)
        return reading_data
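Because check retries through unbounded recursion, a long streak of failures can exhaust the stack. An iterative equivalent (a sketch under the same assumptions about Crawler, rd, and stop_and_start; not the project's code):

    def check(self, reading_data, item):
        # Loop instead of recursing: retry until the crawler returns real data
        while reading_data == 'req_data_error':
            stop_and_start.check({'crawler': 'reading data', 'msg': 'req_data_error'})
            self.wx_req_data_list = rd.tidy()
            while len(self.wx_req_data_list) == 0:
                self.wx_req_data_list = rd.tidy()
                from utils.front import notification
                notification('No parameters found', 'Parameter error', _type='error')
                time.sleep(3)
            reading_data = Crawler(item[1], item[2],
                                   self.wx_req_data_list[0]).run()
        stop_and_start.check({'crawler': 'reading data', 'msg': 'success'})
        return reading_data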