def prepare_docs(self, num=None):
    """Prepare article documents for this account.

    :param num: if a concrete number, only the ``num`` most recently
                published articles are prepared; otherwise all articles
    :return: list of dicts, one per article, containing every key declared
             in ``doc_schema``; fields missing from the DB record are
             filled with the sentinel value -2
    """
    from pymongo import DESCENDING
    doc_list = []
    # Fetch the article records for this account from the database
    col = CollectionOperation(self.nickname_raw)
    if num:
        # BUG FIX: the original had `.sort(...)()` — calling the cursor
        # object raises TypeError. A pymongo cursor is sliced directly.
        db_docs = col.table.find().sort("p_date", DESCENDING)[:num]
    else:
        db_docs = col.get()
    begin_time = time.time()
    # Build the doc list from the keys declared in doc_schema
    for doc in db_docs:
        item = {}
        doc['id'] = doc['content_url']
        for key in doc_schema:
            if key in doc:
                item[key] = doc[key]
            else:
                # Field absent in the DB record: fill with -2
                item[key] = -2
        doc_list.append(item)
    logger.info('解析文章文本用时 %.3f'%(time.time()-begin_time))
    return doc_list
def delete_like(article_info):
    """Remove an article from the favorites collection.

    :param article_info: dict with at least ``content_url`` and ``nickname``
    :return: None
    """
    url = article_info['content_url']
    # Drop the entry from the likes collection
    col_like.delete(content_url=url)
    # Flip the like flag back to False in the account's source collection
    col_data = CollectionOperation(article_info['nickname'])
    record = col_data.get(content_url=url)[0]
    record['like_folder'] = False
    col_data.insert(key='content_url', data=record)
def run(self, command, cmd_args):
    """Convert every article's ``mov`` field from str to int for all
    finished accounts, then write the records back.

    :param command: CLI command name (unused here)
    :param cmd_args: CLI arguments (unused here)
    """
    from app.api.gzh import Finished
    from cmp.db.mongo import CollectionOperation
    for gzh in Finished().get()['finished']:
        col = CollectionOperation(gzh['nickname'])
        articles_buffer = []
        for a in col.get():
            # isinstance is the idiomatic (and subclass-safe) type check,
            # replacing the `type(x) == str` anti-pattern
            if isinstance(a['mov'], str):
                a['mov'] = int(a['mov'])
            articles_buffer.append(a)
        col.insert('id', articles_buffer)
        logger.info('转化完成 %d %s'%(len(articles_buffer), gzh['nickname']))
        command_log('转化完成 %d %s'%(len(articles_buffer), gzh['nickname']))
def add_like(article_info):
    """Add an article to the favorites collection.

    :param article_info: dict with ``nickname`` and ``content_url``
    :return: None
    """
    # Look up the original record by nickname + content_url
    col_data = CollectionOperation(article_info['nickname'])
    record = col_data.get(content_url=article_info['content_url'])[0]
    # Mark the record as favorited in the source collection
    record['like_folder'] = True
    col_data.insert(key='content_url', data=record)
    # Stamp the favorite time, then store a copy in "微搜收藏"
    record['like_time'] = datetime.now()
    res = col_like.insert(key='content_url', data=record)
def get_article_list(self, page_info):
    """Return one page of an account's article list for display.

    :param page_info: dict {'nickname', 'start', 'end'} where start/end
                      are slice bounds (str or int) into the article list
    :return: list of row dicts with keys id, mov, read, like, reward,
             comment, date, title, url, md5; missing counters become '-'
    """
    col_data = CollectionOperation(page_info['nickname'])
    info_data = []
    start = int(page_info['start'])
    end = int(page_info['end'])
    articles = col_data.get()[start:end]
    # Optional counters mapped (output key -> DB key); replaces four
    # copy-pasted if/else blocks with one data-driven loop
    optional = (
        ('read', 'read_num'),
        ('like', 'like_num'),
        ('reward', 'reward_num'),
        ('comment', 'comment_num'),
    )
    for cnt, a in enumerate(articles, 1):
        item = {'id': cnt, 'mov': a['mov']}
        for out_key, db_key in optional:
            # '-' is the display placeholder when the field is absent
            item[out_key] = a.get(db_key, '-')
        item['date'] = a['p_date'].timestamp()
        item['title'] = a['title']
        item['url'] = a['content_url']
        item['md5'] = get_md5(a['content_url'])
        info_data.append(item)
    return info_data
class ReadingData:
    """Crawl read/like statistics for one account's articles, rotating
    through the captured WeChat request parameter sets."""

    def __init__(self):
        self.wx_req_data_list = rd.tidy()
        self.nickname = self.wx_req_data_list[0]['nickname']
        # Total per-article delay budget in seconds, spread across the
        # available WeChat parameter sets
        self.every_delay = 3.0
        self.wx_num = len(self.wx_req_data_list)
        self.delay = round(self.every_delay / self.wx_num, 3)
        self.articles = []
        self.col_data = CollectionOperation(self.nickname)
        self.pre_crawl_time = time.time()

    def get_all_reading_data(self, filter=None, process=None):
        """Crawl reading data for every article that has none yet.

        Rotates through the WeChat parameter sets in ``wx_req_data_list``
        and throttles requests to ``self.delay`` seconds apart.

        :param filter: unused placeholder (kept for interface compatibility)
        :param process: progress reporter exposing ``new_reading_data``
        """
        if 'getappmsgext' in self.wx_req_data_list[0]:
            # Only articles without a read_num yet
            raw_articles = self.col_data.get(read_num={'$exists': False})
            cnt = 0
            for a in raw_articles:
                if 'mp.weixin.qq.com' in a['content_url']:
                    if 'comment_id' not in a:
                        a['comment_id'] = 0
                    self.articles.append(
                        [cnt, a['content_url'], a['comment_id']])
                    cnt += 1
            for item in self.articles:
                # Throttle: keep at least self.delay seconds between requests
                while time.time() - self.pre_crawl_time <= self.delay:
                    time.sleep(0.05)
                self.pre_crawl_time = time.time()
                reading_data = Crawler(
                    item[1], item[2],
                    self.wx_req_data_list[item[0] % self.wx_num]).run()
                # check() retries until the crawl succeeds
                reading_data = self.check(reading_data, item)
                reading_data['id'] = get_md5(item[1])
                self.col_data.insert('id', reading_data)
                process.new_reading_data(
                    item[0] + 1, len(self.articles), self.delay)
        else:
            logger.warning('点击查看该公众号的任意一篇文章且出现阅读量')

    def save(self, reading_data):
        """Placeholder for persisting reading data (currently unused).

        :param reading_data: crawl result payload
        """
        pass

    def prepare_task(self):
        """Yield crawl task dicts for multi-threaded processing."""
        for item in self.articles:
            yield {'index': item[0], 'url': item[1]}

    def task_handler(self, task):
        """Multi-threaded task handler: crawl one article's reading data.

        NOTE(review): Crawler is called with 2 positional args here but
        3 (url, comment_id, params) everywhere else — confirm the
        intended signature.
        """
        Crawler(task['url'],
                self.wx_req_data_list[task['index'] % self.wx_num]).run()

    def check(self, reading_data, item):
        """Validate a crawl result; on failure prompt the operator to
        re-capture parameters on the phone, then retry until a request
        succeeds.

        :param reading_data: result from Crawler.run(), or the string
                             'req_data_error' on failure
        :param item: [index, content_url, comment_id] of the article
        :return: a successful reading_data payload
        """
        if reading_data != 'req_data_error':
            stop_and_start.check({'crawler': '阅读数据', 'msg': 'success'})
        else:
            stop_and_start.check(
                {'crawler': '阅读数据', 'msg': 'req_data_error'})
            self.wx_req_data_list = rd.tidy()
            # Block until fresh parameters are captured
            while len(self.wx_req_data_list) == 0:
                self.wx_req_data_list = rd.tidy()
                from utils.front import notification
                notification('没有发现参数', '参数错误', _type='error')
                time.sleep(3)
            reading_data = Crawler(
                item[1], item[2], self.wx_req_data_list[0]).run()
            # BUG FIX: the retry's validation result was discarded, so a
            # second consecutive failure could return 'req_data_error' to
            # the caller; capture the recursive result instead.
            reading_data = self.check(reading_data, item)
        return reading_data