def getArticleList(self, num=None):
    """Build the list of article records for this official account.

    Documents are read from the collection opened with ``self.ori_nickname``
    and projected onto the keys yielded by ``l1llll1lll_wcplus_``.

    :param num: when truthy, only the first ``num`` documents sorted by
        ``p_date`` descending are read; otherwise everything returned by
        ``col.get()`` is used.
    :return: list of dicts, one per document, holding every schema key;
        keys a document lacks are filled with ``-2``.
    """
    from pymongo import DESCENDING

    col = DB(self.ori_nickname)
    if num:
        # The sorted cursor is sliced directly. The original called it first
        # (``.sort(...)()``); presumably this is a pymongo cursor, which is
        # not callable, so that call would raise TypeError.
        article_datas = col.table.find().sort('p_date', DESCENDING)[:num]
    else:
        article_datas = col.get()

    begin_time = time.time()
    article_list = []
    for doc in article_datas:
        # The document id is its content URL.
        doc['id'] = doc['content_url']
        article_list.append(
            {key: doc.get(key, -2) for key in l1llll1lll_wcplus_})
    logger.info('解析文章文本用时 %.3f' % (time.time() - begin_time))
    return article_list
def prepare_docs(self, num=None):
    """Build the list of article documents for this official account.

    Documents are read from the collection opened with ``self.nickname_raw``
    and projected onto the keys yielded by ``doc_schema``.

    :param num: when truthy, only the first ``num`` documents sorted by
        ``p_date`` descending are read; otherwise everything returned by
        ``col.get()`` is used.
    :return: list of dicts, one per document, holding every schema key;
        keys a document lacks are filled with ``-2``.
    """
    from pymongo import DESCENDING

    # Fetch the article documents from the database.
    col = CollectionOperation(self.nickname_raw)
    if num:
        # The sorted cursor is sliced directly. The original called it first
        # (``.sort(...)()``); presumably this is a pymongo cursor, which is
        # not callable, so that call would raise TypeError.
        db_docs = col.table.find().sort("p_date", DESCENDING)[:num]
    else:
        db_docs = col.get()

    begin_time = time.time()
    doc_list = []
    for doc in db_docs:
        # The document id is its content URL.
        doc['id'] = doc['content_url']
        # Fields missing from the database document are filled with -2.
        doc_list.append({key: doc.get(key, -2) for key in doc_schema})
    logger.info('解析文章文本用时 %.3f'%(time.time()-begin_time))
    return doc_list
def l1l11l111_wcplus_(cls, q=True):
    """Check the stored certificate and return its expiry date.

    The pair ``(mac, pt)`` comes from ``cls.l1lll1ll1l1_wcplus_()``. The
    expiry timestamp is ``int(pt) - int(mac) - 12874767561234`` and is
    compared against the value returned by ``l1lll1l11ll_wcplus_()``
    (presumably the current time in seconds -- confirm).

    :param q: quiet flag; when falsy the outcome is also logged.
    :return: ``False`` when the certificate is missing, does not match, the
        time value is unavailable, or the certificate has expired; otherwise
        the expiry as a ``'%Y-%m-%d %H:%M:%S'`` string.
    """
    mac, pt = cls.l1lll1ll1l1_wcplus_()
    # Both values go through int() below, so one missing value is as
    # unusable as two (the original only bailed out when both were missing).
    if not mac or not pt:
        return False

    local_id = cls.l11l111l1_wcplus_()
    allowed_ids = l1lll1l1lll_wcplus_()
    if local_id == 1:
        return False
    if int(mac) not in allowed_ids:
        if not q:
            logger.warning('证书错误')
        return False

    end_time = int(pt) - int(mac) - 12874767561234
    now = l1lll1l11ll_wcplus_()
    if not now:
        return False
    # Reuse the value that was just validated instead of fetching it again;
    # a second fetch could return a falsy value and break the subtraction.
    if end_time - now <= 0:
        if not q:
            logger.warning('证书过期')
        return False

    expiry_text = datetime.utcfromtimestamp(end_time).strftime(
        '%Y-%m-%d %H:%M:%S')
    if not q:
        logger.info('证书有效至' + expiry_text)
    return expiry_text
def response(self, l1lll1111l1_wcplus_):
    """Record request parameters and nicknames from a captured response.

    For every key of ``l1lll111111_wcplus_`` whose value occurs in the
    request URL, the formatted request data is stored under
    ``'<id>.<key>.req'``. The id is kept in the module-level
    ``l11l11llll_wcplus_`` and is refreshed only on ``'home'`` requests.

    :param l1lll1111l1_wcplus_: the captured flow; its ``request`` and
        ``response`` attributes are read.
    :return: ``None``. Returns early when the id is ``'UNK'``.
    """
    global l11l11llll_wcplus_
    flow = l1lll1111l1_wcplus_
    for key in l1lll111111_wcplus_:
        if l1lll111111_wcplus_[key] not in flow.request.url:
            continue

        req_data, _ = l1lll111lll_wcplus_.l1lll1111ll_wcplus_(flow.request)
        if key == 'home':
            l11l11llll_wcplus_ = self._1ll1llll1l_wcplus_(req_data)
        # Without a known id the parameters are not stored.
        if l11l11llll_wcplus_ == 'UNK':
            return
        key_name = '%s.%s.req' % (l11l11llll_wcplus_, key)
        l1lll11ll11_wcplus_(key_name, req_data)
        logger.debug(key_name)

        if key == 'getappmsgext':
            _, text = l1lll111lll_wcplus_.get_response(flow.response)
            try:
                payload = json.loads(text)
            except ValueError:
                # A body that is not JSON is treated like a missing nickname.
                payload = {}
            nick_name = payload['nick_name'] if 'nick_name' in payload else 'UNK'
            if nick_name == 'UNK':
                logger.debug('没能找到微信昵称 换一篇文章点击试试看 确保文章底部阅读数据出现')
            else:
                l1lll11ll11_wcplus_(nick_name + '.nick_name', l11l11llll_wcplus_)
        elif key == 'home':
            _, html_text = l1lll111lll_wcplus_.get_response(flow.response)
            pieces = html_text.split('var nickname = "')
            if len(pieces) > 1:
                current_nickname = pieces[1].split('" || ""')[0]
                logger.info('准备公众号:' + current_nickname)
                l1lll11ll11_wcplus_('current_nickname', current_nickname)
            else:
                # The page carries no nickname marker; skip it rather than
                # raise IndexError.
                logger.debug('没能在主页中找到公众号昵称')
def examplePassport(cls, q=True):
    """Check the stored certificate and return its expiry date.

    The pair ``(mac, pt)`` comes from ``cls.getMacUUid()``. The expiry
    timestamp is ``int(pt) - int(mac) - 12874767561234`` and is compared
    against the value returned by ``getBaiduTime()`` (presumably the current
    time in seconds -- confirm).

    :param q: quiet flag; when falsy the outcome is also logged.
    :return: ``False`` when the certificate is missing, does not match, the
        time value is unavailable, or the certificate has expired; otherwise
        the expiry as a ``'%Y-%m-%d %H:%M:%S'`` string.
    """
    mac, pt = cls.getMacUUid()
    # Both values go through int() below, so one missing value is as
    # unusable as two (the original only bailed out when both were missing).
    if not mac or not pt:
        return False

    uuid_ = cls.getUUid()
    mac_uuid = getUuidIp()
    if uuid_ == 1:
        return False
    if int(mac) not in mac_uuid:
        if not q:
            logger.warning('证书错误')
        return False

    end_time = int(pt) - int(mac) - 12874767561234
    baidu_time = getBaiduTime()
    if not baidu_time:
        return False
    # Reuse the value that was just validated instead of fetching it again;
    # a second fetch could return a falsy value and break the subtraction.
    if end_time - baidu_time <= 0:
        if not q:
            logger.warning('证书过期')
        return False

    expiry_text = datetime.utcfromtimestamp(end_time).strftime(
        '%Y-%m-%d %H:%M:%S')
    if not q:
        logger.info('证书有效至' + expiry_text)
    return expiry_text
def response(self, flow):
    """Record request parameters and nicknames from a captured response.

    For every key of ``url_filter`` whose value occurs in the request URL,
    the formatted request data is stored under ``'<wxuin>.<key>.req'``.

    :param flow: the captured flow; its ``request`` and ``response``
        attributes are read.
    :return: ``None``. Returns early when the extracted wxuin is ``'UNK'``.
    """
    for key in url_filter:
        if url_filter[key] not in flow.request.url:
            continue

        req_data, _ = ExtractFlow.format_request_data(flow.request)
        wxuin = self._extract_wxuin(req_data)
        # Without a known wxuin the parameters are not stored.
        if wxuin == 'UNK':
            return
        key_name = '%s.%s.req' % (wxuin, key)
        insert_helper(key_name, req_data)
        logger.debug(key_name)

        if key == 'getappmsgext':
            _, text = ExtractFlow.get_response(flow.response)
            try:
                text_dict = json.loads(text)
            except ValueError:
                # A body that is not JSON is treated like a missing nickname.
                text_dict = {}
            nick_name = text_dict['nick_name'] if 'nick_name' in text_dict else 'UNK'
            if nick_name == 'UNK':
                logger.debug('没能找到微信昵称 换一篇文章点击试试看 确保文章底部阅读数据出现')
            else:
                insert_helper(nick_name + '.nick_name', wxuin)
        elif key == 'home':
            _, html_text = ExtractFlow.get_response(flow.response)
            pieces = html_text.split('var nickname = "')
            if len(pieces) > 1:
                current_nickname = pieces[1].split('" || ""')[0]
                logger.info('准备公众号:' + current_nickname)
                insert_helper('current_nickname', current_nickname)
            else:
                # The page carries no nickname marker; skip it rather than
                # raise IndexError.
                logger.debug('没能在主页中找到公众号昵称')
def check_password(cls, q=True):
    """
    Validate the stored certificate and report its expiry date.

    As written, the function returns the fixed string
    ``'2099-12-31 00:00:00'`` immediately; everything after that first
    ``return`` is unreachable.

    The unreachable code reads ``(mac, pt)`` from ``cls.read_password()``
    and computes the expiry timestamp as ``int(pt) - int(mac) - 18058584888``.
    (The original docstring described ``pt`` as "real mac + 15618407030 +
    expiry timestamp" and ``mac`` as the user-supplied mac; that constant
    does not match the one used in the code.)

    :param q: quiet flag; when falsy the unreachable code also logs the outcome.
    :return: currently always ``'2099-12-31 00:00:00'``. The unreachable code
        would return ``False`` for an invalid or expired certificate and the
        expiry date as a ``'%Y-%m-%d %H:%M:%S'`` string otherwise.
    """
    # NOTE(review): unconditional return -- the whole check below never runs.
    # Confirm whether this bypass is intentional before relying on this function.
    return '2099-12-31 00:00:00'
    mac, pt = cls.read_password()
    if not mac and not pt:
        return False
    else:
        your_mac = cls.get_mac_address()
        your_macs, _ = get_uuind()
        if your_mac == 1:
            return False
        if int(mac) not in your_macs:
            if not q:
                logger.warning('证书错误')
            return False
        # Expiry timestamp recovered from the two stored values.
        end_time = int(pt) - int(mac) - 18058584888
        net_time = get_internet_time()
        if not net_time:
            return False
        # NOTE(review): get_internet_time() is called a second time here
        # instead of reusing net_time.
        left_seconds = end_time - get_internet_time()
        if left_seconds <= 0:
            if not q:
                logger.warning('证书过期')
            return False
        end_time_str = datetime.utcfromtimestamp(end_time).strftime(
            '%Y-%m-%d %H:%M:%S')
        if not q:
            logger.info('证书有效至' + end_time_str)
        return end_time_str
def l1llll1l1l_wcplus_(self, num=None):
    """Build the list of article documents for this official account.

    Documents are read from the collection opened with
    ``self.l1lll1lll1_wcplus_`` and projected onto the keys yielded by
    ``l1llll1lll_wcplus_``.

    :param num: when truthy, only the first ``num`` documents sorted by
        ``p_date`` descending are read; otherwise everything returned by
        ``col.get()`` is used.
    :return: list of dicts, one per document, holding every schema key;
        keys a document lacks are filled with ``-2``.
    """
    from pymongo import DESCENDING

    col = l1l11llll_wcplus_(self.l1lll1lll1_wcplus_)
    if num:
        # The sorted cursor is sliced directly. The original called it first
        # (``.sort(...)()``); presumably this is a pymongo cursor, which is
        # not callable, so that call would raise TypeError.
        db_docs = col.table.find().sort('p_date', DESCENDING)[:num]
    else:
        db_docs = col.get()

    begin_time = time.time()
    doc_list = []
    for doc in db_docs:
        # The document id is its content URL.
        doc['id'] = doc['content_url']
        doc_list.append(
            {key: doc.get(key, -2) for key in l1llll1lll_wcplus_})
    logger.info('解析文章文本用时 %.3f' % (time.time() - begin_time))
    return doc_list
def response(self, flow):
    """Record request parameters and nicknames from a captured response.

    For every key of ``url_filter`` whose value occurs in the request URL,
    the formatted request data is stored under ``'<wxuin>.<key>.req'``.
    The wxuin is kept in the module-level ``wxuin`` and is refreshed only on
    ``'home'`` requests.

    :param flow: the captured flow; its ``request`` and ``response``
        attributes are read.
    :return: ``None``. Returns early when the wxuin is ``'UNK'``.
    """
    global wxuin
    # Check whether the URL matches one of the filter entries.
    for key in url_filter:
        if url_filter[key] not in flow.request.url:
            continue

        # Convert the request into its parameters (the timestamp is unused).
        req_data, _ = ExtractFlow.format_request_data(flow.request)
        if key == 'home':
            wxuin = self._extract_wxuin(req_data)
        # Without a known wxuin the parameters are not stored.
        if wxuin == 'UNK':
            return
        key_name = '%s.%s.req'%(wxuin, key)
        insert_helper(key_name, req_data)
        logger.debug(key_name)

        # Map the WeChat nickname to its wxuin so that several WeChat
        # accounts can be collected at once.
        if key == 'getappmsgext':
            _, text = ExtractFlow.get_response(flow.response)
            try:
                text_dict = json.loads(text)
            except ValueError:
                # A body that is not JSON is treated like a missing nickname.
                text_dict = {}
            nick_name = text_dict['nick_name'] if 'nick_name' in text_dict else 'UNK'
            if nick_name == 'UNK':
                logger.debug('没能找到微信昵称 换一篇文章点击试试看 确保文章底部阅读数据出现')
            else:
                insert_helper(nick_name+'.nick_name', wxuin)
        elif key == 'home':
            _, html_text = ExtractFlow.get_response(flow.response)
            pieces = html_text.split('var nickname = "')
            if len(pieces) > 1:
                current_nickname = pieces[1].split('" || ""')[0]
                logger.info('准备公众号:'+current_nickname)
                insert_helper('current_nickname', current_nickname)
            else:
                # The page carries no nickname marker; skip it rather than
                # raise IndexError.
                logger.debug('没能在主页中找到公众号昵称')
def run(self, command, cmd_args):
    """Convert string ``mov`` fields to integers for every finished account.

    For each entry of ``Finished().get()['finished']`` the collection named
    by the entry's ``nickname`` is scanned; documents whose ``mov`` is a
    string get it converted with ``int`` and are written back through
    ``col.insert('id', ...)``.

    :param command: not used by this function.
    :param cmd_args: not used by this function.
    :return: ``None``.
    """
    from app.api.gzh import Finished
    from cmp.db.mongo import CollectionOperation

    for gzh in Finished().get()['finished']:
        nickname = gzh['nickname']
        col = CollectionOperation(nickname)
        articles_buffer = []
        for a in col.get():
            # .get() tolerates documents that have no 'mov' field at all.
            mov = a.get('mov')
            if isinstance(mov, str):
                a['mov'] = int(mov)
                articles_buffer.append(a)
        # Nothing to write back when no document needed converting.
        if articles_buffer:
            col.insert('id', articles_buffer)
        message = '转化完成 %d %s'%(len(articles_buffer), nickname)
        logger.info(message)
        command_log(message)
def print_ts_brief():
    """Log a one-line progress summary and forward it to ``front_process``.

    The task-status record is taken with ``TaskRecoder.get_ts()`` and handed
    straight back through ``TaskRecoder.put_ts``. Its ``speed``,
    ``done_task_num`` and ``total_task_num`` values are logged, and together
    with ``ips`` passed to ``front_process.new_article`` when
    ``front_process`` is set.
    """
    ts = TaskRecoder.get_ts()
    TaskRecoder.put_ts(ts)
    done = ts['done_task_num']
    total = ts['total_task_num']
    speed = ts['speed']
    logger.info('速度%.3f 完成%d/%d' % (speed, done, total))
    # front_process may be unset; skip the notification in that case.
    if front_process:
        front_process.new_article(done, total, ts['ips'], speed)
def l1l1ll1lll_wcplus_(l1l1l1l1l1_wcplus_=200):
    """Flush the module-level buffer ``l1l1l1111l_wcplus_`` once it is full.

    When the buffer holds at least ``l1l1l1l1l1_wcplus_`` items it is written
    with ``l11ll111l_wcplus_.insert`` and replaced by an empty list. A failed
    write is retried every 3 seconds until it succeeds.

    :param l1l1l1l1l1_wcplus_: minimum buffer length that triggers a save.
    :return: ``None``.
    """
    global l1l1l1111l_wcplus_
    if len(l1l1l1111l_wcplus_) < l1l1l1l1l1_wcplus_:
        return
    # Retry in a plain loop. The original also recursed from the except
    # branch, which grew the stack and re-inserted the already-cleared buffer.
    while True:
        try:
            # NOTE(review): `id` is the builtin function unless this module
            # rebinds the name -- confirm what key the insert helper expects.
            l11ll111l_wcplus_.insert(id, l1l1l1111l_wcplus_)
        except Exception:
            logger.exception('保存失败 3秒后重试')
            time.sleep(3)
        else:
            saved = len(l1l1l1111l_wcplus_)
            l1l1l1111l_wcplus_ = []
            # Report how many items were written, not the threshold.
            logger.info('保存成功保存%d' % saved)
            return
def l1l1l11lll_wcplus_():
    """Log a one-line progress summary and forward it to the front end.

    The task-status record is taken with
    ``l1l1ll1l11_wcplus_.l1ll11ll1l_wcplus_()`` and handed straight back
    through ``l1l1ll1l11_wcplus_.l1l11l1ll1_wcplus_``. Its ``speed``,
    ``done_task_num`` and ``total_task_num`` values are logged, and together
    with ``ips`` passed to ``l1ll1l11ll_wcplus_.l11l1l1ll_wcplus_`` when
    ``l1ll1l11ll_wcplus_`` is truthy.
    """
    ts = l1l1ll1l11_wcplus_.l1ll11ll1l_wcplus_()
    l1l1ll1l11_wcplus_.l1l11l1ll1_wcplus_(ts)
    done = ts['done_task_num']
    total = ts['total_task_num']
    speed = ts['speed']
    logger.info('速度%.3f 完成%d/%d' % (speed, done, total))
    if l1ll1l11ll_wcplus_:
        l1ll1l11ll_wcplus_.l11l1l1ll_wcplus_(done, total, ts['ips'], speed)
def save_article(windos=200):
    """Flush the module-level ``article_data_buffer`` once it is full.

    When the buffer holds at least ``windos`` items it is written with
    ``col_data.insert`` and replaced by an empty list. A failed write is
    retried every 3 seconds until it succeeds.

    :param windos: minimum buffer length that triggers a save.
    :return: ``None``.
    """
    global article_data_buffer
    if len(article_data_buffer) < windos:
        return
    # Retry in a plain loop. The original also recursed from the except
    # branch, which grew the stack and re-inserted the already-cleared buffer.
    while True:
        try:
            # NOTE(review): `id` is the builtin function unless this module
            # rebinds the name -- confirm what key the insert helper expects.
            col_data.insert(id, article_data_buffer)
        except Exception:
            logger.exception('保存失败 3秒后重试')
            time.sleep(3)
        else:
            saved = len(article_data_buffer)
            article_data_buffer = []
            # Report how many items were written, not the threshold.
            logger.info('保存成功保存%d' % saved)
            return
def _insert(use_data, item, p_date):
    """Add one article-list entry to ``use_data``.

    :param use_data: accumulator dict; its ``index`` counter is incremented
        and the entry is appended to its ``data`` list when it has a title.
    :param item: raw article record; the fields listed in ``keys`` are taken
        from it via ``DecodeArticleList.sub_dict``.
    :param p_date: publication time as a timestamp accepted by
        ``datetime.fromtimestamp``.
    :return: ``None``.
    """
    use_data['index'] += 1
    keys = ('title', 'author', 'content_url', 'digest', 'cover',
            'source_url', 'mov', 'nickname')
    sub_data = DecodeArticleList.sub_dict(item, keys)
    sub_data['p_date'] = datetime.fromtimestamp(p_date)
    # The id is derived from the content URL.
    sub_data['id'] = get_md5(sub_data['content_url'])
    # Only entries that have a title are kept.
    if sub_data['title']:
        use_data['data'].append(sub_data)
    # The original also built a throw-away `item` dict here that was never
    # read; it has been dropped.
    logger.info('采集文章列表中... %2d %2s %s' % (
        use_data['index'], sub_data['mov'], sub_data['title']))
def _insert(l11lll1lll_wcplus_, item, l1l1111111_wcplus_):
    """Add one article-list entry to the accumulator.

    :param l11lll1lll_wcplus_: accumulator dict; its ``index`` counter is
        incremented and the entry is appended to its ``data`` list when it
        has a title.
    :param item: raw article record; the fields listed in ``keys`` are taken
        from it via ``l11lll11l1_wcplus_.l11llll111_wcplus_``.
    :param l1l1111111_wcplus_: publication time as a timestamp accepted by
        ``datetime.fromtimestamp``.
    :return: ``None``.
    """
    use_data = l11lll1lll_wcplus_
    use_data['index'] += 1
    keys = ('title', 'author', 'content_url', 'digest', 'cover',
            'source_url', 'mov', 'nickname')
    sub_data = l11lll11l1_wcplus_.l11llll111_wcplus_(item, keys)
    sub_data['p_date'] = datetime.fromtimestamp(l1l1111111_wcplus_)
    # The id is derived from the content URL.
    sub_data['id'] = l11llll11_wcplus_(sub_data['content_url'])
    # Store mov as an integer.
    sub_data['mov'] = int(sub_data['mov'])
    # Only entries that have a title are kept.
    if sub_data['title']:
        use_data['data'].append(sub_data)
    # The original also built a throw-away `item` dict here that was never
    # read; it has been dropped.
    logger.info('采集文章列表中... %2d %2s %s' % (
        use_data['index'], sub_data['mov'], sub_data['title']))
def _insert(use_data, item, p_date):
    """Add one article-list entry to ``use_data``.

    :param use_data: accumulator dict; its ``index`` counter is incremented
        and the entry is appended to its ``data`` list when it has a title.
    :param item: raw article record; the fields listed in ``keys`` are taken
        from it via ``DecodeArticleList.sub_dict``.
    :param p_date: publication time as a timestamp accepted by
        ``datetime.fromtimestamp``.
    :return: ``None``.
    """
    use_data['index'] += 1
    keys = ('title', 'author', 'content_url', 'digest', 'cover',
            'source_url', 'mov', 'nickname')
    sub_data = DecodeArticleList.sub_dict(item, keys)
    sub_data["p_date"] = datetime.fromtimestamp(p_date)
    # The id is derived from the article URL so the same key works across
    # different database operations.
    sub_data["id"] = get_md5(sub_data['content_url'])
    # Store mov as an integer.
    sub_data['mov'] = int(sub_data['mov'])
    # Only entries that have a title are kept; articles without a title
    # have already been deleted.
    if sub_data["title"]:
        use_data['data'].append(sub_data)
    # The original also built a throw-away `item` dict here that was never
    # read; it has been dropped.
    logger.info('采集文章列表中... %2d %2s %s'%(
        use_data["index"], sub_data["mov"], sub_data["title"]))
def l11l1llll1_wcplus_(rd):
    """Log the read, like, reward and comment counters held in ``rd``.

    :param rd: mapping with the keys ``read_num``, ``like_num``,
        ``reward_num`` and ``comment_num``.
    """
    counters = (
        rd['read_num'],
        rd['like_num'],
        rd['reward_num'],
        rd['comment_num'],
    )
    message = '采集阅读数据中... 阅读%-5d 点赞%-4d 赞赏%3d 评论%d' % counters
    logger.info(message)
def l1l11ll11l_wcplus_():
    """Log the current task-status record.

    The record is taken with ``l1l1ll1l11_wcplus_.l1ll11ll1l_wcplus_()`` and
    handed straight back through ``l1l1ll1l11_wcplus_.l1l11l1ll1_wcplus_``
    before it is logged.
    """
    status = l1l1ll1l11_wcplus_.l1ll11ll1l_wcplus_()
    l1l1ll1l11_wcplus_.l1l11l1ll1_wcplus_(status)
    logger.info(status)
def print_ts():
    """Log the current task-status record.

    The record is taken with ``TaskRecoder.get_ts()`` and handed straight
    back through ``TaskRecoder.put_ts`` before it is logged.
    """
    status = TaskRecoder.get_ts()
    TaskRecoder.put_ts(status)
    logger.info(status)
def print_log(rd):
    """Log the read, like, reward and comment counters held in ``rd``.

    :param rd: mapping with the keys ``read_num``, ``like_num``,
        ``reward_num`` and ``comment_num``.
    """
    counters = (
        rd['read_num'],
        rd['like_num'],
        rd['reward_num'],
        rd['comment_num'],
    )
    message = '采集阅读数据中... 阅读%-5d 点赞%-4d 赞赏%3d 评论%d' % counters
    logger.info(message)