def search(self):
    """
    :return: Execute the search action.
    """
    # Search by the official account's nickname.
    indices = []
    st = deepcopy(search_template)
    dls = self.search_data_preprocess()
    st.update(dls)
    if self.source is not None:
        st["_source"] = self.source  # Restrict the returned fields.
    # Apply "from" and "size" to support pagination.
    try:
        st["from"] = self.from_size["from"]
        st["size"] = self.from_size["size"]
    except (KeyError, TypeError):
        logger.warning("Invalid from_size field %s" % str(self.from_size))
    # Choose which indices to search.
    if not self.index_list:
        indices = '*'
    else:
        indices = self.index_list
    try:
        result = es_instance.search(index=indices, body=st)['hits']
        return result
    except Exception as e:
        print(e)
        logger.error("Search error; a nonexistent search range may have been "
                     "specified or the index was never created: %s" % str(indices))
        return False
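# A minimal sketch of the request body that search() assembles; the query and
# field names below are illustrative assumptions, not the actual contents of
# search_template.
example_body = {
    'query': {'multi_match': {'query': 'some keyword',
                              'fields': ['title', 'digest', 'article']}},
    '_source': ['title', 'digest'],  # only set when self.source is not None
    'from': 0,                       # pagination offset, from self.from_size
    'size': 10,                      # page size, from self.from_size
}
# es_instance.search(index='gzh_*', body=example_body)['hits']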
def post(self):
    """
    :return: The search results; fields: 1 = title, 2 = digest, 3 = article, 4 = all.
    """
    args = parser.parse_args()
    if args['fields'] == '1':
        args['fields'] = ['title']
    elif args['fields'] == '2':
        args['fields'] = ['digest']
    elif args['fields'] == '3':
        args['fields'] = ['article']
    else:
        args['fields'] = ['title', 'digest', 'article']
    if args['range'] == '全部':  # '全部' ("all") is the literal value the frontend sends
        args['range'] = 'gzh_*'
    else:
        args['range'] = 'gzh_' + args['range']
    from app.search.search import l1ll111l1_wcplus_
    try:
        result = l1ll111l1_wcplus_(l1lll1l11l_wcplus_=args['search_data'],
                                   l1l1l1lll_wcplus_=args['range'],
                                   fields=args['fields'],
                                   _1lll1l1l1_wcplus_=int(args['from']),
                                   _1lll11ll1_wcplus_=int(args['size'])).get_result()
        return result
    except Exception:
        from utils.base import logger
        logger.warning('Search request timed out; try again a few times')
        return 'Search request timed out; try again a few times'
def l1lll111l_wcplus_(self, filter=None, process=None):
    """
    :param filter: Filter, e.g. by time or by count.
    :param process: Frontend progress-display instance.
    :return: Rotate through the WeChat accounts in the list to fetch the
             complete history article list.
    """
    offset = 0
    l11ll1l11l_wcplus_ = 1  # can_msg_continue flag
    cnt = 0
    if 'load_more' in self.l1l11ll1l_wcplus_[0]:
        while l11ll1l11l_wcplus_:
            # Throttle: wait until at least self.delay has passed since the last request.
            while time.time() - self.l11ll1lll1_wcplus_ <= self.delay:
                time.sleep(0.05)
            self.l11ll1lll1_wcplus_ = time.time()
            l1l11111ll_wcplus_ = l11lll1l1l_wcplus_(offset, self.l1l11ll1l_wcplus_[cnt % self.l11ll1l111_wcplus_]).run()
            l1l11111ll_wcplus_ = self.check(l1l11111ll_wcplus_, offset, cnt)
            l11ll1l11l_wcplus_ = int(l1l11111ll_wcplus_['des']['can_msg_continue'])
            offset = int(l1l11111ll_wcplus_['des']['next_offset'])
            cnt += 1
            self.l11ll1ll1l_wcplus_ = l1l11111ll_wcplus_['data']
            self.l11ll11ll_wcplus_ += len(self.l11ll1ll1l_wcplus_)
            l11lll111l_wcplus_ = self.l11ll1ll11_wcplus_(filter)
            self.l11lll1111_wcplus_ += len(self.l11ll1ll1l_wcplus_)
            l1l1l11l1_wcplus_.insert('id', {'id': self.nickname,
                                            'num': self.l11lll1111_wcplus_,
                                            'nickname': self.nickname,
                                            'time': datetime.now()})
            process.l11l1ll1l_wcplus_(self.l11lll1111_wcplus_)
            if self.save(self.l11ll1ll1l_wcplus_) == 'UPDATE':
                break
            if not l11lll111l_wcplus_:
                break
            time.sleep(self.delay)
    else:
        logger.warning('No load_more request captured; swipe up to load more history articles')
def l1l11l111_wcplus_(cls, q=True):
    mac, pt = cls.l1lll1ll1l1_wcplus_()
    if not mac and not pt:
        return False
    l1lll1l1l11_wcplus_ = cls.l11l111l1_wcplus_()
    l1lll1l1ll1_wcplus_ = l1lll1l1lll_wcplus_()
    if l1lll1l1l11_wcplus_ == 1:
        return False
    if int(mac) not in l1lll1l1ll1_wcplus_:
        if not q:
            logger.warning('Invalid license')
        return False
    end_time = int(pt) - int(mac) - 12874767561234
    l1lll1ll111_wcplus_ = l1lll1l11ll_wcplus_()
    if not l1lll1ll111_wcplus_:
        return False
    # Reuse the network time fetched above rather than making a second request.
    l1lll1l11l1_wcplus_ = end_time - l1lll1ll111_wcplus_
    if l1lll1l11l1_wcplus_ <= 0:
        if not q:
            logger.warning('License expired')
        return False
    l1lll1l1l1l_wcplus_ = datetime.utcfromtimestamp(end_time).strftime('%Y-%m-%d %H:%M:%S')
    if not q:
        logger.info('License valid until ' + l1lll1l1l1l_wcplus_)
    return l1lll1l1l1l_wcplus_
def check_password(cls, q=True):
    """
    :param pt: real mac + 15618407030 + expiry timestamp
    :param mac: the user-supplied mac
    :return: False if the license is invalid, otherwise the expiry date as a string.
    """
    # This early return short-circuits the whole check: every license is
    # reported valid until 2099-12-31 and the code below is unreachable.
    return '2099-12-31 00:00:00'
    mac, pt = cls.read_password()
    if not mac and not pt:
        return False
    your_mac = cls.get_mac_address()
    your_macs, _ = get_uuind()
    if your_mac == 1:
        return False
    if int(mac) not in your_macs:
        if not q:
            logger.warning('Invalid license')
        return False
    end_time = int(pt) - int(mac) - 18058584888
    net_time = get_internet_time()
    if not net_time:
        return False
    # Reuse the network time fetched above rather than making a second request.
    left_seconds = end_time - net_time
    if left_seconds <= 0:
        if not q:
            logger.warning('License expired')
        return False
    end_time_str = datetime.utcfromtimestamp(end_time).strftime('%Y-%m-%d %H:%M:%S')
    if not q:
        logger.info('License valid until ' + end_time_str)
    return end_time_str
def examplePassport(cls, q=True):
    mac, pt = cls.getMacUUid()
    if not mac and not pt:
        return False
    uuid_ = cls.getUUid()
    mac_uuid = getUuidIp()
    if uuid_ == 1:
        return False
    if int(mac) not in mac_uuid:
        if not q:
            logger.warning('Invalid license')
        return False
    end_time = int(pt) - int(mac) - 12874767561234
    baidu_time = getBaiduTime()
    if not baidu_time:
        return False
    # Reuse the network time fetched above rather than making a second request.
    time_left = end_time - baidu_time
    if time_left <= 0:
        if not q:
            logger.warning('License expired')
        return False
    end_time = datetime.utcfromtimestamp(end_time).strftime('%Y-%m-%d %H:%M:%S')
    if not q:
        logger.info('License valid until ' + end_time)
    return end_time
def parseHandleArticleList(self, filter=None, process=None):
    """
    :param filter: Filter, e.g. by time or by count.
    :param process: Frontend progress-display instance.
    :return: Rotate through the WeChat accounts in the list to fetch the
             complete history article list.
    """
    offset = 0
    can_msg_continue = 1
    cnt = 0
    if 'load_more' in self.articles_detail[0]:
        # The decompiled source collapsed several distinct values into `flag`
        # and `self.length`; they are separated here so the paging flag, the
        # account count, and the article counter do not clobber each other.
        wx_num = len(self.articles_detail)
        while can_msg_continue:
            while time.time() - self.time_now <= self.delay:
                time.sleep(0.05)
            self.time_now = time.time()
            article_list = Crawler(offset, self.articles_detail[cnt % wx_num]).run()
            article_list = self.check(article_list, offset, cnt)
            can_msg_continue = int(article_list['des']['can_msg_continue'])
            offset = int(article_list['des']['next_offset'])
            cnt += 1
            self.data = article_list['data']
            filter_res = self.checkFIlter(filter)
            self.length += len(self.data)
            crawler_log_table_instance.insert('id', {'id': self.nickname,
                                                     'num': self.length,
                                                     'nickname': self.nickname,
                                                     'time': datetime.now()})
            process.reportCrawlNum(self.length)
            if self.save(self.data) == 'UPDATE':
                break
            if not filter_res:
                break
            time.sleep(self.delay)
    else:
        logger.warning('No load_more request captured; swipe up to load more history articles')
def post(self):
    """
    :return: The search results; fields: 1 = title, 2 = digest, 3 = article, 4 = all.
    """
    args = parser.parse_args()
    # Map the fields code to the list of searched fields.
    if args['fields'] == '1':
        args['fields'] = ['title']
    elif args['fields'] == '2':
        args['fields'] = ['digest']
    elif args['fields'] == '3':
        args['fields'] = ['article']
    else:
        args['fields'] = ['title', 'digest', 'article']
    # Map the requested range to the official-account indices.
    if args['range'] == '全部':  # '全部' ("all") is the literal value the frontend sends
        args['range'] = 'gzh_*'
    else:
        args['range'] = 'gzh_' + args['range']
    from app.search.search import GZHSearch
    try:
        result = GZHSearch(search_data=args['search_data'],
                           gzhs=args['range'],
                           fields=args['fields'],
                           _from=int(args['from']),
                           _size=int(args['size'])).get_result()
        return result
    except Exception:
        from utils.base import logger
        logger.warning('Search request timed out; try again a few times')
        return 'Search request timed out; try again a few times'
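# A hypothetical client call against this endpoint; the host and route path
# are assumptions, while the parameter names and codes come from the handler
# above (fields: 1 title, 2 digest, 3 article, anything else = all).
import requests

resp = requests.post('http://localhost:5000/search', data={
    'search_data': 'some keyword',
    'range': '全部',   # or a specific nickname, mapped to index gzh_<nickname>
    'fields': '1',     # search titles only
    'from': '0',
    'size': '10',
})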
def get_all_reading_data(self, filter=None, process=None):
    """
    :param filter:
    :return: Rotate through the WeChat parameters in wx_req_data_list to
             collect each article's reading data.
    """
    if 'getappmsgext' in self.wx_req_data_list[0]:
        raw_articles = self.col_data.get(read_num={'$exists': False})
        cnt = 0
        for a in raw_articles:
            if 'mp.weixin.qq.com' in a['content_url']:
                # Reading data is collected before the article body, so there
                # is no comment_id yet.
                if 'comment_id' not in a:
                    a['comment_id'] = 0
                self.articles.append([cnt, a['content_url'], a['comment_id']])
                cnt += 1
        for item in self.articles:
            while time.time() - self.pre_crawl_time <= self.delay:
                time.sleep(0.05)
            self.pre_crawl_time = time.time()
            reading_data = Crawler(item[1], item[2],
                                   self.wx_req_data_list[item[0] % self.wx_num]).run()
            reading_data = self.check(reading_data, item)
            reading_data['id'] = get_md5(item[1])
            self.col_data.insert('id', reading_data)
            process.new_reading_data(item[0] + 1, len(self.articles), self.delay)
    else:
        logger.warning('Open any article of this account and make sure a read count appears')
def search(self):
    """
    :return: Execute the search action.
    """
    indices = []
    st = deepcopy(search_template)
    l111ll11l1_wcplus_ = self.l11l1111ll_wcplus_()
    st.update(l111ll11l1_wcplus_)
    if self.source is not None:
        st['_source'] = self.source
    try:
        st['from'] = self.l1lll1l1ll_wcplus_['from']
        st['size'] = self.l1lll1l1ll_wcplus_['size']
    except (KeyError, TypeError):
        logger.warning('Invalid from_size field %s' % str(self.l1lll1l1ll_wcplus_))
    if not self.index_list:
        indices = '*'
    else:
        indices = self.index_list
    try:
        result = l11l111ll1_wcplus_.search(index=indices, doc_type=self.doc_type, body=st)['hits']
        return result
    except Exception as e:
        print(e)
        logger.error('Search error; a nonexistent search range may have been '
                     'specified or the index was never created: %s' % str(indices))
        return False
def l1lll1ll1l1_wcplus_(cls):
    try:
        data = None
        with open('./license.ca', 'r', encoding='utf-8') as f:
            data = f.readlines()
        mac = int(data[70][:-1])                 # line 71: mac, trailing newline stripped
        l11l11l1l_wcplus_ = int(data[91][:-1])   # line 92: passport timestamp
        return (mac, l11l11l1l_wcplus_)
    except Exception:
        logger.warning('Could not find the license file license.ca')
        return (None, None)
def getMacUUid(cls):
    try:
        data = None
        with open('./license.ca', 'r', encoding='utf-8') as f:
            data = f.readlines()
        mac = int(data[70][:-1])    # line 71: mac, trailing newline stripped
        uuid = int(data[91][:-1])   # line 92: uuid/passport timestamp
        return (mac, uuid)
    except Exception:
        logger.warning('Could not find the license file license.ca')
        return (None, None)
def read_password(cls):
    try:
        data = None
        with open('./license.ca', 'r', encoding='utf-8') as f:
            data = f.readlines()
        mac = int(data[70][:-1])        # line 71: mac, trailing newline stripped
        passport = int(data[91][:-1])   # line 92: passport timestamp
        return (mac, passport)
    except Exception:
        logger.warning('Could not find the license file license.ca')
        return (None, None)
def run(self):
    try:
        self.delete_collection()
        self.delete_crawler_log()
        self.delete_html()
        self.delete_index()
    except Exception:
        from utils.base import logger
        logger.warning('Hit a warning while deleting data')
    from utils.front import notification
    notification(self.nickname, 'Deletion finished; refresh the page and the account will disappear', 'success')
def run(self):
    try:
        self.l11llll1l_wcplus_()
        self.l11lllll1_wcplus_()
        self.l1l111111_wcplus_()
        self.l1l1lll11_wcplus_()
    except Exception:
        from utils.base import logger
        logger.warning('Hit a warning while deleting data')
    from utils.front import l1l11111l_wcplus_
    l1l11111l_wcplus_(self.nickname, 'Deletion finished; refresh the page and the account will disappear', 'success')
def l1l1l1l11_wcplus_(self, process=None, mov=10):
    """
    :param mov: 10~17
    :return: Rotate through the WeChat parameters in wx_req_data_list to
             collect each article's reading data.
    """
    if 'getappmsgext' in self.l1l11ll1l_wcplus_[0]:
        l11l1ll11l_wcplus_ = self.l11ll111l_wcplus_.table.find({'$and': [
            {'read_num': {'$exists': False}},
            {'mov': {'$lte': int(mov)}}]})
        cnt = 0
        for a in l11l1ll11l_wcplus_:
            if 'mp.weixin.qq.com' in a['content_url']:
                if 'comment_id' not in a:
                    a['comment_id'] = 0
                self.l11lll11l_wcplus_.append([cnt, a['content_url'], a['comment_id']])
                cnt += 1
        for l11l1ll111_wcplus_ in self.l11lll11l_wcplus_:
            while time.time() - self.l11ll1lll1_wcplus_ <= self.delay:
                time.sleep(0.05)
            self.l11ll1lll1_wcplus_ = time.time()
            l1ll1l1l1_wcplus_ = l11lll1l1l_wcplus_(
                l11l1ll111_wcplus_[1], l11l1ll111_wcplus_[2],
                self.l1l11ll1l_wcplus_[l11l1ll111_wcplus_[0] % self.l11ll1l111_wcplus_]).run()
            l1ll1l1l1_wcplus_ = self.check(l1ll1l1l1_wcplus_, l11l1ll111_wcplus_)
            l1ll1l1l1_wcplus_['id'] = l11llll11_wcplus_(l11l1ll111_wcplus_[1])
            self.l11ll111l_wcplus_.insert('id', l1ll1l1l1_wcplus_)
            process.l11l1lll1_wcplus_(l11l1ll111_wcplus_[0] + 1,
                                      len(self.l11lll11l_wcplus_), self.delay)
    else:
        logger.warning('Open any article of this account and make sure a read count appears')
def l1ll111ll1_wcplus_(ip):
    """
    :return: Whether an IP that just had a failed request should be replaced
             with a new one. If the IP's failure count is still below the
             number of worker threads, do not request a new IP yet.
    """
    ts = l1l1ll1l11_wcplus_.l1ll11ll1l_wcplus_()
    l1l1ll1l11_wcplus_.l1l11l1ll1_wcplus_(ts)
    l1l1lll1ll_wcplus_ = ts['ips'][ip]['failed']
    l1ll11lll_wcplus_ = ts['worker_num']
    logger.warning('ip:%s failures:%d total tasks:%d' % (ip, l1l1lll1ll_wcplus_, l1ll11lll_wcplus_))
    if l1l1lll1ll_wcplus_ < l1ll11lll_wcplus_:
        return False
    # The failure count reached the worker count: reset it and rotate the IP.
    ts = l1l1ll1l11_wcplus_.l1ll11ll1l_wcplus_()
    ts['ips'][ip]['failed'] = 0
    l1l1ll1l11_wcplus_.l1l11l1ll1_wcplus_(ts)
    return True
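# A minimal, dependency-free sketch of the same decision rule (names here are
# illustrative): with N worker threads a single bad IP can be reported by up
# to N workers at once, so the IP is only rotated once its failure count
# reaches the worker count, and the counter is reset at that point.
def should_rotate_ip(state: dict, ip: str) -> bool:
    failed = state['ips'][ip]['failed']
    if failed < state['worker_num']:
        return False
    state['ips'][ip]['failed'] = 0  # reset before rotating
    return True

state = {'worker_num': 4, 'ips': {'1.2.3.4': {'failed': 4}}}
assert should_rotate_ip(state, '1.2.3.4') is True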
def get_all_reading_data(self, process=None, mov=10):
    """
    :param mov: 10~17
    :return: Rotate through the WeChat parameters in wx_req_data_list to
             collect each article's reading data.
    """
    # Collect the URLs of all articles that still need crawling, hand the URL
    # and related parameters to a fresh crawler instance, then save the data.
    if 'getappmsgext' in self.wx_req_data_list[0]:
        # Fetch the list of articles to crawl from the database:
        # raw_articles = self.col_data.get(read_num={"$exists": False})
        # Select articles with no reading data yet whose position is at most mov.
        raw_articles = self.col_data.table.find({"$and": [
            {"read_num": {"$exists": False}},
            {"mov": {"$lte": int(mov)}}]})
        # Collecting reading data takes a long time; cache the article list so
        # the database cursor is not held open throughout.
        cnt = 0
        for a in raw_articles:
            # [cnt, url, comment_id]
            if "mp.weixin.qq.com" in a['content_url']:
                # Reading data is collected before the article body, so there
                # is no comment_id yet.
                if 'comment_id' not in a:
                    a['comment_id'] = 0
                self.articles.append([cnt, a['content_url'], a['comment_id']])
                cnt += 1
        # Crawl one article at a time.
        for item in self.articles:
            while time.time() - self.pre_crawl_time <= self.delay:
                time.sleep(0.05)
            self.pre_crawl_time = time.time()
            reading_data = Crawler(item[1], item[2],
                                   self.wx_req_data_list[item[0] % self.wx_num]).run()
            # Run the sanity check and use its output, which is guaranteed well-formed.
            reading_data = self.check(reading_data, item)
            # Check passed.
            reading_data['id'] = get_md5(item[1])
            self.col_data.insert('id', reading_data)
            # Send progress to the frontend.
            process.new_reading_data(item[0] + 1, len(self.articles), self.delay)
        # Crawling all articles concurrently with multiple threads was tested
        # and proved infeasible: it gets rate-limited easily.
        # from cmp.mt import run_mt
        # run_mt(len(self.articles), self.prepare_task, self.task_handler)
    else:
        logger.warning('Open any article of this account and make sure a read count appears')
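# The selection criterion above, as a standalone PyMongo-style filter (the
# collection handle is illustrative): articles that have no reading data yet
# and sit at position <= mov within their push (mov ranges 10..17 per the
# docstring).
query = {'$and': [
    {'read_num': {'$exists': False}},
    {'mov': {'$lte': 10}},
]}
# cursor = db.articles.find(query)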
def get_all_article_list(self, filter=None, process=None):
    """
    :param filter: Filter, e.g. by time or by count.
    :param process: Frontend progress-display instance.
    :return: Rotate through the WeChat accounts in the list to fetch the
             complete history article list.
    """
    offset = 0
    can_msg_continue = 1
    cnt = 0
    if 'load_more' in self.wx_req_data_list[0]:
        while can_msg_continue:
            # Throttle: wait until at least self.delay has passed since the last request.
            while time.time() - self.pre_crawl_time <= self.delay:
                time.sleep(0.05)
            self.pre_crawl_time = time.time()
            list_data = Crawler(offset, self.wx_req_data_list[cnt % self.wx_num]).run()
            list_data = self.check(list_data, offset, cnt)
            can_msg_continue = int(list_data['des']['can_msg_continue'])
            offset = int(list_data['des']['next_offset'])
            cnt += 1
            self.current_article_list = list_data['data']
            self.article_num += len(self.current_article_list)
            filter_res = self.filter_check(filter)
            self.all_article_num += len(self.current_article_list)
            col_crawler_log.insert('id', {'id': self.nickname,
                                          'num': self.all_article_num,
                                          'nickname': self.nickname,
                                          'time': datetime.now()})
            process.new_article_list(self.all_article_num)
            if self.save(self.current_article_list) == 'UPDATE':
                break
            if not filter_res:
                break
            time.sleep(self.delay)
    else:
        logger.warning('No load_more request captured; swipe up to load more history articles')
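# The shape of one list_data response as consumed by the loop above (values
# are illustrative): 'des' carries the paging state, 'data' the batch of
# article metadata.
example_list_data = {
    'des': {'can_msg_continue': '1',   # '0' ends the loop
            'next_offset': '10'},      # offset for the next request
    'data': [{'title': '...', 'content_url': 'http://mp.weixin.qq.com/...'}],
}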
def act_request(self):
    """
    :return: Execute the request.
    """
    resp = None
    proxy_err_cnt = 0
    while resp is None:  # a non-2xx Response is falsy, so test for None explicitly
        if proxy_err_cnt >= 3:
            logger.warning('Error fetching history article reading data %s; too many failures, giving up' % self.url)
            break
        try:
            resp = requests.post(url=self.req['url'],
                                 data=self.req['body'],
                                 headers=self.req['headers'],
                                 timeout=self.timeout,
                                 verify=True)
        except Exception as e:
            proxy_err_cnt += 1
            logger.warning('Error fetching article reading data %s %s' % (self.url, str(e)))
    return resp
def l1ll1ll11l_wcplus_(self):
    """
    :return: Execute the request; return the string "error" if it fails.
    """
    resp = None
    l11lllll11_wcplus_ = 0
    while resp is None:  # a non-2xx Response is falsy, so test for None explicitly
        if l11lllll11_wcplus_ >= 3:
            logger.warning('Error fetching history article reading data %s; too many failures, giving up' % self.url)
            return 'error'
        try:
            resp = requests.post(url=self.req['url'],
                                 data=self.req['body'],
                                 headers=self.req['headers'],
                                 timeout=self.timeout,
                                 verify=True)
        except Exception as e:
            l11lllll11_wcplus_ += 1
            logger.warning('Error fetching article reading data; retrying in 5 seconds %s %s' % (self.url, str(e)))
            time.sleep(5)
    return resp
def l1ll1ll11l_wcplus_(self):
    """
    :return: Execute the request.
        1. Send the request.
        2. Catch exceptions and retry.
        3. Return the result.
    """
    resp = None
    l11lllll11_wcplus_ = 0
    while resp is None:  # a non-2xx Response is falsy, so test for None explicitly
        if l11lllll11_wcplus_ >= 3:
            logger.warning('Error fetching the history article list %s; too many failures, giving up' % self.offset)
            break
        try:
            resp = requests.get(url=self.req['url'],
                                headers=self.req['headers'],
                                timeout=self.timeout,
                                verify=True)
        except Exception as e:
            l11lllll11_wcplus_ += 1
            logger.warning('Error fetching the history article list %s %s' % (self.offset, str(e)))
    return resp
def getHistoryArticleList(self):
    """
    :return: Execute the request.
        1. Send the request.
        2. Catch exceptions and retry.
        3. Return the result.
    """
    resp = None
    request_times = 0
    while resp is None:  # a non-2xx Response is falsy, so test for None explicitly
        if request_times >= 3:
            logger.warning('Error fetching the history article list %s; too many failures, giving up' % self.offset)
            break
        try:
            resp = requests.get(url=self.req['url'],
                                headers=self.req['headers'],
                                timeout=self.timeout,
                                verify=True)
        except Exception as e:
            request_times += 1
            logger.warning('Error fetching the history article list %s %s' % (self.offset, str(e)))
    return resp
def act_request(self):
    """
    :return: Execute the request; return the string "error" if it fails.
    """
    resp = None
    proxy_err_cnt = 0
    while resp is None:  # a non-2xx Response is falsy, so test for None explicitly
        # Too many failed attempts: give up.
        if proxy_err_cnt >= 3:
            logger.warning('Error fetching history article reading data %s; too many failures, giving up' % self.url)
            return 'error'
        try:
            resp = requests.post(url=self.req['url'],
                                 data=self.req['body'],
                                 headers=self.req['headers'],
                                 timeout=self.timeout,
                                 verify=True)
        except Exception as e:
            proxy_err_cnt += 1
            logger.warning('Error fetching article reading data; retrying in 5 seconds %s %s' % (self.url, str(e)))
            time.sleep(5)
    return resp
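# A minimal sketch of how a caller can handle this variant's failure sentinel;
# `crawler` stands for any object exposing act_request() as defined above and
# fetch_reading_data is an illustrative helper, not part of the codebase.
def fetch_reading_data(crawler):
    resp = crawler.act_request()
    if resp == 'error':          # too many failed attempts: skip this article
        return None
    return resp.json()           # otherwise parse the response body as usual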
def act_request(self):
    """
    :return: Execute the request.
        1. Send the request.
        2. Catch exceptions and retry.
        3. Return the result.
    """
    resp = None
    proxy_err_cnt = 0
    while resp is None:  # a non-2xx Response is falsy, so test for None explicitly
        if proxy_err_cnt >= 3:
            logger.warning('Error fetching the history article list %s; too many failures, giving up' % self.offset)
            break
        try:
            resp = requests.get(url=self.req['url'],
                                headers=self.req['headers'],
                                timeout=self.timeout,
                                verify=True)
        except Exception as e:
            proxy_err_cnt += 1
            logger.warning('Error fetching the history article list %s %s' % (self.offset, str(e)))
    return resp