def run(self):
    """Crawl a fixed list of WeChat article URLs, build packets, and upload them.

    Early/test variant of the crawler loop: the article URL list is hard-coded,
    the account identity is scraped from the article page itself, and every
    article is saved to Mongo plus packaged for FTP/datacenter upload.
    Raises RuntimeError when the driver reports 'chrome not reachable' so the
    caller can restart the browser; every other exception is logged and swallowed.
    """
    try:
        urls_article = [
            # NOTE(review): '×tamp' looks like an HTML-entity-mangled '&timestamp' —
            # confirm this sample URL is still fetchable before relying on it.
            'https://mp.weixin.qq.com/s?src=11×tamp=1541559601&ver=1229&signature=ixTsG-RvK8H58t6D-CpW6olWI8hA52Wz-FRb12ZcrNG-lxR20YutoyLYUr-RB3w8WHjE1petjDcbbxZVxTChvPWM27qszWu0Z3zonjx8SEQB5mmgm1O9Eu*5qsFhnBCH&new=1'
        ]
        entity = None          # last JsonEntity built; sentinel for the upload step
        backpack_list = []     # datacenter payloads, one per article
        ftp_list = []          # xml file names produced for the FTP upload
        ftp_info = None        # last Ftp wrapper; sentinel for the FTP upload
        for page_count, url in enumerate(urls_article):
            # if page_count < 15:
            #     continue
            html = requests.get(url)
            # Determine account info by scraping the article page header.
            name = pq(html.text)('#js_name').text()
            account_name = pq(html.text)('.profile_meta_value').eq(0).text()
            log('---{}---{}---'.format(name, account_name))
            account = Account()
            account.name = name
            account.account = account_name
            account.get_account_id()
            article = Article()
            try:
                article.create(url, account)
            except RuntimeError as run_error:
                # NOTE(review): execution falls through after this failure, so the
                # lines below read article.title from a possibly half-built Article;
                # any AttributeError is then caught by the outer handler. Confirm
                # whether a `continue` was intended here.
                log('找不到浏览器 {}'.format(run_error))
            log('第{}条 文章标题: {}'.format(page_count, article.title))
            log("当前文章url: {}".format(url))
            entity = JsonEntity(article, account)
            log('当前文章ID: {}'.format(entity.id))
            # if entity.id in ids:
            #     log('当前文章已存在,跳过')
            #     continue
            backpack = Backpack()
            backpack.create(entity)
            backpack_list.append(backpack.create_backpack())
            # self.save_to_mysql(entity)
            self.save_to_mongo(entity.to_dict())
            # Build the FTP package: one xml file per article, named by URL hash.
            ftp_info = Ftp(entity)
            name_xml = ftp_info.hash_md5(ftp_info.url)
            log('当前文章xml: {}'.format(name_xml))
            self.create_xml(ftp_info.ftp_dict(), name_xml)
            ftp_list.append(name_xml)
            # if page_count >= 3:
            #     break
        log("发包")
        # TODO: upload sometimes times out — consider adjusting the MTU.
        if ftp_info is not None:
            entity.uploads_ftp(ftp_info, ftp_list)
        if entity:
            # entity.uploads(backpack_list)
            entity.uploads_datacenter_relay(backpack_list)
            entity.uploads_datacenter_unity(backpack_list)
        log("发包完成")
    except Exception as e:
        # Broad catch: log and keep the process alive, except for a dead browser,
        # which is escalated so the supervisor can restart Chrome.
        log("解析公众号错误 {}".format(e))
        if 'chrome not reachable' in str(e):
            raise RuntimeError('chrome not reachable')
def run(self):
    """Main scheduling loop: pull account names, crawl their articles, upload packets.

    ADD_COLLECTION holds accounts for supplementary (re-)collection; when it is
    falsy, self.get_account() supplies the daily schedule. account_list is used
    so a single account and a list of accounts are handled the same way.
    Sleeps 5s when the schedule queue is empty. Per-account failures are logged
    and skipped; a dead browser ('chrome not reachable') is re-raised so the
    caller can restart it. When ADD_COLLECTION is set, runs exactly one pass.
    """
    count = 0
    while True:
        # ADD_COLLECTION = supplementary collection; get_account = daily schedule.
        account_list = ADD_COLLECTION if ADD_COLLECTION else self.get_account()
        # length = len(threading.enumerate())  # enumerate returns a list
        # log.info('当前运行的线程数为:{}'.format(threading.active_count()))
        count += 1
        log.info('第{}次'.format(count))
        if account_list is None:
            log.info('调度队列为空,休眠5秒')
            time.sleep(5)
            continue
        for account_name in account_list:
            try:
                self.search_name = account_name
                html_account = self.account_homepage()
                if html_account:
                    html = html_account
                else:
                    log.info('{}|找到不到微信号'.format(account_name))
                    continue
                urls_article = self.urls_article(html)
                # Build the account identity for this crawl.
                account = Account()
                account.name = self.name
                account.account = account_name
                account.tags = self.get_tags()
                account.get_account_id()
                # Dedup: previously-seen article ids for this account (JUDEG gates it).
                ids = self.dedup(account_name) if JUDEG else ''
                entity = None          # last JsonEntity; sentinel for upload step
                backpack_list = []     # datacenter payloads
                ftp_list = []          # xml names for the FTP upload
                ftp_info = None        # last Ftp wrapper; sentinel for FTP upload
                for page_count, url in enumerate(urls_article):
                    # if page_count < 15:
                    #     continue
                    article = Article()
                    try:
                        article.create(url, account)
                    except RuntimeError as run_error:
                        # NOTE(review): execution falls through after this failure,
                        # so article.title below may be unset — confirm whether a
                        # `continue` was intended here.
                        log.info('微信验证码错误 {}'.format(run_error))
                    log.info('第{}条 文章标题: {}'.format(
                        page_count, article.title))
                    log.info("当前文章url: {}".format(url))
                    entity = JsonEntity(article, account)
                    log.info('当前文章ID: {}'.format(entity.id))
                    if entity.id in ids and JUDEG is True:
                        log.info('当前文章已存在,跳过')
                        # if page_count >= 20:
                        #     log.info('超过20篇文章,跳出')
                        #     break
                        continue
                    backpack = Backpack()
                    backpack.create(entity)
                    backpack_list.append(backpack.create_backpack())
                    # self.save_to_mysql(entity)
                    # self.save_to_mongo(entity.to_dict())
                    # FTP package: one xml per article, named by URL hash.
                    ftp_info = Ftp(entity)
                    name_xml = ftp_info.hash_md5(ftp_info.url)
                    log.info('当前文章xml: {}'.format(name_xml))
                    self.create_xml(ftp_info.ftp_dict(), name_xml)
                    ftp_list.append(name_xml)
                    # break
                log.info("开始发包")
                # TODO: upload sometimes times out — consider adjusting the MTU.
                if ftp_info is not None:
                    entity.uploads_ftp(ftp_info, ftp_list)
                    log.info("ftp发包完成")
                if entity and backpack_list:
                    # entity.uploads(backpack_list)
                    entity.uploads_datacenter_relay(backpack_list)
                    entity.uploads_datacenter_unity(backpack_list)
                    log.info("数据中心,三合一,发包完成")
            except Exception as e:
                # Broad catch keeps the loop alive across per-account failures;
                # a dead browser is escalated for a restart.
                log.exception("解析公众号错误 {}".format(e))
                if 'chrome not reachable' in str(e):
                    raise RuntimeError('chrome not reachable')
        if ADD_COLLECTION:
            break
def run(self):
    """Scheduling loop with redis-backed dedup and a 15-day collection window.

    ADD_COLLECTION holds accounts for supplementary (re-)collection; otherwise
    self.get_account() supplies the daily schedule (account_list accommodates
    both a single account and a list). For each account: fetch its homepage,
    enumerate article URLs, skip articles already recorded in the dedup cache,
    stop at articles older than 15 days, build datacenter + FTP packets, then
    report the newly-seen article ids back to the dedup center over HTTP.
    Re-raises RuntimeError for 'chrome not reachable' / 'Message: timeout' so
    the caller can restart the browser. One pass only when ADD_COLLECTION is set.
    """
    count = 0
    while True:
        # ADD_COLLECTION = supplementary collection; get_account = daily schedule.
        account_list = ADD_COLLECTION if ADD_COLLECTION else self.get_account()
        # length = len(threading.enumerate())  # enumerate returns a list
        log.info('当前运行的线程数为:{}'.format(threading.active_count()))
        log.info('当前运行的进程:{}'.format(
            multiprocessing.current_process().name))
        count += 1
        log.info('第{}次'.format(count))
        if account_list is None:
            log.info('调度队列为空,休眠5秒')
            time.sleep(5)
            continue
        for account_name in account_list:
            try:
                self.search_name = account_name
                html_account = self.account_homepage()
                if html_account:
                    html = html_account
                else:
                    log.info('{}|找到不到微信号'.format(account_name))
                    continue
                urls_article = self.urls_article(html)
                # Build the account identity for this crawl.
                account = Account()
                account.name = self.name
                account.account = account_name
                account.tags = self.get_tags()
                account.get_account_id()
                # Dedup against the backing store (old path, disabled):
                # ids = self.dedup(account_name) if JUDEG else ''
                # Dedup via redis: cache key is md5 of "<account> <account_id>".
                sentenced_keys = account.account + ' ' + str(
                    account.account_id)
                keys = hash_md5(sentenced_keys)
                log.info('keys: {}'.format(keys))
                dedup_result = self.dedup_redis(keys)
                post_dedup_urls = []   # new article ids to report to the dedup center
                entity = None          # last JsonEntity; sentinel for upload step
                backpack_list = []     # datacenter payloads
                ftp_list = []          # xml names for the FTP upload
                ftp_info = None        # last Ftp wrapper; sentinel for FTP upload
                for page_count, url in enumerate(urls_article):
                    try:
                        # if page_count > 5:
                        #     break
                        article = Article()
                        article.create(url, account, self.proxies)
                        log.info('第{}条 文章标题: {}'.format(
                            page_count, article.title))
                        log.info("当前文章url: {}".format(url))
                        entity = JsonEntity(article, account)
                        log.info('当前文章ID: {}'.format(entity.id))
                        # article.time is treated as a millisecond timestamp:
                        # dropping the last 3 digits yields seconds — TODO confirm.
                        article_date = datetime.datetime.fromtimestamp(
                            int(str(article.time)[:-3]))
                        day_diff = datetime.date.today(
                        ) - article_date.date()
                        if day_diff.days > 15:
                            # Articles arrive newest-first (presumably), so stop
                            # the whole account once one falls outside the window.
                            log.info(
                                '超过采集interval最大15天 的文章不采集,已采集{}条文章'.format(
                                    page_count))
                            self.count_articles(page_count)
                            break
                        if dedup_result:
                            # title_time_str = entity.title + str(entity.time)
                            # title_time_md5 = hash_md5(title_time_str)
                            if entity.id in dedup_result:
                                log.info('当前文章已存在,跳过')
                                continue
                            else:
                                post_dedup_urls.append(entity.id)
                        else:
                            # No cache entry yet for this account: everything is new.
                            # title_time_str = entity.title + str(entity.time)
                            # title_time_md5 = hash_md5(title_time_str)
                            post_dedup_urls.append(entity.id)
                        # dedup_result = self.dedup_redis(entity)
                        # if dedup_result:
                        #     log.info('当前文章已存在,跳过')
                        # ids = ids.append({'key': entity.id, 'urls': entity.url})
                        # if entity.id in ids and JUDEG is True:
                        #     log.info('当前文章已存在,跳过')
                        #     continue
                        backpack = Backpack()
                        backpack.create(entity)
                        backpack_list.append(backpack.create_backpack())
                        # self.save_to_mysql(entity)
                        # self.save_to_mongo(entity.to_dict())
                        # FTP package: one xml per article, named by URL hash.
                        ftp_info = Ftp(entity)
                        name_xml = ftp_info.hash_md5(ftp_info.url)
                        log.info('当前文章xml: {}'.format(name_xml))
                        self.create_xml(ftp_info.ftp_dict(), name_xml)
                        ftp_list.append(name_xml)
                    except Exception as run_error:
                        # Per-article failures skip just that article.
                        log.info('微信解析文章错误 {}'.format(run_error))
                        continue
                log.info("开始发包")
                if entity and backpack_list:
                    # Send straight to the backend (old path, disabled):
                    # entity.uploads(backpack_list)
                    entity.uploads_datacenter_relay(backpack_list)
                    entity.uploads_datacenter_unity(backpack_list)
                    log.info("数据中心,三合一,发包完成")
                else:
                    # Nothing collected: skip FTP and dedup reporting too.
                    log.info('包列表为空,不发送数据')
                    continue
                # TODO: upload sometimes times out — consider adjusting the MTU.
                if ftp_info is not None:
                    entity.uploads_ftp(ftp_info, ftp_list)
                    log.info("ftp发包完成")
                if post_dedup_urls:
                    # Report newly-seen ids to the dedup center so other nodes skip them.
                    log.info('上传判重中心key:{} urls:{}'.format(
                        keys, post_dedup_urls))
                    url = 'http://47.100.53.87:8008/Schedule/CacheWx'
                    data = [{
                        "key": keys,
                        "sourceNodes": "1",
                        "sourceType": "2",
                        "urls": post_dedup_urls
                    }]
                    r = requests.post(url,
                                      data=json.dumps(data),
                                      timeout=self.timeout)
                    log.info('上传判重中心结果{}'.format(r.status_code))
            except Exception as e:
                # Broad catch keeps the loop alive; back off 30s, and escalate a
                # dead/unresponsive browser so the supervisor can restart it.
                log.exception("解析公众号错误 {}".format(e))
                time.sleep(30)
                if ('chrome not reachable'
                        in str(e)) or ('Message: timeout' in str(e)):
                    raise RuntimeError('chrome not reachable')
        if ADD_COLLECTION:
            break
def run(self):
    """Crawl loop variant: iterate scheduled accounts, crawl articles, upload packets.

    account_list comes from ADD_COLLECTION (supplementary collection) when set,
    else from self.account_list(). For each account: locate the account homepage,
    enumerate article URLs, skip already-seen article ids (self.dedup), save each
    article to Mongo, and build FTP + datacenter packets which are uploaded after
    the account's articles are processed. Per-account failures are logged and the
    loop continues; 'chrome not reachable' is re-raised so the caller can restart
    the browser.
    """
    count = 0
    while True:
        count += 1
        log.info('第{}次'.format(count))
        account_list = ADD_COLLECTION if ADD_COLLECTION else self.account_list(
        )
        # if account_list:
        #     continue
        # for account_name in account_list:
        try:
            for account_name in account_list:
                log.info('第{}次'.format(count))
                self.search_name = account_name
                html_account = self.account_homepage()
                if html_account:
                    html = html_account
                else:
                    # BUGFIX: the original string had no '{}' placeholder, so
                    # .format(account_name) silently dropped the account name.
                    log.info('找到不到微信号首页: {}'.format(account_name))
                    continue
                urls_article = self.urls_article(html)
                # Build the account identity for this crawl.
                account = Account()
                account.name = self.name
                account.account = account_name
                account.get_account_id()
                # Previously-seen article ids for this account.
                ids = self.dedup(account_name)
                entity = None          # last JsonEntity; sentinel for upload step
                backpack_list = []     # datacenter payloads
                ftp_list = []          # xml names for the FTP upload
                ftp_info = None        # last Ftp wrapper; sentinel for FTP upload
                for page_count, url in enumerate(urls_article):
                    # if page_count < 15:
                    #     continue
                    article = Article()
                    try:
                        article.create(url, account)
                    except RuntimeError as run_error:
                        # NOTE(review): execution falls through after this failure,
                        # so article.title below may be unset — confirm whether a
                        # `continue` was intended here.
                        log.info('找不到浏览器 {}'.format(run_error))
                    log.info('第{}条 文章标题: {}'.format(
                        page_count, article.title))
                    log.info("当前文章url: {}".format(url))
                    entity = JsonEntity(article, account)
                    log.info('当前文章ID: {}'.format(entity.id))
                    if entity.id in ids:
                        log.info('当前文章已存在,跳过')
                        continue
                    backpack = Backpack()
                    backpack.create(entity)
                    backpack_list.append(backpack.create_backpack())
                    # self.save_to_mysql(entity)
                    self.save_to_mongo(entity.to_dict())
                    # FTP package: one xml per article, named by URL hash.
                    ftp_info = Ftp(entity)
                    name_xml = ftp_info.hash_md5(ftp_info.url)
                    log.info('当前文章xml: {}'.format(name_xml))
                    self.create_xml(ftp_info.ftp_dict(), name_xml)
                    ftp_list.append(name_xml)
                    # if page_count >= 3:
                    #     break
                log.info("发包")
                # TODO: upload sometimes times out — consider adjusting the MTU.
                if ftp_info is not None:
                    entity.uploads_ftp(ftp_info, ftp_list)
                if entity:
                    # entity.uploads(backpack_list)
                    entity.uploads_datacenter_relay(backpack_list)
                    entity.uploads_datacenter_unity(backpack_list)
                log.info("发包完成")
        except Exception as e:
            # Broad catch keeps the loop alive; a dead browser is escalated so
            # the supervisor can restart Chrome.
            log.exception("解析公众号错误 {}".format(e))
            if 'chrome not reachable' in str(e):
                raise RuntimeError('chrome not reachable')
            continue