def count_articles(count_article):
    """Add ``count_article`` to the stored article counter of the current run.

    Counters live in the ``run_counts`` Mongo collection.  If that collection
    holds no documents at all, a fresh record (``article_count`` 0) is
    inserted first.  Afterwards every document whose ``save_name`` equals the
    current ``save_name()`` gets its ``article_count`` raised by
    ``count_article``.

    Nothing is written when ``count_article`` is 0.  Any exception is logged
    with its traceback and swallowed, so the function always returns ``None``.

    :param count_article: number of articles gathered in this batch.
    """
    stats_collection = 'run_counts'
    log.info('文章数量:{}'.format(count_article))
    try:
        if count_article == 0:
            return
        counters = mongo_conn()[stats_collection]
        if counters.find({}).count() == 0:
            # Empty collection: seed it with a record for this run so the
            # loop below has something to update.
            counters.insert({
                'save_name': save_name(),
                'account_count': 1,
                'article_count': 0,
                'start': time_strftime(),
                'end': None
            })
            log.info('插入文章数成功')
        for record in counters.find():
            if record.get('save_name') != save_name():
                continue
            previous = record.get('article_count')
            if previous:
                total = count_article + previous
            else:
                # Missing, None or 0: this batch is the whole total so far.
                total = count_article
            counters.update(
                {'save_name': save_name()},
                {'$set': {'article_count': total}},
                upsert=True)
            log.info('更新文章数量成功')
    except Exception as err:
        log.exception(err)
def get_account(self):
    """Fetch the next account from the dispatch service and count it.

    The dispatch endpoint is expected to answer with JSON whose ``data``
    field is itself a JSON string containing an ``account`` key.  After a
    successful fetch the ``run_counts`` Mongo collection is updated: the
    document whose ``save_name`` matches ``save_name()`` has its
    ``account_count`` incremented and its ``end`` timestamp refreshed; if no
    such document exists a new one is inserted.

    :return: ``[account]`` on success, ``None`` when the response carries no
        ``data`` or when any step raises.
    """
    collection_name = 'run_counts'
    try:
        url = 'http://dispatch.yunrunyuqing.com:38082/ScheduleDispatch/dispatch?type=8'
        resp = requests.get(url,
                            timeout=self.timeout,
                            headers={'Connection': 'close'})
        data = json.loads(resp.text).get('data')
        if not data:
            return None
        account = json.loads(data).get('account')

        counters = mongo_conn()[collection_name]

        def insert_run_record():
            # First account seen for this save_name: start a new record.
            counters.insert({
                'account_count': 1,
                'article_count': 0,
                'start': time_strftime(),
                'end': None,
                'save_name': save_name()
            })
            log.info("插入mongo成功")

        if counters.find({}).count() == 0:
            insert_run_record()
        else:
            for item in counters.find():
                if item.get('save_name') != save_name():
                    continue
                # A record without a usable account_count used to raise
                # TypeError here, which made the handler below throw away
                # the account that had just been dispatched.
                count = (item.get('account_count') or 0) + 1
                log.info(item)
                counters.update(
                    {'save_name': save_name()},
                    {'$set': {
                        'account_count': count,
                        'end': time_strftime()
                    }},
                    upsert=True)
                log.info("更新mongo成功")
                break
            else:
                # Loop finished without a match for the current save_name.
                log.info('找不到save_name,需要插入')
                insert_run_record()
    except Exception as e:
        # NOTE(review): Mongo failures also end up here, so a bookkeeping
        # error still discards an account that was fetched successfully.
        log.info('调度获取account出错:{}'.format(e))
        return None
    return [account]
def _record_article_count(count_article):
    """Add ``count_article`` to the current run's ``article_count`` in Mongo.

    Works on the ``run_counts`` collection: seeds it with a record when it is
    empty, then updates every document whose ``save_name`` equals
    ``save_name()``.  Does nothing for a count of 0.  Exceptions are logged
    and swallowed.
    """
    collection_name = 'run_counts'
    try:
        if count_article == 0:
            return
        db = mongo_conn()
        result = db[collection_name].find({})
        if result.count() == 0:
            db[collection_name].insert({
                'save_name': save_name(),
                'account_count': 1,
                'article_count': 0,
                'start': time_strftime(),
                'end': None
            })
            log.info('插入文章数成功')
        for item in db[collection_name].find():
            if item.get('save_name') != save_name():
                continue
            stored = item.get('article_count')
            count = count_article + stored if stored else count_article
            db[collection_name].update(
                {'save_name': save_name()},
                {'$set': {'article_count': count}},
                upsert=True)
            log.info('更新文章数量成功')
    except Exception as e:
        log.exception(e)


def urls_article(html):
    """Extract article URLs from ``html`` and record how many were found.

    Every ``"content_url":"...","copyright_stat"`` fragment is turned into a
    URL.  Fragments containing ``_biz`` are treated as permanent links and
    the ``http://mp.weixin.qq...wechat_redirect`` part is used as is; other
    fragments are prefixed with ``https://mp.weixin.qq.com``.  A fragment
    that fails its secondary pattern match is logged and skipped instead of
    aborting the whole extraction.

    :param html: page source to scan.
    :return: list of extracted URLs (possibly empty).
    """
    urls = []
    for item in re.findall('"content_url":".*?,"copyright_stat"', html):
        # Strip the leading '"content_url":"' (15 chars) and the trailing
        # '","copyright_stat"' (18 chars), then undo the HTML escaping of '&'.
        url_last = item[15:-18].replace('amp;', '')

        # Some entries are permanent links.
        if '_biz' in url_last:
            found = re.search('http://mp.weixin.qq.*?wechat_redirect', url_last)
            if found is None:
                log.info('文章链接匹配失败,跳过:{}'.format(url_last))
                continue
            urls.append(found.group())
            continue

        url = 'https://mp.weixin.qq.com' + url_last
        # Some article links are nested inside the value and need a second
        # match.
        if 'content_url' in url:
            found = re.search('"content_url":".*?wechat_redirect', url)
            if found is None:
                log.info('文章链接匹配失败,跳过:{}'.format(url))
                continue
            url = found.group()[15:].replace('amp;', '')
        urls.append(url)

    # Record the article count for this run.
    count_article = len(urls)
    log.info('文章数量:{}'.format(count_article))
    _record_article_count(count_article)
    return urls
def get_account():
    """Fetch the next account from the dispatch service and count it.

    The dispatch endpoint is expected to answer with JSON whose ``data``
    field is itself a JSON string containing an ``account`` key.  After a
    successful fetch the ``run_counts`` Mongo collection is updated: the
    document whose ``save_name`` matches ``save_name()`` has its
    ``account_count`` incremented and its ``end`` timestamp refreshed; if no
    such document exists a new one is inserted.

    :return: ``[account]`` on success, ``None`` when the response carries no
        ``data`` or when any step raises.
    """
    # Account statistics are kept in this collection.
    collection_name = 'run_counts'
    try:
        url = 'http://dispatch.yunrunyuqing.com:38082/ScheduleDispatch/dispatch?type=8'
        resp = requests.get(url, timeout=30)
        data = json.loads(resp.text).get('data')
        if not data:
            return None
        account = json.loads(data).get('account')

        counters = mongo_conn()[collection_name]

        def insert_run_record():
            # First account seen for this save_name: start a new record.
            counters.insert({
                'account_count': 1,
                'article_count': 0,
                'start': time_strftime(),
                'end': None,
                'save_name': save_name()
            })
            log.info("插入mongo成功")

        if counters.find({}).count() == 0:
            insert_run_record()
        else:
            for item in counters.find():
                if item.get('save_name') != save_name():
                    continue
                # A record without a usable account_count used to raise
                # TypeError here, which made the handler below throw away
                # the account that had just been dispatched.
                count = (item.get('account_count') or 0) + 1
                log.info(item)
                counters.update(
                    {'save_name': save_name()},
                    {'$set': {
                        'account_count': count,
                        'end': time_strftime()
                    }},
                    upsert=True)
                log.info("更新mongo成功")
                break
            else:
                # Loop finished without a match for the current save_name.
                log.info('找不到save_name,需要插入')
                insert_run_record()
    except Exception as e:
        # NOTE(review): Mongo failures also end up here, so a bookkeeping
        # error still discards an account that was fetched successfully.
        log.info('调度获取account出错:{}'.format(e))
        return None
    return [account]
def demo_test(text_model, image_model, label_dict, label_dict_en):
    """Download a captcha, recognise it with the model and submit the answer.

    Steps, in order:

    1. Download a captcha and split it into the text crop and the image
       tiles; save each piece plus a copy of the full captcha
       (``demo.png``) under ``demo_path``.
    2. Classify the tiles with ``image_model`` and map the predicted class
       indices to labels via ``label_dict`` / ``label_dict_en``.
    3. Read the target text with ``GoogleLens`` and select the tiles whose
       label equals it; when none match exactly, fall back to the closest
       label returned by ``process.extractOne``.
    4. Submit the selected tile indices, mark them on ``demo.png`` and write
       a JSON summary to ``file.txt`` in ``demo_path``.

    :param text_model: unused by this function; kept for interface
        compatibility.
    :param image_model: object with a ``predict(images)`` method.
    :param label_dict: mapping from class index to label text.
    :param label_dict_en: mapping from class index to English label.
    :return: None
    """
    image_path = utils.download_captcha()
    raw_texts, raw_images = utils.process_raw_images(
        image_path, (image_shape[0], image_shape[1]))
    utils.save_name(raw_texts[0], demo_path, 'text')
    for index, raw_image in enumerate(raw_images):
        utils.save_name(raw_image, demo_path, index)
    shutil.copy(image_path, os.path.join(demo_path, 'demo.png'))

    images = np.array([np.asarray(image) for image in raw_images])
    image_predict = image_model.predict(images)
    image_result = np.argmax(image_predict, 1)
    # Labels may contain non-breaking spaces; drop them before comparing.
    image_label = [label_dict[r].replace("\xa0", "") for r in image_result]
    image_label_en = [label_dict_en[r] for r in image_result]
    text_label = GoogleLens.get_target_text(image_path)
    print(text_label)
    print(image_label)
    print(image_label_en)

    ids = {index for index, name in enumerate(image_label)
           if name == text_label}
    if not ids:
        # No exact match: use the closest predicted label instead.
        txt, _score = process.extractOne(text_label, image_label)
        print(text_label, txt)
        text_label = txt
        ids = {index for index, name in enumerate(image_label)
               if name == txt}

    result = utils.submit_captcha(ids)
    utils.draw_circle(ids, demo_path, 'demo.png')

    label = {
        index: {'cn': name_cn, 'en': name_en}
        for index, (name_cn, name_en)
        in enumerate(zip(image_label, image_label_en))
    }
    summary = {
        'text_label': text_label,
        'text_label_en': utils.find_en_word(image_label, image_label_en,
                                            text_label),
        'label': label,
        'result': "成功" in result,
    }
    # ensure_ascii=False emits raw non-ASCII characters, so the file encoding
    # must be fixed explicitly instead of depending on the locale default.
    with open(os.path.join(demo_path, 'file.txt'), 'w',
              encoding='utf-8') as summary_file:
        summary_file.write(json.dumps(summary, indent=4, ensure_ascii=False))