def _update_hash(self):
    """Updates the information hash and, if an update of the whole tree is
    required, adds the pk to the update hash queue."""
    new = hash_md5(str(self))
    if new != self._info_hash:
        self._info_hash = new
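# hash_md5 is a project helper that is not shown in this section; a minimal
# sketch, assuming it simply returns the hexadecimal MD5 digest of its input
# (the project's real implementation may encode or normalise text differently):
import hashlib


def hash_md5(text):
    """Return the 32-character hex MD5 digest of `text`."""
    return hashlib.md5(str(text).encode('utf-8')).hexdigest()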
def test_information_node_creation(self):
    info_node = InformationNode(randomPK, **temp_info)
    self.assertIsInstance(info_node, InformationNode)
    self.assertEqual(info_node._pk, randomPK)
    self.assertEqual(info_node._data_holder, temp_info)
    self.assertEqual(info_node._info_hash, hash_md5(str(info_node)))
    self.assertTrue(check_valid_hash(info_node._info_hash))
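# check_valid_hash is defined elsewhere in the project; a hypothetical sketch,
# assuming it only verifies that a value looks like an MD5 hex digest
# (32 hexadecimal characters):
import re


def check_valid_hash(value):
    return bool(re.match(r'^[0-9a-fA-F]{32}$', str(value)))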
def into_url_save_dir(mysql_handle_base, job_body):
    '''Insert the task results into the url_save_dir table.'''
    for once_task in job_body['task_list']:
        url = once_task['url']
        if 'path' in once_task:
            timestamp_dir_path = once_task['path']
            file_list = os.listdir(once_task['path'])
            insert_gray_dir_tree(mysql_handle_base, url, timestamp_dir_path, file_list)
            continue
        if 'web_save_resource_num' in once_task:
            web_save_resource_num = once_task['web_save_resource_num']
            timestamp_abs_dir_clist = timestamp_dir_path.split('/')
            update_fields = {
                'html': [int(web_save_resource_num['html_num']), 'd'],
                'css': [int(web_save_resource_num['css_num']), 'd'],
                'js': [int(web_save_resource_num['js_num']), 'd'],
                'pic': [int(web_save_resource_num['img_num']), 'd']
            }
            wheres = {
                'url_hash': [hash_md5(once_task['url']), 's'],
                'timestamp': [timestamp_abs_dir_clist[-1], 's']
            }
            result = mysql_handle_base.update('url_save_dir', update_fields, wheres)
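# The update() method of mysql_handle_base is not part of this section; a
# hypothetical sketch of how the {column: [value, 's'|'d']} mapping used above
# might be rendered into a parameterized UPDATE ('s' marks string columns,
# 'd' integer columns), assuming the handle exposes an execute() method:
def update(self, table_name, update_fields, wheres):
    set_parts, params = [], []
    for column, (value, _type_code) in update_fields.items():
        set_parts.append('{} = %s'.format(column))
        params.append(value)
    where_parts = []
    for column, (value, _type_code) in wheres.items():
        where_parts.append('{} = %s'.format(column))
        params.append(value)
    sql = 'UPDATE {} SET {} WHERE {}'.format(
        table_name, ', '.join(set_parts), ' AND '.join(where_parts))
    return self.execute(sql, tuple(params))  # execute() is assumed, not shown here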
def _update_children_hash(self):
    """Updates children hash."""
    if not self._children:
        self._set_base_attribute('_children_hash', DEFAULT_HASH_VALUE)
        return
    temp = ''.join(x.get_hash() for x in self._children)
    self._children_hash = hash_md5(temp)
def hash_exist(mysql_handle_base, url, timestamp_dir_path):
    '''Check whether a record already exists in the url_save_dir table.'''
    wheres = {
        'url_hash': [hash_md5(url), 's'],
        'timestamp': [timestamp_dir_path.split('/')[-1], 's']
    }
    return wheres
def __init__(self, config_name='test_engine_conf.yaml'):
    super(Engine_Model, self).__init__()
    self.config_name = config_name
    self.CURRENT_PATH = sys.path[0]
    self.read_config_public()
    self.engine_id = hash_md5(getLocalIp() + self.engine_type + self.CURRENT_PATH)
    self.mq = multiprocessing.Queue()
    self.lock = multiprocessing.Lock()
    self.start_server_heart_beat()
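# getLocalIp is provided elsewhere in the project; a minimal sketch, assuming
# it resolves the host's primary outbound IPv4 address (a hypothetical
# implementation, not necessarily the project's own):
import socket


def getLocalIp():
    s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    try:
        # A UDP connect() sends no packets; it only selects the local interface.
        s.connect(('8.8.8.8', 80))
        return s.getsockname()[0]
    finally:
        s.close()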
def _update_hash(self): """ Updates hash of self, as well as of the tree. If hash has changed. Insert self pk into _update_hash_queue so as to inform that I have a new updated hash, and the corresponding parents should be updated too. """ old = self.get_hash() self._info._update_hash() self._update_children_hash() # assumes that all children have clean hash self._hash = hash_md5(self.get_children_hash() + self.get_info_hash()) new = self.get_hash() self._touch() if new != old: # propogate hash upwards self._update_hash_queue.add(self._pk)
def into_url_list(mysql_handle_base, job_body):
    '''Insert a record into the url_list table to store the task results.'''
    engine_describe_all = engine_describe.get_engine_describe(mysql_handle_base)
    # Fill the table with the relevant data from task_list
    table_field_descripe = TableFieldDescripe
    insert_urls(mysql_handle_base, job_body['url_list'], job_body['add_way'])
    update_fields = add_table_feild(job_body, table_field_descripe)
    # Avoid duplicating add_way when it is added again below
    if 'add_way' in update_fields:
        del update_fields['add_way']
    for once_task in job_body['task_list']:
        update_fields = add_table_feild(once_task, table_field_descripe, update_fields)
        # Avoid duplicating url when it is added again below
        if 'url' in update_fields:
            del update_fields['url']
        run_win_engine_list = []
        run_error_engine_list = []
        # Add engines that succeeded and engines that failed to their respective
        # lists (failures cover both page-analysis failures and engine start-up failures)
        # Page-analysis failures
        for field, value in once_task.iteritems():
            if field.find('_status') != -1:
                engine_code_name = field[0:field.find('_status')]
                engine_type = engine_describe_all[engine_code_name]
                if value == True:
                    run_win_engine_list.append(engine_type)
                elif value == False:
                    run_error_engine_list.append(engine_type)
        # Engine start-up failures
        run_error_engine_list.extend(job_body['run_error_engine'])
        # Convert the engine execution results to string format
        update_fields['run_win_engine'] = ['-'.join(run_win_engine_list), 's']
        update_fields['run_error_engine'] = ['-'.join(run_error_engine_list), 's']
        update_fields['waiting_engine'] = ['-'.join(update_fields['waiting_engine'][0]), 's']
        update_fields['running_engine'] = ['-'.join(update_fields['running_engine'][0]), 's']
        # Get the IP location of the task url
        ip_location = get_ip_location(once_task['url'])
        update_fields = add_table_feild(ip_location, table_field_descripe, update_fields)
        wheres = {'url_hash': [hash_md5(once_task['url']), 's']}
        result = mysql_handle_base.update('url_list', update_fields, wheres)
def __deco(mysql_handle_base, urls, add_way='test'):
    @arg_exist(table_name)
    def hash_exist(mysql_handle_base, url):
        wheres = {'url_hash': [hash_md5(url), 's']}
        return wheres

    fields = [('url_hash', 's'), ('url', 's'), ('add_way', 's'), ('add_time', 's')]
    param = []
    for url in urls:
        if hash_exist(mysql_handle_base, url) is not False:
            # The URL already exists, skip it
            continue
        param.append((hash_md5(url), url, add_way, get_format_time()))
    if not param:
        return False
    param = tuple(param)
    return mysql_handle_base.batch_insert(table_name, fields, param)
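# arg_exist is defined elsewhere; a hypothetical sketch, assuming the decorator
# turns a "wheres builder" such as hash_exist into an existence check: it runs
# a SELECT on `table_name` with the returned conditions and yields the matching
# rows, or False when nothing matches (which is what the `is not False` test
# above relies on):
def arg_exist(table_name):
    def decorator(build_wheres):
        def wrapper(mysql_handle_base, *args, **kwargs):
            wheres = build_wheres(mysql_handle_base, *args, **kwargs)
            rows = mysql_handle_base.select(table_name, ['*'], wheres)
            return rows if rows else False
        return wrapper
    return decorator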
def run(self):
    count = 0
    while True:
        # ADD_COLLECTION: accounts to re-collect; get_account(): routine collection.
        # account_list handles both a single account and a list of accounts.
        account_list = ADD_COLLECTION if ADD_COLLECTION else self.get_account()
        # length = len(threading.enumerate())  # enumerate() returns a list
        log.info('Current number of running threads: {}'.format(threading.active_count()))
        log.info('Current process: {}'.format(multiprocessing.current_process().name))
        count += 1
        log.info('Round {}'.format(count))
        if account_list is None:
            log.info('Schedule queue is empty, sleeping for 5 seconds')
            time.sleep(5)
            continue
        for account_name in account_list:
            try:
                self.search_name = account_name
                html_account = self.account_homepage()
                if html_account:
                    html = html_account
                else:
                    log.info('{}|WeChat account not found'.format(account_name))
                    continue
                urls_article = self.urls_article(html)
                # Set up the account information
                account = Account()
                account.name = self.name
                account.account = account_name
                account.tags = self.get_tags()
                account.get_account_id()
                # Deduplication against the backend store
                # ids = self.dedup(account_name) if JUDEG else ''
                # Deduplication via redis
                sentenced_keys = account.account + ' ' + str(account.account_id)
                keys = hash_md5(sentenced_keys)
                log.info('keys: {}'.format(keys))
                dedup_result = self.dedup_redis(keys)
                post_dedup_urls = []
                entity = None
                backpack_list = []
                ftp_list = []
                ftp_info = None
                for page_count, url in enumerate(urls_article):
                    try:
                        # if page_count > 5:
                        #     break
                        article = Article()
                        article.create(url, account, self.proxies)
                        log.info('No.{} article title: {}'.format(page_count, article.title))
                        log.info("Current article url: {}".format(url))
                        entity = JsonEntity(article, account)
                        log.info('Current article ID: {}'.format(entity.id))
                        article_date = datetime.datetime.fromtimestamp(int(str(article.time)[:-3]))
                        day_diff = datetime.date.today() - article_date.date()
                        if day_diff.days > 15:
                            log.info('Articles older than the maximum 15-day collection interval '
                                     'are skipped; {} articles collected'.format(page_count))
                            self.count_articles(page_count)
                            break
                        if dedup_result:
                            # title_time_str = entity.title + str(entity.time)
                            # title_time_md5 = hash_md5(title_time_str)
                            if entity.id in dedup_result:
                                log.info('Article already exists, skipping')
                                continue
                            else:
                                post_dedup_urls.append(entity.id)
                        else:
                            # title_time_str = entity.title + str(entity.time)
                            # title_time_md5 = hash_md5(title_time_str)
                            post_dedup_urls.append(entity.id)
                        # dedup_result = self.dedup_redis(entity)
                        # if dedup_result:
                        #     log.info('Article already exists, skipping')
                        # ids = ids.append({'key': entity.id, 'urls': entity.url})
                        # if entity.id in ids and JUDEG is True:
                        #     log.info('Article already exists, skipping')
                        #     continue
                        backpack = Backpack()
                        backpack.create(entity)
                        backpack_list.append(backpack.create_backpack())
                        # self.save_to_mysql(entity)
                        # self.save_to_mongo(entity.to_dict())
                        # FTP package
                        ftp_info = Ftp(entity)
                        name_xml = ftp_info.hash_md5(ftp_info.url)
                        log.info('Current article xml: {}'.format(name_xml))
                        self.create_xml(ftp_info.ftp_dict(), name_xml)
                        ftp_list.append(name_xml)
                    except Exception as run_error:
                        log.info('WeChat article parsing error {}'.format(run_error))
                        continue
                log.info("Start sending packets")
                if entity and backpack_list:
                    # Send directly to the backend
                    # entity.uploads(backpack_list)
                    entity.uploads_datacenter_relay(backpack_list)
                    entity.uploads_datacenter_unity(backpack_list)
                    log.info("Data center, three-in-one, packets sent")
                else:
                    log.info('Backpack list is empty, no data sent')
                    continue
                # todo: packet sending times out, adjust the MTU
                if ftp_info is not None:
                    entity.uploads_ftp(ftp_info, ftp_list)
                    log.info("FTP packets sent")
                if post_dedup_urls:
                    log.info('Uploading to dedup center key:{} urls:{}'.format(keys, post_dedup_urls))
                    url = 'http://47.100.53.87:8008/Schedule/CacheWx'
                    data = [{
                        "key": keys,
                        "sourceNodes": "1",
                        "sourceType": "2",
                        "urls": post_dedup_urls
                    }]
                    r = requests.post(url, data=json.dumps(data), timeout=self.timeout)
                    log.info('Dedup center upload result {}'.format(r.status_code))
            except Exception as e:
                log.exception("Account parsing error {}".format(e))
                time.sleep(30)
                if ('chrome not reachable' in str(e)) or ('Message: timeout' in str(e)):
                    raise RuntimeError('chrome not reachable')
        if ADD_COLLECTION:
            break
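# dedup_redis is implemented elsewhere; a hypothetical sketch, assuming it
# returns the set of already-collected article ids stored under the account
# key in Redis (host, port and db below are placeholders, not the project's
# real connection settings):
import redis


def dedup_redis(keys, host='127.0.0.1', port=6379, db=0):
    client = redis.StrictRedis(host=host, port=port, db=db)
    members = client.smembers(keys)
    # Decode bytes so the `entity.id in dedup_result` check works with str ids.
    return {m.decode('utf-8') if isinstance(m, bytes) else m for m in members}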
def hash_exist(mysql_handle_base, url):
    '''Check whether a url already exists in the url_list table.'''
    wheres = {'url_hash': [hash_md5(url), 's']}
    return wheres
def hash_exist(mysql_handle_base, url):
    wheres = {'url_hash': [hash_md5(url), 's']}
    return wheres
def __deco(mysql_handle_base, url, timestamp_dir_path, file_list=[], update_sign=True):
    timestamp_abs_dir_clist = timestamp_dir_path.split('/')
    fields = {
        'url_hash': [hash_md5(url), 's'],
        'timestamp': [timestamp_abs_dir_clist[-1], 's'],
        'url': [url, 's'],
        'update_time': [get_format_time(), 's'],
        'save_path': [
            '/'.join(timestamp_abs_dir_clist[timestamp_abs_dir_clist.index('web_info') + 1:]),
            's'
        ],
        'url_file': [0, 'd'],
        'main_html': [0, 'd'],
        'normal_html_html': [0, 'd'],
        'html': [0, 'd'],
        'css': [0, 'd'],
        'js': [0, 'd'],
        'pic': [0, 'd'],
        'text_json': [0, 'd'],
        'block_json': [0, 'd'],
        'border_json': [0, 'd'],
        'block_html': [0, 'd'],
        'cut_img': [0, 'd'],
        'vips_imgs_txt': [0, 'd'],
        'view_json': [0, 'd'],
        'webpage_jpeg': [0, 'd'],
        'blockpage_jpeg': [0, 'd'],
        'other': ['', 's']
    }
    for f in file_list:
        if f == 'images':
            # In the second version of the page-saving engine, resources are
            # stored under the images directory
            resource_nums = count_resource_num(timestamp_dir_path)
            fields['js'][0] = resource_nums[0]
            fields['css'][0] = resource_nums[1]
            fields['pic'][0] = resource_nums[2]
            fields['html'][0] = resource_nums[3]
        elif f in fields:
            # Add files or directories whose names match a field; in the first
            # version of the page-saving engine each resource type is stored
            # under a directory named after the corresponding field
            if os.path.isdir(pjoin(timestamp_dir_path, f)):
                f_num = len(os.listdir(pjoin(timestamp_dir_path, f)))
                fields[f][0] = f_num
            else:
                fields[f][0] = 1
        elif f.replace('.', '_') in fields:
            # Add files that carry a suffix
            fields[f.replace('.', '_')][0] = 1
        else:
            fields['other'][0] += (f + '/')
    wheres = {
        'url_hash': fields['url_hash'],
        'timestamp': fields['timestamp']
    }
    if mysql_handle_base.select(table_name, ['*'], wheres):
        if update_sign:
            mysql_handle_base.update(table_name, fields, wheres)
    else:
        mysql_handle_base.insert(table_name, fields)
    return True
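# count_resource_num is defined elsewhere; a hypothetical sketch, assuming it
# walks the saved page's images/ directory and counts resources by file
# extension, returning counts in the (js, css, pic, html) order consumed above:
import os


def count_resource_num(timestamp_dir_path):
    js = css = pic = html = 0
    images_dir = os.path.join(timestamp_dir_path, 'images')
    for _root, _dirs, files in os.walk(images_dir):
        for name in files:
            suffix = os.path.splitext(name)[1].lower()
            if suffix == '.js':
                js += 1
            elif suffix == '.css':
                css += 1
            elif suffix in ('.htm', '.html'):
                html += 1
            elif suffix in ('.png', '.jpg', '.jpeg', '.gif', '.bmp', '.webp'):
                pic += 1
    return js, css, pic, html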
def run(self):
    # self.set_name()
    # while True:
    account_list = ['大数据发布', '上海港湾集团', '绿盟365', '酌梦录', '瞄了个喵', '豪德通讯',
                    '魔都娱乐1', '大侠的小宇宙', '澳洲梦', '盛世路跑', '佛系金融女', '中卫今日热点',
                    '金华社区居委会', '昕说法', '华农海洋研会', '尘埃一生', '革镇堡街道普法',
                    '速度车行', '七分钟高清视频', '摘星少女酱', '青海省格尔木市健桥医院', '乐用好车',
                    '最强省钱喵喵君', '石柱港航', '荣盛物业长沙花语馨苑客服中心', '汕头超声集团',
                    '中奥吴郡半岛', '隽永人生', '飞鸿影视传媒', 'RGSE义乌雨具遮阳及防护用品展']
    articles = []
    ID = hash_md5(self.name)
    for name in account_list:
        if len(name) == 0:
            continue
        self.name = name
        html_account = self.account_homepage()
        if html_account:
            html, account_of_homepage = html_account
        else:
            continue
        log('start official account: ', self.name)
        urls_article = self.urls_article(html)
        account = Account()
        account.name = self.name
        account.account = account_of_homepage
        account.get_account_id()
        backpack_list = []
        for page_count, url in enumerate(urls_article):
            # if page_count < 35:
            #     continue
            article = Article()
            article.create(url, self.name)
            log('Article title:', article.title)
            log("No.{}".format(page_count))
            entity = JsonEntity(article, account)
            backpack = Backpack()
            backpack.create(entity)
            backpack_list.append(backpack.create_backpack())
            # All articles
            article_info = backpack.to_dict()
            articles.append({ID: article_info})
            # Upload to the database
            import pymongo
            conn = pymongo.MongoClient('120.78.237.213', 27017)
            sql = '''
            INSERT INTO account_http(article_url, addon, account, account_id, author, id, title)
            VALUES (%s, %s, %s, %s, %s, %s, %s)
            '''
            _tuple = (
                article.url,
                datetime.datetime.now(),
                entity.account,
                entity.account_id,
                entity.author,
                entity.id,
                entity.title
            )
            uploads_mysql(config_mysql, sql, _tuple)
            # if page_count == 5:
            #     break
        log("Sending packets")
        if entity:
            entity.uploads(backpack_list)