def __init__(self, tab_urls, depth, process_num=None):
    '''
    @summary:
    ---------
    @param tab_urls:
    @param depth:
    @param process_num: process number
    ---------
    @result:
    '''
    super(Collector, self).__init__()
    self._db = RedisDB()
    self._thread_stop = False
    self._urls = collections.deque()
    self._null_times = 0
    self._tab_urls = tab_urls
    self._depth = depth  # or int(tools.get_conf_value('config.conf', "collector", "depth"))
    self._interval = int(tools.get_conf_value('config.conf', "collector", "sleep_time"))
    self._allowed_null_times = int(tools.get_conf_value('config.conf', "collector", 'allowed_null_times'))
    self._url_count = int(tools.get_conf_value('config.conf', "collector", "url_count"))
    self._url_manager = UrlManager(tab_urls)
    self._finished_callback = None
    self._is_show_wait = False
    self._tab_worker_status = 'news:worker_status'
    self._worker_mark = LOCAL_HOST_IP + ('_%s' % process_num if process_num else '')
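The constructor reads its knobs from a [collector] section of config.conf via tools.get_conf_value. A plausible section is sketched below; the key names come from the calls above, but every value is an illustrative assumption (depth is only used by the commented-out fallback):

[collector]
depth = 3
sleep_time = 2
allowed_null_times = 10
url_count = 100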
def __init__(self):
    self._mysqldb = MysqlDB(**config.get('mysqldb'))
    self._redis = RedisDB(**config.get('redisdb'))
    self._task_root_key = config.get('spider').get('redis_task_cache_root_key')
    self._account_task_key = self._task_root_key + ':z_account_task'
    self._article_task_key = self._task_root_key + ':z_article_task'
    self._last_article_publish_time = self._task_root_key + ':h_last_article_publish_time'
    self._new_last_article_publish_time = self._task_root_key + ':h_new_last_article_publish_time'
    self._ignore_haved_crawl_today_article_account = config.get('spider').get('ignore_haved_crawl_today_article_account')
    self._monitor_interval = config.get('spider').get('monitor_interval')
    self._zombie_account_not_publish_article_days = config.get('spider').get('zombie_account_not_publish_article_days')
    self._spider_interval_min = config.get('spider').get('spider_interval').get('min_sleep_time')
    self._spider_interval_max = config.get('spider').get('spider_interval').get('max_sleep_time')
    self._crawl_time_range = (config.get("spider").get("crawl_time_range") or "~").split('~')
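The chained config.get calls above imply a nested configuration document. A minimal sketch of the expected layout follows; only the key names are taken from the code, every value (and the DB connection parameter names) is a placeholder assumption:

config = {
    'mysqldb': {'host': 'localhost', 'port': 3306, 'user': 'root', 'passwd': '***', 'db': 'spider'},
    'redisdb': {'ip_ports': 'localhost:6379', 'db': 0},
    'spider': {
        'redis_task_cache_root_key': 'wechat',
        'ignore_haved_crawl_today_article_account': True,
        'monitor_interval': 600,
        'zombie_account_not_publish_article_days': 30,
        'spider_interval': {'min_sleep_time': 5, 'max_sleep_time': 30},
        'crawl_time_range': '8:00~23:00',
    },
}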
def __init__(self, table_article='articles'):
    if not hasattr(self, '_table_article'):
        super(ArticleManager, self).__init__()
        self._thread_stop = False
        self._articles_deque = collections.deque()
        self._db = RedisDB()
        self._table_article = table_article
def monitor_proxies():
    redis_0 = RedisDB()
    config = os.path.join(os.path.dirname(__file__), '../config.conf')
    redis_key = tools.get_conf_value(config, 'redis', 'redis_key')
    redis_key2 = tools.get_conf_value(config, 'redis', 'redis_key2')
    total = redis_0.count(redis_key)    # renamed from `sum` to avoid shadowing the builtin
    total2 = redis_0.count(redis_key2)
    log.debug("douban: %d ips left in the redis pool" % total)
    log.debug("weibo: %d ips left in the redis pool" % total2)
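A minimal driver sketch for running the monitor periodically, assuming the module is executed directly and that time is imported; the 60-second interval is illustrative:

if __name__ == '__main__':
    while True:
        monitor_proxies()
        time.sleep(60)  # hypothetical polling interval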
def __init__(self, table_url='urls'):
    if not hasattr(self, '_table_url'):
        super(UrlManager, self).__init__()
        self._thread_stop = False
        self._urls_deque = collections.deque()
        self._db = RedisDB()
        self._table_url = table_url
        self._table_url_dupefilter = self._table_url + '_dupefilter'
        self._table_url_end_depth_dupefilter = self._table_url + '_end_depth_dupefilter'
def __init__(self, table_article='articles'):
    if not hasattr(self, '_table_article'):
        super(ArticleManager, self).__init__()
        self._thread_stop = False
        self._articles_deque = collections.deque()
        self._db = RedisDB()
        self._table_article = table_article
        self._table_article_bloomfilter = table_article + '_bloomfilter'
        self._bloomfilter = BloomFilter(redis_obj=self._db, key=self._table_article_bloomfilter)
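A minimal sketch of how the bloom filter could gate inserts. The add_article helper and the BloomFilter method names (is_contains, insert) are assumptions, not part of the original class; only self._bloomfilter and self._articles_deque come from the code above:

def add_article(self, article):
    # hypothetical dedupe step: skip articles whose url was already seen
    url = article.get('url', '')
    if self._bloomfilter.is_contains(url):  # assumed BloomFilter method
        return False
    self._bloomfilter.insert(url)  # assumed BloomFilter method
    self._articles_deque.append(article)
    return True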
def random_proxy():
    try:
        redis = RedisDB()
        ip_pools = redis.sget(table=redis_key, count=1)
        proxy = random.choice(ip_pools)
        proxies = {
            "http": proxy,
            "https": proxy,
        }
    except Exception as e:
        print(e)
        proxies = {}
    return proxies
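A usage sketch, assuming the requests library is available and that pool entries are bare 'host:port' strings (both assumptions; the target URL is illustrative):

import requests

proxies = random_proxy()
# requests expects a scheme on proxy URLs; an empty dict falls back to a direct connection
proxies = {k: v if '://' in v else 'http://' + v for k, v in proxies.items()}
response = requests.get('https://httpbin.org/ip', proxies=proxies, timeout=10)
print(response.text)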
def __init__(self, tab_urls, depth):
    super(Collector, self).__init__()
    self._db = RedisDB()
    self._thread_stop = False
    self._urls = collections.deque()
    self._null_times = 0
    self._tab_urls = tab_urls
    self._depth = depth  # or int(tools.get_conf_value('config.conf', "collector", "depth"))
    self._interval = int(tools.get_conf_value('config.conf', "collector", "sleep_time"))
    self._allowed_null_times = int(tools.get_conf_value('config.conf', "collector", 'allowed_null_times'))
    self._url_count = int(tools.get_conf_value('config.conf', "collector", "url_count"))
    self._url_manager = UrlManager(tab_urls)
    self._finished_callback = None
    self._is_show_wait = False
def __init__(self):
    super(SyncArtice, self).__init__()
    self._es = ES()
    self._redis = RedisDB()
    self._sync_count = 0
print(f"{bcolors.OKGREEN}Client 2:{bcolors.ENDC} " f"{bcolors.OKCYAN}Updating resource '" + RESOURCE_NAME + f"' {bcolors.ENDC}") # Update the resource redis.update(name=RESOURCE_NAME, key="client", value="client_2") redis.update(name=RESOURCE_NAME, key="random", value=random.random()) else: print( f"{bcolors.FAIL}Client 2: Error acquiring the lock on resource '" + RESOURCE_NAME + f"' {bcolors.ENDC}") # Create Redis instance redis = RedisDB() # Create Redlock instance dlm = Redlock([ { "host": "localhost", "port": 6379, "db": 0 }, ]) print(f"{bcolors.OKBLUE}## EXECUTING TEST 6 ##{bcolors.ENDC}") print( f"{bcolors.OKBLUE} Several clients, Several locks, one resource, client blocked {bcolors.ENDC}" )
def __init__(self): self.redis = RedisDB() self.test_url = "https://movie.douban.com/"
def __init__(self):
    self.redis = RedisDB()
    self.test_url = 'https://m.weibo.cn/'
def __init__(self):
    self._oracledb = OracleDB()
    self._redisdb = RedisDB()
    self._wechat_sogo = WechatSogou()
class WechatService():
    _db = OracleDB()
    _es = ES()
    _redisdb = RedisDB()
    _wechat_sogou = WechatSogou()
    _wechat_public_platform = WechatPublicPlatform()

    _todo_accounts = collections.deque()
    _rownum = 1

    _is_done = False  # finished one full round
    _is_all_done = False  # articles published today have been crawled for every account

    # last time wechat_sogou was usable (not blocked)
    _wechat_sogou_enable = True
    _wechat_sogou_last_unenable_time = tools.get_current_timestamp()

    # last time wechat_public_platform was usable (not blocked)
    _wechat_public_platform_enable = True
    _wechat_public_platform_last_unenable_time = tools.get_current_timestamp()

    def __init__(self):
        pass

    def __load_todo_account(self):
        accounts = WechatService._redisdb.sget('wechat:account', count=1)

        for account in accounts:
            account = eval(account)  # accounts are cached as python-literal strings
            WechatService._todo_accounts.append(account)

    def is_have_new_article(self, account_id, account_name, __biz):
        '''
        @summary: check whether the account has newly published articles
        ---------
        @param account_id:
        @param __biz:
        ---------
        @result:
        '''
        result = ''
        if WechatService._wechat_sogou_enable:  # sogou wechat is usable
            result = WechatService._wechat_sogou.is_have_new_article(
                account_id=account_id, account=account_name)
            if result == constance.UPDATE:
                # new articles published; crawl them
                pass
            elif result == constance.NOT_UPDATE:
                # no new articles
                pass
            elif result == constance.ERROR:
                pass
            elif result == constance.VERIFICATION_CODE:
                # blocked; the request failed, so record the failure time
                WechatService._wechat_sogou_enable = False
                WechatService._wechat_sogou_last_unenable_time = tools.get_current_timestamp()

        # sogou wechat has been disabled for over 24 hours; worth another try
        elif tools.get_current_timestamp() - WechatService._wechat_sogou_last_unenable_time > TIME_INTERVAL:
            result = WechatService._wechat_sogou.is_have_new_article(
                account_id=account_id, account=account_name)
            if result == constance.UPDATE:
                # sogou wechat is usable again
                WechatService._wechat_sogou_enable = True
            elif result == constance.NOT_UPDATE:
                pass
            elif result == constance.ERROR:
                pass
            elif result == constance.VERIFICATION_CODE:
                pass

            # refresh the last-unusable timestamp
            WechatService._wechat_sogou_last_unenable_time = tools.get_current_timestamp()

        # if sogou wechat is unusable, fall back to the wechat public platform
        if not result or result == constance.VERIFICATION_CODE:
            if WechatService._wechat_public_platform_enable:  # public platform is usable
                result = WechatService._wechat_public_platform.is_have_new_article(__biz)
                if result == constance.UPDATE:
                    # new articles published; crawl them
                    pass
                elif result == constance.NOT_UPDATE:
                    # no new articles
                    pass
                elif result == constance.ERROR:
                    # blocked; the request failed, so record the failure time
                    WechatService._wechat_public_platform_enable = False
                    WechatService._wechat_public_platform_last_unenable_time = tools.get_current_timestamp()

            # blocked, but a day has passed; worth another try
            elif tools.get_current_timestamp() - WechatService._wechat_public_platform_last_unenable_time > TIME_INTERVAL:
                result = WechatService._wechat_public_platform.is_have_new_article(__biz)
                if result == constance.UPDATE:
                    # public platform is usable again
                    WechatService._wechat_public_platform_enable = True
                elif result == constance.NOT_UPDATE:
                    # no new articles
                    pass
                elif result == constance.ERROR:
                    # blocked; the request failed
                    pass

                # refresh the last-unusable timestamp
                WechatService._wechat_public_platform_last_unenable_time = tools.get_current_timestamp()

        return result

    def get_next_account(self):
        '''
        @summary:
        ---------
        ---------
        @result: returns biz and whether a full round is done, e.g. (biz, True)
        '''
        if not WechatService._todo_accounts:
            self.__load_todo_account()
        if not WechatService._todo_accounts:
            return None

        oracle_id, account_id, account_name, last_article_release_time, biz = WechatService._todo_accounts.popleft()

        next_account_id = account_id
        next_account_biz = biz
        next_account_name = account_name
        next_account = next_account_id, next_account_biz

        sql = "update TAB_IOPM_SITE t set t.spider_status=602 where t.biz = '%s'" % (
            next_account_biz)
        WechatService._db.update(sql)

        return next_account

    def update_account_article_num(self, __biz):
        # query ES for the counts
        # today's count
        body = {
            "size": 0,
            "query": {
                "filtered": {
                    "filter": {
                        "range": {
                            "record_time": {
                                "gte": tools.get_current_date('%Y-%m-%d') + ' 00:00:00',
                                "lte": tools.get_current_date('%Y-%m-%d') + ' 23:59:59'
                            }
                        }
                    },
                    "query": {
                        'match': {
                            "__biz": __biz
                        }
                    }
                }
            }
        }
        result = WechatService._es.search('wechat_article', body)
        today_msg = result.get('hits', {}).get('total', 0)

        # total historical message count
        body = {
            "size": 0,
            "query": {
                "filtered": {
                    "query": {
                        'match': {
                            "__biz": __biz
                        }
                    }
                }
            }
        }
        result = WechatService._es.search('wechat_article', body)
        total_msg = result.get('hits', {}).get('total', 0)

        if total_msg:
            sql = "update TAB_IOPM_SITE t set t.today_msg = %d, t.total_msg = %d, t.spider_status=603 where t.biz = '%s'" % (
                today_msg, total_msg, __biz)
        else:
            sql = "update TAB_IOPM_SITE t set t.today_msg = %d, t.spider_status=603 where t.biz = '%s'" % (
                today_msg, __biz)
        print(sql)
        WechatService._db.update(sql)

    def is_exist(self, table, data_id):
        if WechatService._es.get(table, data_id=data_id, doc_type=table):
            return True
        else:
            return False

    def add_article_info(self, article_info):
        '''
        @summary:
        ---------
        @param article_info:
        ---------
        @result:
        '''
        log.debug('''
            -----article info-----
            title        %s
            release time %s
            author       %s
            account      %s
            url          %s
            ''' % (article_info['title'], article_info['release_time'],
                   article_info['author'], article_info['account'],
                   article_info['url']))

        WechatService._es.add('wechat_article', article_info, article_info.get('article_id'))

    def add_account_info(self, account_info):
        log.debug('''
            -----account info-----
            %s''' % tools.dumps_json(account_info))

        WechatService._es.add('wechat_account', account_info, account_info.get('__biz'))
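A driver sketch tying the service methods together. The loop shape, the empty account_name placeholder, and the crawl helper are assumptions; the method calls and constance.UPDATE come from the class above:

service = WechatService()
while True:
    next_account = service.get_next_account()
    if not next_account:
        break  # no accounts cached in redis yet
    account_id, biz = next_account
    status = service.is_have_new_article(account_id, '', biz)  # account_name unknown here
    if status == constance.UPDATE:
        crawl(biz)  # hypothetical fetch step
    service.update_account_article_num(biz)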
def __init__(self):
    self.redis = RedisDB()
import socket
import os
import signal
import time

from storage_module.dection_ping_proxy import check_ip
from retrying import retry  # note: shadowed by the local retry decorator defined below

# def write(content_info):
#     f = open('D:\start_get_ip\pid.txt', 'a', encoding="utf-8")
#     f.write(str(content_info) + "\n")
#     f.close()

# pid = os.getpid()
# print(pid)
# write(pid)
# while True:
#     print(1)
#     time.sleep(34)

redis_0 = RedisDB()
MAX_POOL = 400
config = os.path.join(r'D:\proxy', 'config.conf')
redis_key = tools.get_conf_value(config, 'redis', 'redis_key')
redis_key2 = tools.get_conf_value(config, 'redis', 'redis_key2')
bj_ip = socket.gethostbyname(socket.gethostname())


def retry(attempt):
    def decorator(func):
        def wrapper(*args, **kw):
            att = 0
            while att < attempt:
                try:
                    return func(*args, **kw)
                except Exception:
                    # the original snippet is truncated here; retrying on any
                    # failure up to `attempt` times is the assumed completion
                    att += 1
        return wrapper
    return decorator
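A usage sketch for the local retry decorator; the decorated function and the attempt count are illustrative, and check_ip's argument is a placeholder:

@retry(3)  # give up after three failed attempts
def probe_proxy():
    # hypothetical flaky operation; raises on transient failures
    return check_ip('127.0.0.1')

result = probe_proxy()  # returns None if every attempt fails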
class TaskService():
    _task_ring_buff = RingBuff(TASK_BUFFER_SIZE)
    _offset = 1
    _lock = threading.RLock()

    _spider_start_timestamp = 0
    _spider_end_timestamp = 0
    _total_task_size = 0

    _db = OracleDB()
    _redisdb = RedisDB()

    def __init__(self):
        pass

    def load_task(self):
        if TaskService._offset == 1:
            log.info('starting a new crawl round')
            TaskService._spider_start_timestamp = tools.get_current_timestamp()
            TaskService._total_task_size = 0

            # clear the url tables
            TaskService._redisdb.clear('news:news_urls')
            TaskService._redisdb.clear('news:news_urls_dupefilter')

        task_sql = '''
            select *
              from (select t.id, t.name, t.position, t.url, t.depth, rownum r
                      from TAB_IOPM_SITE t
                     where classify = 1
                       and t.mointor_status = 701
                       and t.position != 35
                       and rownum < {page_size})
             where r >= {offset}
        '''.format(page_size=TaskService._offset + TASK_BUFFER_SIZE, offset=TaskService._offset)

        TaskService._offset += TASK_BUFFER_SIZE

        print(task_sql)
        tasks = TaskService._db.find(task_sql)
        TaskService._total_task_size += len(tasks)

        if not tasks:
            TaskService._spider_end_timestamp = tools.get_current_timestamp()
            log.info('finished a round; processed %s sites in %s' % (
                TaskService._total_task_size,
                tools.seconds_to_h_m_s(TaskService._spider_end_timestamp - TaskService._spider_start_timestamp)))
            TaskService._offset = 1
            self.load_task()
            return  # the recursive call already filled the ring buffer

        TaskService._task_ring_buff.put_data(tasks)

    def get_task(self, count=TASK_COUNT):
        TaskService._lock.acquire()  # lock
        tasks = TaskService._task_ring_buff.get_data(count)
        if not tasks:
            self.load_task()
            tasks = TaskService._task_ring_buff.get_data(count)
        TaskService._lock.release()

        return {'tasks': tasks, 'thread_count': THREAD_COUNT}

    def update_task_status(self, tasks, status):
        TaskService._lock.acquire()  # lock
        for task in tasks:
            website_id = task[0]
            sql = "update tab_iopm_site t set t.spider_time = to_date('%s', 'yyyy-mm-dd hh24:mi:ss'), t.spider_status = %s where id = %s" % (
                tools.get_current_date(), status, website_id)
            TaskService._db.update(sql)
        TaskService._lock.release()
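RingBuff is referenced above but not defined in this section. A minimal deque-backed sketch that satisfies the put_data/get_data calls; the implementation is an assumption, only the class name, constructor argument, and method names come from the code:

import collections
import threading

class RingBuff:
    '''Bounded FIFO task buffer; the oldest tasks are dropped once capacity is exceeded.'''

    def __init__(self, size):
        self._deque = collections.deque(maxlen=size)
        self._lock = threading.RLock()

    def put_data(self, datas):
        with self._lock:
            self._deque.extend(datas)

    def get_data(self, count):
        with self._lock:
            datas = []
            while self._deque and len(datas) < count:
                datas.append(self._deque.popleft())
            return datas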
def __init__(self):
    self._oracledb = OracleDB()
    self._redisdb = RedisDB()
    self._news_url_table = 'news:news_urls'
    self._news_urls_dupefilter = 'news:news_urls_dupefilter'