def get_non_seed_details(self, queue_id):
    if queue_id is None:
        return []
    query = """
        SELECT * FROM details
        WHERE queue_id = %(queue_id)s
        AND active = %(active)s
        AND last_used < %(last_used_cutoff)s
        ORDER BY last_used ASC
        LIMIT %(limit)s;
        """
    active_params = {
        'queue_id': queue_id,
        'active': True,
        'last_used_cutoff': LAST_USED_CUTOFF,
        'limit': ACTIVE_LIMIT
    }
    inactive_params = {
        'queue_id': queue_id,
        'active': False,
        'last_used_cutoff': LAST_USED_CUTOFF,
        'limit': INACTIVE_LIMIT
    }
    active = [Detail(**d) for d in self.do_query(query, active_params)]
    inactive = [Detail(**d) for d in self.do_query(query, inactive_params)]
    return active + inactive
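# Detail(**d) requires each row to come back as a mapping, so do_query is
# presumably built on a dict-style cursor. A minimal sketch of such a helper,
# assuming psycopg2 with RealDictCursor and a self.connection attribute (both
# assumptions, not confirmed by the source):
def do_query(self, query, params=None):
    from psycopg2.extras import RealDictCursor
    cursor = self.connection.cursor(cursor_factory=RealDictCursor)
    cursor.execute(query, params)
    results = cursor.fetchall()
    cursor.close()
    return results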
def get_seed_details(self):
    self.init_seed_details()
    query = """
        SELECT * FROM details
        WHERE queue_id = %(queue_id)s
        AND active = %(active)s
        AND last_used < %(last_used_cutoff)s
        ORDER BY last_used ASC
        LIMIT %(limit)s;
        """
    active_params = {
        'queue_id': SEED_QUEUE_ID,
        'active': True,
        'last_used_cutoff': LAST_USED_CUTOFF,
        'limit': INITIAL_SEED_COUNT
    }
    inactive_params = {
        'queue_id': SEED_QUEUE_ID,
        'active': False,
        'last_used_cutoff': LAST_USED_CUTOFF,
        'limit': INITIAL_SEED_COUNT
    }
    active = [Detail(**d) for d in self.do_query(query, active_params)]
    inactive = [Detail(**d) for d in self.do_query(query, inactive_params)]
    return active + inactive
def create_new_details(self, queue, count=ACTIVE_LIMIT + INACTIVE_LIMIT):
    fetched_pids_key = "%s%s" % (NEW_QUEUE_PROXY_IDS_PREFIX, queue.domain)
    fetched_pids = list(self.redis_mgr.redis.smembers(fetched_pids_key))
    proxy_ids = self.db_mgr.get_unused_proxy_ids(queue, count, fetched_pids)
    for proxy_id in proxy_ids:
        self.redis_mgr.redis.sadd(fetched_pids_key, proxy_id)
        proxy_key = 'p_%s' % proxy_id
        if not self.redis_mgr.redis.exists(proxy_key):
            raise Exception(
                "Error while trying to create a new detail: proxy key does not exist in redis cache for proxy id %s"
                % proxy_id)
        # Skip proxies that already have a detail for this queue.
        if self.redis_mgr.redis.exists('d_%s_%s' % (queue.queue_key, proxy_key)):
            continue
        detail_kwargs = {
            'proxy_id': proxy_id,
            'proxy_key': proxy_key,
            'queue_id': queue.id(),
            'queue_key': queue.queue_key
        }
        new_detail = Detail(**detail_kwargs)
        self.redis_mgr.register_detail(new_detail, bypass_db_check=True)
def dequeue(self):
    if self.is_empty():
        raise RedisDetailQueueEmpty(
            "No proxies available for queue key %s" % self.queue.queue_key)
    detail_key = self.redis.lpop(self.redis_key)
    detail = Detail(**self.redis.hgetall(detail_key))
    return detail
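# A minimal usage sketch (the RedisDetailQueue constructor arguments are an
# assumption). Note that hgetall only returns the str-keyed dict needed for
# Detail(**...) when the underlying redis client was created with
# decode_responses=True:
rdq = RedisDetailQueue(queue)  # hypothetical instantiation
try:
    detail = rdq.dequeue()
except RedisDetailQueueEmpty:
    detail = None  # caller decides how to handle an empty queue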
def get_detail_by_queue_and_proxy(self, queue_id, proxy_id):
    query = "SELECT * FROM details WHERE proxy_id=%(proxy_id)s AND queue_id=%(queue_id)s"
    params = {'queue_id': queue_id, 'proxy_id': proxy_id}
    cursor = self.cursor()
    cursor.execute(query, params)
    detail_data = cursor.fetchone()
    cursor.close()
    if detail_data is None:
        return None
    return Detail(**detail_data)
def new_proxy(self, address, port, protocol='http'):
    existing = self.redis_mgr.get_proxy_by_address_and_port(address, port)
    if existing is None:
        new_proxy = self.redis_mgr.register_proxy(
            Proxy(address, port, protocol))
        # Every brand-new proxy starts out attached to the seed queue.
        new_detail = Detail(proxy_key=new_proxy.proxy_key,
                            queue_id=SEED_QUEUE_ID)
        try:
            self.redis_mgr.register_detail(new_detail)
            self.redis_mgr.redis.sadd(NEW_DETAILS_SET_KEY,
                                      new_detail.detail_key)
        except DetailExistsException:
            pass
    else:
        logger.warning(
            "proxy with address %s and port %s already exists in the cache/db.",
            address, port)
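# Example call (the manager name, address, and port are illustrative):
manager.new_proxy('127.0.0.1', 8080, protocol='https')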
def sync_to_db(self):
    logging.info("STARTING SYNC")
    # Objects created since the last sync live under temporary "qt_*"/"pt_*"
    # keys in redis and have no database ids yet.
    new_queues = [
        Queue(**self.redis_mgr.redis.hgetall(q))
        for q in self.redis_mgr.redis.keys("qt_*")
    ]
    new_proxies = [
        Proxy(**self.redis_mgr.redis.hgetall(p))
        for p in self.redis_mgr.redis.keys("pt_*")
    ]
    new_detail_keys = set(
        self.redis_mgr.redis.keys('d_qt*') +
        self.redis_mgr.redis.keys('d_*pt*'))
    for ndk in new_detail_keys:
        self.redis_mgr.redis.sadd(NEW_DETAILS_SET_KEY, ndk)
    new_details = [
        Detail(**self.redis_mgr.redis.hgetall(d)) for d in new_detail_keys
    ]

    cursor = self.db_mgr.cursor()
    queue_keys_to_id = {}
    proxy_keys_to_id = {}
    for q in new_queues:
        self.db_mgr.insert_queue(q, cursor)
        queue_id = cursor.fetchone()[0]
        queue_keys_to_id[q.queue_key] = queue_id
    for p in new_proxies:
        try:
            self.db_mgr.insert_proxy(p, cursor)
            proxy_id = cursor.fetchone()[0]
            proxy_keys_to_id[p.proxy_key] = proxy_id
        except psycopg2.errors.UniqueViolation:
            # The proxy already exists in the db; its details are skipped below.
            proxy_keys_to_id[p.proxy_key] = None

    for d in new_details:
        if d.proxy_id is None:
            new_proxy_id = proxy_keys_to_id[d.proxy_key]
            if new_proxy_id is None:
                continue
            d.proxy_id = new_proxy_id
        if d.queue_id is None:
            d.queue_id = queue_keys_to_id[d.queue_key]
        self.db_mgr.insert_detail(d, cursor)

    # Details that were modified (but not created) since the last sync.
    changed_details = [
        Detail(**self.redis_mgr.redis.hgetall(d))
        for d in self.redis_mgr.redis.sdiff('changed_details', 'new_details')
    ]
    for changed in changed_details:
        if changed.queue_id is None or changed.proxy_id is None:
            raise Exception(
                "Unable to get a queue_id or proxy_id for an existing detail")
        self.db_mgr.update_detail(changed)

    cursor.close()
    self.redis_mgr.redis.flushall()
    logging.info("SYNC COMPLETE")
    return True
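# cursor.fetchone()[0] after insert_queue/insert_proxy only yields the new row
# id if those helpers use Postgres's INSERT ... RETURNING. A minimal sketch of
# the assumed pattern (table and column names are illustrative, not confirmed
# by the source):
def insert_queue(self, queue, cursor):
    cursor.execute(
        "INSERT INTO queues (queue_key, domain) "
        "VALUES (%(queue_key)s, %(domain)s) RETURNING queue_id;",
        {'queue_key': queue.queue_key, 'domain': queue.domain})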
def get_all_queue_details(self, queue_key):
    key_match = 'd_%s*' % queue_key
    keys = self.redis.keys(key_match)
    details = [Detail(**self.redis.hgetall(key)) for key in keys]
    return details
def get_detail(self, redis_detail_key):
    return Detail(**self.redis.hgetall(redis_detail_key))
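# Usage sketch (the manager name and queue key value are illustrative):
details = redis_mgr.get_all_queue_details('qt_1')
first = redis_mgr.get_detail(details[0].detail_key) if details else None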