def test_rdir_linking_old(self):
    """
    Tests that rdir services linked to rawx services
    are not on the same locations
    """
    self.skipTest('Deprecated way of linking rdir services')
    self._reload_proxy()
    cs = ConscienceClient({'namespace': self.ns})
    rawx_list = cs.all_services('rawx')
    rdir_dict = {x['addr']: x for x in cs.all_services('rdir')}
    # Link the services
    for rawx in rawx_list:
        self.api.link('_RDIR_TEST', rawx['addr'], 'rdir',
                      autocreate=True)
    # Do the checks
    for rawx in rawx_list:
        linked_rdir = self.api.list(
            '_RDIR_TEST', rawx['addr'], service_type='rdir')['srv']
        rdir = rdir_dict[linked_rdir[0]['host']]
        rawx_loc = rawx['tags'].get('tag.loc')
        rdir_loc = rdir['tags'].get('tag.loc')
        self.assertNotEqual(rawx_loc, rdir_loc)
    # Unlink the services
    for rawx in rawx_list:
        self.api.unlink('_RDIR_TEST', rawx['addr'], 'rdir')
        self.api.delete('_RDIR_TEST', rawx['addr'])
def _wait_account_meta2(self):
    # Give the account and meta2 services time to catch their breath
    wait = False
    cluster = ConscienceClient({"namespace": self.ns})
    for i in range(10):
        try:
            for service in cluster.all_services("account"):
                # Score depends only on CPU usage.
                if int(service['score']) < 70:
                    wait = True
                    break
            if not wait:
                for service in cluster.all_services("meta2"):
                    # Score depends also on available storage.
                    if int(service['score']) < 50:
                        wait = True
                        break
            if not wait:
                return
        except exc.OioException:
            pass
        wait = False
        time.sleep(5)
    else:
        logging.warn('Some scores may still be low, '
                     'but we already waited for 50 seconds')
def _smart_link_rdir(self, volume_id, cs=None, all_rdir=None):
    """
    Force the load balancer to avoid services that already host more
    bases than the average while selecting rdir services.
    """
    if not cs:
        cs = ConscienceClient(self.conf)
    if not all_rdir:
        all_rdir = cs.all_services("rdir", True)
    avail_base_count = [x["tags"]["stat.opened_db_count"]
                        for x in all_rdir if x["score"] > 0]
    mean = sum(avail_base_count) / float(len(avail_base_count))
    avoids = [_make_id(self.ns, "rdir", x["addr"])
              for x in all_rdir
              if x["score"] > 0
              and x["tags"]["stat.opened_db_count"] > mean]
    known = [_make_id(self.ns, "rawx", volume_id)]
    try:
        polled = cs.poll("rdir", avoid=avoids, known=known)[0]
    except ClientException as exc:
        if exc.status != 481:
            raise
        # Retry without `avoids`, hoping the next iteration will rebalance
        polled = cs.poll("rdir", known=known)[0]
    forced = {"host": polled["addr"], "type": "rdir",
              "seq": 1, "args": "", "id": polled["id"]}
    self.directory.force(RDIR_ACCT, volume_id, "rdir", forced,
                         autocreate=True)
    return polled["id"]
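# Usage sketch for the method above (an assumption, not taken from this
# file): the method is expected to live on a dispatcher object exposing
# `conf`, `ns` and a `directory` client, such as oio's RdirDispatcher;
# the namespace name and rawx address below are placeholders.
#
# from oio.rdir.client import RdirDispatcher
# dispatcher = RdirDispatcher({'namespace': 'OPENIO'})
# rdir_id = dispatcher._smart_link_rdir('127.0.0.1:6201')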
class Harasser(object):

    def __init__(self, ns, max_containers=256, max_contents=256):
        conf = {'namespace': ns}
        self.cs = ConscienceClient(conf)
        self.rdir = RdirClient(conf)
        self.rawx_list = [x['addr'] for x in self.cs.all_services('rawx')]
        self.sent = set()
        self.max_containers = max_containers
        self.max_contents = max_contents

    def harass_put(self, loops=None):
        if loops is None:
            loops = random.randint(1000, 2000)
        print "Pushing %d fake chunks" % loops
        loop = loops
        count_start_container = random.randrange(2**20)
        count_start_content = random.randrange(2**20)
        start = time.time()
        nb_rawx = len(self.rawx_list)
        while loop > 0:
            args = {'mtime': int(start)}
            # vol_id = random.choice(self.rawx_list)
            # container_id = "%064X" % (random.randrange(self.max_containers))
            # content_id = "%032X" % (random.randrange(self.max_contents))
            vol_id = self.rawx_list[loop % nb_rawx]
            container_id = "%064X" % (loop + count_start_container)
            content_id = "%032X" % (loop + count_start_content)
            chunk_id = "http://%s/%064X" \
                % (vol_id, random.randrange(2**128))
            self.rdir.chunk_push(
                vol_id, container_id, content_id, chunk_id, **args)
            self.sent.add((vol_id, container_id, content_id, chunk_id))
            loop -= 1
        end = time.time()
        print "%d pushed in %.3fs, %d req/s" \
            % (loops, end - start, loops / (end - start))

    def harass_del(self, min_loops=0):
        min_loops = min(min_loops, len(self.sent))
        loops = random.randint(min_loops, len(self.sent))
        print "Removing %d fake chunks" % loops
        loop = loops
        start = time.time()
        while loop > 0:
            args = self.sent.pop()
            self.rdir.chunk_delete(*args)
            loop -= 1
        end = time.time()
        print "%d removed in %.3fs, %d req/s" \
            % (loops, end - start, loops / (end - start))

    def __call__(self):
        try:
            while True:
                self.harass_put()
                self.harass_del()
        except KeyboardInterrupt:
            print "Cleaning..."
            self.harass_del(len(self.sent))
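# Usage sketch (assumes a test namespace; the name 'OPENIO' is a
# placeholder): the harasser pushes fake chunk records to the rdir
# services until interrupted, then deletes everything it sent.
if __name__ == '__main__':
    harasser = Harasser('OPENIO')
    harasser()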
def assign_all_rawx(self):
    """
    Find a rdir service for all rawx that don't have one already.
    """
    cs = ConscienceClient(self.conf)
    all_rawx = cs.all_services("rawx")
    all_rdir = cs.all_services("rdir", True)
    by_id = {_make_id(self.ns, "rdir", x["addr"]): x
             for x in all_rdir}
    for rawx in all_rawx:
        try:
            # Check whether a rdir is already linked to this rawx
            resp = self.directory.get(RDIR_ACCT, rawx["addr"],
                                      service_type="rdir")
            rawx["rdir"] = by_id[_make_id(
                self.ns, "rdir", self._lookup_rdir_host(resp))]
        except (NotFound, ClientException):
            rdir = self._smart_link_rdir(rawx["addr"], cs, all_rdir)
            n_bases = by_id[rdir]["tags"].get("stat.opened_db_count", 0)
            by_id[rdir]["tags"]["stat.opened_db_count"] = n_bases + 1
            rawx["rdir"] = by_id[rdir]
    return all_rawx
def assign_all_rawx(self, max_per_rdir=None):
    """
    Find a rdir service for all rawx that don't have one already.

    :param max_per_rdir: maximum number of rawx services that an rdir
        can be linked to
    :type max_per_rdir: `int`
    """
    cs = ConscienceClient(self.conf)
    all_rawx = cs.all_services('rawx')
    all_rdir = cs.all_services('rdir', True)
    if len(all_rdir) <= 0:
        raise ServiceUnavailable("No rdir service found in %s" % self.ns)
    by_id = {_make_id(self.ns, 'rdir', x['addr']): x
             for x in all_rdir}
    for rawx in all_rawx:
        try:
            # Check whether a rdir is already linked to this rawx
            resp = self.directory.list(RDIR_ACCT, rawx['addr'],
                                       service_type='rdir')
            rdir_host = _filter_rdir_host(resp)
            try:
                rawx['rdir'] = by_id[_make_id(self.ns, 'rdir', rdir_host)]
            except KeyError:
                self.logger.warn("rdir %s linked to rawx %s seems down",
                                 rdir_host, rawx['addr'])
        except (NotFound, ClientException):
            if rawx['score'] <= 0:
                self.logger.warn("rawx %s has score %s, and thus cannot "
                                 "be assigned a rdir (load balancer "
                                 "limitation)",
                                 rawx['addr'], rawx['score'])
                continue
            rdir = self._smart_link_rdir(rawx['addr'], cs, all_rdir,
                                         max_per_rdir)
            n_bases = by_id[rdir]['tags'].get("stat.opened_db_count", 0)
            by_id[rdir]['tags']["stat.opened_db_count"] = n_bases + 1
            rawx['rdir'] = by_id[rdir]
    return all_rawx
def get_assignation(self):
    cs = ConscienceClient(self.conf)
    all_rawx = cs.all_services('rawx')
    all_rdir = cs.all_services('rdir', True)
    by_id = {_make_id(self.ns, 'rdir', x['addr']): x
             for x in all_rdir}
    for rawx in all_rawx:
        try:
            # Check whether a rdir is already linked to this rawx
            resp = self.directory.list(RDIR_ACCT, rawx['addr'],
                                       service_type='rdir')
            rdir_host = _filter_rdir_host(resp)
            try:
                rawx['rdir'] = by_id[_make_id(self.ns, 'rdir', rdir_host)]
            except KeyError:
                self.logger.warn("rdir %s linked to rawx %s seems down",
                                 rdir_host, rawx['addr'])
                rawx['rdir'] = {"addr": rdir_host, "tags": dict()}
                by_id[_make_id(self.ns, 'rdir', rdir_host)] = rawx['rdir']
        except NotFound:
            self.logger.info("No rdir linked to %s", rawx['addr'])
    return all_rawx, all_rdir
def test_rdir_linking(self):
    """
    Tests that rdir services linked to rawx services
    are not on the same locations
    """
    cs = ConscienceClient({'namespace': self.ns})
    rawx_list = cs.all_services('rawx')
    rdir_dict = {x['addr']: x for x in cs.all_services('rdir')}
    # Link the services
    for rawx in rawx_list:
        self.api.link('_RDIR_TEST', rawx['addr'], 'rdir',
                      autocreate=True)
    # Do the checks
    for rawx in rawx_list:
        linked_rdir = self.api.get(
            '_RDIR_TEST', rawx['addr'], service_type='rdir')['srv']
        rdir = rdir_dict[linked_rdir[0]['host']]
        rawx_loc = rawx['tags'].get('tag.loc')
        rdir_loc = rdir['tags'].get('tag.loc')
        self.assertNotEqual(rawx_loc, rdir_loc)
    # Unlink the services
    for rawx in rawx_list:
        self.api.unlink('_RDIR_TEST', rawx['addr'], 'rdir')
        self.api.delete('_RDIR_TEST', rawx['addr'])
def main():
    args = make_arg_parser().parse_args()
    conf = args.conf_file if args.conf_file else "mover.conf"
    cs = ConscienceClient({"namespace": args.namespace})
    all_rawx = cs.all_services('rawx', full=True)
    # Sort rawx by disk usage
    all_rawx.sort(key=lambda c: c['tags']['stat.space'])
    dic = dict()
    sum_size = 0
    nrawx = 0
    for rawx in all_rawx:
        addr = rawx['addr']
        volume = rawx['tags']['tag.vol']
        du = float(rawx['tags']['stat.space'])
        # Keep only rawx with big disk usage
        if nrawx < len(all_rawx) / 2:
            dic[addr] = volume
            sum_size += du
            nrawx += 1
    if nrawx == 0:
        return
    av = sum_size / nrawx
    # Lock rawx, run blob-mover and unlock rawx
    target_use = str(int(av))
    for addr in dic:
        infos_srv = {"addr": addr, "type": "rawx"}
        print("Lock rawx at " + addr)
        if not args.dry_run:
            cs.lock_score(infos_srv)
        print("Run mover on rawx at " + addr
              + " to get disk usage under " + str(target_use))
        if not args.dry_run:
            try:
                output = move_rawx(dic[addr], target_use, args.user,
                                   args.namespace, conf)
                print(output)
            except Exception as err:
                print("ERROR: " + str(err))
        print("Unlock rawx at " + addr)
        if not args.dry_run:
            cs.unlock_score(infos_srv)
    # Delete mover configuration file
    if not args.dry_run:
        remove(conf)
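# Entry-point sketch: the script above is expected to be invoked from the
# command line. `make_arg_parser` (defined elsewhere in this file) is
# assumed to declare at least the namespace, user, conf_file and dry_run
# options that main() reads.
if __name__ == '__main__':
    main()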
class XcuteOrchestrator(object):

    DEFAULT_DISPATCHER_TIMEOUT = 2
    DEFAULT_REFRESH_TIME_BEANSTALKD_WORKERS = 30
    DEFAULT_MAX_JOBS_PER_BEANSTALKD = 1024

    def __init__(self, conf, logger=None):
        self.conf = conf
        self.logger = logger or get_logger(self.conf)
        self.backend = XcuteBackend(self.conf, logger=self.logger)
        self.conscience_client = ConscienceClient(self.conf)

        self.orchestrator_id = self.conf.get('orchestrator_id')
        if not self.orchestrator_id:
            raise ValueError('Missing orchestrator ID')
        self.logger.info('Using orchestrator ID: %s', self.orchestrator_id)

        self.beanstalkd_workers_tube = self.conf.get('beanstalkd_workers_tube')
        if not self.beanstalkd_workers_tube:
            raise ValueError('Missing beanstalkd workers tube')
        self.logger.info('Using beanstalkd workers tube: %s',
                         self.beanstalkd_workers_tube)

        self.beanstalkd_reply_addr = self.conf.get('beanstalkd_reply_addr')
        if not self.beanstalkd_reply_addr:
            raise ValueError('Missing beanstalkd reply address')
        self.beanstalkd_reply_tube = self.conf.get(
            'beanstalkd_reply_tube', self.beanstalkd_workers_tube + '.reply')
        self.logger.info('Using beanstalkd reply: %s %s',
                         self.beanstalkd_reply_addr,
                         self.beanstalkd_reply_tube)

        self.refresh_time_beanstalkd_workers = int_value(
            self.conf.get('refresh_time_beanstalkd_workers'),
            self.DEFAULT_REFRESH_TIME_BEANSTALKD_WORKERS)
        self.max_jobs_per_beanstalkd = int_value(
            self.conf.get('max_jobs_per_beanstalkd'),
            self.DEFAULT_MAX_JOBS_PER_BEANSTALKD)

        self.running = True
        self.beanstalkd_workers = dict()

        self.refresh_beanstalkd_workers_thread = None
        self.listen_beanstalkd_reply_thread = None
        self.dispatch_tasks_threads = dict()
        self.compute_total_tasks_threads = dict()

    def handle_backend_errors(self, func, *args, **kwargs):
        while True:
            try:
                return func(*args, **kwargs), None
            except (RedisConnectionError, RedisTimeoutError) as exc:
                self.logger.warn(
                    'Failed to communicate with redis: %s', exc)
                if not self.running:
                    return None, exc
                sleep(1)

    def safe_run_forever(self):
        try:
            self.run_forever()
        except Exception as exc:
            self.logger.exception('Failed to run forever: %s', exc)
            self.exit_gracefully()

        if self.refresh_beanstalkd_workers_thread:
            self.refresh_beanstalkd_workers_thread.join()
        if self.listen_beanstalkd_reply_thread:
            self.listen_beanstalkd_reply_thread.join()
        for dispatch_tasks_thread in self.dispatch_tasks_threads.values():
            dispatch_tasks_thread.join()
        for compute_total_tasks_thread \
                in self.compute_total_tasks_threads.values():
            compute_total_tasks_thread.join()
        self.logger.info('Exited running thread')

    def run_forever(self):
        """
        Take jobs from the queue and spawn threads to dispatch them
        """
        # gather beanstalkd info
        self.refresh_beanstalkd_workers_thread = threading.Thread(
            target=self.refresh_beanstalkd_workers_forever)
        self.refresh_beanstalkd_workers_thread.start()

        # start processing replies
        self.listen_beanstalkd_reply_thread = threading.Thread(
            target=self.listen_beanstalkd_reply_forever)
        self.listen_beanstalkd_reply_thread.start()

        if not self.running:
            return

        # restart running jobs
        self.logger.debug('Look for unfinished jobs')
        orchestrator_jobs, exc = self.handle_backend_errors(
            self.backend.list_orchestrator_jobs, self.orchestrator_id)
        if exc is not None:
            self.logger.warn(
                'Unable to list running jobs for this orchestrator: %s', exc)
            return
        for job_info in orchestrator_jobs:
            if not self.running:
                return
            self.safe_handle_running_job(job_info)

        # run next jobs
        while self.running:
            sleep(1)
            job_info, exc = self.handle_backend_errors(
                self.backend.run_next, self.orchestrator_id)
            if exc is not None:
                self.logger.warn('Unable to run next job: %s', exc)
                return
            if not job_info:
                continue
            self.safe_handle_running_job(job_info)

    def safe_handle_running_job(self, job_info):
        try:
            job_id = job_info['job']['id']
            job_type = job_info['job']['type']
            self.logger.info('Run job %s: %s', job_id, job_type)
            self.handle_running_job(job_id, job_type, job_info)
        except Exception as exc:
            self.logger.exception('Failed to run job %s: %s', job_id, exc)
            _, exc = self.handle_backend_errors(self.backend.fail, job_id)
            if exc is not None:
                self.logger.warn(
                    '[job_id=%s] Job has not been updated '
                    'with the failure: %s', job_id, exc)

    def handle_running_job(self, job_id, job_type, job_info):
        """
        First launch the computation of the total number of tasks,
        then launch the dispatching of all tasks across the platform.
        """
        if job_info['tasks']['all_sent']:
            self.logger.info(
                '[job_id=%s] All tasks are already sent', job_id)
            return

        job_class = JOB_TYPES[job_type]
        job = job_class(self.conf, logger=self.logger)

        if job_info['tasks']['total'] == 0 \
                and job_info['tasks']['is_total_temp'] \
                and job_info['tasks']['sent'] == 0 \
                and not job_info['tasks']['all_sent']:
            job.prepare(job_info['config']['params'])

        if job_id in self.compute_total_tasks_threads:
            self.logger.info(
                '[job_id=%s] Already computing the total number of tasks',
                job_id)
        elif job_info['tasks']['is_total_temp']:
            compute_total_tasks_thread = threading.Thread(
                target=self.safe_compute_total_tasks,
                args=(job_id, job_type, job_info, job))
            compute_total_tasks_thread.start()
            self.compute_total_tasks_threads[job_id] = \
                compute_total_tasks_thread
        else:
            self.logger.info(
                '[job_id=%s] The total number of tasks is already computed',
                job_id)

        if job_id in self.dispatch_tasks_threads:
            self.logger.warning(
                '[job_id=%s] Already dispatching the tasks', job_id)
        else:
            dispatch_tasks_thread = threading.Thread(
                target=self.safe_dispatch_tasks,
                args=(job_id, job_type, job_info, job))
            dispatch_tasks_thread.start()
            self.dispatch_tasks_threads[job_id] = dispatch_tasks_thread

    def safe_dispatch_tasks(self, job_id, job_type, job_info, job):
        """
        Dispatch all tasks across the platform
        and update the backend.
        """
        try:
            self.logger.info(
                '[job_id=%s] Start to dispatch tasks', job_id)
            self.dispatch_tasks(job_id, job_type, job_info, job)
            self.logger.info(
                '[job_id=%s] Finish to dispatch tasks', job_id)
        except Exception as exc:
            self.logger.exception(
                '[job_id=%s] Failed to dispatch tasks: %s', job_id, exc)
            _, exc = self.handle_backend_errors(self.backend.fail, job_id)
            if exc is not None:
                self.logger.warn(
                    '[job_id=%s] Job has not been updated '
                    'with the failure: %s', job_id, exc)
        finally:
            del self.dispatch_tasks_threads[job_id]

    def dispatch_tasks(self, job_id, job_type, job_info, job):
        job_config = job_info['config']
        job_params = job_config['params']
        tasks_per_second = job_config['tasks_per_second']
        tasks_batch_size = job_config['tasks_batch_size']
        last_task_id = job_info['tasks']['last_sent']

        job_tasks = job.get_tasks(job_params, marker=last_task_id)
        beanstalkd_workers = self.get_beanstalkd_workers()

        tasks_run_time = 0
        batch_per_second = tasks_per_second / float(tasks_batch_size)
        # The backend must have the tasks in order
        # to know the last task sent
        tasks = OrderedDict()
        for task_id, task_payload in job_tasks:
            if not self.running:
                break

            tasks[task_id] = task_payload
            if len(tasks) < tasks_batch_size:
                continue

            tasks_run_time = ratelimit(tasks_run_time, batch_per_second)

            sent = self.dispatch_tasks_batch(
                beanstalkd_workers, job_id, job_type, job_config, tasks)
            if sent:
                job_status, exc = self.handle_backend_errors(
                    self.backend.update_tasks_sent, job_id, tasks.keys())
                tasks.clear()
                if exc is not None:
                    self.logger.warn(
                        '[job_id=%s] Job has not been updated '
                        'with the sent tasks: %s', job_id, exc)
                    break
                if job_status == 'PAUSED':
                    self.logger.info('Job %s is paused', job_id)
                    return

            if not self.running:
                break
        else:
            sent = True
            if tasks:
                sent = self.dispatch_tasks_batch(
                    beanstalkd_workers, job_id, job_type, job_config, tasks)
            if sent:
                job_status, exc = self.handle_backend_errors(
                    self.backend.update_tasks_sent, job_id, tasks.keys(),
                    all_tasks_sent=True)
                if exc is None:
                    if job_status == 'FINISHED':
                        self.logger.info('Job %s is finished', job_id)
                    self.logger.info(
                        'Finished dispatching job (job_id=%s)', job_id)
                    return
                else:
                    self.logger.warn(
                        '[job_id=%s] Job has not been updated '
                        'with the last sent tasks: %s', job_id, exc)

        _, exc = self.handle_backend_errors(self.backend.free, job_id)
        if exc is not None:
            self.logger.warn(
                '[job_id=%s] Job has not been freed: %s', job_id, exc)

    def dispatch_tasks_batch(self, beanstalkd_workers,
                             job_id, job_type, job_config, tasks):
        """
        Try sending a task until it's ok
        """
        beanstalkd_payload = self.make_beanstalkd_payload(
            job_id, job_type, job_config, tasks)

        if len(beanstalkd_payload) > 2**16:
            raise ValueError('Task payload is too big (length=%s)' %
                             len(beanstalkd_payload))

        # max 2 minutes per task
        ttr = len(tasks) * DEFAULT_TTR
        while self.running:
            for beanstalkd_worker in beanstalkd_workers:
                if not self.running:
                    return False
                if beanstalkd_worker is not None:
                    break

            try:
                beanstalkd_worker.put(beanstalkd_payload, ttr=ttr)
                self.logger.debug(
                    '[job_id=%s] Tasks sent to %s: %s', job_id,
                    beanstalkd_worker.addr, str(tasks))
                return True
            except Exception as exc:
                self.logger.warn(
                    '[job_id=%s] Failed to send beanstalkd job: %s',
                    job_id, exc)
                # TODO(adu): We could be more lenient
                # and wait for a few errors in a row
                # to happen before marking it as broken.
                beanstalkd_worker.is_broken = True
        return False

    def make_beanstalkd_payload(self, job_id, job_type, job_config, tasks):
        return json.dumps(
            {
                'event': EventTypes.XCUTE_TASKS,
                'data': {
                    'job_id': job_id,
                    'job_type': job_type,
                    'job_config': job_config,
                    'tasks': tasks,
                    'beanstalkd_reply': {
                        'addr': self.beanstalkd_reply_addr,
                        'tube': self.beanstalkd_reply_tube,
                    },
                }
            })

    def safe_compute_total_tasks(self, job_id, job_type, job_info, job):
        """
        Compute the total number of tasks
        and update the backend.
        """
        try:
            self.logger.info(
                '[job_id=%s] Start to compute the total number of tasks',
                job_id)
            self.compute_total_tasks(job_id, job_type, job_info, job)
            self.logger.info(
                '[job_id=%s] Finish to compute the total number of tasks',
                job_id)
        except Exception as exc:
            self.logger.exception(
                '[job_id=%s] Failed to compute the total number of tasks: %s',
                job_id, exc)
        finally:
            del self.compute_total_tasks_threads[job_id]

    def compute_total_tasks(self, job_id, job_type, job_info, job):
        job_params = job_info['config']['params']
        total_marker = job_info['tasks']['total_marker']

        tasks_counter = job.get_total_tasks(job_params, marker=total_marker)
        for total_marker, tasks_incr in tasks_counter:
            stop, exc = self.handle_backend_errors(
                self.backend.incr_total_tasks, job_id,
                total_marker, tasks_incr)
            if exc is not None:
                self.logger.warn(
                    '[job_id=%s] Job has not been updated '
                    'with total tasks: %s', job_id, exc)
                return
            if stop or not self.running:
                return

        total_tasks, exc = self.handle_backend_errors(
            self.backend.total_tasks_done, job_id)
        if exc is not None:
            self.logger.warn(
                '[job_id=%s] Job has not been updated '
                'with last total tasks: %s', job_id, exc)
            return
        self.logger.info(
            '[job_id=%s] %s estimated tasks', job_id, total_tasks)

    def listen_beanstalkd_reply_forever(self):
        """
        Process this orchestrator's job replies
        """
        self.logger.info('Connecting to the reply beanstalkd')

        while self.running:
            try:
                listener = BeanstalkdListener(
                    addr=self.beanstalkd_reply_addr,
                    tube=self.beanstalkd_reply_tube,
                    logger=self.logger)
                break
            except ConnectionError:
                self.logger.error('Failed to connect to the reply beanstalkd')
                sleep(5)

        self.logger.info('Listening to replies on %s (tube=%s)',
                         self.beanstalkd_reply_addr,
                         self.beanstalkd_reply_tube)

        # keep the job results in memory
        while self.running:
            connection_error = self.listen_loop(listener)

            # in case of a beanstalkd connection error
            # sleep to avoid spamming
            if connection_error:
                sleep(2)

        self.logger.info('Exited listening thread')

    def listen_loop(self, listener):
        """
        One iteration of the listening loop
        """
        connection_error = False
        try:
            replies = listener.fetch_job(
                self.process_reply, timeout=self.DEFAULT_DISPATCHER_TIMEOUT)

            # to force the execution of process_reply
            # if there were no replies, consider it as a connection error
            connection_error = len(list(replies)) == 0
        except OioTimeout:
            pass

        return connection_error

    def process_reply(self, beanstalkd_job_id, encoded_reply):
        reply = json.loads(encoded_reply)

        job_id = reply['job_id']
        task_ids = reply['task_ids']
        task_results = reply['task_results']
        task_errors = reply['task_errors']

        self.logger.debug('Tasks processed (job_id=%s): %s', job_id, task_ids)

        try:
            finished, exc = self.handle_backend_errors(
                self.backend.update_tasks_processed,
                job_id, task_ids, task_errors, task_results)
            if exc is None:
                if finished:
                    self.logger.info('Job %s is finished', job_id)
            else:
                self.logger.warn(
                    '[job_id=%s] Job has not been updated '
                    'with the processed tasks: %s', job_id, exc)
        except Exception:
            self.logger.exception('Error processing reply')

        yield None

    def refresh_beanstalkd_workers_forever(self):
        """
        Refresh beanstalkd workers by looking at the score,
        existing tubes and tube statistics.
        """
        while self.running:
            try:
                beanstalkd_workers = self._find_beanstalkd_workers()
            except Exception as exc:
                self.logger.error(
                    'Failed to find beanstalkd workers: %s', exc)
                # TODO(adu): We could keep trying to send jobs
                # to the beanstalkd we already found.
                # But we need the score to know how to dispatch the tasks...
                beanstalkd_workers = dict()

            old_beanstalkd_workers_addr = set(self.beanstalkd_workers.keys())
            new_beanstalkd_workers_addr = set(beanstalkd_workers.keys())

            added_beanstalkds = new_beanstalkd_workers_addr \
                - old_beanstalkd_workers_addr
            for beanstalkd_addr in added_beanstalkds:
                self.logger.info('Add beanstalkd %s', beanstalkd_addr)
                beanstalkd = beanstalkd_workers[beanstalkd_addr]
                beanstalkd.use(self.beanstalkd_workers_tube)

            removed_beanstalkds = old_beanstalkd_workers_addr \
                - new_beanstalkd_workers_addr
            for beanstalkd_addr in removed_beanstalkds:
                self.logger.info('Remove beanstalkd %s', beanstalkd_addr)

            self.logger.info('Refresh beanstalkd workers')
            self.beanstalkd_workers = beanstalkd_workers

            for _ in range(self.refresh_time_beanstalkd_workers):
                if not self.running:
                    break
                sleep(1)

        self.logger.info('Exited beanstalkd workers thread')

    def _find_beanstalkd_workers(self):
        """
        Find beanstalkd workers by looking at the score,
        existing tubes and tube statistics.
        """
        all_beanstalkd = self.conscience_client.all_services('beanstalkd')

        beanstalkd_workers = dict()
        for beanstalkd_info in all_beanstalkd:
            try:
                beanstalkd = self._check_beanstalkd_worker(beanstalkd_info)
                if not beanstalkd:
                    continue
                beanstalkd_workers[beanstalkd.addr] = beanstalkd
            except Exception as exc:
                self.logger.error('Failed to check beanstalkd: %s', exc)

        return beanstalkd_workers

    def _check_beanstalkd_worker(self, beanstalkd_info):
        """
        Check a beanstalkd worker by looking at the score,
        existing tubes and tube statistics.
        """
        beanstalkd_addr = 'beanstalk://' + beanstalkd_info['addr']
        beanstalkd_score = beanstalkd_info['score']
        if beanstalkd_score == 0:
            self.logger.debug(
                'Ignore beanstalkd %s: score=0', beanstalkd_addr)
            return None

        beanstalkd = self.beanstalkd_workers.get(beanstalkd_addr)
        if not beanstalkd:
            beanstalkd = Beanstalk.from_url(beanstalkd_addr)
            beanstalkd.addr = beanstalkd_addr

        beanstalkd_tubes = beanstalkd.tubes()
        if self.beanstalkd_workers_tube not in beanstalkd_tubes:
            self.logger.debug(
                'Ignore beanstalkd %s: '
                'No worker has ever listened to the tube %s',
                beanstalkd_addr, self.beanstalkd_workers_tube)
            return None

        current_stats = beanstalkd.stats_tube(self.beanstalkd_workers_tube)
        beanstalkd_jobs_ready = current_stats['current-jobs-ready']
        if beanstalkd_jobs_ready > 0:
            beanstalkd_jobs_reserved = current_stats['current-jobs-reserved']
            if beanstalkd_jobs_reserved <= 0:
                self.logger.warn(
                    'Ignore beanstalkd %s: The worker doesn\'t process task '
                    '(current-jobs-ready=%d, current-jobs-reserved=%d)',
                    beanstalkd_addr, beanstalkd_jobs_ready,
                    beanstalkd_jobs_reserved)
                return None
            if beanstalkd_jobs_ready >= self.max_jobs_per_beanstalkd:
                self.logger.warn(
                    'Ignore beanstalkd %s: The queue is full '
                    '(current-jobs-ready=%d, current-jobs-reserved=%d)',
                    beanstalkd_addr, beanstalkd_jobs_ready,
                    beanstalkd_jobs_reserved)
                return None

        if hasattr(beanstalkd, 'is_broken') and beanstalkd.is_broken:
            self.logger.info(
                'Beanstalkd %s was broken, and now it\'s coming back',
                beanstalkd_addr)
            beanstalkd.is_broken = False

        # Favor the workers with a good score
        # 50% -> beanstalkd score
        worker_score = beanstalkd_score * 50. / 100.
        # 50% -> beanstalkd tube size
        worker_score += 50 - (beanstalkd_jobs_ready * 50. /
                              self.max_jobs_per_beanstalkd)
        beanstalkd.occurrence = int(math.ceil(worker_score / 10.))

        self.logger.debug(
            'Give the green light to beanstalkd %s (worker_score=%d)',
            beanstalkd_addr, worker_score)
        return beanstalkd

    def get_beanstalkd_workers(self):
        """
        Yield beanstalkd workers following a loadbalancing strategy
        """
        beanstalkd_workers_id = None
        beanstalkd_workers = list()
        while True:
            if not self.beanstalkd_workers:
                self.logger.info('No beanstalkd worker available')
                sleep(1)
                yield None
                continue

            if id(self.beanstalkd_workers) != beanstalkd_workers_id:
                beanstalkd_workers_id = id(self.beanstalkd_workers)
                beanstalkd_workers = list()
                for beanstalkd in self.beanstalkd_workers.values():
                    for _ in range(beanstalkd.occurrence):
                        beanstalkd_workers.append(beanstalkd)

            # Shuffle to not have the same suite for all jobs
            random.shuffle(beanstalkd_workers)

            yielded = False
            for beanstalkd_worker in beanstalkd_workers:
                if id(self.beanstalkd_workers) != beanstalkd_workers_id:
                    break
                if beanstalkd_worker.is_broken:
                    continue
                yield beanstalkd_worker
                yielded = True
            else:
                if not yielded:
                    self.logger.info(
                        'All beanstalkd workers available are broken')
                    sleep(1)
                    yield None

    def exit_gracefully(self, *args, **kwargs):
        if self.running:
            self.logger.info('Exiting gracefully')
            self.running = False
        else:
            self.logger.info('Already exiting gracefully')
class Meta1Rebuilder(MetaRebuilder):

    def __init__(self, conf, logger, **kwargs):
        super(Meta1Rebuilder, self).__init__(conf, logger, None, **kwargs)
        self.conscience = ConscienceClient(self.conf, logger=self.logger)
        sds_conf = load_namespace_conf(self.conf['namespace']) or {}
        self.meta1_digits = int(sds_conf.get(
            'ns.meta1_digits', sds_conf.get('meta1_digits', 4)))

    def _create_worker(self, **kwargs):
        return Meta1RebuilderWorker(self, **kwargs)

    def _fill_queue(self, queue, **kwargs):
        if self._fill_queue_from_file(queue, **kwargs):
            return
        prefixes = set()

        rawx_services = self.conscience.all_services('rawx')
        for rawx in rawx_services:
            cid = cid_from_name('_RDIR', rawx['addr'])
            prefix = cid[:self.meta1_digits]
            if prefix not in prefixes:
                queue.put(prefix.ljust(64, '0'))
                prefixes.add(prefix)

        accounts = self.api.account_list()
        for account in accounts:
            containers = self._full_container_list(account)
            for container in containers:
                cid = cid_from_name(account, container[0])
                prefix = cid[:self.meta1_digits]
                if prefix not in prefixes:
                    queue.put(prefix.ljust(64, '0'))
                    prefixes.add(prefix)

    def _item_to_string(self, prefix, **kwargs):
        return 'prefix %s' % prefix

    def _get_report(self, status, end_time, counters, **kwargs):
        prefixes_processed, errors, total_prefixes_processed, \
            total_errors = counters
        time_since_last_report = (end_time - self.last_report) or 0.00001
        total_time = (end_time - self.start_time) or 0.00001
        return ('%(status)s volume=%(volume)s '
                'last_report=%(last_report)s %(time_since_last_report).2fs '
                'prefixes=%(prefixes)d %(prefixes_rate).2f/s '
                'errors=%(errors)d %(errors_rate).2f%% '
                'start_time=%(start_time)s %(total_time).2fs '
                'total_prefixes=%(total_prefixes)d '
                '%(total_prefixes_rate).2f/s '
                'total_errors=%(total_errors)d %(total_errors_rate).2f%%' % {
                    'status': status,
                    'volume': self.volume,
                    'last_report': datetime.fromtimestamp(
                        int(self.last_report)).isoformat(),
                    'time_since_last_report': time_since_last_report,
                    'prefixes': prefixes_processed,
                    'prefixes_rate':
                        prefixes_processed / time_since_last_report,
                    'errors': errors,
                    'errors_rate':
                        100 * errors / float(prefixes_processed or 1),
                    'start_time': datetime.fromtimestamp(
                        int(self.start_time)).isoformat(),
                    'total_time': total_time,
                    'total_prefixes': total_prefixes_processed,
                    'total_prefixes_rate':
                        total_prefixes_processed / total_time,
                    'total_errors': total_errors,
                    'total_errors_rate': 100 * total_errors /
                        float(total_prefixes_processed or 1)
                })
class TestPerfectibleContent(BaseTestCase):

    def setUp(self):
        super(TestPerfectibleContent, self).setUp()
        self.api = ObjectStorageApi(self.ns, endpoint=self.uri,
                                    pool_manager=self.http_pool)
        self.cs = ConscienceClient(self.conf, pool_manager=self.http_pool)
        self.event = EventClient(self.conf)
        self.locked_svc = list()
        # Ensure the tube is not clogged
        self.event.beanstalk.drain_tube(DEFAULT_IMPROVER_TUBE)

    def tearDown(self):
        if self.locked_svc:
            self.cs.unlock_score(self.locked_svc)
        super(TestPerfectibleContent, self).tearDown()

    @classmethod
    def tearDownClass(cls):
        # Be kind with the next test suites
        cls._cls_reload_proxy()
        time.sleep(3)
        cls._cls_reload_meta()
        time.sleep(1)

    def _aggregate_services(self, type_, key):
        """
        Build lists of services indexed by `key`.
        """
        all_svcs = self.cs.all_services(type_)
        out = defaultdict(list)
        for svc in all_svcs:
            out[key(svc)].append(svc)
        return out

    def _lock_services(self, type_, services):
        """
        Lock specified services, wait for the score to be propagated.
        """
        for svc in services:
            self.locked_svc.append({'type': type_, 'addr': svc['addr']})
        self.cs.lock_score(self.locked_svc)
        # In a perfect world™️ we do not need the time.sleep().
        # For mysterious reasons, all services are not reloaded immediately.
        self._reload_proxy()
        time.sleep(0.5)
        self._reload_meta()
        time.sleep(0.5)

    def _wait_for_event(self, timeout=REASONABLE_EVENT_DELAY):
        """
        Wait for an event in the oio-improve tube.
        """
        bt = self.event.beanstalk
        bt.watch(DEFAULT_IMPROVER_TUBE)
        try:
            job_id, data = bt.reserve(timeout=timeout)
        except ResponseError as exc:
            logging.warn('No event read from tube %s: %s',
                         DEFAULT_IMPROVER_TUBE, exc)
            self.fail()
        bt.delete(job_id)
        return Event(json.loads(data))

    # This test must be executed first
    def test_0_upload_ok(self):
        """Check that no event is emitted when everything is OK."""
        # Check we have enough service locations.
        by_place = self._aggregate_services(
            'rawx', lambda x: x['tags']['tag.loc'].rsplit('.', 2)[0])
        if len(by_place) < 3:
            self.skip('This test requires 3 different 2nd level locations')
            return

        # Upload an object.
        container = self._random_user()
        reqid = request_id('perfectible-')
        self.api.object_create(self.account, container,
                               obj_name='perfect',
                               data='whatever',
                               policy='THREECOPIES',
                               headers={'X-oio-req-id': reqid})

        # Wait on the oio-improve beanstalk tube.
        bt = self.event.beanstalk
        bt.watch(DEFAULT_IMPROVER_TUBE)
        # Ensure we do not receive any event.
        self.assertRaises(ResponseError, bt.reserve,
                          timeout=REASONABLE_EVENT_DELAY)

    def test_upload_warn_dist(self):
        """
        Check that an event is emitted when the warning distance is reached.
        """
        # Check we have enough service locations.
        by_place = self._aggregate_services(
            'rawx', lambda x: x['tags']['tag.loc'].rsplit('.', 2)[0])
        if len(by_place) < 3:
            self.skip('This test requires 3 different 2nd level locations')
            return

        # Lock all services of the 3rd location.
        banned_loc = by_place.keys()[2]
        self._lock_services('rawx', by_place[banned_loc])

        # Upload an object.
        container = self._random_user()
        reqid = request_id('perfectible-')
        self.api.object_create(self.account, container,
                               obj_name='perfectible',
                               data='whatever',
                               policy='THREECOPIES',
                               headers={'X-oio-req-id': reqid})

        # Wait on the oio-improve beanstalk tube.
        event = self._wait_for_event()

        # Check the content of the event.
        self.assertEqual('storage.content.perfectible', event.event_type)
        self.assertEqual(reqid, event.reqid)
        self.assertEqual(self.account, event.url['account'])
        self.assertEqual(container, event.url['user'])
        self.assertEqual('perfectible', event.url['path'])
        mc = event.data
        self.assertEqual(0, mc['pos'])  # only one metachunk in this test
        lowest_dist = 4
        warn_dist = 4
        for chunk in mc['chunks']:
            qual = chunk['quality']
            if qual['final_dist'] < lowest_dist:
                lowest_dist = qual['final_dist']
            if qual['warn_dist'] < warn_dist:
                warn_dist = qual['warn_dist']
            self.assertEqual(qual['expected_slot'], qual['final_slot'])
        self.assertLessEqual(lowest_dist, warn_dist)

    def test_upload_fallback(self):
        """
        Test that an event is emitted when a fallback service slot is used.
        """
        by_slot = self._aggregate_services(
            'rawx',
            lambda x: x['tags'].get('tag.slots', 'rawx').rsplit(',', 2)[-1])
        if len(by_slot) < 2:
            self.skip('This test requires 2 different slots '
                      'for rawx services')
            return
        elif len(by_slot['rawx-odd']) < 3:
            self.skip('This test requires at least 3 services '
                      'in the "rawx-odd" slot')
            return

        # Lock all services of the 'rawx-even' slot.
        banned_slot = 'rawx-even'
        self._lock_services('rawx', by_slot[banned_slot])

        # Upload an object.
        container = self._random_user()
        reqid = request_id('perfectible-')
        self.api.object_create(self.account, container,
                               obj_name='perfectible',
                               data='whatever',
                               policy='THREECOPIES',
                               headers={'X-oio-req-id': reqid})

        # Wait on the oio-improve beanstalk tube.
        event = self._wait_for_event()

        # Check the content of the event.
        self.assertEqual('storage.content.perfectible', event.event_type)
        self.assertEqual(reqid, event.reqid)
        self.assertEqual(self.account, event.url['account'])
        self.assertEqual(container, event.url['user'])
        self.assertEqual('perfectible', event.url['path'])
        mc = event.data
        self.assertEqual(0, mc['pos'])  # only one metachunk in this test
        slot_matches = list()
        for chunk in mc['chunks']:
            qual = chunk['quality']
            slot_matches.append(qual['final_slot'] == qual['expected_slot'])
            self.assertNotEqual(qual['final_slot'], banned_slot)
        self.assertIn(False, slot_matches)
def __init__(self, conf, tool):
    super(_DistributedDispatcher, self).__init__(conf, tool)
    self.sending = None

    self.max_items_per_second = int_value(
        self.conf.get('items_per_second'),
        self.tool.DEFAULT_ITEM_PER_SECOND)

    # All available beanstalkd
    conscience_client = ConscienceClient(self.conf)
    all_beanstalkd = conscience_client.all_services('beanstalkd')
    all_available_beanstalkd = dict()
    for beanstalkd in all_beanstalkd:
        if beanstalkd['score'] <= 0:
            continue
        all_available_beanstalkd[beanstalkd['addr']] = beanstalkd
    if not all_available_beanstalkd:
        raise OioException('No beanstalkd available')

    # Beanstalkd workers
    workers_tube = self.conf.get('distributed_beanstalkd_worker_tube') \
        or self.tool.DEFAULT_DISTRIBUTED_BEANSTALKD_WORKER_TUBE
    self.beanstalkd_workers = dict()
    for beanstalkd in locate_tube(all_available_beanstalkd.values(),
                                  workers_tube):
        beanstalkd_worker = BeanstalkdSender(
            beanstalkd['addr'], workers_tube, self.logger)
        self.beanstalkd_workers[beanstalkd['addr']] = beanstalkd_worker
        self.logger.info(
            'Beanstalkd %s using tube %s is selected as a worker',
            beanstalkd_worker.addr, beanstalkd_worker.tube)
    if not self.beanstalkd_workers:
        raise OioException('No beanstalkd worker available')
    nb_workers = len(self.beanstalkd_workers)
    if self.max_items_per_second > 0:
        # Max 2 seconds in advance
        queue_size_per_worker = self.max_items_per_second * 2 / nb_workers
    else:
        queue_size_per_worker = 64
    for _, beanstalkd_worker in self.beanstalkd_workers.items():
        beanstalkd_worker.low_limit = queue_size_per_worker / 2
        beanstalkd_worker.high_limit = queue_size_per_worker

    # Beanstalkd reply
    beanstalkd_reply = dict()
    try:
        local_services = conscience_client.local_services()
        for local_service in local_services:
            if local_service['type'] != 'beanstalkd':
                continue
            beanstalkd = all_available_beanstalkd.get(local_service['addr'])
            if beanstalkd is None:
                continue
            if beanstalkd_reply \
                    and beanstalkd_reply['score'] >= beanstalkd['score']:
                continue
            beanstalkd_reply = beanstalkd
    except Exception as exc:  # pylint: disable=broad-except
        self.logger.warning(
            'ERROR when searching for beanstalkd locally: %s', exc)
    if not beanstalkd_reply:
        self.logger.warn('No beanstalkd available locally')

        try:
            beanstalkd = conscience_client.next_instance('beanstalkd')
            beanstalkd_reply = all_available_beanstalkd[beanstalkd['addr']]
        except Exception as exc:  # pylint: disable=broad-except
            self.logger.warning(
                'ERROR when searching for beanstalkd: %s', exc)
    beanstalkd_reply_addr = beanstalkd_reply['addr']

    # If the tube exists, another service must have already used this tube
    tube_reply = workers_tube + '.reply.' + str(time.time())
    tubes = Beanstalk.from_url(
        'beanstalk://' + beanstalkd_reply_addr).tubes()
    if tube_reply in tubes:
        raise OioException(
            'Beanstalkd %s using tube %s is already used'
            % (beanstalkd_reply_addr, tube_reply))

    self.beanstalkd_reply = BeanstalkdListener(
        beanstalkd_reply_addr, tube_reply, self.logger)
    self.logger.info(
        'Beanstalkd %s using tube %s is selected for the replies',
        self.beanstalkd_reply.addr, self.beanstalkd_reply.tube)
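# Sizing example for the worker queue limits above (plain arithmetic,
# illustrative values): with items_per_second=1000 and 4 workers, each
# worker may hold up to 2 * 1000 / 4 = 500 queued items (high_limit),
# and refilling resumes once the queue drops below 250 (low_limit).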
class ContainerClient(ProxyClient):
    """
    Intermediate level class to manage containers.
    """

    def __init__(self, conf, refresh_rawx_scores_delay=30.0, **kwargs):
        super(ContainerClient, self).__init__(
            conf, request_prefix="/container", **kwargs)
        # to refresh the rawx scores from cache
        kwargs.pop('pool_manager', None)
        self.conscience_client = ConscienceClient(
            self.conf, pool_manager=self.pool_manager, **kwargs)
        self.rawx_scores = dict()
        self._refresh_rawx_scores_delay = refresh_rawx_scores_delay
        self._last_refresh_rawx_scores = 0.0

    def _make_uri(self, target):
        """
        Build URIs for requests that don't use the same prefix as the one
        set in this class' constructor.
        """
        uri = '%s://%s/v3.0/%s/%s' % (self.proxy_scheme, self.proxy_netloc,
                                      self.ns, target)
        return uri

    def _make_params(self, account=None, reference=None, path=None,
                     cid=None, content=None, version=None, **kwargs):
        if cid:
            params = {'cid': cid}
        else:
            params = {'acct': account, 'ref': reference}
        if path:
            params.update({'path': path})
        if content:
            params.update({'content': content})
        if version:
            params.update({'version': version})
        return params

    def _get_rawx_scores(self):
        rawx_services = self.conscience_client.all_services('rawx')
        rawx_scores = dict()
        for rawx_service in rawx_services:
            rawx_scores[rawx_service['id']] = rawx_service['score']
        return rawx_scores

    def _refresh_rawx_scores(self, now=None, **kwargs):
        """Refresh rawx service scores."""
        self.rawx_scores = self._get_rawx_scores()
        if not now:
            now = time.time()
        self._last_refresh_rawx_scores = now

    def _maybe_refresh_rawx_scores(self, **kwargs):
        """Refresh rawx service scores if the delay has been reached."""
        if self._refresh_rawx_scores_delay >= 0.0 or not self.rawx_scores:
            now = time.time()
            if now - self._last_refresh_rawx_scores \
                    > self._refresh_rawx_scores_delay:
                try:
                    self._refresh_rawx_scores(now, **kwargs)
                except OioNetworkException as exc:
                    self.logger.warn(
                        "Failed to refresh rawx service scores: %s", exc)
                except Exception:
                    self.logger.exception(
                        "Failed to refresh rawx service scores")

    def container_create(self, account, reference, properties=None,
                         system=None, **kwargs):
        """
        Create a container.

        :param account: account in which to create the container
        :type account: `str`
        :param reference: name of the container
        :type reference: `str`
        :param properties: properties to set on the container
        :type properties: `dict`
        :param system: system properties to set on the container
        :type system: `dict`
        :keyword headers: extra headers to send to the proxy
        :type headers: `dict`
        :returns: True if the container has been created,
                  False if it already exists
        """
        params = self._make_params(account, reference)
        data = json.dumps({
            'properties': properties or {},
            'system': system or {}
        })
        resp, body = self._request('POST', '/create', params=params,
                                   data=data, **kwargs)
        if resp.status not in (204, 201):
            raise exceptions.from_response(resp, body)
        return resp.status == 201

    def container_create_many(self, account, containers, properties=None,
                              **kwargs):
        """
        Create several containers.

        :param account: account in which to create the containers
        :type account: `str`
        :param containers: names of the containers
        :type containers: iterable of `str`
        :param properties: properties to set on the containers
        :type properties: `dict`
        :keyword headers: extra headers to send to the proxy
        :type headers: `dict`
        :returns: a list of tuples with the name of the container and
            a boolean telling if the container has been created
        :rtype: `list` of `tuple`
        """
        results = list()
        try:
            params = self._make_params(account)
            unformatted_data = list()
            for container in containers:
                unformatted_data.append({'name': container,
                                         'properties': properties or {},
                                         'system': kwargs.get('system', {})})
            data = json.dumps({"containers": unformatted_data})
            resp, body = self._request('POST', '/create_many', params=params,
                                       data=data, **kwargs)
            if resp.status not in (204, 200):
                raise exceptions.from_response(resp, body)
            for container in body["containers"]:
                results.append((container["name"],
                                container["status"] == 201))
            return results
        except exceptions.TooLarge:
            # Batch too large for the proxy
            pivot = len(containers) // 2
            head = containers[:pivot]
            tail = containers[pivot:]
            if head:
                results += self.container_create_many(
                    account, head, properties=properties, **kwargs)
            if tail:
                results += self.container_create_many(
                    account, tail, properties=properties, **kwargs)
            return results
        except exceptions.NotFound:
            # Batches not supported by the proxy
            for container in containers:
                try:
                    rc = self.container_create(
                        account, container, properties=properties, **kwargs)
                    results.append((container, rc))
                except Exception:
                    results.append((container, False))
            return results

    def container_delete(self, account=None, reference=None, cid=None,
                         **kwargs):
        """
        Delete a container.

        :param account: account from which to delete the container
        :type account: `str`
        :param reference: name of the container
        :type reference: `str`
        :param cid: container id that can be used instead of account
            and reference
        :type cid: `str`
        :keyword headers: extra headers to send to the proxy
        :type headers: `dict`
        """
        params = self._make_params(account, reference, cid=cid)

        del_cached_container_metadata(account=account, reference=reference,
                                      cid=cid, **kwargs)

        try:
            self._request('POST', '/destroy', params=params, **kwargs)
        except exceptions.Conflict as exc:
            raise exceptions.ContainerNotEmpty(exc)

    def container_show(self, account=None, reference=None, cid=None,
                       **kwargs):
        """
        Get information about a container (like user properties).

        :param account: account in which the container is
        :type account: `str`
        :param reference: name of the container
        :type reference: `str`
        :param cid: container id that can be used instead of account
            and reference
        :type cid: `str`
        :keyword headers: extra headers to send to the proxy
        :type headers: `dict`
        :returns: a `dict` with "properties" containing a `dict`
            of user properties.
        :deprecated: use `container_get_properties` instead
        """
        params = self._make_params(account, reference, cid=cid)
        _resp, body = self._request('GET', '/show', params=params, **kwargs)
        return body

    def container_snapshot(self, account=None, reference=None,
                           dst_account=None, dst_reference=None,
                           cid=None, **kwargs):
        """
        Create a snapshot of a container.

        This function duplicates only the database. It doesn't duplicate
        the chunks of the contents.

        :param account: account in which the container is
        :type account: `str`
        :param reference: name of the container
        :type reference: `str`
        :param cid: container id that can be used instead of account
            and reference
        :type cid: `str`
        :param dst_account: account in which the snapshot will be created
        :type dst_account: `str`
        :param dst_reference: name of the snapshot
        :type dst_reference: `str`
        """
        params = self._make_params(account, reference, cid=cid)
        data = json.dumps({"account": dst_account,
                           "container": dst_reference})
        resp, _ = self._request('POST', '/snapshot', params=params,
                                data=data, **kwargs)
        return resp

    def container_enable(self, account=None, reference=None, cid=None,
                         **kwargs):
        """
        Change the status of a container database to enabled.

        :param account: account in which the container is
        :type account: `str`
        :param reference: name of the container
        :type reference: `str`
        :param cid: container id that can be used instead of account
            and reference
        """
        uri = self._make_uri('admin/enable')
        params = self._make_params(account, reference, cid=cid)
        params.update({"type": "meta2"})

        del_cached_container_metadata(account=account, reference=reference,
                                      cid=cid, **kwargs)

        resp, _ = self._direct_request('POST', uri, params=params, **kwargs)
        return resp

    def container_freeze(self, account=None, reference=None, cid=None,
                         **kwargs):
        """
        Freeze the database of a container.

        :param account: account in which the container is
        :type account: `str`
        :param reference: name of the container
        :type reference: `str`
        :param cid: container id that can be used instead of account
            and reference
        """
        uri = self._make_uri('admin/freeze')
        params = self._make_params(account, reference, cid=cid)
        params.update({"type": "meta2"})

        del_cached_container_metadata(account=account, reference=reference,
                                      cid=cid, **kwargs)

        resp, _ = self._direct_request('POST', uri, params=params, **kwargs)
        return resp

    @extract_reference_params
    def container_get_properties(self, account=None, reference=None,
                                 properties=None, cid=None, params=None,
                                 **kwargs):
        """
        Get information about a container (user and system properties).

        :param account: account in which the container is
        :type account: `str`
        :param reference: name of the container
        :type reference: `str`
        :param cid: container id that can be used instead of account
            and reference
        :type cid: `str`
        :keyword headers: extra headers to send to the proxy
        :type headers: `dict`
        :returns: a `dict` with "properties" and "system" entries,
            containing respectively a `dict` of user properties and
            a `dict` of system properties.
        """
        container_meta = get_cached_container_metadata(
            account=account, reference=reference, cid=cid, **kwargs)
        if container_meta is not None:
            return container_meta

        if not properties:
            properties = list()
        data = json.dumps(properties)
        _resp, container_meta = self._request(
            'POST', '/get_properties', data=data, params=params, **kwargs)

        set_cached_container_metadata(container_meta, account=account,
                                      reference=reference, cid=cid, **kwargs)

        return container_meta

    def container_set_properties(self, account=None, reference=None,
                                 properties=None, clear=False, cid=None,
                                 system=None, **kwargs):
        params = self._make_params(account, reference, cid=cid)
        if clear:
            params["flush"] = 1
        data = json.dumps({'properties': properties or {},
                           'system': system or {}})

        del_cached_container_metadata(account=account, reference=reference,
                                      cid=cid, **kwargs)

        _resp, body = self._request(
            'POST', '/set_properties', data=data, params=params, **kwargs)
        return body

    def container_del_properties(self, account=None, reference=None,
                                 properties=[], cid=None, **kwargs):
        params = self._make_params(account, reference, cid=cid)
        data = json.dumps(properties)

        del_cached_container_metadata(account=account, reference=reference,
                                      cid=cid, **kwargs)

        _resp, body = self._request(
            'POST', '/del_properties', data=data, params=params, **kwargs)
        return body

    def container_touch(self, account=None, reference=None, cid=None,
                        recompute=False, damaged_objects=None,
                        missing_chunks=None, **kwargs):
        params = self._make_params(account, reference, cid=cid)
        if recompute:
            params['recompute'] = True
            if damaged_objects is not None:
                params['damaged_objects'] = damaged_objects
            if missing_chunks is not None:
                params['missing_chunks'] = missing_chunks
        self._request('POST', '/touch', params=params, **kwargs)

    def container_dedup(self, account=None, reference=None, cid=None,
                        **kwargs):
        params = self._make_params(account, reference, cid=cid)
        self._request('POST', '/dedup', params=params, **kwargs)

    def container_purge(self, account=None, reference=None, cid=None,
                        maxvers=None, **kwargs):
        params = self._make_params(account, reference, cid=cid)
        if maxvers is not None:
            params["maxvers"] = maxvers
        self._request('POST', '/purge', params=params, **kwargs)

    def container_raw_insert(self, bean, account=None, reference=None,
                             cid=None, **kwargs):
        params = self._make_params(account, reference, cid=cid)
        data = json.dumps((bean,))
        if kwargs.pop("frozen", None):
            params["frozen"] = 1
        self._request('POST', '/raw_insert', data=data,
                      params=params, **kwargs)

    def container_raw_update(self, old, new, account=None, reference=None,
                             cid=None, **kwargs):
        params = self._make_params(account, reference, cid=cid)
        data = json.dumps({"old": old, "new": new})
        if kwargs.pop("frozen", None):
            params["frozen"] = 1
        self._request('POST', '/raw_update', data=data,
                      params=params, **kwargs)

    def container_raw_delete(self, account=None, reference=None, data=None,
                             cid=None, **kwargs):
        """
        Delete raw 'beans' from a container.

        :param data: dictionaries representing the beans to delete.
            They must have a key for each column of the meta2 database,
            plus a 'type' telling which type of bean it is.
        :type data: `list` of `dict` items
        """
        params = self._make_params(account, reference, cid=cid)
        data = json.dumps(data)
        self._request('POST', '/raw_delete', data=data,
                      params=params, **kwargs)

    def container_flush(self, account=None, reference=None, cid=None,
                        **kwargs):
        params = self._make_params(account, reference, cid=cid)
        resp, _ = self._request('POST', '/flush', params=params, **kwargs)
        return {'truncated': boolean_value(
            resp.getheader('x-oio-truncated'), False)}

    @extract_reference_params
    def content_list(self, account=None, reference=None, limit=None,
                     marker=None, end_marker=None, prefix=None,
                     delimiter=None, properties=False, cid=None,
                     versions=False, deleted=False, params=None, **kwargs):
        """
        Get the list of contents of a container.

        :returns: a tuple with container metadata `dict` as first element
            and a `dict` with "objects" and "prefixes" as second element
        """
        p_up = {'max': limit, 'marker': marker, 'end_marker': end_marker,
                'prefix': prefix, 'delimiter': delimiter,
                'properties': properties}
        params.update(p_up)
        # As of 4.0.0.a3, to make it false, the 'all' parameter must be absent
        if versions:
            params['all'] = '1'
        if deleted:
            params['deleted'] = 1
        if kwargs.get('local'):
            params['local'] = 1
        resp, body = self._request('GET', '/list', params=params, **kwargs)
        return resp.headers, body

    @ensure_headers
    def content_create(self, account=None, reference=None, path=None,
                       size=None, checksum=None, data=None, cid=None,
                       content_id=None, stgpol=None, version=None,
                       mime_type=None, chunk_method=None, headers=None,
                       append=False, change_policy=False, force=False,
                       **kwargs):
        """
        Create a new object. This method does not upload any data, it just
        registers object metadata in the database.

        :param size: size of the object
        :type size: `int`
        :param checksum: checksum of the object (may be None when appending)
        :type checksum: hexadecimal `str`
        :param data: metadata of the object (list of chunks and
            dict of properties)
        :type data: `dict`
        :param cid: container id that can be used in place of `account`
            and `reference`
        :type cid: hexadecimal `str`
        :param content_id: the ID to set on the object,
            or the ID of the existing object when appending
        :param stgpol: name of the storage policy for the object
        :param version: version of the object
        :type version: `int`
        :param mime_type: MIME type to set on the object
        :param chunk_method:
        :param headers: extra headers to send to the proxy
        :param append: append to an existing object instead of creating it
        :type append: `bool`
        :param change_policy: change the policy of an existing object
        :type change_policy: `bool`
        """
        uri = self._make_uri('content/create')
        params = self._make_params(account, reference, path, cid=cid)
        if append:
            params['append'] = '1'
        if change_policy:
            params['change_policy'] = '1'
        # TODO(FVE): implement the 'force' parameter
        if not isinstance(data, dict):
            warnings.simplefilter('once')
            warnings.warn("'data' parameter should be a dict, not a list",
                          DeprecationWarning, stacklevel=3)
        if kwargs.get('meta_pos') is not None:
            data = data['chunks']
            # TODO(FVE): change "id" into "content", and other occurrences
            params['id'] = content_id
            uri = self._make_uri('content/update')
        data = json.dumps(data)
        hdrs = {'x-oio-content-meta-length': str(size),
                'x-oio-content-meta-hash': checksum}
        hdrs.update(headers)
        if content_id is not None:
            hdrs['x-oio-content-meta-id'] = content_id
        if stgpol is not None:
            hdrs['x-oio-content-meta-policy'] = stgpol
        if version is not None:
            hdrs['x-oio-content-meta-version'] = str(version)
        if mime_type is not None:
            hdrs['x-oio-content-meta-mime-type'] = mime_type
        if chunk_method is not None:
            hdrs['x-oio-content-meta-chunk-method'] = chunk_method

        del_cached_object_metadata(
            account=account, reference=reference, path=path, cid=cid,
            version=version, **kwargs)

        resp, body = self._direct_request(
            'POST', uri, data=data, params=params, headers=hdrs, **kwargs)
        return resp, body

    def content_drain(self, account=None, reference=None, path=None,
                      cid=None, version=None, **kwargs):
        uri = self._make_uri('content/drain')
        params = self._make_params(account, reference, path, cid=cid,
                                   version=version)

        del_cached_object_metadata(
            account=account, reference=reference, path=path, cid=cid,
            version=version, **kwargs)

        resp, _ = self._direct_request('POST', uri, params=params, **kwargs)
        return resp.status == 204

    def content_delete(self, account=None, reference=None, path=None,
                       cid=None, version=None, **kwargs):
        """
        Delete one object.

        :returns: True if the object has been deleted
        """
        uri = self._make_uri('content/delete')
        params = self._make_params(account, reference, path, cid=cid,
                                   version=version)

        del_cached_object_metadata(
            account=account, reference=reference, path=path, cid=cid,
            version=version, **kwargs)

        resp, _ = self._direct_request('POST', uri, params=params, **kwargs)
        return resp.status == 204

    def content_delete_many(self, account=None, reference=None, paths=None,
                            cid=None, **kwargs):
        """
        Delete several objects.

        :param paths: an iterable of object paths
            (should not be a generator)
        :returns: a list of tuples with the path of the content and
            a boolean telling if the content has been deleted
        :rtype: `list` of `tuple`
        """
        uri = self._make_uri('content/delete_many')
        params = self._make_params(account, reference, cid=cid)
        unformatted_data = list()
        for obj in paths:
            unformatted_data.append({'name': obj})
        data = json.dumps({"contents": unformatted_data})
        results = list()

        for path in paths:
            del_cached_object_metadata(
                account=account, reference=reference, path=path, cid=cid,
                **kwargs)

        try:
            _, resp_body = self._direct_request(
                'POST', uri, data=data, params=params, **kwargs)
            for obj in resp_body["contents"]:
                results.append((obj["name"], obj["status"] == 204))
            return results
        except exceptions.NotFound:
            for obj in paths:
                rc = self.content_delete(account, reference, obj, cid=cid,
                                         **kwargs)
                results.append((obj, rc))
            return results
        except exceptions.TooLarge:
            pivot = len(paths) // 2
            head = paths[:pivot]
            tail = paths[pivot:]
            if head:
                results += self.content_delete_many(
                    account, reference, head, cid=cid, **kwargs)
            if tail:
                results += self.content_delete_many(
                    account, reference, tail, cid=cid, **kwargs)
            return results

    @extract_reference_params
    def content_locate(self, account=None, reference=None, path=None,
                       cid=None, content=None, version=None,
                       properties=True, params=None, **kwargs):
        """
        Get a description of the content along with the list of its chunks.

        :param cid: container id that can be used in place of `account`
            and `reference`
        :type cid: hexadecimal `str`
        :param content: content id that can be used in place of `path`
        :type content: hexadecimal `str`
        :param properties: should the request return object properties
            along with content description
        :type properties: `bool`
        :returns: a tuple with content metadata `dict` as first element
            and chunk `list` as second element
        """
        content_meta, chunks = get_cached_object_metadata(
            account=account, reference=reference, path=path, cid=cid,
            version=version, properties=properties, **kwargs)
        if content_meta is not None and chunks is not None:
            # Refresh asynchronously so as not to slow down
            # the current request
            eventlet.spawn_n(self._maybe_refresh_rawx_scores, **kwargs)
            for chunk in chunks:
                chunk['score'] = self.rawx_scores.get(
                    chunk['url'].split('/')[2], 0)
            return content_meta, chunks

        uri = self._make_uri('content/locate')
        params['properties'] = properties
        try:
            resp, chunks = self._direct_request('GET', uri, params=params,
                                                **kwargs)
            content_meta = extract_content_headers_meta(resp.headers)
        except exceptions.OioNetworkException as exc:
            # TODO(FVE): this special behavior can be removed when
            # the 'content/locate' protocol is changed to include
            # object properties in the response body instead of headers.
            if properties and 'got more than ' in str(exc):
                params['properties'] = False
                _resp, chunks = self._direct_request(
                    'GET', uri, params=params, **kwargs)
                content_meta = self.content_get_properties(
                    account, reference, path, cid=cid, content=content,
                    version=version, **kwargs)
            else:
                raise

        set_cached_object_metadata(
            content_meta, chunks, account=account, reference=reference,
            path=path, cid=cid, version=version, properties=properties,
            **kwargs)

        return content_meta, chunks

    @extract_reference_params
    def content_prepare(self, account=None, reference=None, path=None,
                        size=None, cid=None, stgpol=None, content_id=None,
                        version=None, params=None, **kwargs):
        """
        Prepare an upload: get URLs of chunks on available rawx.

        :keyword autocreate: create the container if it doesn't exist
        """
        uri = self._make_uri('content/prepare')
        data = {'size': size}
        if stgpol:
            data['policy'] = stgpol
        data = json.dumps(data)
        try:
            resp, body = self._direct_request(
                'POST', uri + '2', data=data, params=params, **kwargs)
            chunks = body['chunks']
            obj_meta = extract_content_headers_meta(resp.headers)
            obj_meta['properties'] = dict()  # pylint: disable=no-member
            obj_meta['properties'].update(body.get('properties', {}))
        except exceptions.NotFound:
            # Proxy does not support the v2 request (oio < 4.3)
            resp, chunks = self._direct_request(
                'POST', uri, data=data, params=params, **kwargs)
            obj_meta = extract_content_headers_meta(resp.headers)
        return obj_meta, chunks

    @extract_reference_params
    def content_get_properties(self, account=None, reference=None, path=None,
                               properties=None, cid=None, content=None,
                               version=None, params=None, **kwargs):
        """
        Get a description of the content along with its user properties.
        """
        obj_meta, _ = get_cached_object_metadata(
            account=account, reference=reference, path=path, cid=cid,
            version=version, properties=True, **kwargs)
        if obj_meta is not None:
            return obj_meta

        uri = self._make_uri('content/get_properties')
        data = json.dumps(properties) if properties else None
        resp, body = self._direct_request(
            'POST', uri, data=data, params=params, **kwargs)
        obj_meta = extract_content_headers_meta(resp.headers)
        obj_meta.update(body)

        set_cached_object_metadata(
            obj_meta, None, account=account, reference=reference, path=path,
            cid=cid, version=version, properties=True, **kwargs)

        return obj_meta

    def content_set_properties(self, account=None, reference=None, path=None,
                               properties={}, cid=None, version=None,
                               clear=False, **kwargs):
        """
        Set properties on an object.

        :param properties: dictionary of properties
        """
        uri = self._make_uri('content/set_properties')
        params = self._make_params(account, reference, path, cid=cid,
                                   version=version)
        if clear:
            params['flush'] = 1
        data = json.dumps(properties)

        del_cached_object_metadata(
            account=account, reference=reference, path=path, cid=cid,
            version=version, **kwargs)

        _resp, _body = self._direct_request(
            'POST', uri, data=data, params=params, **kwargs)

    def content_del_properties(self, account=None, reference=None, path=None,
                               properties=[], cid=None, version=None,
                               **kwargs):
        """
        Delete some properties from an object.

        :param properties: list of property keys to delete
        :type properties: `list`
        :returns: True if the properties have been deleted
        """
        uri = self._make_uri('content/del_properties')
        params = self._make_params(account, reference, path, cid=cid,
                                   version=version)
        # Build a list in case the parameter is a view (not serializable).
        data = json.dumps([x for x in properties])

        del_cached_object_metadata(
            account=account, reference=reference, path=path, cid=cid,
            version=version, **kwargs)

        resp, _body = self._direct_request(
            'POST', uri, data=data, params=params, **kwargs)
        return resp.status == 204

    def content_touch(self, account=None, reference=None, path=None,
                      cid=None, version=None, **kwargs):
        uri = self._make_uri('content/touch')
        params = self._make_params(account, reference, path, cid=cid,
                                   version=version)
        self._direct_request('POST', uri, params=params, **kwargs)

    @extract_reference_params
    def content_spare(self, account=None, reference=None, path=None,
                      data=None, cid=None, stgpol=None, params=None,
                      **kwargs):
        uri = self._make_uri('content/spare')
        if stgpol:
            params['stgpol'] = stgpol
        data = json.dumps(data)
        _resp, body = self._direct_request(
            'POST', uri, data=data, params=params, **kwargs)
        return body

    def content_truncate(self, account=None, reference=None, path=None,
                         cid=None, version=None, size=0, **kwargs):
        uri = self._make_uri('content/truncate')
        params = self._make_params(account, reference, path, cid=cid,
                                   version=version)
        params['size'] = size

        del_cached_object_metadata(
            account=account, reference=reference, path=path, cid=cid,
            version=version, **kwargs)

        _resp, body = self._direct_request(
            'POST', uri, params=params, **kwargs)
        return body

    def content_purge(self, account=None, reference=None, path=None,
                      cid=None, maxvers=None, **kwargs):
        uri = self._make_uri('content/purge')
        params = self._make_params(account, reference, path, cid=cid)
        if maxvers is not None:
            params["maxvers"] = maxvers

        del_cached_object_metadata(
            account=account, reference=reference, path=path, cid=cid,
            **kwargs)

        self._direct_request('POST', uri, params=params, **kwargs)
class Meta1Rebuilder(MetaRebuilder):

    def __init__(self, conf, logger, **kwargs):
        super(Meta1Rebuilder, self).__init__(conf, logger, **kwargs)
        self.conscience = ConscienceClient(self.conf, logger=self.logger)
        sds_conf = load_namespace_conf(self.conf['namespace']) or {}
        self.meta1_digits = int(sds_conf.get('meta1_digits', 4))

    def _create_worker(self, **kwargs):
        return Meta1RebuilderWorker(self.conf, self.logger, **kwargs)

    def _fill_queue(self, queue, **kwargs):
        if self._fill_queue_from_file(queue, **kwargs):
            return
        prefixes = set()
        # Enqueue the prefixes of the containers holding rdir links.
        rawx_services = self.conscience.all_services('rawx')
        for rawx in rawx_services:
            cid = cid_from_name('_RDIR', rawx['addr'])
            prefix = cid[:self.meta1_digits]
            if prefix not in prefixes:
                queue.put(prefix.ljust(64, '0'))
                prefixes.add(prefix)
        # Enqueue the prefixes of all user containers.
        accounts = self.api.account_list()
        for account in accounts:
            containers = self._full_container_list(account)
            for container in containers:
                cid = cid_from_name(account, container[0])
                prefix = cid[:self.meta1_digits]
                if prefix not in prefixes:
                    queue.put(prefix.ljust(64, '0'))
                    prefixes.add(prefix)

    def _get_report(self, start_time, end_time, passes, errors,
                    waiting_time, rebuilder_time, elapsed,
                    total_prefixes_processed, info, **kwargs):
        return ('DONE '
                'started=%(start_time)s '
                'ended=%(end_time)s '
                'elapsed=%(elapsed).2f '
                'passes=%(passes)d '
                'errors=%(errors)d '
                'meta1_prefixes=%(prefixes)d %(rate).2f/s '
                'waiting_time=%(waiting_time).2f '
                'rebuilder_time=%(rebuilder_time).2f '
                '(rebuilder: %(success_rate).2f%%)' % {
                    'start_time': datetime.fromtimestamp(
                        int(start_time)).isoformat(),
                    'end_time': datetime.fromtimestamp(
                        int(end_time)).isoformat(),
                    'elapsed': elapsed,
                    'passes': passes,
                    'errors': errors,
                    'prefixes': total_prefixes_processed,
                    'rate': total_prefixes_processed / elapsed,
                    'rebuilder_time': rebuilder_time,
                    'waiting_time': waiting_time,
                    'success_rate': 100 * (
                        (total_prefixes_processed - errors) /
                        float(total_prefixes_processed or 1))
                })
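
# Quick sketch of how a meta1 prefix is derived from a container name,
# assuming (consistently with the code above) that a container id is the
# uppercase SHA256 of "account\0container" and that the first `meta1_digits`
# hex digits of that id select the meta1 base. This re-implementation of
# cid_from_name is hypothetical, for illustration only.
import hashlib


def cid_from_name_sketch(account, container):
    h = hashlib.sha256()
    h.update(('%s\0%s' % (account, container)).encode('utf-8'))
    return h.hexdigest().upper()


meta1_digits = 4
cid = cid_from_name_sketch('my_account', 'my_container')
prefix = cid[:meta1_digits]
print(prefix.ljust(64, '0'))  # the key that would be pushed to the queue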
class Harasser(object):

    def __init__(self, ns, max_containers=256, max_contents=256):
        conf = {'namespace': ns}
        self.cs = ConscienceClient(conf)
        self.rdir = RdirClient(conf)
        self.rawx_list = [x['addr'] for x in self.cs.all_services('rawx')]
        self.sent = set()
        self.max_containers = max_containers
        self.max_contents = max_contents
        self.pushed_count = 0
        self.pushed_time = 0
        self.removed_count = 0
        self.removed_time = 0

    def harass_put(self, loops=None):
        if loops is None:
            loops = random.randint(1000, 2000)
        print("Pushing %d fake chunks" % loops)
        loop = loops
        count_start_container = random.randrange(2**20)
        count_start_content = random.randrange(2**20)
        start = time.time()
        nb_rawx = len(self.rawx_list)
        while loop > 0:
            args = {'mtime': int(start)}
            # vol_id = random.choice(self.rawx_list)
            # container_id = "%064X" % (random.randrange(self.max_containers))
            # content_id = "%032X" % (random.randrange(self.max_contents))
            vol_id = self.rawx_list[loop % nb_rawx]
            container_id = "%064X" % (loop + count_start_container)
            content_id = "%032X" % (loop + count_start_content)
            chunk_id = "http://%s/%064X" \
                % (vol_id, random.randrange(2**128))
            self.rdir.chunk_push(vol_id, container_id, content_id, chunk_id,
                                 **args)
            self.sent.add((vol_id, container_id, content_id, chunk_id))
            loop -= 1
        end = time.time()
        self.pushed_count += loops
        self.pushed_time += end - start
        print("%d pushed in %.3fs, %d req/s"
              % (loops, end - start, loops / (end - start)))

    def harass_del(self, min_loops=0):
        min_loops = min(min_loops, len(self.sent))
        loops = random.randint(min_loops, len(self.sent))
        print("Removing %d fake chunks" % loops)
        loop = loops
        start = time.time()
        while loop > 0:
            args = self.sent.pop()
            self.rdir.chunk_delete(*args)
            loop -= 1
        end = time.time()
        self.removed_count += loops
        self.removed_time += end - start
        print("%d removed in %.3fs, %d req/s"
              % (loops, end - start, loops / (end - start)))

    def __call__(self):
        try:
            while True:
                self.harass_put()
                self.harass_del()
        except KeyboardInterrupt:
            print("Cleaning...")
            self.harass_del(len(self.sent))
            print("Stats:")
            print("Pushed %d in %.3fs, %d req/s"
                  % (self.pushed_count, self.pushed_time,
                     self.pushed_count / self.pushed_time))
            print("Removed %d in %.3fs, %d req/s"
                  % (self.removed_count, self.removed_time,
                     self.removed_count / self.removed_time))
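
# Minimal usage sketch for the Harasser above: flood the rdir services of a
# namespace with fake chunk records until interrupted (Ctrl+C triggers the
# cleanup and the final stats). The namespace name is an illustrative
# assumption.
#
# if __name__ == '__main__':
#     harasser = Harasser('OPENIO')
#     harasser()  # alternates harass_put()/harass_del() until interrupted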
    def __init__(self, conf, tool):
        super(_DistributedDispatcher, self).__init__(conf, tool)
        self.sending = False

        # All available beanstalkd
        conscience_client = ConscienceClient(self.conf)
        all_beanstalkd = conscience_client.all_services('beanstalkd')
        all_available_beanstalkd = dict()
        for beanstalkd in all_beanstalkd:
            if beanstalkd['score'] <= 0:
                continue
            all_available_beanstalkd[beanstalkd['addr']] = beanstalkd
        if not all_available_beanstalkd:
            raise OioException('No beanstalkd available')

        # Beanstalkd workers
        workers_tube = self.conf.get('distributed_beanstalkd_worker_tube') \
            or self.tool.DEFAULT_DISTRIBUTED_BEANSTALKD_WORKER_TUBE
        self.beanstalkd_workers = dict()
        for _, beanstalkd in all_available_beanstalkd.items():
            beanstalkd_worker_addr = beanstalkd['addr']
            # If the tube exists,
            # there should be a service that listens to this tube
            tubes = Beanstalk.from_url(
                'beanstalk://' + beanstalkd_worker_addr).tubes()
            if workers_tube not in tubes:
                continue
            beanstalkd_worker = BeanstalkdSender(
                beanstalkd_worker_addr, workers_tube, self.logger)
            self.beanstalkd_workers[beanstalkd_worker_addr] = \
                beanstalkd_worker
            self.logger.info(
                'Beanstalkd %s using tube %s is selected as a worker',
                beanstalkd_worker.addr, beanstalkd_worker.tube)
        if not self.beanstalkd_workers:
            raise OioException('No beanstalkd worker available')

        # Beanstalkd reply
        beanstalkd_reply = dict()
        try:
            # Prefer a beanstalkd instance running on the local host.
            local_services = conscience_client.local_services()
            for local_service in local_services:
                if local_service['type'] != 'beanstalkd':
                    continue
                beanstalkd = all_available_beanstalkd.get(
                    local_service['addr'])
                if beanstalkd is None:
                    continue
                if beanstalkd_reply \
                        and beanstalkd_reply['score'] >= beanstalkd['score']:
                    continue
                beanstalkd_reply = beanstalkd
        except Exception as exc:  # pylint: disable=broad-except
            self.logger.warning(
                'ERROR when searching for beanstalkd locally: %s', exc)
        if not beanstalkd_reply:
            self.logger.warning('No beanstalkd available locally')
            try:
                beanstalkd = conscience_client.next_instance('beanstalkd')
                beanstalkd_reply = all_available_beanstalkd[
                    beanstalkd['addr']]
            except Exception as exc:  # pylint: disable=broad-except
                self.logger.warning(
                    'ERROR when searching for beanstalkd: %s', exc)
        if not beanstalkd_reply:
            # Fail with a clear message instead of a KeyError below.
            raise OioException('No beanstalkd available for the replies')
        beanstalkd_reply_addr = beanstalkd_reply['addr']

        # If the tube exists, another service must have already used this
        # tube
        tube_reply = workers_tube + '.reply.' + str(time.time())
        tubes = Beanstalk.from_url(
            'beanstalk://' + beanstalkd_reply_addr).tubes()
        if tube_reply in tubes:
            raise OioException(
                'Beanstalkd %s using tube %s is already used'
                % (beanstalkd_reply_addr, tube_reply))

        self.beanstalkd_reply = BeanstalkdListener(
            beanstalkd_reply_addr, tube_reply, self.logger)
        self.logger.info(
            'Beanstalkd %s using tube %s is selected for the replies',
            self.beanstalkd_reply.addr, self.beanstalkd_reply.tube)
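
# Sketch of the reply-tube selection performed above, isolated for clarity:
# derive a reply tube name from the worker tube and the current time, then
# verify no one already listens on it. The address and tube name are
# illustrative assumptions; `Beanstalk` and `OioException` are the names
# already used above.
#
# import time
#
# workers_tube = 'oio-xcute'           # assumed worker tube name
# beanstalkd_addr = '127.0.0.1:6005'   # assumed beanstalkd address
#
# tube_reply = workers_tube + '.reply.' + str(time.time())
# tubes = Beanstalk.from_url('beanstalk://' + beanstalkd_addr).tubes()
# if tube_reply in tubes:
#     raise OioException('Beanstalkd %s using tube %s is already used'
#                        % (beanstalkd_addr, tube_reply))
# print('Reply tube %s is free on %s' % (tube_reply, beanstalkd_addr))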
class RawxDecommissionJob(XcuteRdirJob):
    JOB_TYPE = 'rawx-decommission'
    TASK_CLASS = RawxDecommissionTask

    DEFAULT_RAWX_TIMEOUT = 60.0
    DEFAULT_MIN_CHUNK_SIZE = 0
    DEFAULT_MAX_CHUNK_SIZE = 0
    DEFAULT_USAGE_TARGET = 0
    DEFAULT_USAGE_CHECK_INTERVAL = 60.0

    @classmethod
    def sanitize_params(cls, job_params):
        sanitized_job_params, _ = super(
            RawxDecommissionJob, cls).sanitize_params(job_params)

        # specific configuration
        service_id = job_params.get('service_id')
        if not service_id:
            raise ValueError('Missing service ID')
        sanitized_job_params['service_id'] = service_id

        sanitized_job_params['rawx_timeout'] = float_value(
            job_params.get('rawx_timeout'), cls.DEFAULT_RAWX_TIMEOUT)

        sanitized_job_params['min_chunk_size'] = int_value(
            job_params.get('min_chunk_size'), cls.DEFAULT_MIN_CHUNK_SIZE)

        sanitized_job_params['max_chunk_size'] = int_value(
            job_params.get('max_chunk_size'), cls.DEFAULT_MAX_CHUNK_SIZE)

        excluded_rawx = job_params.get('excluded_rawx')
        if excluded_rawx:
            excluded_rawx = excluded_rawx.split(',')
        else:
            excluded_rawx = list()
        sanitized_job_params['excluded_rawx'] = excluded_rawx

        sanitized_job_params['usage_target'] = int_value(
            job_params.get('usage_target'), cls.DEFAULT_USAGE_TARGET)

        sanitized_job_params['usage_check_interval'] = float_value(
            job_params.get('usage_check_interval'),
            cls.DEFAULT_USAGE_CHECK_INTERVAL)

        return sanitized_job_params, 'rawx/%s' % service_id

    def __init__(self, conf, logger=None):
        super(RawxDecommissionJob, self).__init__(conf, logger=logger)
        self.rdir_client = RdirClient(self.conf, logger=self.logger)
        self.conscience_client = ConscienceClient(self.conf,
                                                  logger=self.logger)

    def get_usage(self, service_id):
        services = self.conscience_client.all_services('rawx', full=True)
        for service in services:
            if service_id == service['tags'].get('tag.service_id',
                                                 service['addr']):
                return 100 - service['tags']['stat.space']
        raise ValueError('No rawx service with this ID (%s)' % service_id)

    def get_tasks(self, job_params, marker=None):
        service_id = job_params['service_id']
        usage_target = job_params['usage_target']
        usage_check_interval = job_params['usage_check_interval']

        if usage_target > 0:
            now = time.time()
            current_usage = self.get_usage(service_id)
            if current_usage <= usage_target:
                self.logger.info(
                    'current usage %.2f%%: target already reached (%.2f%%)',
                    current_usage, usage_target)
                return
            last_usage_check = now

        chunk_infos = self.get_chunk_infos(job_params, marker=marker)
        for container_id, content_id, chunk_id, _ in chunk_infos:
            task_id = '|'.join((container_id, content_id, chunk_id))
            yield task_id, {'container_id': container_id,
                            'content_id': content_id,
                            'chunk_id': chunk_id}

            if usage_target <= 0:
                continue
            now = time.time()
            if now - last_usage_check < usage_check_interval:
                continue
            current_usage = self.get_usage(service_id)
            if current_usage > usage_target:
                last_usage_check = now
                continue
            self.logger.info('current usage %.2f%%: target reached (%.2f%%)',
                             current_usage, usage_target)
            return

    def get_total_tasks(self, job_params, marker=None):
        service_id = job_params['service_id']
        usage_target = job_params['usage_target']

        current_usage = self.get_usage(service_id)
        if current_usage <= usage_target:
            return

        kept_chunks_ratio = 1 - (usage_target / float(current_usage))
        chunk_infos = self.get_chunk_infos(job_params, marker=marker)
        i = 0
        for i, (container_id, content_id, chunk_id, _) \
                in enumerate(chunk_infos, 1):
            if i % 1000 == 0:
                yield ('|'.join((container_id, content_id, chunk_id)),
                       int(math.ceil(1000 * kept_chunks_ratio)))

        remaining = int(math.ceil(i % 1000 * kept_chunks_ratio))
        if remaining > 0:
            yield '|'.join((container_id, content_id, chunk_id)), remaining

    def get_chunk_infos(self, job_params, marker=None):
        service_id = job_params['service_id']
        rdir_fetch_limit = job_params['rdir_fetch_limit']
        rdir_timeout = job_params['rdir_timeout']

        chunk_infos = self.rdir_client.chunk_fetch(
            service_id, timeout=rdir_timeout, limit=rdir_fetch_limit,
            start_after=marker)

        return chunk_infos
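
# Minimal sketch of how such a job could be configured: run sanitize_params
# on raw (string) job parameters and inspect the result. The service id and
# parameter values are illustrative assumptions; missing optional parameters
# are expected to fall back to the class defaults.
#
# raw_params = {
#     'service_id': 'rawx-1',
#     'usage_target': '20',          # stop once disk usage drops to 20%
#     'usage_check_interval': '30',  # re-check usage every 30 seconds
#     'excluded_rawx': 'rawx-2,rawx-3',
# }
# params, lock = RawxDecommissionJob.sanitize_params(raw_params)
# print(lock)                     # 'rawx/rawx-1'
# print(params['excluded_rawx'])  # ['rawx-2', 'rawx-3']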