class TestRdirMeta2Client(unittest.TestCase):
    """Unit tests for the meta2-index methods of :class:`RdirClient`.

    Every test replaces ``_rdir_request`` with a mock and verifies the
    exact request the client would send to the rdir service.
    """

    def setUp(self):
        super(TestRdirMeta2Client, self).setUp()
        self.namespace = "dummy"
        self.volid = "e29b4c56-8522-4118-82ea"
        self.container_url = "OPENIO/testing/test1"
        self.container_id = "random833999id"
        self.mtime = 2874884.47
        self.rdir_client = RdirClient({'namespace': self.namespace},
                                      endpoint='127.0.0.0:6000')

    def tearDown(self):
        super(TestRdirMeta2Client, self).tearDown()
        del self.rdir_client

    def test_volume_create(self):
        # We should normally receive an HTTPResponse with an empty body
        mocked = Mock(side_effect=(None, ''))
        self.rdir_client._rdir_request = mocked
        self.rdir_client.meta2_index_create(self.volid)
        mocked.assert_called_once_with(
            self.volid, 'POST', 'create', service_type='meta2')
        del self.rdir_client._rdir_request

    def test_volume_fetch(self):
        mocked = Mock(return_value=(None, {"records": [],
                                           "truncated": False}))
        self.rdir_client._rdir_request = mocked
        self.rdir_client.meta2_index_fetch(self.volid,
                                           prefix=self.container_url)
        mocked.assert_called_once_with(
            volume=self.volid,
            method='POST',
            action='fetch',
            json={
                'prefix': self.container_url,
                'limit': 4096,
            },
            service_type='meta2')
        del self.rdir_client._rdir_request

    def test_volume_push(self):
        mocked = Mock(side_effect=(None, ''))
        self.rdir_client._rdir_request = mocked
        self.rdir_client.meta2_index_push(self.volid, self.container_url,
                                          self.container_id, self.mtime)
        # mtime is expected to be truncated to an integer by the client
        mocked.assert_called_once_with(
            volume=self.volid,
            method='POST',
            action='push',
            create=True,
            json={
                'container_url': self.container_url,
                'container_id': self.container_id,
                'mtime': int(self.mtime),
            },
            headers=None,
            service_type='meta2')
        del self.rdir_client._rdir_request

    def test_volume_delete(self):
        mocked = Mock(side_effect=(None, ''))
        self.rdir_client._rdir_request = mocked
        self.rdir_client.meta2_index_delete(self.volid, self.container_url,
                                            self.container_id)
        mocked.assert_called_once_with(
            volume=self.volid,
            method='POST',
            action='delete',
            create=False,
            json={
                'container_url': self.container_url,
                'container_id': self.container_id,
            },
            service_type='meta2')
        del self.rdir_client._rdir_request
class Meta2IndexingWorker(object):
    """
    Indexing worker responsible for a single volume.

    Periodically crawls all meta2 database files found under the volume
    path and (re)registers each of them in the rdir index.
    """

    def __init__(self, volume_path, conf, pool_manager=None):
        """
        Initializes an Indexing worker for indexing meta2 databases.

        Possible values of conf relating to this worker are:
        - interval: (int) in sec time between two full scans. Default: half
                    an hour.
        - report_interval: (int) in sec, time between two reports:
          Default: 300
        - scanned_per_second: (int) maximum number of indexed databases /s.
        - try_removing_faulty_indexes: In the event where we encounter a
          database that's not supposed to be handled by this volume, attempt
          to remove it from this volume rdir index if it exists.
          WARNING: The decision is based off of a proxy response, that could
          be affected by cache inconsistencies for example, use at your own
          risk. Default: False

        :param volume_path: The volume path to be indexed
        :param conf: The configuration to be passed to the needed services
        :param pool_manager: A connection pool manager. If none is given, a
                new one with a default size of 10 will be created.
        """
        self.logger = get_logger(conf)
        self._stop = False
        self.volume = volume_path
        # Per-scan counters, reset at the start of each crawl_volume() pass
        self.success_nb = 0
        self.failed_nb = 0
        self.full_scan_nb = 0
        self.last_report_time = 0
        self.last_scan_time = 0
        self.last_index_time = 0
        self.start_time = 0
        self.indexed_since_last_report = 0
        self.scans_interval = int_value(
            conf.get('interval'), 1800)
        self.report_interval = int_value(
            conf.get('report_interval'), 300)
        self.max_indexed_per_second = int_value(
            conf.get('scanned_per_second'), 3000)
        self.namespace, self.volume_id = check_volume_for_service_type(
            self.volume, "meta2")
        self.attempt_bad_index_removal = boolean_value(
            conf.get('try_removing_faulty_indexes'), False)

        if not pool_manager:
            pool_manager = get_pool_manager(pool_connections=10)
        self.index_client = RdirClient(conf, logger=self.logger,
                                       pool_manager=pool_manager)
        self.dir_client = DirectoryClient(conf, logger=self.logger,
                                          pool_manager=pool_manager)

    def report(self, tag):
        """
        Log the status of indexer

        :param tag: One of three: starting, running, ended.
        """
        total = self.success_nb + self.failed_nb
        now = time.time()
        # Avoid division by zero on the very first report
        elapsed = (now - self.start_time) or 0.00001
        since_last_rprt = (now - self.last_report_time) or 0.00001
        self.logger.info(
            'volume_id=%(volume_id)s %(tag)s=%(current_time)s '
            'elapsed=%(elapsed).02f '
            'pass=%(pass)d '
            'errors=%(errors)d '
            'containers_indexed=%(total_indexed)d %(index_rate).2f/s',
            {
                'volume_id': self.volume_id,
                'tag': tag,
                'current_time': datetime.fromtimestamp(
                    int(now)).isoformat(),
                'pass': self.full_scan_nb,
                'errors': self.failed_nb,
                'total_indexed': total,
                'index_rate': self.indexed_since_last_report /
                since_last_rprt,
                'elapsed': elapsed
            }
        )
        self.last_report_time = now
        self.indexed_since_last_report = 0

    def warn(self, msg, container_id):
        """
        Log a warning tagged with this worker's volume and a container ID.

        :param msg: free-form error description
        :param container_id: ID of the container the warning relates to
        """
        # logger.warning is the non-deprecated spelling of logger.warn
        self.logger.warning(
            'volume_id=%(volume_id)s container_id=%(container_id)s '
            '%(error)s',
            {
                'volume_id': self.volume_id,
                'container_id': container_id,
                'error': msg
            }
        )

    def _attempt_index_removal(self, db_path, cid):
        """
        Fail safe removal attempt.

        Errors are logged (not raised): removal is best-effort.
        """
        try:
            self.index_client.meta2_index_delete(self.volume_id, db_path,
                                                 cid)
        except exc.OioException as exception:
            self.warn(
                container_id=cid,
                msg="Unable to remove database from the volume "
                    "index : {0}".format(str(exception))
            )

    def index_meta2_database(self, db_id):
        """
        Add a meta2 database to the rdir index. Fails if the database isn't
        handled by the current volume.

        :param db_id: The ContentID representing the reference to the
                      database.
        """
        if len(db_id) < STRLEN_REFERENCEID:
            self.warn('Not a valid container ID', db_id)
            return
        try:
            srvcs = self.dir_client.list(cid=db_id)
            account, container = srvcs['account'], srvcs['name']
            is_peer = self.volume_id in [x['host'] for x in srvcs['srv']
                                         if x['type'] == 'meta2']

            container_id = db_id.rsplit(".")[0]

            if six.PY2:
                # Build the URL from bytes on Python 2 to avoid implicit
                # unicode promotion of the formatted string.
                if isinstance(account, six.text_type):
                    account = account.encode('utf-8')
                if isinstance(container, six.text_type):
                    container = container.encode('utf-8')
            cont_url = "{0}/{1}/{2}".format(self.namespace, account,
                                            container)

            if not is_peer:
                # BUGFIX: the two concatenated literals were missing a
                # separating space ("...bythis volume").
                self.warn("Trying to index a container that isn't handled "
                          "by this volume", db_id)
                if self.attempt_bad_index_removal:
                    self._attempt_index_removal(cont_url, container_id)
                return

            self.index_client.meta2_index_push(volume_id=self.volume_id,
                                               container_url=cont_url,
                                               mtime=time.time(),
                                               container_id=container_id)

            self.success_nb += 1
        except exc.OioException as exception:
            self.failed_nb += 1
            # BUGFIX: duplicated word removed ("Unable to to index")
            self.warn("Unable to index container: %s" % str(exception),
                      db_id)

        self.indexed_since_last_report += 1

    def crawl_volume(self):
        """
        Crawl the volume assigned to this worker, and index every database.
        """
        paths = paths_gen(self.volume)
        self.full_scan_nb += 1
        self.success_nb = 0
        self.failed_nb = 0
        now = time.time()
        self.last_report_time = now

        self.report("starting")

        for db_path in paths:

            # Graceful exit, hopefully
            if self._stop:
                break

            # A valid file name looks like <cid>.<seq>.meta2
            db_id = db_path.rsplit("/")[-1].rsplit(".")

            if len(db_id) != 3:
                self.warn("Malformed db file name !", db_path)
                continue

            db_id = ".".join(db_id[:2])
            self.index_meta2_database(db_id)

            self.last_index_time = ratelimit(
                self.last_index_time,
                self.max_indexed_per_second
            )

            now = time.time()
            if now - self.last_report_time >= self.report_interval:
                self.report("running")

        self.report("ended")

    def run(self):
        """
        Main worker loop
        """
        self.start_time = time.time()
        while not self._stop:
            try:
                self.crawl_volume()
                self.last_scan_time = time.time()
                time.sleep(self.scans_interval)
            except exc.OioException as exception:
                # Only OIO errors are swallowed here; anything else should
                # crash the worker so the problem becomes visible.
                self.logger.exception("ERROR during indexing meta2: %s",
                                      exception)

    def stop(self):
        """
        Could be needed for eventually gracefully stopping.
        """
        self._stop = True
class TestMeta2Indexing(BaseTestCase):
    """Functional tests for the meta2 rdir indexing worker."""

    def setUp(self):
        super(TestMeta2Indexing, self).setUp()
        self.rdir_client = RdirClient(self.conf)
        self.directory_client = DirectoryClient(self.conf)
        self.container_client = ContainerClient(self.conf)
        self.containers = [random_str(14) for _ in range(0, randint(1, 10))]
        self.containers_svcs = {}
        self.event_agent_name = 'event-agent-1'

    def tearDown(self):
        super(TestMeta2Indexing, self).tearDown()
        self._containers_cleanup()
        self._service(self.event_agent_name, 'start', wait=3)

    def _containers_cleanup(self):
        """Delete every test container and its rdir index entries."""
        for container in self.containers:
            self.container_client.container_delete(self.account, container)
            # BUGFIX: use .get() so a test that failed before collecting
            # the services of this container doesn't mask the original
            # failure with a KeyError raised from tearDown.
            for svc in self.containers_svcs.get(container, []):
                self.rdir_client.meta2_index_delete(
                    volume_id=svc['host'],
                    container_path="{0}/{1}/{2}".format(
                        self.ns, self.account, container),
                    container_id=cid_from_name(self.account, container))

    def _filter_by_managing_svc(self, all_containers, svc_of_interest):
        """
        Filters through the containers returning only those that have
        svc_of_interest in their list of managing services.
        """
        containers_list = []
        for key, services in all_containers.items():
            if svc_of_interest in [x['host'] for x in services]:
                containers_list.append(key)

        return sorted(containers_list)

    def test_volume_indexing_worker(self):
        """
        Test steps:
        - Generate a list of container names and create them
        - Collect their respective meta2 servers
        - For each meta2 server:
            - Run a meta2 indexing worker
            - List all rdir index records and match them with the services
              we're expecting.
        :return:
        """
        self._service(self.event_agent_name, "stop", wait=3)

        for container in self.containers:
            self.container_client.container_create(account=self.account,
                                                   reference=container)

        for container in self.containers:
            self.containers_svcs[container] = [
                x for x in self.directory_client.list(
                    account=self.account, reference=container)['srv']
                if x['type'] == 'meta2'
            ]

        # Map each meta2 service to its on-disk volume path
        meta2_data_paths = {}
        for svc in self.conf['services']['meta2']:
            svc_host = svc.get('service_id', svc['addr'])
            meta2_data_paths[svc_host] = svc['path']

        distinct_meta2_servers = set()
        for svc_list in self.containers_svcs.values():
            for svc in svc_list:
                distinct_meta2_servers.add(svc['host'])

        for svc in distinct_meta2_servers:
            expected_containers = self._filter_by_managing_svc(
                self.containers_svcs, svc)
            worker = Meta2IndexingWorker(meta2_data_paths[svc], self.conf)
            worker.crawl_volume()
            indexed_containers = sorted(
                x['container_url'].split('/')[-1]
                for x in self.rdir_client.meta2_index_fetch_all(
                    volume_id=svc)
            )
            for cont in expected_containers:
                self.assertIn(cont, indexed_containers)