def test_locked_container_dbs(self):

    def run_test(num_locks, catch_503):
        container = 'container-%s' % uuid4()
        client.put_container(self.url, self.token, container)
        db_files = self._get_container_db_files(container)
        db_conns = []
        for i in range(num_locks):
            db_conn = connect(db_files[i])
            db_conn.execute('begin exclusive transaction')
            db_conns.append(db_conn)
        if catch_503:
            try:
                client.delete_container(self.url, self.token, container)
            except client.ClientException as err:
                self.assertEqual(err.http_status, 503)
            else:
                self.fail("Expected ClientException but didn't get it")
        else:
            client.delete_container(self.url, self.token, container)

    proxy_conf = readconf(self.configs['proxy-server'],
                          section_name='app:proxy-server')
    node_timeout = int(proxy_conf.get('node_timeout', 10))
    pool = GreenPool()
    try:
        with Timeout(node_timeout + 5):
            pool.spawn(run_test, 1, False)
            pool.spawn(run_test, 2, True)
            pool.spawn(run_test, 3, True)
            pool.waitall()
    except Timeout as err:
        raise Exception(
            "The server did not return a 503 on container db locks, "
            "it just hangs: %s" % err)
def test_locked_container_dbs(self):

    def run_test(num_locks, catch_503):
        container = 'container-%s' % uuid4()
        client.put_container(self.url, self.token, container)
        db_files = self._get_container_db_files(container)
        db_conns = []
        for i in range(num_locks):
            db_conn = connect(db_files[i])
            db_conn.execute('begin exclusive transaction')
            db_conns.append(db_conn)
        if catch_503:
            exc = None
            try:
                client.delete_container(self.url, self.token, container)
            except client.ClientException as err:
                exc = err
            # fail with a clear message if no exception was raised at all
            self.assertIsNotNone(exc, "Expected ClientException but didn't get it")
            self.assertEqual(exc.http_status, 503)
        else:
            client.delete_container(self.url, self.token, container)

    pool = GreenPool()
    try:
        with Timeout(15):
            pool.spawn(run_test, 1, False)
            pool.spawn(run_test, 2, True)
            pool.spawn(run_test, 3, True)
            pool.waitall()
    except Timeout as err:
        raise Exception(
            "The server did not return a 503 on container db locks, "
            "it just hangs: %s" % err)
def test_locked_container_dbs(self):

    def run_test(num_locks, catch_503):
        container = 'container-%s' % uuid4()
        client.put_container(self.url, self.token, container)
        db_files = self._get_container_db_files(container)
        db_conns = []
        for i in range(num_locks):
            db_conn = connect(db_files[i])
            db_conn.execute('begin exclusive transaction')
            db_conns.append(db_conn)
        if catch_503:
            try:
                client.delete_container(self.url, self.token, container)
            except client.ClientException as err:
                self.assertEqual(err.http_status, 503)
            else:
                self.fail("Expected ClientException but didn't get it")
        else:
            client.delete_container(self.url, self.token, container)

    pool = GreenPool()
    try:
        with Timeout(15):
            pool.spawn(run_test, 1, False)
            pool.spawn(run_test, 2, True)
            pool.spawn(run_test, 3, True)
            pool.waitall()
    except Timeout as err:
        raise Exception(
            "The server did not return a 503 on container db locks, "
            "it just hangs: %s" % err)
def run(self, *args, **kwargs):
    try:
        self.logger.info('conscience agent: starting')

        pool = GreenPool(len(self.watchers))
        for watcher in self.watchers:
            pool.spawn(watcher.start)

        self.running = True
        while self.running:
            sleep(1)
            for w in self.watchers:
                if w.failed:
                    self.watchers.remove(w)
                    self.logger.warn('restart watcher "%s"', w.name)
                    new_w = ServiceWatcher(self.conf, w.service)
                    self.watchers.append(new_w)
                    pool.spawn(new_w.start)

    except Exception as e:
        self.logger.error('ERROR in main loop %s', e)
        raise e
    finally:
        self.logger.warn('conscience agent: stopping')
        self.running = False
        self.stop_watchers()
def test_connection_pool_timeout(self):
    orig_conn_pool = memcached.MemcacheConnPool
    try:
        connections = defaultdict(Queue)
        pending = defaultdict(int)
        served = defaultdict(int)

        class MockConnectionPool(orig_conn_pool):
            def get(self):
                pending[self.server] += 1
                conn = connections[self.server].get()
                pending[self.server] -= 1
                return conn

            def put(self, *args, **kwargs):
                connections[self.server].put(*args, **kwargs)
                served[self.server] += 1

        memcached.MemcacheConnPool = MockConnectionPool

        memcache_client = memcached.MemcacheRing(
            ['1.2.3.4:11211', '1.2.3.5:11211'],
            io_timeout=0.5, pool_timeout=0.1)
        p = GreenPool()
        for i in range(10):
            p.spawn(memcache_client.set, 'key', 'value')

        # let everyone block
        sleep(0)
        self.assertEqual(pending['1.2.3.5:11211'], 10)

        # hand out a couple of slow connections
        mock_conn = MagicMock(), MagicMock()
        mock_conn[1].sendall = lambda x: sleep(0.2)
        connections['1.2.3.5:11211'].put(mock_conn)
        connections['1.2.3.5:11211'].put(mock_conn)

        # so far so good, everyone is still waiting
        sleep(0)
        self.assertEqual(pending['1.2.3.5:11211'], 8)
        self.assertEqual(len(memcache_client._errors['1.2.3.5:11211']), 0)

        # but they won't wait longer than pool_timeout
        mock_conn = MagicMock(), MagicMock()
        connections['1.2.3.4:11211'].put(mock_conn)
        connections['1.2.3.4:11211'].put(mock_conn)

        p.waitall()
        self.assertEqual(len(memcache_client._errors['1.2.3.5:11211']), 8)
        self.assertEqual(served['1.2.3.5:11211'], 2)
        self.assertEqual(len(memcache_client._errors['1.2.3.4:11211']), 0)
        self.assertEqual(served['1.2.3.4:11211'], 8)
        # and we never got more put in than we gave out
        self.assertEqual(connections['1.2.3.5:11211'].qsize(), 2)
        self.assertEqual(connections['1.2.3.4:11211'].qsize(), 2)
    finally:
        memcached.MemcacheConnPool = orig_conn_pool
def test_connection_pooling_pre_0_9_17(self): with patch('swift.common.memcached.socket') as mock_module: connected = [] count = [0] def _slow_yielding_connector(addr): count[0] += 1 if count[0] % 3 == 0: raise ValueError('whoops!') sleep(0.1) connected.append(addr) mock_module.socket.return_value.connect.side_effect = \ _slow_yielding_connector # If POOL_SIZE is not small enough relative to USER_COUNT, the # "free_items" business in the eventlet.pools.Pool will cause # spurious failures below. I found these values to work well on a # VM running in VirtualBox on a late 2013 Retina MacbookPro: POOL_SIZE = 5 USER_COUNT = 50 pool = memcached.MemcacheConnPool('1.2.3.4:11211', size=POOL_SIZE, connect_timeout=10) self.assertEqual(POOL_SIZE, pool.max_size) def _user(): got = None while not got: try: got = pool.get() # This was really supposed to be "except:" but ran afoul # of the H201 check, which does not implement the "noqa" # exception. Once that's fixed, the except here can be # changed to "except: # noqa" except (Exception, BaseException): pass pool.put(got) # make a bunch of requests "at the same time" p = GreenPool() for i in range(USER_COUNT): p.spawn(_user) p.waitall() # If the except block after the "created = self.create()" call # doesn't correctly decrement self.current_size, this test will # fail by having some number less than POOL_SIZE connections (in my # testing, anyway). self.assertEqual(POOL_SIZE, len(connected)) # Subsequent requests should get and use the existing # connections, not creating any more. for i in range(USER_COUNT): p.spawn(_user) p.waitall() self.assertEqual(POOL_SIZE, len(connected))
def test_connection_pool_timeout(self): connections = defaultdict(Queue) pending = defaultdict(int) served = defaultdict(int) class MockConnectionPool(memcached.MemcacheConnPool): def get(self): pending[self.host] += 1 conn = connections[self.host].get() pending[self.host] -= 1 return conn def put(self, *args, **kwargs): connections[self.host].put(*args, **kwargs) served[self.host] += 1 with mock.patch.object(memcached, 'MemcacheConnPool', MockConnectionPool): memcache_client = memcached.MemcacheRing( ['1.2.3.4:11211', '1.2.3.5:11211'], io_timeout=0.5, pool_timeout=0.1, logger=self.logger) # Hand out a couple slow connections to 1.2.3.5, leaving 1.2.3.4 # fast. All ten (10) clients should try to talk to .5 first, and # then move on to .4, and we'll assert all that below. mock_conn = MagicMock(), MagicMock() mock_conn[0].readline = lambda: b'STORED\r\n' mock_conn[1].sendall = lambda x: sleep(0.2) connections['1.2.3.5'].put(mock_conn) connections['1.2.3.5'].put(mock_conn) mock_conn = MagicMock(), MagicMock() mock_conn[0].readline = lambda: b'STORED\r\n' connections['1.2.3.4'].put(mock_conn) connections['1.2.3.4'].put(mock_conn) p = GreenPool() for i in range(10): p.spawn(memcache_client.set, 'key', 'value') # Wait for the dust to settle. p.waitall() self.assertEqual(pending['1.2.3.5'], 8) self.assertEqual(len(memcache_client._errors['1.2.3.5:11211']), 8) self.assertEqual( self.logger.get_lines_for_level('error'), ['Timeout getting a connection to memcached: 1.2.3.5:11211'] * 8) self.assertEqual(served['1.2.3.5'], 2) self.assertEqual(pending['1.2.3.4'], 0) self.assertEqual(len(memcache_client._errors['1.2.3.4:11211']), 0) self.assertEqual(served['1.2.3.4'], 8) # and we never got more put in that we gave out self.assertEqual(connections['1.2.3.5'].qsize(), 2) self.assertEqual(connections['1.2.3.4'].qsize(), 2)
def test_connection_pooling(self):
    with patch('swift.common.memcached.socket') as mock_module:
        # patch socket, stub socket.socket, mock sock
        mock_sock = mock_module.socket.return_value

        # track clients waiting for connections
        connected = []
        connections = Queue()
        errors = []

        def wait_connect(addr):
            connected.append(addr)
            sleep(0.1)  # yield
            val = connections.get()
            if val is not None:
                errors.append(val)

        mock_sock.connect = wait_connect

        memcache_client = memcached.MemcacheRing(['1.2.3.4:11211'],
                                                 connect_timeout=10)
        # sanity
        self.assertEqual(1, len(memcache_client._client_cache))
        for server, pool in memcache_client._client_cache.items():
            self.assertEqual(2, pool.max_size)

        # make 10 requests "at the same time"
        p = GreenPool()
        for i in range(10):
            p.spawn(memcache_client.set, 'key', 'value')
        for i in range(3):
            sleep(0.1)
        self.assertEqual(2, len(connected))

        # give out a connection
        connections.put(None)

        # at this point, only one connection should have actually been
        # created, the other is in the creation step, and the rest of the
        # clients are not attempting to connect. we let this play out a
        # bit to verify.
        for i in range(3):
            sleep(0.1)
        self.assertEqual(2, len(connected))

        # finish up, this allows the final connection to be created, so
        # that all the other clients can use the two existing connections
        # and no others will be created.
        connections.put(None)
        connections.put('nono')
        self.assertEqual(2, len(connected))
        p.waitall()
        self.assertEqual(2, len(connected))
        self.assertEqual(0, len(errors),
                         "A client was allowed a third connection")
        connections.get_nowait()
        self.assertTrue(connections.empty())
def run(self, *args, **kwargs): try: self.logger.info('event agent: starting') pool = GreenPool(len(self.workers)) for worker in self.workers: pool.spawn(worker.start) def front(server, backend): while True: msg = server.recv_multipart() if validate_msg(msg): try: event_id = sqlite3.Binary(msg[2]) data = msg[3] self.queue.put(event_id, data) event = ['', msg[2], msg[3]] backend.send_multipart(event) except Exception: pass finally: ack = msg[0:3] server.send_multipart(ack) def back(backend): while True: msg = backend.recv_multipart() event_id = msg[1] event_id = sqlite3.Binary(event_id) self.queue.delete(event_id) boss_pool = GreenPool(2) boss_pool.spawn_n(front, self.server, self.backend) boss_pool.spawn_n(back, self.backend) while True: sleep(1) now = time.time() if now - self.last_retry > self.retry_interval: self.retry() self.last_retry = now for w in self.workers: if w.failed: self.workers.remove(w) self.logger.warn('restart worker "%s"', w.name) new_w = EventWorker(self.conf, w.name, self.context) self.workers.append(new_w) pool.spawn(new_w.start) except Exception as e: self.logger.error('ERROR in main loop %s', e) raise e finally: self.logger.warn('event agent: stopping') self.stop_workers()
def test_connection_pool_timeout(self): orig_conn_pool = memcached.MemcacheConnPool try: connections = defaultdict(Queue) pending = defaultdict(int) served = defaultdict(int) class MockConnectionPool(orig_conn_pool): def get(self): pending[self.server] += 1 conn = connections[self.server].get() pending[self.server] -= 1 return conn def put(self, *args, **kwargs): connections[self.server].put(*args, **kwargs) served[self.server] += 1 memcached.MemcacheConnPool = MockConnectionPool memcache_client = memcached.MemcacheRing( ['1.2.3.4:11211', '1.2.3.5:11211'], io_timeout=0.5, pool_timeout=0.1) # Hand out a couple slow connections to 1.2.3.5, leaving 1.2.3.4 # fast. All ten (10) clients should try to talk to .5 first, and # then move on to .4, and we'll assert all that below. mock_conn = MagicMock(), MagicMock() mock_conn[1].sendall = lambda x: sleep(0.2) connections['1.2.3.5:11211'].put(mock_conn) connections['1.2.3.5:11211'].put(mock_conn) mock_conn = MagicMock(), MagicMock() connections['1.2.3.4:11211'].put(mock_conn) connections['1.2.3.4:11211'].put(mock_conn) p = GreenPool() for i in range(10): p.spawn(memcache_client.set, 'key', 'value') # Wait for the dust to settle. p.waitall() self.assertEqual(pending['1.2.3.5:11211'], 8) self.assertEqual(len(memcache_client._errors['1.2.3.5:11211']), 8) self.assertEqual(served['1.2.3.5:11211'], 2) self.assertEqual(pending['1.2.3.4:11211'], 0) self.assertEqual(len(memcache_client._errors['1.2.3.4:11211']), 0) self.assertEqual(served['1.2.3.4:11211'], 8) # and we never got more put in that we gave out self.assertEqual(connections['1.2.3.5:11211'].qsize(), 2) self.assertEqual(connections['1.2.3.4:11211'].qsize(), 2) finally: memcached.MemcacheConnPool = orig_conn_pool
def test_connection_pool_timeout(self): orig_conn_pool = memcached.MemcacheConnPool try: connections = defaultdict(Queue) pending = defaultdict(int) served = defaultdict(int) class MockConnectionPool(orig_conn_pool): def get(self): pending[self.host] += 1 conn = connections[self.host].get() pending[self.host] -= 1 return conn def put(self, *args, **kwargs): connections[self.host].put(*args, **kwargs) served[self.host] += 1 memcached.MemcacheConnPool = MockConnectionPool memcache_client = memcached.MemcacheRing(['1.2.3.4:11211', '1.2.3.5:11211'], io_timeout=0.5, pool_timeout=0.1) # Hand out a couple slow connections to 1.2.3.5, leaving 1.2.3.4 # fast. All ten (10) clients should try to talk to .5 first, and # then move on to .4, and we'll assert all that below. mock_conn = MagicMock(), MagicMock() mock_conn[1].sendall = lambda x: sleep(0.2) connections['1.2.3.5'].put(mock_conn) connections['1.2.3.5'].put(mock_conn) mock_conn = MagicMock(), MagicMock() connections['1.2.3.4'].put(mock_conn) connections['1.2.3.4'].put(mock_conn) p = GreenPool() for i in range(10): p.spawn(memcache_client.set, 'key', 'value') # Wait for the dust to settle. p.waitall() self.assertEqual(pending['1.2.3.5'], 8) self.assertEqual(len(memcache_client._errors['1.2.3.5:11211']), 8) self.assertEqual(served['1.2.3.5'], 2) self.assertEqual(pending['1.2.3.4'], 0) self.assertEqual(len(memcache_client._errors['1.2.3.4:11211']), 0) self.assertEqual(served['1.2.3.4'], 8) # and we never got more put in that we gave out self.assertEqual(connections['1.2.3.5'].qsize(), 2) self.assertEqual(connections['1.2.3.4'].qsize(), 2) finally: memcached.MemcacheConnPool = orig_conn_pool
def test_connection_pooling_pre_0_9_17(self):
    with patch('swift.common.memcached.socket') as mock_module:
        connected = []
        count = [0]

        def _slow_yielding_connector(addr):
            count[0] += 1
            if count[0] % 3 == 0:
                raise ValueError('whoops!')
            sleep(0.1)
            connected.append(addr)

        mock_module.socket.return_value.connect.side_effect = \
            _slow_yielding_connector

        # If POOL_SIZE is not small enough relative to USER_COUNT, the
        # "free_items" business in the eventlet.pools.Pool will cause
        # spurious failures below. I found these values to work well on a
        # VM running in VirtualBox on a late 2013 Retina MacbookPro:
        POOL_SIZE = 5
        USER_COUNT = 50

        pool = memcached.MemcacheConnPool('1.2.3.4:11211', size=POOL_SIZE,
                                          connect_timeout=10)
        self.assertEqual(POOL_SIZE, pool.max_size)

        def _user():
            got = None
            while not got:
                try:
                    got = pool.get()
                except:  # noqa
                    pass
            pool.put(got)

        # make a bunch of requests "at the same time"
        p = GreenPool()
        for i in range(USER_COUNT):
            p.spawn(_user)
        p.waitall()

        # If the except block after the "created = self.create()" call
        # doesn't correctly decrement self.current_size, this test will
        # fail by having some number less than POOL_SIZE connections (in
        # my testing, anyway).
        self.assertEqual(POOL_SIZE, len(connected))

        # Subsequent requests should get and use the existing
        # connections, not creating any more.
        for i in range(USER_COUNT):
            p.spawn(_user)
        p.waitall()
        self.assertEqual(POOL_SIZE, len(connected))
def serve_forever(self):
    self.running = True
    self.server = listen(self.address, self._family())
    pool = GreenPool()
    try:
        while self.running:
            sock, address = self.server.accept()
            # track the handler greenthread so it can be reaped on shutdown
            self.greenlets.add(pool.spawn(self.call_handler, sock, address))
    except OSError:
        pass
def run(self):
    signal.signal(signal.SIGINT, self.signal_handler)
    pool = GreenPool()
    with open("config.yml") as config_file:
        # safe_load avoids constructing arbitrary Python objects from YAML
        config = yaml.safe_load(config_file)
    for router in config["routers"]:
        printmsg("Starting trasa on %s" % router["local_address"])
        trasa = Ldp(router["local_address"])
        self.trasas.append(trasa)
        pool.spawn(self.call_handler, trasa)
    pool.waitall()
    printmsg("All greenlets gone, exiting")
def serve_forever(self):
    self.running = True
    self.socket = socket.socket(socket.PF_PACKET, socket.SOCK_RAW,
                                socket.htons(self.ethertype))
    self.socket.bind((self.interface_name, 0))
    self.get_interface_index()
    self.set_socket_promiscuous()
    self.poller = select.poll()
    self.poller.register(
        self.socket,
        select.POLLIN | select.POLLPRI | select.POLLERR |
        select.POLLHUP | select.POLLNVAL)
    pool = GreenPool()
    self.greenlets.add(pool.spawn(self.server))
    self.greenlets.add(pool.spawn(self.dispatcher))
    pool.waitall()
class Resources:
    def __init__(self, providerbase):
        self._providerbase = providerbase
        self._spec2thread = {}
        self._pool = GreenPool()
        self._resources = {}

    def _dispatchprovider(self, spec):
        parts = spec.split(":")
        name = parts.pop(0)
        provider = getattr(self._providerbase, "provide_" + name)
        self._resources[spec] = res = provider(*parts)
        return res

    def getresources(self, *specs):
        for spec in specs:
            if spec not in self._resources:
                if spec not in self._spec2thread:
                    t = self._pool.spawn(self._dispatchprovider, spec)
                    self._spec2thread[spec] = t
        resources = []
        for spec in specs:
            if spec not in self._resources:
                self._spec2thread[spec].wait()
            resources.append(self._resources[spec])
        return resources
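'''
A minimal usage sketch (not from the original source) for the Resources
helper above; ExampleProviders and the spec strings are hypothetical. It
shows how getresources() dispatches each unresolved spec to the GreenPool
and then waits only for the specs still in flight.
'''
class ExampleProviders:
    def provide_db(self, name):
        # stand-in for a slow call, e.g. opening a connection
        return 'db:%s' % name

    def provide_cache(self):
        return 'cache'

resources = Resources(ExampleProviders())
# both specs are dispatched concurrently, then collected in request order
db, cache = resources.getresources('db:users', 'cache')
assert db == 'db:users' and cache == 'cache'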
def test_connection(self): """ conn = Connection(auth_endpoint="https://identity.api.rackspacecloud.com/v2.0", client_id=str(uuid.uuid4()), endpoint="http://localhost:8888/v1/12345", user="", key="") """ conn = Connection(auth_endpoint="https://identity.api.rackspacecloud.com/v2.0", client_id=str(uuid.uuid4()), endpoint="http://166.78.143.130/v1/12345", user="", key="") conn.connect(token='blah') def create_worker(queue_name): return conn.create_queue(queue_name, 100) def post_worker(queue): return queue.post_message('test_message', 10) def delete_worker(queue_name): conn.delete_queue(queue_name) return queue_name pool = GreenPool(1000) def on_message_posted(greenthread): msg = greenthread.wait() print msg._href def on_queue_created(greenthread): queue = greenthread.wait() print queue.name for x in range(0, 10): gt = pool.spawn(post_worker, queue) gt.link(on_message_posted) queue_names = ["queue-"+str(x) for x in xrange(0,5)] for queue_name in queue_names: gt = pool.spawn(create_worker, queue_name) gt.link(on_queue_created) pool.waitall() def delete_worker(queue_name): conn.delete_queue(queue_name) print "Queue:", queue_name, " deleted" for queue in conn.get_queues(): gt = pool.spawn_n(delete_worker, queue.name) print "Waiting for everything to finish" pool.waitall() print "Done"
def runtestsmulti(self, envlist):
    pool = GreenPool(size=self._toxconfig.option.numproc)
    threads = []
    for env in envlist:
        threads.append(pool.spawn(self.runtests, env))
    for t in threads:
        # re-raises any exceptions of the worker thread
        t.wait()
    if not self.toxsession.config.option.sdistonly:
        retcode = self._toxsession._summary()
        return retcode
def test_print_route_updates(self): fake_route_update = "FAKE ROUTE UPDATE" self.state_machine.route_updates.put(fake_route_update) pool = GreenPool() eventlet = pool.spawn(self.peering.print_route_updates) for _ in range(10): sleep(0) if self.route_catcher.route_updates: break self.assertEqual(len(self.route_catcher.route_updates), 1) self.assertEqual(self.route_catcher.route_updates[0], fake_route_update) eventlet.kill()
def test_connection_pooling(self):
    with patch('swift.common.memcached.socket') as mock_module:
        # patch socket, stub socket.socket, mock sock
        mock_sock = mock_module.socket.return_value

        # track clients waiting for connections
        connected = []
        connections = Queue()

        def wait_connect(addr):
            connected.append(addr)
            connections.get()

        mock_sock.connect = wait_connect

        memcache_client = memcached.MemcacheRing(['1.2.3.4:11211'],
                                                 connect_timeout=10)
        # sanity
        self.assertEqual(1, len(memcache_client._client_cache))
        for server, pool in memcache_client._client_cache.items():
            self.assertEqual(2, pool.max_size)

        # make 10 requests "at the same time"
        p = GreenPool()
        for i in range(10):
            p.spawn(memcache_client.set, 'key', 'value')
        for i in range(3):
            sleep(0.1)
        self.assertEqual(2, len(connected))

        # give out a connection
        connections.put(None)
        for i in range(3):
            sleep(0.1)
        self.assertEqual(2, len(connected))

        # finish up
        for i in range(8):
            connections.put(None)
        self.assertEqual(2, len(connected))
        p.waitall()
        self.assertEqual(2, len(connected))
def test_locked_container_dbs(self):

    def run_test(num_locks, catch_503):
        container = 'container-%s' % uuid4()
        client.put_container(self.url, self.token, container)
        # Get the container info into memcache (so no stray
        # get_container_info calls muck up our timings)
        client.get_container(self.url, self.token, container)
        db_files = self.get_container_db_files(container)
        db_conns = []
        for i in range(num_locks):
            db_conn = connect(db_files[i])
            db_conn.execute('begin exclusive transaction')
            db_conns.append(db_conn)
        if catch_503:
            try:
                client.delete_container(self.url, self.token, container)
            except client.ClientException as err:
                self.assertEqual(err.http_status, 503)
            else:
                self.fail("Expected ClientException but didn't get it")
        else:
            client.delete_container(self.url, self.token, container)

    proxy_conf = readconf(self.configs['proxy-server'],
                          section_name='app:proxy-server')
    node_timeout = int(proxy_conf.get('node_timeout', 10))
    pool = GreenPool()
    try:
        with Timeout(node_timeout + 5):
            pool.spawn(run_test, 1, False)
            pool.spawn(run_test, 2, True)
            pool.spawn(run_test, 3, True)
            pool.waitall()
    except Timeout as err:
        raise Exception(
            "The server did not return a 503 on container db locks, "
            "it just hangs: %s" % err)
def run(self, *args, **kwargs):
    try:
        self.logger.info('conscience agent: starting')

        pool = GreenPool(len(self.watchers))
        for watcher in self.watchers:
            pool.spawn(watcher.start)

        while True:
            sleep(1)
            for w in self.watchers:
                if w.failed:
                    self.watchers.remove(w)
                    self.logger.warn('restart watcher "%s"', w.name)
                    new_w = ServiceWatcher(self.conf, w.service)
                    self.watchers.append(new_w)
                    pool.spawn(new_w.start)

    except Exception as e:
        self.logger.error('ERROR in main loop %s', e)
        raise e
    finally:
        self.logger.warn('conscience agent: stopping')
        self.stop_watchers()
class WaitPool(PoolInterface):
    def __init__(self, pool_size=1000, queue_size=1000):
        self._pool_size = int(pool_size)
        self._queue_size = int(queue_size)
        self._pool = GreenPool(self._pool_size)
        self._max_job_id = ''

    def can_spawn(self, job_id):
        if job_id <= self._max_job_id:
            return True
        if self._pool.free() > 0 or self._pool.waiting() < self._queue_size:
            self._max_job_id = job_id
            return True
        return False

    def _spawn(self, function, *args, **kwargs):
        return self._pool.spawn(function, *args, **kwargs)
class PriorityPool(PoolInterface):
    def __init__(self, low_watermark=1000, high_watermark=1000):
        self._low_watermark = int(low_watermark)
        self._high_watermark = int(high_watermark)
        self._pool = GreenPool(self._high_watermark)
        self._max_job_id = ''

    def can_spawn(self, job_id):
        if job_id <= self._max_job_id:
            return True
        if self._pool.running() < self._low_watermark:
            self._max_job_id = job_id
            return True
        return False

    def _spawn(self, function, *args, **kwargs):
        return self._pool.spawn(function, *args, **kwargs)
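'''
A hedged sketch (hypothetical job ids and handler, not from the original
source) of how the pool classes above are meant to be driven: check
can_spawn() for admission, then hand the work to _spawn(). Draining via the
internal GreenPool is only for the sake of the example.
'''
def handle_job(job_id):
    return 'handled %s' % job_id

pool = PriorityPool(low_watermark=2, high_watermark=4)
for job_id in ('job-001', 'job-002', 'job-003'):
    # job-003 is refused once two jobs are already running
    if pool.can_spawn(job_id):
        pool._spawn(handle_job, job_id)
pool._pool.waitall()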
class ObjectReplicator(Daemon): """ Replicate objects. Encapsulates most logic and data needed by the object replication process. Each call to .replicate() performs one replication pass. It's up to the caller to do this in a loop. """ def __init__(self, conf, logger=None): """ :param conf: configuration object obtained from ConfigParser :param logger: logging object """ self.conf = conf self.logger = PrefixLoggerAdapter( logger or get_logger(conf, log_route='object-replicator'), {}) self.devices_dir = conf.get('devices', '/srv/node') self.mount_check = config_true_value(conf.get('mount_check', 'true')) self.swift_dir = conf.get('swift_dir', '/etc/swift') self.bind_ip = conf.get('bind_ip', '0.0.0.0') self.servers_per_port = int(conf.get('servers_per_port', '0') or 0) self.port = None if self.servers_per_port else \ int(conf.get('bind_port', 6200)) self.concurrency = int(conf.get('concurrency', 1)) self.replicator_workers = int(conf.get('replicator_workers', 0)) self.stats_interval = int(conf.get('stats_interval', '300')) self.ring_check_interval = int(conf.get('ring_check_interval', 15)) self.next_check = time.time() + self.ring_check_interval self.replication_cycle = random.randint(0, 9) self.partition_times = [] self.interval = int(conf.get('interval') or conf.get('run_pause') or 30) if 'run_pause' in conf and 'interval' not in conf: self.logger.warning('Option object-replicator/run_pause ' 'is deprecated and will be removed in a ' 'future version. Update your configuration' ' to use option object-replicator/' 'interval.') self.rsync_timeout = int(conf.get('rsync_timeout', DEFAULT_RSYNC_TIMEOUT)) self.rsync_io_timeout = conf.get('rsync_io_timeout', '30') self.rsync_bwlimit = conf.get('rsync_bwlimit', '0') self.rsync_compress = config_true_value( conf.get('rsync_compress', 'no')) self.rsync_module = conf.get('rsync_module', '').rstrip('/') if not self.rsync_module: self.rsync_module = '{replication_ip}::object' self.http_timeout = int(conf.get('http_timeout', 60)) self.recon_cache_path = conf.get('recon_cache_path', '/var/cache/swift') self.rcache = os.path.join(self.recon_cache_path, "object.recon") self._next_rcache_update = time.time() + self.stats_interval self.conn_timeout = float(conf.get('conn_timeout', 0.5)) self.node_timeout = float(conf.get('node_timeout', 10)) self.sync_method = getattr(self, conf.get('sync_method') or 'rsync') self.network_chunk_size = int(conf.get('network_chunk_size', 65536)) self.default_headers = { 'Content-Length': '0', 'user-agent': 'object-replicator %s' % os.getpid()} self.rsync_error_log_line_length = \ int(conf.get('rsync_error_log_line_length', 0)) self.handoffs_first = config_true_value(conf.get('handoffs_first', False)) self.handoff_delete = config_auto_int_value( conf.get('handoff_delete', 'auto'), 0) if any((self.handoff_delete, self.handoffs_first)): self.logger.warning('Handoff only mode is not intended for normal ' 'operation, please disable handoffs_first and ' 'handoff_delete before the next ' 'normal rebalance') self.is_multiprocess_worker = None self._df_router = DiskFileRouter(conf, self.logger) self._child_process_reaper_queue = queue.LightQueue() def _zero_stats(self): self.stats_for_dev = defaultdict(Stats) @property def total_stats(self): return sum(self.stats_for_dev.values(), Stats()) def _emplace_log_prefix(self, worker_index): self.logger.set_prefix("[worker %d/%d pid=%d] " % ( worker_index + 1, # use 1-based indexing for more readable logs self.replicator_workers, os.getpid())) def _get_my_replication_ips(self): 
my_replication_ips = set() ips = whataremyips() for policy in POLICIES: self.load_object_ring(policy) for local_dev in [dev for dev in policy.object_ring.devs if dev and dev['replication_ip'] in ips and dev['replication_port'] == self.port]: my_replication_ips.add(local_dev['replication_ip']) return list(my_replication_ips) def _child_process_reaper(self): """ Consume processes from self._child_process_reaper_queue and wait() for them """ procs = set() done = False while not done: timeout = 60 if procs else None try: new_proc = self._child_process_reaper_queue.get( timeout=timeout) if new_proc is not None: procs.add(new_proc) else: done = True except queue.Empty: pass reaped_procs = set() for proc in procs: # this will reap the process if it has exited, but # otherwise will not wait if proc.poll() is not None: reaped_procs.add(proc) procs -= reaped_procs def get_worker_args(self, once=False, **kwargs): if self.replicator_workers < 1: return [] override_opts = parse_override_options(once=once, **kwargs) have_overrides = bool(override_opts.devices or override_opts.partitions or override_opts.policies) # save this off for ring-change detection later in is_healthy() self.all_local_devices = self.get_local_devices() if override_opts.devices: devices_to_replicate = [ d for d in override_opts.devices if d in self.all_local_devices] else: # The sort isn't strictly necessary since we're just trying to # spread devices around evenly, but it makes testing easier. devices_to_replicate = sorted(self.all_local_devices) # Distribute devices among workers as evenly as possible self.replicator_workers = min(self.replicator_workers, len(devices_to_replicate)) return [{'override_devices': devs, 'override_partitions': override_opts.partitions, 'override_policies': override_opts.policies, 'have_overrides': have_overrides, 'multiprocess_worker_index': index} for index, devs in enumerate( distribute_evenly(devices_to_replicate, self.replicator_workers))] def is_healthy(self): """ Check whether our set of local devices remains the same. If devices have been added or removed, then we return False here so that we can kill off any worker processes and then distribute the new set of local devices across a new set of workers so that all devices are, once again, being worked on. This function may also cause recon stats to be updated. :returns: False if any local devices have been added or removed, True otherwise """ # We update recon here because this is the only function we have in # a multiprocess replicator that gets called periodically in the # parent process. if time.time() >= self._next_rcache_update: update = self.aggregate_recon_update() dump_recon_cache(update, self.rcache, self.logger) return self.get_local_devices() == self.all_local_devices def get_local_devices(self): """ Returns a set of all local devices in all replication-type storage policies. This is the device names, e.g. "sdq" or "d1234" or something, not the full ring entries. """ ips = whataremyips(self.bind_ip) local_devices = set() for policy in POLICIES: if policy.policy_type != REPL_POLICY: continue self.load_object_ring(policy) for device in policy.object_ring.devs: if device and is_local_device( ips, self.port, device['replication_ip'], device['replication_port']): local_devices.add(device['device']) return local_devices # Just exists for doc anchor point def sync(self, node, job, suffixes, *args, **kwargs): """ Synchronize local suffix directories from a partition with a remote node. 
:param node: the "dev" entry for the remote node to sync with :param job: information about the partition being synced :param suffixes: a list of suffixes which need to be pushed :returns: boolean and dictionary, boolean indicating success or failure """ return self.sync_method(node, job, suffixes, *args, **kwargs) def load_object_ring(self, policy): """ Make sure the policy's rings are loaded. :param policy: the StoragePolicy instance :returns: appropriate ring object """ policy.load_ring(self.swift_dir) return policy.object_ring def _limit_rsync_log(self, line): """ If rsync_error_log_line_length is defined then limit the error to that length :param line: rsync log line :return: If enabled the line limited to rsync_error_log_line_length otherwise the initial line. """ if self.rsync_error_log_line_length: return line[:self.rsync_error_log_line_length] return line def _rsync(self, args): """ Execute the rsync binary to replicate a partition. :returns: return code of rsync process. 0 is successful """ start_time = time.time() proc = None try: with Timeout(self.rsync_timeout): proc = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) results = proc.stdout.read() ret_val = proc.wait() except Timeout: self.logger.error( self._limit_rsync_log( _("Killing long-running rsync: %s") % str(args))) if proc: proc.kill() try: # Note: Python 2.7's subprocess.Popen class doesn't take # any arguments for wait(), but Python 3's does. # However, Eventlet's replacement Popen takes a timeout # argument regardless of Python version, so we don't # need any conditional code here. proc.wait(timeout=1.0) except subprocess.TimeoutExpired: # Sometimes a process won't die immediately even after a # SIGKILL. This can be due to failing disks, high load, # or other reasons. We can't wait for it forever since # we're taking up a slot in the (green)thread pool, so # we send it over to another greenthread, not part of # our pool, whose sole duty is to wait for child # processes to exit. self._child_process_reaper_queue.put(proc) return 1 # failure response code total_time = time.time() - start_time for result in results.split('\n'): if result == '': continue if result.startswith('cd+'): continue if not ret_val: self.logger.info(result) else: self.logger.error(result) if ret_val: self.logger.error( self._limit_rsync_log( _('Bad rsync return code: %(ret)d <- %(args)s') % {'args': str(args), 'ret': ret_val})) else: log_method = self.logger.info if results else self.logger.debug log_method( _("Successful rsync of %(src)s at %(dst)s (%(time).03f)"), {'src': args[-2], 'dst': args[-1], 'time': total_time}) return ret_val def rsync(self, node, job, suffixes): """ Uses rsync to implement the sync method. This was the first sync method in Swift. """ if not os.path.exists(job['path']): return False, {} args = [ 'rsync', '--recursive', '--whole-file', '--human-readable', '--xattrs', '--itemize-changes', '--ignore-existing', '--timeout=%s' % self.rsync_io_timeout, '--contimeout=%s' % self.rsync_io_timeout, '--bwlimit=%s' % self.rsync_bwlimit, '--exclude=.*.%s' % ''.join('[0-9a-zA-Z]' for i in range(6)) ] if self.rsync_compress and \ job['region'] != node['region']: # Allow for compression, but only if the remote node is in # a different region than the local one. 
args.append('--compress') rsync_module = rsync_module_interpolation(self.rsync_module, node) had_any = False for suffix in suffixes: spath = join(job['path'], suffix) if os.path.exists(spath): args.append(spath) had_any = True if not had_any: return False, {} data_dir = get_data_dir(job['policy']) args.append(join(rsync_module, node['device'], data_dir, job['partition'])) return self._rsync(args) == 0, {} def ssync(self, node, job, suffixes, remote_check_objs=None): return ssync_sender.Sender( self, node, job, suffixes, remote_check_objs)() def check_ring(self, object_ring): """ Check to see if the ring has been updated :param object_ring: the ring to check :returns: boolean indicating whether or not the ring has changed """ if time.time() > self.next_check: self.next_check = time.time() + self.ring_check_interval if object_ring.has_changed(): return False return True def update_deleted(self, job): """ High-level method that replicates a single partition that doesn't belong on this node. :param job: a dict containing info about the partition to be replicated """ def tpool_get_suffixes(path): return [suff for suff in os.listdir(path) if len(suff) == 3 and isdir(join(path, suff))] stats = self.stats_for_dev[job['device']] stats.attempted += 1 self.logger.increment('partition.delete.count.%s' % (job['device'],)) headers = dict(self.default_headers) headers['X-Backend-Storage-Policy-Index'] = int(job['policy']) failure_devs_info = set() begin = time.time() handoff_partition_deleted = False try: responses = [] suffixes = tpool.execute(tpool_get_suffixes, job['path']) synced_remote_regions = {} delete_objs = None if suffixes: for node in job['nodes']: stats.rsync += 1 kwargs = {} if node['region'] in synced_remote_regions and \ self.conf.get('sync_method', 'rsync') == 'ssync': kwargs['remote_check_objs'] = \ synced_remote_regions[node['region']] # candidates is a dict(hash=>timestamp) of objects # for deletion success, candidates = self.sync( node, job, suffixes, **kwargs) if success: with Timeout(self.http_timeout): conn = http_connect( node['replication_ip'], node['replication_port'], node['device'], job['partition'], 'REPLICATE', '/' + '-'.join(suffixes), headers=headers) conn.getresponse().read() if node['region'] != job['region']: synced_remote_regions[node['region']] = viewkeys( candidates) else: failure_devs_info.add((node['replication_ip'], node['device'])) responses.append(success) for cand_objs in synced_remote_regions.values(): if delete_objs is None: delete_objs = cand_objs else: delete_objs = delete_objs & cand_objs if self.handoff_delete: # delete handoff if we have had handoff_delete successes delete_handoff = len([resp for resp in responses if resp]) >= \ self.handoff_delete else: # delete handoff if all syncs were successful delete_handoff = len(responses) == len(job['nodes']) and \ all(responses) if delete_handoff: stats.remove += 1 if (self.conf.get('sync_method', 'rsync') == 'ssync' and delete_objs is not None): self.logger.info(_("Removing %s objects"), len(delete_objs)) _junk, error_paths = self.delete_handoff_objs( job, delete_objs) # if replication works for a hand-off device and it failed, # the remote devices which are target of the replication # from the hand-off device will be marked. Because cleanup # after replication failed means replicator needs to # replicate again with the same info. 
if error_paths: failure_devs_info.update( [(failure_dev['replication_ip'], failure_dev['device']) for failure_dev in job['nodes']]) else: self.delete_partition(job['path']) handoff_partition_deleted = True elif not suffixes: self.delete_partition(job['path']) handoff_partition_deleted = True except (Exception, Timeout): self.logger.exception(_("Error syncing handoff partition")) stats.add_failure_stats(failure_devs_info) finally: target_devs_info = set([(target_dev['replication_ip'], target_dev['device']) for target_dev in job['nodes']]) stats.success += len(target_devs_info - failure_devs_info) if not handoff_partition_deleted: self.handoffs_remaining += 1 self.partition_times.append(time.time() - begin) self.logger.timing_since('partition.delete.timing', begin) def delete_partition(self, path): self.logger.info(_("Removing partition: %s"), path) try: tpool.execute(shutil.rmtree, path) except OSError as e: if e.errno not in (errno.ENOENT, errno.ENOTEMPTY): # If there was a race to create or delete, don't worry raise def delete_handoff_objs(self, job, delete_objs): success_paths = [] error_paths = [] for object_hash in delete_objs: object_path = storage_directory(job['obj_path'], job['partition'], object_hash) tpool.execute(shutil.rmtree, object_path, ignore_errors=True) suffix_dir = dirname(object_path) try: os.rmdir(suffix_dir) success_paths.append(object_path) except OSError as e: if e.errno not in (errno.ENOENT, errno.ENOTEMPTY): error_paths.append(object_path) self.logger.exception( "Unexpected error trying to cleanup suffix dir:%r", suffix_dir) return success_paths, error_paths def update(self, job): """ High-level method that replicates a single partition. :param job: a dict containing info about the partition to be replicated """ stats = self.stats_for_dev[job['device']] stats.attempted += 1 self.logger.increment('partition.update.count.%s' % (job['device'],)) headers = dict(self.default_headers) headers['X-Backend-Storage-Policy-Index'] = int(job['policy']) target_devs_info = set() failure_devs_info = set() begin = time.time() df_mgr = self._df_router[job['policy']] try: hashed, local_hash = tpool.execute( df_mgr._get_hashes, job['device'], job['partition'], job['policy'], do_listdir=_do_listdir( int(job['partition']), self.replication_cycle)) stats.suffix_hash += hashed self.logger.update_stats('suffix.hashes', hashed) attempts_left = len(job['nodes']) synced_remote_regions = set() random.shuffle(job['nodes']) nodes = itertools.chain( job['nodes'], job['policy'].object_ring.get_more_nodes( int(job['partition']))) while attempts_left > 0: # If this throws StopIteration it will be caught way below node = next(nodes) target_devs_info.add((node['replication_ip'], node['device'])) attempts_left -= 1 # if we have already synced to this remote region, # don't sync again on this replication pass if node['region'] in synced_remote_regions: continue try: with Timeout(self.http_timeout): resp = http_connect( node['replication_ip'], node['replication_port'], node['device'], job['partition'], 'REPLICATE', '', headers=headers).getresponse() if resp.status == HTTP_INSUFFICIENT_STORAGE: self.logger.error( _('%(replication_ip)s/%(device)s ' 'responded as unmounted'), node) attempts_left += 1 failure_devs_info.add((node['replication_ip'], node['device'])) continue if resp.status != HTTP_OK: self.logger.error(_("Invalid response %(resp)s " "from %(ip)s"), {'resp': resp.status, 'ip': node['replication_ip']}) failure_devs_info.add((node['replication_ip'], node['device'])) continue remote_hash = 
pickle.loads(resp.read()) del resp suffixes = [suffix for suffix in local_hash if local_hash[suffix] != remote_hash.get(suffix, -1)] if not suffixes: stats.hashmatch += 1 continue hashed, recalc_hash = tpool.execute( df_mgr._get_hashes, job['device'], job['partition'], job['policy'], recalculate=suffixes) self.logger.update_stats('suffix.hashes', hashed) local_hash = recalc_hash suffixes = [suffix for suffix in local_hash if local_hash[suffix] != remote_hash.get(suffix, -1)] stats.rsync += 1 success, _junk = self.sync(node, job, suffixes) with Timeout(self.http_timeout): conn = http_connect( node['replication_ip'], node['replication_port'], node['device'], job['partition'], 'REPLICATE', '/' + '-'.join(suffixes), headers=headers) conn.getresponse().read() if not success: failure_devs_info.add((node['replication_ip'], node['device'])) # add only remote region when replicate succeeded if success and node['region'] != job['region']: synced_remote_regions.add(node['region']) stats.suffix_sync += len(suffixes) self.logger.update_stats('suffix.syncs', len(suffixes)) except (Exception, Timeout): failure_devs_info.add((node['replication_ip'], node['device'])) self.logger.exception(_("Error syncing with node: %s") % node) stats.suffix_count += len(local_hash) except StopIteration: self.logger.error('Ran out of handoffs while replicating ' 'partition %s of policy %d', job['partition'], int(job['policy'])) except (Exception, Timeout): failure_devs_info.update(target_devs_info) self.logger.exception(_("Error syncing partition")) finally: stats.add_failure_stats(failure_devs_info) stats.success += len(target_devs_info - failure_devs_info) self.partition_times.append(time.time() - begin) self.logger.timing_since('partition.update.timing', begin) def stats_line(self): """ Logs various stats for the currently running replication pass. """ stats = self.total_stats replication_count = stats.attempted if replication_count > self.last_replication_count: self.last_replication_count = replication_count elapsed = (time.time() - self.start) or 0.000001 rate = replication_count / elapsed self.logger.info( _("%(replicated)d/%(total)d (%(percentage).2f%%)" " partitions replicated in %(time).2fs (%(rate).2f/sec, " "%(remaining)s remaining)"), {'replicated': replication_count, 'total': self.job_count, 'percentage': replication_count * 100.0 / self.job_count, 'time': time.time() - self.start, 'rate': rate, 'remaining': '%d%s' % compute_eta(self.start, replication_count, self.job_count)}) self.logger.info(_('%(success)s successes, %(failure)s failures') % dict(success=stats.success, failure=stats.failure)) if stats.suffix_count: self.logger.info( _("%(checked)d suffixes checked - " "%(hashed).2f%% hashed, %(synced).2f%% synced"), {'checked': stats.suffix_count, 'hashed': (stats.suffix_hash * 100.0) / stats.suffix_count, 'synced': (stats.suffix_sync * 100.0) / stats.suffix_count}) self.partition_times.sort() self.logger.info( _("Partition times: max %(max).4fs, " "min %(min).4fs, med %(med).4fs"), {'max': self.partition_times[-1], 'min': self.partition_times[0], 'med': self.partition_times[ len(self.partition_times) // 2]}) else: self.logger.info( _("Nothing replicated for %s seconds."), (time.time() - self.start)) def heartbeat(self): """ Loop that runs in the background during replication. It periodically logs progress. 
""" while True: eventlet.sleep(self.stats_interval) self.stats_line() def build_replication_jobs(self, policy, ips, override_devices=None, override_partitions=None): """ Helper function for collect_jobs to build jobs for replication using replication style storage policy """ jobs = [] df_mgr = self._df_router[policy] self.all_devs_info.update( [(dev['replication_ip'], dev['device']) for dev in policy.object_ring.devs if dev]) data_dir = get_data_dir(policy) found_local = False for local_dev in [dev for dev in policy.object_ring.devs if (dev and is_local_device(ips, self.port, dev['replication_ip'], dev['replication_port']) and (override_devices is None or dev['device'] in override_devices))]: found_local = True local_dev_stats = self.stats_for_dev[local_dev['device']] try: dev_path = check_drive(self.devices_dir, local_dev['device'], self.mount_check) except ValueError as err: local_dev_stats.add_failure_stats( [(failure_dev['replication_ip'], failure_dev['device']) for failure_dev in policy.object_ring.devs if failure_dev]) self.logger.warning("%s", err) continue obj_path = join(dev_path, data_dir) tmp_path = join(dev_path, get_tmp_dir(policy)) unlink_older_than(tmp_path, time.time() - df_mgr.reclaim_age) if not os.path.exists(obj_path): try: mkdirs(obj_path) except Exception: self.logger.exception('ERROR creating %s' % obj_path) continue for partition in os.listdir(obj_path): if (override_partitions is not None and partition.isdigit() and int(partition) not in override_partitions): continue if (partition.startswith('auditor_status_') and partition.endswith('.json')): # ignore auditor status files continue part_nodes = None try: job_path = join(obj_path, partition) part_nodes = policy.object_ring.get_part_nodes( int(partition)) nodes = [node for node in part_nodes if node['id'] != local_dev['id']] jobs.append( dict(path=job_path, device=local_dev['device'], obj_path=obj_path, nodes=nodes, delete=len(nodes) > len(part_nodes) - 1, policy=policy, partition=partition, region=local_dev['region'])) except ValueError: if part_nodes: local_dev_stats.add_failure_stats( [(failure_dev['replication_ip'], failure_dev['device']) for failure_dev in nodes]) else: local_dev_stats.add_failure_stats( [(failure_dev['replication_ip'], failure_dev['device']) for failure_dev in policy.object_ring.devs if failure_dev]) continue if not found_local: self.logger.error("Can't find itself in policy with index %d with" " ips %s and with port %s in ring file, not" " replicating", int(policy), ", ".join(ips), self.port) return jobs def collect_jobs(self, override_devices=None, override_partitions=None, override_policies=None): """ Returns a sorted list of jobs (dictionaries) that specify the partitions, nodes, etc to be rsynced. :param override_devices: if set, only jobs on these devices will be returned :param override_partitions: if set, only jobs on these partitions will be returned :param override_policies: if set, only jobs in these storage policies will be returned """ jobs = [] ips = whataremyips(self.bind_ip) for policy in POLICIES: # Skip replication if next_part_power is set. In this case # every object is hard-linked twice, but the replicator can't # detect them and would create a second copy of the file if not # yet existing - and this might double the actual transferred # and stored data next_part_power = getattr( policy.object_ring, 'next_part_power', None) if next_part_power is not None: self.logger.warning( _("next_part_power set in policy '%s'. 
Skipping"), policy.name) continue if policy.policy_type == REPL_POLICY: if (override_policies is not None and policy.idx not in override_policies): continue # ensure rings are loaded for policy self.load_object_ring(policy) jobs += self.build_replication_jobs( policy, ips, override_devices=override_devices, override_partitions=override_partitions) random.shuffle(jobs) if self.handoffs_first: # Move the handoff parts to the front of the list jobs.sort(key=lambda job: not job['delete']) self.job_count = len(jobs) return jobs def replicate(self, override_devices=None, override_partitions=None, override_policies=None, start_time=None): """Run a replication pass""" if start_time is None: start_time = time.time() self.start = start_time self.last_replication_count = 0 self.replication_cycle = (self.replication_cycle + 1) % 10 self.partition_times = [] self.my_replication_ips = self._get_my_replication_ips() self.all_devs_info = set() self.handoffs_remaining = 0 stats = eventlet.spawn(self.heartbeat) eventlet.sleep() # Give spawns a cycle current_nodes = None dev_stats = None num_jobs = 0 try: self.run_pool = GreenPool(size=self.concurrency) jobs = self.collect_jobs(override_devices=override_devices, override_partitions=override_partitions, override_policies=override_policies) for job in jobs: dev_stats = self.stats_for_dev[job['device']] num_jobs += 1 current_nodes = job['nodes'] try: check_drive(self.devices_dir, job['device'], self.mount_check) except ValueError as err: dev_stats.add_failure_stats([ (failure_dev['replication_ip'], failure_dev['device']) for failure_dev in job['nodes']]) self.logger.warning("%s", err) continue if self.handoffs_first and not job['delete']: # in handoffs first mode, we won't process primary # partitions until rebalance was successful! if self.handoffs_remaining: self.logger.warning(_( "Handoffs first mode still has handoffs " "remaining. Aborting current " "replication pass.")) break if not self.check_ring(job['policy'].object_ring): self.logger.info(_("Ring change detected. Aborting " "current replication pass.")) return try: if isfile(job['path']): # Clean up any (probably zero-byte) files where a # partition should be. self.logger.warning( 'Removing partition directory ' 'which was a file: %s', job['path']) os.remove(job['path']) continue except OSError: continue if job['delete']: self.run_pool.spawn(self.update_deleted, job) else: self.run_pool.spawn(self.update, job) current_nodes = None self.run_pool.waitall() except (Exception, Timeout) as err: if dev_stats: if current_nodes: dev_stats.add_failure_stats( [(failure_dev['replication_ip'], failure_dev['device']) for failure_dev in current_nodes]) else: dev_stats.add_failure_stats(self.all_devs_info) self.logger.exception( _("Exception in top-level replication loop: %s"), err) finally: stats.kill() self.stats_line() def update_recon(self, total, end_time, override_devices): # Called at the end of a replication pass to update recon stats. if self.is_multiprocess_worker: # If it weren't for the failure_nodes field, we could do this as # a bunch of shared memory using multiprocessing.Value, which # would be nice because it'd avoid dealing with existing data # during an upgrade. 
update = { 'object_replication_per_disk': { od: {'replication_stats': self.stats_for_dev[od].to_recon(), 'replication_time': total, 'replication_last': end_time, 'object_replication_time': total, 'object_replication_last': end_time} for od in override_devices}} else: update = {'replication_stats': self.total_stats.to_recon(), 'replication_time': total, 'replication_last': end_time, 'object_replication_time': total, 'object_replication_last': end_time} dump_recon_cache(update, self.rcache, self.logger) def aggregate_recon_update(self): per_disk_stats = load_recon_cache(self.rcache).get( 'object_replication_per_disk', {}) recon_update = {} min_repl_last = float('inf') min_repl_time = float('inf') # If every child has reported some stats, then aggregate things. if all(ld in per_disk_stats for ld in self.all_local_devices): aggregated = Stats() for device_name, data in per_disk_stats.items(): aggregated += Stats.from_recon(data['replication_stats']) min_repl_time = min( min_repl_time, data['object_replication_time']) min_repl_last = min( min_repl_last, data['object_replication_last']) recon_update['replication_stats'] = aggregated.to_recon() recon_update['replication_last'] = min_repl_last recon_update['replication_time'] = min_repl_time recon_update['object_replication_last'] = min_repl_last recon_update['object_replication_time'] = min_repl_time # Clear out entries for old local devices that we no longer have devices_to_remove = set(per_disk_stats) - set(self.all_local_devices) if devices_to_remove: recon_update['object_replication_per_disk'] = { dtr: {} for dtr in devices_to_remove} return recon_update def run_once(self, multiprocess_worker_index=None, have_overrides=False, *args, **kwargs): if multiprocess_worker_index is not None: self.is_multiprocess_worker = True self._emplace_log_prefix(multiprocess_worker_index) rsync_reaper = eventlet.spawn(self._child_process_reaper) self._zero_stats() self.logger.info(_("Running object replicator in script mode.")) override_opts = parse_override_options(once=True, **kwargs) devices = override_opts.devices or None partitions = override_opts.partitions or None policies = override_opts.policies or None start_time = time.time() self.replicate( override_devices=devices, override_partitions=partitions, override_policies=policies, start_time=start_time) end_time = time.time() total = (end_time - start_time) / 60 self.logger.info( _("Object replication complete (once). (%.02f minutes)"), total) # If we've been manually run on a subset of # policies/devices/partitions, then our recon stats are not # representative of how replication is doing, so we don't publish # them. 
if self.is_multiprocess_worker: # The main process checked for overrides and determined that # there were none should_update_recon = not have_overrides else: # We are single-process, so update recon only if we worked on # everything should_update_recon = not (partitions or devices or policies) if should_update_recon: self.update_recon(total, end_time, devices) # Give rsync processes one last chance to exit, then bail out and # let them be init's problem self._child_process_reaper_queue.put(None) rsync_reaper.wait() def run_forever(self, multiprocess_worker_index=None, override_devices=None, *args, **kwargs): if multiprocess_worker_index is not None: self.is_multiprocess_worker = True self._emplace_log_prefix(multiprocess_worker_index) self.logger.info(_("Starting object replicator in daemon mode.")) eventlet.spawn_n(self._child_process_reaper) # Run the replicator continually while True: self._zero_stats() self.logger.info(_("Starting object replication pass.")) # Run the replicator start = time.time() self.replicate(override_devices=override_devices) end = time.time() total = (end - start) / 60 self.logger.info( _("Object replication complete. (%.02f minutes)"), total) self.update_recon(total, end, override_devices) self.logger.debug('Replication sleeping for %s seconds.', self.interval) sleep(self.interval) def post_multiprocess_run(self): # This method is called after run_once using multiple workers. update = self.aggregate_recon_update() dump_recon_cache(update, self.rcache, self.logger)
class Ldp(object): LISTEN_PORT = 646 MULTICAST_ADDRESS = '224.0.0.2' def __init__(self, listen_ip): self.listen_ip = listen_ip self.running = False self.socket = None self.eventlets = [] self.last_message_id = 0 def get_message_id(self): self.last_message_id += 1 return self.last_message_id def run(self): self.running = True self.pool = GreenPool() self.eventlets = [] self.eventlets.append(self.pool.spawn(self.handle_packets_in)) self.eventlets.append(self.pool.spawn(self.hello_timer)) self.eventlets.append(self.pool.spawn(self.run_tcp_handler)) self.pool.waitall() def run_tcp_handler(self): print("Starting TCP socket on %s:%s" % (self.listen_ip, self.LISTEN_PORT)) self.stream_server = StreamServer((self.listen_ip, self.LISTEN_PORT), self.handle_tcp) self.stream_server.serve_forever() def handle_tcp(self, socket, address): peer_ip, peer_port = address messages_sent = 0 print("Got connection from %s:%s" % (peer_ip, peer_port)) input_stream = socket.makefile(mode="rb") chopper = Chopper(4, 2, 0, input_stream) state_machine = LdpStateMachine(self.listen_ip, peer_ip) try: while True: sleep(0) serialised_pdu = chopper.next() print("Got PDU from %s:%s" % (peer_ip, peer_port)) pdu = parse_ldp_pdu(serialised_pdu) messages = pdu.messages for message in messages: outbound_messages = state_machine.message_received(message) outbound_pdus = [] for outbound_message in outbound_messages: outbound_message.message_id = self.get_message_id() print("Sending message %s" % outbound_message) pdu = LdpPdu(1, self.listen_ip, 0, [outbound_message.pack()]) outbound_pdus.append(pdu) for pdu in outbound_pdus: socket.send(pdu.pack()) if state_machine.state == "NONEXISTENT": break except (SocketClosedError, StopIteration) as e: print("Socket closed from %s:%s" % (peer_ip, peer_port)) print("Closing socket with %s:%s" % (peer_ip, peer_port)) socket.close() def handle_packets_in(self): self.multicast_socket = MulticastSocket(self.MULTICAST_ADDRESS, self.LISTEN_PORT, self.listen_ip) self.multicast_socket.bind() try: while self.running: sleep(1) while True: data, address = self.multicast_socket.recv(4096, 10) if not data: break pdu = parse_ldp_pdu(data) messages = pdu.messages if len(messages) > 1: print( "Weird... got PDU from %s with lots of messages: " % (address, messages)) continue message = messages[0] if not isinstance(message, LdpHelloMessage): print( "Got message from %s but it isn't a hello message: %s" % (address, message)) continue print("Got hello message from %s ID %s" % (address, message.message_id)) except OSError: pass def hello_timer(self): next_timer_at = int(time()) while self.running: sleep(1) if int(time()) > next_timer_at: self.send_hello(self.get_message_id()) next_timer_at += 5 def send_hello(self, message_id): print("Sending hello message") tlvs = {0x0401: build_byte_string("ac1a016a")} message = LdpHelloMessage(message_id, 15, False, False, tlvs) pdu = LdpPdu(1, self.listen_ip, 0, [message.pack()]) if self.multicast_socket: self.multicast_socket.send(pdu.pack()) else: print("Not sending; UDP socket dead") def shutdown(self): self.running = False self.multicast_socket.shutdown() for eventlet in self.eventlets: eventlet.kill()
''' running count() method in a native threading manner '''
class CountThread(threading.Thread):
    def run(self):
        count()

print "running count() as two threads"
c1 = CountThread()
c2 = CountThread()
start_time = datetime.datetime.now()
c1.start()
c2.start()
c1.join()
c2.join()
end_time = datetime.datetime.now()
print end_time - start_time

''' running count() in a green threading manner '''
print "running count() as two green threads"
start_time = datetime.datetime.now()
pool = GreenPool()
# spawn() expects the callable itself; spawn(count()) would call count()
# synchronously and hand its return value to the pool
pool.spawn(count)
pool.spawn(count)
# wait for both green threads to finish before taking the end timestamp
pool.waitall()
end_time = datetime.datetime.now()
print end_time - start_time
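# Illustrative aside, not part of the snippet above: GreenPool.spawn takes the
# callable plus its arguments, and green threads only interleave at cooperative
# yield points, so a purely CPU-bound count() gains nothing from GreenPool.
# The hypothetical io_ish_task below yields via eventlet.sleep(), so two of
# them overlap and the pair finishes in roughly the time of one.
import eventlet
from eventlet import GreenPool

def io_ish_task(name, delay=0.1):
    for _ in range(5):
        eventlet.sleep(delay)  # yield to the hub so other green threads run
    return name

demo_pool = GreenPool()
g1 = demo_pool.spawn(io_ish_task, 'a')
g2 = demo_pool.spawn(io_ish_task, 'b')
demo_pool.waitall()            # both done after ~0.5s, not ~1.0s
print(g1.wait(), g2.wait())    # wait() returns each task's result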
def reap_container(self, account, account_partition, account_nodes, container): """ Deletes the data and the container itself for the given container. This will call :func:`reap_object` up to sqrt(self.concurrency) times concurrently for the objects in the container. If there is any exception while deleting a single object, the process will continue for any other objects in the container and the failed objects will be tried again the next time this function is called with the same parameters. If there is any exception while listing the objects for deletion, the process will stop (but will obviously be tried again the next time this function is called with the same parameters). This is a possibility since the listing comes from querying just the primary remote container server. Once all objects have been attempted to be deleted, the container itself will be attempted to be deleted by sending a delete request to all container nodes. The format of the delete request is such that each container server will update a corresponding account server, removing the container from the account's listing. This function returns nothing and should raise no exception but only update various self.stats_* values for what occurs. :param account: The name of the account for the container. :param account_partition: The partition for the account on the account ring. :param account_nodes: The primary node dicts for the account. :param container: The name of the container to delete. * See also: :func:`swift.common.ring.Ring.get_nodes` for a description of the account node dicts. """ account_nodes = list(account_nodes) part, nodes = self.get_container_ring().get_nodes(account, container) node = nodes[-1] pool = GreenPool(size=self.object_concurrency) marker = '' while True: objects = None try: headers, objects = direct_get_container( node, part, account, container, marker=marker, conn_timeout=self.conn_timeout, response_timeout=self.node_timeout) self.stats_return_codes[2] = \ self.stats_return_codes.get(2, 0) + 1 self.logger.increment('return_codes.2') except ClientException as err: if self.logger.getEffectiveLevel() <= DEBUG: self.logger.exception( _('Exception with %(ip)s:%(port)s/%(device)s'), node) self.stats_return_codes[err.http_status / 100] = \ self.stats_return_codes.get(err.http_status / 100, 0) + 1 self.logger.increment( 'return_codes.%d' % (err.http_status / 100,)) if not objects: break try: policy_index = headers.get('X-Backend-Storage-Policy-Index', 0) for obj in objects: if isinstance(obj['name'], unicode): obj['name'] = obj['name'].encode('utf8') pool.spawn(self.reap_object, account, container, part, nodes, obj['name'], policy_index) pool.waitall() except (Exception, Timeout): self.logger.exception(_('Exception with objects for container ' '%(container)s for account %(account)s' ), {'container': container, 'account': account}) marker = objects[-1]['name'] if marker == '': break successes = 0 failures = 0 for node in nodes: anode = account_nodes.pop() try: direct_delete_container( node, part, account, container, conn_timeout=self.conn_timeout, response_timeout=self.node_timeout, headers={'X-Account-Host': '%(ip)s:%(port)s' % anode, 'X-Account-Partition': str(account_partition), 'X-Account-Device': anode['device'], 'X-Account-Override-Deleted': 'yes'}) successes += 1 self.stats_return_codes[2] = \ self.stats_return_codes.get(2, 0) + 1 self.logger.increment('return_codes.2') except ClientException as err: if self.logger.getEffectiveLevel() <= DEBUG: self.logger.exception( _('Exception with 
%(ip)s:%(port)s/%(device)s'), node) failures += 1 self.logger.increment('containers_failures') self.stats_return_codes[err.http_status / 100] = \ self.stats_return_codes.get(err.http_status / 100, 0) + 1 self.logger.increment( 'return_codes.%d' % (err.http_status / 100,)) if successes > failures: self.stats_containers_deleted += 1 self.logger.increment('containers_deleted') elif not successes: self.stats_containers_remaining += 1 self.logger.increment('containers_remaining') else: self.stats_containers_possibly_remaining += 1 self.logger.increment('containers_possibly_remaining')
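# Illustrative sketch of the pattern described in the reap_container docstring
# above, with hypothetical helpers list_objects(marker) and delete_object(name):
# page through the listing with a marker, fan each page out on a bounded
# GreenPool, and wait for the whole page before advancing the marker.
from eventlet import GreenPool

def reap_all(list_objects, delete_object, concurrency=4):
    pool = GreenPool(size=concurrency)
    marker = ''
    while True:
        names = list_objects(marker)         # one page of object names
        if not names:
            break
        for name in names:
            pool.spawn(delete_object, name)  # per-object failures retry next pass
        pool.waitall()                       # finish the page before moving on
        marker = names[-1]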
class ObjectReplicator(Daemon): """ Replicate objects. Encapsulates most logic and data needed by the object replication process. Each call to .replicate() performs one replication pass. It's up to the caller to do this in a loop. """ def __init__(self, conf): """ :param conf: configuration object obtained from ConfigParser :param logger: logging object """ self.conf = conf self.logger = get_logger(conf, log_route='object-replicator') self.devices_dir = conf.get('devices', '/srv/node') self.mount_check = config_true_value(conf.get('mount_check', 'true')) self.vm_test_mode = config_true_value(conf.get('vm_test_mode', 'no')) self.swift_dir = conf.get('swift_dir', '/etc/swift') self.port = int(conf.get('bind_port', 6000)) self.concurrency = int(conf.get('concurrency', 1)) self.stats_interval = int(conf.get('stats_interval', '300')) self.ring_check_interval = int(conf.get('ring_check_interval', 15)) self.next_check = time.time() + self.ring_check_interval self.reclaim_age = int(conf.get('reclaim_age', 86400 * 7)) self.partition_times = [] self.run_pause = int(conf.get('run_pause', 30)) self.rsync_timeout = int(conf.get('rsync_timeout', 900)) self.rsync_io_timeout = conf.get('rsync_io_timeout', '30') self.rsync_bwlimit = conf.get('rsync_bwlimit', '0') self.http_timeout = int(conf.get('http_timeout', 60)) self.lockup_timeout = int(conf.get('lockup_timeout', 1800)) self.recon_cache_path = conf.get('recon_cache_path', '/var/cache/swift') self.rcache = os.path.join(self.recon_cache_path, "object.recon") self.conn_timeout = float(conf.get('conn_timeout', 0.5)) self.node_timeout = float(conf.get('node_timeout', 10)) self.sync_method = getattr(self, conf.get('sync_method') or 'rsync') self.network_chunk_size = int(conf.get('network_chunk_size', 65536)) self.disk_chunk_size = int(conf.get('disk_chunk_size', 65536)) self.headers = { 'Content-Length': '0', 'user-agent': 'obj-replicator %s' % os.getpid() } self.rsync_error_log_line_length = \ int(conf.get('rsync_error_log_line_length', 0)) self.handoffs_first = config_true_value( conf.get('handoffs_first', False)) self.handoff_delete = config_auto_int_value( conf.get('handoff_delete', 'auto'), 0) self._diskfile_mgr = DiskFileManager(conf, self.logger) def sync(self, node, job, suffixes): # Just exists for doc anchor point """ Synchronize local suffix directories from a partition with a remote node. :param node: the "dev" entry for the remote node to sync with :param job: information about the partition being synced :param suffixes: a list of suffixes which need to be pushed :returns: boolean indicating success or failure """ return self.sync_method(node, job, suffixes) def get_object_ring(self, policy_idx): """ Get the ring object to use to handle a request based on its policy. :policy_idx: policy index as defined in swift.conf :returns: appropriate ring object """ return POLICIES.get_object_ring(policy_idx, self.swift_dir) def _rsync(self, args): """ Execute the rsync binary to replicate a partition. :returns: return code of rsync process. 
0 is successful """ start_time = time.time() ret_val = None try: with Timeout(self.rsync_timeout): proc = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) results = proc.stdout.read() ret_val = proc.wait() except Timeout: self.logger.error(_("Killing long-running rsync: %s"), str(args)) proc.kill() return 1 # failure response code total_time = time.time() - start_time for result in results.split('\n'): if result == '': continue if result.startswith('cd+'): continue if not ret_val: self.logger.info(result) else: self.logger.error(result) if ret_val: error_line = _('Bad rsync return code: %(ret)d <- %(args)s') % \ {'args': str(args), 'ret': ret_val} if self.rsync_error_log_line_length: error_line = error_line[:self.rsync_error_log_line_length] self.logger.error(error_line) elif results: self.logger.info( _("Successful rsync of %(src)s at %(dst)s (%(time).03f)"), { 'src': args[-2], 'dst': args[-1], 'time': total_time }) else: self.logger.debug( _("Successful rsync of %(src)s at %(dst)s (%(time).03f)"), { 'src': args[-2], 'dst': args[-1], 'time': total_time }) return ret_val def rsync(self, node, job, suffixes): """ Uses rsync to implement the sync method. This was the first sync method in Swift. """ if not os.path.exists(job['path']): return False args = [ 'rsync', '--recursive', '--whole-file', '--human-readable', '--xattrs', '--itemize-changes', '--ignore-existing', '--timeout=%s' % self.rsync_io_timeout, '--contimeout=%s' % self.rsync_io_timeout, '--bwlimit=%s' % self.rsync_bwlimit, ] node_ip = rsync_ip(node['replication_ip']) if self.vm_test_mode: rsync_module = '%s::object%s' % (node_ip, node['replication_port']) else: rsync_module = '%s::object' % node_ip had_any = False for suffix in suffixes: spath = join(job['path'], suffix) if os.path.exists(spath): args.append(spath) had_any = True if not had_any: return False data_dir = get_data_dir(job['policy_idx']) args.append( join(rsync_module, node['device'], data_dir, job['partition'])) return self._rsync(args) == 0 def ssync(self, node, job, suffixes): return ssync_sender.Sender(self, node, job, suffixes)() def check_ring(self, object_ring): """ Check to see if the ring has been updated :param object_ring: the ring to check :returns: boolean indicating whether or not the ring has changed """ if time.time() > self.next_check: self.next_check = time.time() + self.ring_check_interval if object_ring.has_changed(): return False return True def update_deleted(self, job): """ High-level method that replicates a single partition that doesn't belong on this node. 
:param job: a dict containing info about the partition to be replicated """ def tpool_get_suffixes(path): return [ suff for suff in os.listdir(path) if len(suff) == 3 and isdir(join(path, suff)) ] self.replication_count += 1 self.logger.increment('partition.delete.count.%s' % (job['device'], )) self.headers[POLICY_INDEX] = job['policy_idx'] begin = time.time() try: responses = [] suffixes = tpool.execute(tpool_get_suffixes, job['path']) if suffixes: for node in job['nodes']: success = self.sync(node, job, suffixes) if success: with Timeout(self.http_timeout): conn = http_connect(node['replication_ip'], node['replication_port'], node['device'], job['partition'], 'REPLICATE', '/' + '-'.join(suffixes), headers=self.headers) conn.getresponse().read() responses.append(success) if self.handoff_delete: # delete handoff if we have had handoff_delete successes delete_handoff = len([resp for resp in responses if resp]) >= \ self.handoff_delete else: # delete handoff if all syncs were successful delete_handoff = len(responses) == len(job['nodes']) and \ all(responses) if not suffixes or delete_handoff: self.logger.info(_("Removing partition: %s"), job['path']) tpool.execute(shutil.rmtree, job['path'], ignore_errors=True) except (Exception, Timeout): self.logger.exception(_("Error syncing handoff partition")) finally: self.partition_times.append(time.time() - begin) self.logger.timing_since('partition.delete.timing', begin) def update(self, job): """ High-level method that replicates a single partition. :param job: a dict containing info about the partition to be replicated """ self.replication_count += 1 self.logger.increment('partition.update.count.%s' % (job['device'], )) self.headers[POLICY_INDEX] = job['policy_idx'] begin = time.time() try: hashed, local_hash = tpool_reraise( get_hashes, job['path'], do_listdir=(self.replication_count % 10) == 0, reclaim_age=self.reclaim_age) self.suffix_hash += hashed self.logger.update_stats('suffix.hashes', hashed) attempts_left = len(job['nodes']) nodes = itertools.chain( job['nodes'], job['object_ring'].get_more_nodes(int(job['partition']))) while attempts_left > 0: # If this throws StopIterator it will be caught way below node = next(nodes) attempts_left -= 1 try: with Timeout(self.http_timeout): resp = http_connect( node['replication_ip'], node['replication_port'], node['device'], job['partition'], 'REPLICATE', '', headers=self.headers).getresponse() if resp.status == HTTP_INSUFFICIENT_STORAGE: self.logger.error( _('%(ip)s/%(device)s responded' ' as unmounted'), node) attempts_left += 1 continue if resp.status != HTTP_OK: self.logger.error( _("Invalid response %(resp)s " "from %(ip)s"), { 'resp': resp.status, 'ip': node['replication_ip'] }) continue remote_hash = pickle.loads(resp.read()) del resp suffixes = [ suffix for suffix in local_hash if local_hash[suffix] != remote_hash.get(suffix, -1) ] if not suffixes: continue hashed, recalc_hash = tpool_reraise( get_hashes, job['path'], recalculate=suffixes, reclaim_age=self.reclaim_age) self.logger.update_stats('suffix.hashes', hashed) local_hash = recalc_hash suffixes = [ suffix for suffix in local_hash if local_hash[suffix] != remote_hash.get(suffix, -1) ] self.sync(node, job, suffixes) with Timeout(self.http_timeout): conn = http_connect(node['replication_ip'], node['replication_port'], node['device'], job['partition'], 'REPLICATE', '/' + '-'.join(suffixes), headers=self.headers) conn.getresponse().read() self.suffix_sync += len(suffixes) self.logger.update_stats('suffix.syncs', len(suffixes)) except 
(Exception, Timeout): self.logger.exception( _("Error syncing with node: %s") % node) self.suffix_count += len(local_hash) except (Exception, Timeout): self.logger.exception(_("Error syncing partition")) finally: self.partition_times.append(time.time() - begin) self.logger.timing_since('partition.update.timing', begin) def stats_line(self): """ Logs various stats for the currently running replication pass. """ if self.replication_count: elapsed = (time.time() - self.start) or 0.000001 rate = self.replication_count / elapsed self.logger.info( _("%(replicated)d/%(total)d (%(percentage).2f%%)" " partitions replicated in %(time).2fs (%(rate).2f/sec, " "%(remaining)s remaining)"), { 'replicated': self.replication_count, 'total': self.job_count, 'percentage': self.replication_count * 100.0 / self.job_count, 'time': time.time() - self.start, 'rate': rate, 'remaining': '%d%s' % compute_eta(self.start, self.replication_count, self.job_count) }) if self.suffix_count: self.logger.info( _("%(checked)d suffixes checked - " "%(hashed).2f%% hashed, %(synced).2f%% synced"), { 'checked': self.suffix_count, 'hashed': (self.suffix_hash * 100.0) / self.suffix_count, 'synced': (self.suffix_sync * 100.0) / self.suffix_count }) self.partition_times.sort() self.logger.info( _("Partition times: max %(max).4fs, " "min %(min).4fs, med %(med).4fs"), { 'max': self.partition_times[-1], 'min': self.partition_times[0], 'med': self.partition_times[len(self.partition_times) // 2] }) else: self.logger.info(_("Nothing replicated for %s seconds."), (time.time() - self.start)) def kill_coros(self): """Utility function that kills all coroutines currently running.""" for coro in list(self.run_pool.coroutines_running): try: coro.kill(GreenletExit) except GreenletExit: pass def heartbeat(self): """ Loop that runs in the background during replication. It periodically logs progress. """ while True: eventlet.sleep(self.stats_interval) self.stats_line() def detect_lockups(self): """ In testing, the pool.waitall() call very occasionally failed to return. This is an attempt to make sure the replicator finishes its replication pass in some eventuality. """ while True: eventlet.sleep(self.lockup_timeout) if self.replication_count == self.last_replication_count: self.logger.error(_("Lockup detected.. killing live coros.")) self.kill_coros() self.last_replication_count = self.replication_count def process_repl(self, policy, jobs, ips): """ Helper function for collect_jobs to build jobs for replication using replication style storage policy """ obj_ring = self.get_object_ring(policy.idx) data_dir = get_data_dir(policy.idx) for local_dev in [ dev for dev in obj_ring.devs if dev and dev['replication_ip'] in ips and dev['replication_port'] == self.port ]: dev_path = join(self.devices_dir, local_dev['device']) obj_path = join(dev_path, data_dir) tmp_path = join(dev_path, 'tmp') if self.mount_check and not ismount(dev_path): self.logger.warn(_('%s is not mounted'), local_dev['device']) continue unlink_older_than(tmp_path, time.time() - self.reclaim_age) if not os.path.exists(obj_path): try: mkdirs(obj_path) except Exception: self.logger.exception('ERROR creating %s' % obj_path) continue for partition in os.listdir(obj_path): try: job_path = join(obj_path, partition) if isfile(job_path): # Clean up any (probably zero-byte) files where a # partition should be. 
self.logger.warning( 'Removing partition directory ' 'which was a file: %s', job_path) os.remove(job_path) continue part_nodes = obj_ring.get_part_nodes(int(partition)) nodes = [ node for node in part_nodes if node['id'] != local_dev['id'] ] jobs.append( dict(path=job_path, device=local_dev['device'], nodes=nodes, delete=len(nodes) > len(part_nodes) - 1, policy_idx=policy.idx, partition=partition, object_ring=obj_ring)) except (ValueError, OSError): continue def collect_jobs(self): """ Returns a sorted list of jobs (dictionaries) that specify the partitions, nodes, etc to be rsynced. """ jobs = [] ips = whataremyips() for policy in POLICIES: if policy.policy_type == 'replication': self.process_repl(policy, jobs, ips) # add else conditions here for future policy types random.shuffle(jobs) if self.handoffs_first: # Move the handoff parts to the front of the list jobs.sort(key=lambda job: not job['delete']) self.job_count = len(jobs) return jobs def replicate(self, override_devices=None, override_partitions=None): """Run a replication pass""" self.start = time.time() self.suffix_count = 0 self.suffix_sync = 0 self.suffix_hash = 0 self.replication_count = 0 self.last_replication_count = -1 self.partition_times = [] if override_devices is None: override_devices = [] if override_partitions is None: override_partitions = [] stats = eventlet.spawn(self.heartbeat) lockup_detector = eventlet.spawn(self.detect_lockups) eventlet.sleep() # Give spawns a cycle try: self.run_pool = GreenPool(size=self.concurrency) jobs = self.collect_jobs() for job in jobs: if override_devices and job['device'] not in override_devices: continue if override_partitions and \ job['partition'] not in override_partitions: continue dev_path = join(self.devices_dir, job['device']) if self.mount_check and not ismount(dev_path): self.logger.warn(_('%s is not mounted'), job['device']) continue if not self.check_ring(job['object_ring']): self.logger.info( _("Ring change detected. Aborting " "current replication pass.")) return if job['delete']: self.run_pool.spawn(self.update_deleted, job) else: self.run_pool.spawn(self.update, job) with Timeout(self.lockup_timeout): self.run_pool.waitall() except (Exception, Timeout): self.logger.exception(_("Exception in top-level replication loop")) self.kill_coros() finally: stats.kill() lockup_detector.kill() self.stats_line() def run_once(self, *args, **kwargs): start = time.time() self.logger.info(_("Running object replicator in script mode.")) override_devices = list_from_csv(kwargs.get('devices')) override_partitions = list_from_csv(kwargs.get('partitions')) self.replicate(override_devices=override_devices, override_partitions=override_partitions) total = (time.time() - start) / 60 self.logger.info( _("Object replication complete (once). (%.02f minutes)"), total) if not (override_partitions or override_devices): dump_recon_cache( { 'object_replication_time': total, 'object_replication_last': time.time() }, self.rcache, self.logger) def run_forever(self, *args, **kwargs): self.logger.info(_("Starting object replicator in daemon mode.")) # Run the replicator continually while True: start = time.time() self.logger.info(_("Starting object replication pass.")) # Run the replicator self.replicate() total = (time.time() - start) / 60 self.logger.info(_("Object replication complete. 
(%.02f minutes)"), total) dump_recon_cache( { 'object_replication_time': total, 'object_replication_last': time.time() }, self.rcache, self.logger) self.logger.debug('Replication sleeping for %s seconds.', self.run_pause) sleep(self.run_pause)
def test_connection(self): """ conn = Connection( auth_endpoint="https://identity.api.rackspacecloud.com/v2.0", client_id=str(uuid.uuid4()), endpoint="http://localhost:8888/v1/12345", user="", key="") """ conn = Connection( auth_endpoint="https://identity.api.rackspacecloud.com/v2.0", client_id=str(uuid.uuid4()), endpoint="http://166.78.143.130/v1/12345", user="", key="") conn.connect(token='blah') def create_worker(queue_name): return conn.create_queue(queue_name) def post_worker(queue): return queue.post_message('test_message', 10) def delete_worker(queue_name): conn.delete_queue(queue_name) return queue_name pool = GreenPool(100) def on_message_posted(greenthread): msg = greenthread.wait() print msg._href def on_queue_created(greenthread): queue = greenthread.wait() print queue.name for x in range(0, 10): gt = pool.spawn(post_worker, queue) gt.link(on_message_posted) queue_names = ["queue-" + str(x) for x in xrange(0, 5)] for queue_name in queue_names: gt = pool.spawn(create_worker, queue_name) gt.link(on_queue_created) pool.waitall() def delete_worker(queue_name): conn.delete_queue(queue_name) print "Queue:", queue_name, " deleted" for queue in conn.get_queues(): gt = pool.spawn_n(delete_worker, queue.name) print "Waiting for everything to finish" pool.waitall() print "Done"
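# Illustrative sketch, separate from the test above, of the GreenThread.link()
# callback pattern it relies on (fake_api_call and record_result are
# hypothetical): link() registers a function that is called when the green
# thread finishes, receiving the GreenThread plus any extra arguments passed to
# link(); calling wait() on the finished thread returns its result (or re-raises).
from eventlet import GreenPool

def fake_api_call(prod_id, qty):
    return 1  # pretend the remote update succeeded

updated, errored = [], []

def record_result(greenthread, prod_id):
    if greenthread.wait() == 1:
        updated.append(prod_id)
    else:
        errored.append(prod_id)

link_pool = GreenPool(10)
for prod_id, qty in [('p1', 2), ('p2', 1)]:
    link_pool.spawn(fake_api_call, prod_id, qty).link(record_result, prod_id)
link_pool.waitall()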
products_updated.append(id) else: products_errored.append(id) if not async: for p in products: if api_call(p['cart_prod_id'], p['qty']) == 1: products_updated.append(p['cart_prod_id']) else: products_errored.append(p['cart_prod_id']) return products_updated, products_errored, retry_count pool = GreenPool(max_connections) monkey_patch() for p in products: pool.spawn(api_call, p['cart_prod_id'], p['qty']).link(callback, p['cart_prod_id']) pool.waitall() if retry and len(products_errored) > 0: product_dict = dict(zip([x['cart_prod_id'] for x in products], [x['qty'] for x in products])) while (len(products_errored) > 0) and (retry_count < len(products)*2): # @todo tweak the retry count for index, product_id in enumerate(products_errored): qty = product_dict[product_id] products_errored.pop(index) retry_count += 1 pool.spawn(api_call, product_id, qty).link(callback, product_id) pool.waitall() return products_updated, products_errored, retry_count if __name__ == '__main__':
        db_conns = []
        for i in range(num_locks):
            db_conn = connect(db_files[i])
            db_conn.execute('begin exclusive transaction')
            db_conns.append(db_conn)
        if catch_503:
            exc = None
            try:
                client.delete_container(self.url, self.token, container)
            except client.ClientException as err:
                exc = err
            self.assertEqual(exc.http_status, 503)
        else:
            client.delete_container(self.url, self.token, container)

    pool = GreenPool()
    try:
        with Timeout(15):
            pool.spawn(run_test, 1, False)
            pool.spawn(run_test, 2, True)
            pool.spawn(run_test, 3, True)
            pool.waitall()
    except Timeout as err:
        raise Exception(
            "The server did not return a 503 on container db locks, "
            "it just hangs: %s" % err)


if __name__ == '__main__':
    main()
''' running count() method in a native threading manner '''
class CountThread(threading.Thread):
    def run(self):
        count()

print "running count() as two threads"
c1 = CountThread()
c2 = CountThread()
start_time = datetime.datetime.now()
c1.start()
c2.start()
c1.join()
c2.join()
end_time = datetime.datetime.now()
print end_time - start_time

''' running count() in a green threading manner '''
print "running count() as two green threads"
start_time = datetime.datetime.now()
pool = GreenPool()
# spawn() expects the callable itself; spawn(count()) would call count()
# synchronously and hand its return value to the pool
pool.spawn(count)
pool.spawn(count)
# wait for both green threads to finish before taking the end timestamp
pool.waitall()
end_time = datetime.datetime.now()
print end_time - start_time
class ObjectReplicator(Daemon): """ Replicate objects. Encapsulates most logic and data needed by the object replication process. Each call to .replicate() performs one replication pass. It's up to the caller to do this in a loop. """ def __init__(self, conf, logger=None): """ :param conf: configuration object obtained from ConfigParser :param logger: logging object """ self.conf = conf self.logger = logger or get_logger(conf, log_route="object-replicator") self.devices_dir = conf.get("devices", "/srv/node") self.mount_check = config_true_value(conf.get("mount_check", "true")) self.swift_dir = conf.get("swift_dir", "/etc/swift") self.bind_ip = conf.get("bind_ip", "0.0.0.0") self.servers_per_port = int(conf.get("servers_per_port", "0") or 0) self.port = None if self.servers_per_port else int(conf.get("bind_port", 6000)) self.concurrency = int(conf.get("concurrency", 1)) self.stats_interval = int(conf.get("stats_interval", "300")) self.ring_check_interval = int(conf.get("ring_check_interval", 15)) self.next_check = time.time() + self.ring_check_interval self.reclaim_age = int(conf.get("reclaim_age", 86400 * 7)) self.partition_times = [] self.interval = int(conf.get("interval") or conf.get("run_pause") or 30) self.rsync_timeout = int(conf.get("rsync_timeout", 900)) self.rsync_io_timeout = conf.get("rsync_io_timeout", "30") self.rsync_bwlimit = conf.get("rsync_bwlimit", "0") self.rsync_compress = config_true_value(conf.get("rsync_compress", "no")) self.rsync_module = conf.get("rsync_module", "").rstrip("/") if not self.rsync_module: self.rsync_module = "{replication_ip}::object" if config_true_value(conf.get("vm_test_mode", "no")): self.logger.warning( "Option object-replicator/vm_test_mode " "is deprecated and will be removed in a " "future version. Update your " "configuration to use option " "object-replicator/rsync_module." 
) self.rsync_module += "{replication_port}" self.http_timeout = int(conf.get("http_timeout", 60)) self.lockup_timeout = int(conf.get("lockup_timeout", 1800)) self.recon_cache_path = conf.get("recon_cache_path", "/var/cache/swift") self.rcache = os.path.join(self.recon_cache_path, "object.recon") self.conn_timeout = float(conf.get("conn_timeout", 0.5)) self.node_timeout = float(conf.get("node_timeout", 10)) self.sync_method = getattr(self, conf.get("sync_method") or "rsync") self.network_chunk_size = int(conf.get("network_chunk_size", 65536)) self.default_headers = {"Content-Length": "0", "user-agent": "object-replicator %s" % os.getpid()} self.rsync_error_log_line_length = int(conf.get("rsync_error_log_line_length", 0)) self.handoffs_first = config_true_value(conf.get("handoffs_first", False)) self.handoff_delete = config_auto_int_value(conf.get("handoff_delete", "auto"), 0) if any((self.handoff_delete, self.handoffs_first)): self.logger.warning( "Handoff only mode is not intended for normal " "operation, please disable handoffs_first and " "handoff_delete before the next " "normal rebalance" ) self._diskfile_mgr = DiskFileManager(conf, self.logger) def _zero_stats(self): """Zero out the stats.""" self.stats = { "attempted": 0, "success": 0, "failure": 0, "hashmatch": 0, "rsync": 0, "remove": 0, "start": time.time(), "failure_nodes": {}, } def _add_failure_stats(self, failure_devs_info): for node, dev in failure_devs_info: self.stats["failure"] += 1 failure_devs = self.stats["failure_nodes"].setdefault(node, {}) failure_devs.setdefault(dev, 0) failure_devs[dev] += 1 def _get_my_replication_ips(self): my_replication_ips = set() ips = whataremyips() for policy in POLICIES: self.load_object_ring(policy) for local_dev in [ dev for dev in policy.object_ring.devs if dev and dev["replication_ip"] in ips and dev["replication_port"] == self.port ]: my_replication_ips.add(local_dev["replication_ip"]) return list(my_replication_ips) # Just exists for doc anchor point def sync(self, node, job, suffixes, *args, **kwargs): """ Synchronize local suffix directories from a partition with a remote node. :param node: the "dev" entry for the remote node to sync with :param job: information about the partition being synced :param suffixes: a list of suffixes which need to be pushed :returns: boolean and dictionary, boolean indicating success or failure """ return self.sync_method(node, job, suffixes, *args, **kwargs) def load_object_ring(self, policy): """ Make sure the policy's rings are loaded. :param policy: the StoragePolicy instance :returns: appropriate ring object """ policy.load_ring(self.swift_dir) return policy.object_ring def _rsync(self, args): """ Execute the rsync binary to replicate a partition. :returns: return code of rsync process. 
0 is successful """ start_time = time.time() ret_val = None try: with Timeout(self.rsync_timeout): proc = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) results = proc.stdout.read() ret_val = proc.wait() except Timeout: self.logger.error(_("Killing long-running rsync: %s"), str(args)) proc.kill() return 1 # failure response code total_time = time.time() - start_time for result in results.split("\n"): if result == "": continue if result.startswith("cd+"): continue if not ret_val: self.logger.info(result) else: self.logger.error(result) if ret_val: error_line = _("Bad rsync return code: %(ret)d <- %(args)s") % {"args": str(args), "ret": ret_val} if self.rsync_error_log_line_length: error_line = error_line[: self.rsync_error_log_line_length] self.logger.error(error_line) elif results: self.logger.info( _("Successful rsync of %(src)s at %(dst)s (%(time).03f)"), {"src": args[-2], "dst": args[-1], "time": total_time}, ) else: self.logger.debug( _("Successful rsync of %(src)s at %(dst)s (%(time).03f)"), {"src": args[-2], "dst": args[-1], "time": total_time}, ) return ret_val def rsync(self, node, job, suffixes): """ Uses rsync to implement the sync method. This was the first sync method in Swift. """ if not os.path.exists(job["path"]): return False, {} args = [ "rsync", "--recursive", "--whole-file", "--human-readable", "--xattrs", "--itemize-changes", "--ignore-existing", "--timeout=%s" % self.rsync_io_timeout, "--contimeout=%s" % self.rsync_io_timeout, "--bwlimit=%s" % self.rsync_bwlimit, ] if self.rsync_compress and job["region"] != node["region"]: # Allow for compression, but only if the remote node is in # a different region than the local one. args.append("--compress") rsync_module = rsync_module_interpolation(self.rsync_module, node) had_any = False for suffix in suffixes: spath = join(job["path"], suffix) if os.path.exists(spath): args.append(spath) had_any = True if not had_any: return False, {} data_dir = get_data_dir(job["policy"]) args.append(join(rsync_module, node["device"], data_dir, job["partition"])) return self._rsync(args) == 0, {} def ssync(self, node, job, suffixes, remote_check_objs=None): return ssync_sender.Sender(self, node, job, suffixes, remote_check_objs)() def check_ring(self, object_ring): """ Check to see if the ring has been updated :param object_ring: the ring to check :returns: boolean indicating whether or not the ring has changed """ if time.time() > self.next_check: self.next_check = time.time() + self.ring_check_interval if object_ring.has_changed(): return False return True def update_deleted(self, job): """ High-level method that replicates a single partition that doesn't belong on this node. 
:param job: a dict containing info about the partition to be replicated """ def tpool_get_suffixes(path): return [suff for suff in os.listdir(path) if len(suff) == 3 and isdir(join(path, suff))] self.replication_count += 1 self.logger.increment("partition.delete.count.%s" % (job["device"],)) headers = dict(self.default_headers) headers["X-Backend-Storage-Policy-Index"] = int(job["policy"]) failure_devs_info = set() begin = time.time() try: responses = [] suffixes = tpool.execute(tpool_get_suffixes, job["path"]) synced_remote_regions = {} delete_objs = None if suffixes: for node in job["nodes"]: self.stats["rsync"] += 1 kwargs = {} if node["region"] in synced_remote_regions and self.conf.get("sync_method", "rsync") == "ssync": kwargs["remote_check_objs"] = synced_remote_regions[node["region"]] # candidates is a dict(hash=>timestamp) of objects # for deletion success, candidates = self.sync(node, job, suffixes, **kwargs) if success: with Timeout(self.http_timeout): conn = http_connect( node["replication_ip"], node["replication_port"], node["device"], job["partition"], "REPLICATE", "/" + "-".join(suffixes), headers=headers, ) conn.getresponse().read() if node["region"] != job["region"]: synced_remote_regions[node["region"]] = viewkeys(candidates) else: failure_devs_info.add((node["replication_ip"], node["device"])) responses.append(success) for region, cand_objs in synced_remote_regions.items(): if delete_objs is None: delete_objs = cand_objs else: delete_objs = delete_objs & cand_objs if self.handoff_delete: # delete handoff if we have had handoff_delete successes delete_handoff = len([resp for resp in responses if resp]) >= self.handoff_delete else: # delete handoff if all syncs were successful delete_handoff = len(responses) == len(job["nodes"]) and all(responses) if delete_handoff: self.stats["remove"] += 1 if self.conf.get("sync_method", "rsync") == "ssync" and delete_objs is not None: self.logger.info(_("Removing %s objects"), len(delete_objs)) _junk, error_paths = self.delete_handoff_objs(job, delete_objs) # if replication works for a hand-off device and it failed, # the remote devices which are target of the replication # from the hand-off device will be marked. Because cleanup # after replication failed means replicator needs to # replicate again with the same info. 
if error_paths: failure_devs_info.update( [(failure_dev["replication_ip"], failure_dev["device"]) for failure_dev in job["nodes"]] ) else: self.delete_partition(job["path"]) elif not suffixes: self.delete_partition(job["path"]) except (Exception, Timeout): self.logger.exception(_("Error syncing handoff partition")) finally: target_devs_info = set( [(target_dev["replication_ip"], target_dev["device"]) for target_dev in job["nodes"]] ) self.stats["success"] += len(target_devs_info - failure_devs_info) self._add_failure_stats(failure_devs_info) self.partition_times.append(time.time() - begin) self.logger.timing_since("partition.delete.timing", begin) def delete_partition(self, path): self.logger.info(_("Removing partition: %s"), path) tpool.execute(shutil.rmtree, path) def delete_handoff_objs(self, job, delete_objs): success_paths = [] error_paths = [] for object_hash in delete_objs: object_path = storage_directory(job["obj_path"], job["partition"], object_hash) tpool.execute(shutil.rmtree, object_path, ignore_errors=True) suffix_dir = dirname(object_path) try: os.rmdir(suffix_dir) success_paths.append(object_path) except OSError as e: if e.errno not in (errno.ENOENT, errno.ENOTEMPTY): error_paths.append(object_path) self.logger.exception("Unexpected error trying to cleanup suffix dir:%r", suffix_dir) return success_paths, error_paths def update(self, job): """ High-level method that replicates a single partition. :param job: a dict containing info about the partition to be replicated """ self.replication_count += 1 self.logger.increment("partition.update.count.%s" % (job["device"],)) headers = dict(self.default_headers) headers["X-Backend-Storage-Policy-Index"] = int(job["policy"]) target_devs_info = set() failure_devs_info = set() begin = time.time() try: hashed, local_hash = tpool_reraise( self._diskfile_mgr._get_hashes, job["path"], do_listdir=(self.replication_count % 10) == 0, reclaim_age=self.reclaim_age, ) self.suffix_hash += hashed self.logger.update_stats("suffix.hashes", hashed) attempts_left = len(job["nodes"]) synced_remote_regions = set() random.shuffle(job["nodes"]) nodes = itertools.chain(job["nodes"], job["policy"].object_ring.get_more_nodes(int(job["partition"]))) while attempts_left > 0: # If this throws StopIteration it will be caught way below node = next(nodes) target_devs_info.add((node["replication_ip"], node["device"])) attempts_left -= 1 # if we have already synced to this remote region, # don't sync again on this replication pass if node["region"] in synced_remote_regions: continue try: with Timeout(self.http_timeout): resp = http_connect( node["replication_ip"], node["replication_port"], node["device"], job["partition"], "REPLICATE", "", headers=headers, ).getresponse() if resp.status == HTTP_INSUFFICIENT_STORAGE: self.logger.error(_("%(ip)s/%(device)s responded" " as unmounted"), node) attempts_left += 1 failure_devs_info.add((node["replication_ip"], node["device"])) continue if resp.status != HTTP_OK: self.logger.error( _("Invalid response %(resp)s " "from %(ip)s"), {"resp": resp.status, "ip": node["replication_ip"]}, ) failure_devs_info.add((node["replication_ip"], node["device"])) continue remote_hash = pickle.loads(resp.read()) del resp suffixes = [suffix for suffix in local_hash if local_hash[suffix] != remote_hash.get(suffix, -1)] if not suffixes: self.stats["hashmatch"] += 1 continue hashed, recalc_hash = tpool_reraise( self._diskfile_mgr._get_hashes, job["path"], recalculate=suffixes, reclaim_age=self.reclaim_age ) self.logger.update_stats("suffix.hashes", 
hashed) local_hash = recalc_hash suffixes = [suffix for suffix in local_hash if local_hash[suffix] != remote_hash.get(suffix, -1)] self.stats["rsync"] += 1 success, _junk = self.sync(node, job, suffixes) with Timeout(self.http_timeout): conn = http_connect( node["replication_ip"], node["replication_port"], node["device"], job["partition"], "REPLICATE", "/" + "-".join(suffixes), headers=headers, ) conn.getresponse().read() if not success: failure_devs_info.add((node["replication_ip"], node["device"])) # add only remote region when replicate succeeded if success and node["region"] != job["region"]: synced_remote_regions.add(node["region"]) self.suffix_sync += len(suffixes) self.logger.update_stats("suffix.syncs", len(suffixes)) except (Exception, Timeout): failure_devs_info.add((node["replication_ip"], node["device"])) self.logger.exception(_("Error syncing with node: %s") % node) self.suffix_count += len(local_hash) except (Exception, Timeout): failure_devs_info.update(target_devs_info) self.logger.exception(_("Error syncing partition")) finally: self.stats["success"] += len(target_devs_info - failure_devs_info) self._add_failure_stats(failure_devs_info) self.partition_times.append(time.time() - begin) self.logger.timing_since("partition.update.timing", begin) def stats_line(self): """ Logs various stats for the currently running replication pass. """ if self.replication_count: elapsed = (time.time() - self.start) or 0.000001 rate = self.replication_count / elapsed self.logger.info( _( "%(replicated)d/%(total)d (%(percentage).2f%%)" " partitions replicated in %(time).2fs (%(rate).2f/sec, " "%(remaining)s remaining)" ), { "replicated": self.replication_count, "total": self.job_count, "percentage": self.replication_count * 100.0 / self.job_count, "time": time.time() - self.start, "rate": rate, "remaining": "%d%s" % compute_eta(self.start, self.replication_count, self.job_count), }, ) if self.suffix_count: self.logger.info( _("%(checked)d suffixes checked - " "%(hashed).2f%% hashed, %(synced).2f%% synced"), { "checked": self.suffix_count, "hashed": (self.suffix_hash * 100.0) / self.suffix_count, "synced": (self.suffix_sync * 100.0) / self.suffix_count, }, ) self.partition_times.sort() self.logger.info( _("Partition times: max %(max).4fs, " "min %(min).4fs, med %(med).4fs"), { "max": self.partition_times[-1], "min": self.partition_times[0], "med": self.partition_times[len(self.partition_times) // 2], }, ) else: self.logger.info(_("Nothing replicated for %s seconds."), (time.time() - self.start)) def kill_coros(self): """Utility function that kills all coroutines currently running.""" for coro in list(self.run_pool.coroutines_running): try: coro.kill(GreenletExit) except GreenletExit: pass def heartbeat(self): """ Loop that runs in the background during replication. It periodically logs progress. """ while True: eventlet.sleep(self.stats_interval) self.stats_line() def detect_lockups(self): """ In testing, the pool.waitall() call very occasionally failed to return. This is an attempt to make sure the replicator finishes its replication pass in some eventuality. """ while True: eventlet.sleep(self.lockup_timeout) if self.replication_count == self.last_replication_count: self.logger.error(_("Lockup detected.. 
killing live coros.")) self.kill_coros() self.last_replication_count = self.replication_count def build_replication_jobs(self, policy, ips, override_devices=None, override_partitions=None): """ Helper function for collect_jobs to build jobs for replication using replication style storage policy """ jobs = [] self.all_devs_info.update([(dev["replication_ip"], dev["device"]) for dev in policy.object_ring.devs if dev]) data_dir = get_data_dir(policy) found_local = False for local_dev in [ dev for dev in policy.object_ring.devs if ( dev and is_local_device(ips, self.port, dev["replication_ip"], dev["replication_port"]) and (override_devices is None or dev["device"] in override_devices) ) ]: found_local = True dev_path = join(self.devices_dir, local_dev["device"]) obj_path = join(dev_path, data_dir) tmp_path = join(dev_path, get_tmp_dir(policy)) if self.mount_check and not ismount(dev_path): self._add_failure_stats( [ (failure_dev["replication_ip"], failure_dev["device"]) for failure_dev in policy.object_ring.devs if failure_dev ] ) self.logger.warning(_("%s is not mounted"), local_dev["device"]) continue unlink_older_than(tmp_path, time.time() - self.reclaim_age) if not os.path.exists(obj_path): try: mkdirs(obj_path) except Exception: self.logger.exception("ERROR creating %s" % obj_path) continue for partition in os.listdir(obj_path): if override_partitions is not None and partition not in override_partitions: continue part_nodes = None try: job_path = join(obj_path, partition) part_nodes = policy.object_ring.get_part_nodes(int(partition)) nodes = [node for node in part_nodes if node["id"] != local_dev["id"]] jobs.append( dict( path=job_path, device=local_dev["device"], obj_path=obj_path, nodes=nodes, delete=len(nodes) > len(part_nodes) - 1, policy=policy, partition=partition, region=local_dev["region"], ) ) except ValueError: if part_nodes: self._add_failure_stats( [(failure_dev["replication_ip"], failure_dev["device"]) for failure_dev in nodes] ) else: self._add_failure_stats( [ (failure_dev["replication_ip"], failure_dev["device"]) for failure_dev in policy.object_ring.devs if failure_dev ] ) continue if not found_local: self.logger.error( "Can't find itself %s with port %s in ring " "file, not replicating", ", ".join(ips), self.port ) return jobs def collect_jobs(self, override_devices=None, override_partitions=None, override_policies=None): """ Returns a sorted list of jobs (dictionaries) that specify the partitions, nodes, etc to be rsynced. 
:param override_devices: if set, only jobs on these devices will be returned :param override_partitions: if set, only jobs on these partitions will be returned :param override_policies: if set, only jobs in these storage policies will be returned """ jobs = [] ips = whataremyips(self.bind_ip) for policy in POLICIES: if policy.policy_type == REPL_POLICY: if override_policies is not None and str(policy.idx) not in override_policies: continue # ensure rings are loaded for policy self.load_object_ring(policy) jobs += self.build_replication_jobs( policy, ips, override_devices=override_devices, override_partitions=override_partitions ) random.shuffle(jobs) if self.handoffs_first: # Move the handoff parts to the front of the list jobs.sort(key=lambda job: not job["delete"]) self.job_count = len(jobs) return jobs def replicate(self, override_devices=None, override_partitions=None, override_policies=None): """Run a replication pass""" self.start = time.time() self.suffix_count = 0 self.suffix_sync = 0 self.suffix_hash = 0 self.replication_count = 0 self.last_replication_count = -1 self.partition_times = [] self.my_replication_ips = self._get_my_replication_ips() self.all_devs_info = set() stats = eventlet.spawn(self.heartbeat) lockup_detector = eventlet.spawn(self.detect_lockups) eventlet.sleep() # Give spawns a cycle current_nodes = None try: self.run_pool = GreenPool(size=self.concurrency) jobs = self.collect_jobs( override_devices=override_devices, override_partitions=override_partitions, override_policies=override_policies, ) for job in jobs: current_nodes = job["nodes"] if override_devices and job["device"] not in override_devices: continue if override_partitions and job["partition"] not in override_partitions: continue dev_path = join(self.devices_dir, job["device"]) if self.mount_check and not ismount(dev_path): self._add_failure_stats( [(failure_dev["replication_ip"], failure_dev["device"]) for failure_dev in job["nodes"]] ) self.logger.warning(_("%s is not mounted"), job["device"]) continue if not self.check_ring(job["policy"].object_ring): self.logger.info(_("Ring change detected. Aborting " "current replication pass.")) return try: if isfile(job["path"]): # Clean up any (probably zero-byte) files where a # partition should be. 
self.logger.warning("Removing partition directory " "which was a file: %s", job["path"]) os.remove(job["path"]) continue except OSError: continue if job["delete"]: self.run_pool.spawn(self.update_deleted, job) else: self.run_pool.spawn(self.update, job) current_nodes = None with Timeout(self.lockup_timeout): self.run_pool.waitall() except (Exception, Timeout): if current_nodes: self._add_failure_stats( [(failure_dev["replication_ip"], failure_dev["device"]) for failure_dev in current_nodes] ) else: self._add_failure_stats(self.all_devs_info) self.logger.exception(_("Exception in top-level replication loop")) self.kill_coros() finally: stats.kill() lockup_detector.kill() self.stats_line() self.stats["attempted"] = self.replication_count def run_once(self, *args, **kwargs): self._zero_stats() self.logger.info(_("Running object replicator in script mode.")) override_devices = list_from_csv(kwargs.get("devices")) override_partitions = list_from_csv(kwargs.get("partitions")) override_policies = list_from_csv(kwargs.get("policies")) if not override_devices: override_devices = None if not override_partitions: override_partitions = None if not override_policies: override_policies = None self.replicate( override_devices=override_devices, override_partitions=override_partitions, override_policies=override_policies, ) total = (time.time() - self.stats["start"]) / 60 self.logger.info(_("Object replication complete (once). (%.02f minutes)"), total) if not (override_partitions or override_devices): replication_last = time.time() dump_recon_cache( { "replication_stats": self.stats, "replication_time": total, "replication_last": replication_last, "object_replication_time": total, "object_replication_last": replication_last, }, self.rcache, self.logger, ) def run_forever(self, *args, **kwargs): self.logger.info(_("Starting object replicator in daemon mode.")) # Run the replicator continually while True: self._zero_stats() self.logger.info(_("Starting object replication pass.")) # Run the replicator self.replicate() total = (time.time() - self.stats["start"]) / 60 self.logger.info(_("Object replication complete. (%.02f minutes)"), total) replication_last = time.time() dump_recon_cache( { "replication_stats": self.stats, "replication_time": total, "replication_last": replication_last, "object_replication_time": total, "object_replication_last": replication_last, }, self.rcache, self.logger, ) self.logger.debug("Replication sleeping for %s seconds.", self.interval) sleep(self.interval)
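# Illustrative note on the handoffs_first ordering used by collect_jobs()
# above: after the shuffle, jobs.sort(key=lambda job: not job["delete"]) puts
# handoff ("delete") jobs first because False sorts before True, and Python's
# stable sort keeps the shuffled order within each group.
jobs = [{'partition': '1', 'delete': False},
        {'partition': '2', 'delete': True},
        {'partition': '3', 'delete': False}]
jobs.sort(key=lambda job: not job['delete'])
assert [j['partition'] for j in jobs] == ['2', '1', '3']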
def run(self, *args, **kwargs): try: self.logger.info('event agent: starting') pool = GreenPool(len(self.workers)) for worker in self.workers: pool.spawn(worker.start) def front(server, backend): while True: msg = server.recv_multipart() if validate_msg(msg): try: event_id = msg[2] data = msg[3] self.queue.put(event_id, data) event = ['', msg[2], msg[3]] backend.send_multipart(event) except Exception: pass finally: ack = msg[0:3] server.send_multipart(ack) def back(backend): while True: msg = backend.recv_multipart() event_id = msg[1] success = msg[2] if not success: self.queue.failed(event_id) self.logger.warn('event %s moved to failed', binascii.hexlify(event_id)) else: self.queue.delete(event_id) self.logger.debug('event %s removed from queue', binascii.hexlify(event_id)) boss_pool = GreenPool(2) boss_pool.spawn_n(front, self.server, self.backend) boss_pool.spawn_n(back, self.backend) while True: results = self.queue.load(self.batch_size) for event in results: event_id, data = event msg = ['', event_id, str(data)] self.backend.send_multipart(msg) self.retries_run_time = ratelimit( self.retries_run_time, self.max_retries_per_second) for w in self.workers: if w.failed: self.workers.remove(w) self.logger.warn('restart worker "%s"', w.name) new_w = EventWorker(self.conf, w.name, self.context) self.workers.append(new_w) pool.spawn(new_w.start) sleep(SLEEP_TIME) except Exception as e: self.logger.error('ERROR in main loop %s', e) raise finally: self.logger.warn('event agent: stopping') self.stop_workers() self.context.destroy(linger=True) self.context = None
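# Illustrative sketch of the supervisor pattern shared by the agents above:
# a loop polls each worker's "failed" flag and respawns a fresh instance into
# the same GreenPool (the worker objects and make_worker are hypothetical
# stand-ins, not the classes used by the original agents).
from eventlet import GreenPool, sleep

def supervise(workers, make_worker, logger):
    pool = GreenPool(len(workers))
    for w in workers:
        pool.spawn(w.start)
    while True:
        for w in list(workers):
            if w.failed:
                workers.remove(w)
                logger.warn('restarting worker "%s"', w.name)
                new_w = make_worker(w.name)
                workers.append(new_w)
                pool.spawn(new_w.start)
        sleep(1)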
class ObjectMover(Daemon): def __init__(self, conf): """ :param conf: configuration object obtained from ConfigParser :param logger: logging object """ self.conf = conf self.logger = get_logger(conf, log_route='object-mover') self.devices_dir = conf.get('devices', '/srv/node') self.mount_check = config_true_value(conf.get('mount_check', 'true')) self.vm_test_mode = config_true_value(conf.get('vm_test_mode', 'no')) self.swift_dir = conf.get('swift_dir', '/etc/swift') self.bind_ip = conf.get('bind_ip', '0.0.0.0') self.servers_per_port = int(conf.get('servers_per_port', '0') or 0) self.port = None if self.servers_per_port else \ int(conf.get('bind_port', 6000)) self.concurrency = int(conf.get('concurrency', 1)) self.reclaim_age = int(conf.get('reclaim_age', 86400 * 7)) self.handoffs_first = config_true_value(conf.get('handoffs_first', False)) self.data_moving_map_dump = (conf.get('data_moving_map_dump') or DEFAULT_DUMP_FILE) self._diskfile_mgr = DiskFileManager(conf, self.logger) self.mover_tmp_dir = (conf.get('mover_tmp_dir') or 'data_mover') self.retries = int(conf.get('retries', 3)) self.test = bool(conf.get('test', False)) self.retrie_list = [] def create_remote_directory(self, job): """ Creates a temporal directory, at remote server. :param job: information about the partition being synced """ node = job['node'] args = ["ssh", rsync_ip(node['replication_ip']), "mkdir", "-p", job['remote_path']] if not self.test: proc = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) results = proc.stdout.read() ret_val = proc.wait() #TODO: ret_val check (results, ret_val) else: print " ".join(args) #TODO: same as replicator load_object_ring def load_object_ring(self, policy): """ Make sure the policy's rings are loaded. :param policy: the StoragePolicy instance :returns: appropriate ring object """ policy.load_ring(self.swift_dir) return policy.object_ring #TODO: check if _rsync from replicator will be used instead def _rsync(self, args): """ Execute the rsync binary to replicate a partition. :returns: return code of rsync process. 0 is successful """ start_time = time.time() ret_val = None proc = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) results = proc.stdout.read() ret_val = proc.wait() total_time = time.time() - start_time for result in results.split('\n'): if result == '': continue if result.startswith('cd+'): continue if not ret_val: self.logger.info(result) else: self.logger.error(result) if ret_val: error_line = 'Bad rsync return code: %(ret)d <- %(args)s' % \ {'args': str(args), 'ret': ret_val} if self.rsync_error_log_line_length: error_line = error_line[:self.rsync_error_log_line_length] self.logger.error(error_line) elif results: self.logger.info( "Successful rsync of %(src)s at %(dst)s (%(time).03f)", {'src': args[-2], 'dst': args[-1], 'time': total_time}) else: self.logger.debug( "Successful rsync of %(src)s at %(dst)s (%(time).03f)", {'src': args[-2], 'dst': args[-1], 'time': total_time}) return ret_val def rsync(self, job): """ Uses rsync to implement the sync method. This was the first sync method in Swift. 
""" if not os.path.exists(job['path']): if self.test: print "Error: the path %s does not exists" % job['path'] return False, {} args = [ 'rsync', '-a', '--whole-file', '--human-readable', '--xattrs', '--ignore-existing', ] node = job['node'] node_ip = rsync_ip(node['replication_ip']) rsync_module = '%s:%s' % (node_ip, job['remote_path']) args.append(job['path']) args.append(rsync_module) if not self.test: return self._rsync(args) == 0, {} else: print " ".join(args) return True, {} def update(self, job): """ High-level method that replicates a single partition. :param job: a dict containing info about the partition to be replicated """ self.logger.increment('partition.update.count.%s' % (job['device'],)) begin = time.time() try: self.create_remote_directory(job) success, _junk = self.rsync(job) if not success: self.retrie_list.append(job) except (Exception, Timeout): self.logger.exception("Error syncing partition") finally: self.partition_times.append(time.time() - begin) self.logger.timing_since('partition.update.timing', begin) #TODO: same as replicator kill coros def kill_coros(self): """Utility function that kills all coroutines currently running.""" for coro in list(self.run_pool.coroutines_running): try: coro.kill(GreenletExit) except GreenletExit: pass def build_replication_jobs(self, policy, ips, old_dict, new_dict, moving_map): """ Helper function for collect_jobs to build jobs for replication using replication style storage policy :param policy: swift policy object :param ips: the local server ips :param old_dict: dictionary with devices from old ring :param new_dict: dictionary with devices from new ring :param moving_map: the dictionary that contains all the partitions that should be moved, their sources and destinations """ jobs = [] data_dir = get_data_dir(policy) devices = Set(map(lambda x: x[1], moving_map.values())) partitions = Set(map(lambda x: x[0], moving_map.values())) for local_dev in [dev for dev in policy.object_ring.devs if (dev and is_local_device(ips, self.port, dev['replication_ip'], dev['replication_port']) )]: if self.test: print local_dev['id'] if unicode(local_dev['id']) not in devices: continue dev_path = join(self.devices_dir, local_dev['device']) obj_path = join(dev_path, data_dir) tmp_path = join(dev_path, get_tmp_dir(policy)) if self.mount_check and not ismount(dev_path): self.logger.warn('%s is not mounted' % local_dev['device']) continue unlink_older_than(tmp_path, time.time() - self.reclaim_age) for partition in os.listdir(obj_path): partition = unicode(partition) if (partition not in partitions): continue try: key = "%s_%s" % (local_dev['id'], partition) if key not in moving_map: continue job_path = join(obj_path, partition) _, source_id, dest_id = moving_map[key] if source_id != unicode(local_dev['id']): continue node = {} replication_ip, replication_device = new_dict[dest_id] node['replication_ip'] = replication_ip node['device'] = replication_device remote_path = os.path.join(self.devices_dir, node['device'], self.mover_tmp_dir) jobs.append( dict(path=job_path, device=local_dev['device'], obj_path=obj_path, node=node, policy=policy, partition=partition, remote_path=remote_path)) except ValueError: continue except Exception as e: self.logger.exception( "an %s exception accure at build_replication_jobs" % e) if self.test: print e return jobs def collect_jobs(self, old_dict, new_dict, moving_map): """ Returns a sorted list of jobs (dictionaries) that specify the partitions, nodes, etc to be rsynced. 
:param old_dict: dictionary with devices from old ring :param new_dict: dictionary with devices from new ring :param moving_map: the dictionary that contains all the partitions that should be moved, their sources and destinations """ jobs = [] ips = whataremyips(self.bind_ip) for policy in POLICIES: if policy.policy_type == REPL_POLICY: # ensure rings are loaded for policy self.load_object_ring(policy) jobs += self.build_replication_jobs( policy, ips, old_dict, new_dict, moving_map) random.shuffle(jobs) return jobs def move(self, old_dict, new_dict, moving_map): """Run a move pass. :param old_dict: dictionary with devices from old ring :param new_dict: dictionary with devices from new ring :param moving_map: the dictionary that contains all the partitions that should be moved, their sources and destinations """ self.start = time.time() self.replication_count = 0 self.last_replication_count = -1 self.partition_times = [] try: self.run_pool = GreenPool(size=self.concurrency) jobs = self.collect_jobs(old_dict, new_dict, moving_map) for job in jobs: dev_path = join(self.devices_dir, job['device']) if self.mount_check and not ismount(dev_path): self.logger.warn('%s is not mounted' % job['device']) continue try: if isfile(job['path']): # Clean up any (probably zero-byte) files where a # partition should be. self.logger.warning( 'Removing partition directory ' 'which was a file: %s', job['path']) os.remove(job['path']) continue except OSError: continue self.run_pool.spawn(self.update, job) self.run_pool.waitall() except (Exception, Timeout) as e: self.kill_coros() self.logger.exception( "Exception in top-level partition move loop %s" % e) if self.test: print e def run_once(self, *args, **kwargs): start = time.time() self.logger.info("Running object mover in script mode.") old_dict, new_dict, moving_map =\ load_moving_map(self.data_moving_map_dump) self.move(old_dict, new_dict, moving_map) trie = 0 while trie < self.retries: if len(self.retrie_list) == 0: break current_retrie_list = self.retrie_list self.retrie_list = [] for job in current_retrie_list: self.update(job) trie += 1 total = (time.time() - start) / 60 self.logger.info( "Object move complete (once). " "(%.02f minutes), %s partition moves failed" % (total, len(self.retrie_list)))
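A minimal sketch of the fan-out pattern ObjectMover.move() relies on: jobs are spawned onto an eventlet GreenPool bounded by the concurrency setting, and waitall() blocks until every green thread finishes. The job dicts and sync_partition worker below are placeholders rather than Swift code; only GreenPool and sleep are the real eventlet APIs.

from eventlet import GreenPool, sleep

def sync_partition(job):
    # stand-in for the per-partition ssh/rsync work done by update()
    sleep(0.01)
    print('synced %(device)s/%(partition)s' % job)

jobs = [{'device': 'sdb1', 'partition': str(p)} for p in range(8)]

pool = GreenPool(size=4)            # mirrors GreenPool(size=self.concurrency)
for job in jobs:
    pool.spawn(sync_partition, job)
pool.waitall()                      # block until every spawned job finishes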
class ObjectReplicator(Daemon): """ Replicate objects. Encapsulates most logic and data needed by the object replication process. Each call to .replicate() performs one replication pass. It's up to the caller to do this in a loop. """ def __init__(self, conf): """ :param conf: configuration object obtained from ConfigParser :param logger: logging object """ self.conf = conf self.logger = get_logger(conf, log_route='object-replicator') self.devices_dir = conf.get('devices', '/srv/node') self.mount_check = conf.get('mount_check', 'true').lower() in \ ('true', 't', '1', 'on', 'yes', 'y') self.vm_test_mode = conf.get( 'vm_test_mode', 'no').lower() in ('yes', 'true', 'on', '1') self.chase_dir = conf.get('chase_dir', '/etc/chase') self.port = int(conf.get('bind_port', 6000)) self.concurrency = int(conf.get('concurrency', 1)) self.stats_interval = int(conf.get('stats_interval', '300')) self.object_ring = Ring(join(self.chase_dir, 'object.ring.gz')) self.ring_check_interval = int(conf.get('ring_check_interval', 15)) self.next_check = time.time() + self.ring_check_interval self.reclaim_age = int(conf.get('reclaim_age', 86400 * 7)) self.partition_times = [] self.run_pause = int(conf.get('run_pause', 30)) self.rsync_timeout = int(conf.get('rsync_timeout', 900)) self.rsync_io_timeout = conf.get('rsync_io_timeout', '30') self.http_timeout = int(conf.get('http_timeout', 60)) self.lockup_timeout = int(conf.get('lockup_timeout', 1800)) self.recon_enable = conf.get( 'recon_enable', 'no').lower() in TRUE_VALUES self.recon_cache_path = conf.get( 'recon_cache_path', '/var/cache/chase') self.recon_object = os.path.join(self.recon_cache_path, "object.recon") def _rsync(self, args): """ Execute the rsync binary to replicate a partition. :returns: return code of rsync process. 0 is successful """ start_time = time.time() ret_val = None try: with Timeout(self.rsync_timeout): proc = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) results = proc.stdout.read() ret_val = proc.wait() except Timeout: self.logger.error(_("Killing long-running rsync: %s"), str(args)) proc.kill() return 1 # failure response code total_time = time.time() - start_time for result in results.split('\n'): if result == '': continue if result.startswith('cd+'): continue if not ret_val: self.logger.info(result) else: self.logger.error(result) if ret_val: self.logger.error(_('Bad rsync return code: %(args)s -> %(ret)d'), {'args': str(args), 'ret': ret_val}) elif results: self.logger.info( _("Successful rsync of %(src)s at %(dst)s (%(time).03f)"), {'src': args[-2], 'dst': args[-1], 'time': total_time}) else: self.logger.debug( _("Successful rsync of %(src)s at %(dst)s (%(time).03f)"), {'src': args[-2], 'dst': args[-1], 'time': total_time}) return ret_val def rsync(self, node, job, suffixes): """ Synchronize local suffix directories from a partition with a remote node. 
:param node: the "dev" entry for the remote node to sync with :param job: information about the partition being synced :param suffixes: a list of suffixes which need to be pushed :returns: boolean indicating success or failure """ if not os.path.exists(job['path']): return False args = [ 'rsync', '--recursive', '--whole-file', '--human-readable', '--xattrs', '--itemize-changes', '--ignore-existing', '--timeout=%s' % self.rsync_io_timeout, '--contimeout=%s' % self.rsync_io_timeout, ] if self.vm_test_mode: rsync_module = '[%s]::object%s' % (node['ip'], node['port']) else: rsync_module = '[%s]::object' % node['ip'] had_any = False for suffix in suffixes: spath = join(job['path'], suffix) if os.path.exists(spath): args.append(spath) had_any = True if not had_any: return False args.append(join(rsync_module, node['device'], 'objects', job['partition'])) return self._rsync(args) == 0 def check_ring(self): """ Check to see if the ring has been updated :returns: boolean indicating whether or not the ring has changed """ if time.time() > self.next_check: self.next_check = time.time() + self.ring_check_interval if self.object_ring.has_changed(): return False return True def update_deleted(self, job): """ High-level method that replicates a single partition that doesn't belong on this node. :param job: a dict containing info about the partition to be replicated """ def tpool_get_suffixes(path): return [suff for suff in os.listdir(path) if len(suff) == 3 and isdir(join(path, suff))] self.replication_count += 1 begin = time.time() try: responses = [] suffixes = tpool.execute(tpool_get_suffixes, job['path']) if suffixes: for node in job['nodes']: success = self.rsync(node, job, suffixes) if success: with Timeout(self.http_timeout): http_connect(node['ip'], node['port'], node['device'], job['partition'], 'REPLICATE', '/' + '-'.join(suffixes), headers={'Content-Length': '0'}).getresponse().read() responses.append(success) if not suffixes or (len(responses) == \ self.object_ring.replica_count and all(responses)): self.logger.info(_("Removing partition: %s"), job['path']) tpool.execute(shutil.rmtree, job['path'], ignore_errors=True) except (Exception, Timeout): self.logger.exception(_("Error syncing handoff partition")) finally: self.partition_times.append(time.time() - begin) def update(self, job): """ High-level method that replicates a single partition. :param job: a dict containing info about the partition to be replicated """ self.replication_count += 1 begin = time.time() try: hashed, local_hash = tpool.execute(tpooled_get_hashes, job['path'], do_listdir=(self.replication_count % 10) == 0, reclaim_age=self.reclaim_age) # See tpooled_get_hashes "Hack". 
if isinstance(hashed, BaseException): raise hashed self.suffix_hash += hashed attempts_left = self.object_ring.replica_count - 1 nodes = itertools.chain(job['nodes'], self.object_ring.get_more_nodes(int(job['partition']))) while attempts_left > 0: # If this throws StopIterator it will be caught way below node = next(nodes) attempts_left -= 1 try: with Timeout(self.http_timeout): resp = http_connect(node['ip'], node['port'], node['device'], job['partition'], 'REPLICATE', '', headers={'Content-Length': '0'}).getresponse() if resp.status == 507: self.logger.error(_('%(ip)s/%(device)s responded' ' as unmounted'), node) attempts_left += 1 continue if resp.status != 200: self.logger.error(_("Invalid response %(resp)s " "from %(ip)s"), {'resp': resp.status, 'ip': node['ip']}) continue remote_hash = pickle.loads(resp.read()) del resp suffixes = [suffix for suffix in local_hash if local_hash[suffix] != remote_hash.get(suffix, -1)] if not suffixes: continue hashed, recalc_hash = tpool.execute(tpooled_get_hashes, job['path'], recalculate=suffixes, reclaim_age=self.reclaim_age) # See tpooled_get_hashes "Hack". if isinstance(hashed, BaseException): raise hashed local_hash = recalc_hash suffixes = [suffix for suffix in local_hash if local_hash[suffix] != remote_hash.get(suffix, -1)] self.rsync(node, job, suffixes) with Timeout(self.http_timeout): conn = http_connect(node['ip'], node['port'], node['device'], job['partition'], 'REPLICATE', '/' + '-'.join(suffixes), headers={'Content-Length': '0'}) conn.getresponse().read() self.suffix_sync += len(suffixes) except (Exception, Timeout): self.logger.exception(_("Error syncing with node: %s") % node) self.suffix_count += len(local_hash) except (Exception, Timeout): self.logger.exception(_("Error syncing partition")) finally: self.partition_times.append(time.time() - begin) def stats_line(self): """ Logs various stats for the currently running replication pass. """ if self.replication_count: rate = self.replication_count / (time.time() - self.start) self.logger.info(_("%(replicated)d/%(total)d (%(percentage).2f%%)" " partitions replicated in %(time).2fs (%(rate).2f/sec, " "%(remaining)s remaining)"), {'replicated': self.replication_count, 'total': self.job_count, 'percentage': self.replication_count * 100.0 / self.job_count, 'time': time.time() - self.start, 'rate': rate, 'remaining': '%d%s' % compute_eta(self.start, self.replication_count, self.job_count)}) if self.suffix_count: self.logger.info(_("%(checked)d suffixes checked - " "%(hashed).2f%% hashed, %(synced).2f%% synced"), {'checked': self.suffix_count, 'hashed': (self.suffix_hash * 100.0) / self.suffix_count, 'synced': (self.suffix_sync * 100.0) / self.suffix_count}) self.partition_times.sort() self.logger.info(_("Partition times: max %(max).4fs, " "min %(min).4fs, med %(med).4fs"), {'max': self.partition_times[-1], 'min': self.partition_times[0], 'med': self.partition_times[ len(self.partition_times) // 2]}) else: self.logger.info(_("Nothing replicated for %s seconds."), (time.time() - self.start)) def kill_coros(self): """Utility function that kills all coroutines currently running.""" for coro in list(self.run_pool.coroutines_running): try: coro.kill(GreenletExit) except GreenletExit: pass def heartbeat(self): """ Loop that runs in the background during replication. It periodically logs progress. """ while True: eventlet.sleep(self.stats_interval) self.stats_line() def detect_lockups(self): """ In testing, the pool.waitall() call very occasionally failed to return. 
This is an attempt to make sure the replicator finishes its replication pass in some eventuality. """ while True: eventlet.sleep(self.lockup_timeout) if self.replication_count == self.last_replication_count: self.logger.error(_("Lockup detected.. killing live coros.")) self.kill_coros() self.last_replication_count = self.replication_count def collect_jobs(self): """ Returns a sorted list of jobs (dictionaries) that specify the partitions, nodes, etc to be rsynced. """ jobs = [] ips = whataremyips() for local_dev in [dev for dev in self.object_ring.devs if dev and dev['ip'] in ips and dev['port'] == self.port]: dev_path = join(self.devices_dir, local_dev['device']) obj_path = join(dev_path, 'objects') tmp_path = join(dev_path, 'tmp') if self.mount_check and not os.path.ismount(dev_path): self.logger.warn(_('%s is not mounted'), local_dev['device']) continue unlink_older_than(tmp_path, time.time() - self.reclaim_age) if not os.path.exists(obj_path): continue for partition in os.listdir(obj_path): try: nodes = [node for node in self.object_ring.get_part_nodes(int(partition)) if node['id'] != local_dev['id']] jobs.append(dict(path=join(obj_path, partition), nodes=nodes, delete=len(nodes) > self.object_ring.replica_count - 1, partition=partition)) except ValueError: continue random.shuffle(jobs) # Partititons that need to be deleted take priority jobs.sort(key=lambda job: not job['delete']) self.job_count = len(jobs) return jobs def replicate(self): """Run a replication pass""" self.start = time.time() self.suffix_count = 0 self.suffix_sync = 0 self.suffix_hash = 0 self.replication_count = 0 self.last_replication_count = -1 self.partition_times = [] stats = eventlet.spawn(self.heartbeat) lockup_detector = eventlet.spawn(self.detect_lockups) eventlet.sleep() # Give spawns a cycle try: self.run_pool = GreenPool(size=self.concurrency) jobs = self.collect_jobs() for job in jobs: if not self.check_ring(): self.logger.info(_("Ring change detected. Aborting " "current replication pass.")) return if job['delete']: self.run_pool.spawn(self.update_deleted, job) else: self.run_pool.spawn(self.update, job) with Timeout(self.lockup_timeout): self.run_pool.waitall() except (Exception, Timeout): self.logger.exception(_("Exception in top-level replication loop")) self.kill_coros() finally: stats.kill() lockup_detector.kill() self.stats_line() def run_once(self, *args, **kwargs): start = time.time() self.logger.info(_("Running object replicator in script mode.")) self.replicate() total = (time.time() - start) / 60 self.logger.info( _("Object replication complete. (%.02f minutes)"), total) if self.recon_enable: try: dump_recon_cache('object_replication_time', total, \ self.recon_object) except (Exception, Timeout): self.logger.exception(_('Exception dumping recon cache')) def run_forever(self, *args, **kwargs): self.logger.info(_("Starting object replicator in daemon mode.")) # Run the replicator continually while True: start = time.time() self.logger.info(_("Starting object replication pass.")) # Run the replicator self.replicate() total = (time.time() - start) / 60 self.logger.info( _("Object replication complete. (%.02f minutes)"), total) if self.recon_enable: try: dump_recon_cache('object_replication_time', total, \ self.recon_object) except (Exception, Timeout): self.logger.exception(_('Exception dumping recon cache')) self.logger.debug(_('Replication sleeping for %s seconds.'), self.run_pause) sleep(self.run_pause)
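The replicate() pass above pairs a GreenPool of workers with two background green threads: a heartbeat that logs progress and a lockup detector that kills live coroutines when nothing has completed within lockup_timeout. A hedged, standalone sketch of that supervision pattern follows; do_work and the progress counter are illustrative, while eventlet.spawn, eventlet.sleep, GreenPool.coroutines_running and GreenThread.kill are the real APIs the daemon uses.

import eventlet
from eventlet import GreenPool
from greenlet import GreenletExit

pool = GreenPool(size=2)
progress = {'count': 0, 'last': -1}

def do_work(i):
    eventlet.sleep(0.1)             # placeholder for one partition's work
    progress['count'] += 1

def detect_lockups(timeout):
    while True:
        eventlet.sleep(timeout)
        if progress['count'] == progress['last']:
            # nothing finished since the last check: kill the live coroutines
            for coro in list(pool.coroutines_running):
                coro.kill(GreenletExit)
        progress['last'] = progress['count']

watchdog = eventlet.spawn(detect_lockups, 5)
for i in range(4):
    pool.spawn(do_work, i)
pool.waitall()
watchdog.kill()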
class Chewie(object): SIOCGIFHWADDR = 0x8927 SIOCGIFINDEX = 0x8933 PACKET_MR_MULTICAST = 0 PACKET_MR_PROMISC = 1 SOL_PACKET = 263 PACKET_ADD_MEMBERSHIP = 1 EAP_ADDRESS = MacAddress.from_string("01:80:c2:00:00:03") def __init__(self, interface_name, credentials, logger=None, auth_handler=None, group_address=None): self.interface_name = interface_name self.credentials = credentials self.logger = logger self.auth_handler = auth_handler self.group_address = group_address if not group_address: self.group_address = self.EAP_ADDRESS def run(self): self.logger.info("CHEWIE: Starting") self.open_socket() self.get_interface_info() self.build_state_machine() self.join_multicast_group() self.start_threads_and_wait() def start_threads_and_wait(self): self.pool = GreenPool() self.eventlets = [] self.eventlets.append(self.pool.spawn(self.send_messages)) self.eventlets.append(self.pool.spawn(self.receive_messages)) self.pool.waitall() def auth_success(self, src_mac): if self.auth_handler: self.auth_handler(src_mac, self.group_address) def send_messages(self): while True: sleep(0) message = self.state_machine.output_messages.get() self.logger.info("CHEWIE: Sending message %s to %s" % (message, str(self.group_address))) self.socket.send(MessagePacker.pack(message, self.group_address)) def receive_messages(self): while True: sleep(0) packed_message = self.socket.recv(4096) message = MessageParser.parse(packed_message) self.logger.info("CHEWIE: Received message: %s" % message) event = EventMessageReceived(message) self.state_machine.event(event) def open_socket(self): self.socket = socket.socket(socket.PF_PACKET, socket.SOCK_RAW, socket.htons(0x888e)) self.socket.bind((self.interface_name, 0)) def build_state_machine(self): self.state_machine = StateMachine(self.interface_address, self.auth_success) def get_interface_info(self): self.get_interface_address() self.get_interface_index() def get_interface_address(self): # http://man7.org/linux/man-pages/man7/netdevice.7.html ifreq = struct.pack('16sH6s', self.interface_name.encode("utf-8"), 0, b"") response = ioctl(self.socket, self.SIOCGIFHWADDR, ifreq) _interface_name, _address_family, interface_address = struct.unpack('16sH6s', response) self.interface_address = MacAddress(interface_address) def get_interface_index(self): # http://man7.org/linux/man-pages/man7/netdevice.7.html ifreq = struct.pack('16sI', self.interface_name.encode("utf-8"), 0) response = ioctl(self.socket, self.SIOCGIFINDEX, ifreq) _ifname, self.interface_index = struct.unpack('16sI', response) def join_multicast_group(self): # TODO this works but should blank out the end bytes mreq = struct.pack("IHH8s", self.interface_index, self.PACKET_MR_PROMISC, len(self.EAP_ADDRESS.address), self.EAP_ADDRESS.address) self.socket.setsockopt(self.SOL_PACKET, self.PACKET_ADD_MEMBERSHIP, mreq)
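Chewie.run() opens a raw 802.1X socket on the named interface, joins the EAP multicast group and then spawns its send and receive loops on a GreenPool. Below is a hypothetical driver based only on the constructor signature shown above; the logger setup and auth callback are illustrative, and credentials stands in for whatever object the installed StateMachine expects.

import logging

def on_auth_success(src_mac, group_address):
    # called back via Chewie.auth_success() once a supplicant authenticates
    print("authenticated %s via %s" % (src_mac, group_address))

logging.basicConfig(level=logging.INFO)
chewie = Chewie("eth0",                          # interface for the raw socket
                credentials=None,                # placeholder for site credentials
                logger=logging.getLogger("chewie"),
                auth_handler=on_auth_success)
chewie.run()   # blocks: spawns send_messages/receive_messages on a GreenPool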
class ObjectReplicator(Daemon): """ Replicate objects. Encapsulates most logic and data needed by the object replication process. Each call to .replicate() performs one replication pass. It's up to the caller to do this in a loop. """ def __init__(self, conf): """ :param conf: configuration object obtained from ConfigParser :param logger: logging object """ self.conf = conf self.logger = get_logger(conf, log_route="object-replicator") self.devices_dir = conf.get("devices", "/srv/node") self.mount_check = conf.get("mount_check", "true").lower() in ("true", "t", "1", "on", "yes", "y") self.vm_test_mode = conf.get("vm_test_mode", "no").lower() in ("yes", "true", "on", "1") self.swift_dir = conf.get("swift_dir", "/etc/swift") self.port = int(conf.get("bind_port", 6000)) self.concurrency = int(conf.get("concurrency", 1)) self.stats_interval = int(conf.get("stats_interval", "300")) self.object_ring = Ring(self.swift_dir, ring_name="object") self.ring_check_interval = int(conf.get("ring_check_interval", 15)) self.next_check = time.time() + self.ring_check_interval self.reclaim_age = int(conf.get("reclaim_age", 86400 * 7)) self.partition_times = [] self.run_pause = int(conf.get("run_pause", 30)) self.rsync_timeout = int(conf.get("rsync_timeout", 900)) self.rsync_io_timeout = conf.get("rsync_io_timeout", "30") self.http_timeout = int(conf.get("http_timeout", 60)) self.lockup_timeout = int(conf.get("lockup_timeout", 1800)) self.recon_cache_path = conf.get("recon_cache_path", "/var/cache/swift") self.rcache = os.path.join(self.recon_cache_path, "object.recon") def _rsync(self, args): """ Execute the rsync binary to replicate a partition. :returns: return code of rsync process. 0 is successful """ start_time = time.time() ret_val = None try: with Timeout(self.rsync_timeout): proc = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) results = proc.stdout.read() ret_val = proc.wait() except Timeout: self.logger.error(_("Killing long-running rsync: %s"), str(args)) proc.kill() return 1 # failure response code total_time = time.time() - start_time for result in results.split("\n"): if result == "": continue if result.startswith("cd+"): continue if not ret_val: self.logger.info(result) else: self.logger.error(result) if ret_val: self.logger.error(_("Bad rsync return code: %(args)s -> %(ret)d"), {"args": str(args), "ret": ret_val}) elif results: self.logger.info( _("Successful rsync of %(src)s at %(dst)s (%(time).03f)"), {"src": args[-2], "dst": args[-1], "time": total_time}, ) else: self.logger.debug( _("Successful rsync of %(src)s at %(dst)s (%(time).03f)"), {"src": args[-2], "dst": args[-1], "time": total_time}, ) return ret_val def rsync(self, node, job, suffixes): """ Synchronize local suffix directories from a partition with a remote node. 
:param node: the "dev" entry for the remote node to sync with :param job: information about the partition being synced :param suffixes: a list of suffixes which need to be pushed :returns: boolean indicating success or failure """ if not os.path.exists(job["path"]): return False args = [ "rsync", "--recursive", "--whole-file", "--human-readable", "--xattrs", "--itemize-changes", "--ignore-existing", "--timeout=%s" % self.rsync_io_timeout, "--contimeout=%s" % self.rsync_io_timeout, ] node_ip = rsync_ip(node["ip"]) if self.vm_test_mode: rsync_module = "%s::object%s" % (node_ip, node["port"]) else: rsync_module = "%s::object" % node_ip had_any = False for suffix in suffixes: spath = join(job["path"], suffix) if os.path.exists(spath): args.append(spath) had_any = True if not had_any: return False args.append(join(rsync_module, node["device"], "objects", job["partition"])) return self._rsync(args) == 0 def check_ring(self): """ Check to see if the ring has been updated :returns: boolean indicating whether or not the ring has changed """ if time.time() > self.next_check: self.next_check = time.time() + self.ring_check_interval if self.object_ring.has_changed(): return False return True def update_deleted(self, job): """ High-level method that replicates a single partition that doesn't belong on this node. :param job: a dict containing info about the partition to be replicated """ def tpool_get_suffixes(path): return [suff for suff in os.listdir(path) if len(suff) == 3 and isdir(join(path, suff))] self.replication_count += 1 self.logger.increment("partition.delete.count.%s" % (job["device"],)) begin = time.time() try: responses = [] suffixes = tpool.execute(tpool_get_suffixes, job["path"]) if suffixes: for node in job["nodes"]: success = self.rsync(node, job, suffixes) if success: with Timeout(self.http_timeout): http_connect( node["ip"], node["port"], node["device"], job["partition"], "REPLICATE", "/" + "-".join(suffixes), headers={"Content-Length": "0"}, ).getresponse().read() responses.append(success) if not suffixes or (len(responses) == len(job["nodes"]) and all(responses)): self.logger.info(_("Removing partition: %s"), job["path"]) tpool.execute(shutil.rmtree, job["path"], ignore_errors=True) except (Exception, Timeout): self.logger.exception(_("Error syncing handoff partition")) finally: self.partition_times.append(time.time() - begin) self.logger.timing_since("partition.delete.timing", begin) def update(self, job): """ High-level method that replicates a single partition. 
:param job: a dict containing info about the partition to be replicated """ self.replication_count += 1 self.logger.increment("partition.update.count.%s" % (job["device"],)) begin = time.time() try: hashed, local_hash = tpool_reraise( get_hashes, job["path"], do_listdir=(self.replication_count % 10) == 0, reclaim_age=self.reclaim_age ) self.suffix_hash += hashed self.logger.update_stats("suffix.hashes", hashed) attempts_left = len(job["nodes"]) nodes = itertools.chain(job["nodes"], self.object_ring.get_more_nodes(int(job["partition"]))) while attempts_left > 0: # If this throws StopIterator it will be caught way below node = next(nodes) attempts_left -= 1 try: with Timeout(self.http_timeout): resp = http_connect( node["ip"], node["port"], node["device"], job["partition"], "REPLICATE", "", headers={"Content-Length": "0"}, ).getresponse() if resp.status == HTTP_INSUFFICIENT_STORAGE: self.logger.error(_("%(ip)s/%(device)s responded" " as unmounted"), node) attempts_left += 1 continue if resp.status != HTTP_OK: self.logger.error( _("Invalid response %(resp)s " "from %(ip)s"), {"resp": resp.status, "ip": node["ip"]} ) continue remote_hash = pickle.loads(resp.read()) del resp suffixes = [suffix for suffix in local_hash if local_hash[suffix] != remote_hash.get(suffix, -1)] if not suffixes: continue hashed, recalc_hash = tpool_reraise( get_hashes, job["path"], recalculate=suffixes, reclaim_age=self.reclaim_age ) self.logger.update_stats("suffix.hashes", hashed) local_hash = recalc_hash suffixes = [suffix for suffix in local_hash if local_hash[suffix] != remote_hash.get(suffix, -1)] self.rsync(node, job, suffixes) with Timeout(self.http_timeout): conn = http_connect( node["ip"], node["port"], node["device"], job["partition"], "REPLICATE", "/" + "-".join(suffixes), headers={"Content-Length": "0"}, ) conn.getresponse().read() self.suffix_sync += len(suffixes) self.logger.update_stats("suffix.syncs", len(suffixes)) except (Exception, Timeout): self.logger.exception(_("Error syncing with node: %s") % node) self.suffix_count += len(local_hash) except (Exception, Timeout): self.logger.exception(_("Error syncing partition")) finally: self.partition_times.append(time.time() - begin) self.logger.timing_since("partition.update.timing", begin) def stats_line(self): """ Logs various stats for the currently running replication pass. 
""" if self.replication_count: elapsed = (time.time() - self.start) or 0.000001 rate = self.replication_count / elapsed self.logger.info( _( "%(replicated)d/%(total)d (%(percentage).2f%%)" " partitions replicated in %(time).2fs (%(rate).2f/sec, " "%(remaining)s remaining)" ), { "replicated": self.replication_count, "total": self.job_count, "percentage": self.replication_count * 100.0 / self.job_count, "time": time.time() - self.start, "rate": rate, "remaining": "%d%s" % compute_eta(self.start, self.replication_count, self.job_count), }, ) if self.suffix_count: self.logger.info( _("%(checked)d suffixes checked - " "%(hashed).2f%% hashed, %(synced).2f%% synced"), { "checked": self.suffix_count, "hashed": (self.suffix_hash * 100.0) / self.suffix_count, "synced": (self.suffix_sync * 100.0) / self.suffix_count, }, ) self.partition_times.sort() self.logger.info( _("Partition times: max %(max).4fs, " "min %(min).4fs, med %(med).4fs"), { "max": self.partition_times[-1], "min": self.partition_times[0], "med": self.partition_times[len(self.partition_times) // 2], }, ) else: self.logger.info(_("Nothing replicated for %s seconds."), (time.time() - self.start)) def kill_coros(self): """Utility function that kills all coroutines currently running.""" for coro in list(self.run_pool.coroutines_running): try: coro.kill(GreenletExit) except GreenletExit: pass def heartbeat(self): """ Loop that runs in the background during replication. It periodically logs progress. """ while True: eventlet.sleep(self.stats_interval) self.stats_line() def detect_lockups(self): """ In testing, the pool.waitall() call very occasionally failed to return. This is an attempt to make sure the replicator finishes its replication pass in some eventuality. """ while True: eventlet.sleep(self.lockup_timeout) if self.replication_count == self.last_replication_count: self.logger.error(_("Lockup detected.. killing live coros.")) self.kill_coros() self.last_replication_count = self.replication_count def collect_jobs(self): """ Returns a sorted list of jobs (dictionaries) that specify the partitions, nodes, etc to be rsynced. 
""" jobs = [] ips = whataremyips() for local_dev in [ dev for dev in self.object_ring.devs if dev and dev["ip"] in ips and dev["port"] == self.port ]: dev_path = join(self.devices_dir, local_dev["device"]) obj_path = join(dev_path, "objects") tmp_path = join(dev_path, "tmp") if self.mount_check and not os.path.ismount(dev_path): self.logger.warn(_("%s is not mounted"), local_dev["device"]) continue unlink_older_than(tmp_path, time.time() - self.reclaim_age) if not os.path.exists(obj_path): mkdirs(obj_path) continue for partition in os.listdir(obj_path): try: part_nodes = self.object_ring.get_part_nodes(int(partition)) nodes = [node for node in part_nodes if node["id"] != local_dev["id"]] jobs.append( dict( path=join(obj_path, partition), device=local_dev["device"], nodes=nodes, delete=len(nodes) > len(part_nodes) - 1, partition=partition, ) ) except ValueError: continue random.shuffle(jobs) # Partititons that need to be deleted take priority jobs.sort(key=lambda job: not job["delete"]) self.job_count = len(jobs) return jobs def replicate(self): """Run a replication pass""" self.start = time.time() self.suffix_count = 0 self.suffix_sync = 0 self.suffix_hash = 0 self.replication_count = 0 self.last_replication_count = -1 self.partition_times = [] stats = eventlet.spawn(self.heartbeat) lockup_detector = eventlet.spawn(self.detect_lockups) eventlet.sleep() # Give spawns a cycle try: self.run_pool = GreenPool(size=self.concurrency) jobs = self.collect_jobs() for job in jobs: dev_path = join(self.devices_dir, job["device"]) if self.mount_check and not os.path.ismount(dev_path): self.logger.warn(_("%s is not mounted"), job["device"]) continue if not self.check_ring(): self.logger.info(_("Ring change detected. Aborting " "current replication pass.")) return if job["delete"]: self.run_pool.spawn(self.update_deleted, job) else: self.run_pool.spawn(self.update, job) with Timeout(self.lockup_timeout): self.run_pool.waitall() except (Exception, Timeout): self.logger.exception(_("Exception in top-level replication loop")) self.kill_coros() finally: stats.kill() lockup_detector.kill() self.stats_line() def run_once(self, *args, **kwargs): start = time.time() self.logger.info(_("Running object replicator in script mode.")) self.replicate() total = (time.time() - start) / 60 self.logger.info(_("Object replication complete. (%.02f minutes)"), total) dump_recon_cache({"object_replication_time": total}, self.rcache, self.logger) def run_forever(self, *args, **kwargs): self.logger.info(_("Starting object replicator in daemon mode.")) # Run the replicator continually while True: start = time.time() self.logger.info(_("Starting object replication pass.")) # Run the replicator self.replicate() total = (time.time() - start) / 60 self.logger.info(_("Object replication complete. (%.02f minutes)"), total) dump_recon_cache({"object_replication_time": total}, self.rcache, self.logger) self.logger.debug(_("Replication sleeping for %s seconds."), self.run_pause) sleep(self.run_pause)
class ObjectReconstructor(Daemon): """ Reconstruct objects using erasure code. And also rebalance EC Fragment Archive objects off handoff nodes. Encapsulates most logic and data needed by the object reconstruction process. Each call to .reconstruct() performs one pass. It's up to the caller to do this in a loop. """ def __init__(self, conf, logger=None): """ :param conf: configuration object obtained from ConfigParser :param logger: logging object """ self.conf = conf self.logger = logger or get_logger( conf, log_route='object-reconstructor') self.devices_dir = conf.get('devices', '/srv/node') self.mount_check = config_true_value(conf.get('mount_check', 'true')) self.swift_dir = conf.get('swift_dir', '/etc/swift') self.bind_ip = conf.get('bind_ip', '0.0.0.0') self.servers_per_port = int(conf.get('servers_per_port', '0') or 0) self.port = None if self.servers_per_port else \ int(conf.get('bind_port', 6200)) self.concurrency = int(conf.get('concurrency', 1)) self.stats_interval = int(conf.get('stats_interval', '300')) self.ring_check_interval = int(conf.get('ring_check_interval', 15)) self.next_check = time.time() + self.ring_check_interval self.reclaim_age = int(conf.get('reclaim_age', 86400 * 7)) self.partition_times = [] self.interval = int(conf.get('interval') or conf.get('run_pause') or 30) self.http_timeout = int(conf.get('http_timeout', 60)) self.lockup_timeout = int(conf.get('lockup_timeout', 1800)) self.recon_cache_path = conf.get('recon_cache_path', '/var/cache/swift') self.rcache = os.path.join(self.recon_cache_path, "object.recon") # defaults subject to change after beta self.conn_timeout = float(conf.get('conn_timeout', 0.5)) self.node_timeout = float(conf.get('node_timeout', 10)) self.network_chunk_size = int(conf.get('network_chunk_size', 65536)) self.disk_chunk_size = int(conf.get('disk_chunk_size', 65536)) self.headers = { 'Content-Length': '0', 'user-agent': 'obj-reconstructor %s' % os.getpid()} self.handoffs_first = config_true_value(conf.get('handoffs_first', False)) self._df_router = DiskFileRouter(conf, self.logger) def load_object_ring(self, policy): """ Make sure the policy's rings are loaded. 
:param policy: the StoragePolicy instance :returns: appropriate ring object """ policy.load_ring(self.swift_dir) return policy.object_ring def check_ring(self, object_ring): """ Check to see if the ring has been updated :param object_ring: the ring to check :returns: boolean indicating whether or not the ring has changed """ if time.time() > self.next_check: self.next_check = time.time() + self.ring_check_interval if object_ring.has_changed(): return False return True def _full_path(self, node, part, path, policy): return '%(replication_ip)s:%(replication_port)s' \ '/%(device)s/%(part)s%(path)s ' \ 'policy#%(policy)d frag#%(frag_index)s' % { 'replication_ip': node['replication_ip'], 'replication_port': node['replication_port'], 'device': node['device'], 'part': part, 'path': path, 'policy': policy, 'frag_index': node.get('index', 'handoff'), } def _get_response(self, node, part, path, headers, policy): """ Helper method for reconstruction that GETs a single EC fragment archive :param node: the node to GET from :param part: the partition :param path: full path of the desired EC archive :param headers: the headers to send :param policy: an instance of :class:`~swift.common.storage_policy.BaseStoragePolicy` :returns: response """ resp = None try: with ConnectionTimeout(self.conn_timeout): conn = http_connect(node['ip'], node['port'], node['device'], part, 'GET', path, headers=headers) with Timeout(self.node_timeout): resp = conn.getresponse() if resp.status not in [HTTP_OK, HTTP_NOT_FOUND]: self.logger.warning( _("Invalid response %(resp)s from %(full_path)s"), {'resp': resp.status, 'full_path': self._full_path(node, part, path, policy)}) resp = None elif resp.status == HTTP_NOT_FOUND: resp = None except (Exception, Timeout): self.logger.exception( _("Trying to GET %(full_path)s"), { 'full_path': self._full_path(node, part, path, policy)}) return resp def reconstruct_fa(self, job, node, datafile_metadata): """ Reconstructs a fragment archive - this method is called from ssync after a remote node responds that is missing this object - the local diskfile is opened to provide metadata - but to reconstruct the missing fragment archive we must connect to multiple object servers. 
:param job: job from ssync_sender :param node: node that we're rebuilding to :param datafile_metadata: the datafile metadata to attach to the rebuilt fragment archive :returns: a DiskFile like class for use by ssync :raises DiskFileError: if the fragment archive cannot be reconstructed """ part_nodes = job['policy'].object_ring.get_part_nodes( job['partition']) part_nodes.remove(node) # the fragment index we need to reconstruct is the position index # of the node we're rebuilding to within the primary part list fi_to_rebuild = node['index'] # KISS send out connection requests to all nodes, see what sticks headers = self.headers.copy() headers['X-Backend-Storage-Policy-Index'] = int(job['policy']) pile = GreenAsyncPile(len(part_nodes)) path = datafile_metadata['name'] for node in part_nodes: pile.spawn(self._get_response, node, job['partition'], path, headers, job['policy']) responses = [] etag = None for resp in pile: if not resp: continue resp.headers = HeaderKeyDict(resp.getheaders()) if str(fi_to_rebuild) == \ resp.headers.get('X-Object-Sysmeta-Ec-Frag-Index'): continue if resp.headers.get('X-Object-Sysmeta-Ec-Frag-Index') in set( r.headers.get('X-Object-Sysmeta-Ec-Frag-Index') for r in responses): continue responses.append(resp) etag = sorted(responses, reverse=True, key=lambda r: Timestamp( r.headers.get('X-Backend-Timestamp') ))[0].headers.get('X-Object-Sysmeta-Ec-Etag') responses = [r for r in responses if r.headers.get('X-Object-Sysmeta-Ec-Etag') == etag] if len(responses) >= job['policy'].ec_ndata: break else: self.logger.error( 'Unable to get enough responses (%s/%s) ' 'to reconstruct %s with ETag %s' % ( len(responses), job['policy'].ec_ndata, self._full_path(node, job['partition'], datafile_metadata['name'], job['policy']), etag)) raise DiskFileError('Unable to reconstruct EC archive') rebuilt_fragment_iter = self.make_rebuilt_fragment_iter( responses[:job['policy'].ec_ndata], path, job['policy'], fi_to_rebuild) return RebuildingECDiskFileStream(datafile_metadata, fi_to_rebuild, rebuilt_fragment_iter) def _reconstruct(self, policy, fragment_payload, frag_index): return policy.pyeclib_driver.reconstruct(fragment_payload, [frag_index])[0] def make_rebuilt_fragment_iter(self, responses, path, policy, frag_index): """ Turn a set of connections from backend object servers into a generator that yields up the rebuilt fragment archive for frag_index. """ def _get_one_fragment(resp): buff = '' remaining_bytes = policy.fragment_size while remaining_bytes: chunk = resp.read(remaining_bytes) if not chunk: break remaining_bytes -= len(chunk) buff += chunk return buff def fragment_payload_iter(): # We need a fragment from each connections, so best to # use a GreenPile to keep them ordered and in sync pile = GreenPile(len(responses)) while True: for resp in responses: pile.spawn(_get_one_fragment, resp) try: with Timeout(self.node_timeout): fragment_payload = [fragment for fragment in pile] except (Exception, Timeout): self.logger.exception( _("Error trying to rebuild %(path)s " "policy#%(policy)d frag#%(frag_index)s"), {'path': path, 'policy': policy, 'frag_index': frag_index, }) break if not all(fragment_payload): break rebuilt_fragment = self._reconstruct( policy, fragment_payload, frag_index) yield rebuilt_fragment return fragment_payload_iter() def stats_line(self): """ Logs various stats for the currently running reconstruction pass. 
""" if (self.device_count and self.part_count and self.reconstruction_device_count): elapsed = (time.time() - self.start) or 0.000001 rate = self.reconstruction_part_count / elapsed total_part_count = (self.part_count * self.device_count / self.reconstruction_device_count) self.logger.info( _("%(reconstructed)d/%(total)d (%(percentage).2f%%)" " partitions of %(device)d/%(dtotal)d " "(%(dpercentage).2f%%) devices" " reconstructed in %(time).2fs " "(%(rate).2f/sec, %(remaining)s remaining)"), {'reconstructed': self.reconstruction_part_count, 'total': self.part_count, 'percentage': self.reconstruction_part_count * 100.0 / self.part_count, 'device': self.reconstruction_device_count, 'dtotal': self.device_count, 'dpercentage': self.reconstruction_device_count * 100.0 / self.device_count, 'time': time.time() - self.start, 'rate': rate, 'remaining': '%d%s' % compute_eta(self.start, self.reconstruction_part_count, total_part_count)}) if self.suffix_count and self.partition_times: self.logger.info( _("%(checked)d suffixes checked - " "%(hashed).2f%% hashed, %(synced).2f%% synced"), {'checked': self.suffix_count, 'hashed': (self.suffix_hash * 100.0) / self.suffix_count, 'synced': (self.suffix_sync * 100.0) / self.suffix_count}) self.partition_times.sort() self.logger.info( _("Partition times: max %(max).4fs, " "min %(min).4fs, med %(med).4fs"), {'max': self.partition_times[-1], 'min': self.partition_times[0], 'med': self.partition_times[ len(self.partition_times) // 2]}) else: self.logger.info( _("Nothing reconstructed for %s seconds."), (time.time() - self.start)) def kill_coros(self): """Utility function that kills all coroutines currently running.""" for coro in list(self.run_pool.coroutines_running): try: coro.kill(GreenletExit) except GreenletExit: pass def heartbeat(self): """ Loop that runs in the background during reconstruction. It periodically logs progress. """ while True: sleep(self.stats_interval) self.stats_line() def detect_lockups(self): """ In testing, the pool.waitall() call very occasionally failed to return. This is an attempt to make sure the reconstructor finishes its reconstruction pass in some eventuality. """ while True: sleep(self.lockup_timeout) if self.reconstruction_count == self.last_reconstruction_count: self.logger.error(_("Lockup detected.. killing live coros.")) self.kill_coros() self.last_reconstruction_count = self.reconstruction_count def _get_hashes(self, policy, path, recalculate=None, do_listdir=False): df_mgr = self._df_router[policy] hashed, suffix_hashes = tpool_reraise( df_mgr._get_hashes, path, recalculate=recalculate, do_listdir=do_listdir, reclaim_age=self.reclaim_age) self.logger.update_stats('suffix.hashes', hashed) return suffix_hashes def get_suffix_delta(self, local_suff, local_index, remote_suff, remote_index): """ Compare the local suffix hashes with the remote suffix hashes for the given local and remote fragment indexes. Return those suffixes which should be synced. 
:param local_suff: the local suffix hashes (from _get_hashes) :param local_index: the local fragment index for the job :param remote_suff: the remote suffix hashes (from remote REPLICATE request) :param remote_index: the remote fragment index for the job :returns: a list of strings, the suffix dirs to sync """ suffixes = [] for suffix, sub_dict_local in local_suff.items(): sub_dict_remote = remote_suff.get(suffix, {}) if (sub_dict_local.get(None) != sub_dict_remote.get(None) or sub_dict_local.get(local_index) != sub_dict_remote.get(remote_index)): suffixes.append(suffix) return suffixes def rehash_remote(self, node, job, suffixes): try: with Timeout(self.http_timeout): conn = http_connect( node['replication_ip'], node['replication_port'], node['device'], job['partition'], 'REPLICATE', '/' + '-'.join(sorted(suffixes)), headers=self.headers) conn.getresponse().read() except (Exception, Timeout): self.logger.exception( _("Trying to sync suffixes with %s") % self._full_path( node, job['partition'], '', job['policy'])) def _get_suffixes_to_sync(self, job, node): """ For SYNC jobs we need to make a remote REPLICATE request to get the remote node's current suffix's hashes and then compare to our local suffix's hashes to decide which suffixes (if any) are out of sync. :param: the job dict, with the keys defined in ``_get_part_jobs`` :param node: the remote node dict :returns: a (possibly empty) list of strings, the suffixes to be synced with the remote node. """ # get hashes from the remote node remote_suffixes = None try: with Timeout(self.http_timeout): resp = http_connect( node['replication_ip'], node['replication_port'], node['device'], job['partition'], 'REPLICATE', '', headers=self.headers).getresponse() if resp.status == HTTP_INSUFFICIENT_STORAGE: self.logger.error( _('%s responded as unmounted'), self._full_path(node, job['partition'], '', job['policy'])) elif resp.status != HTTP_OK: full_path = self._full_path(node, job['partition'], '', job['policy']) self.logger.error( _("Invalid response %(resp)s from %(full_path)s"), {'resp': resp.status, 'full_path': full_path}) else: remote_suffixes = pickle.loads(resp.read()) except (Exception, Timeout): # all exceptions are logged here so that our caller can # safely catch our exception and continue to the next node # without logging self.logger.exception('Unable to get remote suffix hashes ' 'from %r' % self._full_path( node, job['partition'], '', job['policy'])) if remote_suffixes is None: raise SuffixSyncError('Unable to get remote suffix hashes') suffixes = self.get_suffix_delta(job['hashes'], job['frag_index'], remote_suffixes, node['index']) # now recalculate local hashes for suffixes that don't # match so we're comparing the latest local_suff = self._get_hashes(job['policy'], job['path'], recalculate=suffixes) suffixes = self.get_suffix_delta(local_suff, job['frag_index'], remote_suffixes, node['index']) self.suffix_count += len(suffixes) return suffixes def delete_reverted_objs(self, job, objects, frag_index): """ For EC we can potentially revert only some of a partition so we'll delete reverted objects here. Note that we delete the fragment index of the file we sent to the remote node. 
:param job: the job being processed :param objects: a dict of objects to be deleted, each entry maps hash=>timestamp :param frag_index: (int) the fragment index of data files to be deleted """ df_mgr = self._df_router[job['policy']] for object_hash, timestamps in objects.items(): try: df = df_mgr.get_diskfile_from_hash( job['local_dev']['device'], job['partition'], object_hash, job['policy'], frag_index=frag_index) df.purge(timestamps['ts_data'], frag_index) except DiskFileError: self.logger.exception( 'Unable to purge DiskFile (%r %r %r)', object_hash, timestamps['ts_data'], frag_index) continue def process_job(self, job): """ Sync the local partition with the remote node(s) according to the parameters of the job. For primary nodes, the SYNC job type will define both left and right hand sync_to nodes to ssync with as defined by this primary node's index in the node list based on the fragment index found in the partition. For non-primary nodes (either handoff revert, or rebalance) the REVERT job will define a single node in sync_to which is the proper/new home for the fragment index. N.B. Because ring rebalancing can be time consuming and handoff nodes' fragment indexes do not have a stable order, it's possible to have more than one REVERT job for a partition, and in some rare failure conditions there may also be a SYNC job for the same partition - but each one will be processed separately because each job will define a separate list of node(s) to 'sync_to'. :param job: the job dict, with the keys defined in ``_get_part_jobs`` """ self.headers['X-Backend-Storage-Policy-Index'] = int(job['policy']) begin = time.time() if job['job_type'] == REVERT: self._revert(job, begin) else: self._sync(job, begin) self.partition_times.append(time.time() - begin) self.reconstruction_count += 1 def _sync(self, job, begin): """ Process a SYNC job. """ self.logger.increment( 'partition.update.count.%s' % (job['local_dev']['device'],)) # after our left and right partners, if there's some sort of # failure we'll continue onto the remaining primary nodes and # make sure they're in sync - or potentially rebuild missing # fragments we find dest_nodes = itertools.chain( job['sync_to'], # I think we could order these based on our index to better # protect against a broken chain [ n for n in job['policy'].object_ring.get_part_nodes(job['partition']) if n['id'] != job['local_dev']['id'] and n['id'] not in (m['id'] for m in job['sync_to']) ], ) syncd_with = 0 for node in dest_nodes: if syncd_with >= len(job['sync_to']): # success! break try: suffixes = self._get_suffixes_to_sync(job, node) except SuffixSyncError: continue if not suffixes: syncd_with += 1 continue # ssync any out-of-sync suffixes with the remote node success, _ = ssync_sender( self, node, job, suffixes)() # let remote end know to rehash its suffixes self.rehash_remote(node, job, suffixes) # update stats for this attempt self.suffix_sync += len(suffixes) self.logger.update_stats('suffix.syncs', len(suffixes)) if success: syncd_with += 1 self.logger.timing_since('partition.update.timing', begin) def _revert(self, job, begin): """ Process a REVERT job. """ self.logger.increment( 'partition.delete.count.%s' % (job['local_dev']['device'],)) # we'd desperately like to push this partition back to its # primary location, but if that node is down, the next best thing # is one of the handoff locations - which *might* be us already! 
dest_nodes = itertools.chain( job['sync_to'], job['policy'].object_ring.get_more_nodes(job['partition']), ) syncd_with = 0 reverted_objs = {} for node in dest_nodes: if syncd_with >= len(job['sync_to']): break if node['id'] == job['local_dev']['id']: # this is as good a place as any for this data for now break success, in_sync_objs = ssync_sender( self, node, job, job['suffixes'])() self.rehash_remote(node, job, job['suffixes']) if success: syncd_with += 1 reverted_objs.update(in_sync_objs) if syncd_with >= len(job['sync_to']): self.delete_reverted_objs( job, reverted_objs, job['frag_index']) self.logger.timing_since('partition.delete.timing', begin) def _get_part_jobs(self, local_dev, part_path, partition, policy): """ Helper function to build jobs for a partition, this method will read the suffix hashes and create job dictionaries to describe the needed work. There will be one job for each fragment index discovered in the partition. For a fragment index which corresponds to this node's ring index, a job with job_type SYNC will be created to ensure that the left and right hand primary ring nodes for the part have the corresponding left and right hand fragment archives. A fragment index (or entire partition) for which this node is not the primary corresponding node, will create job(s) with job_type REVERT to ensure that fragment archives are pushed to the correct node and removed from this one. A partition may result in multiple jobs. Potentially many REVERT jobs, and zero or one SYNC job. :param local_dev: the local device :param part_path: full path to partition :param partition: partition number :param policy: the policy :returns: a list of dicts of job info """ # find all the fi's in the part, and which suffixes have them hashes = self._get_hashes(policy, part_path, do_listdir=True) non_data_fragment_suffixes = [] data_fi_to_suffixes = defaultdict(list) for suffix, fi_hash in hashes.items(): if not fi_hash: # this is for sanity and clarity, normally an empty # suffix would get del'd from the hashes dict, but an # OSError trying to re-hash the suffix could leave the # value empty - it will log the exception; but there's # no way to properly address this suffix at this time. 
continue data_frag_indexes = [f for f in fi_hash if f is not None] if not data_frag_indexes: non_data_fragment_suffixes.append(suffix) else: for fi in data_frag_indexes: data_fi_to_suffixes[fi].append(suffix) # helper to ensure consistent structure of jobs def build_job(job_type, frag_index, suffixes, sync_to): return { 'job_type': job_type, 'frag_index': frag_index, 'suffixes': suffixes, 'sync_to': sync_to, 'partition': partition, 'path': part_path, 'hashes': hashes, 'policy': policy, 'local_dev': local_dev, # ssync likes to have it handy 'device': local_dev['device'], } # aggregate jobs for all the fragment index in this part jobs = [] # check the primary nodes - to see if the part belongs here part_nodes = policy.object_ring.get_part_nodes(partition) for node in part_nodes: if node['id'] == local_dev['id']: # this partition belongs here, we'll need a sync job frag_index = node['index'] try: suffixes = data_fi_to_suffixes.pop(frag_index) except KeyError: suffixes = [] sync_job = build_job( job_type=SYNC, frag_index=frag_index, suffixes=suffixes, sync_to=_get_partners(frag_index, part_nodes), ) # ssync callback to rebuild missing fragment_archives sync_job['sync_diskfile_builder'] = self.reconstruct_fa jobs.append(sync_job) break # assign remaining data fragment suffixes to revert jobs ordered_fis = sorted((len(suffixes), fi) for fi, suffixes in data_fi_to_suffixes.items()) for count, fi in ordered_fis: revert_job = build_job( job_type=REVERT, frag_index=fi, suffixes=data_fi_to_suffixes[fi], sync_to=[part_nodes[fi]], ) jobs.append(revert_job) # now we need to assign suffixes that have no data fragments if non_data_fragment_suffixes: if jobs: # the first job will be either the sync_job, or the # revert_job for the fragment index that is most common # among the suffixes jobs[0]['suffixes'].extend(non_data_fragment_suffixes) else: # this is an unfortunate situation, we need a revert job to # push partitions off this node, but none of the suffixes # have any data fragments to hint at which node would be a # good candidate to receive the tombstones. 
jobs.append(build_job( job_type=REVERT, frag_index=None, suffixes=non_data_fragment_suffixes, # this is super safe sync_to=part_nodes, # something like this would be probably be better # sync_to=random.sample(part_nodes, 3), )) # return a list of jobs for this part return jobs def collect_parts(self, override_devices=None, override_partitions=None): """ Helper for yielding partitions in the top level reconstructor """ override_devices = override_devices or [] override_partitions = override_partitions or [] ips = whataremyips(self.bind_ip) for policy in POLICIES: if policy.policy_type != EC_POLICY: continue self._diskfile_mgr = self._df_router[policy] self.load_object_ring(policy) data_dir = get_data_dir(policy) local_devices = list(six.moves.filter( lambda dev: dev and is_local_device( ips, self.port, dev['replication_ip'], dev['replication_port']), policy.object_ring.devs)) if override_devices: self.device_count = len(override_devices) else: self.device_count = len(local_devices) for local_dev in local_devices: if override_devices and (local_dev['device'] not in override_devices): continue self.reconstruction_device_count += 1 dev_path = self._df_router[policy].get_dev_path( local_dev['device']) if not dev_path: self.logger.warning(_('%s is not mounted'), local_dev['device']) continue obj_path = join(dev_path, data_dir) tmp_path = join(dev_path, get_tmp_dir(int(policy))) unlink_older_than(tmp_path, time.time() - self.reclaim_age) if not os.path.exists(obj_path): try: mkdirs(obj_path) except Exception: self.logger.exception( 'Unable to create %s' % obj_path) continue try: partitions = os.listdir(obj_path) except OSError: self.logger.exception( 'Unable to list partitions in %r' % obj_path) continue self.part_count += len(partitions) for partition in partitions: part_path = join(obj_path, partition) if not (partition.isdigit() and os.path.isdir(part_path)): self.logger.warning( 'Unexpected entity in data dir: %r' % part_path) remove_file(part_path) self.reconstruction_part_count += 1 continue partition = int(partition) if override_partitions and (partition not in override_partitions): continue part_info = { 'local_dev': local_dev, 'policy': policy, 'partition': partition, 'part_path': part_path, } yield part_info self.reconstruction_part_count += 1 def build_reconstruction_jobs(self, part_info): """ Helper function for collect_jobs to build jobs for reconstruction using EC style storage policy """ jobs = self._get_part_jobs(**part_info) random.shuffle(jobs) if self.handoffs_first: # Move the handoff revert jobs to the front of the list jobs.sort(key=lambda job: job['job_type'], reverse=True) self.job_count += len(jobs) return jobs def _reset_stats(self): self.start = time.time() self.job_count = 0 self.part_count = 0 self.device_count = 0 self.suffix_count = 0 self.suffix_sync = 0 self.suffix_hash = 0 self.reconstruction_count = 0 self.reconstruction_part_count = 0 self.reconstruction_device_count = 0 self.last_reconstruction_count = -1 def delete_partition(self, path): self.logger.info(_("Removing partition: %s"), path) tpool.execute(shutil.rmtree, path, ignore_errors=True) def reconstruct(self, **kwargs): """Run a reconstruction pass""" self._reset_stats() self.partition_times = [] stats = spawn(self.heartbeat) lockup_detector = spawn(self.detect_lockups) sleep() # Give spawns a cycle try: self.run_pool = GreenPool(size=self.concurrency) for part_info in self.collect_parts(**kwargs): if not self.check_ring(part_info['policy'].object_ring): self.logger.info(_("Ring change detected. 
Aborting " "current reconstruction pass.")) return jobs = self.build_reconstruction_jobs(part_info) if not jobs: # If this part belongs on this node, _get_part_jobs # will *always* build a sync_job - even if there's # no suffixes in the partition that needs to sync. # If there's any suffixes in the partition then our # job list would have *at least* one revert job. # Therefore we know this part a) doesn't belong on # this node and b) doesn't have any suffixes in it. self.run_pool.spawn(self.delete_partition, part_info['part_path']) for job in jobs: self.run_pool.spawn(self.process_job, job) with Timeout(self.lockup_timeout): self.run_pool.waitall() except (Exception, Timeout): self.logger.exception(_("Exception in top-level" "reconstruction loop")) self.kill_coros() finally: stats.kill() lockup_detector.kill() self.stats_line() def run_once(self, *args, **kwargs): start = time.time() self.logger.info(_("Running object reconstructor in script mode.")) override_devices = list_from_csv(kwargs.get('devices')) override_partitions = [int(p) for p in list_from_csv(kwargs.get('partitions'))] self.reconstruct( override_devices=override_devices, override_partitions=override_partitions) total = (time.time() - start) / 60 self.logger.info( _("Object reconstruction complete (once). (%.02f minutes)"), total) if not (override_partitions or override_devices): dump_recon_cache({'object_reconstruction_time': total, 'object_reconstruction_last': time.time()}, self.rcache, self.logger) def run_forever(self, *args, **kwargs): self.logger.info(_("Starting object reconstructor in daemon mode.")) # Run the reconstructor continually while True: start = time.time() self.logger.info(_("Starting object reconstruction pass.")) # Run the reconstructor self.reconstruct() total = (time.time() - start) / 60 self.logger.info( _("Object reconstruction complete. (%.02f minutes)"), total) dump_recon_cache({'object_reconstruction_time': total, 'object_reconstruction_last': time.time()}, self.rcache, self.logger) self.logger.debug('reconstruction sleeping for %s seconds.', self.interval) sleep(self.interval)
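get_suffix_delta() above decides which suffix directories need ssyncing by comparing the None-keyed hash (tombstones and durables) and the hash stored under each side's fragment index. A worked example with invented hash values, using local_index=2 and remote_index=0: suffix 'abc' differs on the fragment hash, suffix 'def' matches, so only 'abc' would be synced.

local_suff = {'abc': {None: 'ts1', 2: 'hashX'},
              'def': {None: 'ts1', 2: 'hashY'}}
remote_suff = {'abc': {None: 'ts1', 0: 'hashZ'},
               'def': {None: 'ts1', 0: 'hashY'}}

suffixes = []
for suffix, sub_dict_local in local_suff.items():
    sub_dict_remote = remote_suff.get(suffix, {})
    if (sub_dict_local.get(None) != sub_dict_remote.get(None) or
            sub_dict_local.get(2) != sub_dict_remote.get(0)):
        suffixes.append(suffix)

print(sorted(suffixes))   # ['abc']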
def reap_container(self, account, account_partition, account_nodes, container): """ Deletes the data and the container itself for the given container. This will call :func:`reap_object` up to sqrt(self.concurrency) times concurrently for the objects in the container. If there is any exception while deleting a single object, the process will continue for any other objects in the container and the failed objects will be tried again the next time this function is called with the same parameters. If there is any exception while listing the objects for deletion, the process will stop (but will obviously be tried again the next time this function is called with the same parameters). This is a possibility since the listing comes from querying just the primary remote container server. Once all objects have been attempted to be deleted, the container itself will be attempted to be deleted by sending a delete request to all container nodes. The format of the delete request is such that each container server will update a corresponding account server, removing the container from the account's listing. This function returns nothing and should raise no exception but only update various self.stats_* values for what occurs. :param account: The name of the account for the container. :param account_partition: The partition for the account on the account ring. :param account_nodes: The primary node dicts for the account. :param container: The name of the container to delete. * See also: :func:`swift.common.ring.Ring.get_nodes` for a description of the account node dicts. """ account_nodes = list(account_nodes) part, nodes = self.get_container_ring().get_nodes(account, container) node = nodes[-1] pool = GreenPool(size=self.object_concurrency) marker = '' while True: objects = None try: objects = direct_get_container( node, part, account, container, marker=marker, conn_timeout=self.conn_timeout, response_timeout=self.node_timeout)[1] self.stats_return_codes[2] = \ self.stats_return_codes.get(2, 0) + 1 except ClientException, err: if self.logger.getEffectiveLevel() <= DEBUG: self.logger.exception( _('Exception with %(ip)s:%(port)s/%(device)s'), node) self.stats_return_codes[err.http_status / 100] = \ self.stats_return_codes.get(err.http_status / 100, 0) + 1 if not objects: break try: for obj in objects: if isinstance(obj['name'], unicode): obj['name'] = obj['name'].encode('utf8') pool.spawn(self.reap_object, account, container, part, nodes, obj['name']) pool.waitall() except Exception: self.logger.exception( _('Exception with objects for container ' '%(container)s for account %(account)s'), { 'container': container, 'account': account }) marker = objects[-1]['name']
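A minimal, runnable sketch of the marker-based paging loop that reap_container() uses when listing a container. Here list_page() is a stand-in for direct_get_container() and the object names are made up: each listing request resumes after the last name seen, and the loop ends when a page comes back empty.

def list_page(all_names, marker, page_size=3):
    """Return up to page_size names strictly greater than marker."""
    return [name for name in sorted(all_names) if name > marker][:page_size]


names = ['obj-%02d' % i for i in range(8)]  # hypothetical container listing
marker = ''
reaped = []
while True:
    page = list_page(names, marker)
    if not page:
        break
    reaped.extend(page)   # reap_container spawns reap_object for each name here
    marker = page[-1]     # the next listing resumes after the last name returned

assert reaped == sorted(names)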
class ObjectReconstructor(Daemon): """ Reconstruct objects using erasure code. And also rebalance EC Fragment Archive objects off handoff nodes. Encapsulates most logic and data needed by the object reconstruction process. Each call to .reconstruct() performs one pass. It's up to the caller to do this in a loop. """ def __init__(self, conf, logger=None): """ :param conf: configuration object obtained from ConfigParser :param logger: logging object """ self.conf = conf self.logger = logger or get_logger(conf, log_route='object-reconstructor') self.devices_dir = conf.get('devices', '/srv/node') self.mount_check = config_true_value(conf.get('mount_check', 'true')) self.swift_dir = conf.get('swift_dir', '/etc/swift') self.bind_ip = conf.get('bind_ip', '0.0.0.0') self.servers_per_port = int(conf.get('servers_per_port', '0') or 0) self.port = None if self.servers_per_port else \ int(conf.get('bind_port', 6000)) self.concurrency = int(conf.get('concurrency', 1)) self.stats_interval = int(conf.get('stats_interval', '300')) self.ring_check_interval = int(conf.get('ring_check_interval', 15)) self.next_check = time.time() + self.ring_check_interval self.reclaim_age = int(conf.get('reclaim_age', 86400 * 7)) self.partition_times = [] self.interval = int( conf.get('interval') or conf.get('run_pause') or 30) self.http_timeout = int(conf.get('http_timeout', 60)) self.lockup_timeout = int(conf.get('lockup_timeout', 1800)) self.recon_cache_path = conf.get('recon_cache_path', '/var/cache/swift') self.rcache = os.path.join(self.recon_cache_path, "object.recon") # defaults subject to change after beta self.conn_timeout = float(conf.get('conn_timeout', 0.5)) self.node_timeout = float(conf.get('node_timeout', 10)) self.network_chunk_size = int(conf.get('network_chunk_size', 65536)) self.disk_chunk_size = int(conf.get('disk_chunk_size', 65536)) self.headers = { 'Content-Length': '0', 'user-agent': 'obj-reconstructor %s' % os.getpid() } self.handoffs_first = config_true_value( conf.get('handoffs_first', False)) self._df_router = DiskFileRouter(conf, self.logger) def load_object_ring(self, policy): """ Make sure the policy's rings are loaded. 
:param policy: the StoragePolicy instance :returns: appropriate ring object """ policy.load_ring(self.swift_dir) return policy.object_ring def check_ring(self, object_ring): """ Check to see if the ring has been updated :param object_ring: the ring to check :returns: boolean indicating whether or not the ring has changed """ if time.time() > self.next_check: self.next_check = time.time() + self.ring_check_interval if object_ring.has_changed(): return False return True def _full_path(self, node, part, path, policy): return '%(replication_ip)s:%(replication_port)s' \ '/%(device)s/%(part)s%(path)s ' \ 'policy#%(policy)d frag#%(frag_index)s' % { 'replication_ip': node['replication_ip'], 'replication_port': node['replication_port'], 'device': node['device'], 'part': part, 'path': path, 'policy': policy, 'frag_index': node.get('index', 'handoff'), } def _get_response(self, node, part, path, headers, policy): """ Helper method for reconstruction that GETs a single EC fragment archive :param node: the node to GET from :param part: the partition :param path: full path of the desired EC archive :param headers: the headers to send :param policy: an instance of :class:`~swift.common.storage_policy.BaseStoragePolicy` :returns: response """ resp = None try: with ConnectionTimeout(self.conn_timeout): conn = http_connect(node['ip'], node['port'], node['device'], part, 'GET', path, headers=headers) with Timeout(self.node_timeout): resp = conn.getresponse() if resp.status not in [HTTP_OK, HTTP_NOT_FOUND]: self.logger.warning( _("Invalid response %(resp)s from %(full_path)s"), { 'resp': resp.status, 'full_path': self._full_path(node, part, path, policy) }) resp = None elif resp.status == HTTP_NOT_FOUND: resp = None except (Exception, Timeout): self.logger.exception( _("Trying to GET %(full_path)s"), {'full_path': self._full_path(node, part, path, policy)}) return resp def reconstruct_fa(self, job, node, datafile_metadata): """ Reconstructs a fragment archive - this method is called from ssync after a remote node responds that is missing this object - the local diskfile is opened to provide metadata - but to reconstruct the missing fragment archive we must connect to multiple object servers. 
:param job: job from ssync_sender :param node: node that we're rebuilding to :param datafile_metadata: the datafile metadata to attach to the rebuilt fragment archive :returns: a DiskFile like class for use by ssync :raises DiskFileError: if the fragment archive cannot be reconstructed """ part_nodes = job['policy'].object_ring.get_part_nodes(job['partition']) part_nodes.remove(node) # the fragment index we need to reconstruct is the position index # of the node we're rebuilding to within the primary part list fi_to_rebuild = node['index'] # KISS send out connection requests to all nodes, see what sticks headers = self.headers.copy() headers['X-Backend-Storage-Policy-Index'] = int(job['policy']) pile = GreenAsyncPile(len(part_nodes)) path = datafile_metadata['name'] for node in part_nodes: pile.spawn(self._get_response, node, job['partition'], path, headers, job['policy']) responses = [] etag = None for resp in pile: if not resp: continue resp.headers = HeaderKeyDict(resp.getheaders()) if str(fi_to_rebuild) == \ resp.headers.get('X-Object-Sysmeta-Ec-Frag-Index'): continue if resp.headers.get('X-Object-Sysmeta-Ec-Frag-Index') in set( r.headers.get('X-Object-Sysmeta-Ec-Frag-Index') for r in responses): continue responses.append(resp) etag = sorted( responses, reverse=True, key=lambda r: Timestamp(r.headers.get('X-Backend-Timestamp')) )[0].headers.get('X-Object-Sysmeta-Ec-Etag') responses = [ r for r in responses if r.headers.get('X-Object-Sysmeta-Ec-Etag') == etag ] if len(responses) >= job['policy'].ec_ndata: break else: self.logger.error('Unable to get enough responses (%s/%s) ' 'to reconstruct %s with ETag %s' % (len(responses), job['policy'].ec_ndata, self._full_path(node, job['partition'], datafile_metadata['name'], job['policy']), etag)) raise DiskFileError('Unable to reconstruct EC archive') rebuilt_fragment_iter = self.make_rebuilt_fragment_iter( responses[:job['policy'].ec_ndata], path, job['policy'], fi_to_rebuild) return RebuildingECDiskFileStream(datafile_metadata, fi_to_rebuild, rebuilt_fragment_iter) def _reconstruct(self, policy, fragment_payload, frag_index): return policy.pyeclib_driver.reconstruct(fragment_payload, [frag_index])[0] def make_rebuilt_fragment_iter(self, responses, path, policy, frag_index): """ Turn a set of connections from backend object servers into a generator that yields up the rebuilt fragment archive for frag_index. """ def _get_one_fragment(resp): buff = '' remaining_bytes = policy.fragment_size while remaining_bytes: chunk = resp.read(remaining_bytes) if not chunk: break remaining_bytes -= len(chunk) buff += chunk return buff def fragment_payload_iter(): # We need a fragment from each connections, so best to # use a GreenPile to keep them ordered and in sync pile = GreenPile(len(responses)) while True: for resp in responses: pile.spawn(_get_one_fragment, resp) try: with Timeout(self.node_timeout): fragment_payload = [fragment for fragment in pile] except (Exception, Timeout): self.logger.exception( _("Error trying to rebuild %(path)s " "policy#%(policy)d frag#%(frag_index)s"), { 'path': path, 'policy': policy, 'frag_index': frag_index, }) break if not all(fragment_payload): break rebuilt_fragment = self._reconstruct(policy, fragment_payload, frag_index) yield rebuilt_fragment return fragment_payload_iter() def stats_line(self): """ Logs various stats for the currently running reconstruction pass. 
""" if (self.device_count and self.part_count and self.reconstruction_device_count): elapsed = (time.time() - self.start) or 0.000001 rate = self.reconstruction_part_count / elapsed total_part_count = (self.part_count * self.device_count / self.reconstruction_device_count) self.logger.info( _("%(reconstructed)d/%(total)d (%(percentage).2f%%)" " partitions of %(device)d/%(dtotal)d " "(%(dpercentage).2f%%) devices" " reconstructed in %(time).2fs " "(%(rate).2f/sec, %(remaining)s remaining)"), { 'reconstructed': self.reconstruction_part_count, 'total': self.part_count, 'percentage': self.reconstruction_part_count * 100.0 / self.part_count, 'device': self.reconstruction_device_count, 'dtotal': self.device_count, 'dpercentage': self.reconstruction_device_count * 100.0 / self.device_count, 'time': time.time() - self.start, 'rate': rate, 'remaining': '%d%s' % compute_eta(self.start, self.reconstruction_part_count, total_part_count) }) if self.suffix_count and self.partition_times: self.logger.info( _("%(checked)d suffixes checked - " "%(hashed).2f%% hashed, %(synced).2f%% synced"), { 'checked': self.suffix_count, 'hashed': (self.suffix_hash * 100.0) / self.suffix_count, 'synced': (self.suffix_sync * 100.0) / self.suffix_count }) self.partition_times.sort() self.logger.info( _("Partition times: max %(max).4fs, " "min %(min).4fs, med %(med).4fs"), { 'max': self.partition_times[-1], 'min': self.partition_times[0], 'med': self.partition_times[len(self.partition_times) // 2] }) else: self.logger.info(_("Nothing reconstructed for %s seconds."), (time.time() - self.start)) def kill_coros(self): """Utility function that kills all coroutines currently running.""" for coro in list(self.run_pool.coroutines_running): try: coro.kill(GreenletExit) except GreenletExit: pass def heartbeat(self): """ Loop that runs in the background during reconstruction. It periodically logs progress. """ while True: sleep(self.stats_interval) self.stats_line() def detect_lockups(self): """ In testing, the pool.waitall() call very occasionally failed to return. This is an attempt to make sure the reconstructor finishes its reconstruction pass in some eventuality. """ while True: sleep(self.lockup_timeout) if self.reconstruction_count == self.last_reconstruction_count: self.logger.error(_("Lockup detected.. killing live coros.")) self.kill_coros() self.last_reconstruction_count = self.reconstruction_count def _get_hashes(self, policy, path, recalculate=None, do_listdir=False): df_mgr = self._df_router[policy] hashed, suffix_hashes = tpool_reraise(df_mgr._get_hashes, path, recalculate=recalculate, do_listdir=do_listdir, reclaim_age=self.reclaim_age) self.logger.update_stats('suffix.hashes', hashed) return suffix_hashes def get_suffix_delta(self, local_suff, local_index, remote_suff, remote_index): """ Compare the local suffix hashes with the remote suffix hashes for the given local and remote fragment indexes. Return those suffixes which should be synced. 
:param local_suff: the local suffix hashes (from _get_hashes) :param local_index: the local fragment index for the job :param remote_suff: the remote suffix hashes (from remote REPLICATE request) :param remote_index: the remote fragment index for the job :returns: a list of strings, the suffix dirs to sync """ suffixes = [] for suffix, sub_dict_local in local_suff.items(): sub_dict_remote = remote_suff.get(suffix, {}) if (sub_dict_local.get(None) != sub_dict_remote.get(None) or sub_dict_local.get(local_index) != sub_dict_remote.get(remote_index)): suffixes.append(suffix) return suffixes def rehash_remote(self, node, job, suffixes): try: with Timeout(self.http_timeout): conn = http_connect(node['replication_ip'], node['replication_port'], node['device'], job['partition'], 'REPLICATE', '/' + '-'.join(sorted(suffixes)), headers=self.headers) conn.getresponse().read() except (Exception, Timeout): self.logger.exception( _("Trying to sync suffixes with %s") % self._full_path(node, job['partition'], '', job['policy'])) def _get_suffixes_to_sync(self, job, node): """ For SYNC jobs we need to make a remote REPLICATE request to get the remote node's current suffix's hashes and then compare to our local suffix's hashes to decide which suffixes (if any) are out of sync. :param: the job dict, with the keys defined in ``_get_part_jobs`` :param node: the remote node dict :returns: a (possibly empty) list of strings, the suffixes to be synced with the remote node. """ # get hashes from the remote node remote_suffixes = None try: with Timeout(self.http_timeout): resp = http_connect(node['replication_ip'], node['replication_port'], node['device'], job['partition'], 'REPLICATE', '', headers=self.headers).getresponse() if resp.status == HTTP_INSUFFICIENT_STORAGE: self.logger.error( _('%s responded as unmounted'), self._full_path(node, job['partition'], '', job['policy'])) elif resp.status != HTTP_OK: full_path = self._full_path(node, job['partition'], '', job['policy']) self.logger.error( _("Invalid response %(resp)s from %(full_path)s"), { 'resp': resp.status, 'full_path': full_path }) else: remote_suffixes = pickle.loads(resp.read()) except (Exception, Timeout): # all exceptions are logged here so that our caller can # safely catch our exception and continue to the next node # without logging self.logger.exception( 'Unable to get remote suffix hashes ' 'from %r' % self._full_path(node, job['partition'], '', job['policy'])) if remote_suffixes is None: raise SuffixSyncError('Unable to get remote suffix hashes') suffixes = self.get_suffix_delta(job['hashes'], job['frag_index'], remote_suffixes, node['index']) # now recalculate local hashes for suffixes that don't # match so we're comparing the latest local_suff = self._get_hashes(job['policy'], job['path'], recalculate=suffixes) suffixes = self.get_suffix_delta(local_suff, job['frag_index'], remote_suffixes, node['index']) self.suffix_count += len(suffixes) return suffixes def delete_reverted_objs(self, job, objects, frag_index): """ For EC we can potentially revert only some of a partition so we'll delete reverted objects here. Note that we delete the fragment index of the file we sent to the remote node. 
:param job: the job being processed :param objects: a dict of objects to be deleted, each entry maps hash=>timestamp :param frag_index: (int) the fragment index of data files to be deleted """ df_mgr = self._df_router[job['policy']] for object_hash, timestamps in objects.items(): try: df = df_mgr.get_diskfile_from_hash(job['local_dev']['device'], job['partition'], object_hash, job['policy'], frag_index=frag_index) df.purge(timestamps['ts_data'], frag_index) except DiskFileError: self.logger.exception('Unable to purge DiskFile (%r %r %r)', object_hash, timestamps['ts_data'], frag_index) continue def process_job(self, job): """ Sync the local partition with the remote node(s) according to the parameters of the job. For primary nodes, the SYNC job type will define both left and right hand sync_to nodes to ssync with as defined by this primary nodes index in the node list based on the fragment index found in the partition. For non-primary nodes (either handoff revert, or rebalance) the REVERT job will define a single node in sync_to which is the proper/new home for the fragment index. N.B. ring rebalancing can be time consuming and handoff nodes' fragment indexes do not have a stable order, it's possible to have more than one REVERT job for a partition, and in some rare failure conditions there may even also be a SYNC job for the same partition - but each one will be processed separately because each job will define a separate list of node(s) to 'sync_to'. :param: the job dict, with the keys defined in ``_get_job_info`` """ self.headers['X-Backend-Storage-Policy-Index'] = int(job['policy']) begin = time.time() if job['job_type'] == REVERT: self._revert(job, begin) else: self._sync(job, begin) self.partition_times.append(time.time() - begin) self.reconstruction_count += 1 def _sync(self, job, begin): """ Process a SYNC job. """ self.logger.increment('partition.update.count.%s' % (job['local_dev']['device'], )) # after our left and right partners, if there's some sort of # failure we'll continue onto the remaining primary nodes and # make sure they're in sync - or potentially rebuild missing # fragments we find dest_nodes = itertools.chain( job['sync_to'], # I think we could order these based on our index to better # protect against a broken chain [ n for n in job['policy'].object_ring.get_part_nodes( job['partition']) if n['id'] != job['local_dev']['id'] and n['id'] not in (m['id'] for m in job['sync_to']) ], ) syncd_with = 0 for node in dest_nodes: if syncd_with >= len(job['sync_to']): # success! break try: suffixes = self._get_suffixes_to_sync(job, node) except SuffixSyncError: continue if not suffixes: syncd_with += 1 continue # ssync any out-of-sync suffixes with the remote node success, _ = ssync_sender(self, node, job, suffixes)() # let remote end know to rehash it's suffixes self.rehash_remote(node, job, suffixes) # update stats for this attempt self.suffix_sync += len(suffixes) self.logger.update_stats('suffix.syncs', len(suffixes)) if success: syncd_with += 1 self.logger.timing_since('partition.update.timing', begin) def _revert(self, job, begin): """ Process a REVERT job. """ self.logger.increment('partition.delete.count.%s' % (job['local_dev']['device'], )) # we'd desperately like to push this partition back to it's # primary location, but if that node is down, the next best thing # is one of the handoff locations - which *might* be us already! 
dest_nodes = itertools.chain( job['sync_to'], job['policy'].object_ring.get_more_nodes(job['partition']), ) syncd_with = 0 reverted_objs = {} for node in dest_nodes: if syncd_with >= len(job['sync_to']): break if node['id'] == job['local_dev']['id']: # this is as good a place as any for this data for now break success, in_sync_objs = ssync_sender(self, node, job, job['suffixes'])() self.rehash_remote(node, job, job['suffixes']) if success: syncd_with += 1 reverted_objs.update(in_sync_objs) if syncd_with >= len(job['sync_to']): self.delete_reverted_objs(job, reverted_objs, job['frag_index']) self.logger.timing_since('partition.delete.timing', begin) def _get_part_jobs(self, local_dev, part_path, partition, policy): """ Helper function to build jobs for a partition, this method will read the suffix hashes and create job dictionaries to describe the needed work. There will be one job for each fragment index discovered in the partition. For a fragment index which corresponds to this node's ring index, a job with job_type SYNC will be created to ensure that the left and right hand primary ring nodes for the part have the corresponding left and right hand fragment archives. A fragment index (or entire partition) for which this node is not the primary corresponding node, will create job(s) with job_type REVERT to ensure that fragment archives are pushed to the correct node and removed from this one. A partition may result in multiple jobs. Potentially many REVERT jobs, and zero or one SYNC job. :param local_dev: the local device :param part_path: full path to partition :param partition: partition number :param policy: the policy :returns: a list of dicts of job info """ # find all the fi's in the part, and which suffixes have them hashes = self._get_hashes(policy, part_path, do_listdir=True) non_data_fragment_suffixes = [] data_fi_to_suffixes = defaultdict(list) for suffix, fi_hash in hashes.items(): if not fi_hash: # this is for sanity and clarity, normally an empty # suffix would get del'd from the hashes dict, but an # OSError trying to re-hash the suffix could leave the # value empty - it will log the exception; but there's # no way to properly address this suffix at this time. 
continue data_frag_indexes = [f for f in fi_hash if f is not None] if not data_frag_indexes: non_data_fragment_suffixes.append(suffix) else: for fi in data_frag_indexes: data_fi_to_suffixes[fi].append(suffix) # helper to ensure consistent structure of jobs def build_job(job_type, frag_index, suffixes, sync_to): return { 'job_type': job_type, 'frag_index': frag_index, 'suffixes': suffixes, 'sync_to': sync_to, 'partition': partition, 'path': part_path, 'hashes': hashes, 'policy': policy, 'local_dev': local_dev, # ssync likes to have it handy 'device': local_dev['device'], } # aggregate jobs for all the fragment index in this part jobs = [] # check the primary nodes - to see if the part belongs here part_nodes = policy.object_ring.get_part_nodes(partition) for node in part_nodes: if node['id'] == local_dev['id']: # this partition belongs here, we'll need a sync job frag_index = node['index'] try: suffixes = data_fi_to_suffixes.pop(frag_index) except KeyError: suffixes = [] sync_job = build_job( job_type=SYNC, frag_index=frag_index, suffixes=suffixes, sync_to=_get_partners(frag_index, part_nodes), ) # ssync callback to rebuild missing fragment_archives sync_job['sync_diskfile_builder'] = self.reconstruct_fa jobs.append(sync_job) break # assign remaining data fragment suffixes to revert jobs ordered_fis = sorted((len(suffixes), fi) for fi, suffixes in data_fi_to_suffixes.items()) for count, fi in ordered_fis: revert_job = build_job( job_type=REVERT, frag_index=fi, suffixes=data_fi_to_suffixes[fi], sync_to=[part_nodes[fi]], ) jobs.append(revert_job) # now we need to assign suffixes that have no data fragments if non_data_fragment_suffixes: if jobs: # the first job will be either the sync_job, or the # revert_job for the fragment index that is most common # among the suffixes jobs[0]['suffixes'].extend(non_data_fragment_suffixes) else: # this is an unfortunate situation, we need a revert job to # push partitions off this node, but none of the suffixes # have any data fragments to hint at which node would be a # good candidate to receive the tombstones. 
jobs.append( build_job( job_type=REVERT, frag_index=None, suffixes=non_data_fragment_suffixes, # this is super safe sync_to=part_nodes, # something like this would be probably be better # sync_to=random.sample(part_nodes, 3), )) # return a list of jobs for this part return jobs def collect_parts(self, override_devices=None, override_partitions=None): """ Helper for yielding partitions in the top level reconstructor """ override_devices = override_devices or [] override_partitions = override_partitions or [] ips = whataremyips(self.bind_ip) for policy in POLICIES: if policy.policy_type != EC_POLICY: continue self._diskfile_mgr = self._df_router[policy] self.load_object_ring(policy) data_dir = get_data_dir(policy) local_devices = list( six.moves.filter( lambda dev: dev and is_local_device( ips, self.port, dev['replication_ip'], dev[ 'replication_port']), policy.object_ring.devs)) if override_devices: self.device_count = len(override_devices) else: self.device_count = len(local_devices) for local_dev in local_devices: if override_devices and (local_dev['device'] not in override_devices): continue self.reconstruction_device_count += 1 dev_path = self._df_router[policy].get_dev_path( local_dev['device']) if not dev_path: self.logger.warning(_('%s is not mounted'), local_dev['device']) continue obj_path = join(dev_path, data_dir) tmp_path = join(dev_path, get_tmp_dir(int(policy))) unlink_older_than(tmp_path, time.time() - self.reclaim_age) if not os.path.exists(obj_path): try: mkdirs(obj_path) except Exception: self.logger.exception('Unable to create %s' % obj_path) continue try: partitions = os.listdir(obj_path) except OSError: self.logger.exception('Unable to list partitions in %r' % obj_path) continue self.part_count += len(partitions) for partition in partitions: part_path = join(obj_path, partition) if not (partition.isdigit() and os.path.isdir(part_path)): self.logger.warning( 'Unexpected entity in data dir: %r' % part_path) remove_file(part_path) self.reconstruction_part_count += 1 continue partition = int(partition) if override_partitions and (partition not in override_partitions): continue part_info = { 'local_dev': local_dev, 'policy': policy, 'partition': partition, 'part_path': part_path, } yield part_info self.reconstruction_part_count += 1 def build_reconstruction_jobs(self, part_info): """ Helper function for collect_jobs to build jobs for reconstruction using EC style storage policy """ jobs = self._get_part_jobs(**part_info) random.shuffle(jobs) if self.handoffs_first: # Move the handoff revert jobs to the front of the list jobs.sort(key=lambda job: job['job_type'], reverse=True) self.job_count += len(jobs) return jobs def _reset_stats(self): self.start = time.time() self.job_count = 0 self.part_count = 0 self.device_count = 0 self.suffix_count = 0 self.suffix_sync = 0 self.suffix_hash = 0 self.reconstruction_count = 0 self.reconstruction_part_count = 0 self.reconstruction_device_count = 0 self.last_reconstruction_count = -1 def delete_partition(self, path): self.logger.info(_("Removing partition: %s"), path) tpool.execute(shutil.rmtree, path, ignore_errors=True) def reconstruct(self, **kwargs): """Run a reconstruction pass""" self._reset_stats() self.partition_times = [] stats = spawn(self.heartbeat) lockup_detector = spawn(self.detect_lockups) sleep() # Give spawns a cycle try: self.run_pool = GreenPool(size=self.concurrency) for part_info in self.collect_parts(**kwargs): if not self.check_ring(part_info['policy'].object_ring): self.logger.info( _("Ring change detected. 
Aborting " "current reconstruction pass.")) return jobs = self.build_reconstruction_jobs(part_info) if not jobs: # If this part belongs on this node, _get_part_jobs # will *always* build a sync_job - even if there's # no suffixes in the partition that needs to sync. # If there's any suffixes in the partition then our # job list would have *at least* one revert job. # Therefore we know this part a) doesn't belong on # this node and b) doesn't have any suffixes in it. self.run_pool.spawn(self.delete_partition, part_info['part_path']) for job in jobs: self.run_pool.spawn(self.process_job, job) with Timeout(self.lockup_timeout): self.run_pool.waitall() except (Exception, Timeout): self.logger.exception( _("Exception in top-level" "reconstruction loop")) self.kill_coros() finally: stats.kill() lockup_detector.kill() self.stats_line() def run_once(self, *args, **kwargs): start = time.time() self.logger.info(_("Running object reconstructor in script mode.")) override_devices = list_from_csv(kwargs.get('devices')) override_partitions = [ int(p) for p in list_from_csv(kwargs.get('partitions')) ] self.reconstruct(override_devices=override_devices, override_partitions=override_partitions) total = (time.time() - start) / 60 self.logger.info( _("Object reconstruction complete (once). (%.02f minutes)"), total) if not (override_partitions or override_devices): dump_recon_cache( { 'object_reconstruction_time': total, 'object_reconstruction_last': time.time() }, self.rcache, self.logger) def run_forever(self, *args, **kwargs): self.logger.info(_("Starting object reconstructor in daemon mode.")) # Run the reconstructor continually while True: start = time.time() self.logger.info(_("Starting object reconstruction pass.")) # Run the reconstructor self.reconstruct() total = (time.time() - start) / 60 self.logger.info( _("Object reconstruction complete. (%.02f minutes)"), total) dump_recon_cache( { 'object_reconstruction_time': total, 'object_reconstruction_last': time.time() }, self.rcache, self.logger) self.logger.debug('reconstruction sleeping for %s seconds.', self.interval) sleep(self.interval)
class ObjectReplicator(Daemon): """ Replicate objects. Encapsulates most logic and data needed by the object replication process. Each call to .replicate() performs one replication pass. It's up to the caller to do this in a loop. """ def __init__(self, conf): """ :param conf: configuration object obtained from ConfigParser :param logger: logging object """ self.conf = conf self.logger = get_logger(conf, log_route='object-replicator') self.devices_dir = conf.get('devices', '/srv/node') self.mount_check = conf.get('mount_check', 'true').lower() in \ ('true', 't', '1', 'on', 'yes', 'y') self.vm_test_mode = conf.get('vm_test_mode', 'no').lower() in ('yes', 'true', 'on', '1') self.chase_dir = conf.get('chase_dir', '/etc/chase') self.port = int(conf.get('bind_port', 6000)) self.concurrency = int(conf.get('concurrency', 1)) self.stats_interval = int(conf.get('stats_interval', '300')) self.object_ring = Ring(join(self.chase_dir, 'object.ring.gz')) self.ring_check_interval = int(conf.get('ring_check_interval', 15)) self.next_check = time.time() + self.ring_check_interval self.reclaim_age = int(conf.get('reclaim_age', 86400 * 7)) self.partition_times = [] self.run_pause = int(conf.get('run_pause', 30)) self.rsync_timeout = int(conf.get('rsync_timeout', 900)) self.rsync_io_timeout = conf.get('rsync_io_timeout', '30') self.http_timeout = int(conf.get('http_timeout', 60)) self.lockup_timeout = int(conf.get('lockup_timeout', 1800)) self.recon_enable = conf.get('recon_enable', 'no').lower() in TRUE_VALUES self.recon_cache_path = conf.get('recon_cache_path', '/var/cache/chase') self.recon_object = os.path.join(self.recon_cache_path, "object.recon") def _rsync(self, args): """ Execute the rsync binary to replicate a partition. :returns: return code of rsync process. 0 is successful """ start_time = time.time() ret_val = None try: with Timeout(self.rsync_timeout): proc = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) results = proc.stdout.read() ret_val = proc.wait() except Timeout: self.logger.error(_("Killing long-running rsync: %s"), str(args)) proc.kill() return 1 # failure response code total_time = time.time() - start_time for result in results.split('\n'): if result == '': continue if result.startswith('cd+'): continue if not ret_val: self.logger.info(result) else: self.logger.error(result) if ret_val: self.logger.error(_('Bad rsync return code: %(args)s -> %(ret)d'), { 'args': str(args), 'ret': ret_val }) elif results: self.logger.info( _("Successful rsync of %(src)s at %(dst)s (%(time).03f)"), { 'src': args[-2], 'dst': args[-1], 'time': total_time }) else: self.logger.debug( _("Successful rsync of %(src)s at %(dst)s (%(time).03f)"), { 'src': args[-2], 'dst': args[-1], 'time': total_time }) return ret_val def rsync(self, node, job, suffixes): """ Synchronize local suffix directories from a partition with a remote node. 
:param node: the "dev" entry for the remote node to sync with :param job: information about the partition being synced :param suffixes: a list of suffixes which need to be pushed :returns: boolean indicating success or failure """ if not os.path.exists(job['path']): return False args = [ 'rsync', '--recursive', '--whole-file', '--human-readable', '--xattrs', '--itemize-changes', '--ignore-existing', '--timeout=%s' % self.rsync_io_timeout, '--contimeout=%s' % self.rsync_io_timeout, ] if self.vm_test_mode: rsync_module = '[%s]::object%s' % (node['ip'], node['port']) else: rsync_module = '[%s]::object' % node['ip'] had_any = False for suffix in suffixes: spath = join(job['path'], suffix) if os.path.exists(spath): args.append(spath) had_any = True if not had_any: return False args.append( join(rsync_module, node['device'], 'objects', job['partition'])) return self._rsync(args) == 0 def check_ring(self): """ Check to see if the ring has been updated :returns: boolean indicating whether or not the ring has changed """ if time.time() > self.next_check: self.next_check = time.time() + self.ring_check_interval if self.object_ring.has_changed(): return False return True def update_deleted(self, job): """ High-level method that replicates a single partition that doesn't belong on this node. :param job: a dict containing info about the partition to be replicated """ def tpool_get_suffixes(path): return [ suff for suff in os.listdir(path) if len(suff) == 3 and isdir(join(path, suff)) ] self.replication_count += 1 begin = time.time() try: responses = [] suffixes = tpool.execute(tpool_get_suffixes, job['path']) if suffixes: for node in job['nodes']: success = self.rsync(node, job, suffixes) if success: with Timeout(self.http_timeout): http_connect(node['ip'], node['port'], node['device'], job['partition'], 'REPLICATE', '/' + '-'.join(suffixes), headers={ 'Content-Length': '0' }).getresponse().read() responses.append(success) if not suffixes or (len(responses) == \ self.object_ring.replica_count and all(responses)): self.logger.info(_("Removing partition: %s"), job['path']) tpool.execute(shutil.rmtree, job['path'], ignore_errors=True) except (Exception, Timeout): self.logger.exception(_("Error syncing handoff partition")) finally: self.partition_times.append(time.time() - begin) def update(self, job): """ High-level method that replicates a single partition. :param job: a dict containing info about the partition to be replicated """ self.replication_count += 1 begin = time.time() try: hashed, local_hash = tpool.execute( tpooled_get_hashes, job['path'], do_listdir=(self.replication_count % 10) == 0, reclaim_age=self.reclaim_age) # See tpooled_get_hashes "Hack". 
if isinstance(hashed, BaseException): raise hashed self.suffix_hash += hashed attempts_left = self.object_ring.replica_count - 1 nodes = itertools.chain( job['nodes'], self.object_ring.get_more_nodes(int(job['partition']))) while attempts_left > 0: # If this throws StopIterator it will be caught way below node = next(nodes) attempts_left -= 1 try: with Timeout(self.http_timeout): resp = http_connect(node['ip'], node['port'], node['device'], job['partition'], 'REPLICATE', '', headers={ 'Content-Length': '0' }).getresponse() if resp.status == 507: self.logger.error( _('%(ip)s/%(device)s responded' ' as unmounted'), node) attempts_left += 1 continue if resp.status != 200: self.logger.error( _("Invalid response %(resp)s " "from %(ip)s"), { 'resp': resp.status, 'ip': node['ip'] }) continue remote_hash = pickle.loads(resp.read()) del resp suffixes = [ suffix for suffix in local_hash if local_hash[suffix] != remote_hash.get(suffix, -1) ] if not suffixes: continue hashed, recalc_hash = tpool.execute( tpooled_get_hashes, job['path'], recalculate=suffixes, reclaim_age=self.reclaim_age) # See tpooled_get_hashes "Hack". if isinstance(hashed, BaseException): raise hashed local_hash = recalc_hash suffixes = [ suffix for suffix in local_hash if local_hash[suffix] != remote_hash.get(suffix, -1) ] self.rsync(node, job, suffixes) with Timeout(self.http_timeout): conn = http_connect(node['ip'], node['port'], node['device'], job['partition'], 'REPLICATE', '/' + '-'.join(suffixes), headers={'Content-Length': '0'}) conn.getresponse().read() self.suffix_sync += len(suffixes) except (Exception, Timeout): self.logger.exception( _("Error syncing with node: %s") % node) self.suffix_count += len(local_hash) except (Exception, Timeout): self.logger.exception(_("Error syncing partition")) finally: self.partition_times.append(time.time() - begin) def stats_line(self): """ Logs various stats for the currently running replication pass. """ if self.replication_count: rate = self.replication_count / (time.time() - self.start) self.logger.info( _("%(replicated)d/%(total)d (%(percentage).2f%%)" " partitions replicated in %(time).2fs (%(rate).2f/sec, " "%(remaining)s remaining)"), { 'replicated': self.replication_count, 'total': self.job_count, 'percentage': self.replication_count * 100.0 / self.job_count, 'time': time.time() - self.start, 'rate': rate, 'remaining': '%d%s' % compute_eta(self.start, self.replication_count, self.job_count) }) if self.suffix_count: self.logger.info( _("%(checked)d suffixes checked - " "%(hashed).2f%% hashed, %(synced).2f%% synced"), { 'checked': self.suffix_count, 'hashed': (self.suffix_hash * 100.0) / self.suffix_count, 'synced': (self.suffix_sync * 100.0) / self.suffix_count }) self.partition_times.sort() self.logger.info( _("Partition times: max %(max).4fs, " "min %(min).4fs, med %(med).4fs"), { 'max': self.partition_times[-1], 'min': self.partition_times[0], 'med': self.partition_times[len(self.partition_times) // 2] }) else: self.logger.info(_("Nothing replicated for %s seconds."), (time.time() - self.start)) def kill_coros(self): """Utility function that kills all coroutines currently running.""" for coro in list(self.run_pool.coroutines_running): try: coro.kill(GreenletExit) except GreenletExit: pass def heartbeat(self): """ Loop that runs in the background during replication. It periodically logs progress. """ while True: eventlet.sleep(self.stats_interval) self.stats_line() def detect_lockups(self): """ In testing, the pool.waitall() call very occasionally failed to return. 
This is an attempt to make sure the replicator finishes its replication pass in some eventuality. """ while True: eventlet.sleep(self.lockup_timeout) if self.replication_count == self.last_replication_count: self.logger.error(_("Lockup detected.. killing live coros.")) self.kill_coros() self.last_replication_count = self.replication_count def collect_jobs(self): """ Returns a sorted list of jobs (dictionaries) that specify the partitions, nodes, etc to be rsynced. """ jobs = [] ips = whataremyips() for local_dev in [ dev for dev in self.object_ring.devs if dev and dev['ip'] in ips and dev['port'] == self.port ]: dev_path = join(self.devices_dir, local_dev['device']) obj_path = join(dev_path, 'objects') tmp_path = join(dev_path, 'tmp') if self.mount_check and not os.path.ismount(dev_path): self.logger.warn(_('%s is not mounted'), local_dev['device']) continue unlink_older_than(tmp_path, time.time() - self.reclaim_age) if not os.path.exists(obj_path): continue for partition in os.listdir(obj_path): try: nodes = [ node for node in self.object_ring.get_part_nodes( int(partition)) if node['id'] != local_dev['id'] ] jobs.append( dict(path=join(obj_path, partition), nodes=nodes, delete=len(nodes) > self.object_ring.replica_count - 1, partition=partition)) except ValueError: continue random.shuffle(jobs) # Partititons that need to be deleted take priority jobs.sort(key=lambda job: not job['delete']) self.job_count = len(jobs) return jobs def replicate(self): """Run a replication pass""" self.start = time.time() self.suffix_count = 0 self.suffix_sync = 0 self.suffix_hash = 0 self.replication_count = 0 self.last_replication_count = -1 self.partition_times = [] stats = eventlet.spawn(self.heartbeat) lockup_detector = eventlet.spawn(self.detect_lockups) eventlet.sleep() # Give spawns a cycle try: self.run_pool = GreenPool(size=self.concurrency) jobs = self.collect_jobs() for job in jobs: if not self.check_ring(): self.logger.info( _("Ring change detected. Aborting " "current replication pass.")) return if job['delete']: self.run_pool.spawn(self.update_deleted, job) else: self.run_pool.spawn(self.update, job) with Timeout(self.lockup_timeout): self.run_pool.waitall() except (Exception, Timeout): self.logger.exception(_("Exception in top-level replication loop")) self.kill_coros() finally: stats.kill() lockup_detector.kill() self.stats_line() def run_once(self, *args, **kwargs): start = time.time() self.logger.info(_("Running object replicator in script mode.")) self.replicate() total = (time.time() - start) / 60 self.logger.info(_("Object replication complete. (%.02f minutes)"), total) if self.recon_enable: try: dump_recon_cache('object_replication_time', total, \ self.recon_object) except (Exception, Timeout): self.logger.exception(_('Exception dumping recon cache')) def run_forever(self, *args, **kwargs): self.logger.info(_("Starting object replicator in daemon mode.")) # Run the replicator continually while True: start = time.time() self.logger.info(_("Starting object replication pass.")) # Run the replicator self.replicate() total = (time.time() - start) / 60 self.logger.info(_("Object replication complete. (%.02f minutes)"), total) if self.recon_enable: try: dump_recon_cache('object_replication_time', total, \ self.recon_object) except (Exception, Timeout): self.logger.exception(_('Exception dumping recon cache')) self.logger.debug(_('Replication sleeping for %s seconds.'), self.run_pause) sleep(self.run_pause)
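For reference, a hedged sketch of the command line the legacy replicator's rsync() method assembles before handing it to _rsync(): fixed flags first, then the local suffix directories, then the remote rsync module target. All values below are hypothetical, the per-suffix os.path.exists() check is omitted, and only the non-vm_test_mode module form is shown.

from os.path import join

node = {'ip': '10.0.0.2', 'device': 'sdb'}                        # hypothetical remote node
job = {'path': '/srv/node/sda/objects/1234', 'partition': '1234'}  # hypothetical local job
suffixes = ['abc', 'def']
rsync_io_timeout = '30'

args = [
    'rsync', '--recursive', '--whole-file', '--human-readable', '--xattrs',
    '--itemize-changes', '--ignore-existing',
    '--timeout=%s' % rsync_io_timeout,
    '--contimeout=%s' % rsync_io_timeout,
]
args.extend(join(job['path'], suffix) for suffix in suffixes)  # existence check omitted
rsync_module = '[%s]::object' % node['ip']                     # non-vm_test_mode form
args.append(join(rsync_module, node['device'], 'objects', job['partition']))

print(' '.join(args))
# ends with: /srv/node/sda/objects/1234/abc /srv/node/sda/objects/1234/def
#            [10.0.0.2]::object/sdb/objects/1234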
class ObjectReplicator(Daemon): """ Replicate objects. Encapsulates most logic and data needed by the object replication process. Each call to .replicate() performs one replication pass. It's up to the caller to do this in a loop. """ def __init__(self, conf, logger=None): """ :param conf: configuration object obtained from ConfigParser :param logger: logging object """ self.conf = conf self.logger = PrefixLoggerAdapter( logger or get_logger(conf, log_route='object-replicator'), {}) self.devices_dir = conf.get('devices', '/srv/node') self.mount_check = config_true_value(conf.get('mount_check', 'true')) self.swift_dir = conf.get('swift_dir', '/etc/swift') self.bind_ip = conf.get('bind_ip', '0.0.0.0') self.servers_per_port = int(conf.get('servers_per_port', '0') or 0) self.port = None if self.servers_per_port else \ int(conf.get('bind_port', 6200)) self.concurrency = int(conf.get('concurrency', 1)) self.replicator_workers = int(conf.get('replicator_workers', 0)) self.stats_interval = int(conf.get('stats_interval', '300')) self.ring_check_interval = int(conf.get('ring_check_interval', 15)) self.next_check = time.time() + self.ring_check_interval self.replication_cycle = random.randint(0, 9) self.partition_times = [] self.interval = int( conf.get('interval') or conf.get('run_pause') or 30) self.rsync_timeout = int( conf.get('rsync_timeout', DEFAULT_RSYNC_TIMEOUT)) self.rsync_io_timeout = conf.get('rsync_io_timeout', '30') self.rsync_bwlimit = conf.get('rsync_bwlimit', '0') self.rsync_compress = config_true_value( conf.get('rsync_compress', 'no')) self.rsync_module = conf.get('rsync_module', '').rstrip('/') if not self.rsync_module: self.rsync_module = '{replication_ip}::object' self.http_timeout = int(conf.get('http_timeout', 60)) self.recon_cache_path = conf.get('recon_cache_path', '/var/cache/swift') self.rcache = os.path.join(self.recon_cache_path, "object.recon") self._next_rcache_update = time.time() + self.stats_interval self.conn_timeout = float(conf.get('conn_timeout', 0.5)) self.node_timeout = float(conf.get('node_timeout', 10)) self.sync_method = getattr(self, conf.get('sync_method') or 'rsync') self.network_chunk_size = int(conf.get('network_chunk_size', 65536)) self.default_headers = { 'Content-Length': '0', 'user-agent': 'object-replicator %s' % os.getpid() } self.rsync_error_log_line_length = \ int(conf.get('rsync_error_log_line_length', 0)) self.handoffs_first = config_true_value( conf.get('handoffs_first', False)) self.handoff_delete = config_auto_int_value( conf.get('handoff_delete', 'auto'), 0) if any((self.handoff_delete, self.handoffs_first)): self.logger.warning('Handoff only mode is not intended for normal ' 'operation, please disable handoffs_first and ' 'handoff_delete before the next ' 'normal rebalance') self.is_multiprocess_worker = None self._df_router = DiskFileRouter(conf, self.logger) self._child_process_reaper_queue = queue.LightQueue() def _zero_stats(self): self.stats_for_dev = defaultdict(Stats) @property def total_stats(self): return sum(self.stats_for_dev.values(), Stats()) def _emplace_log_prefix(self, worker_index): self.logger.set_prefix("[worker %d/%d pid=%d] " % ( worker_index + 1, # use 1-based indexing for more readable logs self.replicator_workers, os.getpid())) def _get_my_replication_ips(self): my_replication_ips = set() ips = whataremyips() for policy in POLICIES: self.load_object_ring(policy) for local_dev in [ dev for dev in policy.object_ring.devs if dev and dev['replication_ip'] in ips and dev['replication_port'] == self.port ]: 
my_replication_ips.add(local_dev['replication_ip']) return list(my_replication_ips) def _child_process_reaper(self): """ Consume processes from self._child_process_reaper_queue and wait() for them """ procs = set() done = False while not done: timeout = 60 if procs else None try: new_proc = self._child_process_reaper_queue.get( timeout=timeout) if new_proc is not None: procs.add(new_proc) else: done = True except queue.Empty: pass reaped_procs = set() for proc in procs: try: # this will reap the process if it has exited, but # otherwise will not wait proc.wait(timeout=0) reaped_procs.add(proc) except subprocess.TimeoutExpired: pass procs -= reaped_procs def get_worker_args(self, once=False, **kwargs): if self.replicator_workers < 1: return [] override_opts = parse_override_options(once=once, **kwargs) have_overrides = bool(override_opts.devices or override_opts.partitions or override_opts.policies) # save this off for ring-change detection later in is_healthy() self.all_local_devices = self.get_local_devices() if override_opts.devices: devices_to_replicate = [ d for d in override_opts.devices if d in self.all_local_devices ] else: # The sort isn't strictly necessary since we're just trying to # spread devices around evenly, but it makes testing easier. devices_to_replicate = sorted(self.all_local_devices) # Distribute devices among workers as evenly as possible self.replicator_workers = min(self.replicator_workers, len(devices_to_replicate)) return [{ 'override_devices': devs, 'override_partitions': override_opts.partitions, 'override_policies': override_opts.policies, 'have_overrides': have_overrides, 'multiprocess_worker_index': index } for index, devs in enumerate( distribute_evenly(devices_to_replicate, self.replicator_workers))] def is_healthy(self): """ Check whether our set of local devices remains the same. If devices have been added or removed, then we return False here so that we can kill off any worker processes and then distribute the new set of local devices across a new set of workers so that all devices are, once again, being worked on. This function may also cause recon stats to be updated. :returns: False if any local devices have been added or removed, True otherwise """ # We update recon here because this is the only function we have in # a multiprocess replicator that gets called periodically in the # parent process. if time.time() >= self._next_rcache_update: update = self.aggregate_recon_update() dump_recon_cache(update, self.rcache, self.logger) return self.get_local_devices() == self.all_local_devices def get_local_devices(self): """ Returns a set of all local devices in all replication-type storage policies. This is the device names, e.g. "sdq" or "d1234" or something, not the full ring entries. """ ips = whataremyips(self.bind_ip) local_devices = set() for policy in POLICIES: if policy.policy_type != REPL_POLICY: continue self.load_object_ring(policy) for device in policy.object_ring.devs: if device and is_local_device(ips, self.port, device['replication_ip'], device['replication_port']): local_devices.add(device['device']) return local_devices # Just exists for doc anchor point def sync(self, node, job, suffixes, *args, **kwargs): """ Synchronize local suffix directories from a partition with a remote node. 
:param node: the "dev" entry for the remote node to sync with :param job: information about the partition being synced :param suffixes: a list of suffixes which need to be pushed :returns: boolean and dictionary, boolean indicating success or failure """ return self.sync_method(node, job, suffixes, *args, **kwargs) def load_object_ring(self, policy): """ Make sure the policy's rings are loaded. :param policy: the StoragePolicy instance :returns: appropriate ring object """ policy.load_ring(self.swift_dir) return policy.object_ring def _limit_rsync_log(self, line): """ If rsync_error_log_line_length is defined then limit the error to that length :param line: rsync log line :return: If enabled the line limited to rsync_error_log_line_length otherwise the initial line. """ if self.rsync_error_log_line_length: return line[:self.rsync_error_log_line_length] return line def _rsync(self, args): """ Execute the rsync binary to replicate a partition. :returns: return code of rsync process. 0 is successful """ start_time = time.time() proc = None try: with Timeout(self.rsync_timeout): proc = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) results = proc.stdout.read() ret_val = proc.wait() except Timeout: self.logger.error( self._limit_rsync_log( _("Killing long-running rsync: %s") % str(args))) if proc: proc.kill() try: # Note: Python 2.7's subprocess.Popen class doesn't take # any arguments for wait(), but Python 3's does. # However, Eventlet's replacement Popen takes a timeout # argument regardless of Python version, so we don't # need any conditional code here. proc.wait(timeout=1.0) except subprocess.TimeoutExpired: # Sometimes a process won't die immediately even after a # SIGKILL. This can be due to failing disks, high load, # or other reasons. We can't wait for it forever since # we're taking up a slot in the (green)thread pool, so # we send it over to another greenthread, not part of # our pool, whose sole duty is to wait for child # processes to exit. self._child_process_reaper_queue.put(proc) return 1 # failure response code total_time = time.time() - start_time for result in results.split('\n'): if result == '': continue if result.startswith('cd+'): continue if not ret_val: self.logger.info(result) else: self.logger.error(result) if ret_val: self.logger.error( self._limit_rsync_log( _('Bad rsync return code: %(ret)d <- %(args)s') % { 'args': str(args), 'ret': ret_val })) else: log_method = self.logger.info if results else self.logger.debug log_method( _("Successful rsync of %(src)s at %(dst)s (%(time).03f)"), { 'src': args[-2], 'dst': args[-1], 'time': total_time }) return ret_val def rsync(self, node, job, suffixes): """ Uses rsync to implement the sync method. This was the first sync method in Swift. """ if not os.path.exists(job['path']): return False, {} args = [ 'rsync', '--recursive', '--whole-file', '--human-readable', '--xattrs', '--itemize-changes', '--ignore-existing', '--timeout=%s' % self.rsync_io_timeout, '--contimeout=%s' % self.rsync_io_timeout, '--bwlimit=%s' % self.rsync_bwlimit, '--exclude=.*.%s' % ''.join('[0-9a-zA-Z]' for i in range(6)) ] if self.rsync_compress and \ job['region'] != node['region']: # Allow for compression, but only if the remote node is in # a different region than the local one. 
args.append('--compress') rsync_module = rsync_module_interpolation(self.rsync_module, node) had_any = False for suffix in suffixes: spath = join(job['path'], suffix) if os.path.exists(spath): args.append(spath) had_any = True if not had_any: return False, {} data_dir = get_data_dir(job['policy']) args.append( join(rsync_module, node['device'], data_dir, job['partition'])) return self._rsync(args) == 0, {} def ssync(self, node, job, suffixes, remote_check_objs=None): return ssync_sender.Sender(self, node, job, suffixes, remote_check_objs)() def check_ring(self, object_ring): """ Check to see if the ring has been updated :param object_ring: the ring to check :returns: boolean indicating whether or not the ring has changed """ if time.time() > self.next_check: self.next_check = time.time() + self.ring_check_interval if object_ring.has_changed(): return False return True def update_deleted(self, job): """ High-level method that replicates a single partition that doesn't belong on this node. :param job: a dict containing info about the partition to be replicated """ def tpool_get_suffixes(path): return [ suff for suff in os.listdir(path) if len(suff) == 3 and isdir(join(path, suff)) ] stats = self.stats_for_dev[job['device']] stats.attempted += 1 self.logger.increment('partition.delete.count.%s' % (job['device'], )) headers = dict(self.default_headers) headers['X-Backend-Storage-Policy-Index'] = int(job['policy']) failure_devs_info = set() begin = time.time() handoff_partition_deleted = False try: responses = [] suffixes = tpool.execute(tpool_get_suffixes, job['path']) synced_remote_regions = {} delete_objs = None if suffixes: for node in job['nodes']: stats.rsync += 1 kwargs = {} if node['region'] in synced_remote_regions and \ self.conf.get('sync_method', 'rsync') == 'ssync': kwargs['remote_check_objs'] = \ synced_remote_regions[node['region']] # candidates is a dict(hash=>timestamp) of objects # for deletion success, candidates = self.sync(node, job, suffixes, **kwargs) if success: with Timeout(self.http_timeout): conn = http_connect(node['replication_ip'], node['replication_port'], node['device'], job['partition'], 'REPLICATE', '/' + '-'.join(suffixes), headers=headers) conn.getresponse().read() if node['region'] != job['region']: synced_remote_regions[node['region']] = viewkeys( candidates) else: failure_devs_info.add( (node['replication_ip'], node['device'])) responses.append(success) for cand_objs in synced_remote_regions.values(): if delete_objs is None: delete_objs = cand_objs else: delete_objs = delete_objs & cand_objs if self.handoff_delete: # delete handoff if we have had handoff_delete successes delete_handoff = len([resp for resp in responses if resp]) >= \ self.handoff_delete else: # delete handoff if all syncs were successful delete_handoff = len(responses) == len(job['nodes']) and \ all(responses) if delete_handoff: stats.remove += 1 if (self.conf.get('sync_method', 'rsync') == 'ssync' and delete_objs is not None): self.logger.info(_("Removing %s objects"), len(delete_objs)) _junk, error_paths = self.delete_handoff_objs( job, delete_objs) # if replication works for a hand-off device and it failed, # the remote devices which are target of the replication # from the hand-off device will be marked. Because cleanup # after replication failed means replicator needs to # replicate again with the same info. 
if error_paths: failure_devs_info.update([ (failure_dev['replication_ip'], failure_dev['device']) for failure_dev in job['nodes'] ]) else: self.delete_partition(job['path']) handoff_partition_deleted = True elif not suffixes: self.delete_partition(job['path']) handoff_partition_deleted = True except (Exception, Timeout): self.logger.exception(_("Error syncing handoff partition")) stats.add_failure_stats(failure_devs_info) finally: target_devs_info = set([(target_dev['replication_ip'], target_dev['device']) for target_dev in job['nodes']]) stats.success += len(target_devs_info - failure_devs_info) if not handoff_partition_deleted: self.handoffs_remaining += 1 self.partition_times.append(time.time() - begin) self.logger.timing_since('partition.delete.timing', begin) def delete_partition(self, path): self.logger.info(_("Removing partition: %s"), path) tpool.execute(shutil.rmtree, path) def delete_handoff_objs(self, job, delete_objs): success_paths = [] error_paths = [] for object_hash in delete_objs: object_path = storage_directory(job['obj_path'], job['partition'], object_hash) tpool.execute(shutil.rmtree, object_path, ignore_errors=True) suffix_dir = dirname(object_path) try: os.rmdir(suffix_dir) success_paths.append(object_path) except OSError as e: if e.errno not in (errno.ENOENT, errno.ENOTEMPTY): error_paths.append(object_path) self.logger.exception( "Unexpected error trying to cleanup suffix dir:%r", suffix_dir) return success_paths, error_paths def update(self, job): """ High-level method that replicates a single partition. :param job: a dict containing info about the partition to be replicated """ stats = self.stats_for_dev[job['device']] stats.attempted += 1 self.logger.increment('partition.update.count.%s' % (job['device'], )) headers = dict(self.default_headers) headers['X-Backend-Storage-Policy-Index'] = int(job['policy']) target_devs_info = set() failure_devs_info = set() begin = time.time() df_mgr = self._df_router[job['policy']] try: hashed, local_hash = tpool_reraise(df_mgr._get_hashes, job['device'], job['partition'], job['policy'], do_listdir=_do_listdir( int(job['partition']), self.replication_cycle)) stats.suffix_hash += hashed self.logger.update_stats('suffix.hashes', hashed) attempts_left = len(job['nodes']) synced_remote_regions = set() random.shuffle(job['nodes']) nodes = itertools.chain( job['nodes'], job['policy'].object_ring.get_more_nodes( int(job['partition']))) while attempts_left > 0: # If this throws StopIteration it will be caught way below node = next(nodes) target_devs_info.add((node['replication_ip'], node['device'])) attempts_left -= 1 # if we have already synced to this remote region, # don't sync again on this replication pass if node['region'] in synced_remote_regions: continue try: with Timeout(self.http_timeout): resp = http_connect(node['replication_ip'], node['replication_port'], node['device'], job['partition'], 'REPLICATE', '', headers=headers).getresponse() if resp.status == HTTP_INSUFFICIENT_STORAGE: self.logger.error( _('%(replication_ip)s/%(device)s ' 'responded as unmounted'), node) attempts_left += 1 failure_devs_info.add( (node['replication_ip'], node['device'])) continue if resp.status != HTTP_OK: self.logger.error( _("Invalid response %(resp)s " "from %(ip)s"), { 'resp': resp.status, 'ip': node['replication_ip'] }) failure_devs_info.add( (node['replication_ip'], node['device'])) continue remote_hash = pickle.loads(resp.read()) del resp suffixes = [ suffix for suffix in local_hash if local_hash[suffix] != remote_hash.get(suffix, -1) ] if 
not suffixes: stats.hashmatch += 1 continue hashed, recalc_hash = tpool_reraise(df_mgr._get_hashes, job['device'], job['partition'], job['policy'], recalculate=suffixes) self.logger.update_stats('suffix.hashes', hashed) local_hash = recalc_hash suffixes = [ suffix for suffix in local_hash if local_hash[suffix] != remote_hash.get(suffix, -1) ] stats.rsync += 1 success, _junk = self.sync(node, job, suffixes) with Timeout(self.http_timeout): conn = http_connect(node['replication_ip'], node['replication_port'], node['device'], job['partition'], 'REPLICATE', '/' + '-'.join(suffixes), headers=headers) conn.getresponse().read() if not success: failure_devs_info.add( (node['replication_ip'], node['device'])) # add only remote region when replicate succeeded if success and node['region'] != job['region']: synced_remote_regions.add(node['region']) stats.suffix_sync += len(suffixes) self.logger.update_stats('suffix.syncs', len(suffixes)) except (Exception, Timeout): failure_devs_info.add( (node['replication_ip'], node['device'])) self.logger.exception( _("Error syncing with node: %s") % node) stats.suffix_count += len(local_hash) except StopIteration: self.logger.error( 'Ran out of handoffs while replicating ' 'partition %s of policy %d', job['partition'], int(job['policy'])) except (Exception, Timeout): failure_devs_info.update(target_devs_info) self.logger.exception(_("Error syncing partition")) finally: stats.add_failure_stats(failure_devs_info) stats.success += len(target_devs_info - failure_devs_info) self.partition_times.append(time.time() - begin) self.logger.timing_since('partition.update.timing', begin) def stats_line(self): """ Logs various stats for the currently running replication pass. """ stats = self.total_stats replication_count = stats.attempted if replication_count > self.last_replication_count: self.last_replication_count = replication_count elapsed = (time.time() - self.start) or 0.000001 rate = replication_count / elapsed self.logger.info( _("%(replicated)d/%(total)d (%(percentage).2f%%)" " partitions replicated in %(time).2fs (%(rate).2f/sec, " "%(remaining)s remaining)"), { 'replicated': replication_count, 'total': self.job_count, 'percentage': replication_count * 100.0 / self.job_count, 'time': time.time() - self.start, 'rate': rate, 'remaining': '%d%s' % compute_eta(self.start, replication_count, self.job_count) }) self.logger.info( _('%(success)s successes, %(failure)s failures') % dict(success=stats.success, failure=stats.failure)) if stats.suffix_count: self.logger.info( _("%(checked)d suffixes checked - " "%(hashed).2f%% hashed, %(synced).2f%% synced"), { 'checked': stats.suffix_count, 'hashed': (stats.suffix_hash * 100.0) / stats.suffix_count, 'synced': (stats.suffix_sync * 100.0) / stats.suffix_count }) self.partition_times.sort() self.logger.info( _("Partition times: max %(max).4fs, " "min %(min).4fs, med %(med).4fs"), { 'max': self.partition_times[-1], 'min': self.partition_times[0], 'med': self.partition_times[len(self.partition_times) // 2] }) else: self.logger.info(_("Nothing replicated for %s seconds."), (time.time() - self.start)) def heartbeat(self): """ Loop that runs in the background during replication. It periodically logs progress. 
""" while True: eventlet.sleep(self.stats_interval) self.stats_line() def build_replication_jobs(self, policy, ips, override_devices=None, override_partitions=None): """ Helper function for collect_jobs to build jobs for replication using replication style storage policy """ jobs = [] df_mgr = self._df_router[policy] self.all_devs_info.update([(dev['replication_ip'], dev['device']) for dev in policy.object_ring.devs if dev]) data_dir = get_data_dir(policy) found_local = False for local_dev in [ dev for dev in policy.object_ring.devs if (dev and is_local_device(ips, self.port, dev['replication_ip'], dev['replication_port']) and (override_devices is None or dev['device'] in override_devices)) ]: found_local = True dev_path = check_drive(self.devices_dir, local_dev['device'], self.mount_check) local_dev_stats = self.stats_for_dev[local_dev['device']] if not dev_path: local_dev_stats.add_failure_stats([ (failure_dev['replication_ip'], failure_dev['device']) for failure_dev in policy.object_ring.devs if failure_dev ]) self.logger.warning(_('%s is not mounted'), local_dev['device']) continue obj_path = join(dev_path, data_dir) tmp_path = join(dev_path, get_tmp_dir(policy)) unlink_older_than(tmp_path, time.time() - df_mgr.reclaim_age) if not os.path.exists(obj_path): try: mkdirs(obj_path) except Exception: self.logger.exception('ERROR creating %s' % obj_path) continue for partition in os.listdir(obj_path): if (override_partitions is not None and partition not in override_partitions): continue if (partition.startswith('auditor_status_') and partition.endswith('.json')): # ignore auditor status files continue part_nodes = None try: job_path = join(obj_path, partition) part_nodes = policy.object_ring.get_part_nodes( int(partition)) nodes = [ node for node in part_nodes if node['id'] != local_dev['id'] ] jobs.append( dict(path=job_path, device=local_dev['device'], obj_path=obj_path, nodes=nodes, delete=len(nodes) > len(part_nodes) - 1, policy=policy, partition=partition, region=local_dev['region'])) except ValueError: if part_nodes: local_dev_stats.add_failure_stats([ (failure_dev['replication_ip'], failure_dev['device']) for failure_dev in nodes ]) else: local_dev_stats.add_failure_stats([ (failure_dev['replication_ip'], failure_dev['device']) for failure_dev in policy.object_ring.devs if failure_dev ]) continue if not found_local: self.logger.error( "Can't find itself in policy with index %d with" " ips %s and with port %s in ring file, not" " replicating", int(policy), ", ".join(ips), self.port) return jobs def collect_jobs(self, override_devices=None, override_partitions=None, override_policies=None): """ Returns a sorted list of jobs (dictionaries) that specify the partitions, nodes, etc to be rsynced. :param override_devices: if set, only jobs on these devices will be returned :param override_partitions: if set, only jobs on these partitions will be returned :param override_policies: if set, only jobs in these storage policies will be returned """ jobs = [] ips = whataremyips(self.bind_ip) for policy in POLICIES: # Skip replication if next_part_power is set. In this case # every object is hard-linked twice, but the replicator can't # detect them and would create a second copy of the file if not # yet existing - and this might double the actual transferred # and stored data next_part_power = getattr(policy.object_ring, 'next_part_power', None) if next_part_power is not None: self.logger.warning( _("next_part_power set in policy '%s'. 
Skipping"), policy.name) continue if policy.policy_type == REPL_POLICY: if (override_policies is not None and policy.idx not in override_policies): continue # ensure rings are loaded for policy self.load_object_ring(policy) jobs += self.build_replication_jobs( policy, ips, override_devices=override_devices, override_partitions=override_partitions) random.shuffle(jobs) if self.handoffs_first: # Move the handoff parts to the front of the list jobs.sort(key=lambda job: not job['delete']) self.job_count = len(jobs) return jobs def replicate(self, override_devices=None, override_partitions=None, override_policies=None, start_time=None): """Run a replication pass""" if start_time is None: start_time = time.time() self.start = start_time self.last_replication_count = 0 self.replication_cycle = (self.replication_cycle + 1) % 10 self.partition_times = [] self.my_replication_ips = self._get_my_replication_ips() self.all_devs_info = set() self.handoffs_remaining = 0 stats = eventlet.spawn(self.heartbeat) eventlet.sleep() # Give spawns a cycle current_nodes = None dev_stats = None num_jobs = 0 try: self.run_pool = GreenPool(size=self.concurrency) jobs = self.collect_jobs(override_devices=override_devices, override_partitions=override_partitions, override_policies=override_policies) for job in jobs: dev_stats = self.stats_for_dev[job['device']] num_jobs += 1 current_nodes = job['nodes'] dev_path = check_drive(self.devices_dir, job['device'], self.mount_check) if not dev_path: dev_stats.add_failure_stats([ (failure_dev['replication_ip'], failure_dev['device']) for failure_dev in job['nodes'] ]) self.logger.warning(_('%s is not mounted'), job['device']) continue if self.handoffs_first and not job['delete']: # in handoffs first mode, we won't process primary # partitions until rebalance was successful! if self.handoffs_remaining: self.logger.warning( _("Handoffs first mode still has handoffs " "remaining. Aborting current " "replication pass.")) break if not self.check_ring(job['policy'].object_ring): self.logger.info( _("Ring change detected. Aborting " "current replication pass.")) return try: if isfile(job['path']): # Clean up any (probably zero-byte) files where a # partition should be. self.logger.warning( 'Removing partition directory ' 'which was a file: %s', job['path']) os.remove(job['path']) continue except OSError: continue if job['delete']: self.run_pool.spawn(self.update_deleted, job) else: self.run_pool.spawn(self.update, job) current_nodes = None self.run_pool.waitall() except (Exception, Timeout) as err: if dev_stats: if current_nodes: dev_stats.add_failure_stats([ (failure_dev['replication_ip'], failure_dev['device']) for failure_dev in current_nodes ]) else: dev_stats.add_failure_stats(self.all_devs_info) self.logger.exception( _("Exception in top-level replication loop: %s"), err) finally: stats.kill() self.stats_line() def update_recon(self, total, end_time, override_devices): # Called at the end of a replication pass to update recon stats. if self.is_multiprocess_worker: # If it weren't for the failure_nodes field, we could do this as # a bunch of shared memory using multiprocessing.Value, which # would be nice because it'd avoid dealing with existing data # during an upgrade. 
update = { 'object_replication_per_disk': { od: { 'replication_stats': self.stats_for_dev[od].to_recon(), 'replication_time': total, 'replication_last': end_time, 'object_replication_time': total, 'object_replication_last': end_time } for od in override_devices } } else: update = { 'replication_stats': self.total_stats.to_recon(), 'replication_time': total, 'replication_last': end_time, 'object_replication_time': total, 'object_replication_last': end_time } dump_recon_cache(update, self.rcache, self.logger) def aggregate_recon_update(self): per_disk_stats = load_recon_cache(self.rcache).get( 'object_replication_per_disk', {}) recon_update = {} min_repl_last = float('inf') min_repl_time = float('inf') # If every child has reported some stats, then aggregate things. if all(ld in per_disk_stats for ld in self.all_local_devices): aggregated = Stats() for device_name, data in per_disk_stats.items(): aggregated += Stats.from_recon(data['replication_stats']) min_repl_time = min(min_repl_time, data['object_replication_time']) min_repl_last = min(min_repl_last, data['object_replication_last']) recon_update['replication_stats'] = aggregated.to_recon() recon_update['replication_last'] = min_repl_last recon_update['replication_time'] = min_repl_time recon_update['object_replication_last'] = min_repl_last recon_update['object_replication_time'] = min_repl_time # Clear out entries for old local devices that we no longer have devices_to_remove = set(per_disk_stats) - set(self.all_local_devices) if devices_to_remove: recon_update['object_replication_per_disk'] = { dtr: {} for dtr in devices_to_remove } return recon_update def run_once(self, multiprocess_worker_index=None, have_overrides=False, *args, **kwargs): if multiprocess_worker_index is not None: self.is_multiprocess_worker = True self._emplace_log_prefix(multiprocess_worker_index) rsync_reaper = eventlet.spawn(self._child_process_reaper) self._zero_stats() self.logger.info(_("Running object replicator in script mode.")) override_opts = parse_override_options(once=True, **kwargs) devices = override_opts.devices or None partitions = override_opts.partitions or None policies = override_opts.policies or None start_time = time.time() self.replicate(override_devices=devices, override_partitions=partitions, override_policies=policies, start_time=start_time) end_time = time.time() total = (end_time - start_time) / 60 self.logger.info( _("Object replication complete (once). (%.02f minutes)"), total) # If we've been manually run on a subset of # policies/devices/partitions, then our recon stats are not # representative of how replication is doing, so we don't publish # them. 
if self.is_multiprocess_worker: # The main process checked for overrides and determined that # there were none should_update_recon = not have_overrides else: # We are single-process, so update recon only if we worked on # everything should_update_recon = not (partitions or devices or policies) if should_update_recon: self.update_recon(total, end_time, devices) # Give rsync processes one last chance to exit, then bail out and # let them be init's problem self._child_process_reaper_queue.put(None) rsync_reaper.wait() def run_forever(self, multiprocess_worker_index=None, override_devices=None, *args, **kwargs): if multiprocess_worker_index is not None: self.is_multiprocess_worker = True self._emplace_log_prefix(multiprocess_worker_index) self.logger.info(_("Starting object replicator in daemon mode.")) eventlet.spawn_n(self._child_process_reaper) # Run the replicator continually while True: self._zero_stats() self.logger.info(_("Starting object replication pass.")) # Run the replicator start = time.time() self.replicate(override_devices=override_devices) end = time.time() total = (end - start) / 60 self.logger.info(_("Object replication complete. (%.02f minutes)"), total) self.update_recon(total, end, override_devices) self.logger.debug('Replication sleeping for %s seconds.', self.interval) sleep(self.interval) def post_multiprocess_run(self): # This method is called after run_once using multiple workers. update = self.aggregate_recon_update() dump_recon_cache(update, self.rcache, self.logger)
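The replicate() pass above pairs a bounded GreenPool for per-partition jobs with a background heartbeat greenthread that keeps logging progress until the finally block kills it. The following is a minimal, self-contained sketch of that spawn/waitall/kill shape; the job function, interval, and partition list are invented for illustration and are not part of the replicator.

import eventlet
from eventlet import GreenPool

def heartbeat(interval=2):
    # Background progress logger; runs until the parent greenthread kills it.
    while True:
        eventlet.sleep(interval)
        print('still replicating...')

def do_partition(part):
    # Stand-in for update()/update_deleted(); just simulates some I/O.
    eventlet.sleep(0.1)
    return part

def run_pass(partitions, concurrency=4):
    stats = eventlet.spawn(heartbeat)
    eventlet.sleep()  # give the spawned heartbeat a cycle, as replicate() does
    pool = GreenPool(size=concurrency)
    try:
        for part in partitions:
            pool.spawn(do_partition, part)
        pool.waitall()
    finally:
        stats.kill()  # always stop the background greenthread

run_pass(range(10))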
class Crawler(object): def __init__(self, max_connections, input_is_plain): self.max_connections = max_connections self.input_is_plain = input_is_plain self.queue = Queue(1) self.closed = False self._handler_pool = GreenPool(self.max_connections) self._robots_cache = PoolMap(self.get_robots_checker, pool_max_size=1, timeout=600) # Start IO worker and die if he does. self.io_worker = io.Worker(lambda: self.closed) t = spawn(self.io_worker.run_loop) t.link(reraise_errors, greenthread.getcurrent()) log.debug(u"Crawler started. Max connections: %d.", self.max_connections) def crawl(self, forever=True): # TODO: do something special about signals? if forever: self.start_queue_updater() while not self.closed: # `get_nowait` will only work together with sleep(0) here # because we need greenlet switch to reraise exception from `do_process`. sleep() try: item = self.queue.get_nowait() except Empty: if not forever: self.graceful_stop() sleep(0.01) continue t = self._handler_pool.spawn(self.do_process, item) t.link(reraise_errors, greenthread.getcurrent()) def stop(self): self.closed = True def graceful_stop(self, timeout=None): """Stops crawler and waits for all already started crawling requests to finish. If `timeout` is supplied, it waits for at most `timeout` time to finish and returns True if allocated time was enough. Returns False if `timeout` was not enough. """ self.closed = True if timeout is not None: with eventlet.Timeout(timeout, False): if hasattr(self, "_queue_updater_thread"): self._queue_updater_thread.kill() self._handler_pool.waitall() return True return False else: if hasattr(self, "_queue_updater_thread"): self._queue_updater_thread.kill() self._handler_pool.waitall() def start_queue_updater(self): self._queue_updater_thread = spawn(self.queue_updater) self._queue_updater_thread.link(reraise_errors, greenthread.getcurrent()) def queue_updater(self): log.debug("Waiting for crawl jobs on stdin.") for line in sys.stdin: if self.closed: break line = line.strip() if self.input_is_plain: job = {"url": line} else: try: job = json.loads(line) except ValueError: log.error(u"Decoding input line: %s", line) continue # extend worker queue # 1. skip duplicate URLs for queue_item in self.queue.queue: if queue_item["url"] == job["url"]: # compare URLs break else: # 2. extend queue with new items # May block here, when queue is full. This is a feature. self.queue.put(job) # Stdin exhausted -> stop. while not self.queue.empty(): sleep(0.01) sleep(2) # FIXME: Crutch to prevent stopping too early. self.graceful_stop() def get_robots_checker(self, scheme, authority): """PoolMap func :: scheme, authority -> (agent, uri -> bool).""" robots_uri = "%s://%s/robots.txt" % (scheme, authority) fetch_result = self.io_worker.fetch(robots_uri) # Graceful stop thing. if fetch_result is None: return None if fetch_result["success"]: # TODO: set expiration time from headers # but this must be done after `self._robots_cache.put` or somehow else... if 200 <= fetch_result["status_code"] < 300: parser = robotparser.RobotFileParser() content_lines = fetch_result["content"].splitlines() try: parser.parse(content_lines) except KeyError: raise RobotsError(u"Known robotparser bug: KeyError at urllib.quote(path).") return parser.can_fetch # Authorization required and Forbidden are considered Disallow all. elif fetch_result["status_code"] in (401, 403): return lambda _agent, _uri: False # /robots.txt Not Found is considered Allow all. 
elif fetch_result["status_code"] == 404: return lambda _agent, _uri: True # FIXME: this is an optimistic rule and probably should be detailed with more specific checks elif fetch_result["status_code"] >= 400: return lambda _agent, _uri: True # What other cases left? 100 and redirects. Consider it Disallow all. else: return lambda _agent, _uri: False else: raise FetchError(u"/robots.txt fetch problem: %s" % (fetch_result["result"])) def ask_robots(self, uri, scheme, authority): key = scheme + ":" + authority with self._robots_cache.getc(key, scheme, authority) as checker: try: # Graceful stop thing. if checker is None: return None return checker(settings.identity["name"], uri) except Exception as e: log.exception(u"Get rid of this. ask_robots @ %s", uri) raise RobotsError(u"Error checking robots.txt permissions for URI '%s': %s" % (uri, unicode(e)))
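graceful_stop() above leans on eventlet.Timeout(timeout, False): passing False as the exception makes the timer expire silently instead of raising, so waitall() is simply abandoned once the time budget runs out. A small sketch of that wait-with-deadline idiom, with an invented worker function, might look like this:

import eventlet
from eventlet import GreenPool

def worker(seconds):
    eventlet.sleep(seconds)  # pretend to crawl for a while

def wait_for_pool(pool, timeout=None):
    # Returns True if the pool drained in time, False otherwise.
    if timeout is None:
        pool.waitall()
        return True
    # exception=False makes the Timeout expire silently instead of raising.
    with eventlet.Timeout(timeout, False):
        pool.waitall()
        return True
    return False

pool = GreenPool(5)
for seconds in (0.1, 0.2, 5):
    pool.spawn(worker, seconds)
print(wait_for_pool(pool, timeout=1))  # False: the 5 second job is still running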
class AccountReaper(Daemon): """ Removes data from status=DELETED accounts. These are accounts that have been asked to be removed by the reseller via services remove_storage_account XMLRPC call. The account is not deleted immediately by the services call, but instead the account is simply marked for deletion by setting the status column in the account_stat table of the account database. This account reaper scans for such accounts and removes the data in the background. The background deletion process will occur on the primary account server for the account. :param server_conf: The [account-server] dictionary of the account server configuration file :param reaper_conf: The [account-reaper] dictionary of the account server configuration file See the etc/account-server.conf-sample for information on the possible configuration parameters. """ def __init__(self, conf, logger=None): self.conf = conf self.logger = logger or get_logger(conf, log_route='account-reaper') self.devices = conf.get('devices', '/srv/node') self.mount_check = config_true_value(conf.get('mount_check', 'true')) self.interval = int(conf.get('interval', 3600)) self.swift_dir = conf.get('swift_dir', '/etc/swift') self.account_ring = None self.container_ring = None self.object_ring = None self.node_timeout = int(conf.get('node_timeout', 10)) self.conn_timeout = float(conf.get('conn_timeout', 0.5)) self.myips = whataremyips() self.concurrency = int(conf.get('concurrency', 25)) self.container_concurrency = self.object_concurrency = \ sqrt(self.concurrency) self.container_pool = GreenPool(size=self.container_concurrency) swift.common.db.DB_PREALLOCATION = \ config_true_value(conf.get('db_preallocation', 'f')) self.delay_reaping = int(conf.get('delay_reaping') or 0) reap_warn_after = float(conf.get('reap_warn_after') or 86400 * 30) self.reap_not_done_after = reap_warn_after + self.delay_reaping def get_account_ring(self): """The account :class:`swift.common.ring.Ring` for the cluster.""" if not self.account_ring: self.account_ring = Ring(self.swift_dir, ring_name='account') return self.account_ring def get_container_ring(self): """The container :class:`swift.common.ring.Ring` for the cluster.""" if not self.container_ring: self.container_ring = Ring(self.swift_dir, ring_name='container') return self.container_ring def get_object_ring(self, policy_idx): """ Get the ring identified by the policy index :param policy_idx: Storage policy index :returns: A ring matching the storage policy """ return POLICIES.get_object_ring(policy_idx, self.swift_dir) def run_forever(self, *args, **kwargs): """Main entry point when running the reaper in normal daemon mode. This repeatedly calls :func:`reap_once` no quicker than the configuration interval. """ self.logger.debug('Daemon started.') sleep(random.random() * self.interval) while True: begin = time() self.run_once() elapsed = time() - begin if elapsed < self.interval: sleep(self.interval - elapsed) def run_once(self, *args, **kwargs): """ Main entry point when running the reaper in 'once' mode, where it will do a single pass over all accounts on the server. This is called repeatedly by :func:`run_forever`. This will call :func:`reap_device` once for each device on the server. 
""" self.logger.debug('Begin devices pass: %s', self.devices) begin = time() try: for device in os.listdir(self.devices): if self.mount_check and not ismount( os.path.join(self.devices, device)): self.logger.increment('errors') self.logger.debug( _('Skipping %s as it is not mounted'), device) continue self.reap_device(device) except (Exception, Timeout): self.logger.exception(_("Exception in top-level account reaper " "loop")) elapsed = time() - begin self.logger.info(_('Devices pass completed: %.02fs'), elapsed) def reap_device(self, device): """ Called once per pass for each device on the server. This will scan the accounts directory for the device, looking for partitions this device is the primary for, then looking for account databases that are marked status=DELETED and still have containers and calling :func:`reap_account`. Account databases marked status=DELETED that no longer have containers will eventually be permanently removed by the reclaim process within the account replicator (see :mod:`swift.db_replicator`). :param device: The device to look for accounts to be deleted. """ datadir = os.path.join(self.devices, device, DATADIR) if not os.path.exists(datadir): return for partition in os.listdir(datadir): partition_path = os.path.join(datadir, partition) if not partition.isdigit(): continue nodes = self.get_account_ring().get_part_nodes(int(partition)) if nodes[0]['ip'] not in self.myips or \ not os.path.isdir(partition_path): continue for suffix in os.listdir(partition_path): suffix_path = os.path.join(partition_path, suffix) if not os.path.isdir(suffix_path): continue for hsh in os.listdir(suffix_path): hsh_path = os.path.join(suffix_path, hsh) if not os.path.isdir(hsh_path): continue for fname in sorted(os.listdir(hsh_path), reverse=True): if fname.endswith('.ts'): break elif fname.endswith('.db'): self.start_time = time() broker = \ AccountBroker(os.path.join(hsh_path, fname)) if broker.is_status_deleted() and \ not broker.empty(): self.reap_account(broker, partition, nodes) def reset_stats(self): self.stats_return_codes = {} self.stats_containers_deleted = 0 self.stats_objects_deleted = 0 self.stats_containers_remaining = 0 self.stats_objects_remaining = 0 self.stats_containers_possibly_remaining = 0 self.stats_objects_possibly_remaining = 0 def reap_account(self, broker, partition, nodes): """ Called once per pass for each account this server is the primary for and attempts to delete the data for the given account. The reaper will only delete one account at any given time. It will call :func:`reap_container` up to sqrt(self.concurrency) times concurrently while reaping the account. If there is any exception while deleting a single container, the process will continue for any other containers and the failed containers will be tried again the next time this function is called with the same parameters. If there is any exception while listing the containers for deletion, the process will stop (but will obviously be tried again the next time this function is called with the same parameters). This isn't likely since the listing comes from the local database. After the process completes (successfully or not) statistics about what was accomplished will be logged. This function returns nothing and should raise no exception but only update various self.stats_* values for what occurs. :param broker: The AccountBroker for the account to delete. :param partition: The partition in the account ring the account is on. :param nodes: The primary node dicts for the account to delete. .. 
seealso:: :class:`swift.account.backend.AccountBroker` for the broker class. .. seealso:: :func:`swift.common.ring.Ring.get_nodes` for a description of the node dicts. """ begin = time() info = broker.get_info() if time() - float(Timestamp(info['delete_timestamp'])) <= \ self.delay_reaping: return False account = info['account'] self.logger.info(_('Beginning pass on account %s'), account) self.reset_stats() try: marker = '' while True: containers = \ list(broker.list_containers_iter(1000, marker, None, None, None)) if not containers: break try: for (container, _junk, _junk, _junk) in containers: self.container_pool.spawn(self.reap_container, account, partition, nodes, container) self.container_pool.waitall() except (Exception, Timeout): self.logger.exception( _('Exception with containers for account %s'), account) marker = containers[-1][0] if marker == '': break log = 'Completed pass on account %s' % account except (Exception, Timeout): self.logger.exception( _('Exception with account %s'), account) log = _('Incomplete pass on account %s') % account if self.stats_containers_deleted: log += _(', %s containers deleted') % self.stats_containers_deleted if self.stats_objects_deleted: log += _(', %s objects deleted') % self.stats_objects_deleted if self.stats_containers_remaining: log += _(', %s containers remaining') % \ self.stats_containers_remaining if self.stats_objects_remaining: log += _(', %s objects remaining') % self.stats_objects_remaining if self.stats_containers_possibly_remaining: log += _(', %s containers possibly remaining') % \ self.stats_containers_possibly_remaining if self.stats_objects_possibly_remaining: log += _(', %s objects possibly remaining') % \ self.stats_objects_possibly_remaining if self.stats_return_codes: log += _(', return codes: ') for code in sorted(self.stats_return_codes): log += '%s %sxxs, ' % (self.stats_return_codes[code], code) log = log[:-2] log += _(', elapsed: %.02fs') % (time() - begin) self.logger.info(log) self.logger.timing_since('timing', self.start_time) delete_timestamp = Timestamp(info['delete_timestamp']) if self.stats_containers_remaining and \ begin - float(delete_timestamp) >= self.reap_not_done_after: self.logger.warn(_('Account %s has not been reaped since %s') % (account, delete_timestamp.isoformat)) return True def reap_container(self, account, account_partition, account_nodes, container): """ Deletes the data and the container itself for the given container. This will call :func:`reap_object` up to sqrt(self.concurrency) times concurrently for the objects in the container. If there is any exception while deleting a single object, the process will continue for any other objects in the container and the failed objects will be tried again the next time this function is called with the same parameters. If there is any exception while listing the objects for deletion, the process will stop (but will obviously be tried again the next time this function is called with the same parameters). This is a possibility since the listing comes from querying just the primary remote container server. Once all objects have been attempted to be deleted, the container itself will be attempted to be deleted by sending a delete request to all container nodes. The format of the delete request is such that each container server will update a corresponding account server, removing the container from the account's listing. This function returns nothing and should raise no exception but only update various self.stats_* values for what occurs. 
:param account: The name of the account for the container. :param account_partition: The partition for the account on the account ring. :param account_nodes: The primary node dicts for the account. :param container: The name of the container to delete. * See also: :func:`swift.common.ring.Ring.get_nodes` for a description of the account node dicts. """ account_nodes = list(account_nodes) part, nodes = self.get_container_ring().get_nodes(account, container) node = nodes[-1] pool = GreenPool(size=self.object_concurrency) marker = '' while True: objects = None try: headers, objects = direct_get_container( node, part, account, container, marker=marker, conn_timeout=self.conn_timeout, response_timeout=self.node_timeout) self.stats_return_codes[2] = \ self.stats_return_codes.get(2, 0) + 1 self.logger.increment('return_codes.2') except ClientException as err: if self.logger.getEffectiveLevel() <= DEBUG: self.logger.exception( _('Exception with %(ip)s:%(port)s/%(device)s'), node) self.stats_return_codes[err.http_status / 100] = \ self.stats_return_codes.get(err.http_status / 100, 0) + 1 self.logger.increment( 'return_codes.%d' % (err.http_status / 100,)) if not objects: break try: policy_index = headers.get('X-Backend-Storage-Policy-Index', 0) for obj in objects: if isinstance(obj['name'], unicode): obj['name'] = obj['name'].encode('utf8') pool.spawn(self.reap_object, account, container, part, nodes, obj['name'], policy_index) pool.waitall() except (Exception, Timeout): self.logger.exception(_('Exception with objects for container ' '%(container)s for account %(account)s' ), {'container': container, 'account': account}) marker = objects[-1]['name'] if marker == '': break successes = 0 failures = 0 for node in nodes: anode = account_nodes.pop() try: direct_delete_container( node, part, account, container, conn_timeout=self.conn_timeout, response_timeout=self.node_timeout, headers={'X-Account-Host': '%(ip)s:%(port)s' % anode, 'X-Account-Partition': str(account_partition), 'X-Account-Device': anode['device'], 'X-Account-Override-Deleted': 'yes'}) successes += 1 self.stats_return_codes[2] = \ self.stats_return_codes.get(2, 0) + 1 self.logger.increment('return_codes.2') except ClientException as err: if self.logger.getEffectiveLevel() <= DEBUG: self.logger.exception( _('Exception with %(ip)s:%(port)s/%(device)s'), node) failures += 1 self.logger.increment('containers_failures') self.stats_return_codes[err.http_status / 100] = \ self.stats_return_codes.get(err.http_status / 100, 0) + 1 self.logger.increment( 'return_codes.%d' % (err.http_status / 100,)) if successes > failures: self.stats_containers_deleted += 1 self.logger.increment('containers_deleted') elif not successes: self.stats_containers_remaining += 1 self.logger.increment('containers_remaining') else: self.stats_containers_possibly_remaining += 1 self.logger.increment('containers_possibly_remaining') def reap_object(self, account, container, container_partition, container_nodes, obj, policy_index): """ Deletes the given object by issuing a delete request to each node for the object. The format of the delete request is such that each object server will update a corresponding container server, removing the object from the container's listing. This function returns nothing and should raise no exception but only update various self.stats_* values for what occurs. :param account: The name of the account for the object. :param container: The name of the container for the object. 
:param container_partition: The partition for the container on the container ring. :param container_nodes: The primary node dicts for the container. :param obj: The name of the object to delete. :param policy_index: The storage policy index of the object's container * See also: :func:`swift.common.ring.Ring.get_nodes` for a description of the container node dicts. """ container_nodes = list(container_nodes) ring = self.get_object_ring(policy_index) part, nodes = ring.get_nodes(account, container, obj) successes = 0 failures = 0 for node in nodes: cnode = container_nodes.pop() try: direct_delete_object( node, part, account, container, obj, conn_timeout=self.conn_timeout, response_timeout=self.node_timeout, headers={'X-Container-Host': '%(ip)s:%(port)s' % cnode, 'X-Container-Partition': str(container_partition), 'X-Container-Device': cnode['device'], 'X-Backend-Storage-Policy-Index': policy_index}) successes += 1 self.stats_return_codes[2] = \ self.stats_return_codes.get(2, 0) + 1 self.logger.increment('return_codes.2') except ClientException as err: if self.logger.getEffectiveLevel() <= DEBUG: self.logger.exception( _('Exception with %(ip)s:%(port)s/%(device)s'), node) failures += 1 self.logger.increment('objects_failures') self.stats_return_codes[err.http_status / 100] = \ self.stats_return_codes.get(err.http_status / 100, 0) + 1 self.logger.increment( 'return_codes.%d' % (err.http_status / 100,)) if successes > failures: self.stats_objects_deleted += 1 self.logger.increment('objects_deleted') elif not successes: self.stats_objects_remaining += 1 self.logger.increment('objects_remaining') else: self.stats_objects_possibly_remaining += 1 self.logger.increment('objects_possibly_remaining')
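As the docstrings above describe, the reaper spreads its concurrency budget over two levels: a container pool of roughly sqrt(concurrency) greenthreads, each of which opens an object pool of the same size, keeping the total near the configured limit. The sketch below shows that two-level fan-out in isolation; delete_object and the sample listing are stand-ins, not the real direct-client calls.

from math import sqrt
from eventlet import GreenPool, sleep

CONCURRENCY = 25
LEVEL_SIZE = int(sqrt(CONCURRENCY))  # ~5 container workers x ~5 object workers each

def delete_object(container, obj):
    sleep(0.01)  # stand-in for issuing DELETEs to each object node

def reap_container(container, objects):
    obj_pool = GreenPool(size=LEVEL_SIZE)
    for obj in objects:
        obj_pool.spawn(delete_object, container, obj)
    obj_pool.waitall()

def reap_account(listing):
    container_pool = GreenPool(size=LEVEL_SIZE)
    for container, objects in listing.items():
        container_pool.spawn(reap_container, container, objects)
    container_pool.waitall()

reap_account({'c%d' % i: ['o%d' % j for j in range(20)] for i in range(10)})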
class ObjectReplicator(Daemon): """ Replicate objects. Encapsulates most logic and data needed by the object replication process. Each call to .replicate() performs one replication pass. It's up to the caller to do this in a loop. """ def __init__(self, conf): """ :param conf: configuration object obtained from ConfigParser :param logger: logging object """ self.conf = conf self.logger = get_logger(conf, log_route='object-replicator') self.devices_dir = conf.get('devices', '/srv/node') self.mount_check = config_true_value(conf.get('mount_check', 'true')) self.vm_test_mode = config_true_value(conf.get('vm_test_mode', 'no')) self.swift_dir = conf.get('swift_dir', '/etc/swift') self.port = int(conf.get('bind_port', 6000)) self.concurrency = int(conf.get('concurrency', 1)) self.stats_interval = int(conf.get('stats_interval', '300')) self.object_ring = Ring(self.swift_dir, ring_name='object') self.ring_check_interval = int(conf.get('ring_check_interval', 15)) self.next_check = time.time() + self.ring_check_interval self.reclaim_age = int(conf.get('reclaim_age', 86400 * 7)) self.partition_times = [] self.run_pause = int(conf.get('run_pause', 30)) self.rsync_timeout = int(conf.get('rsync_timeout', 900)) self.rsync_io_timeout = conf.get('rsync_io_timeout', '30') self.rsync_bwlimit = conf.get('rsync_bwlimit', '0') self.http_timeout = int(conf.get('http_timeout', 60)) self.lockup_timeout = int(conf.get('lockup_timeout', 1800)) self.recon_cache_path = conf.get('recon_cache_path', '/var/cache/swift') self.rcache = os.path.join(self.recon_cache_path, "object.recon") self.conn_timeout = float(conf.get('conn_timeout', 0.5)) self.node_timeout = float(conf.get('node_timeout', 10)) self.sync_method = getattr(self, conf.get('sync_method') or 'rsync') self.network_chunk_size = int(conf.get('network_chunk_size', 65536)) self.disk_chunk_size = int(conf.get('disk_chunk_size', 65536)) self.headers = { 'Content-Length': '0', 'user-agent': 'obj-replicator %s' % os.getpid()} self.rsync_error_log_line_length = \ int(conf.get('rsync_error_log_line_length', 0)) self.handoffs_first = config_true_value(conf.get('handoffs_first', False)) self.handoff_delete = config_auto_int_value( conf.get('handoff_delete', 'auto'), 0) self._diskfile_mgr = DiskFileManager(conf, self.logger) def sync(self, node, job, suffixes): # Just exists for doc anchor point """ Synchronize local suffix directories from a partition with a remote node. :param node: the "dev" entry for the remote node to sync with :param job: information about the partition being synced :param suffixes: a list of suffixes which need to be pushed :returns: boolean indicating success or failure """ return self.sync_method(node, job, suffixes) def _rsync(self, args): """ Execute the rsync binary to replicate a partition. :returns: return code of rsync process. 
0 is successful """ start_time = time.time() ret_val = None try: with Timeout(self.rsync_timeout): proc = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) results = proc.stdout.read() ret_val = proc.wait() except Timeout: self.logger.error(_("Killing long-running rsync: %s"), str(args)) proc.kill() return 1 # failure response code total_time = time.time() - start_time for result in results.split('\n'): if result == '': continue if result.startswith('cd+'): continue if not ret_val: self.logger.info(result) else: self.logger.error(result) if ret_val: error_line = _('Bad rsync return code: %(ret)d <- %(args)s') % \ {'args': str(args), 'ret': ret_val} if self.rsync_error_log_line_length: error_line = error_line[:self.rsync_error_log_line_length] self.logger.error(error_line) elif results: self.logger.info( _("Successful rsync of %(src)s at %(dst)s (%(time).03f)"), {'src': args[-2], 'dst': args[-1], 'time': total_time}) else: self.logger.debug( _("Successful rsync of %(src)s at %(dst)s (%(time).03f)"), {'src': args[-2], 'dst': args[-1], 'time': total_time}) return ret_val def rsync(self, node, job, suffixes): """ Uses rsync to implement the sync method. This was the first sync method in Swift. """ if not os.path.exists(job['path']): return False args = [ 'rsync', '--recursive', '--whole-file', '--human-readable', '--xattrs', '--itemize-changes', '--ignore-existing', '--timeout=%s' % self.rsync_io_timeout, '--contimeout=%s' % self.rsync_io_timeout, '--bwlimit=%s' % self.rsync_bwlimit, ] node_ip = rsync_ip(node['replication_ip']) if self.vm_test_mode: rsync_module = '%s::object%s' % (node_ip, node['replication_port']) else: rsync_module = '%s::object' % node_ip had_any = False for suffix in suffixes: spath = join(job['path'], suffix) if os.path.exists(spath): args.append(spath) had_any = True if not had_any: return False args.append(join(rsync_module, node['device'], 'objects', job['partition'])) return self._rsync(args) == 0 def ssync(self, node, job, suffixes): return ssync_sender.Sender(self, node, job, suffixes)() def check_ring(self): """ Check to see if the ring has been updated :returns: boolean indicating whether or not the ring has changed """ if time.time() > self.next_check: self.next_check = time.time() + self.ring_check_interval if self.object_ring.has_changed(): return False return True def update_deleted(self, job): """ High-level method that replicates a single partition that doesn't belong on this node. 
:param job: a dict containing info about the partition to be replicated """ def tpool_get_suffixes(path): return [suff for suff in os.listdir(path) if len(suff) == 3 and isdir(join(path, suff))] self.replication_count += 1 self.logger.increment('partition.delete.count.%s' % (job['device'],)) begin = time.time() try: responses = [] suffixes = tpool.execute(tpool_get_suffixes, job['path']) if suffixes: for node in job['nodes']: success = self.sync(node, job, suffixes) if success: with Timeout(self.http_timeout): conn = http_connect( node['replication_ip'], node['replication_port'], node['device'], job['partition'], 'REPLICATE', '/' + '-'.join(suffixes), headers=self.headers) conn.getresponse().read() responses.append(success) if self.handoff_delete: # delete handoff if we have had handoff_delete successes delete_handoff = len([resp for resp in responses if resp]) >= \ self.handoff_delete else: # delete handoff if all syncs were successful delete_handoff = len(responses) == len(job['nodes']) and \ all(responses) if not suffixes or delete_handoff: self.logger.info(_("Removing partition: %s"), job['path']) tpool.execute(shutil.rmtree, job['path'], ignore_errors=True) except (Exception, Timeout): self.logger.exception(_("Error syncing handoff partition")) finally: self.partition_times.append(time.time() - begin) self.logger.timing_since('partition.delete.timing', begin) def update(self, job): """ High-level method that replicates a single partition. :param job: a dict containing info about the partition to be replicated """ self.replication_count += 1 self.logger.increment('partition.update.count.%s' % (job['device'],)) begin = time.time() try: #MODIFIED LightSync local_hash = None part_hash_local = tpool_reraise(get_part_hash, job['path']) #Partition has been modified if part_hash_local is None: hashed, local_hash = tpool_reraise( get_hashes, job['path'], do_listdir=(self.replication_count % 10) == 0, reclaim_age=self.reclaim_age) self.suffix_hash += hashed self.logger.update_stats('suffix.hashes', hashed) part_hash_local = tpool_reraise(get_part_hash, job['path']) """hashed, local_hash = tpool_reraise( get_hashes, job['path'], do_listdir=(self.replication_count % 10) == 0, reclaim_age=self.reclaim_age) self.suffix_hash += hashed self.logger.update_stats('suffix.hashes', hashed)""" attempts_left = True nodes = itertools.chain(job['nodes']) while (True): ## # If this throws StopIterator it will be caught way below node = next(nodes) try: #MODIFIED LightSync req_suff = '' if part_hash_local is None else '/_SHORTREP_-'\ +part_hash_local with Timeout(self.http_timeout): resp = http_connect( node['replication_ip'], node['replication_port'], node['device'], job['partition'], 'REPLICATE', req_suff, headers=self.headers).getresponse() if resp.status == HTTP_INSUFFICIENT_STORAGE: self.logger.error(_('%(ip)s/%(device)s responded' ' as unmounted'), node) if(attempts_left): attempts_left = False ########To modify to start from current node's hand-off: Hash node info to get hand-off position nodes = itertools.chain( self.object_ring.get_more_nodes(int(job['partition'])), nodes) continue if resp.status != HTTP_OK: self.logger.error(_("Invalid response %(resp)s " "from %(ip)s"), {'resp': resp.status, 'ip': node['replication_ip']}) continue part_hash_remote = pickle.loads(resp.read()) del resp if part_hash_remote == "OK": break remote_hash = part_hash_remote if local_hash is None: hashed, local_hash = tpool_reraise( get_hashes, job['path'], do_listdir=(self.replication_count % 10) == 0, 
reclaim_age=self.reclaim_age) self.suffix_hash += hashed self.logger.update_stats('suffix.hashes', hashed) ''' with Timeout(self.http_timeout): resp = http_connect( node['replication_ip'], node['replication_port'], node['device'], job['partition'], 'REPLICATE', '', headers=self.headers).getresponse() if resp.status == HTTP_INSUFFICIENT_STORAGE: self.logger.error(_('%(ip)s/%(device)s responded' ' as unmounted'), node) attempts_left += 1 continue if resp.status != HTTP_OK: self.logger.error(_("Invalid response %(resp)s " "from %(ip)s"), {'resp': resp.status, 'ip': node['replication_ip']}) continue remote_hash = pickle.loads(resp.read()) del resp ''' ## suffixes = [suffix for suffix in local_hash if local_hash[suffix] != remote_hash.get(suffix, -1)] if not suffixes: #MODIFIED LightSync break ## hashed, recalc_hash = tpool_reraise( get_hashes, job['path'], recalculate=suffixes, reclaim_age=self.reclaim_age) self.logger.update_stats('suffix.hashes', hashed) local_hash = recalc_hash suffixes = [suffix for suffix in local_hash if local_hash[suffix] != remote_hash.get(suffix, -1)] self.sync(node, job, suffixes) with Timeout(self.http_timeout): conn = http_connect( node['replication_ip'], node['replication_port'], node['device'], job['partition'], 'REPLICATE', '/' + '-'.join(suffixes), headers=self.headers) conn.getresponse().read() self.suffix_sync += len(suffixes) self.logger.update_stats('suffix.syncs', len(suffixes)) #MODIFIED LightSync break ## except (Exception, Timeout): self.logger.exception(_("Error syncing with node: %s") % node) #MODIFIED LightSync (after if) self.suffix_count += len(local_hash) if local_hash is not None else 0 ## except (Exception, Timeout): self.logger.exception(_("Error syncing partition")) finally: self.partition_times.append(time.time() - begin) self.logger.timing_since('partition.update.timing', begin) def stats_line(self): """ Logs various stats for the currently running replication pass. """ if self.replication_count: elapsed = (time.time() - self.start) or 0.000001 rate = self.replication_count / elapsed self.logger.info( _("%(replicated)d/%(total)d (%(percentage).2f%%)" " partitions replicated in %(time).2fs (%(rate).2f/sec, " "%(remaining)s remaining)"), {'replicated': self.replication_count, 'total': self.job_count, 'percentage': self.replication_count * 100.0 / self.job_count, 'time': time.time() - self.start, 'rate': rate, 'remaining': '%d%s' % compute_eta(self.start, self.replication_count, self.job_count)}) if self.suffix_count: self.logger.info( _("%(checked)d suffixes checked - " "%(hashed).2f%% hashed, %(synced).2f%% synced"), {'checked': self.suffix_count, 'hashed': (self.suffix_hash * 100.0) / self.suffix_count, 'synced': (self.suffix_sync * 100.0) / self.suffix_count}) self.partition_times.sort() self.logger.info( _("Partition times: max %(max).4fs, " "min %(min).4fs, med %(med).4fs"), {'max': self.partition_times[-1], 'min': self.partition_times[0], 'med': self.partition_times[ len(self.partition_times) // 2]}) else: self.logger.info( _("Nothing replicated for %s seconds."), (time.time() - self.start)) def kill_coros(self): """Utility function that kills all coroutines currently running.""" for coro in list(self.run_pool.coroutines_running): try: coro.kill(GreenletExit) except GreenletExit: pass def heartbeat(self): """ Loop that runs in the background during replication. It periodically logs progress. 
""" while True: eventlet.sleep(self.stats_interval) self.stats_line() def detect_lockups(self): """ In testing, the pool.waitall() call very occasionally failed to return. This is an attempt to make sure the replicator finishes its replication pass in some eventuality. """ while True: eventlet.sleep(self.lockup_timeout) if self.replication_count == self.last_replication_count: self.logger.error(_("Lockup detected.. killing live coros.")) self.kill_coros() self.last_replication_count = self.replication_count def collect_jobs(self): """ Returns a sorted list of jobs (dictionaries) that specify the partitions, nodes, etc to be synced. """ jobs = [] ips = whataremyips() for local_dev in [dev for dev in self.object_ring.devs if dev and dev['replication_ip'] in ips and dev['replication_port'] == self.port]: dev_path = join(self.devices_dir, local_dev['device']) obj_path = join(dev_path, 'objects') tmp_path = join(dev_path, 'tmp') if self.mount_check and not ismount(dev_path): self.logger.warn(_('%s is not mounted'), local_dev['device']) continue unlink_older_than(tmp_path, time.time() - self.reclaim_age) if not os.path.exists(obj_path): try: mkdirs(obj_path) except Exception: self.logger.exception('ERROR creating %s' % obj_path) continue for partition in os.listdir(obj_path): try: job_path = join(obj_path, partition) if isfile(job_path): # Clean up any (probably zero-byte) files where a # partition should be. self.logger.warning('Removing partition directory ' 'which was a file: %s', job_path) os.remove(job_path) continue part_nodes = \ self.object_ring.get_part_nodes(int(partition)) #MODIFIED LightSync for mypos in range(len(part_nodes)): if part_nodes[mypos]['id'] == local_dev['id']: break nodes = part_nodes[mypos+1:]+part_nodes[:mypos] ## jobs.append( dict(path=job_path, device=local_dev['device'], nodes=nodes, delete=len(nodes) > len(part_nodes) - 1, partition=partition)) except (ValueError, OSError): continue random.shuffle(jobs) if self.handoffs_first: # Move the handoff parts to the front of the list jobs.sort(key=lambda job: not job['delete']) self.job_count = len(jobs) return jobs def replicate(self, override_devices=None, override_partitions=None): """Run a replication pass""" self.start = time.time() self.suffix_count = 0 self.suffix_sync = 0 self.suffix_hash = 0 self.replication_count = 0 self.last_replication_count = -1 self.partition_times = [] if override_devices is None: override_devices = [] if override_partitions is None: override_partitions = [] stats = eventlet.spawn(self.heartbeat) lockup_detector = eventlet.spawn(self.detect_lockups) eventlet.sleep() # Give spawns a cycle try: self.run_pool = GreenPool(size=self.concurrency) jobs = self.collect_jobs() for job in jobs: if override_devices and job['device'] not in override_devices: continue if override_partitions and \ job['partition'] not in override_partitions: continue dev_path = join(self.devices_dir, job['device']) if self.mount_check and not ismount(dev_path): self.logger.warn(_('%s is not mounted'), job['device']) continue if not self.check_ring(): self.logger.info(_("Ring change detected. 
Aborting " "current replication pass.")) return if job['delete']: self.run_pool.spawn(self.update_deleted, job) else: self.run_pool.spawn(self.update, job) with Timeout(self.lockup_timeout): self.run_pool.waitall() except (Exception, Timeout): self.logger.exception(_("Exception in top-level replication loop")) self.kill_coros() finally: stats.kill() lockup_detector.kill() self.stats_line() def run_once(self, *args, **kwargs): start = time.time() self.logger.info(_("Running object replicator in script mode.")) override_devices = list_from_csv(kwargs.get('devices')) override_partitions = list_from_csv(kwargs.get('partitions')) self.replicate( override_devices=override_devices, override_partitions=override_partitions) total = (time.time() - start) / 60 self.logger.info( _("Object replication complete (once). (%.02f minutes)"), total) if not (override_partitions or override_devices): dump_recon_cache({'object_replication_time': total, 'object_replication_last': time.time()}, self.rcache, self.logger) def run_forever(self, *args, **kwargs): self.logger.info(_("Starting object replicator in daemon mode.")) # Run the replicator continually while True: start = time.time() self.logger.info(_("Starting object replication pass.")) # Run the replicator self.replicate() total = (time.time() - start) / 60 self.logger.info( _("Object replication complete. (%.02f minutes)"), total) dump_recon_cache({'object_replication_time': total, 'object_replication_last': time.time()}, self.rcache, self.logger) self.logger.debug('Replication sleeping for %s seconds.', self.run_pause) sleep(self.run_pause)
class ObjectReplicator(Daemon): """ Replicate objects. Encapsulates most logic and data needed by the object replication process. Each call to .replicate() performs one replication pass. It's up to the caller to do this in a loop. """ def __init__(self, conf, logger=None): """ :param conf: configuration object obtained from ConfigParser :param logger: logging object """ self.conf = conf self.logger = logger or get_logger(conf, log_route='object-replicator') self.devices_dir = conf.get('devices', '/srv/node') self.mount_check = config_true_value(conf.get('mount_check', 'true')) self.swift_dir = conf.get('swift_dir', '/etc/swift') self.bind_ip = conf.get('bind_ip', '0.0.0.0') self.servers_per_port = int(conf.get('servers_per_port', '0') or 0) self.port = None if self.servers_per_port else \ int(conf.get('bind_port', 6200)) self.concurrency = int(conf.get('concurrency', 1)) self.stats_interval = int(conf.get('stats_interval', '300')) self.ring_check_interval = int(conf.get('ring_check_interval', 15)) self.next_check = time.time() + self.ring_check_interval self.replication_cycle = random.randint(0, 9) self.partition_times = [] self.interval = int(conf.get('interval') or conf.get('run_pause') or 30) self.rsync_timeout = int(conf.get('rsync_timeout', DEFAULT_RSYNC_TIMEOUT)) self.rsync_io_timeout = conf.get('rsync_io_timeout', '30') self.rsync_bwlimit = conf.get('rsync_bwlimit', '0') self.rsync_compress = config_true_value( conf.get('rsync_compress', 'no')) self.rsync_module = conf.get('rsync_module', '').rstrip('/') if not self.rsync_module: self.rsync_module = '{replication_ip}::object' self.http_timeout = int(conf.get('http_timeout', 60)) self.lockup_timeout = int(conf.get('lockup_timeout', 1800)) self.recon_cache_path = conf.get('recon_cache_path', '/var/cache/swift') self.rcache = os.path.join(self.recon_cache_path, "object.recon") self.conn_timeout = float(conf.get('conn_timeout', 0.5)) self.node_timeout = float(conf.get('node_timeout', 10)) self.sync_method = getattr(self, conf.get('sync_method') or 'rsync') self.network_chunk_size = int(conf.get('network_chunk_size', 65536)) self.default_headers = { 'Content-Length': '0', 'user-agent': 'object-replicator %s' % os.getpid()} self.rsync_error_log_line_length = \ int(conf.get('rsync_error_log_line_length', 0)) self.handoffs_first = config_true_value(conf.get('handoffs_first', False)) self.handoff_delete = config_auto_int_value( conf.get('handoff_delete', 'auto'), 0) if any((self.handoff_delete, self.handoffs_first)): self.logger.warning('Handoff only mode is not intended for normal ' 'operation, please disable handoffs_first and ' 'handoff_delete before the next ' 'normal rebalance') self._df_router = DiskFileRouter(conf, self.logger) def _zero_stats(self): """Zero out the stats.""" self.stats = {'attempted': 0, 'success': 0, 'failure': 0, 'hashmatch': 0, 'rsync': 0, 'remove': 0, 'start': time.time(), 'failure_nodes': {}} def _add_failure_stats(self, failure_devs_info): for node, dev in failure_devs_info: self.stats['failure'] += 1 failure_devs = self.stats['failure_nodes'].setdefault(node, {}) failure_devs.setdefault(dev, 0) failure_devs[dev] += 1 def _get_my_replication_ips(self): my_replication_ips = set() ips = whataremyips() for policy in POLICIES: self.load_object_ring(policy) for local_dev in [dev for dev in policy.object_ring.devs if dev and dev['replication_ip'] in ips and dev['replication_port'] == self.port]: my_replication_ips.add(local_dev['replication_ip']) return list(my_replication_ips) # Just exists for doc anchor point def 
sync(self, node, job, suffixes, *args, **kwargs): """ Synchronize local suffix directories from a partition with a remote node. :param node: the "dev" entry for the remote node to sync with :param job: information about the partition being synced :param suffixes: a list of suffixes which need to be pushed :returns: boolean and dictionary, boolean indicating success or failure """ return self.sync_method(node, job, suffixes, *args, **kwargs) def load_object_ring(self, policy): """ Make sure the policy's rings are loaded. :param policy: the StoragePolicy instance :returns: appropriate ring object """ policy.load_ring(self.swift_dir) return policy.object_ring def _rsync(self, args): """ Execute the rsync binary to replicate a partition. :returns: return code of rsync process. 0 is successful """ start_time = time.time() ret_val = None try: with Timeout(self.rsync_timeout): proc = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) results = proc.stdout.read() ret_val = proc.wait() except Timeout: self.logger.error(_("Killing long-running rsync: %s"), str(args)) proc.kill() return 1 # failure response code total_time = time.time() - start_time for result in results.split('\n'): if result == '': continue if result.startswith('cd+'): continue if not ret_val: self.logger.info(result) else: self.logger.error(result) if ret_val: error_line = _('Bad rsync return code: %(ret)d <- %(args)s') % \ {'args': str(args), 'ret': ret_val} if self.rsync_error_log_line_length: error_line = error_line[:self.rsync_error_log_line_length] self.logger.error(error_line) else: log_method = self.logger.info if results else self.logger.debug log_method( _("Successful rsync of %(src)s at %(dst)s (%(time).03f)"), {'src': args[-2], 'dst': args[-1], 'time': total_time}) return ret_val def rsync(self, node, job, suffixes): """ Uses rsync to implement the sync method. This was the first sync method in Swift. """ if not os.path.exists(job['path']): return False, {} args = [ 'rsync', '--recursive', '--whole-file', '--human-readable', '--xattrs', '--itemize-changes', '--ignore-existing', '--timeout=%s' % self.rsync_io_timeout, '--contimeout=%s' % self.rsync_io_timeout, '--bwlimit=%s' % self.rsync_bwlimit, '--exclude=.*.%s' % ''.join('[0-9a-zA-Z]' for i in range(6)) ] if self.rsync_compress and \ job['region'] != node['region']: # Allow for compression, but only if the remote node is in # a different region than the local one. args.append('--compress') rsync_module = rsync_module_interpolation(self.rsync_module, node) had_any = False for suffix in suffixes: spath = join(job['path'], suffix) if os.path.exists(spath): args.append(spath) had_any = True if not had_any: return False, {} data_dir = get_data_dir(job['policy']) args.append(join(rsync_module, node['device'], data_dir, job['partition'])) return self._rsync(args) == 0, {} def ssync(self, node, job, suffixes, remote_check_objs=None): return ssync_sender.Sender( self, node, job, suffixes, remote_check_objs)() def check_ring(self, object_ring): """ Check to see if the ring has been updated :param object_ring: the ring to check :returns: boolean indicating whether or not the ring has changed """ if time.time() > self.next_check: self.next_check = time.time() + self.ring_check_interval if object_ring.has_changed(): return False return True def update_deleted(self, job): """ High-level method that replicates a single partition that doesn't belong on this node. 
:param job: a dict containing info about the partition to be replicated """ def tpool_get_suffixes(path): return [suff for suff in os.listdir(path) if len(suff) == 3 and isdir(join(path, suff))] self.replication_count += 1 self.logger.increment('partition.delete.count.%s' % (job['device'],)) headers = dict(self.default_headers) headers['X-Backend-Storage-Policy-Index'] = int(job['policy']) failure_devs_info = set() begin = time.time() handoff_partition_deleted = False try: responses = [] suffixes = tpool.execute(tpool_get_suffixes, job['path']) synced_remote_regions = {} delete_objs = None if suffixes: for node in job['nodes']: self.stats['rsync'] += 1 kwargs = {} if node['region'] in synced_remote_regions and \ self.conf.get('sync_method', 'rsync') == 'ssync': kwargs['remote_check_objs'] = \ synced_remote_regions[node['region']] # candidates is a dict(hash=>timestamp) of objects # for deletion success, candidates = self.sync( node, job, suffixes, **kwargs) if success: with Timeout(self.http_timeout): conn = http_connect( node['replication_ip'], node['replication_port'], node['device'], job['partition'], 'REPLICATE', '/' + '-'.join(suffixes), headers=headers) conn.getresponse().read() if node['region'] != job['region']: synced_remote_regions[node['region']] = viewkeys( candidates) else: failure_devs_info.add((node['replication_ip'], node['device'])) responses.append(success) for cand_objs in synced_remote_regions.values(): if delete_objs is None: delete_objs = cand_objs else: delete_objs = delete_objs & cand_objs if self.handoff_delete: # delete handoff if we have had handoff_delete successes delete_handoff = len([resp for resp in responses if resp]) >= \ self.handoff_delete else: # delete handoff if all syncs were successful delete_handoff = len(responses) == len(job['nodes']) and \ all(responses) if delete_handoff: self.stats['remove'] += 1 if (self.conf.get('sync_method', 'rsync') == 'ssync' and delete_objs is not None): self.logger.info(_("Removing %s objects"), len(delete_objs)) _junk, error_paths = self.delete_handoff_objs( job, delete_objs) # if replication works for a hand-off device and it failed, # the remote devices which are target of the replication # from the hand-off device will be marked. Because cleanup # after replication failed means replicator needs to # replicate again with the same info. 
if error_paths: failure_devs_info.update( [(failure_dev['replication_ip'], failure_dev['device']) for failure_dev in job['nodes']]) else: self.delete_partition(job['path']) handoff_partition_deleted = True elif not suffixes: self.delete_partition(job['path']) handoff_partition_deleted = True except (Exception, Timeout): self.logger.exception(_("Error syncing handoff partition")) self._add_failure_stats(failure_devs_info) finally: target_devs_info = set([(target_dev['replication_ip'], target_dev['device']) for target_dev in job['nodes']]) self.stats['success'] += len(target_devs_info - failure_devs_info) if not handoff_partition_deleted: self.handoffs_remaining += 1 self.partition_times.append(time.time() - begin) self.logger.timing_since('partition.delete.timing', begin) def delete_partition(self, path): self.logger.info(_("Removing partition: %s"), path) tpool.execute(shutil.rmtree, path) def delete_handoff_objs(self, job, delete_objs): success_paths = [] error_paths = [] for object_hash in delete_objs: object_path = storage_directory(job['obj_path'], job['partition'], object_hash) tpool.execute(shutil.rmtree, object_path, ignore_errors=True) suffix_dir = dirname(object_path) try: os.rmdir(suffix_dir) success_paths.append(object_path) except OSError as e: if e.errno not in (errno.ENOENT, errno.ENOTEMPTY): error_paths.append(object_path) self.logger.exception( "Unexpected error trying to cleanup suffix dir:%r", suffix_dir) return success_paths, error_paths def update(self, job): """ High-level method that replicates a single partition. :param job: a dict containing info about the partition to be replicated """ self.replication_count += 1 self.logger.increment('partition.update.count.%s' % (job['device'],)) headers = dict(self.default_headers) headers['X-Backend-Storage-Policy-Index'] = int(job['policy']) target_devs_info = set() failure_devs_info = set() begin = time.time() df_mgr = self._df_router[job['policy']] try: hashed, local_hash = tpool_reraise( df_mgr._get_hashes, job['device'], job['partition'], job['policy'], do_listdir=_do_listdir( int(job['partition']), self.replication_cycle)) self.suffix_hash += hashed self.logger.update_stats('suffix.hashes', hashed) attempts_left = len(job['nodes']) synced_remote_regions = set() random.shuffle(job['nodes']) nodes = itertools.chain( job['nodes'], job['policy'].object_ring.get_more_nodes( int(job['partition']))) while attempts_left > 0: # If this throws StopIteration it will be caught way below node = next(nodes) target_devs_info.add((node['replication_ip'], node['device'])) attempts_left -= 1 # if we have already synced to this remote region, # don't sync again on this replication pass if node['region'] in synced_remote_regions: continue try: with Timeout(self.http_timeout): resp = http_connect( node['replication_ip'], node['replication_port'], node['device'], job['partition'], 'REPLICATE', '', headers=headers).getresponse() if resp.status == HTTP_INSUFFICIENT_STORAGE: self.logger.error( _('%(replication_ip)s/%(device)s ' 'responded as unmounted'), node) attempts_left += 1 failure_devs_info.add((node['replication_ip'], node['device'])) continue if resp.status != HTTP_OK: self.logger.error(_("Invalid response %(resp)s " "from %(ip)s"), {'resp': resp.status, 'ip': node['replication_ip']}) failure_devs_info.add((node['replication_ip'], node['device'])) continue remote_hash = pickle.loads(resp.read()) del resp suffixes = [suffix for suffix in local_hash if local_hash[suffix] != remote_hash.get(suffix, -1)] if not suffixes: 
self.stats['hashmatch'] += 1 continue hashed, recalc_hash = tpool_reraise( df_mgr._get_hashes, job['device'], job['partition'], job['policy'], recalculate=suffixes) self.logger.update_stats('suffix.hashes', hashed) local_hash = recalc_hash suffixes = [suffix for suffix in local_hash if local_hash[suffix] != remote_hash.get(suffix, -1)] self.stats['rsync'] += 1 success, _junk = self.sync(node, job, suffixes) with Timeout(self.http_timeout): conn = http_connect( node['replication_ip'], node['replication_port'], node['device'], job['partition'], 'REPLICATE', '/' + '-'.join(suffixes), headers=headers) conn.getresponse().read() if not success: failure_devs_info.add((node['replication_ip'], node['device'])) # add only remote region when replicate succeeded if success and node['region'] != job['region']: synced_remote_regions.add(node['region']) self.suffix_sync += len(suffixes) self.logger.update_stats('suffix.syncs', len(suffixes)) except (Exception, Timeout): failure_devs_info.add((node['replication_ip'], node['device'])) self.logger.exception(_("Error syncing with node: %s") % node) self.suffix_count += len(local_hash) except StopIteration: self.logger.error('Ran out of handoffs while replicating ' 'partition %s of policy %d', job['partition'], int(job['policy'])) except (Exception, Timeout): failure_devs_info.update(target_devs_info) self.logger.exception(_("Error syncing partition")) finally: self._add_failure_stats(failure_devs_info) self.stats['success'] += len(target_devs_info - failure_devs_info) self.partition_times.append(time.time() - begin) self.logger.timing_since('partition.update.timing', begin) def stats_line(self): """ Logs various stats for the currently running replication pass. """ if self.replication_count: elapsed = (time.time() - self.start) or 0.000001 rate = self.replication_count / elapsed self.logger.info( _("%(replicated)d/%(total)d (%(percentage).2f%%)" " partitions replicated in %(time).2fs (%(rate).2f/sec, " "%(remaining)s remaining)"), {'replicated': self.replication_count, 'total': self.job_count, 'percentage': self.replication_count * 100.0 / self.job_count, 'time': time.time() - self.start, 'rate': rate, 'remaining': '%d%s' % compute_eta(self.start, self.replication_count, self.job_count)}) self.logger.info(_('%(success)s successes, %(failure)s failures') % self.stats) if self.suffix_count: self.logger.info( _("%(checked)d suffixes checked - " "%(hashed).2f%% hashed, %(synced).2f%% synced"), {'checked': self.suffix_count, 'hashed': (self.suffix_hash * 100.0) / self.suffix_count, 'synced': (self.suffix_sync * 100.0) / self.suffix_count}) self.partition_times.sort() self.logger.info( _("Partition times: max %(max).4fs, " "min %(min).4fs, med %(med).4fs"), {'max': self.partition_times[-1], 'min': self.partition_times[0], 'med': self.partition_times[ len(self.partition_times) // 2]}) else: self.logger.info( _("Nothing replicated for %s seconds."), (time.time() - self.start)) def kill_coros(self): """Utility function that kills all coroutines currently running.""" for coro in list(self.run_pool.coroutines_running): try: coro.kill(GreenletExit) except GreenletExit: pass def heartbeat(self): """ Loop that runs in the background during replication. It periodically logs progress. """ while True: eventlet.sleep(self.stats_interval) self.stats_line() def detect_lockups(self): """ In testing, the pool.waitall() call very occasionally failed to return. This is an attempt to make sure the replicator finishes its replication pass in some eventuality. 
""" while True: eventlet.sleep(self.lockup_timeout) if self.replication_count == self.last_replication_count: self.logger.error(_("Lockup detected.. killing live coros.")) self.kill_coros() self.last_replication_count = self.replication_count def build_replication_jobs(self, policy, ips, override_devices=None, override_partitions=None): """ Helper function for collect_jobs to build jobs for replication using replication style storage policy """ jobs = [] df_mgr = self._df_router[policy] self.all_devs_info.update( [(dev['replication_ip'], dev['device']) for dev in policy.object_ring.devs if dev]) data_dir = get_data_dir(policy) found_local = False for local_dev in [dev for dev in policy.object_ring.devs if (dev and is_local_device(ips, self.port, dev['replication_ip'], dev['replication_port']) and (override_devices is None or dev['device'] in override_devices))]: found_local = True dev_path = check_drive(self.devices_dir, local_dev['device'], self.mount_check) if not dev_path: self._add_failure_stats( [(failure_dev['replication_ip'], failure_dev['device']) for failure_dev in policy.object_ring.devs if failure_dev]) self.logger.warning( _('%s is not mounted'), local_dev['device']) continue obj_path = join(dev_path, data_dir) tmp_path = join(dev_path, get_tmp_dir(policy)) unlink_older_than(tmp_path, time.time() - df_mgr.reclaim_age) if not os.path.exists(obj_path): try: mkdirs(obj_path) except Exception: self.logger.exception('ERROR creating %s' % obj_path) continue for partition in os.listdir(obj_path): if (override_partitions is not None and partition not in override_partitions): continue if (partition.startswith('auditor_status_') and partition.endswith('.json')): # ignore auditor status files continue part_nodes = None try: job_path = join(obj_path, partition) part_nodes = policy.object_ring.get_part_nodes( int(partition)) nodes = [node for node in part_nodes if node['id'] != local_dev['id']] jobs.append( dict(path=job_path, device=local_dev['device'], obj_path=obj_path, nodes=nodes, delete=len(nodes) > len(part_nodes) - 1, policy=policy, partition=partition, region=local_dev['region'])) except ValueError: if part_nodes: self._add_failure_stats( [(failure_dev['replication_ip'], failure_dev['device']) for failure_dev in nodes]) else: self._add_failure_stats( [(failure_dev['replication_ip'], failure_dev['device']) for failure_dev in policy.object_ring.devs if failure_dev]) continue if not found_local: self.logger.error("Can't find itself in policy with index %d with" " ips %s and with port %s in ring file, not" " replicating", int(policy), ", ".join(ips), self.port) return jobs def collect_jobs(self, override_devices=None, override_partitions=None, override_policies=None): """ Returns a sorted list of jobs (dictionaries) that specify the partitions, nodes, etc to be rsynced. :param override_devices: if set, only jobs on these devices will be returned :param override_partitions: if set, only jobs on these partitions will be returned :param override_policies: if set, only jobs in these storage policies will be returned """ jobs = [] ips = whataremyips(self.bind_ip) for policy in POLICIES: # Skip replication if next_part_power is set. 
In this case # every object is hard-linked twice, but the replicator can't # detect them and would create a second copy of the file if not # yet existing - and this might double the actual transferred # and stored data next_part_power = getattr( policy.object_ring, 'next_part_power', None) if next_part_power is not None: self.logger.warning( _("next_part_power set in policy '%s'. Skipping"), policy.name) continue if policy.policy_type == REPL_POLICY: if (override_policies is not None and str(policy.idx) not in override_policies): continue # ensure rings are loaded for policy self.load_object_ring(policy) jobs += self.build_replication_jobs( policy, ips, override_devices=override_devices, override_partitions=override_partitions) random.shuffle(jobs) if self.handoffs_first: # Move the handoff parts to the front of the list jobs.sort(key=lambda job: not job['delete']) self.job_count = len(jobs) return jobs def replicate(self, override_devices=None, override_partitions=None, override_policies=None): """Run a replication pass""" self.start = time.time() self.suffix_count = 0 self.suffix_sync = 0 self.suffix_hash = 0 self.replication_count = 0 self.last_replication_count = -1 self.replication_cycle = (self.replication_cycle + 1) % 10 self.partition_times = [] self.my_replication_ips = self._get_my_replication_ips() self.all_devs_info = set() self.handoffs_remaining = 0 stats = eventlet.spawn(self.heartbeat) lockup_detector = eventlet.spawn(self.detect_lockups) eventlet.sleep() # Give spawns a cycle current_nodes = None try: self.run_pool = GreenPool(size=self.concurrency) jobs = self.collect_jobs(override_devices=override_devices, override_partitions=override_partitions, override_policies=override_policies) for job in jobs: current_nodes = job['nodes'] if override_devices and job['device'] not in override_devices: continue if override_partitions and \ job['partition'] not in override_partitions: continue dev_path = check_drive(self.devices_dir, job['device'], self.mount_check) if not dev_path: self._add_failure_stats([(failure_dev['replication_ip'], failure_dev['device']) for failure_dev in job['nodes']]) self.logger.warning(_('%s is not mounted'), job['device']) continue if self.handoffs_first and not job['delete']: # in handoffs first mode, we won't process primary # partitions until rebalance was successful! if self.handoffs_remaining: self.logger.warning(_( "Handoffs first mode still has handoffs " "remaining. Aborting current " "replication pass.")) break if not self.check_ring(job['policy'].object_ring): self.logger.info(_("Ring change detected. Aborting " "current replication pass.")) return try: if isfile(job['path']): # Clean up any (probably zero-byte) files where a # partition should be. 
self.logger.warning( 'Removing partition directory ' 'which was a file: %s', job['path']) os.remove(job['path']) continue except OSError: continue if job['delete']: self.run_pool.spawn(self.update_deleted, job) else: self.run_pool.spawn(self.update, job) current_nodes = None with Timeout(self.lockup_timeout): self.run_pool.waitall() except (Exception, Timeout): if current_nodes: self._add_failure_stats([(failure_dev['replication_ip'], failure_dev['device']) for failure_dev in current_nodes]) else: self._add_failure_stats(self.all_devs_info) self.logger.exception(_("Exception in top-level replication loop")) self.kill_coros() finally: stats.kill() lockup_detector.kill() self.stats_line() self.stats['attempted'] = self.replication_count def run_once(self, *args, **kwargs): self._zero_stats() self.logger.info(_("Running object replicator in script mode.")) override_devices = list_from_csv(kwargs.get('devices')) override_partitions = list_from_csv(kwargs.get('partitions')) override_policies = list_from_csv(kwargs.get('policies')) if not override_devices: override_devices = None if not override_partitions: override_partitions = None if not override_policies: override_policies = None self.replicate( override_devices=override_devices, override_partitions=override_partitions, override_policies=override_policies) total = (time.time() - self.stats['start']) / 60 self.logger.info( _("Object replication complete (once). (%.02f minutes)"), total) if not (override_partitions or override_devices): replication_last = time.time() dump_recon_cache({'replication_stats': self.stats, 'replication_time': total, 'replication_last': replication_last, 'object_replication_time': total, 'object_replication_last': replication_last}, self.rcache, self.logger) def run_forever(self, *args, **kwargs): self.logger.info(_("Starting object replicator in daemon mode.")) # Run the replicator continually while True: self._zero_stats() self.logger.info(_("Starting object replication pass.")) # Run the replicator self.replicate() total = (time.time() - self.stats['start']) / 60 self.logger.info( _("Object replication complete. (%.02f minutes)"), total) replication_last = time.time() dump_recon_cache({'replication_stats': self.stats, 'replication_time': total, 'replication_last': replication_last, 'object_replication_time': total, 'object_replication_last': replication_last}, self.rcache, self.logger) self.logger.debug('Replication sleeping for %s seconds.', self.interval) sleep(self.interval)
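The replicate() pass above follows a common eventlet shape: one green thread per job in a bounded GreenPool, waitall() guarded by a Timeout, and kill_coros() as the last-resort escape hatch. Below is a minimal, self-contained sketch of that shape; fake_job, run_pass and the timeout values are invented for illustration and are not part of Swift.

import eventlet
from eventlet import GreenPool, Timeout
from greenlet import GreenletExit


def fake_job(seconds):
    # Stand-in for update()/update_deleted(): just sleep cooperatively.
    eventlet.sleep(seconds)


def run_pass(jobs, concurrency=4, lockup_timeout=5):
    pool = GreenPool(size=concurrency)
    for job in jobs:
        pool.spawn(fake_job, job)
    try:
        with Timeout(lockup_timeout):
            pool.waitall()
    except Timeout:
        # Mirror kill_coros(): forcibly stop whatever is still running.
        for coro in list(pool.coroutines_running):
            try:
                coro.kill(GreenletExit)
            except GreenletExit:
                pass


if __name__ == '__main__':
    run_pass([0.1] * 10)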
class Service(object): def __init__(self, check_task_internal='', *task_info): """ task_info should be a sequence of dictionaries, each with the keys 'task_name' and 'task_desc'. """ global DEBUG if DEBUG: self.logger = get_debug_logger("Service") else: self.logger = get_default_logger("Service") self.config = get_default_config() if check_task_internal and int(check_task_internal): self.check_task_internal = int(check_task_internal) else: self.check_task_internal = int( self.config.get_option_value('default', 'check_task_internal')) self.enable_backdoor = self.config.get_option_value( 'eventlet_backdoor', 'enable') self.backdoor_port = self.config.get_option_value( 'eventlet_backdoor', 'port') self.task_info = task_info self.pool = GreenPool() self.control = None self.task_queue_list = [] self.task_status = {} self.task_thread_list = [] self._done = event.Event() def _get_task_status(self, queue): task_status = {} status = queue.get() if status: for item in status.split(','): key, value = item.split(':') task_status[key.strip()] = value.strip() return task_status def launch_control_task(self, *queue_list, **task_status): while True: eventlet.sleep(self.check_task_internal) for queue in queue_list: status = self._get_task_status(queue) if status: task_status[status['task_name']] = status def _get_task_obj(self, task_name): task_module = import_utils(task_name) for task_subclass_name, task_subclass in get_subclass( task_module, Task): if task_subclass: return task_subclass return None def start(self): if len(self.task_info) == 0: self.logger.info("No task to execute!") self.stop() wsgi_tasks = WSGITask() wsgi_task_names = [] #wsgi_url_map_app = {} for task_info in self.task_info: task_name = task_info['task_name'] task_obj = self._get_task_obj(task_name) task_queue = eventlet.queue.LifoQueue() self.task_queue_list.append(task_queue) task = task_obj(task_queue) if task.task_type == 'standalone': task_thread = self.pool.spawn(task.start) self.task_thread_list.append(task_thread) elif task.task_type == 'subwsgi': #if task.url_map_app: # # If there is the same url mapping to the different apps, # then the url will direct to the last app # # wsgi_url_map_app.update(task.url_map_app) task.register(wsgi_tasks.mapper, wsgi_tasks.loader) wsgi_task_names.append(task.task_name) # Start the wsgi tasks binding to the whole port #for url, app in url_map_app.items(): # wsgi_tasks.register(url, app) if len(wsgi_task_names) >= 1: self.logger.info("Will run WSGI tasks:%s in a single thread..."
% wsgi_task_names) task_thread = self.pool.spawn(wsgi_tasks.start) self.task_thread_list.append(task_thread) # launch the control task task_thread = self.pool.spawn(self.launch_control_task, *self.task_queue_list, **self.task_status) self.task_thread_list.append(task_thread) if self.enable_backdoor.lower() == 'true': self.open_backdoor() def wait(self): self._done.wait() def stop(self): for task_thread in self.task_thread_list: task_thread.kill() if not self._done.ready(): self._done.send() def restart(self): pass def list_task_threads(self): print self.task_thread_list def list_task_queues(self): print self.task_queue_list def list_task_status(self): print self.task_status def open_backdoor(self): backdoor_locals = { 'list_task_threads': self.list_task_threads, 'list_task_queues': self.list_task_queues, 'list_task_status': self.list_task_status, 'stop': self.stop } self.backdoor_port = self.config.get_option_value( 'eventlet_backdoor', 'port') self.pool.spawn(backdoor.backdoor_server, eventlet.listen( ('localhost', int(self.backdoor_port))), locals=backdoor_locals)
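The Service class above parks its caller on an eventlet Event until stop() is called. The following sketch shows just that start()/wait()/stop() idiom under assumed names (TinyService and the worker names are hypothetical, not part of the module above).

import eventlet
from eventlet import GreenPool
from eventlet.event import Event


class TinyService(object):
    """Illustrative only; mirrors the start()/wait()/stop() shape above."""

    def __init__(self):
        self.pool = GreenPool()
        self.task_thread_list = []
        self._done = Event()

    def _task(self, name):
        while True:
            eventlet.sleep(1)  # a long-running task loop

    def start(self):
        for name in ('worker-1', 'worker-2'):  # hypothetical task names
            self.task_thread_list.append(self.pool.spawn(self._task, name))

    def wait(self):
        # Block the caller until stop() fires the event.
        self._done.wait()

    def stop(self):
        for thread in self.task_thread_list:
            thread.kill()
        if not self._done.ready():
            self._done.send()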
class ObjectReplicator(Daemon): """ Replicate objects. Encapsulates most logic and data needed by the object replication process. Each call to .replicate() performs one replication pass. It's up to the caller to do this in a loop. """ def __init__(self, conf, logger=None): """ :param conf: configuration object obtained from ConfigParser :param logger: logging object """ self.conf = conf self.logger = logger or get_logger(conf, log_route='object-replicator') self.devices_dir = conf.get('devices', '/srv/node') self.mount_check = config_true_value(conf.get('mount_check', 'true')) self.swift_dir = conf.get('swift_dir', '/etc/swift') self.bind_ip = conf.get('bind_ip', '0.0.0.0') self.servers_per_port = int(conf.get('servers_per_port', '0') or 0) self.port = None if self.servers_per_port else \ int(conf.get('bind_port', 6200)) self.concurrency = int(conf.get('concurrency', 1)) self.stats_interval = int(conf.get('stats_interval', '300')) self.ring_check_interval = int(conf.get('ring_check_interval', 15)) self.next_check = time.time() + self.ring_check_interval self.reclaim_age = int(conf.get('reclaim_age', 86400 * 7)) self.replication_cycle = random.randint(0, 9) self.partition_times = [] self.interval = int( conf.get('interval') or conf.get('run_pause') or 30) self.rsync_timeout = int( conf.get('rsync_timeout', DEFAULT_RSYNC_TIMEOUT)) self.rsync_io_timeout = conf.get('rsync_io_timeout', '30') self.rsync_bwlimit = conf.get('rsync_bwlimit', '0') self.rsync_compress = config_true_value( conf.get('rsync_compress', 'no')) self.rsync_module = conf.get('rsync_module', '').rstrip('/') if not self.rsync_module: self.rsync_module = '{replication_ip}::object' if config_true_value(conf.get('vm_test_mode', 'no')): self.logger.warning('Option object-replicator/vm_test_mode ' 'is deprecated and will be removed in a ' 'future version. 
Update your ' 'configuration to use option ' 'object-replicator/rsync_module.') self.rsync_module += '{replication_port}' self.http_timeout = int(conf.get('http_timeout', 60)) self.lockup_timeout = int(conf.get('lockup_timeout', 1800)) self.recon_cache_path = conf.get('recon_cache_path', '/var/cache/swift') self.rcache = os.path.join(self.recon_cache_path, "object.recon") self.conn_timeout = float(conf.get('conn_timeout', 0.5)) self.node_timeout = float(conf.get('node_timeout', 10)) self.sync_method = getattr(self, conf.get('sync_method') or 'rsync') self.network_chunk_size = int(conf.get('network_chunk_size', 65536)) self.default_headers = { 'Content-Length': '0', 'user-agent': 'object-replicator %s' % os.getpid() } self.rsync_error_log_line_length = \ int(conf.get('rsync_error_log_line_length', 0)) self.handoffs_first = config_true_value( conf.get('handoffs_first', False)) self.handoff_delete = config_auto_int_value( conf.get('handoff_delete', 'auto'), 0) if any((self.handoff_delete, self.handoffs_first)): self.logger.warning('Handoff only mode is not intended for normal ' 'operation, please disable handoffs_first and ' 'handoff_delete before the next ' 'normal rebalance') self._diskfile_mgr = DiskFileManager(conf, self.logger) def _zero_stats(self): """Zero out the stats.""" self.stats = { 'attempted': 0, 'success': 0, 'failure': 0, 'hashmatch': 0, 'rsync': 0, 'remove': 0, 'start': time.time(), 'failure_nodes': {} } def _add_failure_stats(self, failure_devs_info): for node, dev in failure_devs_info: self.stats['failure'] += 1 failure_devs = self.stats['failure_nodes'].setdefault(node, {}) failure_devs.setdefault(dev, 0) failure_devs[dev] += 1 def _get_my_replication_ips(self): my_replication_ips = set() ips = whataremyips() for policy in POLICIES: self.load_object_ring(policy) for local_dev in [ dev for dev in policy.object_ring.devs if dev and dev['replication_ip'] in ips and dev['replication_port'] == self.port ]: my_replication_ips.add(local_dev['replication_ip']) return list(my_replication_ips) # Just exists for doc anchor point def sync(self, node, job, suffixes, *args, **kwargs): """ Synchronize local suffix directories from a partition with a remote node. :param node: the "dev" entry for the remote node to sync with :param job: information about the partition being synced :param suffixes: a list of suffixes which need to be pushed :returns: boolean and dictionary, boolean indicating success or failure """ return self.sync_method(node, job, suffixes, *args, **kwargs) def load_object_ring(self, policy): """ Make sure the policy's rings are loaded. :param policy: the StoragePolicy instance :returns: appropriate ring object """ policy.load_ring(self.swift_dir) return policy.object_ring def _rsync(self, args): """ Execute the rsync binary to replicate a partition. :returns: return code of rsync process. 
0 is successful """ start_time = time.time() ret_val = None try: with Timeout(self.rsync_timeout): proc = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) results = proc.stdout.read() ret_val = proc.wait() except Timeout: self.logger.error(_("Killing long-running rsync: %s"), str(args)) proc.kill() return 1 # failure response code total_time = time.time() - start_time for result in results.split('\n'): if result == '': continue if result.startswith('cd+'): continue if not ret_val: self.logger.info(result) else: self.logger.error(result) if ret_val: error_line = _('Bad rsync return code: %(ret)d <- %(args)s') % \ {'args': str(args), 'ret': ret_val} if self.rsync_error_log_line_length: error_line = error_line[:self.rsync_error_log_line_length] self.logger.error(error_line) else: log_method = self.logger.info if results else self.logger.debug log_method( _("Successful rsync of %(src)s at %(dst)s (%(time).03f)"), { 'src': args[-2], 'dst': args[-1], 'time': total_time }) return ret_val def rsync(self, node, job, suffixes): """ Uses rsync to implement the sync method. This was the first sync method in Swift. """ if not os.path.exists(job['path']): return False, {} args = [ 'rsync', '--recursive', '--whole-file', '--human-readable', '--xattrs', '--itemize-changes', '--ignore-existing', '--timeout=%s' % self.rsync_io_timeout, '--contimeout=%s' % self.rsync_io_timeout, '--bwlimit=%s' % self.rsync_bwlimit, '--exclude=.*.%s' % ''.join('[0-9a-zA-Z]' for i in range(6)) ] if self.rsync_compress and \ job['region'] != node['region']: # Allow for compression, but only if the remote node is in # a different region than the local one. args.append('--compress') rsync_module = rsync_module_interpolation(self.rsync_module, node) had_any = False for suffix in suffixes: spath = join(job['path'], suffix) if os.path.exists(spath): args.append(spath) had_any = True if not had_any: return False, {} data_dir = get_data_dir(job['policy']) args.append( join(rsync_module, node['device'], data_dir, job['partition'])) return self._rsync(args) == 0, {} def ssync(self, node, job, suffixes, remote_check_objs=None): return ssync_sender.Sender(self, node, job, suffixes, remote_check_objs)() def check_ring(self, object_ring): """ Check to see if the ring has been updated :param object_ring: the ring to check :returns: boolean indicating whether or not the ring has changed """ if time.time() > self.next_check: self.next_check = time.time() + self.ring_check_interval if object_ring.has_changed(): return False return True def update_deleted(self, job): """ High-level method that replicates a single partition that doesn't belong on this node. 
:param job: a dict containing info about the partition to be replicated """ def tpool_get_suffixes(path): return [ suff for suff in os.listdir(path) if len(suff) == 3 and isdir(join(path, suff)) ] self.replication_count += 1 self.logger.increment('partition.delete.count.%s' % (job['device'], )) headers = dict(self.default_headers) headers['X-Backend-Storage-Policy-Index'] = int(job['policy']) failure_devs_info = set() begin = time.time() handoff_partition_deleted = False try: responses = [] suffixes = tpool.execute(tpool_get_suffixes, job['path']) synced_remote_regions = {} delete_objs = None if suffixes: for node in job['nodes']: self.stats['rsync'] += 1 kwargs = {} if node['region'] in synced_remote_regions and \ self.conf.get('sync_method', 'rsync') == 'ssync': kwargs['remote_check_objs'] = \ synced_remote_regions[node['region']] # candidates is a dict(hash=>timestamp) of objects # for deletion success, candidates = self.sync(node, job, suffixes, **kwargs) if success: with Timeout(self.http_timeout): conn = http_connect(node['replication_ip'], node['replication_port'], node['device'], job['partition'], 'REPLICATE', '/' + '-'.join(suffixes), headers=headers) conn.getresponse().read() if node['region'] != job['region']: synced_remote_regions[node['region']] = viewkeys( candidates) else: failure_devs_info.add( (node['replication_ip'], node['device'])) responses.append(success) for region, cand_objs in synced_remote_regions.items(): if delete_objs is None: delete_objs = cand_objs else: delete_objs = delete_objs & cand_objs if self.handoff_delete: # delete handoff if we have had handoff_delete successes delete_handoff = len([resp for resp in responses if resp]) >= \ self.handoff_delete else: # delete handoff if all syncs were successful delete_handoff = len(responses) == len(job['nodes']) and \ all(responses) if delete_handoff: self.stats['remove'] += 1 if (self.conf.get('sync_method', 'rsync') == 'ssync' and delete_objs is not None): self.logger.info(_("Removing %s objects"), len(delete_objs)) _junk, error_paths = self.delete_handoff_objs( job, delete_objs) # if replication works for a hand-off device and it failed, # the remote devices which are target of the replication # from the hand-off device will be marked. Because cleanup # after replication failed means replicator needs to # replicate again with the same info. 
if error_paths: failure_devs_info.update([ (failure_dev['replication_ip'], failure_dev['device']) for failure_dev in job['nodes'] ]) else: self.delete_partition(job['path']) handoff_partition_deleted = True elif not suffixes: self.delete_partition(job['path']) handoff_partition_deleted = True except (Exception, Timeout): self.logger.exception(_("Error syncing handoff partition")) self._add_failure_stats(failure_devs_info) finally: target_devs_info = set([(target_dev['replication_ip'], target_dev['device']) for target_dev in job['nodes']]) self.stats['success'] += len(target_devs_info - failure_devs_info) if not handoff_partition_deleted: self.handoffs_remaining += 1 self.partition_times.append(time.time() - begin) self.logger.timing_since('partition.delete.timing', begin) def delete_partition(self, path): self.logger.info(_("Removing partition: %s"), path) tpool.execute(shutil.rmtree, path) def delete_handoff_objs(self, job, delete_objs): success_paths = [] error_paths = [] for object_hash in delete_objs: object_path = storage_directory(job['obj_path'], job['partition'], object_hash) tpool.execute(shutil.rmtree, object_path, ignore_errors=True) suffix_dir = dirname(object_path) try: os.rmdir(suffix_dir) success_paths.append(object_path) except OSError as e: if e.errno not in (errno.ENOENT, errno.ENOTEMPTY): error_paths.append(object_path) self.logger.exception( "Unexpected error trying to cleanup suffix dir:%r", suffix_dir) return success_paths, error_paths def update(self, job): """ High-level method that replicates a single partition. :param job: a dict containing info about the partition to be replicated """ self.replication_count += 1 self.logger.increment('partition.update.count.%s' % (job['device'], )) headers = dict(self.default_headers) headers['X-Backend-Storage-Policy-Index'] = int(job['policy']) target_devs_info = set() failure_devs_info = set() begin = time.time() try: hashed, local_hash = tpool_reraise(self._diskfile_mgr._get_hashes, job['path'], do_listdir=_do_listdir( int(job['partition']), self.replication_cycle), reclaim_age=self.reclaim_age) self.suffix_hash += hashed self.logger.update_stats('suffix.hashes', hashed) attempts_left = len(job['nodes']) synced_remote_regions = set() random.shuffle(job['nodes']) nodes = itertools.chain( job['nodes'], job['policy'].object_ring.get_more_nodes( int(job['partition']))) while attempts_left > 0: # If this throws StopIteration it will be caught way below node = next(nodes) target_devs_info.add((node['replication_ip'], node['device'])) attempts_left -= 1 # if we have already synced to this remote region, # don't sync again on this replication pass if node['region'] in synced_remote_regions: continue try: with Timeout(self.http_timeout): resp = http_connect(node['replication_ip'], node['replication_port'], node['device'], job['partition'], 'REPLICATE', '', headers=headers).getresponse() if resp.status == HTTP_INSUFFICIENT_STORAGE: self.logger.error( _('%(replication_ip)s/%(device)s ' 'responded as unmounted'), node) attempts_left += 1 failure_devs_info.add( (node['replication_ip'], node['device'])) continue if resp.status != HTTP_OK: self.logger.error( _("Invalid response %(resp)s " "from %(ip)s"), { 'resp': resp.status, 'ip': node['replication_ip'] }) failure_devs_info.add( (node['replication_ip'], node['device'])) continue remote_hash = pickle.loads(resp.read()) del resp suffixes = [ suffix for suffix in local_hash if local_hash[suffix] != remote_hash.get(suffix, -1) ] if not suffixes: self.stats['hashmatch'] += 1 continue hashed, 
recalc_hash = tpool_reraise( self._diskfile_mgr._get_hashes, job['path'], recalculate=suffixes, reclaim_age=self.reclaim_age) self.logger.update_stats('suffix.hashes', hashed) local_hash = recalc_hash suffixes = [ suffix for suffix in local_hash if local_hash[suffix] != remote_hash.get(suffix, -1) ] self.stats['rsync'] += 1 success, _junk = self.sync(node, job, suffixes) with Timeout(self.http_timeout): conn = http_connect(node['replication_ip'], node['replication_port'], node['device'], job['partition'], 'REPLICATE', '/' + '-'.join(suffixes), headers=headers) conn.getresponse().read() if not success: failure_devs_info.add( (node['replication_ip'], node['device'])) # add only remote region when replicate succeeded if success and node['region'] != job['region']: synced_remote_regions.add(node['region']) self.suffix_sync += len(suffixes) self.logger.update_stats('suffix.syncs', len(suffixes)) except (Exception, Timeout): failure_devs_info.add( (node['replication_ip'], node['device'])) self.logger.exception( _("Error syncing with node: %s") % node) self.suffix_count += len(local_hash) except (Exception, Timeout): failure_devs_info.update(target_devs_info) self._add_failure_stats(failure_devs_info) self.logger.exception(_("Error syncing partition")) finally: self.stats['success'] += len(target_devs_info - failure_devs_info) self.partition_times.append(time.time() - begin) self.logger.timing_since('partition.update.timing', begin) def stats_line(self): """ Logs various stats for the currently running replication pass. """ if self.replication_count: elapsed = (time.time() - self.start) or 0.000001 rate = self.replication_count / elapsed self.logger.info( _("%(replicated)d/%(total)d (%(percentage).2f%%)" " partitions replicated in %(time).2fs (%(rate).2f/sec, " "%(remaining)s remaining)"), { 'replicated': self.replication_count, 'total': self.job_count, 'percentage': self.replication_count * 100.0 / self.job_count, 'time': time.time() - self.start, 'rate': rate, 'remaining': '%d%s' % compute_eta(self.start, self.replication_count, self.job_count) }) self.logger.info( _('%(success)s successes, %(failure)s failures') % self.stats) if self.suffix_count: self.logger.info( _("%(checked)d suffixes checked - " "%(hashed).2f%% hashed, %(synced).2f%% synced"), { 'checked': self.suffix_count, 'hashed': (self.suffix_hash * 100.0) / self.suffix_count, 'synced': (self.suffix_sync * 100.0) / self.suffix_count }) self.partition_times.sort() self.logger.info( _("Partition times: max %(max).4fs, " "min %(min).4fs, med %(med).4fs"), { 'max': self.partition_times[-1], 'min': self.partition_times[0], 'med': self.partition_times[len(self.partition_times) // 2] }) else: self.logger.info(_("Nothing replicated for %s seconds."), (time.time() - self.start)) def kill_coros(self): """Utility function that kills all coroutines currently running.""" for coro in list(self.run_pool.coroutines_running): try: coro.kill(GreenletExit) except GreenletExit: pass def heartbeat(self): """ Loop that runs in the background during replication. It periodically logs progress. """ while True: eventlet.sleep(self.stats_interval) self.stats_line() def detect_lockups(self): """ In testing, the pool.waitall() call very occasionally failed to return. This is an attempt to make sure the replicator finishes its replication pass in some eventuality. """ while True: eventlet.sleep(self.lockup_timeout) if self.replication_count == self.last_replication_count: self.logger.error(_("Lockup detected.. 
killing live coros.")) self.kill_coros() self.last_replication_count = self.replication_count def build_replication_jobs(self, policy, ips, override_devices=None, override_partitions=None): """ Helper function for collect_jobs to build jobs for replication using replication style storage policy """ jobs = [] self.all_devs_info.update([(dev['replication_ip'], dev['device']) for dev in policy.object_ring.devs if dev]) data_dir = get_data_dir(policy) found_local = False for local_dev in [ dev for dev in policy.object_ring.devs if (dev and is_local_device(ips, self.port, dev['replication_ip'], dev['replication_port']) and (override_devices is None or dev['device'] in override_devices)) ]: found_local = True dev_path = join(self.devices_dir, local_dev['device']) obj_path = join(dev_path, data_dir) tmp_path = join(dev_path, get_tmp_dir(policy)) if self.mount_check and not ismount(dev_path): self._add_failure_stats([ (failure_dev['replication_ip'], failure_dev['device']) for failure_dev in policy.object_ring.devs if failure_dev ]) self.logger.warning(_('%s is not mounted'), local_dev['device']) continue unlink_older_than(tmp_path, time.time() - self.reclaim_age) if not os.path.exists(obj_path): try: mkdirs(obj_path) except Exception: self.logger.exception('ERROR creating %s' % obj_path) continue for partition in os.listdir(obj_path): if (override_partitions is not None and partition not in override_partitions): continue if (partition.startswith('auditor_status_') and partition.endswith('.json')): # ignore auditor status files continue part_nodes = None try: job_path = join(obj_path, partition) part_nodes = policy.object_ring.get_part_nodes( int(partition)) nodes = [ node for node in part_nodes if node['id'] != local_dev['id'] ] jobs.append( dict(path=job_path, device=local_dev['device'], obj_path=obj_path, nodes=nodes, delete=len(nodes) > len(part_nodes) - 1, policy=policy, partition=partition, region=local_dev['region'])) except ValueError: if part_nodes: self._add_failure_stats([ (failure_dev['replication_ip'], failure_dev['device']) for failure_dev in nodes ]) else: self._add_failure_stats([ (failure_dev['replication_ip'], failure_dev['device']) for failure_dev in policy.object_ring.devs if failure_dev ]) continue if not found_local: self.logger.error( "Can't find itself in policy with index %d with" " ips %s and with port %s in ring file, not" " replicating", int(policy), ", ".join(ips), self.port) return jobs def collect_jobs(self, override_devices=None, override_partitions=None, override_policies=None): """ Returns a sorted list of jobs (dictionaries) that specify the partitions, nodes, etc to be rsynced. 
:param override_devices: if set, only jobs on these devices will be returned :param override_partitions: if set, only jobs on these partitions will be returned :param override_policies: if set, only jobs in these storage policies will be returned """ jobs = [] ips = whataremyips(self.bind_ip) for policy in POLICIES: if policy.policy_type == REPL_POLICY: if (override_policies is not None and str(policy.idx) not in override_policies): continue # ensure rings are loaded for policy self.load_object_ring(policy) jobs += self.build_replication_jobs( policy, ips, override_devices=override_devices, override_partitions=override_partitions) random.shuffle(jobs) if self.handoffs_first: # Move the handoff parts to the front of the list jobs.sort(key=lambda job: not job['delete']) self.job_count = len(jobs) return jobs def replicate(self, override_devices=None, override_partitions=None, override_policies=None): """Run a replication pass""" self.start = time.time() self.suffix_count = 0 self.suffix_sync = 0 self.suffix_hash = 0 self.replication_count = 0 self.last_replication_count = -1 self.replication_cycle = (self.replication_cycle + 1) % 10 self.partition_times = [] self.my_replication_ips = self._get_my_replication_ips() self.all_devs_info = set() self.handoffs_remaining = 0 stats = eventlet.spawn(self.heartbeat) lockup_detector = eventlet.spawn(self.detect_lockups) eventlet.sleep() # Give spawns a cycle current_nodes = None try: self.run_pool = GreenPool(size=self.concurrency) jobs = self.collect_jobs(override_devices=override_devices, override_partitions=override_partitions, override_policies=override_policies) for job in jobs: current_nodes = job['nodes'] if override_devices and job['device'] not in override_devices: continue if override_partitions and \ job['partition'] not in override_partitions: continue dev_path = join(self.devices_dir, job['device']) if self.mount_check and not ismount(dev_path): self._add_failure_stats([(failure_dev['replication_ip'], failure_dev['device']) for failure_dev in job['nodes']]) self.logger.warning(_('%s is not mounted'), job['device']) continue if self.handoffs_first and not job['delete']: # in handoffs first mode, we won't process primary # partitions until rebalance was successful! if self.handoffs_remaining: self.logger.warning( _("Handoffs first mode still has handoffs " "remaining. Aborting current " "replication pass.")) break if not self.check_ring(job['policy'].object_ring): self.logger.info( _("Ring change detected. Aborting " "current replication pass.")) return try: if isfile(job['path']): # Clean up any (probably zero-byte) files where a # partition should be. 
self.logger.warning( 'Removing partition directory ' 'which was a file: %s', job['path']) os.remove(job['path']) continue except OSError: continue if job['delete']: self.run_pool.spawn(self.update_deleted, job) else: self.run_pool.spawn(self.update, job) current_nodes = None with Timeout(self.lockup_timeout): self.run_pool.waitall() except (Exception, Timeout): if current_nodes: self._add_failure_stats([(failure_dev['replication_ip'], failure_dev['device']) for failure_dev in current_nodes]) else: self._add_failure_stats(self.all_devs_info) self.logger.exception(_("Exception in top-level replication loop")) self.kill_coros() finally: stats.kill() lockup_detector.kill() self.stats_line() self.stats['attempted'] = self.replication_count def run_once(self, *args, **kwargs): self._zero_stats() self.logger.info(_("Running object replicator in script mode.")) override_devices = list_from_csv(kwargs.get('devices')) override_partitions = list_from_csv(kwargs.get('partitions')) override_policies = list_from_csv(kwargs.get('policies')) if not override_devices: override_devices = None if not override_partitions: override_partitions = None if not override_policies: override_policies = None self.replicate(override_devices=override_devices, override_partitions=override_partitions, override_policies=override_policies) total = (time.time() - self.stats['start']) / 60 self.logger.info( _("Object replication complete (once). (%.02f minutes)"), total) if not (override_partitions or override_devices): replication_last = time.time() dump_recon_cache( { 'replication_stats': self.stats, 'replication_time': total, 'replication_last': replication_last, 'object_replication_time': total, 'object_replication_last': replication_last }, self.rcache, self.logger) def run_forever(self, *args, **kwargs): self.logger.info(_("Starting object replicator in daemon mode.")) # Run the replicator continually while True: self._zero_stats() self.logger.info(_("Starting object replication pass.")) # Run the replicator self.replicate() total = (time.time() - self.stats['start']) / 60 self.logger.info(_("Object replication complete. (%.02f minutes)"), total) replication_last = time.time() dump_recon_cache( { 'replication_stats': self.stats, 'replication_time': total, 'replication_last': replication_last, 'object_replication_time': total, 'object_replication_last': replication_last }, self.rcache, self.logger) self.logger.debug('Replication sleeping for %s seconds.', self.interval) sleep(self.interval)
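Besides the worker pool, replicate() runs two helper greenlets, a heartbeat and a lockup detector, and kills them in a finally block. A rough, standalone sketch of that arrangement follows; the helper functions and the done counter are made up for illustration.

import eventlet
from eventlet import GreenPool


def heartbeat(interval=1):
    # Periodically report progress, like ObjectReplicator.heartbeat().
    while True:
        eventlet.sleep(interval)
        print('pass still running...')


def detect_lockups(get_count, lockup_timeout=5):
    # If the completed-job count stops moving, assume a lockup.
    last = -1
    while True:
        eventlet.sleep(lockup_timeout)
        current = get_count()
        if current == last:
            print('lockup detected')
        last = current


def one_pass(job_durations, concurrency=2):
    done = [0]
    stats = eventlet.spawn(heartbeat)
    lockup_detector = eventlet.spawn(detect_lockups, lambda: done[0])
    try:
        pool = GreenPool(size=concurrency)

        def job(seconds):
            eventlet.sleep(seconds)
            done[0] += 1

        for seconds in job_durations:
            pool.spawn(job, seconds)
        pool.waitall()
    finally:
        # The helpers are killed whether the pass succeeds or fails.
        stats.kill()
        lockup_detector.kill()


one_pass([0.1, 0.2, 0.3])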
class Chewie: """Facilitates EAP supplicant and RADIUS server communication""" RADIUS_UDP_PORT = 1812 PAE_GROUP_ADDRESS = MacAddress.from_string("01:80:C2:00:00:03") DEFAULT_PORT_UP_IDENTITY_REQUEST_WAIT_PERIOD = 20 DEFAULT_PREEMPTIVE_IDENTITY_REQUEST_INTERVAL = 60 # pylint: disable=too-many-arguments def __init__(self, interface_name, logger=None, auth_handler=None, failure_handler=None, logoff_handler=None, radius_server_ip=None, radius_server_port=None, radius_server_secret=None, chewie_id=None): self.interface_name = interface_name self.log_name = Chewie.__name__ if logger: self.log_name = logger.name + "." + Chewie.__name__ self.logger = get_logger(self.log_name) self.auth_handler = auth_handler self.failure_handler = failure_handler self.logoff_handler = logoff_handler self.radius_server_ip = radius_server_ip self.radius_secret = radius_server_secret self.radius_server_port = self.RADIUS_UDP_PORT if radius_server_port: self.radius_server_port = radius_server_port self.radius_listen_ip = "0.0.0.0" self.radius_listen_port = 0 self.chewie_id = "44-44-44-44-44-44:" # used by the RADIUS Attribute # 'Called-Station' in Access-Request if chewie_id: self.chewie_id = chewie_id self.state_machines = {} # port_id_str: { mac : state_machine} self.port_to_eapol_id = { } # port_id: last ID used in preemptive identity request. # TODO for port_to_eapol_id - may want to set ID to null (-1...) if sent from the # state machine. self.port_status = {} # port_id: status (true=up, false=down) self.port_to_identity_job = {} # port_id: timerJob self.eap_output_messages = Queue() self.radius_output_messages = Queue() self.radius_lifecycle = RadiusLifecycle(self.radius_secret, self.chewie_id, self.logger) self.timer_scheduler = timer_scheduler.TimerScheduler(self.logger) self.eap_socket = None self.mab_socket = None self.pool = None self.eventlets = None self.radius_socket = None self.interface_index = None self.eventlets = [] def run(self): """setup chewie and start socket eventlet threads""" self.logger.info("Starting") self.setup_eap_socket() self.setup_mab_socket() self.setup_radius_socket() self.start_threads_and_wait() def running(self): # pylint: disable=no-self-use """Used to nicely exit the event loops""" return True def shutdown(self): """kill eventlets and quit""" for eventlet in self.eventlets: eventlet.kill() def start_threads_and_wait(self): """Start the thread and wait until they complete (hopefully never)""" self.pool = GreenPool() self.eventlets.append(self.pool.spawn(self.send_eap_messages)) self.eventlets.append(self.pool.spawn(self.receive_eap_messages)) self.eventlets.append(self.pool.spawn(self.receive_mab_messages)) self.eventlets.append(self.pool.spawn(self.send_radius_messages)) self.eventlets.append(self.pool.spawn(self.receive_radius_messages)) self.eventlets.append(self.pool.spawn(self.timer_scheduler.run)) self.pool.waitall() def auth_success(self, src_mac, port_id, period, *args, **kwargs): # pylint: disable=unused-variable """authentication shim between faucet and chewie Args: src_mac (MacAddress): the mac of the successful supplicant port_id (MacAddress): the 'mac' identifier of what switch port the success is on period (int): time (seconds) until the session times out. 
""" if self.auth_handler: self.auth_handler(src_mac, port_id, *args, **kwargs) self.port_to_identity_job[port_id] = self.timer_scheduler.call_later( period, self.reauth_port, src_mac, port_id) def auth_failure(self, src_mac, port_id): """failure shim between faucet and chewie Args: src_mac (MacAddress): the mac of the failed supplicant port_id (MacAddress): the 'mac' identifier of what switch port the failure is on""" if self.failure_handler: self.failure_handler(src_mac, port_id) def auth_logoff(self, src_mac, port_id): """logoff shim between faucet and chewie Args: src_mac (MacAddress): the mac of the logoff supplicant port_id (MacAddress): the 'mac' identifier of what switch port the logoff is on""" if self.logoff_handler: self.logoff_handler(src_mac, port_id) def port_down(self, port_id): """ should be called by faucet when port has gone down. Args: port_id (str): id of port. """ # all chewie needs to do is change its internal state. # faucet will remove the acls by itself. self.set_port_status(port_id, False) job = self.port_to_identity_job.get(port_id, None) if port_id in self.state_machines: del self.state_machines[port_id] if job: job.cancel() self.port_to_eapol_id.pop(port_id, None) def port_up(self, port_id): """ should be called by faucet when port has come up Args: port_id (str): id of port. """ self.logger.info("port %s up", port_id) self.set_port_status(port_id, True) self.port_to_identity_job[port_id] = self.timer_scheduler.call_later( self.DEFAULT_PORT_UP_IDENTITY_REQUEST_WAIT_PERIOD, self.send_preemptive_identity_request_if_no_active_on_port, port_id) def send_preemptive_identity_request_if_no_active_on_port(self, port_id): """ If there is no active (in progress, or in state success(2)) supplicant send out the preemptive identity request message. Args: port_id (str): """ self.logger.debug( "thinking about executing timer preemptive on port %s", port_id) # schedule next request. self.port_to_identity_job[port_id] = self.timer_scheduler.call_later( self.DEFAULT_PREEMPTIVE_IDENTITY_REQUEST_INTERVAL, self.send_preemptive_identity_request_if_no_active_on_port, port_id) if not self.port_status.get(port_id, False): self.logger.debug('cant send output on port %s is down', port_id) return state_machines = self.state_machines.get(port_id, {}) # pylint: disable=invalid-name for sm in state_machines.values(): if sm.is_in_progress() or sm.is_success(): self.logger.debug('port is active not sending on port %s', port_id) break else: self.logger.debug("executing timer premptive on port %s", port_id) self.send_preemptive_identity_request(port_id) def send_preemptive_identity_request(self, port_id, state_machine=None): """ Message (EAP Identity Request) that notifies supplicant that port is using 802.1X Args: port_id (str): """ _id = get_random_id() # ID of preemptive reauth attempt must be different to ID of initial authentication. if state_machine is not None and hasattr(state_machine, 'current_id'): while _id == state_machine.current_id: _id = get_random_id() data = IdentityMessage(self.PAE_GROUP_ADDRESS, _id, Eap.REQUEST, "") self.port_to_eapol_id[port_id] = _id self.eap_output_messages.put_nowait( EapQueueMessage(data, self.PAE_GROUP_ADDRESS, MacAddress.from_string(port_id))) self.logger.info("sending premptive on port %s with ID %s", port_id, _id) def reauth_port(self, src_mac, port_id): """ Send an Identity Request to src_mac, on port_id. prompting the supplicant to re authenticate. 
        Args:
            src_mac (MacAddress):
            port_id (str):
        """
        state_machine = self.state_machines.get(port_id, {}).get(str(src_mac), None)
        if state_machine and state_machine.is_success():
            self.logger.info('reauthenticating src_mac: %s on port: %s',
                             src_mac, port_id)
            self.send_preemptive_identity_request(port_id, state_machine)
        elif state_machine is None:
            self.logger.debug(
                'not reauthing. state machine on port: %s, mac: %s is none',
                port_id, src_mac)
        else:
            self.logger.debug(
                "not reauthing, authentication is not in success(2) (state: %s)",
                state_machine.state)

    def set_port_status(self, port_id, status):
        """
        Send status of a port at port_id

        Args:
            port_id ():
            status ():
        """
        port_id_str = str(port_id)
        self.port_status[port_id] = status

        if port_id_str not in self.state_machines:
            self.state_machines[port_id_str] = {}

        for _, state_machine in self.state_machines[port_id_str].items():
            event = EventPortStatusChange(status)
            state_machine.event(event)

    def setup_eap_socket(self):
        """Setup EAP socket"""
        log_prefix = "%s.EapSocket" % self.logger.name
        self.eap_socket = EapSocket(self.interface_name, log_prefix)
        self.eap_socket.setup()

    def setup_mab_socket(self):
        """Setup MAB socket"""
        log_prefix = "%s.MabSocket" % self.logger.name
        self.mab_socket = MabSocket(self.interface_name, log_prefix)
        self.mab_socket.setup()

    def setup_radius_socket(self):
        """Setup RADIUS socket"""
        log_prefix = "%s.RadiusSocket" % self.logger.name
        self.radius_socket = RadiusSocket(self.radius_listen_ip,
                                          self.radius_listen_port,
                                          self.radius_server_ip,
                                          self.radius_server_port,
                                          log_prefix)
        self.radius_socket.setup()
        self.logger.info("Radius Listening on %s:%d",
                         self.radius_listen_ip, self.radius_listen_port)

    def send_eap_messages(self):
        """Send EAP messages to the supplicant forever."""
        while self.running():
            sleep(0)
            eap_queue_message = self.eap_output_messages.get()
            self.logger.info("Sending message %s from %s to %s",
                             eap_queue_message.message,
                             str(eap_queue_message.port_mac),
                             str(eap_queue_message.src_mac))
            self.eap_socket.send(
                MessagePacker.ethernet_pack(eap_queue_message.message,
                                            eap_queue_message.port_mac,
                                            eap_queue_message.src_mac))

    def send_eth_to_state_machine(self, packed_message):
        """Send an ethernet frame to the MAB state machine."""
        ethernet_packet = EthernetPacket.parse(packed_message)
        port_id = ethernet_packet.dst_mac
        src_mac = ethernet_packet.src_mac

        self.logger.info("Sending MAC to MAB State Machine: %s", src_mac)
        message_id = -2
        state_machine = self.get_state_machine(src_mac, port_id, message_id)
        event = EventMessageReceived(ethernet_packet, port_id)
        state_machine.event(event)
        # NOTE: Should probably throttle packets in once one is received

    def receive_eap_messages(self):
        """Receive EAP messages from the supplicant forever."""
        while self.running():
            sleep(0)
            self.logger.info("waiting for eap.")
            packed_message = self.eap_socket.receive()
            self.logger.info("Received packed_message: %s", str(packed_message))
            try:
                eap, dst_mac = MessageParser.ethernet_parse(packed_message)
            except MessageParseError as exception:
                self.logger.warning(
                    "MessageParser.ethernet_parse threw exception.\n"
                    " packed_message: '%s'.\n"
                    " exception: '%s'.",
                    packed_message, exception)
                continue

            self.logger.info("Received eap message: %s", str(eap))
            self.send_eap_to_state_machine(eap, dst_mac)

    def receive_mab_messages(self):
        """Receive DHCP requests for MAB forever."""
        while self.running():
            sleep(0)
            self.logger.info("waiting for MAB activity.")
            packed_message = self.mab_socket.receive()
            self.logger.info(
                "Received DHCP packet for MAB. packed_message: %s",
                str(packed_message))
            self.send_eth_to_state_machine(packed_message)

    def send_eap_to_state_machine(self, eap, dst_mac):
        """Send an EAP message to the state machine."""
        self.logger.info("eap EAP(): %s", eap)
        message_id = getattr(eap, 'message_id', -1)
        state_machine = self.get_state_machine(eap.src_mac, dst_mac, message_id)

        # Check for a response to a preemptive (chewie-initiated) EAP request
        preemptive_eap_message_id = self.port_to_eapol_id.get(str(dst_mac), -2)
        if message_id != -1 and message_id == preemptive_eap_message_id:
            self.logger.debug(
                'eap packet is response to chewie initiated authentication')
            event = EventPreemptiveEAPResponseMessageReceived(
                eap, dst_mac, preemptive_eap_message_id)
        else:
            event = EventMessageReceived(eap, dst_mac)

        state_machine.event(event)

    def send_radius_messages(self):
        """Send RADIUS messages to the RADIUS server forever."""
        while self.running():
            sleep(0)
            radius_output_bits = self.radius_output_messages.get()
            packed_message = self.radius_lifecycle.process_outbound(
                radius_output_bits)
            self.radius_socket.send(packed_message)
            self.logger.info("sent radius message.")

    def receive_radius_messages(self):
        """Receive RADIUS messages from the RADIUS server forever."""
        while self.running():
            sleep(0)
            self.logger.info("waiting for radius.")
            packed_message = self.radius_socket.receive()
            try:
                radius = MessageParser.radius_parse(packed_message,
                                                    self.radius_secret,
                                                    self.radius_lifecycle)
            except MessageParseError as exception:
                self.logger.warning(
                    "MessageParser.radius_parse threw exception.\n"
                    " packed_message: '%s'.\n"
                    " exception: '%s'.",
                    packed_message, exception)
                continue

            self.logger.info("Received RADIUS message: %s", str(radius))
            self.send_radius_to_state_machine(radius)

    def send_radius_to_state_machine(self, radius):
        """Send a RADIUS message to the state machine."""
        event = self.radius_lifecycle.build_event_radius_message_received(
            radius)
        state_machine = self.get_state_machine_from_radius_packet_id(
            radius.packet_id)
        state_machine.event(event)

    def get_state_machine_from_radius_packet_id(self, packet_id):
        """Gets a FullEAPStateMachine from the RADIUS message packet_id

        Args:
            packet_id (int): id of the received RADIUS message

        Returns:
            FullEAPStateMachine
        """
        return self.get_state_machine(
            **self.radius_lifecycle.packet_id_to_mac[packet_id])

    # TODO change message_id functionality
    def get_state_machine(self, src_mac, port_id, message_id=-1):
        """Gets the state machine for src_mac on port_id, creating one if it
        does not already exist.

        Args:
            message_id (int): EAP message id; -1 means none found.
            src_mac (MacAddress): MAC address whose state machine to get.
            port_id (MacAddress): ID of the port where the src_mac is.

        Returns:
            FullEAPStateMachine
        """
        port_id_str = str(port_id)
        src_mac_str = str(src_mac)
        port_state_machines = self.state_machines.get(port_id_str, None)
        if port_state_machines is None:
            self.state_machines[port_id_str] = {}

        self.logger.info("Port based state machines are as follows: %s",
                         self.state_machines[port_id_str])
        state_machine = self.state_machines[port_id_str].get(src_mac_str, None)

        if not state_machine and message_id == -2:
            # Do MAB
            self.logger.info("Creating MAB State Machine")
            log_prefix = "%s.SM - port: %s, client: %s" % (
                self.logger.name, port_id_str, src_mac)
            state_machine = MacAuthenticationBypassStateMachine(
                self.radius_output_messages, src_mac, self.timer_scheduler,
                self.auth_success, self.auth_failure, log_prefix)
            self.state_machines[port_id_str][src_mac_str] = state_machine
            return state_machine

        if not state_machine:
            self.logger.info("Creating EAP FULL State Machine")
            log_prefix = "%s.SM - port: %s, client: %s" % (
                self.logger.name, port_id_str, src_mac)
            state_machine = FullEAPStateMachine(self.eap_output_messages,
                                                self.radius_output_messages,
                                                src_mac, self.timer_scheduler,
                                                self.auth_success,
                                                self.auth_failure,
                                                self.auth_logoff, log_prefix)
            self.state_machines[port_id_str][src_mac_str] = state_machine
            self.logger.debug(
                "created new state machine for '%s' on port '%s'",
                src_mac_str, port_id_str)

        return state_machine
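A minimal, self-contained sketch of the get-or-create lookup that get_state_machine() performs above: state machines kept in a nested dict keyed first by port id, then by client MAC. The class names here are illustrative stand-ins, not part of chewie's API.

# Illustrative sketch only; DummyStateMachine and StateMachineRegistry are
# hypothetical names, not chewie classes.
class DummyStateMachine:
    """Stand-in for FullEAPStateMachine, used only for this sketch."""

    def __init__(self, src_mac, port_id):
        self.src_mac = src_mac
        self.port_id = port_id


class StateMachineRegistry:
    def __init__(self):
        # {port_id_str: {src_mac_str: state_machine}}
        self.state_machines = {}

    def get_state_machine(self, src_mac, port_id):
        port_id_str = str(port_id)
        src_mac_str = str(src_mac)
        port_machines = self.state_machines.setdefault(port_id_str, {})
        state_machine = port_machines.get(src_mac_str)
        if state_machine is None:
            # Create lazily on first sight of this port/MAC pair.
            state_machine = DummyStateMachine(src_mac, port_id)
            port_machines[src_mac_str] = state_machine
        return state_machine


registry = StateMachineRegistry()
sm_a = registry.get_state_machine('00:00:00:00:00:01', 1)
sm_b = registry.get_state_machine('00:00:00:00:00:01', 1)
assert sm_a is sm_b  # the same port/MAC pair reuses the existing machine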
    def run(self, *args, **kwargs):
        try:
            self.logger.info('event agent: starting')

            pool = GreenPool(len(self.workers))

            for worker in self.workers:
                pool.spawn(worker.start)

            def front(server, backend):
                while True:
                    msg = server.recv_multipart()
                    if validate_msg(msg):
                        try:
                            event_id = sqlite3.Binary(msg[2])
                            data = msg[3]
                            self.queue.put(event_id, data)
                            event = ['', msg[2], msg[3]]
                            backend.send_multipart(event)
                        except Exception:
                            pass
                        finally:
                            ack = msg[0:3]
                            server.send_multipart(ack)

            def back(backend):
                while True:
                    msg = backend.recv_multipart()
                    event_id = msg[1]
                    success = msg[2]
                    event_id = sqlite3.Binary(event_id)
                    if not success:
                        self.queue.failed(event_id)
                    else:
                        self.queue.delete(event_id)

            boss_pool = GreenPool(2)
            boss_pool.spawn_n(front, self.server, self.backend)
            boss_pool.spawn_n(back, self.backend)

            while True:
                sleep(1)

                now = time.time()
                if now - self.last_retry > self.retry_interval:
                    self.retry()
                    self.last_retry = now

                for w in self.workers:
                    if w.failed:
                        self.workers.remove(w)
                        self.logger.warn('restart worker "%s"', w.name)
                        new_w = EventWorker(self.conf, w.name, self.context)
                        self.workers.append(new_w)
                        pool.spawn(new_w.start)
        except Exception as e:
            self.logger.error('ERROR in main loop %s', e)
            raise e
        finally:
            self.logger.warn('event agent: stopping')
            self.stop_workers()
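A reduced sketch of the supervise-and-respawn loop used by run() above: long-lived workers are spawned into an eventlet GreenPool and a main loop polls a failed flag, replacing any worker that has died. Only GreenPool and sleep come from eventlet; the Worker class and supervise() helper are hypothetical.

import random

from eventlet import GreenPool, sleep


class Worker:
    # Hypothetical worker used only for this sketch.
    def __init__(self, name):
        self.name = name
        self.failed = False

    def start(self):
        # Pretend to do work, occasionally failing so the supervisor
        # has something to restart.
        while not self.failed:
            sleep(0.1)
            if random.random() < 0.01:
                self.failed = True


def supervise(workers, iterations=100):
    pool = GreenPool(len(workers) + 10)
    for worker in workers:
        pool.spawn(worker.start)
    for _ in range(iterations):
        sleep(0.1)
        for worker in list(workers):  # iterate a copy: the list is mutated below
            if worker.failed:
                workers.remove(worker)
                replacement = Worker(worker.name)
                workers.append(replacement)
                pool.spawn(replacement.start)


supervise([Worker('worker-%d' % i) for i in range(4)])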
    def reap_container(self, account, account_partition, account_nodes,
                       container):
        """
        Deletes the data and the container itself for the given container.
        This will call :func:`reap_object` up to sqrt(self.concurrency) times
        concurrently for the objects in the container.

        If there is any exception while deleting a single object, the process
        will continue for any other objects in the container and the failed
        objects will be tried again the next time this function is called with
        the same parameters.

        If there is any exception while listing the objects for deletion, the
        process will stop (but will obviously be tried again the next time
        this function is called with the same parameters). This is a
        possibility since the listing comes from querying just the primary
        remote container server.

        Once deletion has been attempted for every object, the container
        itself is deleted by sending a delete request to all container nodes.
        The format of the delete request is such that each container server
        will update a corresponding account server, removing the container
        from the account's listing.

        This function returns nothing and should raise no exception but only
        update various self.stats_* values for what occurs.

        :param account: The name of the account for the container.
        :param account_partition: The partition for the account on the account
                                  ring.
        :param account_nodes: The primary node dicts for the account.
        :param container: The name of the container to delete.

        * See also: :func:`chase.common.ring.Ring.get_nodes` for a description
          of the account node dicts.
        """
        account_nodes = list(account_nodes)
        part, nodes = self.get_container_ring().get_nodes(account, container)
        node = nodes[-1]
        pool = GreenPool(size=self.object_concurrency)
        marker = ''
        while True:
            objects = None
            try:
                objects = direct_get_container(
                    node, part, account, container, marker=marker,
                    conn_timeout=self.conn_timeout,
                    response_timeout=self.node_timeout)[1]
                self.stats_return_codes[2] = \
                    self.stats_return_codes.get(2, 0) + 1
            except ClientException as err:
                if self.logger.getEffectiveLevel() <= DEBUG:
                    self.logger.exception(
                        _('Exception with %(ip)s:%(port)s/%(device)s'), node)
                self.stats_return_codes[err.http_status / 100] = \
                    self.stats_return_codes.get(err.http_status / 100, 0) + 1
            if not objects:
                break
            try:
                for obj in objects:
                    if isinstance(obj['name'], unicode):
                        obj['name'] = obj['name'].encode('utf8')
                    pool.spawn(self.reap_object, account, container, part,
                               nodes, obj['name'])
                pool.waitall()
            except (Exception, Timeout):
                self.logger.exception(
                    _('Exception with objects for container %(container)s '
                      'for account %(account)s'),
                    {'container': container, 'account': account})
            marker = objects[-1]['name']
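A self-contained sketch of the listing loop in reap_container() above: page through a listing with a marker, fan each item out to a GreenPool, and wait for the pool to drain before fetching the next page. The list_page() and delete_item() helpers are hypothetical stand-ins for direct_get_container() and reap_object().

from eventlet import GreenPool

# Hypothetical backing data for the sketch.
ITEMS = ['obj-%03d' % i for i in range(25)]


def list_page(marker, limit=10):
    """Return up to `limit` names greater than `marker`, in sorted order."""
    return [name for name in ITEMS if name > marker][:limit]


def delete_item(name):
    print('deleting %s' % name)


def reap_all():
    pool = GreenPool(size=4)
    marker = ''
    while True:
        names = list_page(marker)
        if not names:
            break
        for name in names:
            pool.spawn(delete_item, name)
        # Drain the pool before advancing the marker to the last name seen.
        pool.waitall()
        marker = names[-1]


reap_all()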