def waitStoreResponses(self, txn_context, tryToResolveConflict):
    result = []
    append = result.append
    resolved_oid_set = set()
    update = resolved_oid_set.update
    _handleConflicts = self._handleConflicts
    queue = txn_context['queue']
    conflict_serial_dict = txn_context['conflict_serial_dict']
    pending = self.dispatcher.pending
    _waitAnyTransactionMessage = self._waitAnyTransactionMessage
    while pending(queue) or conflict_serial_dict:
        # Note: handler data can be overwritten by _handleConflicts,
        # so we must set it for each iteration.
        _waitAnyTransactionMessage(txn_context)
        if conflict_serial_dict:
            conflicts = _handleConflicts(txn_context, tryToResolveConflict)
            if conflicts:
                update(conflicts)

    # Check for never-stored objects, and update result for all others.
    for oid, store_dict in \
            txn_context['object_stored_counter_dict'].iteritems():
        if not store_dict:
            logging.error('tpc_store failed')
            raise NEOStorageError('tpc_store failed')
        elif oid in resolved_oid_set:
            append((oid, ResolvedSerial))
    return result

def _run(self):
    """Make sure that the status is sane and start a loop."""
    if len(self.name) == 0:
        raise RuntimeError, 'cluster name must be non-empty'
    # Make a listening port.
    handler = identification.IdentificationHandler(self)
    self.listening_conn = ListeningConnection(self, handler, self.server)
    self.server = self.listening_conn.getAddress()
    # Connect to a primary master node, verify data, and start the
    # operation. This cycle will be executed permanently, until the
    # user explicitly requests a shutdown.
    self.operational = False
    while True:
        self.cluster_state = None
        if self.master_node is None:
            # Look for the primary master.
            self.connectToPrimary()
        self.checker = Checker(self)
        self.replicator = Replicator(self)
        self.tm = TransactionManager(self)
        try:
            self.initialize()
            self.doOperation()
            raise RuntimeError, 'should not reach here'
        except StoppedOperation, msg:
            logging.error('operation stopped: %s', msg)
        except PrimaryFailure, msg:
            logging.error('primary master is down: %s', msg)

def _connectToPrimaryNode(self):
    """Look up the current primary master node and connect to it."""
    logging.debug('connecting to primary master...')
    self.start()
    index = -1
    ask = self._ask
    handler = self.primary_bootstrap_handler
    while 1:
        # Get a network connection to the primary master.
        while 1:
            if self.primary_master_node is not None:
                # If I know a primary master node, pinpoint it.
                self.trying_master_node = self.primary_master_node
                self.primary_master_node = None
            else:
                # Otherwise, check one by one.
                master_list = self.nm.getMasterList()
                index = (index + 1) % len(master_list)
                self.trying_master_node = master_list[index]
            # Connect to the master.
            conn = MTClientConnection(self, self.notifications_handler,
                node=self.trying_master_node, dispatcher=self.dispatcher)
            # Query for the primary master node.
            if conn.getConnector() is None:
                # This happens if a connection could not be established.
                logging.error('Connection to master node %s failed',
                              self.trying_master_node)
                continue
            try:
                ask(conn, Packets.RequestIdentification(NodeTypes.CLIENT,
                    self.uuid, None, self.name), handler=handler)
            except ConnectionClosed:
                continue
            # If we reached the primary master node, mark as connected.
            if self.primary_master_node is not None and \
                    self.primary_master_node is self.trying_master_node:
                break
        logging.info('Connected to %s', self.primary_master_node)
        try:
            # Request identification and the information required to be
            # operational. This may raise ConnectionClosed so that the
            # new primary can be looked up again.
            logging.info('Initializing from master')
            ask(conn, Packets.AskNodeInformation(), handler=handler)
            ask(conn, Packets.AskPartitionTable(), handler=handler)
            ask(conn, Packets.AskLastTransaction(), handler=handler)
            if self.pt.operational():
                break
        except ConnectionClosed:
            logging.error('Connection to %s lost', self.trying_master_node)
            self.primary_master_node = None
    logging.info("Connected and ready")
    return conn

def repair(self, weak_app, dry_run):
    t = self._repairing
    if t and t.is_alive():
        logging.error('already repairing')
        return
    def repair():
        l = threading.Lock()
        l.acquire()
        def finalize():
            try:
                if data_id_list and not dry_run:
                    self.commit()
                    logging.info("repair: deleted %s orphan records",
                                 self._pruneData(data_id_list))
                    self.commit()
            finally:
                l.release()
        try:
            with self._duplicate() as db:
                data_id_list = db.getOrphanList()
            logging.info("repair: found %s records that may be orphan",
                         len(data_id_list))
            weak_app().em.wakeup(finalize)
            l.acquire()
        finally:
            del self._repairing
        logging.info("repair: done")
    t = self._repairing = threading.Thread(target=repair)
    t.daemon = True
    t.start()

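# Usage sketch (an assumption, not shown above): the 'weak_app' parameter
# suggests the caller passes a weak reference to the application object so
# the background repair thread never keeps it alive, e.g.:
#
#   import weakref
#   dm.repair(weakref.ref(app), dry_run=True)   # only report orphan records
#   dm.repair(weakref.ref(app), dry_run=False)  # actually prune them
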
def _connectToPrimaryNode(self):
    """Look up the current primary master node and connect to it."""
    logging.debug('connecting to primary master...')
    self.start()
    index = -1
    fail_count = 0
    ask = self._ask
    handler = self.primary_bootstrap_handler
    while 1:
        self.ignore_invalidations = True
        # Get a network connection to the primary master.
        while fail_count < self.max_reconnection_to_master:
            self.nm.reset()
            if self.primary_master_node is not None:
                # If I know a primary master node, pinpoint it.
                node = self.primary_master_node
                self.primary_master_node = None
            else:
                # Otherwise, check one by one.
                master_list = self.nm.getMasterList()
                index = (index + 1) % len(master_list)
                node = master_list[index]
            # Connect to the master.
            conn = MTClientConnection(self, self.notifications_handler,
                node=node, dispatcher=self.dispatcher)
            p = Packets.RequestIdentification(NodeTypes.CLIENT,
                self.uuid, None, self.name, None)
            try:
                ask(conn, p, handler=handler)
            except ConnectionClosed:
                fail_count += 1
            else:
                self.primary_master_node = node
                break
        else:
            raise NEOPrimaryMasterLost(
                "Too many connection failures to the primary master")
        logging.info('Connected to %s', self.primary_master_node)
        try:
            # Request identification and the information required to be
            # operational. This may raise ConnectionClosed so that the
            # new primary can be looked up again.
            logging.info('Initializing from master')
            ask(conn, Packets.AskPartitionTable(), handler=handler)
            ask(conn, Packets.AskLastTransaction(), handler=handler)
            if self.pt.operational():
                break
        except ConnectionClosed:
            logging.error('Connection to %s lost', node)
            self.primary_master_node = None
            fail_count += 1
    logging.info("Connected and ready")
    return conn

def askStorage(conn, packet):
    tid, next_tid, compression, checksum, data, data_tid \
        = self._askStorage(conn, packet)
    if data or checksum != ZERO_HASH:
        if checksum != makeChecksum(data):
            logging.error('wrong checksum from %s for oid %s',
                          conn, dump(oid))
            raise NEOStorageReadRetry(False)
        return (decompress_list[compression](data),
                tid, next_tid, data_tid)
    raise NEOStorageCreationUndoneError(dump(oid))

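# For reference, a checksum helper consistent with the ZERO_HASH and
# makeChecksum names used above might look as follows. This is a sketch
# under the assumption that checksums are 20-byte SHA-1 digests; it is
# reconstructed here, not taken from the snippet.
from hashlib import sha1

ZERO_HASH = '\0' * 20  # digest reserved for records that carry no data

def makeChecksum(data):
    # Digest against which fetched data is verified before use.
    return sha1(data).digest()
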
def getDataTID(tid=None, before_tid=None):
    tid, data_tid = self._getDataTID(oid, tid, before_tid)
    current_tid = tid
    while data_tid:
        if data_tid < tid:
            tid, data_tid = self._getDataTID(oid, data_tid)
            if tid is not None:
                continue
        logging.error("Incorrect data serial for oid %s at tid %s",
                      oid, current_tid)
        return current_tid, current_tid
    return current_tid, tid

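# Illustration with hypothetical tids: if the record at tid3 is a
# backpointer to tid2, itself pointing back to tid1 where the data really
# lives, the loop walks (tid3, tid2) -> (tid2, tid1) -> (tid1, None) and
# returns (tid3, tid1): the serial seen by the caller and the serial of
# the actual data. A backpointer that does not strictly decrease, or one
# that cannot be resolved, is reported as an incorrect data serial.
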
def requestIdentification(self, conn, node_type, uuid, address, name):
    self.checkClusterName(name)
    # Reject any incoming connection if not ready.
    if not self.app.ready:
        raise NotReadyError
    app = self.app
    if uuid is None:
        if node_type != NodeTypes.STORAGE:
            raise ProtocolError('reject anonymous non-storage node')
        handler = StorageOperationHandler(self.app)
        conn.setHandler(handler)
    else:
        if uuid == app.uuid:
            raise ProtocolError("uuid conflict or loopback connection")
        node = app.nm.getByUUID(uuid)
        # If this node is broken, reject it.
        if node is not None and node.isBroken():
            raise BrokenNodeDisallowedError
        # Choose the handler according to the node type.
        if node_type == NodeTypes.CLIENT:
            handler = ClientOperationHandler
            if node is None:
                node = app.nm.createClient(uuid=uuid)
            elif node.isConnected():
                # This can happen if we haven't yet processed a
                # notification from the master telling us the existing
                # node is not running anymore. If we accepted the new
                # client, we wouldn't know what to do with this late
                # notification.
                raise NotReadyError('uuid conflict: retry later')
            node.setRunning()
        elif node_type == NodeTypes.STORAGE:
            if node is None:
                logging.error('reject an unknown storage node %s',
                              uuid_str(uuid))
                raise NotReadyError
            handler = StorageOperationHandler
        else:
            raise ProtocolError('reject non-client-or-storage node')
        # Apply the handler and set up the connection.
        handler = handler(self.app)
        conn.setHandler(handler)
        node.setConnection(conn, app.uuid < uuid)
    # Accept the identification and trigger an event.
    conn.answer(Packets.AcceptIdentification(NodeTypes.STORAGE,
        uuid and app.uuid, app.pt.getPartitions(), app.pt.getReplicas(),
        uuid, app.master_node.getAddress(), ()))
    handler.connectionCompleted(conn)

def _initNodeConnection(self, node):
    """Init a connection to a given storage node."""
    app = self.app
    logging.debug('trying to connect to %s - %s', node, node.getState())
    conn = MTClientConnection(app, app.storage_event_handler, node,
                              dispatcher=app.dispatcher)
    p = Packets.RequestIdentification(NodeTypes.CLIENT,
        app.uuid, None, app.name)
    try:
        app._ask(conn, p, handler=app.storage_bootstrap_handler)
    except ConnectionClosed:
        logging.error('Connection to %r failed', node)
    except NodeNotReady:
        logging.info('%r not ready', node)
    else:
        logging.info('Connected %r', node)
        return conn
    self.notifyFailure(node)

def _loadFromStorage(self, oid, at_tid, before_tid):
    packet = Packets.AskObject(oid, at_tid, before_tid)
    for node, conn in self.cp.iterateForObject(oid, readable=True):
        try:
            tid, next_tid, compression, checksum, data, data_tid \
                = self._askStorage(conn, packet)
        except ConnectionClosed:
            continue
        if data or checksum != ZERO_HASH:
            if checksum != makeChecksum(data):
                logging.error('wrong checksum from %s for oid %s',
                              conn, dump(oid))
                continue
            return (decompress(data) if compression else data,
                    tid, next_tid, data_tid)
        raise NEOStorageCreationUndoneError(dump(oid))
    raise NEOStorageError("storage down or corrupted data")

def _run(self):
    """Make sure that the status is sane and start a loop."""
    if len(self.name) == 0:
        raise RuntimeError, 'cluster name must be non-empty'
    # Make a listening port.
    handler = AdminEventHandler(self)
    self.listening_conn = ListeningConnection(self, handler, self.server)
    while self.cluster_state != ClusterStates.STOPPING:
        self.connectToPrimary()
        try:
            while True:
                self.em.poll(1)
        except PrimaryFailure:
            logging.error('primary master is down')
    self.listening_conn.close()
    while not self.em.isIdle():
        self.em.poll(1)

def _initNodeConnection(self, node):
    """Init a connection to a given storage node."""
    app = self.app
    if app.master_conn is None:
        raise NEOPrimaryMasterLost
    conn = MTClientConnection(app, app.storage_event_handler, node,
                              dispatcher=app.dispatcher)
    p = Packets.RequestIdentification(NodeTypes.CLIENT,
        app.uuid, None, app.name, app.id_timestamp)
    try:
        app._ask(conn, p, handler=app.storage_bootstrap_handler)
    except ConnectionClosed:
        logging.error('Connection to %r failed', node)
    except NodeNotReady:
        logging.info('%r not ready', node)
    else:
        logging.info('Connected %r', node)
        return conn
    self.node_failure_dict[node.getUUID()] = time.time() + MAX_FAILURE_AGE

def tpc_vote(self, transaction, tryToResolveConflict):
    """Store current transaction."""
    txn_context = self._txn_container.get(transaction)
    result = self.waitStoreResponses(txn_context, tryToResolveConflict)
    ttid = txn_context['ttid']
    # Store data on each node.
    assert not txn_context['data_dict'], txn_context
    packet = Packets.AskStoreTransaction(ttid, str(transaction.user),
        str(transaction.description), dumps(transaction._extension),
        txn_context['cache_dict'])
    queue = txn_context['queue']
    trans_nodes = []
    for node, conn in self.cp.iterateForObject(ttid):
        logging.debug("voting transaction %s on %s", dump(ttid),
                      dump(conn.getUUID()))
        try:
            conn.ask(packet, queue=queue)
        except ConnectionClosed:
            continue
        trans_nodes.append(node)
    # Check that at least one storage node accepted.
    if trans_nodes:
        involved_nodes = txn_context['involved_nodes']
        packet = Packets.AskVoteTransaction(ttid)
        for node in involved_nodes.difference(trans_nodes):
            conn = self.cp.getConnForNode(node)
            if conn is not None:
                try:
                    conn.ask(packet, queue=queue)
                except ConnectionClosed:
                    pass
        involved_nodes.update(trans_nodes)
        self.waitResponses(queue)
        txn_context['voted'] = None
        # We must not go further if connection to master was lost since
        # tpc_begin, to lower the probability of failing during tpc_finish.
        if 'error' in txn_context:
            raise NEOStorageError(txn_context['error'])
        return result
    logging.error('tpc_vote failed')
    raise NEOStorageError('tpc_vote failed')

def _connectToStorageNode(self, node):
    if self.master_conn is None:
        raise NEOPrimaryMasterLost
    conn = MTClientConnection(self, self.storage_event_handler, node,
                              dispatcher=self.dispatcher)
    p = Packets.RequestIdentification(NodeTypes.CLIENT,
        self.uuid, None, self.name, self.id_timestamp, {})
    try:
        self._ask(conn, p, handler=self.storage_bootstrap_handler)
    except ConnectionClosed:
        logging.error('Connection to %r failed', node)
    except NodeNotReady:
        logging.info('%r not ready', node)
    else:
        logging.info('Connected %r', node)
        # Make sure this node will be considered for the next reads
        # even if there was a previous recent failure.
        self._node_failure_dict.pop(node.getUUID(), None)
        return conn
    self._node_failure_dict[node.getUUID()] = time.time() + MAX_FAILURE_AGE

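# Sketch of the reader side implied by _node_failure_dict (an assumption;
# the consumer is not shown here): a failure entry expires MAX_FAILURE_AGE
# seconds after it was recorded, at which point the node is retried.
def _isNodeUsable(self, node, _time=time.time):
    until = self._node_failure_dict.get(node.getUUID())
    return until is None or until <= _time()
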
def provideService(self):
    logging.info('provide backup')
    poll = self.em.poll
    app = self.app
    pt = app.pt
    while True:
        app.changeClusterState(ClusterStates.STARTING_BACKUP)
        bootstrap = BootstrapManager(self, NodeTypes.CLIENT,
                                     backup=app.name)
        # {offset -> node}
        self.primary_partition_dict = {}
        # [[tid]]
        self.tid_list = tuple([] for _ in xrange(pt.getPartitions()))
        try:
            while True:
                for node in pt.getNodeSet(readable=True):
                    if not app.isStorageReady(node.getUUID()):
                        break
                else:
                    break
                poll(1)
            node, conn = bootstrap.getPrimaryConnection()
            try:
                app.changeClusterState(ClusterStates.BACKINGUP)
                del bootstrap, node
                self.ignore_invalidations = True
                conn.setHandler(BackupHandler(self))
                conn.ask(Packets.AskLastTransaction())
                # debug variable to log how big 'tid_list' can be.
                self.debug_tid_count = 0
                while True:
                    poll(1)
            except PrimaryFailure, msg:
                logging.error('upstream master is down: %s', msg)
            finally:
                app.backup_tid = pt.getBackupTid()
                try:
                    conn.close()
                except PrimaryFailure:
                    pass
                try:
                    del self.pt
                except AttributeError:
                    pass
                for node in app.nm.getClientList(True):
                    node.getConnection().close()
        except StateChangedException, e:
            if e.args[0] != ClusterStates.STOPPING_BACKUP:
                raise
            app.changeClusterState(*e.args)
            tid = app.backup_tid
            # Wait for non-primary partitions to catch up,
            # so that all UP_TO_DATE cells are really UP_TO_DATE.
            # XXX: Another possibility could be to outdate such cells, and
            #      they would be quickly updated at the beginning of the
            #      RUNNING phase. This may simplify code.
            # Any unfinished replication from upstream will be truncated.
            while pt.getBackupTid(min) < tid:
                poll(1)
            last_tid = app.getLastTransaction()
            handler = EventHandler(app)
            if tid < last_tid:
                assert tid != ZERO_TID
                logging.warning("Truncating at %s (last_tid was %s)",
                    dump(app.backup_tid), dump(last_tid))
            else:
                # We will do a dummy truncation, just to leave backup mode,
                # so it's fine to start automatically if there's any
                # missing storage.
                # XXX: Consider using another method to leave backup mode,
                #      at least when there's nothing to truncate. Because
                #      in case of StoppedOperation during VERIFYING state,
                #      this flag will be wrongly set to False.
                app._startup_allowed = True
            # If any error happened before reaching this line, we'd go back
            # to backup mode, which is the right mode to recover.
            del app.backup_tid
            # Now back to RECOVERY...
            return tid

def electPrimary(self):
    """Elect a primary master node.

    The difficulty is that a master node must accept connections from
    others while attempting to connect to other master nodes at the same
    time. Note that storage nodes and client nodes may connect to self
    as well as master nodes."""
    logging.info('begin the election of a primary master')
    client_handler = election.ClientElectionHandler(self)
    self.unconnected_master_node_set.clear()
    self.negotiating_master_node_set.clear()
    self.master_address_dict.clear()
    self.listening_conn.setHandler(election.ServerElectionHandler(self))
    getByAddress = self.nm.getByAddress
    while True:
        # Handle newly connected masters.
        for node in self.nm.getMasterList():
            node.setUnknown()
            self.unconnected_master_node_set.add(node.getAddress())
        # Start the election process.
        self.primary = None
        self.primary_master_node = None
        try:
            while (self.unconnected_master_node_set or
                   self.negotiating_master_node_set):
                for addr in self.unconnected_master_node_set:
                    self.negotiating_master_node_set.add(addr)
                    ClientConnection(self, client_handler,
                        # XXX: Ugly, but the whole election code will be
                        #      replaced soon
                        getByAddress(addr))
                self.unconnected_master_node_set.clear()
                self.em.poll(1)
        except ElectionFailure, m:
            # Something went wrong: clean up, then restart.
            logging.error('election failed: %s', m)
            # Ask all connected nodes to reelect a single primary master.
            for conn in self.em.getClientList():
                conn.notify(Packets.ReelectPrimary())
                conn.abort()
            # Wait until the connections are closed.
            self.primary = None
            self.primary_master_node = None
            # XXX: Since poll no longer wakes up every second,
            #      the following time condition should be reviewed.
            #      See also playSecondaryRole.
            t = time() + 10
            while self.em.getClientList() and time() < t:
                try:
                    self.em.poll(1)
                except ElectionFailure:
                    pass
            # Close all connections.
            for conn in self.em.getClientList() + self.em.getServerList():
                conn.close()
        else:
            # The election succeeded: stop the process.
            self.primary = self.primary is None
            break

def _nextPartition(self):
    app = self.app
    def connect(node, uuid=app.uuid, name=app.name):
        if node.getUUID() == app.uuid:
            return
        if node.isConnected(connecting=True):
            conn = node.getConnection()
            conn.asClient()
        else:
            conn = ClientConnection(app, StorageOperationHandler(app), node)
            conn.ask(Packets.RequestIdentification(
                NodeTypes.STORAGE, uuid, app.server, name))
        self.conn_dict[conn] = node.isIdentified()
    conn_set = set(self.conn_dict)
    conn_set.discard(None)
    try:
        self.conn_dict.clear()
        while True:
            try:
                partition, (name, source), min_tid, max_tid = \
                    self.queue.popleft()
            except IndexError:
                return
            cell = app.pt.getCell(partition, app.uuid)
            if cell is None or cell.isOutOfDate():
                msg = "discarded or out-of-date"
            else:
                try:
                    for cell in app.pt.getCellList(partition):
                        # XXX: Ignore corrupted cells for the moment
                        #      because we're still unable to fix them
                        #      (see also AdministrationHandler of master)
                        if cell.isReadable(): #if not cell.isOutOfDate():
                            connect(cell.getNode())
                    if source:
                        node = app.nm.getByAddress(source)
                        if name:
                            source = app.nm.createStorage(address=source) \
                                     if node is None else node
                            connect(source, None, name)
                        elif (node.getUUID() == app.uuid or
                              node.isConnected(connecting=True) and
                              node.getConnection() in self.conn_dict):
                            source = node
                        else:
                            msg = "unavailable source"
                    if self.conn_dict:
                        break
                    msg = "no replica"
                except ConnectionClosed:
                    msg = "connection closed"
                finally:
                    conn_set.update(self.conn_dict)
                self.conn_dict.clear()
            logging.error("Failed to start checking partition %u (%s)",
                          partition, msg)
        conn_set.difference_update(self.conn_dict)
    finally:
        for conn in conn_set:
            app.closeClient(conn)
    logging.debug("start checking partition %u from %s to %s",
                  partition, dump(min_tid), dump(max_tid))
    self.min_tid = self.next_tid = min_tid
    self.max_tid = max_tid
    self.next_oid = None
    self.partition = partition
    self.source = source
    def start():
        if app.tm.isLockedTid(max_tid):
            app.queueEvent(start)
            return
        args = partition, CHECK_COUNT, min_tid, max_tid
        p = Packets.AskCheckTIDRange(*args)
        for conn, identified in self.conn_dict.items():
            self.conn_dict[conn] = conn.ask(p) if identified else None
        self.conn_dict[None] = app.dm.checkTIDRange(*args)
    start()

def _notify(self, ask_ids=True):
    if ask_ids:
        self.askLastIds(self.master_conn)
        self.notifying = notifying = {None}
        for name, monitor in self.backup_dict.iteritems():
            if monitor.operational:
                monitor.askLastIds(monitor.conn)
                notifying.add(name)
    if self.notifying or self.cluster_state is None is not self.master_conn:
        return
    severity = [], [], []
    my_severity = self.severity
    severity[my_severity].append(self.name)
    changed = set()
    if self.monitor_changed:
        self.monitor_changed = False
        changed.add(self.name)
    if self.master_conn is None:
        body = NOT_CONNECTED_MESSAGE
    else:
        upstream, body = self.formatSummary()
        body = [body]
        for name, backup in self.backup_dict.iteritems():
            body += '', name, ' ' + backup.formatSummary(upstream)[1]
            severity[backup.severity or backup.lagging].append(name)
            if backup.monitor_changed:
                backup.monitor_changed = False
                changed.add(name)
        body = '\n'.join(body)
    if changed or self.smtp_retry < time():
        logging.debug('monitor notification')
        email_list = self.email_list
        while email_list: # not a loop
            msg = MIMEText(body + (self.smtp_exc or ''))
            msg['Date'] = formatdate()
            clusters, x = severity[1:]
            while 1:
                if x:
                    clusters = clusters + x
                    x = 'PROBLEM'
                elif clusters:
                    x = 'WARNING'
                else:
                    x = 'OK'
                    break
                clusters = changed.intersection(clusters)
                if clusters:
                    x += ' (%s)' % ', '.join(sorted(clusters))
                break
            msg['Subject'] = 'NEO monitoring: ' + x
            msg['From'] = self.email_from
            msg['To'] = ', '.join(email_list)
            s = self.SMTP()
            try:
                s.connect(self.smtp_host)
                if self.smtp_tls:
                    s.starttls()
                if self.smtp_login:
                    s.login(*self.smtp_login)
                s.sendmail(None, email_list, msg.as_string())
            except Exception:
                x = format_exc()
                logging.error(x)
                if changed or not self.smtp_exc:
                    self.smtp_exc = (
                        "\n\nA notification could not be sent at %s:\n\n%s"
                        % (msg['Date'], x))
                retry = self.smtp_retry = time() + 600
            else:
                self.smtp_exc = None
                self.smtp_retry = INF
                if not (self.operational and any(
                        monitor.operational
                        for monitor in self.backup_dict.itervalues())):
                    break
                retry = time() + 600
            finally:
                s.close()
            self.em.setTimeout(retry, self._notify)
            break
    neoctl = self.asking_monitor_information
    if neoctl:
        del severity[my_severity][0]
        if self.smtp_exc:
            my_severity = 2
            body += self.smtp_exc
        severity[1].sort()
        severity[2].sort()
        severity[my_severity].insert(0, None)
        p = Packets.AnswerMonitorInformation(severity[1], severity[2], body)
        for conn, msg_id in neoctl:
            try:
                conn.send(p, msg_id)
            except ConnectionClosed:
                pass
        del self.asking_monitor_information[:]

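# Reading note (inferred from the code above): 'severity' is a triple of
# name lists indexed by level -- 0 for OK, 1 for warning/lagging, 2 for
# problem -- so severity[my_severity].append(self.name) files this cluster
# under its own level, and 'clusters, x = severity[1:]' later unpacks the
# (warning, problem) pair when building the mail subject.
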
def _nextPartition(self):
    app = self.app
    def connect(node, uuid=app.uuid, name=app.name):
        if node.getUUID() == app.uuid:
            return
        if node.isConnected(connecting=True):
            conn = node.getConnection()
            conn.asClient()
        else:
            conn = ClientConnection(app, StorageOperationHandler(app), node)
            conn.ask(Packets.RequestIdentification(NodeTypes.STORAGE,
                uuid, app.server, name, app.id_timestamp, {}))
        self.conn_dict[conn] = node.isIdentified()
    conn_set = set(self.conn_dict)
    conn_set.discard(None)
    try:
        self.conn_dict.clear()
        while True:
            try:
                partition, (name, source), min_tid, max_tid = \
                    self.queue.popleft()
            except IndexError:
                return
            cell = app.pt.getCell(partition, app.uuid)
            if cell is None or cell.isOutOfDate():
                msg = "discarded or out-of-date"
            else:
                try:
                    for cell in app.pt.getCellList(partition):
                        # XXX: Ignore corrupted cells for the moment
                        #      because we're still unable to fix them
                        #      (see also AdministrationHandler of master)
                        if cell.isReadable(): #if not cell.isOutOfDate():
                            connect(cell.getNode())
                    if source:
                        node = app.nm.getByAddress(source)
                        if name:
                            source = app.nm.createStorage(address=source) \
                                     if node is None else node
                            connect(source, None, name)
                        elif (node.getUUID() == app.uuid or
                              node.isConnected(connecting=True) and
                              node.getConnection() in self.conn_dict):
                            source = node
                        else:
                            msg = "unavailable source"
                    if self.conn_dict:
                        break
                    msg = "no replica"
                except ConnectionClosed:
                    msg = "connection closed"
                finally:
                    conn_set.update(self.conn_dict)
                self.conn_dict.clear()
            logging.error("Failed to start checking partition %u (%s)",
                          partition, msg)
        conn_set.difference_update(self.conn_dict)
    finally:
        for conn in conn_set:
            app.closeClient(conn)
    logging.debug("start checking partition %u from %s to %s",
                  partition, dump(min_tid), dump(max_tid))
    self.min_tid = self.next_tid = min_tid
    self.max_tid = max_tid
    self.next_oid = None
    self.partition = partition
    self.source = source
    def start():
        if app.tm.isLockedTid(max_tid):
            app.tm.read_queue.queueEvent(start)
            return
        args = partition, CHECK_COUNT, min_tid, max_tid
        p = Packets.AskCheckTIDRange(*args)
        for conn, identified in self.conn_dict.items():
            self.conn_dict[conn] = conn.ask(p) if identified else None
        self.conn_dict[None] = app.dm.checkTIDRange(*args)
    start()

def __init__(self, msg=None):
    logging.error(msg)
    AssertionError.__init__(self, msg)

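# Context sketch (assumption): this __init__ belongs to an AssertionError
# subclass, so raising it both logs the message and fails loudly, e.g.:
#
#   class PanicError(AssertionError):  # hypothetical name
#       def __init__(self, msg=None):
#           logging.error(msg)
#           AssertionError.__init__(self, msg)
#
#   raise PanicError('partition table corrupted')
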
def _run(self):
    stdscr = self.stdscr
    r, w = os.pipe()
    l = threading.Lock()
    stdscr.nodelay(1)
    input_queue = deque()
    def input_read():
        x = []
        while 1:
            c = stdscr.getch()
            if c < 0:
                if x:
                    input_queue.append(x)
                return input_queue
            x.append(c)
    def input_thread():
        try:
            poll = select.poll()
            poll.register(0, select.POLLIN)
            poll.register(r, select.POLLIN)
            while 1:
                for fd, _ in poll.poll():
                    if fd:
                        return
                with l:
                    empty = not input_queue
                    if input_read() and empty:
                        self.em.wakeup()
        finally:
            os.close(r)
    t = threading.Thread(target=input_thread)
    t.daemon = True
    wait = None
    try:
        t.start()
        self.startCluster()
        self.refresh('stress', False)
        while 1:
            self.failing.clear()
            try:
                self.connectToPrimary()
                self.askLastIDs()
                while 1:
                    self.em.poll(1)
                    with l:
                        if input_read():
                            for x in input_queue:
                                try:
                                    x, = x
                                except ValueError:
                                    continue
                                if x == curses.KEY_RESIZE:
                                    self.refresh()
                                elif x == curses.KEY_F1:
                                    self.stress()
                                else:
                                    try:
                                        x = chr(x)
                                    except ValueError:
                                        continue
                                    if x == 'q':
                                        return
                            input_queue.clear()
            except PrimaryFailure:
                logging.error('primary master is down')
                if self.cluster_state == ClusterStates.STOPPING:
                    break
                self.primaryFailure()
            finally:
                if self._stress:
                    self.stress()
        wait = time.time()
    finally:
        os.write(w, '\0')
        os.close(w)
        t.join()
        self.stopCluster(wait)

def provideService(self):
    logging.info('provide backup')
    poll = self.em.poll
    app = self.app
    pt = app.pt
    while True:
        app.changeClusterState(ClusterStates.STARTING_BACKUP)
        bootstrap = BootstrapManager(self, self.name, NodeTypes.CLIENT)
        # {offset -> node}
        self.primary_partition_dict = {}
        # [[tid]]
        self.tid_list = tuple([] for _ in xrange(pt.getPartitions()))
        try:
            while True:
                for node in pt.getNodeSet(readable=True):
                    if not app.isStorageReady(node.getUUID()):
                        break
                else:
                    break
                poll(1)
            node, conn, uuid, num_partitions, num_replicas = \
                bootstrap.getPrimaryConnection()
            try:
                app.changeClusterState(ClusterStates.BACKINGUP)
                del bootstrap, node
                if num_partitions != pt.getPartitions():
                    raise RuntimeError("inconsistent number of partitions")
                self.pt = PartitionTable(num_partitions, num_replicas)
                conn.setHandler(BackupHandler(self))
                conn.ask(Packets.AskNodeInformation())
                conn.ask(Packets.AskPartitionTable())
                conn.ask(Packets.AskLastTransaction())
                # debug variable to log how big 'tid_list' can be.
                self.debug_tid_count = 0
                while True:
                    poll(1)
            except PrimaryFailure, msg:
                logging.error('upstream master is down: %s', msg)
            finally:
                app.backup_tid = pt.getBackupTid()
                try:
                    conn.close()
                except PrimaryFailure:
                    pass
                try:
                    del self.pt
                except AttributeError:
                    pass
        except StateChangedException, e:
            if e.args[0] != ClusterStates.STOPPING_BACKUP:
                raise
            app.changeClusterState(*e.args)
            tid = app.backup_tid
            # Wait for non-primary partitions to catch up,
            # so that all UP_TO_DATE cells are really UP_TO_DATE.
            # XXX: Another possibility could be to outdate such cells, and
            #      they would be quickly updated at the beginning of the
            #      RUNNING phase. This may simplify code.
            # Any unfinished replication from upstream will be truncated.
            while pt.getBackupTid(min) < tid:
                poll(1)
            last_tid = app.getLastTransaction()
            handler = EventHandler(app)
            if tid < last_tid:
                assert tid != ZERO_TID
                logging.warning("Truncating at %s (last_tid was %s)",
                    dump(app.backup_tid), dump(last_tid))
            else:
                # We will do a dummy truncation, just to leave backup mode,
                # so it's fine to start automatically if there's any
                # missing storage.
                # XXX: Consider using another method to leave backup mode,
                #      at least when there's nothing to truncate. Because
                #      in case of StoppedOperation during VERIFYING state,
                #      this flag will be wrongly set to False.
                app._startup_allowed = True
            # If any error happened before reaching this line, we'd go back
            # to backup mode, which is the right mode to recover.
            del app.backup_tid
            # Now back to RECOVERY...
            return tid