def notifyReplicationDone(self, node, offset, tid):
    app = self.app
    cell = app.pt.getCell(offset, node.getUUID())
    tid_list = self.tid_list[offset]
    if tid_list: # may be empty if the cell is out-of-date
                 # or if we're not fully initialized
        if tid < tid_list[0]:
            cell.replicating = tid
        else:
            try:
                tid = add64(tid_list[bisect(tid_list, tid)], -1)
            except IndexError:
                last_tid = app.getLastTransaction()
                if tid < last_tid:
                    tid = last_tid
                    node.send(Packets.Replicate(tid, '', {offset: None}))
    logging.debug("partition %u: updating backup_tid of %r to %s",
                  offset, cell, dump(tid))
    cell.backup_tid = tid
    # TODO: Provide invalidation feedback about new txns to read-only
    #       clients connected to backup cluster. Not only here but also
    #       hooked to in-progress feedback from fetchObjects (storage).
    # Forget tids we won't need anymore.
    cell_list = app.pt.getCellList(offset, readable=True)
    del tid_list[:bisect(tid_list, min(x.backup_tid for x in cell_list))]
    primary_node = self.primary_partition_dict.get(offset)
    primary = primary_node is node
    result = None if primary else app.pt.setUpToDate(node, offset)
    assert cell.isReadable()
    if result: # was out-of-date
        if primary_node is not None:
            max_tid, = [x.backup_tid for x in cell_list
                        if x.getNode() is primary_node]
            if tid < max_tid:
                cell.replicating = max_tid
                logging.debug(
                    "ask %s to replicate partition %u up to %s from %s",
                    uuid_str(node.getUUID()), offset,
                    dump(max_tid), uuid_str(primary_node.getUUID()))
                node.send(Packets.Replicate(max_tid, '',
                    {offset: primary_node.getAddress()}))
    else:
        if app.getClusterState() == ClusterStates.BACKINGUP:
            self.triggerBackup(node)
        if primary:
            # Notify secondary storages that they can replicate from
            # primary ones, even if they are already replicating.
            p = Packets.Replicate(tid, '', {offset: node.getAddress()})
            for cell in cell_list:
                if max(cell.backup_tid, cell.replicating) < tid:
                    cell.replicating = tid
                    logging.debug(
                        "ask %s to replicate partition %u up to %s from %s",
                        uuid_str(cell.getUUID()), offset,
                        dump(tid), uuid_str(node.getUUID()))
                    cell.getNode().send(p)
    return result

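# Illustrative sketch (not part of NEO): how the bisect step above advances a
# cell's backup_tid, to just before the next tracked transaction touching the
# partition, or to the cluster's last tid once it has replicated past every
# tracked one. Plain integers stand in for 8-byte tids and add64(); the
# helper name clamp_backup_tid is hypothetical.
from bisect import bisect

def clamp_backup_tid(tid_list, tid, last_tid):
    # tid_list: sorted tids of transactions known to touch this partition
    if tid < tid_list[0]:
        return tid                      # still behind the first tracked txn
    try:
        # first tracked tid strictly greater than 'tid', minus one
        return tid_list[bisect(tid_list, tid)] - 1
    except IndexError:
        # replicated past every tracked txn: catch up to the last committed tid
        return max(tid, last_tid)

assert clamp_backup_tid([10, 20, 30], 12, 35) == 19
assert clamp_backup_tid([10, 20, 30], 30, 35) == 35
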
def notifyReplicationDone(self, node, offset, tid):
    app = self.app
    cell = app.pt.getCell(offset, node.getUUID())
    tid_list = self.tid_list[offset]
    if tid_list: # may be empty if the cell is out-of-date
                 # or if we're not fully initialized
        if tid < tid_list[0]:
            cell.replicating = tid
        else:
            try:
                tid = add64(tid_list[bisect(tid_list, tid)], -1)
            except IndexError:
                last_tid = app.getLastTransaction()
                if tid < last_tid:
                    tid = last_tid
                    node.notify(Packets.Replicate(tid, '', {offset: None}))
    logging.debug("partition %u: updating backup_tid of %r to %s",
                  offset, cell, dump(tid))
    cell.backup_tid = tid
    # Forget tids we won't need anymore.
    cell_list = app.pt.getCellList(offset, readable=True)
    del tid_list[:bisect(tid_list, min(x.backup_tid for x in cell_list))]
    primary_node = self.primary_partition_dict.get(offset)
    primary = primary_node is node
    result = None if primary else app.pt.setUpToDate(node, offset)
    assert cell.isReadable()
    if result: # was out-of-date
        if primary_node is not None:
            max_tid, = [x.backup_tid for x in cell_list
                        if x.getNode() is primary_node]
            if tid < max_tid:
                cell.replicating = max_tid
                logging.debug(
                    "ask %s to replicate partition %u up to %s from %s",
                    uuid_str(node.getUUID()), offset,
                    dump(max_tid), uuid_str(primary_node.getUUID()))
                node.notify(Packets.Replicate(max_tid, '',
                    {offset: primary_node.getAddress()}))
    else:
        if app.getClusterState() == ClusterStates.BACKINGUP:
            self.triggerBackup(node)
        if primary:
            # Notify secondary storages that they can replicate from
            # primary ones, even if they are already replicating.
            p = Packets.Replicate(tid, '', {offset: node.getAddress()})
            for cell in cell_list:
                if max(cell.backup_tid, cell.replicating) < tid:
                    cell.replicating = tid
                    logging.debug(
                        "ask %s to replicate partition %u up to %s from %s",
                        uuid_str(cell.getUUID()), offset,
                        dump(tid), uuid_str(node.getUUID()))
                    cell.getNode().notify(p)
    return result

def deadlock(self, storage_id, ttid, locking_tid):
    try:
        txn = self._ttid_dict[ttid]
    except KeyError:
        return
    if txn.locking_tid <= locking_tid:
        client = txn.getNode()
        txn.locking_tid = locking_tid = self._nextTID()
        logging.info('Deadlock avoidance triggered by %s for %s:'
                     ' new locking tid for TXN %s is %s',
                     uuid_str(storage_id), uuid_str(client.getUUID()),
                     dump(ttid), dump(locking_tid))
        client.send(Packets.NotifyDeadlock(ttid, locking_tid))

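# Illustrative sketch (not part of NEO): the guard above. A deadlock report
# only rebases the transaction if it has not already been rebased past the
# reported locking tid; the new locking tid is always a fresh, strictly
# higher one. 'next_tid' stands in for the master's tid allocator and
# rebase_on_deadlock is a hypothetical name.
def rebase_on_deadlock(current_locking_tid, reported_locking_tid, next_tid):
    if current_locking_tid <= reported_locking_tid:
        return next_tid()            # rebase: client and storages are notified
    return current_locking_tid       # stale report: nothing to do
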
def _triggerSecondary(self, node, offset, tid, cell_list):
    # Notify secondary storages that they can replicate from
    # primary ones, even if they are already replicating.
    p = Packets.Replicate(tid, '', {offset: node.getAddress()})
    for cell in cell_list:
        if max(cell.backup_tid, cell.replicating) < tid:
            cell.replicating = tid
            logging.debug(
                "ask %s to replicate partition %u up to %s from %s",
                uuid_str(cell.getUUID()), offset,
                dump(tid), uuid_str(node.getUUID()))
            cell.getNode().send(p)

def setNodeState(self, conn, uuid, state):
    logging.info("set node state for %s: %s", uuid_str(uuid), state)
    app = self.app
    node = app.nm.getByUUID(uuid)
    if node is None:
        raise ProtocolError("unknown node")
    if state not in NODE_STATE_WORKFLOW.get(node.getType(), ()):
        raise ProtocolError("can not switch node to this state")
    if uuid == app.uuid:
        raise ProtocolError("can not kill primary master node")
    state_changed = state != node.getState()
    message = ("state changed" if state_changed
               else "node already in %s state" % state)
    if node.isStorage():
        keep = state == NodeStates.UNKNOWN
        try:
            cell_list = app.pt.dropNodeList([node], keep)
        except PartitionTableException, e:
            raise ProtocolError(str(e))
        node.setState(state)
        if node.isConnected():
            # notify itself so it can shut down
            node.notify(Packets.NotifyNodeInformation([node.asTuple()]))
            # close to avoid handling the closure as a connection loss
            node.getConnection().abort()
        if keep:
            cell_list = app.pt.outdate()
        elif cell_list:
            message = "node permanently removed"
        app.broadcastPartitionChanges(cell_list)

def corrupt(offset):
    s0, s1, s2 = (storage_dict[cell.getUUID()]
        for cell in cluster.master.pt.getCellList(offset, True))
    logging.info('corrupt partition %u of %s', offset, uuid_str(s1.uuid))
    s1.dm.deleteObject(p64(np+offset), p64(corrupt_tid))
    return s0.uuid

def connectionFailed(self, conn):
    addr = conn.getAddress()
    node = self.app.nm.getByAddress(addr)
    assert node is not None, (uuid_str(self.app.uuid), addr)
    # node may still be in unknown state
    self.app.negotiating_master_node_set.discard(addr)
    super(ClientElectionHandler, self).connectionFailed(conn)

def addPendingNodes(self, conn, uuid_list):
    uuids = ', '.join(map(uuid_str, uuid_list))
    logging.debug('Add nodes %s', uuids)
    app = self.app
    state = app.getClusterState()
    # XXX: Would it be safe to allow more states ?
    if state not in (ClusterStates.RUNNING,
                     ClusterStates.STARTING_BACKUP,
                     ClusterStates.BACKINGUP):
        raise ProtocolError('Can not add nodes in %s state' % state)
    # take all pending nodes
    node_list = list(app.pt.addNodeList(node
        for node in app.nm.getStorageList()
        if node.isPending() and node.getUUID() in uuid_list))
    if node_list:
        p = Packets.StartOperation(bool(app.backup_tid))
        for node in node_list:
            node.setRunning()
            node.notify(p)
        app.broadcastNodesInformation(node_list)
        conn.answer(Errors.Ack('Nodes added: %s' %
            ', '.join(uuid_str(x.getUUID()) for x in node_list)))
    else:
        logging.warning('No node added')
        conn.answer(Errors.Ack('No node added'))

def setNodeState(self, conn, uuid, state):
    logging.info("set node state for %s: %s", uuid_str(uuid), state)
    app = self.app
    node = app.nm.getByUUID(uuid)
    if node is None:
        raise ProtocolError('unknown node')
    if state not in NODE_STATE_WORKFLOW.get(node.getType(), ()):
        raise ProtocolError('can not switch node to this state')
    if uuid == app.uuid:
        raise ProtocolError('can not kill primary master node')
    state_changed = state != node.getState()
    message = ('state changed' if state_changed
               else 'node already in %s state' % state)
    if node.isStorage():
        keep = state == NodeStates.DOWN
        try:
            cell_list = app.pt.dropNodeList([node], keep)
        except PartitionTableException, e:
            raise ProtocolError(str(e))
        node.setState(state)
        if node.isConnected():
            # notify itself so it can shut down
            node.send(Packets.NotifyNodeInformation(
                monotonic_time(), [node.asTuple()]))
            # close to avoid handling the closure as a connection loss
            node.getConnection().abort()
        if keep:
            cell_list = app.pt.outdate()
        elif cell_list:
            message = 'node permanently removed'
        app.broadcastPartitionChanges(cell_list)

def triggerBackup(self, node):
    tid_list = self.tid_list
    tid = self.app.getLastTransaction()
    replicate_list = []
    for offset, cell in self.app.pt.iterNodeCell(node):
        max_tid = tid_list[offset]
        if max_tid and self.primary_partition_dict[offset] is node and \
           max(cell.backup_tid, cell.replicating) < max_tid[-1]:
            cell.replicating = tid
            replicate_list.append(offset)
    if not replicate_list:
        return
    getCellList = self.pt.getCellList
    source_dict = {}
    address_set = set()
    for offset in replicate_list:
        cell_list = getCellList(offset, readable=True)
        random.shuffle(cell_list)
        assert cell_list, offset
        for cell in cell_list:
            addr = cell.getAddress()
            if addr in address_set:
                break
        else:
            address_set.add(addr)
        source_dict[offset] = addr
        logging.debug("ask %s to replicate partition %u up to %s from %r",
                      uuid_str(node.getUUID()), offset, dump(tid), addr)
    node.send(Packets.Replicate(tid, self.name, source_dict))

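# Illustrative sketch (not part of NEO): the for/else above picks one upstream
# address per partition to replicate, reusing an address already chosen for
# another partition when possible so the storage talks to fewer distinct
# sources; otherwise it keeps the last shuffled candidate. The helper name
# pick_sources is hypothetical; each candidate list is assumed non-empty,
# mirroring the assert above.
def pick_sources(candidates_by_offset):
    source_dict = {}
    used = set()
    for offset, candidates in candidates_by_offset.items():
        for addr in candidates:
            if addr in used:
                break           # reuse an already-selected source
        else:
            used.add(addr)      # no reuse possible: adopt the last candidate
        source_dict[offset] = addr
    return source_dict

# e.g. pick_sources({0: ['a', 'b'], 1: ['b', 'c']}) -> {0: 'b', 1: 'b'}
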
def connectionLost(self, conn, new_state):
    app = self.app
    node = app.nm.getByUUID(conn.getUUID())
    if node is None:
        return # for example, when a storage is removed by an admin
    assert node.isStorage(), node
    logging.info('storage node lost')
    if new_state != NodeStates.BROKEN:
        new_state = DISCONNECTED_STATE_DICT.get(node.getType(),
                                                NodeStates.DOWN)
    assert new_state in (NodeStates.TEMPORARILY_DOWN, NodeStates.DOWN,
                         NodeStates.BROKEN), new_state
    assert node.getState() not in (NodeStates.TEMPORARILY_DOWN,
                                   NodeStates.DOWN, NodeStates.BROKEN), (
        uuid_str(self.app.uuid), node.whoSetState(), new_state)
    was_pending = node.isPending()
    node.setState(new_state)
    if new_state != NodeStates.BROKEN and was_pending:
        # was in pending state, so drop it from the node manager to forget
        # it and do not set in running state when it comes back
        logging.info('drop a pending node from the node manager')
        app.nm.remove(node)
    app.broadcastNodesInformation([node])
    if app.truncate_tid:
        raise StoppedOperation
    app.broadcastPartitionChanges(app.pt.outdate(node))
    if not app.pt.operational():
        raise StoppedOperation

def triggerBackup(self, node):
    tid_list = self.tid_list
    tid = self.app.getLastTransaction()
    replicate_list = []
    for offset, cell in self.app.pt.iterNodeCell(node):
        max_tid = tid_list[offset]
        if max_tid and self.primary_partition_dict[offset] is node and \
           max(cell.backup_tid, cell.replicating) < max_tid[-1]:
            cell.replicating = tid
            replicate_list.append(offset)
    if not replicate_list:
        return
    getCellList = self.pt.getCellList
    source_dict = {}
    address_set = set()
    for offset in replicate_list:
        cell_list = getCellList(offset, readable=True)
        random.shuffle(cell_list)
        assert cell_list, offset
        for cell in cell_list:
            addr = cell.getAddress()
            if addr in address_set:
                break
        else:
            address_set.add(addr)
        source_dict[offset] = addr
        logging.debug("ask %s to replicate partition %u up to %s from %r",
                      uuid_str(node.getUUID()), offset, dump(tid), addr)
    node.getConnection().notify(Packets.Replicate(
        tid, self.name, source_dict))

def register(self, conn, ttid):
    """
    Register a transaction; it may already be registered.
    """
    if ttid not in self._transaction_dict:
        uuid = conn.getUUID()
        logging.debug('Register TXN %s for %s', dump(ttid), uuid_str(uuid))
        self._transaction_dict[ttid] = Transaction(uuid, ttid)

def formatNodeList(self, node_list, _sort_key=itemgetter(2, 0, 1)):
    if not node_list:
        return 'Empty list!'
    node_list.sort(key=_sort_key)
    return '\n'.join(
        '%s - %s - %s - %s' % (node_type, uuid_str(uuid),
                               address and '%s:%s' % address, state)
        for node_type, address, uuid, state in node_list)

def abortFor(self, uuid):
    """
    Abort any non-locked transaction of a node
    """
    logging.debug('Abort for %s', uuid_str(uuid))
    # abort any non-locked transaction of this node
    for ttid, transaction in self._transaction_dict.items():
        if transaction.uuid == uuid:
            self.abort(ttid)

def __repr__(self):
    return "<%s(ttid=%r, tid=%r, uuid=%r, locked=%r, age=%.2fs) at 0x%x>" \
        % (self.__class__.__name__,
           dump(self._ttid),
           dump(self._tid),
           uuid_str(self._uuid),
           self.isLocked(),
           time() - self._birth,
           id(self))

def abortFor(self, uuid):
    """
    Abort any non-locked transaction of a node
    """
    logging.debug('Abort for %s', uuid_str(uuid))
    # abort any non-locked transaction of this node
    for transaction in self._transaction_dict.values():
        if transaction.getUUID() == uuid:
            self.abort(transaction.getTTID())

def abort(self, ttid, uuid):
    """
    Abort a transaction
    """
    logging.debug('Abort TXN %s for %s', dump(ttid), uuid_str(uuid))
    if self[ttid].isPrepared():
        raise ProtocolError("commit already requested for ttid %s"
                            % dump(ttid))
    del self[ttid]

def lock(self, ttid, uuid):
    """
    Set that a node has locked the transaction.
    If the transaction is completely locked, calls the function given at
    instantiation time.
    """
    logging.debug('Lock TXN %s for %s', dump(ttid), uuid_str(uuid))
    if self[ttid].lock(uuid) and self._queue[0] == ttid:
        # all storage nodes are locked and we unlock the commit queue
        self._unlockPending()

def _acceptIdentification(self, node, uuid, num_partitions,
                          num_replicas, your_uuid, primary,
                          known_master_list):
    app = self.app
    if primary != app.primary_master_node.getAddress():
        raise PrimaryFailure("unexpected primary uuid")
    if your_uuid != app.uuid:
        app.uuid = your_uuid
        logging.info("My UUID: " + uuid_str(your_uuid))
    node.setUUID(uuid)

def register(self, uuid, ttid):
    """
    Register a transaction; it may already be registered.
    """
    logging.debug('Register TXN %s for %s', dump(ttid), uuid_str(uuid))
    transaction = self._transaction_dict.get(ttid, None)
    if transaction is None:
        transaction = Transaction(uuid, ttid)
        self._uuid_dict.setdefault(uuid, set()).add(transaction)
        self._transaction_dict[ttid] = transaction
    return transaction

def abort(self, ttid, uuid):
    """
    Abort a transaction
    """
    logging.debug('Abort TXN %s for %s', dump(ttid), uuid_str(uuid))
    txn = self[ttid]
    if txn.isPrepared():
        raise ProtocolError("commit already requested for ttid %s"
                            % dump(ttid))
    del self[ttid]
    return txn._notification_set

def repair(self, conn, uuid_list, *args):
    getByUUID = self.app.nm.getByUUID
    node_list = []
    for uuid in uuid_list:
        node = getByUUID(uuid)
        if node is None or not (node.isStorage() and node.isIdentified()):
            raise ProtocolError("invalid storage node %s" % uuid_str(uuid))
        node_list.append(node)
    repair = Packets.NotifyRepair(*args)
    for node in node_list:
        node.send(repair)
    conn.answer(Errors.Ack(''))

def log(cls):
    try:
        if cls.filter_queue:
            logging.info('%s:', cls.__name__)
            for conn, queue in cls.filter_queue.iteritems():
                app = NEOThreadedTest.getConnectionApp(conn)
                logging.info(' %s %s:', uuid_str(app.uuid), conn)
                for p in queue:
                    logging.info(' #0x%04x %s',
                                 p.getId(), p.__class__.__name__)
    except Exception:
        logging.exception('')

def nodeLost(self, node):
    getCellList = self.app.pt.getCellList
    trigger_set = set()
    for offset, primary_node in self.primary_partition_dict.items():
        if primary_node is not node:
            continue
        cell_list = getCellList(offset, readable=True)
        cell = max(cell_list, key=lambda cell: cell.backup_tid)
        tid = cell.backup_tid
        self.primary_partition_dict[offset] = primary_node = cell.getNode()
        p = Packets.Replicate(tid, '', {offset: primary_node.getAddress()})
        for cell in cell_list:
            cell.replicating = tid
            if cell.backup_tid < tid:
                logging.debug(
                    "ask %s to replicate partition %u up to %s from %s",
                    uuid_str(cell.getUUID()), offset,
                    dump(tid), uuid_str(primary_node.getUUID()))
                cell.getNode().send(p)
        trigger_set.add(primary_node)
    for node in trigger_set:
        self.triggerBackup(node)

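# Illustrative sketch (not part of NEO): when the node acting as primary
# source for a partition is lost, the readable cell with the highest
# backup_tid is promoted and every lagging cell is asked to catch up to that
# tid from the new primary. Cells are modeled as (node, backup_tid) pairs;
# elect_new_primary is a hypothetical name.
def elect_new_primary(cells):
    new_primary, tid = max(cells, key=lambda c: c[1])
    lagging = [node for node, backup_tid in cells if backup_tid < tid]
    return new_primary, tid, lagging
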
def nodeLost(self, node):
    getCellList = self.app.pt.getCellList
    trigger_set = set()
    for offset, primary_node in self.primary_partition_dict.items():
        if primary_node is not node:
            continue
        cell_list = getCellList(offset, readable=True)
        cell = max(cell_list, key=lambda cell: cell.backup_tid)
        tid = cell.backup_tid
        self.primary_partition_dict[offset] = primary_node = cell.getNode()
        p = Packets.Replicate(tid, '', {offset: primary_node.getAddress()})
        for cell in cell_list:
            cell.replicating = tid
            if cell.backup_tid < tid:
                logging.debug(
                    "ask %s to replicate partition %u up to %s from %s",
                    uuid_str(cell.getUUID()), offset,
                    dump(tid), uuid_str(primary_node.getUUID()))
                cell.getNode().getConnection().notify(p)
        trigger_set.add(primary_node)
    for node in trigger_set:
        self.triggerBackup(node)

def _acceptIdentification(self, node, peer_uuid, num_partitions,
                          num_replicas, your_uuid, primary,
                          known_master_list):
    app = self.app
    # Register new master nodes.
    for address, uuid in known_master_list:
        if app.server == address:
            # This is self.
            assert node.getAddress() != primary or uuid == your_uuid, (
                uuid_str(uuid), uuid_str(your_uuid))
            continue
        n = app.nm.getByAddress(address)
        if n is None:
            n = app.nm.createMaster(address=address)
    if primary is not None:
        # The primary master is defined.
        if app.primary_master_node is not None \
                and app.primary_master_node.getAddress() != primary:
            # There are multiple primary master nodes. This is
            # dangerous.
            raise ElectionFailure, 'multiple primary master nodes'
        primary_node = app.nm.getByAddress(primary)
        if primary_node is None:
            # I don't know such a node. Probably this information
            # is old. So ignore it.
            logging.warning('received an unknown primary node')
        else:
            # Whatever the situation is, I trust this master.
            app.primary = False
            app.primary_master_node = primary_node
            # Stop waiting for connections other than the primary
            # master's to complete, to exit the election phase ASAP.
            app.negotiating_master_node_set.clear()
            return
    self.elect(None, node.getAddress())

def requestIdentification(self, conn, node_type, uuid, address, name):
    self.checkClusterName(name)
    # reject any incoming connections if not ready
    if not self.app.ready:
        raise NotReadyError
    app = self.app
    if uuid is None:
        if node_type != NodeTypes.STORAGE:
            raise ProtocolError('reject anonymous non-storage node')
        handler = StorageOperationHandler(self.app)
        conn.setHandler(handler)
    else:
        if uuid == app.uuid:
            raise ProtocolError("uuid conflict or loopback connection")
        node = app.nm.getByUUID(uuid)
        # If this node is broken, reject it.
        if node is not None and node.isBroken():
            raise BrokenNodeDisallowedError
        # choose the handler according to the node type
        if node_type == NodeTypes.CLIENT:
            handler = ClientOperationHandler
            if node is None:
                node = app.nm.createClient(uuid=uuid)
            elif node.isConnected():
                # This can happen if we haven't processed yet a notification
                # from the master, telling us the existing node is not
                # running anymore. If we accept the new client, we won't
                # know what to do with this late notification.
                raise NotReadyError('uuid conflict: retry later')
            node.setRunning()
        elif node_type == NodeTypes.STORAGE:
            if node is None:
                logging.error('reject an unknown storage node %s',
                              uuid_str(uuid))
                raise NotReadyError
            handler = StorageOperationHandler
        else:
            raise ProtocolError('reject non-client-or-storage node')
        # apply the handler and set up the connection
        handler = handler(self.app)
        conn.setHandler(handler)
        node.setConnection(conn, app.uuid < uuid)
    # accept the identification and trigger an event
    conn.answer(Packets.AcceptIdentification(NodeTypes.STORAGE, uuid and
        app.uuid, app.pt.getPartitions(), app.pt.getReplicas(), uuid,
        app.master_node.getAddress(), ()))
    handler.connectionCompleted(conn)

def notifyNodeInformation(self, conn, timestamp, node_list):
    """Store information on nodes, only if this is sent by a primary
    master node."""
    super(BaseMasterHandler, self).notifyNodeInformation(
        conn, timestamp, node_list)
    for node_type, _, uuid, state, _ in node_list:
        if uuid == self.app.uuid:
            # This is me, do what the master tells me
            logging.info("I was told I'm %s", state)
            if state in (NodeStates.UNKNOWN, NodeStates.DOWN):
                erase = state == NodeStates.UNKNOWN
                self.app.shutdown(erase=erase)
        elif node_type == NodeTypes.CLIENT and state != NodeStates.RUNNING:
            logging.info('Notified of non-running client, abort (%s)',
                         uuid_str(uuid))
            self.app.tm.abortFor(uuid)

def abortFor(self, uuid):
    """
    Abort any non-locked transaction of a node
    """
    logging.debug('Abort for %s', uuid_str(uuid))
    # BUG: Discarding voted transactions must only be a decision of the
    #      master, and for this, we'll need to review how transactions are
    #      aborted. As a workaround, we rely on the fact that lock() will
    #      disconnect from the master in case of LockInformation.
    # abort any non-locked transaction of this node
    for ttid in [x.getTTID() for x in self._uuid_dict.get(uuid, [])]:
        self.abort(ttid)
    # cleanup _uuid_dict if no transaction remains for this node
    transaction_set = self._uuid_dict.get(uuid)
    if transaction_set is not None and not transaction_set:
        del self._uuid_dict[uuid]

def _setupNode(self, conn, node_type, uuid, address, node):
    app = self.app
    if node:
        if node.isRunning():
            # cloned/evil/buggy node connecting to us
            raise ProtocolError('already connected')
        else:
            assert not node.isConnected()
        node.setAddress(address)
        node.setRunning()

    state = NodeStates.RUNNING
    if node_type == NodeTypes.CLIENT:
        if app.cluster_state != ClusterStates.RUNNING:
            raise NotReadyError
        handler = app.client_service_handler
        human_readable_node_type = ' client '
    elif node_type == NodeTypes.STORAGE:
        if app.cluster_state == ClusterStates.STOPPING_BACKUP:
            raise NotReadyError
        manager = app._current_manager
        if manager is None:
            manager = app
        state, handler = manager.identifyStorageNode(
            uuid is not None and node is not None)
        human_readable_node_type = ' storage (%s) ' % (state, )
    elif node_type == NodeTypes.MASTER:
        handler = app.secondary_master_handler
        human_readable_node_type = ' master '
    elif node_type == NodeTypes.ADMIN:
        handler = app.administration_handler
        human_readable_node_type = 'n admin '
    else:
        raise NotImplementedError(node_type)

    uuid = app.getNewUUID(uuid, address, node_type)
    logging.info('Accept a' + human_readable_node_type + uuid_str(uuid))
    if node is None:
        node = app.nm.createFromNodeType(node_type,
            uuid=uuid, address=address)
    node.setUUID(uuid)
    node.setState(state)
    node.setConnection(conn)
    conn.setHandler(handler)
    app.broadcastNodesInformation([node], node)
    return uuid

def notifyNodeInformation(self, conn, node_list):
    """Store information on nodes, only if this is sent by a primary
    master node."""
    self.app.nm.update(node_list)
    for node_type, addr, uuid, state in node_list:
        if uuid == self.app.uuid:
            # This is me, do what the master tells me
            logging.info("I was told I'm %s", state)
            if state in (NodeStates.DOWN, NodeStates.TEMPORARILY_DOWN,
                         NodeStates.BROKEN, NodeStates.UNKNOWN):
                erase = state == NodeStates.DOWN
                self.app.shutdown(erase=erase)
            elif state == NodeStates.HIDDEN:
                raise StoppedOperation
        elif node_type == NodeTypes.CLIENT and state != NodeStates.RUNNING:
            logging.info('Notified of non-running client, abort (%s)',
                         uuid_str(uuid))
            self.app.tm.abortFor(uuid)

def notifyReplicationDone(self, conn, offset, tid):
    app = self.app
    uuid = conn.getUUID()
    node = app.nm.getByUUID(uuid)
    if app.backup_tid:
        cell_list = app.backup_app.notifyReplicationDone(node, offset, tid)
        if not cell_list:
            return
    else:
        try:
            cell_list = self.app.pt.setUpToDate(node, offset)
        except PartitionTableException, e:
            raise ProtocolError(str(e))
        if not cell_list:
            logging.info("ignored late notification that"
                         " %s has replicated partition %s up to %s",
                         uuid_str(uuid), offset, dump(tid))
            return

def answerStoreObject(self, conn, conflict, oid):
    txn_context = self.app.getHandlerData()
    if conflict:
        # Conflicts can not be resolved now because 'conn' is locked.
        # We must postpone the resolution (by queuing the conflict in
        # 'conflict_dict') to avoid any deadlock with another thread that
        # also resolves a conflict successfully to the same storage nodes.
        # Warning: if a storage (S1) is much faster than another (S2), then
        # we may process entirely a conflict with S1 (i.e. we received the
        # answer to the store of the resolved object on S1) before we
        # receive the conflict answer from the first store on S2.
        logging.info('%s reports a conflict on %s:%s with %s',
                     uuid_str(conn.getUUID()), dump(oid),
                     dump(txn_context.ttid), dump(conflict))
        # If this conflict is not already resolved, mark it for
        # resolution.
        if txn_context.resolved_dict.get(oid, '') < conflict:
            txn_context.conflict_dict[oid] = conflict
    else:
        txn_context.written(self.app, conn.getUUID(), oid)

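# Illustrative sketch (not part of NEO): the queueing rule above. A reported
# conflict is recorded for later resolution only if it is newer than any
# conflict on the same oid that this transaction has already resolved
# (tids compare as strings in NEO; '' sorts below every tid). The helper
# name queue_conflict is hypothetical.
def queue_conflict(resolved_dict, conflict_dict, oid, conflict_tid):
    if resolved_dict.get(oid, '') < conflict_tid:
        conflict_dict[oid] = conflict_tid   # needs (re)resolution
        return True
    return False                            # already resolved past this tid
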
def addPendingNodes(self, conn, uuid_list):
    uuids = ', '.join(map(uuid_str, uuid_list))
    logging.debug('Add nodes %s', uuids)
    app = self.app
    # take all pending nodes
    node_list = list(app.pt.addNodeList(node
        for node in app.nm.getStorageList()
        if node.isPending() and node.getUUID() in uuid_list))
    if node_list:
        for node in node_list:
            node.setRunning()
            app.startStorage(node)
        app.broadcastNodesInformation(node_list)
        conn.answer(Errors.Ack('Nodes added: %s' %
            ', '.join(uuid_str(x.getUUID()) for x in node_list)))
    else:
        logging.warning('No node added')
        conn.answer(Errors.Ack('No node added'))

def invalidatePartitions(self, tid, partition_set):
    app = self.app
    prev_tid = app.getLastTransaction()
    app.setLastTransaction(tid)
    pt = app.pt
    trigger_set = set()
    untouched_dict = defaultdict(dict)
    for offset in xrange(pt.getPartitions()):
        try:
            last_max_tid = self.tid_list[offset][-1]
        except IndexError:
            last_max_tid = prev_tid
        if offset in partition_set:
            self.tid_list[offset].append(tid)
            node_list = []
            for cell in pt.getCellList(offset, readable=True):
                node = cell.getNode()
                assert node.isConnected(), node
                if cell.backup_tid == prev_tid:
                    # Given 4 TIDs t0, t1, t2, t3: if a cell is only
                    # modified by t0 & t3 and has all data for t0, 4 values
                    # are possible for its 'backup_tid' until it replicates
                    # up to t3: t0, t1, t2 or t3 - 1
                    # Choosing the smallest one (t0) is easier to implement
                    # but when leaving backup mode, we would always lose
                    # data if the last full transaction does not modify
                    # all partitions. t1 is wrong for the same reason.
                    # So we have chosen the highest one (t3 - 1).
                    # t2 should also work but may be harder to implement.
                    cell.backup_tid = add64(tid, -1)
                    logging.debug(
                        "partition %u: updating backup_tid of %r to %s",
                        offset, cell, dump(cell.backup_tid))
                else:
                    assert cell.backup_tid < last_max_tid, (
                        cell.backup_tid, last_max_tid, prev_tid, tid)
                if app.isStorageReady(node.getUUID()):
                    node_list.append(node)
            assert node_list
            trigger_set.update(node_list)
            # Make sure we have a primary storage for this partition.
            if offset not in self.primary_partition_dict:
                self.primary_partition_dict[offset] = \
                    random.choice(node_list)
        else:
            # Partition not touched, so increase 'backup_tid' of all
            # "up-to-date" replicas, without having to replicate.
            for cell in pt.getCellList(offset, readable=True):
                if last_max_tid <= cell.backup_tid:
                    cell.backup_tid = tid
                    untouched_dict[cell.getNode()][offset] = None
                elif last_max_tid <= cell.replicating:
                    # Same for 'replicating' to avoid useless orders.
                    logging.debug("silently update replicating order"
                        " of %s for partition %u, up to %s",
                        uuid_str(cell.getUUID()), offset, dump(tid))
                    cell.replicating = tid
    for node, untouched_dict in untouched_dict.iteritems():
        if app.isStorageReady(node.getUUID()):
            node.notify(Packets.Replicate(tid, '', untouched_dict))
    for node in trigger_set:
        self.triggerBackup(node)
    count = sum(map(len, self.tid_list))
    if self.debug_tid_count < count:
        logging.debug("Maximum number of tracked tids: %u", count)
        self.debug_tid_count = count

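# Illustrative sketch (not part of NEO): how a cell's backup_tid moves when a
# new transaction 'tid' is committed upstream, following the comment above.
# Integers stand in for 8-byte tids and add64(); advance_backup_tid is a
# hypothetical name. 'last_touch_tid' is the last tid known to have modified
# the partition (the previous cluster tid if none is tracked).
def advance_backup_tid(backup_tid, last_touch_tid, prev_tid, touched, tid):
    if touched:
        # data still has to be replicated; a cell that was fully synced up
        # to the previous transaction now provably holds everything < tid
        return tid - 1 if backup_tid == prev_tid else backup_tid
    # partition untouched by 'tid': an up-to-date replica stays up to date,
    # so its backup_tid can jump straight to 'tid' without replicating
    return tid if last_touch_tid <= backup_tid else backup_tid
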
def playPrimaryRole(self):
    logging.info('play the primary role with %r', self.listening_conn)
    self.master_address_dict.clear()
    em = self.em
    packet = Packets.AnnouncePrimary()
    for conn in em.getConnectionList():
        if conn.isListening():
            conn.setHandler(identification.IdentificationHandler(self))
        else:
            conn.notify(packet)
            # The primary master should establish connections to all
            # secondaries, rather than the other way around. This requires
            # a bit more work when a new master joins a cluster but makes
            # it easier to resolve UUID conflicts with minimal cluster
            # impact, and ensures primary master unicity (primary masters
            # become noisy, in that they actively try to maintain
            # connections to all other master nodes, so duplicate
            # primaries will eventually get in touch with each other and
            # resolve the situation with a duel).
            # TODO: only abort client connections, don't close server
            # connections as we want to have them in the end. Secondary
            # masters will reconnect nevertheless, but it's dirty.
            # Currently, it's not trivial to preserve connected nodes,
            # because of poor node status tracking during election.
            conn.abort()
    # If I know any storage node, make sure that they are not in the
    # running state, because they are not connected at this stage.
    for node in self.nm.getStorageList():
        if node.isRunning():
            node.setTemporarilyDown()
    if self.uuid is None:
        self.uuid = self.getNewUUID(None, self.server, NodeTypes.MASTER)
        logging.info('My UUID: ' + uuid_str(self.uuid))
    else:
        in_conflict = self.nm.getByUUID(self.uuid)
        if in_conflict is not None:
            logging.warning('UUID conflict at election exit with %r',
                            in_conflict)
            in_conflict.setUUID(None)
    # Do not restart automatically if ElectionFailure is raised, in order
    # to avoid a split of the database. For example, with 2 machines with
    # a master and a storage on each one and replicas=1, the secondary
    # master becomes primary in case of network failure between the 2
    # machines but must not start automatically: otherwise, each storage
    # node would diverge.
    self._startup_allowed = False
    try:
        while True:
            self.runManager(RecoveryManager)
            try:
                self.runManager(VerificationManager)
                if not self.backup_tid:
                    self.provideService()
                    # self.provideService only returns without raising
                    # when switching to backup mode.
                if self.backup_app is None:
                    raise RuntimeError("No upstream cluster to backup"
                                       " defined in configuration")
                truncate = Packets.Truncate(
                    self.backup_app.provideService())
            except StoppedOperation, e:
                logging.critical('No longer operational')
                truncate = Packets.Truncate(*e.args) if e.args else None
                # Automatic restart except if we truncate or retry to.
                self._startup_allowed = not (self.truncate_tid or truncate)
            node_list = []
            for node in self.nm.getIdentifiedList():
                if node.isStorage() or node.isClient():
                    conn = node.getConnection()
                    conn.notify(Packets.StopOperation())
                    if node.isClient():
                        conn.abort()
                        continue
                    if truncate:
                        conn.notify(truncate)
                    if node.isRunning():
                        node.setPending()
                        node_list.append(node)
            self.broadcastNodesInformation(node_list)
    except StateChangedException, e:
        assert e.args[0] == ClusterStates.STOPPING
        self.shutdown()

def getPrimary(self, params):
    """
    Get primary master node.
    """
    return uuid_str(self.neoctl.getPrimary())

def askPartitionList(self, conn, min_offset, max_offset, uuid):
    logging.info("ask partition list from %s to %s for %s",
                 min_offset, max_offset, uuid_str(uuid))
    self.app.sendPartitionTable(conn, min_offset, max_offset, uuid)

def formatRowList(self, row_list):
    return '\n'.join('%03d | %s' % (offset,
        ''.join('%s - %s |' % (uuid_str(uuid), state)
                for (uuid, state) in cell_list))
        for (offset, cell_list) in row_list)