def connectionLost(self, conn, new_state):
    uuid = conn.getUUID()
    self.backup_tid_dict.pop(uuid, None)
    self.truncate_dict.pop(uuid, None)
    node = self.app.nm.getByUUID(uuid)
    try:
        i = self.ask_pt.index(uuid)
    except ValueError:
        pass
    else:
        del self.ask_pt[i]
        if not i:
            if self.ask_pt:
                self.app.nm.getByUUID(self.ask_pt[0]) \
                    .ask(Packets.AskPartitionTable())
            else:
                logging.warning(
                    "Waiting for %r to come back."
                    " No other node has version %s of the partition table.",
                    node, self.target_ptid)
    if node.getState() == new_state:
        return
    node.setState(new_state)
    # broadcast to all so that admin nodes get informed
    self.app.broadcastNodesInformation([node])
def addPendingNodes(self, conn, uuid_list):
    uuids = ', '.join(map(uuid_str, uuid_list))
    logging.debug('Add nodes %s', uuids)
    app = self.app
    state = app.getClusterState()
    # XXX: Would it be safe to allow more states ?
    if state not in (ClusterStates.RUNNING,
                     ClusterStates.STARTING_BACKUP,
                     ClusterStates.BACKINGUP):
        raise ProtocolError('Can not add nodes in %s state' % state)
    # take all pending nodes
    node_list = list(app.pt.addNodeList(node
        for node in app.nm.getStorageList()
        if node.isPending() and node.getUUID() in uuid_list))
    if node_list:
        p = Packets.StartOperation(bool(app.backup_tid))
        for node in node_list:
            node.setRunning()
            node.notify(p)
        app.broadcastNodesInformation(node_list)
        conn.answer(Errors.Ack('Nodes added: %s' %
            ', '.join(uuid_str(x.getUUID()) for x in node_list)))
    else:
        logging.warning('No node added')
        conn.answer(Errors.Ack('No node added'))
def setup(self, reset=False, dedup=False):
    self.db.setup(reset, dedup)
    zodb_state = self.getConfiguration("zodb")
    if zodb_state:
        logging.warning("Ignoring configuration file for oid mapping."
                        " Reloading it from NEO storage.")
        zodb = cPickle.loads(zodb_state)
        for k, v in self.zodb:
            zodb[k].connect(v["storage"])
    else:
        zodb = {k: ZODB(**v) for k, v in self.zodb}
        x, = (x for x in zodb.itervalues() if not x.oid)
        x.setup(zodb)
        self.setConfiguration("zodb", cPickle.dumps(zodb))
    self.zodb_index, self.zodb = zip(*sorted(
        (x.shift_oid, x) for x in zodb.itervalues()))
    self.zodb_ltid = max(x.ltid for x in self.zodb)
    zodb = self.zodb[-1]
    self.zodb_loid = zodb.shift_oid + zodb.next_oid - 1
    self.zodb_tid = self._getMaxPartition() is not None and \
        self.db.getLastTID(self.zodb_ltid) or 0
    if callable(self._import):  # XXX: why ?
        if self.zodb_tid == self.zodb_ltid:
            self._finished()
        else:
            self._import = self._import()
def outdate(self, lost_node=None):
    """Outdate all non-working nodes

    Do not outdate cells of 'lost_node' for partitions it was the last
    node to serve. This allows a cluster restart.
    """
    change_list = []
    fully_readable = all(cell.isReadable()
                         for row in self.partition_list
                         for cell in row)
    for offset, row in enumerate(self.partition_list):
        lost = lost_node
        cell_list = []
        for cell in row:
            if cell.isReadable():
                if cell.getNode().isRunning():
                    lost = None
                else:
                    cell_list.append(cell)
        for cell in cell_list:
            if cell.getNode() is not lost:
                cell.setState(CellStates.OUT_OF_DATE)
                change_list.append((offset, cell.getUUID(),
                                    CellStates.OUT_OF_DATE))
    if fully_readable and change_list:
        logging.warning(self._first_outdated_message)
    return change_list
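# Toy illustration (not NEO code) of the rule implemented above: a cell of
# 'lost_node' is spared from being outdated only when no other readable cell
# of the same partition belongs to a running node. All names below are
# hypothetical and only sketch the decision logic.
from collections import namedtuple

_Cell = namedtuple('_Cell', 'node readable')
_Node = namedtuple('_Node', 'name running')

def _cells_to_outdate(partition, lost_node):
    lost = lost_node
    candidates = []
    for cell in partition:
        if cell.readable:
            if cell.node.running:
                lost = None          # another working copy exists
            else:
                candidates.append(cell)
    # keep at most the cell of the lost node, outdate the others
    return [c for c in candidates if c.node is not lost]

_a, _b = _Node('a', running=False), _Node('b', running=False)
# only b's cell is outdated; a's is kept so the cluster can restart
print(_cells_to_outdate([_Cell(_a, True), _Cell(_b, True)], lost_node=_a))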
def pack(self, t, referencesf, gc=False):
    if gc:
        logging.warning('Garbage Collection is not available in NEO,'
            ' please use an external tool. Packing without GC.')
    try:
        self.app.pack(t)
    except Exception:
        logging.exception('pack_time=%r', t)
        raise
def checkRange(self, conn, *args):
    if self.conn_dict.get(conn, self) != conn.getPeerId():
        # Ignore answers to old requests,
        # because we did nothing to cancel them.
        logging.info("ignored AnswerCheck*Range%r", args)
        return
    self.conn_dict[conn] = args
    answer_set = set(self.conn_dict.itervalues())
    if len(answer_set) > 1:
        for answer in answer_set:
            if type(answer) is not tuple:
                return
        # TODO: Automatically tell corrupted cells to fix their data
        #       if we know a good source.
        #       For the moment, tell master to put them in CORRUPTED state
        #       and keep up checking if useful.
        uuid = self.app.uuid
        args = None if self.source is None else self.conn_dict[
            None if self.source.getUUID() == uuid
                 else self.source.getConnection()]
        uuid_list = []
        for conn, answer in self.conn_dict.items():
            if answer != args:
                del self.conn_dict[conn]
                if conn is None:
                    uuid_list.append(uuid)
                else:
                    uuid_list.append(conn.getUUID())
                    self.app.closeClient(conn)
        p = Packets.NotifyPartitionCorrupted(self.partition, uuid_list)
        self.app.master_conn.send(p)
        if len(self.conn_dict) <= 1:
            logging.warning("check of partition %u aborted", self.partition)
            self.queue.clear()
            self._nextPartition()
            return
    try:
        count, _, max_tid = args
    except ValueError:  # AnswerCheckSerialRange
        count, _, self.next_tid, _, max_oid = args
        if count < CHECK_COUNT:
            logging.debug("partition %u checked from %s to %s",
                self.partition, dump(self.min_tid), dump(self.max_tid))
            self._nextPartition()
            return
        self.next_oid = add64(max_oid, 1)
    else:  # AnswerCheckTIDRange
        if count < CHECK_COUNT:
            self.next_tid = self.min_tid
            self.next_oid = ZERO_OID
        else:
            self.next_tid = add64(max_tid, 1)
    self._nextRange()
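# Hedged sketch of the 64-bit TID/OID arithmetic used above (add64, ZERO_OID):
# TIDs and OIDs are 8-byte big-endian strings, and the next range starts one
# past the highest value already checked. Standalone illustration only, not
# NEO's actual neo.lib.util module.
import struct

ZERO_OID_EXAMPLE = '\0' * 8

def add64_example(packed, offset):
    """Add 'offset' to an 8-byte big-endian packed integer."""
    return struct.pack('>Q', struct.unpack('>Q', packed)[0] + offset)

assert add64_example('\0' * 7 + '\x41', 1) == '\0' * 7 + '\x42'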
def checkRange(self, conn, *args):
    if self.conn_dict.get(conn, self) != conn.getPeerId():
        # Ignore answers to old requests,
        # because we did nothing to cancel them.
        logging.info("ignored AnswerCheck*Range%r", args)
        return
    self.conn_dict[conn] = args
    answer_set = set(self.conn_dict.itervalues())
    if len(answer_set) > 1:
        for answer in answer_set:
            if type(answer) is not tuple:
                return
        # TODO: Automatically tell corrupted cells to fix their data
        #       if we know a good source.
        #       For the moment, tell master to put them in CORRUPTED state
        #       and keep up checking if useful.
        uuid = self.app.uuid
        args = None if self.source is None else self.conn_dict[
            None if self.source.getUUID() == uuid
                 else self.source.getConnection()]
        uuid_list = []
        for conn, answer in self.conn_dict.items():
            if answer != args:
                del self.conn_dict[conn]
                if conn is None:
                    uuid_list.append(uuid)
                else:
                    uuid_list.append(conn.getUUID())
                    self.app.closeClient(conn)
        p = Packets.NotifyPartitionCorrupted(self.partition, uuid_list)
        self.app.master_conn.notify(p)
        if len(self.conn_dict) <= 1:
            logging.warning("check of partition %u aborted", self.partition)
            self.queue.clear()
            self._nextPartition()
            return
    try:
        count, _, max_tid = args
    except ValueError:  # AnswerCheckSerialRange
        count, _, self.next_tid, _, max_oid = args
        if count < CHECK_COUNT:
            logging.debug("partition %u checked from %s to %s",
                self.partition, dump(self.min_tid), dump(self.max_tid))
            self._nextPartition()
            return
        self.next_oid = add64(max_oid, 1)
    else:  # AnswerCheckTIDRange
        if count < CHECK_COUNT:
            self.next_tid = self.min_tid
            self.next_oid = ZERO_OID
        else:
            self.next_tid = add64(max_tid, 1)
    self._nextRange()
def _finished(self):
    logging.warning("All data are imported. You should change"
        " your configuration to use the native backend and restart.")
    self._import = None
    for x in """getObject getReplicationTIDList getReplicationObjectList
                _fetchObject _getDataTID getLastObjectTID
             """.split():
        setattr(self, x, getattr(self.db, x))
    for zodb in self.zodb:
        zodb.close()
    self.zodb = None
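# Minimal illustration (hypothetical classes, not NEO code) of the delegation
# trick used in _finished(): once the import is over, a fixed list of method
# names is rebound on the instance so later calls go straight to the native
# backend, bypassing the importer layer entirely.
class _NativeBackend(object):
    def getObject(self, oid):
        return 'native:%s' % oid

class _ImporterFacade(object):
    def __init__(self, db):
        self.db = db
    def getObject(self, oid):
        return 'importer:%s' % oid
    def finish(self):
        # instance attributes shadow the class methods from now on
        for name in "getObject".split():
            setattr(self, name, getattr(self.db, name))

_facade = _ImporterFacade(_NativeBackend())
_facade.finish()
print(_facade.getObject(1))  # 'native:1'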
def connectionLost(self, conn):
    try:
        del self.conn_dict[conn]
    except KeyError:
        return
    if self.source is not None and self.source.getConnection() is conn:
        del self.source
    elif len(self.conn_dict) > 1:
        logging.warning("node lost but keep up checking partition %u",
                        self.partition)
        return
    logging.warning("check of partition %u aborted", self.partition)
    self._nextPartition()
def _acceptIdentification(self, node, uuid, num_partitions,
        num_replicas, your_uuid, primary, known_master_list):
    app = self.app

    # Register new master nodes.
    found = False
    conn_address = node.getAddress()
    for node_address, node_uuid in known_master_list:
        if node_address == conn_address:
            assert uuid == node_uuid, (dump(uuid), dump(node_uuid))
            found = True
        n = app.nm.getByAddress(node_address)
        if n is None:
            n = app.nm.createMaster(address=node_address)
        if node_uuid is not None and n.getUUID() != node_uuid:
            n.setUUID(node_uuid)
    assert found, (node, dump(uuid), known_master_list)

    conn = node.getConnection()
    if primary is not None:
        primary_node = app.nm.getByAddress(primary)
        if primary_node is None:
            # I don't know such a node. Probably this information
            # is old. So ignore it.
            logging.warning('Unknown primary master: %s. Ignoring.', primary)
            return
        else:
            if app.trying_master_node is not primary_node:
                app.trying_master_node = None
                conn.close()
            app.primary_master_node = primary_node
    else:
        if app.primary_master_node is not None:
            # The primary master node is not a primary master node
            # any longer.
            app.primary_master_node = None
        app.trying_master_node = None
        conn.close()
        return

    # the master must give an UUID
    if your_uuid is None:
        raise ProtocolError('No UUID supplied')
    app.uuid = your_uuid
    logging.info('Got an UUID: %s', dump(app.uuid))

    # Always create partition table
    app.pt = PartitionTable(num_partitions, num_replicas)
def retry_if_locked(f, *args):
    try:
        return f(*args)
    except sqlite3.OperationalError as e:
        x = e.args[0]
        if x != 'database is locked':
            raise
        msg = traceback.format_exception_only(type(e), e)
        msg += traceback.format_stack()
        logging.warning(''.join(msg))
        while 1:
            try:
                return f(*args)
            except sqlite3.OperationalError as e:
                if e.args[0] != x:
                    raise
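# A possible usage of retry_if_locked (hypothetical table and file names):
# wrap each statement that may hit SQLite's transient 'database is locked'
# error, e.g. when another process briefly holds the write lock.
import sqlite3

_conn = sqlite3.connect('/tmp/example.db', timeout=0)
_conn.execute('CREATE TABLE IF NOT EXISTS kv (k TEXT PRIMARY KEY, v TEXT)')

def _put(k, v):
    retry_if_locked(_conn.execute,
                    'REPLACE INTO kv VALUES (?,?)', (k, v))
    retry_if_locked(_conn.commit)

_put('answer', '42')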
def checkReplicas(self, conn, partition_dict, min_tid, max_tid):
    app = self.app
    pt = app.pt
    backingup = bool(app.backup_tid)
    if not max_tid:
        max_tid = pt.getCheckTid(partition_dict) if backingup else \
            app.getLastTransaction()
    if min_tid > max_tid:
        logging.warning("nothing to check: min_tid=%s > max_tid=%s",
                        dump(min_tid), dump(max_tid))
    else:
        getByUUID = app.nm.getByUUID
        node_set = set()
        for offset, source in partition_dict.iteritems():
            # XXX: For the moment, code checking replicas is unable to fix
            #      corrupted partitions (when a good cell is known)
            #      so only check readable ones.
            #      (see also Checker._nextPartition of storage)
            cell_list = pt.getCellList(offset, True)
            #cell_list = [cell for cell in pt.getCellList(offset)
            #                  if not cell.isOutOfDate()]
            if len(cell_list) + (backingup and not source) <= 1:
                continue
            for cell in cell_list:
                node = cell.getNode()
                if node in node_set:
                    break
            else:
                node_set.add(node)
            if source:
                source = '', getByUUID(source).getAddress()
            else:
                readable = [cell for cell in cell_list if cell.isReadable()]
                if 1 == len(readable) < len(cell_list):
                    source = '', readable[0].getAddress()
                elif backingup:
                    source = app.backup_app.name, random.choice(
                        app.backup_app.pt.getCellList(offset, readable=True)
                        ).getAddress()
                else:
                    source = '', None
            node.getConnection().notify(Packets.CheckPartition(
                offset, source, min_tid, max_tid))
    conn.answer(Errors.Ack(''))
def abort(self, message=''):
    offset = self.current_partition
    if offset is None:
        return
    del self.current_partition
    logging.warning('replication aborted for partition %u%s',
                    offset, message and ' (%s)' % message)
    if offset in self.partition_dict:
        # XXX: Try another partition if possible, to increase the probability
        #      of connecting to another node. It would be better to explicitly
        #      search for another node instead.
        tid = self.replicate_dict.pop(offset, None) or self.replicate_tid
        if self.replicate_dict:
            self._nextPartition()
            self.replicate_dict[offset] = tid
        else:
            self.replicate_dict[offset] = tid
            self._nextPartition()
    else:  # partition removed
        self._nextPartition()
def _setup(self):
    self.db._setup()
    zodb_state = self.getConfiguration("zodb")
    if zodb_state:
        logging.warning("Ignoring configuration file for oid mapping."
                        " Reloading it from NEO storage.")
        zodb = cPickle.loads(zodb_state)
        for k, v in self.zodb:
            zodb[k].connect(v["storage"])
    else:
        zodb = {k: ZODB(**v) for k, v in self.zodb}
        x, = (x for x in zodb.itervalues() if not x.oid)
        x.setup(zodb)
        self.setConfiguration("zodb", cPickle.dumps(zodb))
    self.zodb_index, self.zodb = zip(*sorted(
        (x.shift_oid, x) for x in zodb.itervalues()))
    self.zodb_ltid = max(x.ltid for x in self.zodb)
    zodb = self.zodb[-1]
    self.zodb_loid = zodb.shift_oid + zodb.next_oid - 1
    self.zodb_tid = self.db.getLastTID(self.zodb_ltid) or 0
    self._import = self._import()
def addPendingNodes(self, conn, uuid_list):
    uuids = ', '.join(map(uuid_str, uuid_list))
    logging.debug('Add nodes %s', uuids)
    app = self.app
    # take all pending nodes
    node_list = list(app.pt.addNodeList(node
        for node in app.nm.getStorageList()
        if node.isPending() and node.getUUID() in uuid_list))
    if node_list:
        for node in node_list:
            node.setRunning()
            app.startStorage(node)
        app.broadcastNodesInformation(node_list)
        conn.answer(Errors.Ack('Nodes added: %s' %
            ', '.join(uuid_str(x.getUUID()) for x in node_list)))
    else:
        logging.warning('No node added')
        conn.answer(Errors.Ack('No node added'))
def setup(self, reset=0):
    self.db.setup(reset)
    zodb_state = self.getConfiguration("zodb")
    if zodb_state:
        logging.warning("Ignoring configuration file for oid mapping."
                        " Reloading it from NEO storage.")
        zodb = cPickle.loads(zodb_state)
        for k, v in self.zodb:
            zodb[k].connect(v["storage"])
    else:
        zodb = {k: ZODB(**v) for k, v in self.zodb}
        x, = (x for x in zodb.itervalues() if not x.oid)
        x.setup(zodb)
        self.setConfiguration("zodb", cPickle.dumps(zodb))
    self.zodb_index, self.zodb = zip(*sorted(
        (x.shift_oid, x) for x in zodb.itervalues()))
    self.zodb_ltid = max(x.ltid for x in self.zodb)
    zodb = self.zodb[-1]
    self.zodb_loid = zodb.shift_oid + zodb.next_oid - 1
    self.zodb_tid = self.db.getLastTID(self.zodb_ltid) or 0
    self._import = self._import()
def _acceptIdentification(self, node, peer_uuid, num_partitions,
        num_replicas, your_uuid, primary, known_master_list):
    app = self.app

    # Register new master nodes.
    for address, uuid in known_master_list:
        if app.server == address:
            # This is self.
            assert node.getAddress() != primary or uuid == your_uuid, (
                uuid_str(uuid), uuid_str(your_uuid))
            continue
        n = app.nm.getByAddress(address)
        if n is None:
            n = app.nm.createMaster(address=address)

    if primary is not None:
        # The primary master is defined.
        if app.primary_master_node is not None \
                and app.primary_master_node.getAddress() != primary:
            # There are multiple primary master nodes. This is
            # dangerous.
            raise ElectionFailure, 'multiple primary master nodes'
        primary_node = app.nm.getByAddress(primary)
        if primary_node is None:
            # I don't know such a node. Probably this information
            # is old. So ignore it.
            logging.warning('received an unknown primary node')
        else:
            # Whatever the situation is, I trust this master.
            app.primary = False
            app.primary_master_node = primary_node
            # Stop waiting for connections other than the primary
            # master's to complete, so we exit the election phase ASAP.
            app.negotiating_master_node_set.clear()
            return
    self.elect(None, node.getAddress())
def addPendingNodes(self, conn, uuid_list): uuids = ", ".join(map(uuid_str, uuid_list)) logging.debug("Add nodes %s", uuids) app = self.app state = app.getClusterState() # XXX: Would it be safe to allow more states ? if state not in (ClusterStates.RUNNING, ClusterStates.STARTING_BACKUP, ClusterStates.BACKINGUP): raise ProtocolError("Can not add nodes in %s state" % state) # take all pending nodes node_list = list( app.pt.addNodeList( node for node in app.nm.getStorageList() if node.isPending() and node.getUUID() in uuid_list ) ) if node_list: p = Packets.StartOperation(bool(app.backup_tid)) for node in node_list: node.setRunning() node.notify(p) app.broadcastNodesInformation(node_list) conn.answer(Errors.Ack("Nodes added: %s" % ", ".join(uuid_str(x.getUUID()) for x in node_list))) else: logging.warning("No node added") conn.answer(Errors.Ack("No node added"))
def connectionLost(self, conn, new_state): uuid = conn.getUUID() self.backup_tid_dict.pop(uuid, None) self.truncate_dict.pop(uuid, None) node = self.app.nm.getByUUID(uuid) try: i = self.ask_pt.index(uuid) except ValueError: pass else: del self.ask_pt[i] if not i: if self.ask_pt: self.app.nm.getByUUID(self.ask_pt[0]) \ .ask(Packets.AskPartitionTable()) else: logging.warning("Waiting for %r to come back." " No other node has version %s of the partition table.", node, self.target_ptid) if node.getState() == new_state: return node.setState(new_state) # broadcast to all so that admin nodes gets informed self.app.broadcastNodesInformation([node])
def provideService(self):
    logging.info('provide backup')
    poll = self.em.poll
    app = self.app
    pt = app.pt
    while True:
        app.changeClusterState(ClusterStates.STARTING_BACKUP)
        bootstrap = BootstrapManager(self, NodeTypes.CLIENT, backup=app.name)
        # {offset -> node}
        self.primary_partition_dict = {}
        # [[tid]]
        self.tid_list = tuple([] for _ in xrange(pt.getPartitions()))
        try:
            while True:
                for node in pt.getNodeSet(readable=True):
                    if not app.isStorageReady(node.getUUID()):
                        break
                else:
                    break
                poll(1)
            node, conn = bootstrap.getPrimaryConnection()
            try:
                app.changeClusterState(ClusterStates.BACKINGUP)
                del bootstrap, node
                self.ignore_invalidations = True
                conn.setHandler(BackupHandler(self))
                conn.ask(Packets.AskLastTransaction())
                # debug variable to log how big 'tid_list' can be.
                self.debug_tid_count = 0
                while True:
                    poll(1)
            except PrimaryFailure, msg:
                logging.error('upstream master is down: %s', msg)
            finally:
                app.backup_tid = pt.getBackupTid()
                try:
                    conn.close()
                except PrimaryFailure:
                    pass
                try:
                    del self.pt
                except AttributeError:
                    pass
                for node in app.nm.getClientList(True):
                    node.getConnection().close()
        except StateChangedException, e:
            if e.args[0] != ClusterStates.STOPPING_BACKUP:
                raise
            app.changeClusterState(*e.args)
            tid = app.backup_tid
            # Wait for non-primary partitions to catch up,
            # so that all UP_TO_DATE cells are really UP_TO_DATE.
            # XXX: Another possibility could be to outdate such cells, and
            #      they would be quickly updated at the beginning of the
            #      RUNNING phase. This may simplify code.
            # Any unfinished replication from upstream will be truncated.
            while pt.getBackupTid(min) < tid:
                poll(1)
            last_tid = app.getLastTransaction()
            handler = EventHandler(app)
            if tid < last_tid:
                assert tid != ZERO_TID
                logging.warning("Truncating at %s (last_tid was %s)",
                    dump(app.backup_tid), dump(last_tid))
            else:
                # We will do a dummy truncation, just to leave backup mode,
                # so it's fine to start automatically if there's any
                # missing storage.
                # XXX: Consider using another method to leave backup mode,
                #      at least when there's nothing to truncate. Because
                #      in case of StoppedOperation during VERIFYING state,
                #      this flag will be wrongly set to False.
                app._startup_allowed = True
            # If any error happened before reaching this line, we'd go back
            # to backup mode, which is the right mode to recover.
            del app.backup_tid
            # Now back to RECOVERY...
            return tid
def pack(self, t, referencesf, gc=False):
    if gc:
        logging.warning('Garbage Collection is not available in NEO,'
            ' please use an external tool. Packing without GC.')
    self.app.pack(t)
def tweak(self, drop_list=()):
    """Optimize partition table

    This reassigns cells in 4 ways:
    - Discard cells of nodes listed in 'drop_list'. For partitions with too
      few readable cells, some cells are instead marked as FEEDING. This is
      a preliminary step to drop these nodes, otherwise the partition table
      could become non-operational.
      In fact, the code touching these cells is disabled (see NOTE below).
    - Other nodes must have the same number of non-feeding cells, off by 1.
    - When a transaction creates new objects (oids are roughly allocated
      sequentially), we expect better performance by maximizing the number
      of involved nodes (i.e. parallelizing writes).
    - For maximum resiliency, cells of each partition are assigned as far
      as possible from each other, by checking the topology path of nodes.

    Examples of optimal partition tables with np=10, nr=1 and 5 nodes:

      UU...  ..UU.
      ..UU.  U...U
      U...U  .UU..
      .UU..  ...UU
      ...UU  UU...
      UU...  ..UU.
      ..UU.  U...U
      U...U  .UU..
      .UU..  ...UU
      ...UU  UU...

    The above 2 PT only differ by permutation of nodes, and this method
    plays on it to minimize the resulting amount of replication.
    For performance reasons, this algorithm uses a heuristic.

    When (np * nr) is not a multiple of the number of nodes, some nodes
    have 1 extra cell compared to others. In such case, other optimal PT
    could be considered by rotation of the partitions. Actually np times
    more, but it's not worth it since they don't differ enough (if np is
    big enough) and we don't do an exhaustive search anyway. Example with
    np=3, nr=1 and 2 nodes:

      U.  .U  U.
      .U  U.  U.
      U.  U.  .U

    For the topology, let's consider an example with paths of the form
    (room, machine, disk):
    - if there are more rooms than the number of replicas, 2 cells of the
      same partition must not be assigned in the same room;
    - otherwise, topology paths are checked at a deeper depth, e.g. not on
      the same machine and distributed evenly (off by 1) among rooms.
    But the topology is expected to be optimal, otherwise it is ignored.
    In some cases, we could fall back to a non-optimal topology but
    that would cause extra replication if the user wants to fix it.
    """
    # Collect some data in a usable form for the rest of the method.
    node_list = {node: {} for node in self.count_dict
                          if node not in drop_list}
    if not node_list:
        raise neo.lib.pt.PartitionTableException("Can't remove all nodes.")
    drop_list = defaultdict(list)
    for offset, row in enumerate(self.partition_list):
        for cell in row:
            cell_dict = node_list.get(cell.getNode())
            if cell_dict is None:
                drop_list[offset].append(cell)
            else:
                cell_dict[offset] = cell
    # The sort by node id is cosmetic, to prefer result like the first one
    # in __doc__.
    node_list = sorted(node_list.iteritems(), key=lambda x: x[0].getUUID())

    # Generate an optimal PT.
    node_count = len(node_list)
    repeats = min(self.nr + 1, node_count)
    x = [[] for _ in xrange(node_count)]
    i = 0
    for offset in xrange(self.np):
        for _ in xrange(repeats):
            x[i % node_count].append(offset)
            i += 1
    option_dict = Counter(map(tuple, x))

    # Initialize variables/functions to optimize the topology.
    devpath_max = []
    devpaths = [()] * node_count
    if repeats > 1:
        _devpaths = [x[0].devpath for x in node_list]
        max_depth = min(map(len, _devpaths))
        depth = 0
        while 1:
            if depth < max_depth:
                depth += 1
                x = Counter(x[:depth] for x in _devpaths)
                n = len(x)
                x = set(x.itervalues())
                # TODO: Prove it works. If the code turns out to be:
                #       - too pessimistic, the topology is ignored when
                #         resiliency could be maximized;
                #       - or worse too optimistic, in which case this
                #         method raises, possibly after a very long time.
                if len(x) == 1 or max(x) * repeats <= node_count:
                    i, x = divmod(repeats, n)
                    devpath_max.append((i + 1, x) if x else (i, n))
                    if n < repeats:
                        continue
                    devpaths = [x[:depth] for x in _devpaths]
                    break
            logging.warning(
                "Can't maximize resiliency: fix the topology"
                " of your storage nodes and make sure they're all running."
                " %s storage device failure(s) may be enough to lose all"
                " the database." % (repeats - 1))
            break
    topology = [{} for _ in xrange(self.np)]

    def update_topology():
        for offset in option:
            n = topology[offset]
            for i, (j, k) in zip(devpath, devpath_max):
                try:
                    i, x = n[i]
                except KeyError:
                    n[i] = i, x = [0, {}]
                if i == j or i + 1 == j and k == sum(
                        1 for i in n.itervalues() if i[0] == j):
                    # Too many cells would be assigned at this topology
                    # node.
                    return False
                n = x
        # The topology may be optimal with this option. Apply it.
        for offset in option:
            n = topology[offset]
            for i in devpath:
                n = n[i]
                n[0] += 1
                n = n[1]
        return True

    def revert_topology():
        for offset in option:
            n = topology[offset]
            for i in devpath:
                n = n[i]
                n[0] -= 1
                n = n[1]

    # Strategies to find the "best" permutation of nodes.
    def node_options():
        # The second part of the key goes with the above cosmetic sort.
        option_list = sorted(option_dict, key=lambda x: (-len(x), x))
        # 1. Search for solution that does not cause extra replication.
        #    This is important because tweak() must do nothing if it's
        #    called a second time whereas the list of nodes hasn't changed.
        result = []
        for i, (_, cell_dict) in enumerate(node_list):
            option = {offset for offset, cell in cell_dict.iteritems()
                             if not cell.isFeeding()}
            x = filter(option.issubset, option_list)
            if not x:
                break
            result.append((i, x))
        else:
            yield result
        # 2. We have to move cells. Evaluating all options would have
        #    a complexity of O(node_count!), which is clearly too slow,
        #    so we use a heuristic.
        #    For each node, we compare the resulting amount of replication
        #    in the best (min_cost) and worst (max_cost) case, and we first
        #    iterate over nodes with the biggest difference. This minimizes
        #    the impact of bad allocation patterns for the last nodes.
        result = []
        np_complement = frozenset(xrange(self.np)).difference
        for i, (_, cell_dict) in enumerate(node_list):
            cost_list = []
            for x, option in enumerate(option_list):
                discard = [0, 0]
                for offset in np_complement(option):
                    cell = cell_dict.get(offset)
                    if cell:
                        discard[cell.isReadable()] += 1
                cost_list.append(((discard[1], discard[0]), x))
            cost_list.sort()
            min_cost = cost_list[0][0]
            max_cost = cost_list[-1][0]
            result.append((
                min_cost[0] - max_cost[0],
                min_cost[1] - max_cost[1],
                i, [option_list[x[1]] for x in cost_list]))
        result.sort()
        yield result

    # The main loop, which is where we evaluate options.
    new = []    # the solution
    stack = []  # data recursion

    def options():
        x = node_options[len(new)]
        return devpaths[x[-2]], iter(x[-1])

    for node_options in node_options():  # for each strategy
        devpath, iter_option = options()
        while 1:
            try:
                option = next(iter_option)
            except StopIteration:
                if new:
                    devpath, iter_option = stack.pop()
                    option = new.pop()
                    revert_topology()
                    option_dict[option] += 1
                    continue
                break
            if option_dict[option] and update_topology():
                new.append(option)
                if len(new) == node_count:
                    break
                stack.append((devpath, iter_option))
                devpath, iter_option = options()
                option_dict[option] -= 1
        if new:
            break
    else:
        raise AssertionError

    # Apply the solution.
    if self._id is None:
        self._id = 1
        self.num_filled_rows = self.np
        new_state = CellStates.UP_TO_DATE
    else:
        new_state = CellStates.OUT_OF_DATE
    changed_list = []
    outdated_list = [repeats] * self.np
    discard_list = defaultdict(list)
    for i, offset_list in enumerate(new):
        node, cell_dict = node_list[node_options[i][-2]]
        for offset in offset_list:
            cell = cell_dict.pop(offset, None)
            if cell is None:
                self.count_dict[node] += 1
                self.partition_list[offset].append(Cell(node, new_state))
                changed_list.append((offset, node.getUUID(), new_state))
            elif cell.isReadable():
                if cell.isFeeding():
                    cell.setState(CellStates.UP_TO_DATE)
                    changed_list.append((offset, node.getUUID(),
                                         CellStates.UP_TO_DATE))
                outdated_list[offset] -= 1
        for offset, cell in cell_dict.iteritems():
            discard_list[offset].append(cell)
    # NOTE: The following line disables the next 2 lines, which actually
    #       causes cells in drop_list to be discarded, now or later;
    #       drop_list could be renamed into ignore_list.
    #       1. Deleting data partition per partition is a lot of work, so
    #          why ask nodes in drop_list to do that when the goal is
    #          simply to trash the whole underlying database?
    #       2. By excluding nodes from a tweak, it becomes possible to have
    #          parts of the partition table that are tweaked differently.
    #          This may require to temporarily change the number of
    #          replicas for the part being tweaked. In the future, this
    #          number may be specified in the 'tweak' command, to avoid
    #          race conditions with setUpToDate().
    #       Overall, a common use case is when importing a ZODB to NEO,
    #       to keep the initial importing node up until the database is
    #       split and replicated to the final nodes.
    drop_list = {}
    for offset, drop_list in drop_list.iteritems():
        discard_list[offset] += drop_list
    # We have sorted cells to discard in order to first deallocate nodes
    # in drop_list, and have feeding cells in other nodes.
    # The following loop also makes sure not to discard cells too quickly,
    # by keeping a minimum of 'repeats' readable cells.
    for offset, outdated in enumerate(outdated_list):
        row = self.partition_list[offset]
        for cell in discard_list[offset]:
            if outdated and cell.isReadable():
                outdated -= 1
                if cell.isFeeding():
                    continue
                state = CellStates.FEEDING
                cell.setState(state)
            else:
                self.count_dict[cell.getNode()] -= 1
                state = CellStates.DISCARDED
                row.remove(cell)
            changed_list.append((offset, cell.getUUID(), state))

    assert self.operational(), changed_list
    return changed_list
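# Standalone sketch (not the method above) of the round-robin assignment the
# docstring describes: np partitions, each replicated 'repeats' times, are
# dealt to nodes so cell counts differ by at most 1 and consecutive partitions
# go to different nodes. Function and variable names are illustrative only.
from collections import Counter

def optimal_assignment(np, nr, node_count):
    repeats = min(nr + 1, node_count)
    per_node = [[] for _ in xrange(node_count)]
    i = 0
    for offset in xrange(np):
        for _ in xrange(repeats):
            per_node[i % node_count].append(offset)
            i += 1
    # multiset of partition sets; tweak() permutes nodes over these options
    return Counter(map(tuple, per_node))

# With np=10, nr=1 and 5 nodes, node 0 is assigned partitions (0, 2, 5, 7),
# i.e. the first column of the left diagram in the docstring.
print(optimal_assignment(10, 1, 5))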
def _handleConflicts(self, txn_context, tryToResolveConflict):
    result = []
    append = result.append
    # Check for conflicts
    data_dict = txn_context['data_dict']
    object_base_serial_dict = txn_context['object_base_serial_dict']
    object_serial_dict = txn_context['object_serial_dict']
    conflict_serial_dict = txn_context['conflict_serial_dict'].copy()
    txn_context['conflict_serial_dict'].clear()
    resolved_conflict_serial_dict = txn_context[
        'resolved_conflict_serial_dict']
    for oid, conflict_serial_set in conflict_serial_dict.iteritems():
        conflict_serial = max(conflict_serial_set)
        serial = object_serial_dict[oid]
        if ZERO_TID in conflict_serial_set:
            if 1:
                # XXX: disable deadlock avoidance code until it is fixed
                logging.info('Deadlock avoidance on %r:%r',
                             dump(oid), dump(serial))
                # 'data' parameter of ConflictError is only used to report the
                # class of the object. It doesn't matter if 'data' is None
                # because the transaction is too big.
                try:
                    data = data_dict[oid]
                except KeyError:
                    data = txn_context['cache_dict'][oid]
            else:
                # Storage refused us from taking object lock, to avoid a
                # possible deadlock. TID is actually used for some kind of
                # "locking priority": when a higher value has the lock,
                # this means we stored objects "too late", and we would
                # otherwise cause a deadlock.
                # To recover, we must ask storages to release locks we
                # hold (to let possibly-competing transactions acquire
                # them), and requeue our already-sent store requests.
                # XXX: currently, brute-force is implemented: we send
                #      object data again.
                # WARNING: not maintained code
                logging.info('Deadlock avoidance triggered on %r:%r',
                             dump(oid), dump(serial))
                for store_oid, store_data in data_dict.iteritems():
                    store_serial = object_serial_dict[store_oid]
                    if store_data is CHECKED_SERIAL:
                        self._checkCurrentSerialInTransaction(txn_context,
                            store_oid, store_serial)
                    else:
                        if store_data is None:
                            # Some undo
                            logging.warning('Deadlock avoidance cannot'
                                ' reliably work with undo, this must be'
                                ' implemented.')
                            conflict_serial = ZERO_TID
                            break
                        self._store(txn_context, store_oid, store_serial,
                                    store_data, unlock=True)
                else:
                    continue
        else:
            data = data_dict.pop(oid)
            if data is CHECKED_SERIAL:
                raise ReadConflictError(oid=oid,
                    serials=(conflict_serial, serial))
            # TODO: data can be None if a conflict happens during undo
            if data:
                txn_context['data_size'] -= len(data)
            resolved_serial_set = resolved_conflict_serial_dict.setdefault(
                oid, set())
            if resolved_serial_set and conflict_serial <= max(
                    resolved_serial_set):
                # A later serial has already been resolved, skip.
                resolved_serial_set.update(conflict_serial_set)
                continue
            try:
                new_data = tryToResolveConflict(oid, conflict_serial,
                    serial, data)
            except ConflictError:
                logging.info('Conflict resolution failed for '
                    '%r:%r with %r', dump(oid), dump(serial),
                    dump(conflict_serial))
            else:
                logging.info('Conflict resolution succeeded for '
                    '%r:%r with %r', dump(oid), dump(serial),
                    dump(conflict_serial))
                # Mark this conflict as resolved
                resolved_serial_set.update(conflict_serial_set)
                # Base serial changes too, as we resolved a conflict
                object_base_serial_dict[oid] = conflict_serial
                # Try to store again
                self._store(txn_context, oid, conflict_serial, new_data)
                append(oid)
                continue
        # With recent ZODB, get_pickle_metadata (from ZODB.utils) does
        # not support empty values, so do not pass 'data' in this case.
        raise ConflictError(oid=oid, serials=(conflict_serial, serial),
                            data=data or None)
    return result
def _import(self):
    p64 = util.p64
    u64 = util.u64
    tid = p64(self.zodb_tid + 1)
    zodb_list = []
    for zodb in self.zodb:
        try:
            zodb_list.append(ZODBIterator(zodb, tid, p64(self.zodb_ltid)))
        except StopIteration:
            pass
    tid = None

    def finish():
        if tid:
            self.storeTransaction(tid, object_list,
                ((x[0] for x in object_list),
                 str(txn.user), str(txn.description),
                 cPickle.dumps(txn.extension), False, tid), False)
            self.releaseData(data_id_list)
            logging.debug("TXN %s imported (user=%r, desc=%r, len(oid)=%s)",
                util.dump(tid), txn.user, txn.description, len(object_list))
            del object_list[:], data_id_list[:]
            if self._last_commit + 1 < time.time():
                self.commit()
            self.zodb_tid = u64(tid)

    if self.compress:
        from zlib import compress
    else:
        compress = None
    compression = 0
    object_list = []
    data_id_list = []
    while zodb_list:
        zodb_list.sort()
        z = zodb_list[0]
        # Merge transactions with same tid. Only
        # user/desc/ext from first ZODB are kept.
        if tid != z.tid:
            finish()
            txn = z.transaction
            tid = txn.tid
            yield 1
        zodb = z.zodb
        for r in z.transaction:
            oid = p64(u64(r.oid) + zodb.shift_oid)
            data_tid = r.data_txn
            if data_tid or r.data is None:
                data_id = None
            else:
                data = zodb.repickle(r.data)
                if compress:
                    compressed_data = compress(data)
                    compression = len(compressed_data) < len(data)
                    if compression:
                        data = compressed_data
                checksum = util.makeChecksum(data)
                data_id = self.holdData(checksum, data, compression)
                data_id_list.append(data_id)
            object_list.append((oid, data_id, data_tid))
            # Give the main loop the opportunity to process requests
            # from other nodes. In particular, clients may commit. If the
            # storage node exits after such commit, and before we actually
            # update 'obj' with 'object_list', some rows in 'data' may be
            # unreferenced. This is not a problem because the leak is
            # solved when resuming the migration.
            yield 1
        try:
            z.next()
        except StopIteration:
            del zodb_list[0]
    self._last_commit = 0
    finish()
    logging.warning("All data are imported. You should change"
        " your configuration to use the native backend and restart.")
    self._import = None
    for x in """getObject getReplicationTIDList
             """.split():
        setattr(self, x, getattr(self.db, x))
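# Hedged sketch of the merge step above: several source iterators, each sorted
# by tid, are repeatedly re-sorted and the smallest tid is imported next, so
# transactions interleave globally in tid order. The equivalent standalone
# logic on plain lists of (tid, payload) pairs (the real code additionally
# merges same-tid transactions coming from different ZODBs):
import heapq

def merge_by_tid(*streams):
    # heapq.merge yields items from all sorted inputs in global sorted order,
    # which is what the zodb_list.sort() / zodb_list[0] loop achieves above.
    for tid, payload in heapq.merge(*streams):
        yield tid, payload

_a = [(1, 'a1'), (4, 'a4')]
_b = [(2, 'b2'), (3, 'b3')]
print(list(merge_by_tid(_a, _b)))  # [(1,'a1'), (2,'b2'), (3,'b3'), (4,'a4')]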
def playPrimaryRole(self):
    logging.info('play the primary role with %r', self.listening_conn)
    self.master_address_dict.clear()
    em = self.em
    packet = Packets.AnnouncePrimary()
    for conn in em.getConnectionList():
        if conn.isListening():
            conn.setHandler(identification.IdentificationHandler(self))
        else:
            conn.notify(packet)
            # Primary master should rather establish connections to all
            # secondaries, rather than the other way around. This requires
            # a bit more work when a new master joins a cluster but makes
            # it easier to resolve UUID conflicts with minimal cluster
            # impact, and ensure primary master unicity (primary masters
            # become noisy, in that they actively try to maintain
            # connections to all other master nodes, so duplicate
            # primaries will eventually get in touch with each other and
            # resolve the situation with a duel).
            # TODO: only abort client connections, don't close server
            # connections as we want to have them in the end. Secondary
            # masters will reconnect nevertheless, but it's dirty.
            # Currently, it's not trivial to preserve connected nodes,
            # because of poor node status tracking during election.
            conn.abort()

    # If I know any storage node, make sure that they are not in the
    # running state, because they are not connected at this stage.
    for node in self.nm.getStorageList():
        if node.isRunning():
            node.setTemporarilyDown()

    if self.uuid is None:
        self.uuid = self.getNewUUID(None, self.server, NodeTypes.MASTER)
        logging.info('My UUID: %s', uuid_str(self.uuid))
    else:
        in_conflict = self.nm.getByUUID(self.uuid)
        if in_conflict is not None:
            logging.warning('UUID conflict at election exit with %r',
                            in_conflict)
            in_conflict.setUUID(None)

    # Do not restart automatically if ElectionFailure is raised, in order
    # to avoid a split of the database. For example, with 2 machines with
    # a master and a storage on each one and replicas=1, the secondary
    # master becomes primary in case of network failure between the 2
    # machines but must not start automatically: otherwise, each storage
    # node would diverge.
    self._startup_allowed = False
    try:
        while True:
            self.runManager(RecoveryManager)
            try:
                self.runManager(VerificationManager)
                if not self.backup_tid:
                    self.provideService()
                    # self.provideService only returns without raising
                    # when switching to backup mode.
                if self.backup_app is None:
                    raise RuntimeError("No upstream cluster to backup"
                                       " defined in configuration")
                truncate = Packets.Truncate(
                    self.backup_app.provideService())
            except StoppedOperation, e:
                logging.critical('No longer operational')
                truncate = Packets.Truncate(*e.args) if e.args else None
                # Automatic restart except if we truncate or retry to.
                self._startup_allowed = not (self.truncate_tid or truncate)
            node_list = []
            for node in self.nm.getIdentifiedList():
                if node.isStorage() or node.isClient():
                    conn = node.getConnection()
                    conn.notify(Packets.StopOperation())
                    if node.isClient():
                        conn.abort()
                        continue
                    if truncate:
                        conn.notify(truncate)
                    if node.isRunning():
                        node.setPending()
                        node_list.append(node)
            self.broadcastNodesInformation(node_list)
    except StateChangedException, e:
        assert e.args[0] == ClusterStates.STOPPING
        self.shutdown()
def provideService(self):
    logging.info('provide backup')
    poll = self.em.poll
    app = self.app
    pt = app.pt
    while True:
        app.changeClusterState(ClusterStates.STARTING_BACKUP)
        bootstrap = BootstrapManager(self, self.name, NodeTypes.CLIENT)
        # {offset -> node}
        self.primary_partition_dict = {}
        # [[tid]]
        self.tid_list = tuple([] for _ in xrange(pt.getPartitions()))
        try:
            while True:
                for node in pt.getNodeSet(readable=True):
                    if not app.isStorageReady(node.getUUID()):
                        break
                else:
                    break
                poll(1)
            node, conn, uuid, num_partitions, num_replicas = \
                bootstrap.getPrimaryConnection()
            try:
                app.changeClusterState(ClusterStates.BACKINGUP)
                del bootstrap, node
                if num_partitions != pt.getPartitions():
                    raise RuntimeError("inconsistent number of partitions")
                self.pt = PartitionTable(num_partitions, num_replicas)
                conn.setHandler(BackupHandler(self))
                conn.ask(Packets.AskNodeInformation())
                conn.ask(Packets.AskPartitionTable())
                conn.ask(Packets.AskLastTransaction())
                # debug variable to log how big 'tid_list' can be.
                self.debug_tid_count = 0
                while True:
                    poll(1)
            except PrimaryFailure, msg:
                logging.error('upstream master is down: %s', msg)
            finally:
                app.backup_tid = pt.getBackupTid()
                try:
                    conn.close()
                except PrimaryFailure:
                    pass
                try:
                    del self.pt
                except AttributeError:
                    pass
        except StateChangedException, e:
            if e.args[0] != ClusterStates.STOPPING_BACKUP:
                raise
            app.changeClusterState(*e.args)
            tid = app.backup_tid
            # Wait for non-primary partitions to catch up,
            # so that all UP_TO_DATE cells are really UP_TO_DATE.
            # XXX: Another possibility could be to outdate such cells, and
            #      they would be quickly updated at the beginning of the
            #      RUNNING phase. This may simplify code.
            # Any unfinished replication from upstream will be truncated.
            while pt.getBackupTid(min) < tid:
                poll(1)
            last_tid = app.getLastTransaction()
            handler = EventHandler(app)
            if tid < last_tid:
                assert tid != ZERO_TID
                logging.warning("Truncating at %s (last_tid was %s)",
                    dump(app.backup_tid), dump(last_tid))
            else:
                # We will do a dummy truncation, just to leave backup mode,
                # so it's fine to start automatically if there's any
                # missing storage.
                # XXX: Consider using another method to leave backup mode,
                #      at least when there's nothing to truncate. Because
                #      in case of StoppedOperation during VERIFYING state,
                #      this flag will be wrongly set to False.
                app._startup_allowed = True
            # If any error happened before reaching this line, we'd go back
            # to backup mode, which is the right mode to recover.
            del app.backup_tid
            # Now back to RECOVERY...
            return tid
def connectionLost(self, conn, new_state):
    logging.warning('A connection was lost during identification')