def start_instances(self, request, queryset):
    """
    Start all transmitted PostgreSQL instances

    This function assumes we're running against a bunch of Debian-based
    systems, so we can use pg_ctlcluster. Thus far, that's the case.
    Later revisions may change that assumption. Skip already running
    services.
    """
    for inst in queryset:
        if inst.is_online:
            self.message_user(request, "%s is already running." % inst,
                messages.WARNING
            )
            continue

        try:
            util = PGUtility(inst)
            util.start()
        except Exception as e:
            self.message_user(request, "%s : %s" % (e, inst), messages.ERROR)
            continue

        self.message_user(request, "%s started!" % inst)
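# The actions in this module all defer to PGUtility for the actual service
# control, and that class isn't part of this excerpt. As a rough
# illustration only: a minimal sketch of how start/stop/reload might drive
# Debian's pg_ctlcluster over SSH. The paramiko transport, the 'main'
# cluster name, and the error handling are all assumptions.

import paramiko

class PGUtilitySketch(object):
    """Hypothetical stand-in for PGUtility; not the real implementation."""

    def __init__(self, instance):
        self.instance = instance

    def _remote(self, command):
        # Run a single command on the instance's server over SSH.
        client = paramiko.SSHClient()
        client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        client.connect(self.instance.server.hostname)
        try:
            stdin, stdout, stderr = client.exec_command(command)
            err = stderr.read()
            if err:
                raise Exception(err)
        finally:
            client.close()

    def _ctl(self, action):
        # pg_ctlcluster <version> <cluster> <action> is Debian's wrapper
        # around pg_ctl, as noted in the start_instances docstring.
        self._remote('pg_ctlcluster %s main %s' %
                     (self.instance.version, action))

    def start(self):
        self._ctl('start')

    def stop(self):
        self._ctl('stop')

    def reload(self):
        self._ctl('reload')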
def save_model(self, request, obj, form, change):
    """
    Automatically detect/populate several fields before saving instance

    Since we're defining what is (hopefully) an existing structure, we
    should be able to auto-detect several elements from the database
    itself. There can also be a backend monitor on each server that will
    keep these values updated, but bootstrapping is always best.

    Autodetected fields:
      * is_online
      * master
      * version
    """

    # First, check the online status. We want this to be as fresh as
    # possible, so we might as well grab it now.

    obj.is_online = False
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    check = sock.connect_ex((obj.server.hostname, obj.herd.db_port))
    if check == 0:
        obj.is_online = True

    # Then, since herds are organized such that each herd follows a single
    # primary node, we can auto-declare whether this is a replica or not.
    # If we search and find a primary for this herd, that instance will
    # become our master.

    util = PGUtility(obj)
    obj.master = util.get_herd_primary()
    obj.version = util.get_version()

    if obj.master and not obj.version:
        obj.version = obj.master.version

    # Save now that we've hijacked everything.

    obj.save()

    # Attempt to initialize the instance. This only works if the instance
    # doesn't already exist. It's also optional, so don't derail the save
    # just because it didn't fully work.

    try:
        util.init_missing()
    except Exception as e:
        self.message_user(request, "Instance init: %s" % str(e),
            messages.WARNING
        )
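# A caveat with the connect_ex() probe in save_model(): with no timeout
# set, a host that silently drops packets can stall the save for the OS
# default TCP timeout, and the socket is never closed. A small hardening
# sketch using only the standard library:

import socket

def port_is_open(hostname, port, timeout=3):
    """Return True if a TCP connection to hostname:port succeeds quickly."""
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    sock.settimeout(timeout)  # fail fast on unreachable hosts
    try:
        return sock.connect_ex((hostname, port)) == 0
    finally:
        sock.close()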
def reload_instances(self, request, queryset):
    """
    Reload all transmitted PostgreSQL instances

    This is provided as a way of reloading configuration files.
    """
    for inst in queryset:
        try:
            util = PGUtility(inst)
            util.reload()
        except Exception as e:
            self.message_user(request, "%s : %s" % (e, inst), messages.ERROR)
            continue

        self.message_user(request, "%s config files reloaded!" % inst)
def promote_instances(self, request, queryset):
    """
    Promote transmitted PostgreSQL replication instances to master state
    """
    if request.POST.get('post') == 'yes':
        for inst_id in request.POST.getlist(admin.ACTION_CHECKBOX_NAME):
            inst = Instance.objects.get(pk=inst_id)

            try:
                util = PGUtility(inst)
                util.promote()
            except Exception as e:
                self.message_user(request, "%s : %s" % (e, inst), messages.ERROR)
                continue

            self.message_user(request, "%s promoted to read/write!" % inst)

        return
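# Note that promote_instances() only acts once the confirmation form posts
# back with post=yes; the branch that first renders that form isn't shown
# in this excerpt. It presumably mirrors the pattern failover_pair() uses
# below. A minimal sketch, with a hypothetical template path:
#
#   if request.POST.get('post') != 'yes':
#       return render(request, 'admin/haas/instance/promote.html', {
#           'queryset': queryset,
#           'opts': self.model._meta,
#           'action_checkbox_name': admin.ACTION_CHECKBOX_NAME,
#       })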
def restart_instances(self, request, queryset):
    """
    Restart all transmitted PostgreSQL instances

    Basically we just call for a fast stop followed by a start. Nothing
    complicated here. Unlike stop, we don't skip stopped instances, and
    unlike start, we don't skip running ones.
    """
    for inst in queryset:
        try:
            util = PGUtility(inst)
            util.stop()
            util.start()
        except Exception as e:
            self.message_user(request, "%s : %s" % (e, inst), messages.ERROR)
            continue

        self.message_user(request, "%s restarted!" % inst)
def demote_instances(self, request, queryset):
    """
    Demote selected instances back into streaming herd replicas

    Given a node is a primary, meaning at one point it was promoted, we
    probably eventually want to convert it back. This encapsulates that
    process and works for several selected primaries. Instances which are
    the only primary in the herd are automatically pruned from the select
    list. This check is performed both before *and* after the confirmation
    form, in case the only masters from a single herd are all selected.
    """
    if request.POST.get('post') == 'yes':

        # Iterate through every submitted instance and call the utility
        # to demote each. It should perform the check logic that ensures
        # we always have at least one remaining master in the herd.

        for inst_id in request.POST.getlist(admin.ACTION_CHECKBOX_NAME):
            inst = Instance.objects.get(pk=inst_id)

            try:
                util = PGUtility(inst)
                util.demote()
            except Exception as e:
                self.message_user(request, "%s : %s" % (e, inst), messages.ERROR)
                continue

            host = inst.server.hostname
            herd = inst.herd
            self.message_user(request, "%s demoted to %s replica!" % (host, herd))

        return
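# The demote() implementation isn't part of this excerpt. On the
# PostgreSQL versions this era of tooling targets (anything before 12,
# where recovery.conf still exists), returning a promoted node to replica
# duty generally means writing a fresh recovery.conf pointed at the herd's
# primary and restarting -- usually after a resync, since the timelines
# have diverged. A hypothetical recovery.conf a demote might install,
# with placeholder connection values:
#
#   standby_mode = 'on'
#   primary_conninfo = 'host=herd-vhost.example.com port=5432 user=replication'
#   recovery_target_timeline = 'latest'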
def stop_instances(self, request, queryset):
    """
    Stop all transmitted PostgreSQL instances

    Skip already stopped services.
    """
    for inst in queryset:
        if not inst.is_online:
            self.message_user(request, "%s is already stopped." % inst,
                messages.WARNING
            )
            continue

        try:
            util = PGUtility(inst)
            util.stop()
        except Exception as e:
            self.message_user(request, "%s : %s" % (e, inst), messages.ERROR)
            continue

        self.message_user(request, "%s stopped!" % inst)
def rebuild_instances(self, request, queryset):
    """
    Rebuild all transmitted PostgreSQL replication instances from master
    """

    # If we should be rebuilding an instance, connect to the host,
    # ensure the instance is stopped, and sync the data directories
    # through rsync + ssh.

    if request.POST.get('post') == 'yes':
        for inst_id in request.POST.getlist(admin.ACTION_CHECKBOX_NAME):
            inst = Instance.objects.get(pk=inst_id)

            try:
                util = PGUtility(inst)
                util.master_sync()
            except Exception as e:
                self.message_user(request, "%s : %s" % (e, inst), messages.ERROR)
                continue

            self.message_user(request, "%s rebuilt!" % inst)

        return
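# master_sync() is likewise external to this excerpt. Per the comment
# above, a rebuild amounts to: stop the replica, copy the primary's data
# directory via rsync over SSH, restore recovery.conf, and start back up.
# A rough sketch of the copy step; the data_dir default is a guess at a
# Debian cluster layout, and SSH keys are assumed to be in place.

import subprocess

def sync_from_master(master_host, data_dir='/var/lib/postgresql/9.4/main'):
    """Hypothetical core of master_sync(): pull the primary's data dir."""
    subprocess.check_call([
        'rsync', '-a', '--delete',
        '--exclude=postmaster.pid',   # never copy the primary's pid file
        '--exclude=recovery.conf',    # the replica keeps its own copy
        '%s:%s/' % (master_host, data_dir),
        data_dir,
    ])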
class DRAdmin(SharedInstanceAdmin):
    actions = ['failover_pair', 'rebuild_instances']
    list_display = ('herd', 'container', 'mb_lag', 'vhost')
    list_filter = ('herd__environment',)
    search_fields = ('herd__herd_name', 'server__hostname', 'vhost')
    list_display_links = None
    can_delete = False

    def has_add_permission(self, request):
        return False

    def get_actions(self, request):
        """
        Remove Unused Actions From Parent Class

        Though we inherit quite a lot from the Instance admin menu, we
        don't need most of the actions. So we'll throw away the ones we
        didn't explicitly include.
        """
        actions = super(DRAdmin, self).get_actions(request)

        for key in list(actions.keys()):
            if key not in self.actions:
                del actions[key]

        return actions

    def container(self, instance):
        return instance.server.hostname
    container.short_description = 'DR Container'
    container.admin_order_field = 'server__hostname'

    def failover_pair(self, request, queryset):
        """
        Promote a Herd Follower to Leader Status

        This process is fairly complicated, and comes in several parts:

        1. Stop the current primary node. This ensures only the secondary
           can accept new data.
        2. Promote the top follower to read/write status. This essentially
           makes it the new leader of the herd.
        3. Assign the follower as the new stream source to the old primary.
           This officially swaps the roles of the two nodes. Note that the
           new follower is still out of sync with the new leader. This will
           require a separate node rebuild step to rectify.
        4. Move the declared virtual host to the new leader.
        5. Reassign all replicas to follow the new leader. We do this last
           because it relies on DNS propagation, and pushing a reload after
           that step implies a reconnection.
        """

        # Go to the confirmation form. As usual, this is fairly important,
        # so make sure the template is extremely descriptive regarding the
        # failover process.

        if request.POST.get('post') != 'yes':
            return render(request, 'admin/haas/disasterrecovery/failover.html', {
                'queryset': queryset,
                'opts': self.model._meta,
                'action_checkbox_name': admin.ACTION_CHECKBOX_NAME,
            })

        # Since the form has been submitted, start swapping DR pairs.

        for dr_id in request.POST.getlist(admin.ACTION_CHECKBOX_NAME):
            newb = Instance.objects.get(pk=dr_id)
            sage = newb.master

            # Start with the transfer: stop -> promote -> alter.
            # Add in a short pause between to allow xlog propagation.

            try:
                sage_util = PGUtility(sage)
                newb_util = PGUtility(newb)

                sage_util.stop()
                sleep(5)
                newb_util.promote()
                sage.master = newb
                sage.save()
            except Exception as e:
                self.message_user(request, "%s : %s" % (e, newb), messages.ERROR)
                continue

            # Now update the DNS. We'll just use the basic dnspython
            # module and load it with nameserver defaults. That should
            # be more than enough to propagate this change.

            try:
                def_dns = dns.resolver.get_default_resolver()
                new_dns = dns.update.Update(str(def_dns.domain).rstrip('.'))
                new_dns.delete(str(newb.herd.vhost), 'cname')
                new_dns.add(str(newb.herd.vhost), '300', 'cname',
                            str(newb.server.hostname))

                for ns in def_dns.nameservers:
                    dns.query.tcp(new_dns, ns)
            except Exception as e:
                self.message_user(request, "%s : %s" % (e, newb), messages.ERROR)
                continue

            # Now we should get the list of all replica instances in this
            # herd, which should include the old primary. We just need to
            # update the recovery.conf file and reload the instance.

            try:
                herd = Instance.objects.filter(master_id__isnull=False,
                                               herd_id=newb.herd_id)

                for member in herd:
                    member.master = newb
                    member.save()
                    util = PGUtility(member)
                    util.update_stream_config()
                    util.reload()
            except Exception as e:
                self.message_user(request, "%s : %s" % (e, newb), messages.ERROR)
                continue
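# None of the above is reachable until the admin classes are registered
# with Django. A sketch of that registration; the DR model name is
# inferred from the 'admin/haas/disasterrecovery/' template path and may
# differ in the real project:
#
#   admin.site.register(DisasterRecovery, DRAdmin)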