def _run(self):
    t = None
    if self._memory_params:
        t = AbortSnapshot(self._vm, self._job_uuid, self._start_time,
                          self._timeout, self._abort, self._completed,
                          self._snapshot_job, self._lock)
        t.start()
    try:
        self._vm.log.info('Starting snapshot job')
        if self._recovery:
            LiveSnapshotRecovery(self._vm, self._abort, self._completed,
                                 self._snapshot_job, self._lock).run()
        else:
            snap = Snapshot(self._vm, self._snap_drives, self._memory_params,
                            self._frozen, self._job_uuid, self._abort,
                            self._completed, self._start_time, self._timeout,
                            self._snapshot_job, self._lock,
                            self._freeze_timeout)
            snap.snapshot()
    except:
        # Set the abort flag in case the snapshot job failed before the
        # snapshot was started in libvirt, causing the AbortSnapshot thread
        # to finish. This is also safe for recovery, since it is saved to
        # the VM's metadata. The engine treats abort and failure the same.
        _set_abort(self._vm, self._snapshot_job, self._completed,
                   self._abort, self._lock)
        # We need to raise an exception in order to make the job framework
        # report the current job as a failure.
        raise exception.SnapshotFailed()
    finally:
        if self._memory_params:
            t.join()
def snapshot(self):
    """Live snapshot command"""

    def norm_snap_drive_params(drive):
        """Normalize snapshot parameters"""

        if "baseVolumeID" in drive:
            base_drv = {
                "device": "disk",
                "domainID": drive["domainID"],
                "imageID": drive["imageID"],
                "volumeID": drive["baseVolumeID"]
            }
            target_drv = base_drv.copy()
            target_drv["volumeID"] = drive["volumeID"]
        elif "baseGUID" in drive:
            base_drv = {"GUID": drive["baseGUID"]}
            target_drv = {"GUID": drive["GUID"]}
        elif "baseUUID" in drive:
            base_drv = {"UUID": drive["baseUUID"]}
            target_drv = {"UUID": drive["UUID"]}
        else:
            base_drv, target_drv = (None, None)

        return base_drv, target_drv

    def rollback_drives(new_drives):
        """Rollback the prepared volumes for the snapshot"""

        for vm_dev_name, drive in new_drives.items():
            try:
                self._vm.cif.teardownVolumePath(drive)
            except Exception:
                self._vm.log.exception("Unable to teardown drive: %s",
                                       vm_dev_name)

    def memory_snapshot(memory_volume_path):
        """Libvirt snapshot XML"""

        return vmxml.Element('memory',
                             snapshot='external',
                             file=memory_volume_path)

    def vm_conf_for_memory_snapshot():
        """Returns the needed vm configuration with the memory snapshot"""

        return {
            'restoreFromSnapshot': True,
            '_srcDomXML': self._vm.migratable_domain_xml(),
            'elapsedTimeOffset': time.time() - self._vm.start_time
        }

    snap = vmxml.Element('domainsnapshot')
    disks = vmxml.Element('disks')
    new_drives = {}
    vm_drives = {}

    for drive in self._snap_drives:
        base_drv, tget_drv = norm_snap_drive_params(drive)

        try:
            self._vm.findDriveByUUIDs(tget_drv)
        except LookupError:
            # The vm is not already using the requested volume for the
            # snapshot, continuing.
            pass
        else:
            # The snapshot volume is the current one, skipping
            self._vm.log.debug("The volume is already in use: %s", tget_drv)
            continue  # Next drive

        try:
            vm_drive = self._vm.findDriveByUUIDs(base_drv)
        except LookupError:
            # The volume we want to snapshot doesn't exist
            self._vm.log.error("The base volume doesn't exist: %s", base_drv)
            raise exception.SnapshotFailed()

        if vm_drive.hasVolumeLeases:
            self._vm.log.error('disk %s has volume leases', vm_drive.name)
            raise exception.SnapshotFailed()

        if vm_drive.transientDisk:
            self._vm.log.error('disk %s is a transient disk', vm_drive.name)
            raise exception.SnapshotFailed()

        vm_dev_name = vm_drive.name

        new_drives[vm_dev_name] = tget_drv.copy()
        new_drives[vm_dev_name]["type"] = "disk"
        new_drives[vm_dev_name]["diskType"] = vm_drive.diskType
        new_drives[vm_dev_name]["poolID"] = vm_drive.poolID
        new_drives[vm_dev_name]["name"] = vm_dev_name
        new_drives[vm_dev_name]["format"] = "cow"

        # We need to keep track of the drive object because it keeps the
        # original data and is used to generate the snapshot element.
        # We keep the old volume ID so we can clear the block threshold.
        vm_drives[vm_dev_name] = (vm_drive, base_drv["volumeID"])

    prepared_drives = {}

    for vm_dev_name, vm_device in new_drives.items():
        # Add the device before requesting to prepare it, so we are sure
        # to tear it down even when prepareVolumePath fails for some
        # unknown issue that left the volume active.
        prepared_drives[vm_dev_name] = vm_device
        try:
            new_drives[vm_dev_name]["path"] = \
                self._vm.cif.prepareVolumePath(new_drives[vm_dev_name])
        except Exception:
            self._vm.log.exception('unable to prepare the volume path for '
                                   'disk %s', vm_dev_name)
            rollback_drives(prepared_drives)
            raise exception.SnapshotFailed()

        drive, _ = vm_drives[vm_dev_name]
        snapelem = drive.get_snapshot_xml(vm_device)
        disks.appendChild(snapelem)

    snap.appendChild(disks)

    snap_flags = (libvirt.VIR_DOMAIN_SNAPSHOT_CREATE_REUSE_EXT |
                  libvirt.VIR_DOMAIN_SNAPSHOT_CREATE_NO_METADATA)

    if self._memory_params:
        # Save the needed vm configuration
        # TODO: this, as other places that use pickle.dump directly to
        # files, should be done with outOfProcess
        vm_conf_vol = self._memory_params['dstparams']
        vm_conf_vol_path = self._vm.cif.prepareVolumePath(vm_conf_vol)
        try:
            with open(vm_conf_vol_path, "rb+") as f:
                vm_conf = vm_conf_for_memory_snapshot()
                # protocol=2 is needed for clusters < 4.4
                # (for Python 2 host compatibility)
                data = pickle.dumps(vm_conf, protocol=2)

                # Ensure that the volume is aligned; qemu-img may segfault
                # when converting unaligned images.
                # https://bugzilla.redhat.com/1649788
                aligned_length = utils.round(len(data), 4096)
                data = data.ljust(aligned_length, b"\0")

                f.write(data)
                f.flush()
                os.fsync(f.fileno())
        finally:
            self._vm.cif.teardownVolumePath(vm_conf_vol)

        # Add the memory volume to the snapshot xml
        memory_vol = self._memory_params['dst']
        memory_vol_path = self._vm.cif.prepareVolumePath(memory_vol)
        snap.appendChild(memory_snapshot(memory_vol_path))
    else:
        memory_vol = memory_vol_path = None
        snap_flags |= libvirt.VIR_DOMAIN_SNAPSHOT_CREATE_DISK_ONLY

    snapxml = xmlutils.tostring(snap)
    # TODO: this is debug information. For 3.6.x we still need to
    # see the XML even with 'info' as default level.
    self._vm.log.info("%s", snapxml)

    self._snapshot_job['memoryVolPath'] = memory_vol_path
    self._snapshot_job['memoryVol'] = memory_vol
    self._snapshot_job['newDrives'] = new_drives
    vm_drives_serialized = {}
    for k, v in vm_drives.items():
        vm_drives_serialized[k] = [xmlutils.tostring(v[0].getXML()), v[1]]
    self._snapshot_job['vmDrives'] = vm_drives_serialized
    _write_snapshot_md(self._vm, self._snapshot_job, self._lock)

    # We need to stop the volume monitor for two reasons: to prevent
    # spurious libvirt errors about missing drive paths (since we're
    # changing them), and to avoid triggering a drive extension for the
    # new volume with the apparent size of the old one (the apparentsize
    # is updated as the last step in updateDriveParameters).
    self._vm.volume_monitor.disable()

    try:
        if self._should_freeze:
            self._vm.freeze()
        if not self._memory_params:
            run_time = _running_time(self._start_time)
            if run_time > self._freeze_timeout:
                self._vm.log.error(
                    "Non-memory snapshot timeout %s passed after %s "
                    "seconds", self._freeze_timeout, run_time)
                raise exception.SnapshotFailed()

        self._vm.log.info(
            "Taking a live snapshot (drives=%s, memory=%s)",
            ', '.join(drive["name"] for drive in new_drives.values()),
            self._memory_params is not None)
        try:
            self._vm.run_dom_snapshot(snapxml, snap_flags)
        except libvirt.libvirtError as e:
            if e.get_error_code() == libvirt.VIR_ERR_OPERATION_ABORTED:
                self_abort = self._abort.is_set()
                with self._lock:
                    self._abort.set()
                    self._snapshot_job['abort'] = self._abort.is_set()
                _set_abort(self._vm, self._snapshot_job, self._completed,
                           self._abort, self._lock)
                if self_abort:
                    self._vm.log.info("Snapshot timeout reached,"
                                      " operation aborted")
                else:
                    self._vm.log.warning(
                        "Snapshot operation aborted by libvirt: %s",
                        e.get_error_message())
            self._vm.log.exception("Unable to take snapshot")
            if self._abort.is_set():
                # This will cause a jump into finalize_vm. The abort is
                # set and finalize_vm will raise an ActionStopped
                # exception as well. This is an indicator to the Jobs
                # framework signaling a client abort of the job.
                raise exception.ActionStopped()
            self._thaw_vm()
            raise exception.SnapshotFailed()

        _set_completed(self._vm, self._snapshot_job, self._completed,
                       self._abort, self._lock)
        if self._completed.is_set():
            _write_snapshot_md(self._vm, self._snapshot_job, self._lock)
            self._vm.log.info("Completed live snapshot")
    except:
        # In case the VM was shut down in the middle of the snapshot
        # operation we keep finalizing and report the failure. Or, when
        # the job was aborted, finalize_vm will raise an ActionStopped
        # exception to signal it was aborted by the user (VDSM).
        self.finalize_vm(memory_vol)
        res = False
    else:
        res = self.teardown(memory_vol_path, memory_vol, new_drives,
                            vm_drives)
    if not res:
        raise RuntimeError("Failed to execute snapshot, "
                           "considering the operation as failure")
'unexpected': exception.UnexpectedError().response(),
'unsupFormat': exception.UnsupportedImageFormat().response(),
'ticketErr': exception.SpiceTicketError().response(),
'nonresp': exception.NonResponsiveGuestAgent().response(),
# codes 20-35 are reserved for add/delNetwork
# code 39 was used for:
# wrongHost - migration destination has an invalid hostname
'unavail': exception.ResourceUnavailable().response(),
'changeDisk': exception.ChangeDiskFailed().response(),
'destroyErr': exception.VMDestroyFailed().response(),
'fenceAgent': exception.UnsupportedFenceAgent().response(),
'noimpl': exception.MethodNotImplemented().response(),
'hotplugDisk': exception.HotplugDiskFailed().response(),
'hotunplugDisk': exception.HotunplugDiskFailed().response(),
'migCancelErr': exception.MigrationCancelationFailed().response(),
'snapshotErr': exception.SnapshotFailed().response(),
'hotplugNic': exception.HotplugNicFailed().response(),
'hotunplugNic': exception.HotunplugNicFailed().response(),
'migInProgress': exception.MigrationInProgress().response(),
'mergeErr': exception.MergeFailed().response(),
'balloonErr': exception.BalloonError().response(),
'momErr': exception.MOMPolicyUpdateFailed().response(),
'replicaErr': exception.ReplicaError().response(),
'updateDevice': exception.UpdateDeviceFailed().response(),
'hwInfoErr': exception.CannotRetrieveHWInfo().response(),
'resizeErr': exception.BadDiskResizeParameter().response(),
'transientErr': exception.TransientError().response(),
'setNumberOfCpusErr': exception.SetNumberOfCpusFailed().response(),
'haErr': exception.SetHAPolicyFailed().response(),
'cpuTuneErr': exception.CpuTuneError().response(),
'updateVmPolicyErr': exception.UpdateVMPolicyFailed().response(),