def _refresh_rt(self):
    """Pick the ReplicaTrail whose snapshot is the parent of the next send.

    For incremental sends, the receiver tells us the latest successful
    snapshot on it (self.rlatest_snap). This should match self.rt in most
    cases. Sometimes, it may not be the one referred to by self.rt (latest)
    but a previous one. We need to make sure to *only* send the incremental
    send that the receiver expects.

    self.msg is updated along the way so that it describes the step that
    was in progress if something fails.

    :return: the ReplicaTrail to use as the parent for an incremental
        send, or None when a full send is required instead.
    :raises Exception: when the receiver named a snapshot for which no
        succeeded trail exists, or whose snapshot is not on the system.
    """
    self.msg = ('Failed to validate/refresh ReplicaTrail.')
    if (self.rlatest_snap is None):
        # Validate/update self.rt to the one that has the expected Snapshot
        # on the system. Walk the succeeded trails newest first.
        for rt in ReplicaTrail.objects.filter(
                replica=self.replica,
                status='succeeded').order_by('-id'):
            # Check the snapshot of the trail being examined (rt), not of
            # self.rt; otherwise every iteration tests the same path and
            # the returned trail may not match a snapshot on disk.
            snap_path = ('%s%s/.snapshots/%s/%s' %
                         (settings.MNT_PT, self.replica.pool,
                          self.replica.share, rt.snap_name))
            if (is_subvol(snap_path)):
                return rt
        # Snapshots from previous succeeded ReplicaTrails don't actually
        # exist on the system. So we send a Full replication instead of
        # incremental.
        return None

    if (len(self.rlatest_snap) == 0):
        # Receiver sends empty string when it fails to reply back to an
        # incremental send request with an appropriate parent snapshot
        # name.
        return None

    if (self.rt.snap_name != self.rlatest_snap):
        self.msg = ('Mismatch on starting snapshot for '
                    'btrfs-send. Sender picked %s but Receiver wants '
                    '%s, which takes precedence.' %
                    (self.rt.snap_name, self.rlatest_snap))
        for rt in ReplicaTrail.objects.filter(
                replica=self.replica,
                status='succeeded').order_by('-id'):
            if (rt.snap_name == self.rlatest_snap):
                self.msg = ('%s. successful trail found for %s' %
                            (self.msg, self.rlatest_snap))
                snap_path = ('%s%s/.snapshots/%s/%s' %
                             (settings.MNT_PT, self.replica.pool,
                              self.replica.share, self.rlatest_snap))
                if (is_subvol(snap_path)):
                    self.msg = ('Snapshot(%s) exists in the system and '
                                'will be used as the parent' % snap_path)
                    logger.debug('Id: %s. %s' % (self.identity, self.msg))
                    return rt
                # The trail exists in the db but its snapshot is gone.
                self.msg = ('Snapshot(%s) does not exist on the system. '
                            'So cannot use it.' % snap_path)
                raise Exception(self.msg)
        raise Exception('%s. No succeeded trail found for %s.' %
                        (self.msg, self.rlatest_snap))

    # Sender and receiver agree on the parent; just confirm it is on disk.
    snap_path = ('%s%s/.snapshots/%s/%s' %
                 (settings.MNT_PT, self.replica.pool,
                  self.replica.share, self.rlatest_snap))
    if (is_subvol(snap_path)):
        return self.rt
    raise Exception('Parent Snapshot(%s) to use in btrfs-send does not '
                    'exist in the system.' % snap_path)
def test_is_subvol_nonexistent(self):
    """is_subvol() must return False when the mocked command reports rc 1."""
    subvol_path = '/mnt2/test-pool/test-share'
    # Mocked run_command result: empty stdout, the btrfs error on stderr and
    # a return code of 1, which btrfs subvol show uses when the subvol
    # doesn't exist.
    stdout = ['']
    stderr = [
        "ERROR: cannot find real path for '/mnt2/test-pool/test-share': No such file or directory",
        '',
    ]
    self.mock_run_command.return_value = (stdout, stderr, 1)
    found = is_subvol(subvol_path)
    self.assertFalse(found,
                     msg='Did NOT return False for nonexistent subvol')
def test_is_subvol_nonexistent(self):
    """is_subvol() must return False when the mocked command reports rc 1."""
    subvol_path = '/mnt2/test-pool/test-share'
    # Mocked run_command result: empty stdout, the btrfs error on stderr and
    # a return code of 1, which btrfs subvol show uses when the subvol
    # doesn't exist.
    stdout = ['']
    missing_path_error = (
        "ERROR: cannot find real path for '/mnt2/test-pool/test-share': "
        "No such file or directory")
    stderr = [missing_path_error, '']
    self.mock_run_command.return_value = (stdout, stderr, 1)
    found = is_subvol(subvol_path)
    self.assertFalse(found,
                     msg='Did NOT return False for nonexistent subvol')
def _latest_snap(self, rso):
    """Return the newest succeeded replication snapshot still on disk.

    Walks the succeeded ReceiveTrail entries of ``rso`` from highest id to
    lowest and returns the name of the first one whose snapshot is a
    subvolume under self.snap_dir.

    :param rso: replica share object the receive trails are filtered by.
    :return: snapshot name as str, or None when no such snapshot exists.
    """
    succeeded_trails = ReceiveTrail.objects.filter(
        rshare=rso, status='succeeded').order_by('-id')
    for trail in succeeded_trails:
        candidate = '%s/%s' % (self.snap_dir, trail.snap_name)
        if (not is_subvol(candidate)):
            continue
        # cannot be unicode for zmq message
        return str(trail.snap_name)
    # Nothing usable was found: this would mean a full backup transfer is
    # required.
    logger.error('Id: %s. There are no replication snapshots on the '
                 'system for '
                 'Share(%s).' % (self.identity, rso.share))
    return None
def test_is_subvol_exists(self):
    """is_subvol() must return True when the mocked command reports rc 0."""
    subvol_path = '/mnt2/test-pool/test-share'
    # Canned stdout of a successful subvol show for the share above.
    stdout = [
        '/mnt2/test-pool/test-share',
        '\tName: \t\t\ttest-share',
        '\tUUID: \t\t\t80c240a2-c353-7540-bb5e-b6a71a50a02e',
        '\tParent UUID: \t\t-',
        '\tReceived UUID: \t\t-',
        '\tCreation time: \t\t2016-07-27 17:01:09 +0100',
        '\tSubvolume ID: \t\t258',
        '\tGeneration: \t\t13',
        '\tGen at creation: \t13',
        '\tParent ID: \t\t5',
        '\tTop level ID: \t\t5',
        '\tFlags: \t\t\t-',
        '\tSnapshot(s):',
        '',
    ]
    stderr = ['']
    # btrfs subvol show has return code of 0 (no errors) when subvol exists
    self.mock_run_command.return_value = (stdout, stderr, 0)
    found = is_subvol(subvol_path)
    self.assertTrue(found, msg='Did NOT return True for existing subvol')
def test_is_subvol_exists(self):
    """is_subvol() must return True when the mocked command reports rc 0."""
    subvol_path = '/mnt2/test-pool/test-share'
    # Canned stdout of a successful subvol show for the share above, built
    # line by line to keep each field easy to spot.
    stdout = ['/mnt2/test-pool/test-share']
    stdout.extend([
        '\tName: \t\t\ttest-share',
        '\tUUID: \t\t\t80c240a2-c353-7540-bb5e-b6a71a50a02e',
        '\tParent UUID: \t\t-',
        '\tReceived UUID: \t\t-',
        '\tCreation time: \t\t2016-07-27 17:01:09 +0100',
        '\tSubvolume ID: \t\t258',
        '\tGeneration: \t\t13',
        '\tGen at creation: \t13',
        '\tParent ID: \t\t5',
        '\tTop level ID: \t\t5',
        '\tFlags: \t\t\t-',
        '\tSnapshot(s):',
        '',
    ])
    stderr = ['']
    # btrfs subvol show has return code of 0 (no errors) when subvol exists
    self.mock_run_command.return_value = (stdout, stderr, 0)
    found = is_subvol(subvol_path)
    self.assertTrue(found, msg='Did NOT return True for existing subvol')
def create_repclone(share, request, logger, snapshot):
    """
    Supplant an existing share with a snapshot, on disk and in the db.

    Variant of create_clone for the case where the share already exists:
    the snapshot is moved into the share's prior position. Steps:
    1. Unmount and btrfs subvol delete the target share (remove_share()).
    2. Remove the prior target share mount point (dir).
    3. Move the source snap to the share's former location, where it
       becomes the share on disk.
    4. Copy the source snap's qgroup / usage onto the share db entry.
    5. Delete the source snap's db entry, which is now redundant.
    6. Remount the share (now the relocated snapshot subvol).

    :param share: Share object to be supplanted
    :param request: request passed on to handle_exception() on failure
    :param logger: Logger object to reference
    :param snapshot: Source snapshot/quirk share object to supplant target.
    :return: response of serialized share (in its updated form)
    """
    try:
        logger.info("Supplanting share ({}) with "
                    "snapshot ({}).".format(share.name, snapshot.name))
        # Normalise the source name across the initial quirk share and
        # subsequent snaps by dropping any leading path. The initially
        # created receive subvol is identified as a share with a snapshots
        # location as its subvol name (current quirk of import sys), e.g.:
        # ".snapshots/C583C37F-...1712B_sharename/sharename_19_replication_1"
        # whereas later, regular snapshots are named e.g.
        # "sharename_19_replication_2"; 19 is the generation of the
        # replication task.
        # Note we have to use Object.name for polymorphism, but our share
        # is passed by its subvol (potential fragility point).
        snap_basename = snapshot.name.rsplit("/", 1)[-1]
        pool_mnt_pt = share.pool.mnt_pt
        # e.g. /mnt2/poolname/.snapshots/sharename/snapname
        # or /.snapshots/sharename/snapname for system pool shares
        src_path = "{}/.snapshots/{}/{}".format(
            pool_mnt_pt, share.name, snap_basename)
        src_path = src_path.replace("//", "/")
        # e.g. /mnt2/poolname/sharename or /sharename for system pool shares
        dest_path = "{}/{}".format(pool_mnt_pt, share.name)
        dest_path = dest_path.replace("//", "/")
        # The caller assures the db snap, but that is no guarantee that it
        # exists on disk.
        if not is_subvol(src_path):
            raise Exception("Subvol with path ({}) does not exist. Aborting "
                            "replacement of share with path ({}).".format(
                                src_path, dest_path))
        # Unmount, then subvol delete, the on disk share.
        remove_share(share.pool, share.name, PQGROUP_DEFAULT)
        # Clear the read only flag on the snapshot subvol.
        set_property(src_path, "ro", "false", mount=False)
        # Make sure the removed share path is clean, ie no mount point left.
        run_command(["/usr/bin/rm", "-rf", dest_path], throw=False)
        # Both a share and a snapshot are subvols, so moving the snapshot to
        # the prior share's location effectively promotes it to a share.
        logger.info(
            "Moving snapshot ({}) to prior share's pool location ({})".format(
                src_path, dest_path))
        shutil.move(src_path, dest_path)
        # The move should have re-established the just removed subvol, so
        # supplant the share db info with the snap's to reflect disk state.
        for usage_field in ("qgroup", "rusage", "eusage"):
            setattr(share, usage_field, getattr(snapshot, usage_field))
        share.save()
        # The snapshot/quirky share db entry is now redundant.
        snapshot.delete()
        # Bring the share's quota up to date.
        update_quota(share.pool, share.pqgroup, share.size * 1024)
        # All shares, data pool or system pool, are independently mounted
        # in /mnt2/name.
        share_mnt_pt = "{}{}".format(settings.MNT_PT, share.name)
        mount_share(share, share_mnt_pt)
        return Response(ShareSerializer(share).data)
    except Exception as err:
        handle_exception(err, request)
def run(self):
    """Receive one btrfs send stream and pipe it into ``btrfs receive``.

    Steps, in order:
    1. Resolve the sender ip from meta['uuid'].
    2. Connect a zmq SUB socket (data) subscribed to str(meta['id']) and a
       zmq PUSH socket (acks back to the sender).
    3. Full send: create the share and the replica share metadata.
       Incremental send: look up the existing replica share id.
    4. Make sure the parent subvolume exists, start ``btrfs receive`` on it
       and ack 'begin_ok'.
    5. Loop over received messages, writing them to the stdin of the
       receive process, until END_SUCCESS / END_FAIL arrives or nothing is
       received for too long.

    Most steps run inside self._clean_exit_handler(msg), with msg set just
    before to describe the step. NOTE(review): the handler is defined
    elsewhere; presumably it logs msg and exits on error, and ack=True
    also notifies the sender - confirm.
    """
    msg = ('Failed to get the sender ip from the uuid(%s) for meta: %s' %
           (self.meta['uuid'], self.meta))
    with self._clean_exit_handler(msg):
        self.sender_ip = get_sender_ip(self.meta['uuid'], logger)

    msg = ('Failed to connect to the sender(%s) on data_port(%s). meta: '
           '%s. Aborting.' % (self.sender_ip, self.data_port, self.meta))
    with self._clean_exit_handler(msg):
        #@todo: add validation
        recv_sub = self.ctx.socket(zmq.SUB)
        recv_sub.connect('tcp://%s:%d' % (self.sender_ip, self.data_port))
        # Receive timeout; together with the 600 timeouts allowed in the
        # loop below this gives the 60 seconds named in the log message
        # there (so presumably milliseconds - confirm against pyzmq docs).
        recv_sub.RCVTIMEO = 100
        # Only messages prefixed with this transfer's id are delivered.
        recv_sub.setsockopt(zmq.SUBSCRIBE, str(self.meta['id']))

    msg = ('Failed to connect to the sender(%s) on '
           'meta_port(%d). meta: %s. Aborting.' %
           (self.sender_ip, self.meta_port, self.meta))
    with self._clean_exit_handler(msg):
        self.meta_push = self.ctx.socket(zmq.PUSH)
        self.meta_push.connect('tcp://%s:%d' % (self.sender_ip,
                                                self.meta_port))

    # Name of the share on this side: <sender id>_<source share name>.
    sname = ('%s_%s' % (self.sender_id, self.src_share))
    if (not self.incremental):
        msg = ('Failed to verify/create share: %s. meta: %s. '
               'Aborting.' % (sname, self.meta))
        with self._clean_exit_handler(msg, ack=True):
            create_share(sname, self.dest_pool, logger)

        msg = ('Failed to create the replica metadata object '
               'for share: %s. meta: %s. Aborting.' %
               (sname, self.meta))
        with self._clean_exit_handler(msg, ack=True):
            data = {'share': sname,
                    'appliance': self.sender_ip,
                    'src_share': self.src_share,
                    'data_port': self.data_port,
                    'meta_port': self.meta_port, }
            self.rid = create_rshare(data, logger)
    else:
        msg = ('Failed to retreive the replica metadata object for '
               'share: %s. meta: %s. Aborting.' % (sname, self.meta))
        with self._clean_exit_handler(msg):
            self.rid = rshare_id(sname, logger)

    # Parent subvolume that btrfs receive writes the snapshot into.
    # NOTE(review): uses self.meta['pool'] here but self.dest_pool for
    # share_path further down - confirm they always agree.
    sub_vol = ('%s%s/%s' % (settings.MNT_PT, self.meta['pool'], sname))
    if (not is_subvol(sub_vol)):
        msg = ('Failed to create parent subvolume %s' % sub_vol)
        with self._clean_exit_handler(msg, ack=True):
            run_command([BTRFS, 'subvolume', 'create', sub_vol])

    snap_fp = ('%s/%s' % (sub_vol, self.snap_name))
    # NOTE(review): msg is not reset for this step, so a failure here is
    # reported with whichever message was set last. Execution also
    # continues to start btrfs receive after acking 'snap_exists' -
    # confirm that is intended.
    with self._clean_exit_handler(msg):
        if (is_subvol(snap_fp)):
            ack = {'msg': 'snap_exists',
                   'id': self.meta['id'], }
            self.meta_push.send_json(ack)

    cmd = [BTRFS, 'receive', sub_vol]
    msg = ('Failed to start the low level btrfs receive command(%s)'
           '. Aborting.' % (cmd))
    with self._clean_exit_handler(msg, ack=True):
        rp = subprocess.Popen(cmd, shell=False, stdin=subprocess.PIPE,
                              stdout=subprocess.PIPE,
                              stderr=subprocess.PIPE)

    msg = ('Failed to send begin_ok to the sender for meta: %s' %
           self.meta)
    with self._clean_exit_handler(msg):
        ack = {'msg': 'begin_ok',
               'id': self.meta['id'], }
        self.meta_push.send_json(ack)
    # Number of consecutive receive timeouts seen so far.
    recv_timeout_counter = 0
    # Messages still expected before more credit is granted to the sender.
    credit = settings.DEFAULT_SEND_CREDIT
    check_credit = True
    while True:
        # Running low on credit: tell the sender it may send more.
        if (check_credit is True and credit < 5):
            ack = {'msg': 'send_more',
                   'id': self.meta['id'],
                   'credit': settings.DEFAULT_SEND_CREDIT, }
            self.meta_push.send_json(ack)
            credit = credit + settings.DEFAULT_SEND_CREDIT
            logger.debug('%d KB received for %s' %
                         (int(self.kb_received / 1024), sname))

        try:
            recv_data = recv_sub.recv()
            # Strip the subscription prefix (the transfer id).
            recv_data = recv_data[len(self.meta['id']):]
            credit = credit - 1
            recv_timeout_counter = 0
            # Despite its name this accumulates len(recv_data); it is
            # divided by 1024 wherever it is reported.
            self.kb_received = self.kb_received + len(recv_data)
            if (self.rtid is None):
                # No receive trail yet: first message of this transfer.
                msg = ('Failed to create snapshot: %s. Aborting.' %
                       self.snap_name)
                # create a snapshot only if it's not already from a
                # previous failed attempt
                with self._clean_exit_handler(msg, ack=True):
                    create_snapshot(sname, self.snap_name, logger,
                                    snap_type='receiver')

                data = {'snap_name': self.snap_name}
                msg = ('Failed to create receive trail for rid: %d'
                       '. meta: %s' % (self.rid, self.meta))
                with self._clean_exit_handler(msg, ack=True):
                    self.rtid = create_receive_trail(self.rid, data,
                                                     logger)

            # Control messages from the sender that end the transfer.
            if (recv_data == 'END_SUCCESS' or recv_data == 'END_FAIL'):
                check_credit = False
                ts = datetime.utcnow().replace(tzinfo=utc)
                data = {'kb_received': self.kb_received / 1024, }
                if (recv_data == 'END_SUCCESS'):
                    data['receive_succeeded'] = ts.strftime(settings.SNAP_TS_FORMAT)
                    #delete the share, move the oldest snap to share
                    oldest_snap = get_oldest_snap(sub_vol, 3)
                    if (oldest_snap is not None):
                        snap_path = ('%s/%s' % (sub_vol, oldest_snap))
                        share_path = ('%s%s/%s' %
                                      (settings.MNT_PT, self.dest_pool,
                                       sname))
                        msg = ('Failed to promote the oldest Snapshot(%s) '
                               'to Share(%s)' % (snap_path, share_path))
                        # Best effort: a failed promotion is logged but
                        # does not abort the transfer.
                        try:
                            pool = Pool.objects.get(name=self.dest_pool)
                            remove_share(pool, sname)
                            # Snapshot must be writable to be moved.
                            set_property(snap_path, 'ro', 'false',
                                         mount=False)
                            run_command(['/usr/bin/rm', '-rf', share_path],
                                        throw=False)
                            shutil.move(snap_path, share_path)
                            set_property(share_path, 'ro', 'true',
                                         mount=False)
                            delete_snapshot(sname, oldest_snap, logger)
                        # NOTE(review): Python 2 only syntax; e is unused
                        # and msg is logged twice.
                        except Exception, e:
                            logger.error(msg)
                            logger.exception(msg)
                else:
                    logger.error('END_FAIL received for meta: %s. '
                                 'Terminating.' % self.meta)
                    rp.terminate()
                    # NOTE(review): stored as a datetime here, but
                    # formatted with strftime elsewhere in this method.
                    data['receive_failed'] = ts
                    data['status'] = 'failed'

                msg = ('Failed to update receive trail for rtid: %d'
                       '. meta: %s' % (self.rtid, self.meta))
                with self._clean_exit_handler(msg, ack=True):
                    update_receive_trail(self.rtid, data, logger)
                break
            # Payload message: forward it while btrfs receive is alive.
            if (rp.poll() is None):
                rp.stdin.write(recv_data)
                rp.stdin.flush()
            else:
                logger.error('It seems the btrfs receive process died'
                             ' unexpectedly.')
                out, err = rp.communicate()
                msg = ('Low level system error from btrfs receive '
                       'command. out: %s err: %s for rtid: %s meta: %s'
                       % (out, err, self.rtid, self.meta))
                # NOTE(review): no break or exit follows this block, so
                # the loop carries on unless the handler terminates the
                # process - confirm.
                with self._clean_exit_handler(msg, ack=True):
                    ts = datetime.utcnow().replace(tzinfo=utc)
                    data = {'receive_failed': ts.strftime(settings.SNAP_TS_FORMAT),
                            'status': 'failed',
                            'error': msg, }
                    update_receive_trail(self.rtid, data, logger)
        except zmq.error.Again:
            # recv() timed out; give up after more than 600 in a row.
            recv_timeout_counter = recv_timeout_counter + 1
            if (recv_timeout_counter > 600):
                logger.error('Nothing received in the last 60 seconds '
                             'from the sender for meta: %s. Aborting.'
                             % self.meta)
                self._sys_exit(3)
def create_repclone(share, request, logger, snapshot):
    """
    Supplant an existing share with a snapshot, on disk and in the db.

    Variant of create_clone for the case where the share already exists:
    the snapshot is moved into the share's prior position. Steps:
    1. Unmount and btrfs subvol delete the target share (remove_share()).
    2. Remove the prior target share mount point (dir).
    3. Move the source snap to the share's former location, where it
       becomes the share on disk.
    4. Copy the source snap's qgroup / usage onto the share db entry.
    5. Delete the source snap's db entry, which is now redundant.
    6. Remount the share (now the relocated snapshot subvol).

    :param share: Share object to be supplanted
    :param request: request passed on to handle_exception() on failure
    :param logger: Logger object to reference
    :param snapshot: Source snapshot/quirk share object to supplant target.
    :return: response of serialized share (in its updated form)
    """
    try:
        logger.info('Supplanting share ({}) with '
                    'snapshot ({}).'.format(share.name, snapshot.name))
        # Normalise the source name across the initial quirk share and
        # subsequent snaps by dropping any leading path. The initially
        # created receive subvol is identified as a share with a snapshots
        # location as its subvol name (current quirk of import sys), e.g.:
        # ".snapshots/C583C37F-...1712B_sharename/sharename_19_replication_1"
        # whereas later, regular snapshots are named e.g.
        # "sharename_19_replication_2"; 19 is the generation of the
        # replication task.
        # Note we have to use Object.name for polymorphism, but our share
        # is passed by its subvol (potential fragility point).
        snap_basename = snapshot.name.rsplit('/', 1)[-1]
        # eg /mnt2/poolname/.snapshots/sharename/snapname
        src_path = '{}{}/.snapshots/{}/{}'.format(
            settings.MNT_PT, share.pool.name, share.name, snap_basename)
        # eg /mnt2/poolname/sharename
        dest_path = '{}{}/{}'.format(
            settings.MNT_PT, share.pool.name, share.name)
        # The caller assures the db snap, but that is no guarantee that it
        # exists on disk.
        if not is_subvol(src_path):
            raise Exception('Subvol with path ({}) does not exist. Aborting '
                            'replacement of share ({}).'.format(src_path,
                                                                share.name))
        # Unmount, then subvol delete, the on disk share.
        remove_share(share.pool, share.name, PQGROUP_DEFAULT)
        # Clear the read only flag on the snapshot subvol.
        set_property(src_path, 'ro', 'false', mount=False)
        # Make sure the removed share path is clean, ie no mount point left.
        run_command(['/usr/bin/rm', '-rf', dest_path], throw=False)
        # Both a share and a snapshot are subvols, so moving the snapshot to
        # the prior share's location effectively promotes it to a share.
        shutil.move(src_path, dest_path)
        # The move should have re-established the just removed subvol, so
        # supplant the share db info with the snap's to reflect disk state.
        for usage_field in ('qgroup', 'rusage', 'eusage'):
            setattr(share, usage_field, getattr(snapshot, usage_field))
        share.save()
        # The snapshot/quirky share db entry is now redundant.
        snapshot.delete()
        # Bring the share's quota up to date.
        update_quota(share.pool, share.pqgroup, share.size * 1024)
        # Mount the newly supplanted share.
        share_mnt_pt = '{}{}'.format(settings.MNT_PT, share.name)
        mount_share(share, share_mnt_pt)
        return Response(ShareSerializer(share).data)
    except Exception as err:
        handle_exception(err, request)
def run(self):
    """Receive one btrfs send stream and pipe it into ``btrfs receive``.

    Steps, in order:
    1. Resolve the sender ip from meta['uuid'].
    2. Connect a zmq SUB socket (data) subscribed to str(meta['id']) and a
       zmq PUSH socket (acks back to the sender).
    3. Full send: create the share and the replica share metadata.
       Incremental send: look up the existing replica share id.
    4. Make sure the parent subvolume under .snapshots exists, start
       ``btrfs receive`` on it and ack 'begin_ok'.
    5. Loop over received messages, writing them to the stdin of the
       receive process, until END_SUCCESS / END_FAIL arrives or nothing is
       received for too long.

    Most steps run inside self._clean_exit_handler(msg), with msg set just
    before to describe the step. NOTE(review): the handler is defined
    elsewhere; presumably it logs msg and exits on error, and ack=True
    also notifies the sender - confirm.
    """
    msg = ('Failed to get the sender ip from the uuid(%s) for meta: %s' %
           (self.meta['uuid'], self.meta))
    with self._clean_exit_handler(msg):
        self.sender_ip = get_sender_ip(self.meta['uuid'], logger)

    msg = ('Failed to connect to the sender(%s) on data_port(%s). meta: '
           '%s. Aborting.' % (self.sender_ip, self.data_port, self.meta))
    with self._clean_exit_handler(msg):
        #@todo: add validation
        recv_sub = self.ctx.socket(zmq.SUB)
        recv_sub.connect('tcp://%s:%d' % (self.sender_ip, self.data_port))
        # Receive timeout; together with the 600 timeouts allowed in the
        # loop below this gives the 60 seconds named in the log message
        # there (so presumably milliseconds - confirm against pyzmq docs).
        recv_sub.RCVTIMEO = 100
        # Only messages prefixed with this transfer's id are delivered.
        recv_sub.setsockopt(zmq.SUBSCRIBE, str(self.meta['id']))

    msg = ('Failed to connect to the sender(%s) on '
           'meta_port(%d). meta: %s. Aborting.' %
           (self.sender_ip, self.meta_port, self.meta))
    with self._clean_exit_handler(msg):
        self.meta_push = self.ctx.socket(zmq.PUSH)
        self.meta_push.connect('tcp://%s:%d' % (self.sender_ip,
                                                self.meta_port))

    # Name of the share on this side: <sender id>_<source share name>.
    sname = ('%s_%s' % (self.sender_id, self.src_share))
    if (not self.incremental):
        msg = ('Failed to verify/create share: %s. meta: %s. '
               'Aborting.' % (sname, self.meta))
        with self._clean_exit_handler(msg, ack=True):
            create_share(sname, self.dest_pool, logger)

        msg = ('Failed to create the replica metadata object '
               'for share: %s. meta: %s. Aborting.' %
               (sname, self.meta))
        with self._clean_exit_handler(msg, ack=True):
            data = {'share': sname,
                    'appliance': self.sender_ip,
                    'src_share': self.src_share,
                    'data_port': self.data_port,
                    'meta_port': self.meta_port, }
            self.rid = create_rshare(data, logger)
    else:
        msg = ('Failed to retreive the replica metadata object for '
               'share: %s. meta: %s. Aborting.' % (sname, self.meta))
        with self._clean_exit_handler(msg):
            self.rid = rshare_id(sname, logger)

    # Parent subvolume, under the pool's .snapshots directory, that btrfs
    # receive writes the snapshot into.
    # NOTE(review): uses self.meta['pool'] here but self.dest_pool for
    # share_path further down - confirm they always agree.
    sub_vol = ('%s%s/.snapshots/%s' % (settings.MNT_PT, self.meta['pool'],
                                       sname))
    if (not is_subvol(sub_vol)):
        msg = ('Failed to create parent subvolume %s' % sub_vol)
        with self._clean_exit_handler(msg, ack=True):
            run_command([BTRFS, 'subvolume', 'create', sub_vol])

    snap_fp = ('%s/%s' % (sub_vol, self.snap_name))
    # NOTE(review): msg is not reset for this step, so a failure here is
    # reported with whichever message was set last. Execution also
    # continues to start btrfs receive after acking 'snap_exists' -
    # confirm that is intended.
    with self._clean_exit_handler(msg):
        if (is_subvol(snap_fp)):
            ack = {'msg': 'snap_exists',
                   'id': self.meta['id'], }
            self.meta_push.send_json(ack)

    cmd = [BTRFS, 'receive', sub_vol]
    msg = ('Failed to start the low level btrfs receive command(%s)'
           '. Aborting.' % (cmd))
    with self._clean_exit_handler(msg, ack=True):
        rp = subprocess.Popen(cmd, shell=False, stdin=subprocess.PIPE,
                              stdout=subprocess.PIPE,
                              stderr=subprocess.PIPE)

    msg = ('Failed to send begin_ok to the sender for meta: %s' %
           self.meta)
    with self._clean_exit_handler(msg):
        ack = {'msg': 'begin_ok',
               'id': self.meta['id'], }
        self.meta_push.send_json(ack)
    # Number of consecutive receive timeouts seen so far.
    recv_timeout_counter = 0
    # Messages still expected before more credit is granted to the sender.
    credit = settings.DEFAULT_SEND_CREDIT
    check_credit = True
    while True:
        # Running low on credit: tell the sender it may send more.
        if (check_credit is True and credit < 5):
            ack = {'msg': 'send_more',
                   'id': self.meta['id'],
                   'credit': settings.DEFAULT_SEND_CREDIT, }
            self.meta_push.send_json(ack)
            credit = credit + settings.DEFAULT_SEND_CREDIT
            logger.debug('%d KB received for %s' %
                         (int(self.kb_received / 1024), sname))

        try:
            recv_data = recv_sub.recv()
            # Strip the subscription prefix (the transfer id).
            recv_data = recv_data[len(self.meta['id']):]
            credit = credit - 1
            recv_timeout_counter = 0
            # Despite its name this accumulates len(recv_data); it is
            # divided by 1024 wherever it is reported.
            self.kb_received = self.kb_received + len(recv_data)
            if (self.rtid is None):
                # No receive trail yet: first message of this transfer.
                msg = ('Failed to create snapshot: %s. Aborting.' %
                       self.snap_name)
                # create a snapshot only if it's not already from a
                # previous failed attempt
                with self._clean_exit_handler(msg, ack=True):
                    create_snapshot(sname, self.snap_name, logger,
                                    snap_type='receiver')

                data = {'snap_name': self.snap_name}
                msg = ('Failed to create receive trail for rid: %d'
                       '. meta: %s' % (self.rid, self.meta))
                with self._clean_exit_handler(msg, ack=True):
                    self.rtid = create_receive_trail(self.rid, data,
                                                     logger)

            # Control messages from the sender that end the transfer.
            if (recv_data == 'END_SUCCESS' or recv_data == 'END_FAIL'):
                check_credit = False
                ts = datetime.utcnow().replace(tzinfo=utc)
                data = {'kb_received': self.kb_received / 1024, }
                if (recv_data == 'END_SUCCESS'):
                    data['receive_succeeded'] = ts
                    #delete the share, move the oldest snap to share
                    oldest_snap = get_oldest_snap(sub_vol, 3)
                    if (oldest_snap is not None):
                        snap_path = ('%s/%s' % (sub_vol, oldest_snap))
                        share_path = ('%s%s/%s' %
                                      (settings.MNT_PT, self.dest_pool,
                                       sname))
                        msg = ('Failed to promote the oldest Snapshot(%s) '
                               'to Share(%s)' % (snap_path, share_path))
                        # Best effort: a failed promotion is logged but
                        # does not abort the transfer.
                        try:
                            pool = Pool.objects.get(name=self.dest_pool)
                            remove_share(pool, sname)
                            # Snapshot must be writable to be moved.
                            set_property(snap_path, 'ro', 'false',
                                         mount=False)
                            run_command(['/usr/bin/rm', '-rf', share_path],
                                        throw=False)
                            shutil.move(snap_path, share_path)
                            set_property(share_path, 'ro', 'true',
                                         mount=False)
                            delete_snapshot(sname, oldest_snap, logger)
                        # NOTE(review): Python 2 only syntax; e is unused
                        # and msg is logged twice.
                        except Exception, e:
                            logger.error(msg)
                            logger.exception(msg)
                else:
                    logger.error('END_FAIL received for meta: %s. '
                                 'Terminating.' % self.meta)
                    rp.terminate()
                    data['receive_failed'] = ts
                    data['status'] = 'failed'

                msg = ('Failed to update receive trail for rtid: %d'
                       '. meta: %s' % (self.rtid, self.meta))
                with self._clean_exit_handler(msg, ack=True):
                    update_receive_trail(self.rtid, data, logger)
                break
            # Payload message: forward it while btrfs receive is alive.
            if (rp.poll() is None):
                rp.stdin.write(recv_data)
                rp.stdin.flush()
            else:
                logger.error('It seems the btrfs receive process died'
                             ' unexpectedly.')
                out, err = rp.communicate()
                msg = ('Low level system error from btrfs receive '
                       'command. out: %s err: %s for rtid: %s meta: %s'
                       % (out, err, self.rtid, self.meta))
                # NOTE(review): no break or exit follows this block, so
                # the loop carries on unless the handler terminates the
                # process - confirm.
                with self._clean_exit_handler(msg, ack=True):
                    ts = datetime.utcnow().replace(tzinfo=utc)
                    data = {'receive_failed': ts,
                            'status': 'failed',
                            'error': msg, }
                    update_receive_trail(self.rtid, data, logger)
        except zmq.error.Again:
            # recv() timed out; give up after more than 600 in a row.
            recv_timeout_counter = recv_timeout_counter + 1
            if (recv_timeout_counter > 600):
                logger.error('Nothing received in the last 60 seconds '
                             'from the sender for meta: %s. Aborting.'
                             % self.meta)
                self._sys_exit(3)
def run(self):
    """Receive one btrfs send stream via the broker into ``btrfs receive``.

    Everything runs inside self._clean_exit_handler(); self.msg is updated
    before each step so that it describes the step that was in progress
    when an exception is raised. NOTE(review): the handler is defined
    elsewhere; presumably it reports self.msg and exits - confirm.

    Steps, in order:
    1. Connect a zmq DEALER socket to the broker's ipc socket.
    2. Full send: create the share and the replica share metadata.
       Incremental send: look up the replica share and the latest
       replication snapshot on this system.
    3. Create the receive trail, promote the oldest snapshot to the share
       and prune old snapshots.
    4. Make sure the share subvolume and the snapshot directory exist;
       reply 'snap-exists' and exit if the snapshot is already present.
    5. Start ``btrfs receive``, send 'receiver-ready' and then loop over
       broker messages, writing stream data to the stdin of the receive
       process until the stream finishes, a terminal command arrives or
       the broker stops responding.
    """
    logger.debug('Id: %s. Starting a new Receiver for meta: %s' %
                 (self.identity, self.meta))
    self.msg = ('Top level exception in receiver')
    # Stays None for a full send; sent to the sender as '' in that case.
    latest_snap = None
    with self._clean_exit_handler():
        self.law = APIWrapper()
        self.poll = zmq.Poller()
        self.dealer = self.ctx.socket(zmq.DEALER)
        self.dealer.setsockopt_string(zmq.IDENTITY, u'%s' % self.identity)
        self.dealer.set_hwm(10)
        self.dealer.connect('ipc://%s' %
                            settings.REPLICATION.get('ipc_socket'))
        self.poll.register(self.dealer, zmq.POLLIN)

        self.ack = True
        self.msg = ('Failed to get the sender ip for appliance: %s' %
                    self.sender_id)
        self.sender_ip = Appliance.objects.get(uuid=self.sender_id).ip

        if (not self.incremental):
            self.msg = ('Failed to verify/create share: %s.' % self.sname)
            self.create_share(self.sname, self.dest_pool)

            self.msg = ('Failed to create the replica metadata object '
                        'for share: %s.' % self.sname)
            data = {
                'share': self.sname,
                'appliance': self.sender_ip,
                'src_share': self.src_share,
            }
            self.rid = self.create_rshare(data)
        else:
            self.msg = ('Failed to retreive the replica metadata '
                        'object for share: %s.' % self.sname)
            rso = ReplicaShare.objects.get(share=self.sname)
            self.rid = rso.id
            # Find and send the current snapshot to the sender. This will
            # be used as the start by btrfs-send diff.
            self.msg = ('Failed to verify latest replication snapshot '
                        'on the system.')
            latest_snap = self._latest_snap(rso)

        self.msg = ('Failed to create receive trail for rid: %d' %
                    self.rid)
        data = {
            'snap_name': self.snap_name,
        }
        self.rtid = self.create_receive_trail(self.rid, data)

        # delete the share, move the oldest snap to share
        self.msg = ('Failed to promote the oldest Snapshot to Share.')
        oldest_snap = get_oldest_snap(self.snap_dir,
                                      self.num_retain_snaps,
                                      regex='_replication_')
        if (oldest_snap is not None):
            self.update_repclone(self.sname, oldest_snap)
            self.refresh_share_state()
            self.refresh_snapshot_state()

        self.msg = ('Failed to prune old Snapshots')
        self._delete_old_snaps(self.sname, self.snap_dir,
                               self.num_retain_snaps + 1)

        # TODO: The following should be re-instantiated once we have a
        # TODO: working method for doing so. see validate_src_share.
        # self.msg = ('Failed to validate the source share(%s) on '
        #             'sender(uuid: %s '
        #             ') Did the ip of the sender change?' %
        #             (self.src_share, self.sender_id))
        # self.validate_src_share(self.sender_id, self.src_share)

        sub_vol = ('%s%s/%s' % (settings.MNT_PT, self.dest_pool,
                                self.sname))
        if (not is_subvol(sub_vol)):
            self.msg = ('Failed to create parent subvolume %s' % sub_vol)
            run_command([BTRFS, 'subvolume', 'create', sub_vol])

        self.msg = ('Failed to create snapshot directory: %s' %
                    self.snap_dir)
        run_command(['/usr/bin/mkdir', '-p', self.snap_dir])
        snap_fp = ('%s/%s' % (self.snap_dir, self.snap_name))

        # If the snapshot already exists, presumably from the previous
        # attempt and the sender tries to send the same, reply back with
        # snap_exists and do not start the btrfs-receive
        if (is_subvol(snap_fp)):
            logger.debug('Id: %s. Snapshot to be sent(%s) already '
                         'exists. Not starting a new receive process' %
                         (self.identity, snap_fp))
            self._send_recv('snap-exists')
            self._sys_exit(0)

        cmd = [BTRFS, 'receive', self.snap_dir]
        self.msg = ('Failed to start the low level btrfs receive '
                    'command(%s). Aborting.' % cmd)
        self.rp = subprocess.Popen(cmd, shell=False,
                                   stdin=subprocess.PIPE,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE)

        self.msg = ('Failed to send receiver-ready')
        # The latest snapshot (or '' when there is none) tells the sender
        # which parent to use for an incremental send.
        rcommand, rmsg = self._send_recv('receiver-ready',
                                         latest_snap or '')
        if (rcommand is None):
            logger.error('Id: %s. No response from the broker for '
                         'receiver-ready command. Aborting.' %
                         self.identity)
            self._sys_exit(3)

        # Commands from the sender that abort the transfer.
        term_commands = (
            'btrfs-send-init-error',
            'btrfs-send-unexpected-termination-error',
            'btrfs-send-nonzero-termination-error',
        )
        # Polls allowed without a message before giving up.
        num_tries = 10
        poll_interval = 6000  # 6 seconds
        # Messages received since the last receive trail update.
        num_msgs = 0
        # Start time, passed to size_report() for the transfer rate.
        t0 = time.time()
        while (True):
            socks = dict(self.poll.poll(poll_interval))
            if (socks.get(self.dealer) == zmq.POLLIN):
                # reset to wait upto 60(poll_interval x num_tries
                # milliseconds) for every message
                num_tries = 10
                command, message = self.dealer.recv_multipart()
                if (command == 'btrfs-send-stream-finished'):
                    # this command concludes fsdata transfer. After this,
                    # btrfs-recev process should be
                    # terminated(.communicate).
                    if (self.rp.poll() is None):
                        self.msg = ('Failed to terminate btrfs-recv '
                                    'command')
                        out, err = self.rp.communicate()
                        out = out.split('\n')
                        err = err.split('\n')
                        logger.debug('Id: %s. Terminated btrfs-recv. '
                                     'cmd = %s out = %s err: %s rc: %s' %
                                     (self.identity, cmd, out, err,
                                      self.rp.returncode))
                    if (self.rp.returncode != 0):
                        self.msg = ('btrfs-recv exited with unexpected '
                                    'exitcode(%s). ' % self.rp.returncode)
                        raise Exception(self.msg)
                    data = {
                        'status': 'succeeded',
                        'kb_received': self.total_bytes_received / 1024,
                    }
                    self.msg = ('Failed to update receive trail for '
                                'rtid: %d' % self.rtid)
                    self.update_receive_trail(self.rtid, data)

                    self._send_recv('btrfs-recv-finished')
                    self.refresh_share_state()
                    self.refresh_snapshot_state()

                    dsize, drate = self.size_report(
                        self.total_bytes_received, t0)
                    logger.debug('Id: %s. Receive complete. Total data '
                                 'transferred: %s. Rate: %s/sec.' %
                                 (self.identity, dsize, drate))
                    self._sys_exit(0)

                if (command in term_commands):
                    self.msg = ('Terminal command(%s) received from the '
                                'sender. Aborting.' % command)
                    raise Exception(self.msg)

                # Any other command carries stream data in message.
                if (self.rp.poll() is None):
                    self.rp.stdin.write(message)
                    self.rp.stdin.flush()
                    # @todo: implement advanced credit request system.
                    self.dealer.send_multipart([b'send-more', ''])
                    num_msgs += 1
                    self.total_bytes_received += len(message)
                    # Record progress on the receive trail every 1000
                    # messages.
                    if (num_msgs == 1000):
                        num_msgs = 0
                        data = {
                            'status': 'pending',
                            'kb_received': self.total_bytes_received / 1024,
                        }
                        self.update_receive_trail(self.rtid, data)

                        dsize, drate = self.size_report(
                            self.total_bytes_received, t0)
                        logger.debug('Id: %s. Receiver alive. Data '
                                     'transferred: %s. Rate: %s/sec.' %
                                     (self.identity, dsize, drate))
                else:
                    # btrfs receive exited while data was still arriving.
                    out, err = self.rp.communicate()
                    out = out.split('\n')
                    err = err.split('\n')
                    logger.error('Id: %s. btrfs-recv died unexpectedly. '
                                 'cmd: %s out: %s. err: %s' %
                                 (self.identity, cmd, out, err))
                    msg = (
                        'Low level system error from btrfs receive '
                        'command. cmd: %s out: %s err: %s for rtid: %s'
                        % (cmd, out, err, self.rtid))
                    data = {
                        'status': 'failed',
                        'error': msg,
                    }
                    self.msg = ('Failed to update receive trail for '
                                'rtid: %d.' % self.rtid)
                    self.update_receive_trail(self.rtid, data)
                    # Report the low level error as the failure reason.
                    self.msg = msg
                    raise Exception(self.msg)
            else:
                # Poll timed out without a message from the broker.
                num_tries -= 1
                msg = ('No response received from the broker. '
                       'remaining tries: %d' % num_tries)
                logger.error('Id: %s. %s' % (self.identity, msg))
                if (num_tries == 0):
                    self.msg = ('%s. Terminating the receiver.' % msg)
                    raise Exception(self.msg)
def run(self):
    """Receive one replication snapshot and pipe it into ``btrfs receive``.

    ``self.msg`` is set to a "Failed to ..." description before every step,
    so it always names the step in progress.
    NOTE(review): presumably ``self._clean_exit_handler()`` reports
    ``self.msg`` when a step raises -- confirm against that helper.

    Steps, in order:

    1. Create the API wrapper, a zmq poller and a DEALER socket identified
       by ``self.identity``, connected to the ipc socket configured in
       ``settings.REPLICATION``.
    2. Look up the sender appliance ip by ``self.sender_id``.
    3. Non-incremental receive: create the share and its replica metadata.
       Incremental receive: load the existing ``ReplicaShare`` and compute
       ``latest_snap`` with ``self._latest_snap``.
    4. Create the receive trail, promote the oldest replication snapshot
       (if ``get_oldest_snap`` returns one) and prune old snapshots.
    5. Make sure the parent subvolume and the snapshot directory exist.
    6. If the snapshot to be received already exists, send ``snap-exists``
       and call ``self._sys_exit(0)``.
    7. Start ``btrfs receive``, send ``receiver-ready`` along with
       ``latest_snap`` (or an empty string) and enter the receive loop.

    The receive loop polls the dealer socket every 6 seconds. Every data
    frame is written to the stdin of ``btrfs receive`` and answered with a
    ``send-more`` message; every 1000 frames the receive trail is updated
    with status ``pending``.

    All zmq frames and subprocess output are handled as bytes. On Python 2
    bytes and str literals are the same type, so this is behaviour
    preserving there; on Python 3 it is required because pyzmq only
    sends/receives bytes frames and ``Popen.communicate`` returns bytes.

    The method is not expected to return a value; it ends through
    ``self._sys_exit`` (presumably terminating the process -- confirm) or
    by raising.

    Raises (inside the ``_clean_exit_handler`` context):
        Exception: when ``btrfs receive`` exits with a non-zero code, when
            the sender reports one of the terminal commands, when
            ``btrfs receive`` dies while data is still arriving, or after
            10 consecutive poll timeouts.
    """
    logger.debug("Id: %s. Starting a new Receiver for meta: %s"
                 % (self.identity, self.meta))
    self.msg = "Top level exception in receiver"
    latest_snap = None
    with self._clean_exit_handler():
        self.law = APIWrapper()
        self.poll = zmq.Poller()
        self.dealer = self.ctx.socket(zmq.DEALER)
        self.dealer.setsockopt_string(zmq.IDENTITY, u"%s" % self.identity)
        self.dealer.set_hwm(10)
        self.dealer.connect("ipc://%s" % settings.REPLICATION.get("ipc_socket"))
        self.poll.register(self.dealer, zmq.POLLIN)

        self.ack = True
        self.msg = "Failed to get the sender ip for appliance: %s" % self.sender_id
        self.sender_ip = Appliance.objects.get(uuid=self.sender_id).ip

        if not self.incremental:
            self.msg = "Failed to verify/create share: %s." % self.sname
            self.create_share(self.sname, self.dest_pool)

            self.msg = ("Failed to create the replica metadata object "
                        "for share: %s." % self.sname)
            data = {
                "share": self.sname,
                "appliance": self.sender_ip,
                "src_share": self.src_share,
            }
            self.rid = self.create_rshare(data)
        else:
            self.msg = ("Failed to retreive the replica metadata "
                        "object for share: %s." % self.sname)
            rso = ReplicaShare.objects.get(share=self.sname)
            self.rid = rso.id
            # Find and send the current snapshot to the sender. This will
            # be used as the start by btrfs-send diff.
            self.msg = (
                "Failed to verify latest replication snapshot on the system."
            )
            latest_snap = self._latest_snap(rso)

        self.msg = "Failed to create receive trail for rid: %d" % self.rid
        data = {
            "snap_name": self.snap_name,
        }
        self.rtid = self.create_receive_trail(self.rid, data)

        # delete the share, move the oldest snap to share
        self.msg = "Failed to promote the oldest Snapshot to Share."
        oldest_snap = get_oldest_snap(self.snap_dir,
                                      self.num_retain_snaps,
                                      regex="_replication_")
        if oldest_snap is not None:
            self.update_repclone(self.sname, oldest_snap)
            self.refresh_share_state()
            self.refresh_snapshot_state()

        self.msg = "Failed to prune old Snapshots"
        self._delete_old_snaps(self.sname, self.snap_dir,
                               self.num_retain_snaps + 1)

        # TODO: The following should be re-instantiated once we have a
        # TODO: working method for doing so. see validate_src_share.
        # self.msg = ('Failed to validate the source share(%s) on '
        #             'sender(uuid: %s '
        #             ') Did the ip of the sender change?' %
        #             (self.src_share, self.sender_id))
        # self.validate_src_share(self.sender_id, self.src_share)

        sub_vol = "%s%s/%s" % (settings.MNT_PT, self.dest_pool, self.sname)
        if not is_subvol(sub_vol):
            self.msg = "Failed to create parent subvolume %s" % sub_vol
            run_command([BTRFS, "subvolume", "create", sub_vol])

        self.msg = "Failed to create snapshot directory: %s" % self.snap_dir
        run_command(["/usr/bin/mkdir", "-p", self.snap_dir])
        snap_fp = "%s/%s" % (self.snap_dir, self.snap_name)

        # If the snapshot already exists, presumably from the previous
        # attempt and the sender tries to send the same, reply back with
        # snap_exists and do not start the btrfs-receive
        if is_subvol(snap_fp):
            logger.debug("Id: %s. Snapshot to be sent(%s) already "
                         "exists. Not starting a new receive process"
                         % (self.identity, snap_fp))
            self._send_recv("snap-exists")
            self._sys_exit(0)

        cmd = [BTRFS, "receive", self.snap_dir]
        self.msg = ("Failed to start the low level btrfs receive "
                    "command(%s). Aborting." % cmd)
        # The pipes stay in binary mode: the send stream written to stdin
        # is raw bytes, and stdout/stderr therefore come back as bytes too.
        self.rp = subprocess.Popen(
            cmd,
            shell=False,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )

        self.msg = "Failed to send receiver-ready"
        # Only the command part of the reply is inspected.
        rcommand, _ = self._send_recv("receiver-ready", latest_snap or "")
        if rcommand is None:
            logger.error("Id: %s. No response from the broker for "
                         "receiver-ready command. Aborting."
                         % self.identity)
            self._sys_exit(3)

        # Commands arrive as bytes frames from recv_multipart(), so they
        # are matched against bytes literals.
        term_commands = (
            b"btrfs-send-init-error",
            b"btrfs-send-unexpected-termination-error",
            b"btrfs-send-nonzero-termination-error",
        )
        num_tries = 10
        poll_interval = 6000  # 6 seconds
        num_msgs = 0
        t0 = time.time()
        while True:
            socks = dict(self.poll.poll(poll_interval))
            if socks.get(self.dealer) == zmq.POLLIN:
                # reset to wait upto 60(poll_interval x num_tries
                # milliseconds) for every message
                num_tries = 10
                command, message = self.dealer.recv_multipart()
                if command == b"btrfs-send-stream-finished":
                    # this command concludes fsdata transfer. After this,
                    # btrfs-recev process should be
                    # terminated(.communicate).
                    if self.rp.poll() is None:
                        self.msg = "Failed to terminate btrfs-recv command"
                        out, err = self.rp.communicate()
                        # communicate() returns bytes for binary pipes.
                        out = out.split(b"\n")
                        err = err.split(b"\n")
                        logger.debug("Id: %s. Terminated btrfs-recv. "
                                     "cmd = %s out = %s err: %s rc: %s"
                                     % (self.identity, cmd, out, err,
                                        self.rp.returncode))
                    if self.rp.returncode != 0:
                        self.msg = ("btrfs-recv exited with unexpected "
                                    "exitcode(%s). " % self.rp.returncode)
                        raise Exception(self.msg)
                    data = {
                        "status": "succeeded",
                        # Floor division keeps this an integer even under
                        # true division.
                        "kb_received": self.total_bytes_received // 1024,
                    }
                    self.msg = (
                        "Failed to update receive trail for rtid: %d"
                        % self.rtid)
                    self.update_receive_trail(self.rtid, data)

                    self._send_recv("btrfs-recv-finished")
                    self.refresh_share_state()
                    self.refresh_snapshot_state()

                    dsize, drate = self.size_report(
                        self.total_bytes_received, t0)
                    logger.debug("Id: %s. Receive complete. Total data "
                                 "transferred: %s. Rate: %s/sec."
                                 % (self.identity, dsize, drate))
                    self._sys_exit(0)

                if command in term_commands:
                    self.msg = ("Terminal command(%s) received from the "
                                "sender. Aborting." % command)
                    raise Exception(self.msg)

                if self.rp.poll() is None:
                    # btrfs receive is still running: feed it this chunk.
                    self.rp.stdin.write(message)
                    self.rp.stdin.flush()
                    # @todo: implement advanced credit request system.
                    # Every frame must be bytes; a str frame is rejected by
                    # pyzmq on Python 3.
                    self.dealer.send_multipart([b"send-more", b""])
                    num_msgs += 1
                    self.total_bytes_received += len(message)
                    if num_msgs == 1000:
                        # Periodic progress update of the receive trail.
                        num_msgs = 0
                        data = {
                            "status": "pending",
                            "kb_received": self.total_bytes_received // 1024,
                        }
                        self.update_receive_trail(self.rtid, data)

                        dsize, drate = self.size_report(
                            self.total_bytes_received, t0)
                        logger.debug("Id: %s. Receiver alive. Data "
                                     "transferred: %s. Rate: %s/sec."
                                     % (self.identity, dsize, drate))
                else:
                    # btrfs receive exited while data was still arriving.
                    out, err = self.rp.communicate()
                    out = out.split(b"\n")
                    err = err.split(b"\n")
                    logger.error("Id: %s. btrfs-recv died unexpectedly. "
                                 "cmd: %s out: %s. err: %s"
                                 % (self.identity, cmd, out, err))
                    msg = (
                        "Low level system error from btrfs receive "
                        "command. cmd: %s out: %s err: %s for rtid: %s"
                        % (cmd, out, err, self.rtid))
                    data = {
                        "status": "failed",
                        "error": msg,
                    }
                    self.msg = ("Failed to update receive trail for "
                                "rtid: %d." % self.rtid)
                    self.update_receive_trail(self.rtid, data)
                    # Restore the low level error as the reported failure
                    # once the trail update itself went through.
                    self.msg = msg
                    raise Exception(self.msg)
            else:
                # Poll timed out without a message from the broker.
                num_tries -= 1
                msg = ("No response received from the broker. "
                       "remaining tries: %d" % num_tries)
                logger.error("Id: %s. %s" % (self.identity, msg))
                if num_tries == 0:
                    self.msg = "%s. Terminating the receiver." % msg
                    raise Exception(self.msg)