Example #1
    def _refresh_rt(self):
        # For incremental sends, the receiver tells us the latest successful
        # snapshot on its side. This should match self.rt in most cases, but
        # sometimes it is not the one referred to by self.rt (the latest) but
        # a previous one. We need to make sure to *only* send the incremental
        # stream that the receiver expects.
        self.msg = ('Failed to validate/refresh ReplicaTrail.')
        if (self.rlatest_snap is None):
            # Validate/update self.rt to the one that has the expected Snapshot
            # on the system.
            for rt in ReplicaTrail.objects.filter(
                    replica=self.replica, status='succeeded').order_by('-id'):
                snap_path = ('%s%s/.snapshots/%s/%s'
                             % (settings.MNT_PT, self.replica.pool,
                                self.replica.share, rt.snap_name))
                if (is_subvol(snap_path)):
                    return rt
            # Snapshots from previous succeeded ReplicaTrails don't actually
            # exist on the system. So we send a Full replication instead of
            # incremental.
            return None

        if (len(self.rlatest_snap) == 0):
            # The receiver sends an empty string when it cannot answer an
            # incremental-send request with an appropriate parent snapshot
            # name.
            return None

        if (self.rt.snap_name != self.rlatest_snap):
            self.msg = ('Mismatch on starting snapshot for '
                        'btrfs-send. Sender picked %s but Receiver wants '
                        '%s, which takes precedence.'
                        % (self.rt.snap_name, self.rlatest_snap))
            for rt in ReplicaTrail.objects.filter(
                    replica=self.replica, status='succeeded').order_by('-id'):
                if (rt.snap_name == self.rlatest_snap):
                    self.msg = ('%s. Successful trail found for %s.'
                                % (self.msg, self.rlatest_snap))
                    snap_path = ('%s%s/.snapshots/%s/%s'
                                 % (settings.MNT_PT, self.replica.pool,
                                    self.replica.share, self.rlatest_snap))
                    if (is_subvol(snap_path)):
                        self.msg = ('Snapshot(%s) exists in the system and '
                                    'will be used as the parent' % snap_path)
                        logger.debug('Id: %s. %s' % (self.identity, self.msg))
                        return rt
                    self.msg = ('Snapshot(%s) does not exist on the system, '
                                'so it cannot be used.' % snap_path)
                    raise Exception(self.msg)
            raise Exception('%s. No succeeded trail found for %s.'
                            % (self.msg, self.rlatest_snap))

        snap_path = ('%s%s/.snapshots/%s/%s'
                     % (settings.MNT_PT, self.replica.pool,
                        self.replica.share, self.rlatest_snap))
        if (is_subvol(snap_path)):
            return self.rt
        raise Exception('Parent Snapshot(%s) to use in btrfs-send does not '
                        'exist in the system.' % snap_path)
Example #2
    def _refresh_rt(self):
        # For incremental sends, the receiver tells us the latest successful
        # snapshot on its side. This should match self.rt in most cases, but
        # sometimes it is not the one referred to by self.rt (the latest) but
        # a previous one. We need to make sure to *only* send the incremental
        # stream that the receiver expects.
        self.msg = ('Failed to validate/refresh ReplicaTrail.')
        if (self.rlatest_snap is None):
            # Validate/update self.rt to the one that has the expected Snapshot
            # on the system.
            for rt in ReplicaTrail.objects.filter(
                    replica=self.replica, status='succeeded').order_by('-id'):
                snap_path = ('%s%s/.snapshots/%s/%s' %
                             (settings.MNT_PT, self.replica.pool,
                              self.replica.share, rt.snap_name))
                if (is_subvol(snap_path)):
                    return rt
            # Snapshots from previous succeeded ReplicaTrails don't actually
            # exist on the system. So we send a Full replication instead of
            # incremental.
            return None

        if (len(self.rlatest_snap) == 0):
            # The receiver sends an empty string when it cannot answer an
            # incremental-send request with an appropriate parent snapshot
            # name.
            return None

        if (self.rt.snap_name != self.rlatest_snap):
            self.msg = ('Mismatch on starting snapshot for '
                        'btrfs-send. Sender picked %s but Receiver wants '
                        '%s, which takes precedence.' %
                        (self.rt.snap_name, self.rlatest_snap))
            for rt in ReplicaTrail.objects.filter(
                    replica=self.replica, status='succeeded').order_by('-id'):
                if (rt.snap_name == self.rlatest_snap):
                    self.msg = ('%s. Successful trail found for %s.' %
                                (self.msg, self.rlatest_snap))
                    snap_path = ('%s%s/.snapshots/%s/%s' %
                                 (settings.MNT_PT, self.replica.pool,
                                  self.replica.share, self.rlatest_snap))
                    if (is_subvol(snap_path)):
                        self.msg = ('Snapshot(%s) exists in the system and '
                                    'will be used as the parent' % snap_path)
                        logger.debug('Id: %s. %s' % (self.identity, self.msg))
                        return rt
                    self.msg = ('Snapshot(%s) does not exist on the system, '
                                'so it cannot be used.' % snap_path)
                    raise Exception(self.msg)
            raise Exception('%s. No succeeded trail found for %s.' %
                            (self.msg, self.rlatest_snap))

        snap_path = ('%s%s/.snapshots/%s/%s' %
                     (settings.MNT_PT, self.replica.pool, self.replica.share,
                      self.rlatest_snap))
        if (is_subvol(snap_path)):
            return self.rt
        raise Exception('Parent Snapshot(%s) to use in btrfs-send does not '
                        'exist in the system.' % snap_path)
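
The value returned by _refresh_rt() above decides between a full and an incremental btrfs send: None means no usable parent snapshot, so a full stream is sent, while a returned ReplicaTrail names the parent snapshot for an incremental stream. The sketch below is a hypothetical illustration of that mapping only; build_send_command and its parameters are illustrative names, not from the source.

# Hypothetical sketch: map _refresh_rt()'s result onto a btrfs send command.
# snap_path is the snapshot to stream; parent_snap_path is the on-disk path
# of rt.snap_name when a ReplicaTrail was returned, or None for a full send.
def build_send_command(btrfs, snap_path, parent_snap_path=None):
    if parent_snap_path is None:
        return [btrfs, 'send', snap_path]  # full replication
    return [btrfs, 'send', '-p', parent_snap_path, snap_path]  # incremental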
Example #3
 def test_is_subvol_nonexistent(self):
     mount_point = '/mnt2/test-pool/test-share'
     o = ['']
     e = ["ERROR: cannot find real path for '/mnt2/test-pool/test-share': No such file or directory", '']
     rc = 1
     # btrfs subvol show has return code of 1 when subvol doesn't exist.
     self.mock_run_command.return_value = (o, e, rc)
     self.assertFalse(is_subvol(mount_point),
                      msg='Did NOT return False for nonexistent subvol')
Example #4
 def test_is_subvol_nonexistent(self):
     mount_point = '/mnt2/test-pool/test-share'
     o = ['']
     e = [("ERROR: cannot find real path for '/mnt2/test-pool/test-share': "
           "No such file or directory"), '']
     rc = 1
     # btrfs subvol show has return code of 1 when subvol doesn't exist.
     self.mock_run_command.return_value = (o, e, rc)
     self.assertFalse(is_subvol(mount_point),
                      msg='Did NOT return False for nonexistent subvol')
Example #5
 def _latest_snap(self, rso):
     for snap in ReceiveTrail.objects.filter(
             rshare=rso, status='succeeded').order_by('-id'):
         if (is_subvol('%s/%s' % (self.snap_dir, snap.snap_name))):
             return str(snap.snap_name)  # cannot be unicode for zmq message
     logger.error('Id: %s. There are no replication snapshots on the '
                  'system for '
                  'Share(%s).' % (self.identity, rso.share))
     # This means a full backup transfer is required.
     return None
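
The None return above translates into an empty payload on the wire: the receiver answers the sender's handshake with the latest snapshot name, or with an empty string when no replication snapshot exists on disk, which the sender treats as a request for a full transfer (see the latest_snap or '' payload in the receiver run() examples later in this section). A tiny, hypothetical helper expressing that convention:

# Hypothetical helper (not in the source): payload sent with 'receiver-ready'.
def receiver_ready_payload(latest_snap):
    # A snapshot name requests an incremental send from that parent;
    # an empty string requests a full transfer.
    return latest_snap if latest_snap is not None else ''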
Example #6
 def test_is_subvol_exists(self):
     mount_point = '/mnt2/test-pool/test-share'
     o = ['/mnt2/test-pool/test-share', '\tName: \t\t\ttest-share',
          '\tUUID: \t\t\t80c240a2-c353-7540-bb5e-b6a71a50a02e',
          '\tParent UUID: \t\t-', '\tReceived UUID: \t\t-',
          '\tCreation time: \t\t2016-07-27 17:01:09 +0100',
          '\tSubvolume ID: \t\t258', '\tGeneration: \t\t13',
          '\tGen at creation: \t13', '\tParent ID: \t\t5',
          '\tTop level ID: \t\t5', '\tFlags: \t\t\t-', '\tSnapshot(s):', '']
     e = ['']
     rc = 0
     # btrfs subvol show has return code of 0 (no errors) when subvol exists
     self.mock_run_command.return_value = (o, e, rc)
     self.assertTrue(is_subvol(mount_point),
                     msg='Did NOT return True for existing subvol')
Example #7
 def test_is_subvol_exists(self):
     mount_point = '/mnt2/test-pool/test-share'
     o = [
         '/mnt2/test-pool/test-share', '\tName: \t\t\ttest-share',
         '\tUUID: \t\t\t80c240a2-c353-7540-bb5e-b6a71a50a02e',
         '\tParent UUID: \t\t-', '\tReceived UUID: \t\t-',
         '\tCreation time: \t\t2016-07-27 17:01:09 +0100',
         '\tSubvolume ID: \t\t258', '\tGeneration: \t\t13',
         '\tGen at creation: \t13', '\tParent ID: \t\t5',
         '\tTop level ID: \t\t5', '\tFlags: \t\t\t-', '\tSnapshot(s):', ''
     ]
     e = ['']
     rc = 0
     # btrfs subvol show has return code of 0 (no errors) when subvol exists
     self.mock_run_command.return_value = (o, e, rc)
     self.assertTrue(is_subvol(mount_point),
                     msg='Did NOT return True for existing subvol')
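
The four tests above pin down only the return-code contract of is_subvol(): btrfs subvolume show exits 0 when the path is a subvolume and 1 when it is not. Below is a minimal sketch consistent with that contract; it calls subprocess directly, whereas the helper under test presumably goes through the project's run_command() wrapper (mocked as self.mock_run_command above), so treat this as an assumption rather than the actual implementation.

import subprocess

def is_subvol(mnt_pt, btrfs='/usr/sbin/btrfs'):
    # 'btrfs subvolume show <path>' exits 0 when <path> is a subvolume and
    # non-zero (1 in the tests above) when the path cannot be resolved.
    p = subprocess.Popen([btrfs, 'subvolume', 'show', mnt_pt],
                         stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    p.communicate()
    return p.returncode == 0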
Example #8
def create_repclone(share, request, logger, snapshot):
    """
    Variant of create_clone but where the share already exists and is to be
    supplanted by a snapshot which is effectively moved into the share's prior
    position, both in the db and on the file system. This is achieved thus:
    Unmount target share - (via remove_share()).
    Btrfs subvol delete target share (via remove_share()).
    Remove prior target share mount point (dir).
    Move snap source to target share's former location (becomes share on disk).
    Update existing target share db entry with source snap's qgroup / usage.
    Remove source snap's db entry: updated share db entry makes it redundant.
    Remount share (which now represents the prior snap's subvol relocated).
    :param share: Share object to be supplanted
    :param request:
    :param logger: Logger object to reference
    :param snapshot: Source snapshot/quirk share object to supplant target.
    :return: response of serialized share (in its updated form)
    """
    try:
        logger.info("Supplanting share ({}) with "
                    "snapshot ({}).".format(share.name, snapshot.name))
        # We first strip our snapshot.name of any path: when we encounter the
        # initially created receive subvol, it is identified as a share whose
        # subvol name is a snapshot's location (a current quirk of the import
        # system).
        # E.g. first receive subvol/share-in-snapdir name example:
        # ".snapshots/C583C37F-...1712B_sharename/sharename_19_replication_1".
        # Subsequent more regular snapshots (in db as such) are named thus:
        # "sharename_19_replication_2" or "sharename_19_replication_2" and on.
        # The 19 in the above names is the generation of the replication task.
        #
        # Normalise source name across initial quirk share & subsequent snaps.
        source_name = snapshot.name.split("/")[-1]
        # Note in the above we have to use Object.name for polymorphism, but
        # our share is passed by its subvol (potential fragility point).
        snap_path = "{}/.snapshots/{}/{}".format(share.pool.mnt_pt, share.name,
                                                 source_name).replace(
                                                     "//", "/")
        # e.g. for above: /mnt2/poolname/.snapshots/sharename/snapname
        # or /.snapshots/sharename/snapname for system pool shares
        share_path = ("{}/{}".format(share.pool.mnt_pt,
                                     share.name)).replace("//", "/")
        # e.g. for above: /mnt2/poolname/sharename or /sharename for system pool shares
        # Passed db snap assured by caller but this does not guarantee on disk.
        if not is_subvol(snap_path):
            raise Exception("Subvol with path ({}) does not exist. Aborting "
                            "replacement of share with path ({}).".format(
                                snap_path, share_path))
        # unmounts and then subvol deletes our on disk share
        remove_share(share.pool, share.name, PQGROUP_DEFAULT)
        # Remove read only flag on our snapshot subvol
        set_property(snap_path, "ro", "false", mount=False)
        # Ensure removed share path is clean, i.e. remove the mount point.
        run_command(["/usr/bin/rm", "-rf", share_path], throw=False)
        # Now move snapshot to prior share's location. Given both a share and
        # a snapshot are subvols, we effectively promote the snap to a share.
        logger.info(
            "Moving snapshot ({}) to prior share's pool location ({})".format(
                snap_path, share_path))
        shutil.move(snap_path, share_path)
        # This should have re-established our just removed subvol.
        # Supplant share db info with snap info to reflect new on disk state.
        share.qgroup = snapshot.qgroup
        share.rusage = snapshot.rusage
        share.eusage = snapshot.eusage
        share.save()
        # delete our now redundant snapshot/quirky share db entry
        snapshot.delete()
        # update our share's quota
        update_quota(share.pool, share.pqgroup, share.size * 1024)
        # mount our newly supplanted share
        # We independently mount all shares, data pool or system pool, in /mnt2/name
        mnt_pt = "{}{}".format(settings.MNT_PT, share.name)
        mount_share(share, mnt_pt)
        return Response(ShareSerializer(share).data)
    except Exception as e:
        handle_exception(e, request)
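
The source-name normalisation described in the comments of create_repclone() can be shown in isolation: both the quirky first receive subvol (identified by its .snapshots location) and later regular snapshots reduce to the bare snapshot name once any path is stripped. The names below are copied from the comment above (the truncated UUID is kept as written there); this is illustration only.

# Illustration of the source_name normalisation used by create_repclone().
names = [
    ".snapshots/C583C37F-...1712B_sharename/sharename_19_replication_1",
    "sharename_19_replication_2",
]
for name in names:
    print(name.split("/")[-1])
# -> sharename_19_replication_1
# -> sharename_19_replication_2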
Example #9
    def run(self):
        msg = ('Failed to get the sender ip from the uuid(%s) for meta: %s' %
               (self.meta['uuid'], self.meta))
        with self._clean_exit_handler(msg):
            self.sender_ip = get_sender_ip(self.meta['uuid'], logger)

        msg = ('Failed to connect to the sender(%s) on data_port(%s). meta: '
               '%s. Aborting.' % (self.sender_ip, self.data_port, self.meta))
        with self._clean_exit_handler(msg):
            #@todo: add validation
            recv_sub = self.ctx.socket(zmq.SUB)
            recv_sub.connect('tcp://%s:%d' % (self.sender_ip, self.data_port))
            recv_sub.RCVTIMEO = 100
            recv_sub.setsockopt(zmq.SUBSCRIBE, str(self.meta['id']))

        msg = ('Failed to connect to the sender(%s) on '
               'meta_port(%d). meta: %s. Aborting.' %
               (self.sender_ip, self.meta_port, self.meta))
        with self._clean_exit_handler(msg):
            self.meta_push = self.ctx.socket(zmq.PUSH)
            self.meta_push.connect('tcp://%s:%d' % (self.sender_ip,
                                                    self.meta_port))

        sname = ('%s_%s' % (self.sender_id, self.src_share))
        if (not self.incremental):
            msg = ('Failed to verify/create share: %s. meta: %s. '
                   'Aborting.' % (sname, self.meta))
            with self._clean_exit_handler(msg, ack=True):
                create_share(sname, self.dest_pool, logger)

            msg = ('Failed to create the replica metadata object '
                   'for share: %s. meta: %s. Aborting.' %
                   (sname, self.meta))
            with self._clean_exit_handler(msg, ack=True):
                data = {'share': sname,
                        'appliance': self.sender_ip,
                        'src_share': self.src_share,
                        'data_port': self.data_port,
                        'meta_port': self.meta_port, }
                self.rid = create_rshare(data, logger)

        else:
            msg = ('Failed to retrieve the replica metadata object for '
                   'share: %s. meta: %s. Aborting.' % (sname, self.meta))
            with self._clean_exit_handler(msg):
                self.rid = rshare_id(sname, logger)

        sub_vol = ('%s%s/%s' % (settings.MNT_PT, self.meta['pool'], sname))
        if (not is_subvol(sub_vol)):
            msg = ('Failed to create parent subvolume %s' % sub_vol)
            with self._clean_exit_handler(msg, ack=True):
                run_command([BTRFS, 'subvolume', 'create', sub_vol])

        snap_fp = ('%s/%s' % (sub_vol, self.snap_name))
        with self._clean_exit_handler(msg):
            if (is_subvol(snap_fp)):
                ack = {'msg': 'snap_exists',
                       'id': self.meta['id'], }
                self.meta_push.send_json(ack)

        cmd = [BTRFS, 'receive', sub_vol]
        msg = ('Failed to start the low level btrfs receive command(%s)'
               '. Aborting.' % (cmd))
        with self._clean_exit_handler(msg, ack=True):
            rp = subprocess.Popen(cmd, shell=False, stdin=subprocess.PIPE,
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE)

        msg = ('Failed to send begin_ok to the sender for meta: %s' %
               self.meta)
        with self._clean_exit_handler(msg):
            ack = {'msg': 'begin_ok',
                   'id': self.meta['id'], }
            self.meta_push.send_json(ack)
        recv_timeout_counter = 0
        credit = settings.DEFAULT_SEND_CREDIT
        check_credit = True
        while True:
            if (check_credit is True and credit < 5):
                ack = {'msg': 'send_more',
                       'id': self.meta['id'],
                       'credit': settings.DEFAULT_SEND_CREDIT, }
                self.meta_push.send_json(ack)
                credit = credit + settings.DEFAULT_SEND_CREDIT
                logger.debug('%d KB received for %s' %
                             (int(self.kb_received / 1024), sname))

            try:
                recv_data = recv_sub.recv()
                recv_data = recv_data[len(self.meta['id']):]
                credit = credit - 1
                recv_timeout_counter = 0
                self.kb_received = self.kb_received + len(recv_data)
                if (self.rtid is None):
                    msg = ('Failed to create snapshot: %s. Aborting.' %
                           self.snap_name)
                    # Create a snapshot only if one doesn't already exist
                    # from a previous failed attempt.
                    with self._clean_exit_handler(msg, ack=True):
                        create_snapshot(sname, self.snap_name, logger,
                                        snap_type='receiver')

                    data = {'snap_name': self.snap_name}
                    msg = ('Failed to create receive trail for rid: %d'
                           '. meta: %s' % (self.rid, self.meta))
                    with self._clean_exit_handler(msg, ack=True):
                        self.rtid = create_receive_trail(self.rid, data,
                                                         logger)

                if (recv_data == 'END_SUCCESS' or recv_data == 'END_FAIL'):
                    check_credit = False
                    ts = datetime.utcnow().replace(tzinfo=utc)
                    data = {'kb_received': self.kb_received / 1024, }
                    if (recv_data == 'END_SUCCESS'):
                        data['receive_succeeded'] = ts.strftime(settings.SNAP_TS_FORMAT)
                        # delete the share, move the oldest snap to share
                        oldest_snap = get_oldest_snap(sub_vol, 3)
                        if (oldest_snap is not None):
                            snap_path = ('%s/%s' % (sub_vol, oldest_snap))
                            share_path = ('%s%s/%s' %
                                          (settings.MNT_PT, self.dest_pool,
                                           sname))
                            msg = ('Failed to promote the oldest Snapshot(%s) '
                                   'to Share(%s)' % (snap_path, share_path))
                            try:
                                pool = Pool.objects.get(name=self.dest_pool)
                                remove_share(pool, sname)
                                set_property(snap_path, 'ro', 'false',
                                             mount=False)
                                run_command(['/usr/bin/rm', '-rf', share_path],
                                            throw=False)
                                shutil.move(snap_path, share_path)
                                set_property(share_path, 'ro', 'true',
                                             mount=False)
                                delete_snapshot(sname, oldest_snap, logger)
                            except Exception as e:
                                logger.error(msg)
                                logger.exception(msg)
                    else:
                        logger.error('END_FAIL received for meta: %s. '
                                     'Terminating.' % self.meta)
                        rp.terminate()
                        data['receive_failed'] = ts
                        data['status'] = 'failed'

                    msg = ('Failed to update receive trail for rtid: %d'
                           '. meta: %s' % (self.rtid, self.meta))
                    with self._clean_exit_handler(msg, ack=True):
                        update_receive_trail(self.rtid, data, logger)
                    break
                if (rp.poll() is None):
                    rp.stdin.write(recv_data)
                    rp.stdin.flush()
                else:
                    logger.error('It seems the btrfs receive process died'
                                 ' unexpectedly.')
                    out, err = rp.communicate()
                    msg = ('Low level system error from btrfs receive '
                           'command. out: %s err: %s for rtid: %s meta: %s'
                           % (out, err, self.rtid, self.meta))
                    with self._clean_exit_handler(msg, ack=True):
                        ts = datetime.utcnow().replace(tzinfo=utc)
                        data = {'receive_failed': ts.strftime(settings.SNAP_TS_FORMAT),
                                'status': 'failed',
                                'error': msg, }
                        update_receive_trail(self.rtid, data, logger)
            except zmq.error.Again:
                recv_timeout_counter = recv_timeout_counter + 1
                if (recv_timeout_counter > 600):
                    logger.error('Nothing received in the last 60 seconds '
                                 'from the sender for meta: %s. Aborting.'
                                 % self.meta)
                    self._sys_exit(3)
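
The legacy receiver above (and its variant in Example #11) relies on a simple framing convention over the zmq SUB socket: the sender prefixes every data chunk with the replication id, the receiver subscribes on that id as the topic and strips it off before piping the bytes into btrfs receive. It also tolerates at most about 60 seconds of silence, since RCVTIMEO is 100 ms and the loop aborts once the timeout counter passes 600. A standalone sketch of both points, with a hypothetical id value:

# Standalone sketch of the topic-prefix framing used by the receiver above.
# The id value is hypothetical; the real code uses self.meta['id'].
def strip_topic_prefix(frame, topic_id):
    # The SUB socket only delivers frames that start with topic_id, so the
    # payload is everything after that prefix.
    return frame[len(topic_id):]

meta_id = 'replica-42'
frame = meta_id + 'raw btrfs stream bytes'      # what recv_sub.recv() yields
assert strip_topic_prefix(frame, meta_id) == 'raw btrfs stream bytes'

# Silence budget: 600 consecutive zmq.error.Again timeouts at 100 ms each.
assert 600 * 100 / 1000 == 60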
Example #10
def create_repclone(share, request, logger, snapshot):
    """
    Variant of create_clone but where the share already exists and is to be
    supplanted by a snapshot which is effectively moved into the share's prior
    position, both in the db and on the file system. This is achieved thus:
    Unmount target share - (via remove_share()).
    Btrfs subvol delete target share (via remove_share()).
    Remove prior target share mount point (dir).
    Move snap source to target share's former location (becomes share on disk).
    Update existing target share db entry with source snap's qgroup / usage.
    Remove source snap's db entry: updated share db entry makes it redundant.
    Remount share (which now represents the prior snap's subvol relocated).
    :param share: Share object to be supplanted
    :param request:
    :param logger: Logger object to reference
    :param snapshot: Source snapshot/quirk share object to supplant target.
    :return: response of serialized share (in its updated form)
    """
    try:
        logger.info('Supplanting share ({}) with '
                     'snapshot ({}).'.format(share.name, snapshot.name))
        # We first strip our snapshot.name of any path: when we encounter the
        # initially created receive subvol, it is identified as a share whose
        # subvol name is a snapshot's location (a current quirk of the import
        # system).
        # E.g. first receive subvol/share-in-snapdir name example:
        # ".snapshots/C583C37F-...1712B_sharename/sharename_19_replication_1".
        # Subsequent more regular snapshots (in db as such) are named thus:
        # "sharename_19_replication_2" or "sharename_19_replication_2" and on.
        # The 19 in the above names is the generation of the replication task.
        #
        # Normalise source name across initial quirk share & subsequent snaps.
        source_name = snapshot.name.split('/')[-1]
        # Note in the above we have to use Object.name for polymorphism, but
        # our share is passed by its subvol (potential fragility point).
        snap_path = '{}{}/.snapshots/{}/{}'.format(settings.MNT_PT,
                                                   share.pool.name, share.name,
                                                   source_name)
        # eg /mnt2/poolname/.snapshots/sharename/snapname
        share_path = ('{}{}/{}'.format(settings.MNT_PT, share.pool.name,
                                       share.name))
        # eg /mnt2/poolname/sharename
        # Passed db snap assured by caller but this does not guarantee on disk.
        if not is_subvol(snap_path):
            raise Exception('Subvol with path ({}) does not exist. Aborting '
                            'replacement of share ({}).'.format(snap_path,
                                                                share.name))
        # unmounts and then subvol deletes our on disk share
        remove_share(share.pool, share.name, PQGROUP_DEFAULT)
        # Remove read only flag on our snapshot subvol
        set_property(snap_path, 'ro', 'false', mount=False)
        # Ensure removed share path is clean, i.e. remove the mount point.
        run_command(['/usr/bin/rm', '-rf', share_path], throw=False)
        # Now move snapshot to prior share's location. Given both a share and
        # a snapshot are subvols, we effectively promote the snap to a share.
        shutil.move(snap_path, share_path)
        # This should have re-established our just removed subvol.
        # Supplant share db info with snap info to reflect new on disk state.
        share.qgroup = snapshot.qgroup
        share.rusage = snapshot.rusage
        share.eusage = snapshot.eusage
        share.save()
        # delete our now redundant snapshot/quirky share db entry
        snapshot.delete()
        # update our share's quota
        update_quota(share.pool, share.pqgroup, share.size * 1024)
        # mount our newly supplanted share
        mnt_pt = '{}{}'.format(settings.MNT_PT, share.name)
        mount_share(share, mnt_pt)
        return Response(ShareSerializer(share).data)
    except Exception as e:
        handle_exception(e, request)
Example #11
    def run(self):
        msg = ('Failed to get the sender ip from the uuid(%s) for meta: %s' %
               (self.meta['uuid'], self.meta))
        with self._clean_exit_handler(msg):
            self.sender_ip = get_sender_ip(self.meta['uuid'], logger)

        msg = ('Failed to connect to the sender(%s) on data_port(%s). meta: '
               '%s. Aborting.' % (self.sender_ip, self.data_port, self.meta))
        with self._clean_exit_handler(msg):
            #@todo: add validation
            recv_sub = self.ctx.socket(zmq.SUB)
            recv_sub.connect('tcp://%s:%d' % (self.sender_ip, self.data_port))
            recv_sub.RCVTIMEO = 100
            recv_sub.setsockopt(zmq.SUBSCRIBE, str(self.meta['id']))

        msg = ('Failed to connect to the sender(%s) on '
               'meta_port(%d). meta: %s. Aborting.' %
               (self.sender_ip, self.meta_port, self.meta))
        with self._clean_exit_handler(msg):
            self.meta_push = self.ctx.socket(zmq.PUSH)
            self.meta_push.connect('tcp://%s:%d' % (self.sender_ip,
                                                    self.meta_port))

        sname = ('%s_%s' % (self.sender_id, self.src_share))
        if (not self.incremental):
            msg = ('Failed to verify/create share: %s. meta: %s. '
                   'Aborting.' % (sname, self.meta))
            with self._clean_exit_handler(msg, ack=True):
                create_share(sname, self.dest_pool, logger)

            msg = ('Failed to create the replica metadata object '
                   'for share: %s. meta: %s. Aborting.' %
                   (sname, self.meta))
            with self._clean_exit_handler(msg, ack=True):
                data = {'share': sname,
                        'appliance': self.sender_ip,
                        'src_share': self.src_share,
                        'data_port': self.data_port,
                        'meta_port': self.meta_port, }
                self.rid = create_rshare(data, logger)

        else:
            msg = ('Failed to retrieve the replica metadata object for '
                   'share: %s. meta: %s. Aborting.' % (sname, self.meta))
            with self._clean_exit_handler(msg):
                self.rid = rshare_id(sname, logger)

        sub_vol = ('%s%s/.snapshots/%s' % (settings.MNT_PT, self.meta['pool'],
                                           sname))
        if (not is_subvol(sub_vol)):
            msg = ('Failed to create parent subvolume %s' % sub_vol)
            with self._clean_exit_handler(msg, ack=True):
                run_command([BTRFS, 'subvolume', 'create', sub_vol])

        snap_fp = ('%s/%s' % (sub_vol, self.snap_name))
        with self._clean_exit_handler(msg):
            if (is_subvol(snap_fp)):
                ack = {'msg': 'snap_exists',
                       'id': self.meta['id'], }
                self.meta_push.send_json(ack)

        cmd = [BTRFS, 'receive', sub_vol]
        msg = ('Failed to start the low level btrfs receive command(%s)'
               '. Aborting.' % (cmd))
        with self._clean_exit_handler(msg, ack=True):
            rp = subprocess.Popen(cmd, shell=False, stdin=subprocess.PIPE,
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE)

        msg = ('Failed to send begin_ok to the sender for meta: %s' %
               self.meta)
        with self._clean_exit_handler(msg):
            ack = {'msg': 'begin_ok',
                   'id': self.meta['id'], }
            self.meta_push.send_json(ack)
        recv_timeout_counter = 0
        credit = settings.DEFAULT_SEND_CREDIT
        check_credit = True
        while True:
            if (check_credit is True and credit < 5):
                ack = {'msg': 'send_more',
                       'id': self.meta['id'],
                       'credit': settings.DEFAULT_SEND_CREDIT, }
                self.meta_push.send_json(ack)
                credit = credit + settings.DEFAULT_SEND_CREDIT
                logger.debug('%d KB received for %s' %
                             (int(self.kb_received / 1024), sname))

            try:
                recv_data = recv_sub.recv()
                recv_data = recv_data[len(self.meta['id']):]
                credit = credit - 1
                recv_timeout_counter = 0
                self.kb_received = self.kb_received + len(recv_data)
                if (self.rtid is None):
                    msg = ('Failed to create snapshot: %s. Aborting.' %
                           self.snap_name)
                    # Create a snapshot only if one doesn't already exist
                    # from a previous failed attempt.
                    with self._clean_exit_handler(msg, ack=True):
                        create_snapshot(sname, self.snap_name, logger,
                                        snap_type='receiver')

                    data = {'snap_name': self.snap_name}
                    msg = ('Failed to create receive trail for rid: %d'
                           '. meta: %s' % (self.rid, self.meta))
                    with self._clean_exit_handler(msg, ack=True):
                        self.rtid = create_receive_trail(self.rid, data,
                                                         logger)

                if (recv_data == 'END_SUCCESS' or recv_data == 'END_FAIL'):
                    check_credit = False
                    ts = datetime.utcnow().replace(tzinfo=utc)
                    data = {'kb_received': self.kb_received / 1024, }
                    if (recv_data == 'END_SUCCESS'):
                        data['receive_succeeded'] = ts
                        # delete the share, move the oldest snap to share
                        oldest_snap = get_oldest_snap(sub_vol, 3)
                        if (oldest_snap is not None):
                            snap_path = ('%s/%s' % (sub_vol, oldest_snap))
                            share_path = ('%s%s/%s' %
                                          (settings.MNT_PT, self.dest_pool,
                                           sname))
                            msg = ('Failed to promote the oldest Snapshot(%s) '
                                   'to Share(%s)' % (snap_path, share_path))
                            try:
                                pool = Pool.objects.get(name=self.dest_pool)
                                remove_share(pool, sname)
                                set_property(snap_path, 'ro', 'false',
                                             mount=False)
                                run_command(['/usr/bin/rm', '-rf', share_path],
                                            throw=False)
                                shutil.move(snap_path, share_path)
                                set_property(share_path, 'ro', 'true',
                                             mount=False)
                                delete_snapshot(sname, oldest_snap, logger)
                            except Exception as e:
                                logger.error(msg)
                                logger.exception(msg)
                    else:
                        logger.error('END_FAIL received for meta: %s. '
                                     'Terminating.' % self.meta)
                        rp.terminate()
                        data['receive_failed'] = ts
                        data['status'] = 'failed'

                    msg = ('Failed to update receive trail for rtid: %d'
                           '. meta: %s' % (self.rtid, self.meta))
                    with self._clean_exit_handler(msg, ack=True):
                        update_receive_trail(self.rtid, data, logger)
                    break
                if (rp.poll() is None):
                    rp.stdin.write(recv_data)
                    rp.stdin.flush()
                else:
                    logger.error('It seems the btrfs receive process died'
                                 ' unexpectedly.')
                    out, err = rp.communicate()
                    msg = ('Low level system error from btrfs receive '
                           'command. out: %s err: %s for rtid: %s meta: %s'
                           % (out, err, self.rtid, self.meta))
                    with self._clean_exit_handler(msg, ack=True):
                        ts = datetime.utcnow().replace(tzinfo=utc)
                        data = {'receive_failed': ts,
                                'status': 'failed',
                                'error': msg, }
                        update_receive_trail(self.rtid, data, logger)
            except zmq.error.Again:
                recv_timeout_counter = recv_timeout_counter + 1
                if (recv_timeout_counter > 600):
                    logger.error('Nothing received in the last 60 seconds '
                                 'from the sender for meta: %s. Aborting.'
                                 % self.meta)
                    self._sys_exit(3)
Example #12
    def run(self):
        logger.debug('Id: %s. Starting a new Receiver for meta: %s' %
                     (self.identity, self.meta))
        self.msg = ('Top level exception in receiver')
        latest_snap = None
        with self._clean_exit_handler():
            self.law = APIWrapper()
            self.poll = zmq.Poller()
            self.dealer = self.ctx.socket(zmq.DEALER)
            self.dealer.setsockopt_string(zmq.IDENTITY, u'%s' % self.identity)
            self.dealer.set_hwm(10)
            self.dealer.connect('ipc://%s' %
                                settings.REPLICATION.get('ipc_socket'))
            self.poll.register(self.dealer, zmq.POLLIN)

            self.ack = True
            self.msg = ('Failed to get the sender ip for appliance: %s' %
                        self.sender_id)
            self.sender_ip = Appliance.objects.get(uuid=self.sender_id).ip

            if (not self.incremental):
                self.msg = ('Failed to verify/create share: %s.' % self.sname)
                self.create_share(self.sname, self.dest_pool)

                self.msg = ('Failed to create the replica metadata object '
                            'for share: %s.' % self.sname)
                data = {
                    'share': self.sname,
                    'appliance': self.sender_ip,
                    'src_share': self.src_share,
                }
                self.rid = self.create_rshare(data)
            else:
                self.msg = ('Failed to retrieve the replica metadata '
                            'object for share: %s.' % self.sname)
                rso = ReplicaShare.objects.get(share=self.sname)
                self.rid = rso.id
                # Find and send the current snapshot to the sender. This will
                # be used as the starting point for the btrfs-send diff.
                self.msg = ('Failed to verify latest replication snapshot '
                            'on the system.')
                latest_snap = self._latest_snap(rso)

            self.msg = ('Failed to create receive trail for rid: %d' %
                        self.rid)
            data = {
                'snap_name': self.snap_name,
            }
            self.rtid = self.create_receive_trail(self.rid, data)

            # delete the share, move the oldest snap to share
            self.msg = ('Failed to promote the oldest Snapshot to Share.')
            oldest_snap = get_oldest_snap(self.snap_dir,
                                          self.num_retain_snaps,
                                          regex='_replication_')
            if (oldest_snap is not None):
                self.update_repclone(self.sname, oldest_snap)
                self.refresh_share_state()
                self.refresh_snapshot_state()

            self.msg = ('Failed to prune old Snapshots')
            self._delete_old_snaps(self.sname, self.snap_dir,
                                   self.num_retain_snaps + 1)

            # TODO: The following should be re-instantiated once we have a
            # TODO: working method for doing so. see validate_src_share.
            # self.msg = ('Failed to validate the source share(%s) on '
            #             'sender(uuid: %s '
            #             ') Did the ip of the sender change?' %
            #             (self.src_share, self.sender_id))
            # self.validate_src_share(self.sender_id, self.src_share)

            sub_vol = ('%s%s/%s' %
                       (settings.MNT_PT, self.dest_pool, self.sname))
            if (not is_subvol(sub_vol)):
                self.msg = ('Failed to create parent subvolume %s' % sub_vol)
                run_command([BTRFS, 'subvolume', 'create', sub_vol])

            self.msg = ('Failed to create snapshot directory: %s' %
                        self.snap_dir)
            run_command(['/usr/bin/mkdir', '-p', self.snap_dir])
            snap_fp = ('%s/%s' % (self.snap_dir, self.snap_name))

            # If the snapshot already exists (presumably from a previous
            # attempt) and the sender tries to send the same one, reply with
            # snap-exists and do not start btrfs receive.
            if (is_subvol(snap_fp)):
                logger.debug('Id: %s. Snapshot to be sent(%s) already '
                             'exists. Not starting a new receive process' %
                             (self.identity, snap_fp))
                self._send_recv('snap-exists')
                self._sys_exit(0)

            cmd = [BTRFS, 'receive', self.snap_dir]
            self.msg = ('Failed to start the low level btrfs receive '
                        'command(%s). Aborting.' % cmd)
            self.rp = subprocess.Popen(cmd,
                                       shell=False,
                                       stdin=subprocess.PIPE,
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.PIPE)

            self.msg = ('Failed to send receiver-ready')
            rcommand, rmsg = self._send_recv('receiver-ready', latest_snap
                                             or '')
            if (rcommand is None):
                logger.error('Id: %s. No response from the broker for '
                             'receiver-ready command. Aborting.' %
                             self.identity)
                self._sys_exit(3)

            term_commands = (
                'btrfs-send-init-error',
                'btrfs-send-unexpected-termination-error',
                'btrfs-send-nonzero-termination-error',
            )
            num_tries = 10
            poll_interval = 6000  # 6 seconds
            num_msgs = 0
            t0 = time.time()
            while (True):
                socks = dict(self.poll.poll(poll_interval))
                if (socks.get(self.dealer) == zmq.POLLIN):
                    # reset to wait up to 60 seconds (poll_interval x
                    # num_tries milliseconds) for every message
                    num_tries = 10
                    command, message = self.dealer.recv_multipart()
                    if (command == 'btrfs-send-stream-finished'):
                        # This command concludes the fs data transfer. After
                        # this, the btrfs-recv process should be terminated
                        # (.communicate).
                        if (self.rp.poll() is None):
                            self.msg = ('Failed to terminate btrfs-recv '
                                        'command')
                            out, err = self.rp.communicate()
                            out = out.split('\n')
                            err = err.split('\n')
                            logger.debug('Id: %s. Terminated btrfs-recv. '
                                         'cmd = %s out = %s err: %s rc: %s' %
                                         (self.identity, cmd, out, err,
                                          self.rp.returncode))
                        if (self.rp.returncode != 0):
                            self.msg = ('btrfs-recv exited with unexpected '
                                        'exitcode(%s). ' % self.rp.returncode)
                            raise Exception(self.msg)
                        data = {
                            'status': 'succeeded',
                            'kb_received': self.total_bytes_received / 1024,
                        }
                        self.msg = ('Failed to update receive trail for '
                                    'rtid: %d' % self.rtid)
                        self.update_receive_trail(self.rtid, data)

                        self._send_recv('btrfs-recv-finished')
                        self.refresh_share_state()
                        self.refresh_snapshot_state()

                        dsize, drate = self.size_report(
                            self.total_bytes_received, t0)
                        logger.debug('Id: %s. Receive complete. Total data '
                                     'transferred: %s. Rate: %s/sec.' %
                                     (self.identity, dsize, drate))
                        self._sys_exit(0)

                    if (command in term_commands):
                        self.msg = ('Terminal command(%s) received from the '
                                    'sender. Aborting.' % command)
                        raise Exception(self.msg)

                    if (self.rp.poll() is None):
                        self.rp.stdin.write(message)
                        self.rp.stdin.flush()
                        # @todo: implement advanced credit request system.
                        self.dealer.send_multipart([b'send-more', ''])
                        num_msgs += 1
                        self.total_bytes_received += len(message)
                        if (num_msgs == 1000):
                            num_msgs = 0
                            data = {
                                'status': 'pending',
                                'kb_received':
                                self.total_bytes_received / 1024,
                            }
                            self.update_receive_trail(self.rtid, data)

                            dsize, drate = self.size_report(
                                self.total_bytes_received, t0)
                            logger.debug('Id: %s. Receiver alive. Data '
                                         'transferred: %s. Rate: %s/sec.' %
                                         (self.identity, dsize, drate))
                    else:
                        out, err = self.rp.communicate()
                        out = out.split('\n')
                        err = err.split('\n')
                        logger.error('Id: %s. btrfs-recv died unexpectedly. '
                                     'cmd: %s out: %s. err: %s' %
                                     (self.identity, cmd, out, err))
                        msg = (
                            'Low level system error from btrfs receive '
                            'command. cmd: %s out: %s err: %s for rtid: %s' %
                            (cmd, out, err, self.rtid))
                        data = {
                            'status': 'failed',
                            'error': msg,
                        }
                        self.msg = ('Failed to update receive trail for '
                                    'rtid: %d.' % self.rtid)
                        self.update_receive_trail(self.rtid, data)
                        self.msg = msg
                        raise Exception(self.msg)
                else:
                    num_tries -= 1
                    msg = ('No response received from the broker. '
                           'remaining tries: %d' % num_tries)
                    logger.error('Id: %s. %s' % (self.identity, msg))
                    if (num_tries == 0):
                        self.msg = ('%s. Terminating the receiver.' % msg)
                        raise Exception(self.msg)
Example #13
    def run(self):
        logger.debug("Id: %s. Starting a new Receiver for meta: %s" %
                     (self.identity, self.meta))
        self.msg = "Top level exception in receiver"
        latest_snap = None
        with self._clean_exit_handler():
            self.law = APIWrapper()
            self.poll = zmq.Poller()
            self.dealer = self.ctx.socket(zmq.DEALER)
            self.dealer.setsockopt_string(zmq.IDENTITY, u"%s" % self.identity)
            self.dealer.set_hwm(10)
            self.dealer.connect("ipc://%s" %
                                settings.REPLICATION.get("ipc_socket"))
            self.poll.register(self.dealer, zmq.POLLIN)

            self.ack = True
            self.msg = "Failed to get the sender ip for appliance: %s" % self.sender_id
            self.sender_ip = Appliance.objects.get(uuid=self.sender_id).ip

            if not self.incremental:
                self.msg = "Failed to verify/create share: %s." % self.sname
                self.create_share(self.sname, self.dest_pool)

                self.msg = ("Failed to create the replica metadata object "
                            "for share: %s." % self.sname)
                data = {
                    "share": self.sname,
                    "appliance": self.sender_ip,
                    "src_share": self.src_share,
                }
                self.rid = self.create_rshare(data)
            else:
                self.msg = ("Failed to retreive the replica metadata "
                            "object for share: %s." % self.sname)
                rso = ReplicaShare.objects.get(share=self.sname)
                self.rid = rso.id
                # Find and send the current snapshot to the sender. This will
                # be used as the starting point for the btrfs-send diff.
                self.msg = (
                    "Failed to verify latest replication snapshot on the system."
                )
                latest_snap = self._latest_snap(rso)

            self.msg = "Failed to create receive trail for rid: %d" % self.rid
            data = {
                "snap_name": self.snap_name,
            }
            self.rtid = self.create_receive_trail(self.rid, data)

            # delete the share, move the oldest snap to share
            self.msg = "Failed to promote the oldest Snapshot to Share."
            oldest_snap = get_oldest_snap(self.snap_dir,
                                          self.num_retain_snaps,
                                          regex="_replication_")
            if oldest_snap is not None:
                self.update_repclone(self.sname, oldest_snap)
                self.refresh_share_state()
                self.refresh_snapshot_state()

            self.msg = "Failed to prune old Snapshots"
            self._delete_old_snaps(self.sname, self.snap_dir,
                                   self.num_retain_snaps + 1)

            # TODO: The following should be re-instantiated once we have a
            # TODO: working method for doing so. see validate_src_share.
            # self.msg = ('Failed to validate the source share(%s) on '
            #             'sender(uuid: %s '
            #             ') Did the ip of the sender change?' %
            #             (self.src_share, self.sender_id))
            # self.validate_src_share(self.sender_id, self.src_share)

            sub_vol = "%s%s/%s" % (settings.MNT_PT, self.dest_pool, self.sname)
            if not is_subvol(sub_vol):
                self.msg = "Failed to create parent subvolume %s" % sub_vol
                run_command([BTRFS, "subvolume", "create", sub_vol])

            self.msg = "Failed to create snapshot directory: %s" % self.snap_dir
            run_command(["/usr/bin/mkdir", "-p", self.snap_dir])
            snap_fp = "%s/%s" % (self.snap_dir, self.snap_name)

            # If the snapshot already exists (presumably from a previous
            # attempt) and the sender tries to send the same one, reply with
            # snap-exists and do not start btrfs receive.
            if is_subvol(snap_fp):
                logger.debug("Id: %s. Snapshot to be sent(%s) already "
                             "exists. Not starting a new receive process" %
                             (self.identity, snap_fp))
                self._send_recv("snap-exists")
                self._sys_exit(0)

            cmd = [BTRFS, "receive", self.snap_dir]
            self.msg = ("Failed to start the low level btrfs receive "
                        "command(%s). Aborting." % cmd)
            self.rp = subprocess.Popen(
                cmd,
                shell=False,
                stdin=subprocess.PIPE,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
            )

            self.msg = "Failed to send receiver-ready"
            rcommand, rmsg = self._send_recv("receiver-ready", latest_snap
                                             or "")
            if rcommand is None:
                logger.error("Id: %s. No response from the broker for "
                             "receiver-ready command. Aborting." %
                             self.identity)
                self._sys_exit(3)

            term_commands = (
                "btrfs-send-init-error",
                "btrfs-send-unexpected-termination-error",
                "btrfs-send-nonzero-termination-error",
            )
            num_tries = 10
            poll_interval = 6000  # 6 seconds
            num_msgs = 0
            t0 = time.time()
            while True:
                socks = dict(self.poll.poll(poll_interval))
                if socks.get(self.dealer) == zmq.POLLIN:
                    # reset to wait up to 60 seconds (poll_interval x
                    # num_tries milliseconds) for every message
                    num_tries = 10
                    command, message = self.dealer.recv_multipart()
                    if command == "btrfs-send-stream-finished":
                        # This command concludes the fs data transfer. After
                        # this, the btrfs-recv process should be terminated
                        # (.communicate).
                        if self.rp.poll() is None:
                            self.msg = "Failed to terminate btrfs-recv command"
                            out, err = self.rp.communicate()
                            out = out.split("\n")
                            err = err.split("\n")
                            logger.debug("Id: %s. Terminated btrfs-recv. "
                                         "cmd = %s out = %s err: %s rc: %s" %
                                         (self.identity, cmd, out, err,
                                          self.rp.returncode))
                        if self.rp.returncode != 0:
                            self.msg = ("btrfs-recv exited with unexpected "
                                        "exitcode(%s). " % self.rp.returncode)
                            raise Exception(self.msg)
                        data = {
                            "status": "succeeded",
                            "kb_received": self.total_bytes_received / 1024,
                        }
                        self.msg = (
                            "Failed to update receive trail for rtid: %d" %
                            self.rtid)
                        self.update_receive_trail(self.rtid, data)

                        self._send_recv("btrfs-recv-finished")
                        self.refresh_share_state()
                        self.refresh_snapshot_state()

                        dsize, drate = self.size_report(
                            self.total_bytes_received, t0)
                        logger.debug("Id: %s. Receive complete. Total data "
                                     "transferred: %s. Rate: %s/sec." %
                                     (self.identity, dsize, drate))
                        self._sys_exit(0)

                    if command in term_commands:
                        self.msg = ("Terminal command(%s) received from the "
                                    "sender. Aborting." % command)
                        raise Exception(self.msg)

                    if self.rp.poll() is None:
                        self.rp.stdin.write(message)
                        self.rp.stdin.flush()
                        # @todo: implement advanced credit request system.
                        self.dealer.send_multipart([b"send-more", ""])
                        num_msgs += 1
                        self.total_bytes_received += len(message)
                        if num_msgs == 1000:
                            num_msgs = 0
                            data = {
                                "status": "pending",
                                "kb_received":
                                self.total_bytes_received / 1024,
                            }
                            self.update_receive_trail(self.rtid, data)

                            dsize, drate = self.size_report(
                                self.total_bytes_received, t0)
                            logger.debug("Id: %s. Receiver alive. Data "
                                         "transferred: %s. Rate: %s/sec." %
                                         (self.identity, dsize, drate))
                    else:
                        out, err = self.rp.communicate()
                        out = out.split("\n")
                        err = err.split("\n")
                        logger.error("Id: %s. btrfs-recv died unexpectedly. "
                                     "cmd: %s out: %s. err: %s" %
                                     (self.identity, cmd, out, err))
                        msg = (
                            "Low level system error from btrfs receive "
                            "command. cmd: %s out: %s err: %s for rtid: %s" %
                            (cmd, out, err, self.rtid))
                        data = {
                            "status": "failed",
                            "error": msg,
                        }
                        self.msg = ("Failed to update receive trail for "
                                    "rtid: %d." % self.rtid)
                        self.update_receive_trail(self.rtid, data)
                        self.msg = msg
                        raise Exception(self.msg)
                else:
                    num_tries -= 1
                    msg = ("No response received from the broker. "
                           "remaining tries: %d" % num_tries)
                    logger.error("Id: %s. %s" % (self.identity, msg))
                    if num_tries == 0:
                        self.msg = "%s. Terminating the receiver." % msg
                        raise Exception(self.msg)
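
For orientation, the following collects the command vocabulary the modern receiver exchanges with the broker, as visible in run() above, together with the per-message timeout budget referenced in the poll-loop comment. The constant names are illustrative, not from the source.

# Receiver-side protocol summary, as visible in run() above (illustration).
RECEIVER_SENDS = (
    "receiver-ready",          # payload: latest_snap or "" (full transfer)
    "snap-exists",             # snapshot already present; no receive started
    "send-more",               # per-chunk credit request
    "btrfs-recv-finished",     # stream applied successfully
)
SENDER_OR_BROKER_SENDS = (
    "btrfs-send-stream-finished",
    "btrfs-send-init-error",
    "btrfs-send-unexpected-termination-error",
    "btrfs-send-nonzero-termination-error",
)

# Per-message broker-silence budget: poll_interval x num_tries milliseconds.
assert 6000 * 10 / 1000 == 60  # up to 60 seconds before the receiver aborts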