Пример #1
0
 def validate_src_share(self, sender_uuid, sname):
     url = 'https://'
     if (self.raw is None):
         a = Appliance.objects.get(uuid=sender_uuid)
         url = ('%s%s:%s' % (url, a.ip, a.mgmt_port))
         self.raw = APIWrapper(client_id=a.client_id,
                               client_secret=a.client_secret,
                               url=url)
     return self.raw.api_call(url='shares/%s' % sname)
Пример #2
0
 def validate_src_share(self, sender_uuid, sname):
     url = "https://"
     if self.raw is None:
         a = Appliance.objects.get(uuid=sender_uuid)
         url = "%s%s:%s" % (url, a.ip, a.mgmt_port)
         self.raw = APIWrapper(client_id=a.client_id,
                               client_secret=a.client_secret,
                               url=url)
     # TODO: update url to include senders shareId as sname is now invalid
     return self.raw.api_call(url="shares/%s" % sname)
Пример #3
0
    def run(self):
        self.law = APIWrapper()

        try:
            so = Service.objects.get(name="replication")
            config_d = json.loads(so.config)
            self.listener_port = int(config_d["listener_port"])
            nco = NetworkConnection.objects.get(name=config_d["network_interface"])
            self.listener_interface = nco.ipaddr
        except NetworkConnection.DoesNotExist:
            self.listener_interface = "0.0.0.0"
        except Exception as e:
            msg = (
                "Failed to fetch network interface for Listner/Broker. "
                "Exception: %s" % e.__str__()
            )
            return logger.error(msg)

        try:
            self.uuid = Appliance.objects.get(current_appliance=True).uuid
        except Exception as e:
            msg = (
                "Failed to get uuid of current appliance. Aborting. "
                "Exception: %s" % e.__str__()
            )
            return logger.error(msg)

        ctx = zmq.Context()
        frontend = ctx.socket(zmq.ROUTER)
        frontend.set_hwm(10)
        frontend.bind("tcp://%s:%d" % (self.listener_interface, self.listener_port))

        backend = ctx.socket(zmq.ROUTER)
        backend.bind("ipc://%s" % settings.REPLICATION.get("ipc_socket"))

        poller = zmq.Poller()
        poller.register(frontend, zmq.POLLIN)
        poller.register(backend, zmq.POLLIN)
        self.local_receivers = {}

        iterations = 10
        poll_interval = 6000  # 6 seconds
        msg_count = 0
        while True:
            # This loop may still continue even if replication service
            # is terminated, as long as data is coming in.
            socks = dict(poller.poll(timeout=poll_interval))
            if frontend in socks and socks[frontend] == zmq.POLLIN:
                address, command, msg = frontend.recv_multipart()
                if address not in self.remote_senders:
                    self.remote_senders[address] = 1
                else:
                    self.remote_senders[address] += 1
                msg_count += 1
                if msg_count == 1000:
                    msg_count = 0
                    for rs, count in self.remote_senders.items():
                        logger.debug(
                            "Active Receiver: %s. Messages processed:"
                            "%d" % (rs, count)
                        )
                if command == "sender-ready":
                    logger.debug("initial greeting from %s" % address)
                    # Start a new receiver and send the appropriate response
                    try:
                        start_nr = True
                        if address in self.local_receivers:
                            start_nr = False
                            ecode = self.local_receivers[address].exitcode
                            if ecode is not None:
                                del self.local_receivers[address]
                                logger.debug(
                                    "Receiver(%s) exited. exitcode: "
                                    "%s. Forcing removal from broker "
                                    "list." % (address, ecode)
                                )
                                start_nr = True
                            else:
                                msg = (
                                    "Receiver(%s) already exists. "
                                    "Will not start a new one." % address
                                )
                                logger.error(msg)
                                # @todo: There may be a different way to handle
                                # this. For example, we can pass the message to
                                # the active receiver and factor into it's
                                # retry/robust logic. But that is for later.
                                frontend.send_multipart(
                                    [address, "receiver-init-error", msg]
                                )
                        if start_nr:
                            nr = Receiver(address, msg)
                            nr.daemon = True
                            nr.start()
                            logger.debug("New Receiver(%s) started." % address)
                            self.local_receivers[address] = nr
                        continue
                    except Exception as e:
                        msg = (
                            "Exception while starting the "
                            "new receiver for %s: %s" % (address, e.__str__())
                        )
                        logger.error(msg)
                        frontend.send_multipart([address, "receiver-init-error", msg])
                else:
                    # do we hit hwm? is the dealer still connected?
                    backend.send_multipart([address, command, msg])

            elif backend in socks and socks[backend] == zmq.POLLIN:
                address, command, msg = backend.recv_multipart()
                if command == "new-send":
                    rid = int(msg)
                    logger.debug("new-send request received for %d" % rid)
                    rcommand = "ERROR"
                    try:
                        replica = Replica.objects.get(id=rid)
                        if replica.enabled:
                            self._process_send(replica)
                            msg = (
                                "A new Sender started successfully for "
                                "Replication Task(%d)." % rid
                            )
                            rcommand = "SUCCESS"
                        else:
                            msg = (
                                "Failed to start a new Sender for "
                                "Replication "
                                "Task(%d) because it is disabled." % rid
                            )
                    except Exception as e:
                        msg = (
                            "Failed to start a new Sender for Replication "
                            "Task(%d). Exception: %s" % (rid, e.__str__())
                        )
                        logger.error(msg)
                    finally:
                        backend.send_multipart([address, rcommand, str(msg)])
                elif address in self.remote_senders:
                    if command in (
                        "receiver-ready",
                        "receiver-error",
                        "btrfs-recv-finished",
                    ):  # noqa E501
                        logger.debug("Identitiy: %s command: %s" % (address, command))
                        backend.send_multipart([address, b"ACK", ""])
                        # a new receiver has started. reply to the sender that
                        # must be waiting
                    frontend.send_multipart([address, command, msg])

            else:
                iterations -= 1
                if iterations == 0:
                    iterations = 10
                    self._prune_senders()
                    self._delete_receivers()
                    cur_time = time.time()
                    if (
                        self.trail_prune_time is None
                        or (cur_time - self.trail_prune_time) > 3600
                    ):
                        # prune send/receive trails every hour or so.
                        self.trail_prune_time = cur_time
                        map(self.prune_replica_trail, Replica.objects.filter())
                        map(self.prune_receive_trail, ReplicaShare.objects.filter())
                        logger.debug("Replica trails are truncated successfully.")

                    if os.getppid() != self.ppid:
                        logger.error("Parent exited. Aborting.")
                        ctx.destroy()
                        # do some cleanup of senders before quitting?
                        break
Пример #4
0
    def run(self):
        logger.debug('Id: %s. Starting a new Receiver for meta: %s' %
                     (self.identity, self.meta))
        self.msg = ('Top level exception in receiver')
        latest_snap = None
        with self._clean_exit_handler():
            self.law = APIWrapper()
            self.poll = zmq.Poller()
            self.dealer = self.ctx.socket(zmq.DEALER)
            self.dealer.setsockopt_string(zmq.IDENTITY, u'%s' % self.identity)
            self.dealer.set_hwm(10)
            self.dealer.connect('ipc://%s' %
                                settings.REPLICATION.get('ipc_socket'))
            self.poll.register(self.dealer, zmq.POLLIN)

            self.ack = True
            self.msg = ('Failed to get the sender ip for appliance: %s' %
                        self.sender_id)
            self.sender_ip = Appliance.objects.get(uuid=self.sender_id).ip

            if (not self.incremental):
                self.msg = ('Failed to verify/create share: %s.' % self.sname)
                self.create_share(self.sname, self.dest_pool)

                self.msg = ('Failed to create the replica metadata object '
                            'for share: %s.' % self.sname)
                data = {
                    'share': self.sname,
                    'appliance': self.sender_ip,
                    'src_share': self.src_share,
                }
                self.rid = self.create_rshare(data)
            else:
                self.msg = ('Failed to retreive the replica metadata '
                            'object for share: %s.' % self.sname)
                rso = ReplicaShare.objects.get(share=self.sname)
                self.rid = rso.id
                # Find and send the current snapshot to the sender. This will
                # be used as the start by btrfs-send diff.
                self.msg = ('Failed to verify latest replication snapshot '
                            'on the system.')
                latest_snap = self._latest_snap(rso)

            self.msg = ('Failed to create receive trail for rid: %d' %
                        self.rid)
            data = {
                'snap_name': self.snap_name,
            }
            self.rtid = self.create_receive_trail(self.rid, data)

            # delete the share, move the oldest snap to share
            self.msg = ('Failed to promote the oldest Snapshot to Share.')
            oldest_snap = get_oldest_snap(self.snap_dir,
                                          self.num_retain_snaps,
                                          regex='_replication_')
            if (oldest_snap is not None):
                self.update_repclone(self.sname, oldest_snap)
                self.refresh_share_state()
                self.refresh_snapshot_state()

            self.msg = ('Failed to prune old Snapshots')
            self._delete_old_snaps(self.sname, self.snap_dir,
                                   self.num_retain_snaps + 1)

            # TODO: The following should be re-instantiated once we have a
            # TODO: working method for doing so. see validate_src_share.
            # self.msg = ('Failed to validate the source share(%s) on '
            #             'sender(uuid: %s '
            #             ') Did the ip of the sender change?' %
            #             (self.src_share, self.sender_id))
            # self.validate_src_share(self.sender_id, self.src_share)

            sub_vol = ('%s%s/%s' %
                       (settings.MNT_PT, self.dest_pool, self.sname))
            if (not is_subvol(sub_vol)):
                self.msg = ('Failed to create parent subvolume %s' % sub_vol)
                run_command([BTRFS, 'subvolume', 'create', sub_vol])

            self.msg = ('Failed to create snapshot directory: %s' %
                        self.snap_dir)
            run_command(['/usr/bin/mkdir', '-p', self.snap_dir])
            snap_fp = ('%s/%s' % (self.snap_dir, self.snap_name))

            # If the snapshot already exists, presumably from the previous
            # attempt and the sender tries to send the same, reply back with
            # snap_exists and do not start the btrfs-receive
            if (is_subvol(snap_fp)):
                logger.debug('Id: %s. Snapshot to be sent(%s) already '
                             'exists. Not starting a new receive process' %
                             (self.identity, snap_fp))
                self._send_recv('snap-exists')
                self._sys_exit(0)

            cmd = [BTRFS, 'receive', self.snap_dir]
            self.msg = ('Failed to start the low level btrfs receive '
                        'command(%s). Aborting.' % cmd)
            self.rp = subprocess.Popen(cmd,
                                       shell=False,
                                       stdin=subprocess.PIPE,
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.PIPE)

            self.msg = ('Failed to send receiver-ready')
            rcommand, rmsg = self._send_recv('receiver-ready', latest_snap
                                             or '')
            if (rcommand is None):
                logger.error('Id: %s. No response from the broker for '
                             'receiver-ready command. Aborting.' %
                             self.identity)
                self._sys_exit(3)

            term_commands = (
                'btrfs-send-init-error',
                'btrfs-send-unexpected-termination-error',
                'btrfs-send-nonzero-termination-error',
            )
            num_tries = 10
            poll_interval = 6000  # 6 seconds
            num_msgs = 0
            t0 = time.time()
            while (True):
                socks = dict(self.poll.poll(poll_interval))
                if (socks.get(self.dealer) == zmq.POLLIN):
                    # reset to wait upto 60(poll_interval x num_tries
                    # milliseconds) for every message
                    num_tries = 10
                    command, message = self.dealer.recv_multipart()
                    if (command == 'btrfs-send-stream-finished'):
                        # this command concludes fsdata transfer. After this,
                        # btrfs-recev process should be
                        # terminated(.communicate).
                        if (self.rp.poll() is None):
                            self.msg = ('Failed to terminate btrfs-recv '
                                        'command')
                            out, err = self.rp.communicate()
                            out = out.split('\n')
                            err = err.split('\n')
                            logger.debug('Id: %s. Terminated btrfs-recv. '
                                         'cmd = %s out = %s err: %s rc: %s' %
                                         (self.identity, cmd, out, err,
                                          self.rp.returncode))
                        if (self.rp.returncode != 0):
                            self.msg = ('btrfs-recv exited with unexpected '
                                        'exitcode(%s). ' % self.rp.returncode)
                            raise Exception(self.msg)
                        data = {
                            'status': 'succeeded',
                            'kb_received': self.total_bytes_received / 1024,
                        }
                        self.msg = ('Failed to update receive trail for '
                                    'rtid: %d' % self.rtid)
                        self.update_receive_trail(self.rtid, data)

                        self._send_recv('btrfs-recv-finished')
                        self.refresh_share_state()
                        self.refresh_snapshot_state()

                        dsize, drate = self.size_report(
                            self.total_bytes_received, t0)
                        logger.debug('Id: %s. Receive complete. Total data '
                                     'transferred: %s. Rate: %s/sec.' %
                                     (self.identity, dsize, drate))
                        self._sys_exit(0)

                    if (command in term_commands):
                        self.msg = ('Terminal command(%s) received from the '
                                    'sender. Aborting.' % command)
                        raise Exception(self.msg)

                    if (self.rp.poll() is None):
                        self.rp.stdin.write(message)
                        self.rp.stdin.flush()
                        # @todo: implement advanced credit request system.
                        self.dealer.send_multipart([b'send-more', ''])
                        num_msgs += 1
                        self.total_bytes_received += len(message)
                        if (num_msgs == 1000):
                            num_msgs = 0
                            data = {
                                'status': 'pending',
                                'kb_received':
                                self.total_bytes_received / 1024,
                            }
                            self.update_receive_trail(self.rtid, data)

                            dsize, drate = self.size_report(
                                self.total_bytes_received, t0)
                            logger.debug('Id: %s. Receiver alive. Data '
                                         'transferred: %s. Rate: %s/sec.' %
                                         (self.identity, dsize, drate))
                    else:
                        out, err = self.rp.communicate()
                        out = out.split('\n')
                        err = err.split('\n')
                        logger.error('Id: %s. btrfs-recv died unexpectedly. '
                                     'cmd: %s out: %s. err: %s' %
                                     (self.identity, cmd, out, err))
                        msg = (
                            'Low level system error from btrfs receive '
                            'command. cmd: %s out: %s err: %s for rtid: %s' %
                            (cmd, out, err, self.rtid))
                        data = {
                            'status': 'failed',
                            'error': msg,
                        }
                        self.msg = ('Failed to update receive trail for '
                                    'rtid: %d.' % self.rtid)
                        self.update_receive_trail(self.rtid, data)
                        self.msg = msg
                        raise Exception(self.msg)
                else:
                    num_tries -= 1
                    msg = ('No response received from the broker. '
                           'remaining tries: %d' % num_tries)
                    logger.error('Id: %s. %s' % (self.identity, msg))
                    if (num_tries == 0):
                        self.msg = ('%s. Terminating the receiver.' % msg)
                        raise Exception(self.msg)
Пример #5
0
    def run(self):

        self.msg = ('Top level exception in sender: %s' % self.identity)
        with self._clean_exit_handler():
            self.law = APIWrapper()
            self.poll = zmq.Poller()
            self._init_greeting()

            #  create a new replica trail if it's the very first time
            # or if the last one succeeded
            self.msg = ('Failed to create local replica trail for snap_name:'
                        ' %s. Aborting.' % self.snap_name)
            self.rt2 = self.create_replica_trail(self.replica.id,
                                                 self.snap_name)
            self.rt2_id = self.rt2['id']

            # prune old snapshots.
            self.update_trail = True
            self.msg = ('Failed to prune old snapshots')
            share_path = (
                '%s%s/.snapshots/%s' %
                (settings.MNT_PT, self.replica.pool, self.replica.share))
            self._delete_old_snaps(share_path)

            # Refresh replica trail.
            if (self.rt is not None):
                self.rt = self._refresh_rt()

            #  create a snapshot only if it's not already from a previous
            #  failed attempt.
            self.msg = ('Failed to create snapshot: %s. Aborting.' %
                        self.snap_name)
            self.create_snapshot(self.replica.share, self.snap_name)

            retries_left = 10
            poll_interval = 6000  # 6 seconds
            while (True):
                socks = dict(self.poll.poll(poll_interval))
                if (socks.get(self.send_req) == zmq.POLLIN):
                    # not really necessary because we just want one reply for
                    # now.
                    retries_left = 10
                    command, reply = self.send_req.recv_multipart()
                    if (command == 'receiver-ready'):
                        if (self.rt is not None):
                            self.rlatest_snap = reply
                            self.rt = self._refresh_rt()
                        logger.debug('Id: %s. command(%s) and message(%s) '
                                     'received. Proceeding to send fsdata.' %
                                     (self.identity, command, reply))
                        break
                    else:
                        if (command in 'receiver-init-error'):
                            self.msg = ('%s received for %s. extended reply: '
                                        '%s. Aborting.' %
                                        (command, self.identity, reply))
                        elif (command == 'snap-exists'):
                            logger.debug('Id: %s. %s received. Not sending '
                                         'fsdata' % (self.identity, command))
                            data = {
                                'status': 'succeeded',
                                'error':
                                'snapshot already exists on the receiver',
                            }  # noqa E501
                            self.msg = ('Failed to  update replica status for '
                                        '%s' % self.snap_id)
                            self.update_replica_status(self.rt2_id, data)
                            self._sys_exit(0)
                        else:
                            self.msg = ('unexpected reply(%s) for %s. '
                                        'extended reply: %s. Aborting' %
                                        (command, self.identity, reply))
                        raise Exception(self.msg)
                else:
                    retries_left -= 1
                    logger.debug('Id: %s. No response from receiver. Number '
                                 'of retry attempts left: %d' %
                                 (self.identity, retries_left))
                    if (retries_left == 0):
                        self.msg = ('Receiver(%s:%d) is unreachable. '
                                    'Aborting.' %
                                    (self.receiver_ip, self.receiver_port))
                        raise Exception(self.msg)
                    self.send_req.setsockopt(zmq.LINGER, 0)
                    self.send_req.close()
                    self.poll.unregister(self.send_req)
                    self._init_greeting()

            snap_path = ('%s%s/.snapshots/%s/%s' %
                         (settings.MNT_PT, self.replica.pool,
                          self.replica.share, self.snap_name))
            cmd = [BTRFS, 'send', snap_path]
            if (self.rt is not None):
                prev_snap = ('%s%s/.snapshots/%s/%s' %
                             (settings.MNT_PT, self.replica.pool,
                              self.replica.share, self.rt.snap_name))
                logger.info('Id: %s. Sending incremental replica between '
                            '%s -- %s' % (self.identity, prev_snap, snap_path))
                cmd = [BTRFS, 'send', '-p', prev_snap, snap_path]
            else:
                logger.info('Id: %s. Sending full replica: %s' %
                            (self.identity, snap_path))

            try:
                self.sp = subprocess.Popen(cmd,
                                           shell=False,
                                           stdout=subprocess.PIPE,
                                           stderr=subprocess.PIPE)
                fcntl.fcntl(self.sp.stdout.fileno(), fcntl.F_SETFL,
                            os.O_NONBLOCK)
            except Exception as e:
                self.msg = ('Failed to start the low level btrfs send '
                            'command(%s). Aborting. Exception: ' %
                            (cmd, e.__str__()))
                logger.error('Id: %s. %s' % (self.identity, self.msg))
                self._send_recv('btrfs-send-init-error')
                self._sys_exit(3)

            alive = True
            num_msgs = 0
            t0 = time.time()
            while (alive):
                try:
                    if (self.sp.poll() is not None):
                        logger.debug(
                            'Id: %s. send process finished '
                            'for %s. rc: %d. stderr: %s' %
                            (self.identity, self.snap_id, self.sp.returncode,
                             self.sp.stderr.read()))
                        alive = False
                    fs_data = self.sp.stdout.read()
                except IOError:
                    continue
                except Exception as e:
                    self.msg = ('Exception occured while reading low '
                                'level btrfs '
                                'send data for %s. Aborting.' % self.snap_id)
                    if (alive):
                        self.sp.terminate()
                    self.update_trail = True
                    self._send_recv('btrfs-send-unexpected-termination-error')
                    self._sys_exit(3)

                self.msg = ('Failed to send fsdata to the receiver for %s. '
                            'Aborting.' % (self.snap_id))
                self.update_trail = True
                command, message = self._send_recv('', fs_data)
                self.total_bytes_sent += len(fs_data)
                num_msgs += 1
                if (num_msgs == 1000):
                    num_msgs = 0
                    dsize, drate = self.size_report(self.total_bytes_sent, t0)
                    logger.debug('Id: %s Sender alive. Data transferred: '
                                 '%s. Rate: %s/sec.' %
                                 (self.identity, dsize, drate))
                if (command is None or command == 'receiver-error'):
                    # command is None when the remote side vanishes.
                    self.msg = ('Got null or error command(%s) message(%s) '
                                'from the Receiver while'
                                ' transmitting fsdata. Aborting.' %
                                (command, message))
                    raise Exception(message)

                if (not alive):
                    if (self.sp.returncode != 0):
                        # do we mark failed?
                        command, message = self._send_recv(
                            'btrfs-send-nonzero-termination-error')
                    else:
                        command, message = self._send_recv(
                            'btrfs-send-stream-finished')

                if (os.getppid() != self.ppid):
                    logger.error('Id: %s. Scheduler exited. Sender for %s '
                                 'cannot go on. '
                                 'Aborting.' % (self.identity, self.snap_id))
                    self._sys_exit(3)

            data = {
                'status': 'succeeded',
                'kb_sent': self.total_bytes_sent / 1024,
            }
            self.msg = ('Failed to update final replica status for %s'
                        '. Aborting.' % self.snap_id)
            self.update_replica_status(self.rt2_id, data)
            dsize, drate = self.size_report(self.total_bytes_sent, t0)
            logger.debug('Id: %s. Send complete. Total data transferred: %s.'
                         ' Rate: %s/sec.' % (self.identity, dsize, drate))
            self._sys_exit(0)
Пример #6
0
    def run(self):
        logger.debug("Id: %s. Starting a new Receiver for meta: %s" %
                     (self.identity, self.meta))
        self.msg = "Top level exception in receiver"
        latest_snap = None
        with self._clean_exit_handler():
            self.law = APIWrapper()
            self.poll = zmq.Poller()
            self.dealer = self.ctx.socket(zmq.DEALER)
            self.dealer.setsockopt_string(zmq.IDENTITY, u"%s" % self.identity)
            self.dealer.set_hwm(10)
            self.dealer.connect("ipc://%s" %
                                settings.REPLICATION.get("ipc_socket"))
            self.poll.register(self.dealer, zmq.POLLIN)

            self.ack = True
            self.msg = "Failed to get the sender ip for appliance: %s" % self.sender_id
            self.sender_ip = Appliance.objects.get(uuid=self.sender_id).ip

            if not self.incremental:
                self.msg = "Failed to verify/create share: %s." % self.sname
                self.create_share(self.sname, self.dest_pool)

                self.msg = ("Failed to create the replica metadata object "
                            "for share: %s." % self.sname)
                data = {
                    "share": self.sname,
                    "appliance": self.sender_ip,
                    "src_share": self.src_share,
                }
                self.rid = self.create_rshare(data)
            else:
                self.msg = ("Failed to retreive the replica metadata "
                            "object for share: %s." % self.sname)
                rso = ReplicaShare.objects.get(share=self.sname)
                self.rid = rso.id
                # Find and send the current snapshot to the sender. This will
                # be used as the start by btrfs-send diff.
                self.msg = (
                    "Failed to verify latest replication snapshot on the system."
                )
                latest_snap = self._latest_snap(rso)

            self.msg = "Failed to create receive trail for rid: %d" % self.rid
            data = {
                "snap_name": self.snap_name,
            }
            self.rtid = self.create_receive_trail(self.rid, data)

            # delete the share, move the oldest snap to share
            self.msg = "Failed to promote the oldest Snapshot to Share."
            oldest_snap = get_oldest_snap(self.snap_dir,
                                          self.num_retain_snaps,
                                          regex="_replication_")
            if oldest_snap is not None:
                self.update_repclone(self.sname, oldest_snap)
                self.refresh_share_state()
                self.refresh_snapshot_state()

            self.msg = "Failed to prune old Snapshots"
            self._delete_old_snaps(self.sname, self.snap_dir,
                                   self.num_retain_snaps + 1)

            # TODO: The following should be re-instantiated once we have a
            # TODO: working method for doing so. see validate_src_share.
            # self.msg = ('Failed to validate the source share(%s) on '
            #             'sender(uuid: %s '
            #             ') Did the ip of the sender change?' %
            #             (self.src_share, self.sender_id))
            # self.validate_src_share(self.sender_id, self.src_share)

            sub_vol = "%s%s/%s" % (settings.MNT_PT, self.dest_pool, self.sname)
            if not is_subvol(sub_vol):
                self.msg = "Failed to create parent subvolume %s" % sub_vol
                run_command([BTRFS, "subvolume", "create", sub_vol])

            self.msg = "Failed to create snapshot directory: %s" % self.snap_dir
            run_command(["/usr/bin/mkdir", "-p", self.snap_dir])
            snap_fp = "%s/%s" % (self.snap_dir, self.snap_name)

            # If the snapshot already exists, presumably from the previous
            # attempt and the sender tries to send the same, reply back with
            # snap_exists and do not start the btrfs-receive
            if is_subvol(snap_fp):
                logger.debug("Id: %s. Snapshot to be sent(%s) already "
                             "exists. Not starting a new receive process" %
                             (self.identity, snap_fp))
                self._send_recv("snap-exists")
                self._sys_exit(0)

            cmd = [BTRFS, "receive", self.snap_dir]
            self.msg = ("Failed to start the low level btrfs receive "
                        "command(%s). Aborting." % cmd)
            self.rp = subprocess.Popen(
                cmd,
                shell=False,
                stdin=subprocess.PIPE,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
            )

            self.msg = "Failed to send receiver-ready"
            rcommand, rmsg = self._send_recv("receiver-ready", latest_snap
                                             or "")
            if rcommand is None:
                logger.error("Id: %s. No response from the broker for "
                             "receiver-ready command. Aborting." %
                             self.identity)
                self._sys_exit(3)

            term_commands = (
                "btrfs-send-init-error",
                "btrfs-send-unexpected-termination-error",
                "btrfs-send-nonzero-termination-error",
            )
            num_tries = 10
            poll_interval = 6000  # 6 seconds
            num_msgs = 0
            t0 = time.time()
            while True:
                socks = dict(self.poll.poll(poll_interval))
                if socks.get(self.dealer) == zmq.POLLIN:
                    # reset to wait upto 60(poll_interval x num_tries
                    # milliseconds) for every message
                    num_tries = 10
                    command, message = self.dealer.recv_multipart()
                    if command == "btrfs-send-stream-finished":
                        # this command concludes fsdata transfer. After this,
                        # btrfs-recev process should be
                        # terminated(.communicate).
                        if self.rp.poll() is None:
                            self.msg = "Failed to terminate btrfs-recv command"
                            out, err = self.rp.communicate()
                            out = out.split("\n")
                            err = err.split("\n")
                            logger.debug("Id: %s. Terminated btrfs-recv. "
                                         "cmd = %s out = %s err: %s rc: %s" %
                                         (self.identity, cmd, out, err,
                                          self.rp.returncode))
                        if self.rp.returncode != 0:
                            self.msg = ("btrfs-recv exited with unexpected "
                                        "exitcode(%s). " % self.rp.returncode)
                            raise Exception(self.msg)
                        data = {
                            "status": "succeeded",
                            "kb_received": self.total_bytes_received / 1024,
                        }
                        self.msg = (
                            "Failed to update receive trail for rtid: %d" %
                            self.rtid)
                        self.update_receive_trail(self.rtid, data)

                        self._send_recv("btrfs-recv-finished")
                        self.refresh_share_state()
                        self.refresh_snapshot_state()

                        dsize, drate = self.size_report(
                            self.total_bytes_received, t0)
                        logger.debug("Id: %s. Receive complete. Total data "
                                     "transferred: %s. Rate: %s/sec." %
                                     (self.identity, dsize, drate))
                        self._sys_exit(0)

                    if command in term_commands:
                        self.msg = ("Terminal command(%s) received from the "
                                    "sender. Aborting." % command)
                        raise Exception(self.msg)

                    if self.rp.poll() is None:
                        self.rp.stdin.write(message)
                        self.rp.stdin.flush()
                        # @todo: implement advanced credit request system.
                        self.dealer.send_multipart([b"send-more", ""])
                        num_msgs += 1
                        self.total_bytes_received += len(message)
                        if num_msgs == 1000:
                            num_msgs = 0
                            data = {
                                "status": "pending",
                                "kb_received":
                                self.total_bytes_received / 1024,
                            }
                            self.update_receive_trail(self.rtid, data)

                            dsize, drate = self.size_report(
                                self.total_bytes_received, t0)
                            logger.debug("Id: %s. Receiver alive. Data "
                                         "transferred: %s. Rate: %s/sec." %
                                         (self.identity, dsize, drate))
                    else:
                        out, err = self.rp.communicate()
                        out = out.split("\n")
                        err = err.split("\n")
                        logger.error("Id: %s. btrfs-recv died unexpectedly. "
                                     "cmd: %s out: %s. err: %s" %
                                     (self.identity, cmd, out, err))
                        msg = (
                            "Low level system error from btrfs receive "
                            "command. cmd: %s out: %s err: %s for rtid: %s" %
                            (cmd, out, err, self.rtid))
                        data = {
                            "status": "failed",
                            "error": msg,
                        }
                        self.msg = ("Failed to update receive trail for "
                                    "rtid: %d." % self.rtid)
                        self.update_receive_trail(self.rtid, data)
                        self.msg = msg
                        raise Exception(self.msg)
                else:
                    num_tries -= 1
                    msg = ("No response received from the broker. "
                           "remaining tries: %d" % num_tries)
                    logger.error("Id: %s. %s" % (self.identity, msg))
                    if num_tries == 0:
                        self.msg = "%s. Terminating the receiver." % msg
                        raise Exception(self.msg)
Пример #7
0
    def run(self):

        self.msg = "Top level exception in sender: %s" % self.identity
        with self._clean_exit_handler():
            self.law = APIWrapper()
            self.poll = zmq.Poller()
            self._init_greeting()

            #  create a new replica trail if it's the very first time
            # or if the last one succeeded
            self.msg = ("Failed to create local replica trail for snap_name:"
                        " %s. Aborting." % self.snap_name)
            self.rt2 = self.create_replica_trail(self.replica.id,
                                                 self.snap_name)
            self.rt2_id = self.rt2["id"]

            # prune old snapshots.
            self.update_trail = True
            self.msg = "Failed to prune old snapshots"
            share_path = "%s%s/.snapshots/%s" % (
                settings.MNT_PT,
                self.replica.pool,
                self.replica.share,
            )
            self._delete_old_snaps(share_path)

            # Refresh replica trail.
            if self.rt is not None:
                self.rt = self._refresh_rt()

            #  create a snapshot only if it's not already from a previous
            #  failed attempt.
            # TODO: If one does exist we fail which seems harsh as we may be
            # TODO: able to pickup where we left of depending on the failure.
            self.msg = "Failed to create snapshot: %s. Aborting." % self.snap_name
            self.create_snapshot(self.replica.share, self.snap_name)

            retries_left = settings.REPLICATION.get("max_send_attempts")

            poll_interval = 6000  # 6 seconds
            while True:
                socks = dict(self.poll.poll(poll_interval))
                if socks.get(self.send_req) == zmq.POLLIN:
                    # not really necessary because we just want one reply for
                    # now.
                    retries_left = settings.REPLICATION.get(
                        "max_send_attempts")
                    command, reply = self.send_req.recv_multipart()
                    if command == "receiver-ready":
                        if self.rt is not None:
                            self.rlatest_snap = reply
                            self.rt = self._refresh_rt()
                        logger.debug("Id: %s. command(%s) and message(%s) "
                                     "received. Proceeding to send fsdata." %
                                     (self.identity, command, reply))
                        break
                    else:
                        if command in "receiver-init-error":
                            self.msg = ("%s received for %s. extended reply: "
                                        "%s. Aborting." %
                                        (command, self.identity, reply))
                        elif command == "snap-exists":
                            logger.debug("Id: %s. %s received. Not sending "
                                         "fsdata" % (self.identity, command))
                            data = {
                                "status": "succeeded",
                                "error":
                                "snapshot already exists on the receiver",
                            }  # noqa E501
                            self.msg = ("Failed to  update replica status for "
                                        "%s" % self.snap_id)
                            self.update_replica_status(self.rt2_id, data)
                            self._sys_exit(0)
                        else:
                            self.msg = ("unexpected reply(%s) for %s. "
                                        "extended reply: %s. Aborting" %
                                        (command, self.identity, reply))
                        raise Exception(self.msg)
                else:
                    retries_left -= 1
                    logger.debug("Id: %s. No response from receiver. Number "
                                 "of retry attempts left: %d" %
                                 (self.identity, retries_left))
                    if retries_left == 0:
                        self.msg = "Receiver(%s:%d) is unreachable. Aborting." % (
                            self.receiver_ip,
                            self.receiver_port,
                        )
                        raise Exception(self.msg)
                    self.send_req.setsockopt(zmq.LINGER, 0)
                    self.send_req.close()
                    self.poll.unregister(self.send_req)
                    self._init_greeting()

            snap_path = "%s%s/.snapshots/%s/%s" % (
                settings.MNT_PT,
                self.replica.pool,
                self.replica.share,
                self.snap_name,
            )
            cmd = [BTRFS, "send", snap_path]
            if self.rt is not None:
                prev_snap = "%s%s/.snapshots/%s/%s" % (
                    settings.MNT_PT,
                    self.replica.pool,
                    self.replica.share,
                    self.rt.snap_name,
                )
                logger.info("Id: %s. Sending incremental replica between "
                            "%s -- %s" % (self.identity, prev_snap, snap_path))
                cmd = [BTRFS, "send", "-p", prev_snap, snap_path]
            else:
                logger.info("Id: %s. Sending full replica: %s" %
                            (self.identity, snap_path))

            try:
                self.sp = subprocess.Popen(cmd,
                                           shell=False,
                                           stdout=subprocess.PIPE,
                                           stderr=subprocess.PIPE)
                fcntl.fcntl(self.sp.stdout.fileno(), fcntl.F_SETFL,
                            os.O_NONBLOCK)
            except Exception as e:
                self.msg = ("Failed to start the low level btrfs send "
                            "command(%s). Aborting. Exception: " %
                            (cmd, e.__str__()))
                logger.error("Id: %s. %s" % (self.identity, self.msg))
                self._send_recv("btrfs-send-init-error")
                self._sys_exit(3)

            alive = True
            num_msgs = 0
            t0 = time.time()
            while alive:
                try:
                    if self.sp.poll() is not None:
                        logger.debug("Id: %s. send process finished "
                                     "for %s. rc: %d. stderr: %s" % (
                                         self.identity,
                                         self.snap_id,
                                         self.sp.returncode,
                                         self.sp.stderr.read(),
                                     ))
                        alive = False
                    fs_data = self.sp.stdout.read()
                except IOError:
                    continue
                except Exception as e:
                    self.msg = ("Exception occurred while reading low "
                                "level btrfs "
                                "send data for %s. Aborting." % self.snap_id)
                    if alive:
                        self.sp.terminate()
                    self.update_trail = True
                    self._send_recv("btrfs-send-unexpected-termination-error")
                    self._sys_exit(3)

                self.msg = ("Failed to send fsdata to the receiver for %s. "
                            "Aborting." % (self.snap_id))
                self.update_trail = True
                command, message = self._send_recv("", fs_data)
                self.total_bytes_sent += len(fs_data)
                num_msgs += 1
                if num_msgs == 1000:
                    num_msgs = 0
                    dsize, drate = self.size_report(self.total_bytes_sent, t0)
                    logger.debug("Id: %s Sender alive. Data transferred: "
                                 "%s. Rate: %s/sec." %
                                 (self.identity, dsize, drate))
                if command is None or command == "receiver-error":
                    # command is None when the remote side vanishes.
                    self.msg = ("Got null or error command(%s) message(%s) "
                                "from the Receiver while"
                                " transmitting fsdata. Aborting." %
                                (command, message))
                    raise Exception(message)

                if not alive:
                    if self.sp.returncode != 0:
                        # do we mark failed?
                        command, message = self._send_recv(
                            "btrfs-send-nonzero-termination-error")
                    else:
                        command, message = self._send_recv(
                            "btrfs-send-stream-finished")

                if os.getppid() != self.ppid:
                    logger.error("Id: %s. Scheduler exited. Sender for %s "
                                 "cannot go on. "
                                 "Aborting." % (self.identity, self.snap_id))
                    self._sys_exit(3)

            data = {
                "status": "succeeded",
                "kb_sent": self.total_bytes_sent / 1024,
            }
            self.msg = ("Failed to update final replica status for %s"
                        ". Aborting." % self.snap_id)
            self.update_replica_status(self.rt2_id, data)
            dsize, drate = self.size_report(self.total_bytes_sent, t0)
            logger.debug("Id: %s. Send complete. Total data transferred: %s."
                         " Rate: %s/sec." % (self.identity, dsize, drate))
            self._sys_exit(0)