Exemplo n.º 1
0
    def run(self):
        """Receive one replication stream and pipe it into ``btrfs receive``.

        Steps, in order:

        1. Resolve the sender ip from ``self.meta['uuid']``.
        2. Connect a zmq SUB socket (data) and a zmq PUSH socket (meta acks)
           to the sender.
        3. Create the share and its replica metadata object (non-incremental)
           or look up the existing replica metadata id (incremental).
        4. Tell the sender ``snap_exists`` if the snapshot directory is
           already present.
        5. Start ``btrfs receive``, send ``begin_ok`` and write every received
           message to the child's stdin until ``END_SUCCESS`` or ``END_FAIL``
           arrives.

        Sets ``self.sender_ip``, ``self.meta_push``, ``self.rid`` and
        ``self.rtid``, and accumulates the received byte count in
        ``self.kb_received``.

        NOTE(review): ``_clean_exit_handler`` is defined elsewhere; it
        presumably logs ``msg`` and terminates when its body raises (and
        notifies the sender when ``ack=True``) -- confirm against its
        definition.
        """
        set_token()
        msg = ('Failed to get the sender ip from the uuid(%s) for meta: %s' %
               (self.meta['uuid'], self.meta))
        with self._clean_exit_handler(msg):
            self.sender_ip = get_sender_ip(self.meta['uuid'], logger)
        logger.debug('sender ip: %s' % self.sender_ip)

        # Data channel: SUB socket subscribed to messages prefixed with this
        # transfer's id.
        msg = ('Failed to connect to the sender(%s) on data_port(%s). meta: '
               '%s. Aborting.' % (self.sender_ip, self.data_port, self.meta))
        with self._clean_exit_handler(msg):
            #@todo: add validation
            recv_sub = self.ctx.socket(zmq.SUB)
            recv_sub.connect('tcp://%s:%d' % (self.sender_ip, self.data_port))
            # recv() gives up after this timeout and raises zmq.error.Again,
            # which the receive loop below counts.
            recv_sub.RCVTIMEO = 100
            recv_sub.setsockopt(zmq.SUBSCRIBE, str(self.meta['id']))

        # Meta channel: PUSH socket used to send acks back to the sender.
        msg = ('Failed to connect to the sender(%s) on '
               'meta_port(%d). meta: %s. Aborting.' %
               (self.sender_ip, self.meta_port, self.meta))
        with self._clean_exit_handler(msg):
            self.meta_push = self.ctx.socket(zmq.PUSH)
            url = ('tcp://%s:%d' % (self.sender_ip, self.meta_port))
            logger.debug('meta url: %s' % url)
            self.meta_push.connect('tcp://%s:%d' %
                                   (self.sender_ip, self.meta_port))

        # Local share name is derived from the sender id, sender ip and the
        # source share name.
        sname = ('%s-%s-%s' % (self.sender_id, self.sender_ip, self.src_share))
        if (not self.incremental):
            # First (full) transfer: create the share and its replica
            # metadata object.
            msg = ('Failed to verify/create share: %s. meta: %s. '
                   'Aborting.' % (sname, self.meta))
            with self._clean_exit_handler(msg, ack=True):
                create_share(sname, self.dest_pool, logger)

            msg = ('Failed to create the replica metadata object '
                   'for share: %s. meta: %s. Aborting.' % (sname, self.meta))
            with self._clean_exit_handler(msg, ack=True):
                data = {
                    'share': sname,
                    'appliance': self.sender_ip,
                    'src_share': self.src_share,
                    'data_port': self.data_port,
                    'meta_port': self.meta_port,
                }
                self.rid = create_rshare(data, logger)

        else:
            # Incremental transfer: the replica metadata object should
            # already exist, so only its id is looked up.
            msg = ('Failed to retreive the replica metadata object for '
                   'share: %s. meta: %s. Aborting.' % (sname, self.meta))
            with self._clean_exit_handler(msg):
                self.rid = rshare_id(sname, logger)

        # Path handed to `btrfs receive` as the destination.
        sub_vol = ('%s%s/%s' % (settings.MNT_PT, self.meta['pool'], sname))

        # Expected snapshot path: <sub_vol>/<first '_' field of snap_name>_<snap_name>
        snap_fp = ('%s/%s_%s' %
                   (sub_vol, self.snap_name.split('_')[0], self.snap_name))
        logger.info('snap_fp: %s' % snap_fp)
        # This msg doubles as the handler's failure text and the debug line
        # logged when the snapshot is found.
        msg = ('Snaphost: %s already exists.' % snap_fp)
        with self._clean_exit_handler(msg):
            if (os.path.isdir(snap_fp)):
                ack = {
                    'msg': 'snap_exists',
                    'id': self.meta['id'],
                }
                self.meta_push.send_json(ack)
                logger.debug(msg)
                # NOTE(review): execution continues after snap_exists is
                # sent; confirm that falling through to the receive is
                # intended.

        cmd = [BTRFS, 'receive', sub_vol]
        msg = ('Failed to start the low level btrfs receive command(%s)'
               '. Aborting.' % (cmd))
        with self._clean_exit_handler(msg, ack=True):
            rp = subprocess.Popen(cmd,
                                  shell=False,
                                  stdin=subprocess.PIPE,
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE)
            logger.debug('Btrfs receive started for snap: %s' % sub_vol)

        # Tell the sender it may start streaming.
        msg = ('Failed to send begin_ok to the sender for meta: %s' %
               self.meta)
        with self._clean_exit_handler(msg):
            ack = {
                'msg': 'begin_ok',
                'id': self.meta['id'],
            }
            self.meta_push.send_json(ack)
            logger.debug('begin_ok sent for meta: %s' % self.meta)
        # Number of consecutive recv() timeouts; reset on every message.
        recv_timeout_counter = 0
        while True:
            try:
                recv_data = recv_sub.recv()
                # Strip the subscription prefix (the transfer id).
                recv_data = recv_data[len(self.meta['id']):]
                recv_timeout_counter = 0
                # Despite the name this accumulates bytes; it is divided by
                # 1024 when reported below.
                self.kb_received = self.kb_received + len(recv_data)
                if (self.rtid is None):
                    # First message of the transfer: record the snapshot and
                    # open a receive trail for it.
                    msg = ('Failed to create snapshot: %s. Aborting.' %
                           self.snap_name)
                    with self._clean_exit_handler(msg, ack=True):
                        create_snapshot(sname,
                                        self.snap_name,
                                        logger,
                                        snap_type='receiver')

                    data = {'snap_name': self.snap_name}
                    msg = ('Failed to create receive trail for rid: %d'
                           '. meta: %s' % (self.rid, self.meta))
                    with self._clean_exit_handler(msg, ack=True):
                        self.rtid = create_receive_trail(
                            self.rid, data, logger)

                # End-of-stream markers sent by the sender.
                if (recv_data == 'END_SUCCESS' or recv_data == 'END_FAIL'):
                    ts = datetime.utcnow().replace(tzinfo=utc)
                    data = {
                        'kb_received': self.kb_received / 1024,
                    }
                    if (recv_data == 'END_SUCCESS'):
                        logger.debug('END_SUCCESS received for meta: %s' %
                                     self.meta)
                        data['receive_succeeded'] = ts
                    else:
                        logger.error('END_FAIL received for meta: %s. '
                                     'Terminating.' % self.meta)
                        rp.terminate()
                        data['receive_failed'] = ts
                        data['status'] = 'failed'

                    msg = ('Failed to update receive trail for rtid: %d'
                           '. meta: %s' % (self.rtid, self.meta))
                    with self._clean_exit_handler(msg, ack=True):
                        update_receive_trail(self.rtid, data, logger)
                    break
                # poll() returning None means btrfs receive is still running.
                if (rp.poll() is None):
                    rp.stdin.write(recv_data)
                    rp.stdin.flush()
                else:
                    logger.error('It seems the btrfs receive process died'
                                 ' unexpectedly.')
                    out, err = rp.communicate()
                    logger.debug('btrfs receive out: %s err: %s' % (out, err))
                    msg = ('Low level system error from btrfs receive '
                           'command. out: %s err: %s for rtid: %s meta: %s' %
                           (out, err, self.rtid, self.meta))
                    with self._clean_exit_handler(msg, ack=True):
                        data = {
                            'receive_failed':
                            (datetime.utcnow().replace(tzinfo=utc)),
                            'status':
                            'failed',
                            'error':
                            msg,
                        }
                        update_receive_trail(self.rtid, data, logger)
                    # NOTE(review): there is no break/exit in this branch;
                    # the loop continues unless the handler above terminates
                    # the process -- confirm.
            except zmq.error.Again:
                # recv() timed out (see RCVTIMEO above).
                recv_timeout_counter = recv_timeout_counter + 1
                if (recv_timeout_counter > 300):
                    logger.error('Nothing received in the last 30 seconds '
                                 'from the sender for meta: %s. Aborting.' %
                                 self.meta)
                    raise
            except Exception, e:
                msg = ('Exception occured while receiving fsdata')
                logger.error(msg)
                logger.exception(e)
                rp.terminate()
                out, err = rp.communicate()
                logger.debug('rc: %d out: %s err: %s' %
                             (rp.returncode, out, err))
                # NOTE(review): `data` is only bound earlier on some paths
                # (non-incremental setup, or after the first message); on an
                # incremental run that fails before then this would raise
                # NameError -- verify.
                data['receive_failed'] = datetime.utcnow().replace(tzinfo=utc)
                data['status'] = 'failed'
                data['error'] = msg

                # NOTE(review): '%d' raises TypeError if self.rtid is still
                # None at this point -- verify.
                msg = ('Failed to update receive trail for rtid: %d'
                       '. meta: %s' % (self.rtid, self.meta))
                with self._clean_exit_handler(msg, ack=True):
                    update_receive_trail(self.rtid, data, logger)
                self._sys_exit(3)
            finally:
                # NOTE(review): the body of this finally clause is not
                # present in this excerpt.
Exemplo n.º 2
0
    def run(self):
        """Prepare and start sending one snapshot to the receiver.

        Steps, in order:

        1. Connect a zmq PUSH socket to the receiver's meta port.
        2. Create a local replica trail (stored in ``self.rt2`` and
           ``self.rt2_id``) and the snapshot ``self.snap_name``.
        3. Send ``self.meta_begin`` to the receiver and wait for its ack via
           ``self._process_q()``. If the receiver answers ``snap_exists``
           the trail is marked succeeded and the process exits with code 0.
        4. Start ``btrfs send`` (incremental with ``-p`` when ``self.rt`` is
           set, otherwise full) with a non-blocking stdout. If the process
           cannot be started, ``END_FAIL`` is published and the process
           exits with code 3.

        NOTE(review): ``_clean_exit_handler`` and ``_update_trail_and_quit``
        are defined elsewhere; they presumably log ``msg`` and terminate
        when their body raises -- confirm against their definitions.
        """
        msg = ("Failed to connect to receiver(%s) on meta port"
               "(%d) for snap_name: %s. Aborting." % (
                   self.receiver_ip,
                   self.rmeta_port,
                   self.snap_name,
               ))
        with self._clean_exit_handler(msg):
            meta_push = self.ctx.socket(zmq.PUSH)
            meta_push.connect("tcp://%s:%d" %
                              (self.receiver_ip, self.rmeta_port))

        #  1. create a new replica trail if it's the very first time
        #  or if the last one succeeded
        msg = ("Failed to create local replica trail for snap_name:"
               " %s. Aborting." % self.snap_name)
        with self._clean_exit_handler(msg):
            self.rt2 = create_replica_trail(self.replica.id, self.snap_name,
                                            logger)
            self.rt2_id = self.rt2["id"]

        #  2. create a snapshot only if it's not already from a previous
        #  failed attempt.
        msg = "Failed to create snapshot: %s. Aborting." % self.snap_name
        with self._clean_exit_handler(msg):
            create_snapshot(self.replica.share, self.snap_name, logger)

        #  let the receiver know that following diff is coming
        msg = (
            "Failed to send initial metadata communication to the "
            "receiver(%s), most likely due to a network error. Aborting."
            % self.receiver_ip
        )
        with self._update_trail_and_quit(msg):
            meta_push.send_json(self.meta_begin)

        msg = (
            "Timeout occured(60 seconds) while waiting for OK "
            "from the receiver(%s) to start sending data. Aborting."
            % self.receiver_ip
        )
        with self._update_trail_and_quit(msg):
            ack = self._process_q()
            if ack["msg"] == "snap_exists":
                # The receiver already has this snapshot: record the trail
                # as succeeded and stop without sending anything.
                data = {
                    "status": "succeeded",
                    "end_ts": datetime.utcnow().replace(tzinfo=utc),
                    "error": "snapshot already exists on the receiver",
                }
                msg = ("Failed to update replica status for snap_name: %s. "
                       "Aborting." % self.snap_name)
                with self._clean_exit_handler(msg):
                    update_replica_status(self.rt2_id, data, logger)
                    self._sys_exit(0)

        snap_path = "%s%s/.snapshots/%s/%s" % (
            settings.MNT_PT,
            self.replica.pool,
            self.replica.share,
            self.snap_name,
        )
        cmd = [BTRFS, "send", snap_path]
        if self.rt is not None:
            # A previous trail exists: send only the difference relative to
            # its snapshot (btrfs send -p <parent>).
            prev_snap = "%s%s/.snapshots/%s/%s" % (
                settings.MNT_PT,
                self.replica.pool,
                self.replica.share,
                self.rt.snap_name,
            )
            logger.info("Sending incremental replica between %s -- %s" %
                        (prev_snap, snap_path))
            cmd = [BTRFS, "send", "-p", prev_snap, snap_path]
        else:
            logger.info("Sending full replica: %s" % snap_path)

        try:
            sp = subprocess.Popen(cmd, shell=False, stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE)
            # Make the child's stdout non-blocking.
            fcntl.fcntl(sp.stdout.fileno(), fcntl.F_SETFL, os.O_NONBLOCK)
        except Exception as e:
            # `as` form: valid on Python 2.6+ and Python 3, unlike the
            # comma form which is a SyntaxError on Python 3.
            msg = ("Failed to start the low level btrfs send "
                   "command(%s). Aborting" % cmd)
            logger.error(msg)
            logger.exception(e)
            with self._update_trail_and_quit(msg):
                # Publish the failure marker for this snapshot id.
                self.pub.put("%sEND_FAIL" % self.snap_id)
            self._sys_exit(3)
Exemplo n.º 3
0
    def run(self):
        """Prepare and start sending one snapshot to the receiver.

        Steps, in order:

        1. Connect a zmq PUSH socket to the receiver's meta port.
        2. Create a local replica trail (stored in ``self.rt2`` and
           ``self.rt2_id``) and the snapshot ``self.snap_name``.
        3. Send ``self.meta_begin`` to the receiver and wait for its ack via
           ``self._process_q()``. If the receiver answers ``snap_exists``
           the trail is marked succeeded and the process exits with code 0.
        4. Start ``btrfs send`` (incremental with ``-p`` when ``self.rt`` is
           set, otherwise full) with a non-blocking stdout. If the process
           cannot be started, ``END_FAIL`` is published and the process
           exits with code 3.

        NOTE(review): ``_clean_exit_handler`` and ``_update_trail_and_quit``
        are defined elsewhere; they presumably log ``msg`` and terminate
        when their body raises -- confirm against their definitions.
        """
        msg = ('Failed to connect to receiver(%s) on meta port'
               '(%d) for snap_name: %s. Aborting.' %
               (self.receiver_ip, self.rmeta_port, self.snap_name))
        with self._clean_exit_handler(msg):
            meta_push = self.ctx.socket(zmq.PUSH)
            meta_push.connect('tcp://%s:%d' % (self.receiver_ip,
                                               self.rmeta_port))

        #  1. create a new replica trail if it's the very first time
        #  or if the last one succeeded
        msg = ('Failed to create local replica trail for snap_name:'
               ' %s. Aborting.' % self.snap_name)
        with self._clean_exit_handler(msg):
            self.rt2 = create_replica_trail(self.replica.id,
                                            self.snap_name, logger)
            self.rt2_id = self.rt2['id']

        #  2. create a snapshot only if it's not already from a previous
        #  failed attempt.
        msg = ('Failed to create snapshot: %s. Aborting.' % self.snap_name)
        with self._clean_exit_handler(msg):
            create_snapshot(self.replica.share, self.snap_name, logger)

        #  let the receiver know that following diff is coming
        msg = ('Failed to send initial metadata communication to the '
               'receiver(%s), most likely due to a network error. Aborting.'
               % self.receiver_ip)
        with self._update_trail_and_quit(msg):
            meta_push.send_json(self.meta_begin)

        msg = ('Timeout occured(60 seconds) while waiting for OK '
               'from the receiver(%s) to start sending data. Aborting.'
               % self.receiver_ip)
        with self._update_trail_and_quit(msg):
            ack = self._process_q()
            if (ack['msg'] == 'snap_exists'):
                # The receiver already has this snapshot: record the trail
                # as succeeded and stop without sending anything.
                end_ts = datetime.utcnow().replace(tzinfo=utc)
                data = {'status': 'succeeded',
                        'end_ts': end_ts.strftime(settings.SNAP_TS_FORMAT),
                        'error': 'snapshot already exists on the receiver', }
                msg = ('Failed to update replica status for snap_name: %s. '
                       'Aborting.' % self.snap_name)
                with self._clean_exit_handler(msg):
                    update_replica_status(self.rt2_id, data, logger)
                    self._sys_exit(0)

        snap_path = ('%s%s/.snapshots/%s/%s' %
                     (settings.MNT_PT, self.replica.pool, self.replica.share,
                      self.snap_name))
        cmd = [BTRFS, 'send', snap_path]
        if (self.rt is not None):
            # A previous trail exists: send only the difference relative to
            # its snapshot (btrfs send -p <parent>).
            prev_snap = ('%s%s/.snapshots/%s/%s' %
                         (settings.MNT_PT, self.replica.pool,
                          self.replica.share, self.rt.snap_name))
            logger.info('Sending incremental replica between %s -- %s' %
                        (prev_snap, snap_path))
            cmd = [BTRFS, 'send', '-p', prev_snap, snap_path]
        else:
            logger.info('Sending full replica: %s' % snap_path)

        try:
            sp = subprocess.Popen(cmd, shell=False, stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE)
            # Make the child's stdout non-blocking.
            fcntl.fcntl(sp.stdout.fileno(), fcntl.F_SETFL, os.O_NONBLOCK)
        except Exception as e:
            # The format string needs one placeholder per tuple element;
            # with only the command placeholder the '%' operation itself
            # raised TypeError and masked the real startup failure.
            msg = ('Failed to start the low level btrfs send '
                   'command(%s). Aborting. Exception: %s' %
                   (cmd, e.__str__()))
            logger.error(msg)
            with self._update_trail_and_quit(msg):
                # Publish the failure marker for this snapshot id.
                self.pub.put('%sEND_FAIL' % self.snap_id)
            self._sys_exit(3)
Exemplo n.º 4
0
    def run(self):
        """Receive one replication stream and pipe it into ``btrfs receive``.

        Steps, in order:

        1. Resolve the sender ip from ``self.meta['uuid']`` and connect a
           zmq SUB socket (data) and a zmq PUSH socket (meta acks) to it.
        2. Create the share and its replica metadata object
           (non-incremental) or look up the existing replica metadata id
           (incremental).
        3. Make sure the parent subvolume exists, and tell the sender
           ``snap_exists`` if the snapshot subvolume is already present.
        4. Start ``btrfs receive``, send ``begin_ok`` and write every
           received message to the child's stdin until ``END_SUCCESS`` or
           ``END_FAIL`` arrives.

        Flow control: a local credit starts at
        ``settings.DEFAULT_SEND_CREDIT`` and drops by one per received
        message; when it falls below 5 a ``send_more`` ack is pushed to the
        sender and the credit is topped up.

        On ``END_SUCCESS`` the oldest snapshot returned by
        ``get_oldest_snap(sub_vol, 3)``, if any, is promoted to be the
        share. The process exits with code 3 after more than 600
        consecutive receive timeouts.

        NOTE(review): ``_clean_exit_handler`` is defined elsewhere; it
        presumably logs ``msg`` and terminates when its body raises (and
        notifies the sender when ``ack=True``) -- confirm against its
        definition.
        """
        msg = ('Failed to get the sender ip from the uuid(%s) for meta: %s' %
               (self.meta['uuid'], self.meta))
        with self._clean_exit_handler(msg):
            self.sender_ip = get_sender_ip(self.meta['uuid'], logger)

        # Data channel: SUB socket subscribed to messages prefixed with this
        # transfer's id.
        msg = ('Failed to connect to the sender(%s) on data_port(%s). meta: '
               '%s. Aborting.' % (self.sender_ip, self.data_port, self.meta))
        with self._clean_exit_handler(msg):
            #@todo: add validation
            recv_sub = self.ctx.socket(zmq.SUB)
            recv_sub.connect('tcp://%s:%d' % (self.sender_ip, self.data_port))
            # recv() gives up after this timeout and raises zmq.error.Again,
            # which the receive loop below counts.
            recv_sub.RCVTIMEO = 100
            recv_sub.setsockopt(zmq.SUBSCRIBE, str(self.meta['id']))

        # Meta channel: PUSH socket used to send acks back to the sender.
        msg = ('Failed to connect to the sender(%s) on '
               'meta_port(%d). meta: %s. Aborting.' %
               (self.sender_ip, self.meta_port, self.meta))
        with self._clean_exit_handler(msg):
            self.meta_push = self.ctx.socket(zmq.PUSH)
            self.meta_push.connect('tcp://%s:%d' % (self.sender_ip,
                                                    self.meta_port))

        sname = ('%s_%s' % (self.sender_id, self.src_share))
        if (not self.incremental):
            # First (full) transfer: create the share and its replica
            # metadata object.
            msg = ('Failed to verify/create share: %s. meta: %s. '
                   'Aborting.' % (sname, self.meta))
            with self._clean_exit_handler(msg, ack=True):
                create_share(sname, self.dest_pool, logger)

            msg = ('Failed to create the replica metadata object '
                   'for share: %s. meta: %s. Aborting.' %
                   (sname, self.meta))
            with self._clean_exit_handler(msg, ack=True):
                data = {'share': sname,
                        'appliance': self.sender_ip,
                        'src_share': self.src_share,
                        'data_port': self.data_port,
                        'meta_port': self.meta_port, }
                self.rid = create_rshare(data, logger)

        else:
            # Incremental transfer: only the existing replica metadata id
            # is looked up.
            msg = ('Failed to retreive the replica metadata object for '
                   'share: %s. meta: %s. Aborting.' % (sname, self.meta))
            with self._clean_exit_handler(msg):
                self.rid = rshare_id(sname, logger)

        # Parent subvolume that `btrfs receive` writes into.
        sub_vol = ('%s%s/%s' % (settings.MNT_PT, self.meta['pool'],
                                sname))
        if (not is_subvol(sub_vol)):
            msg = ('Failed to create parent subvolume %s' % sub_vol)
            with self._clean_exit_handler(msg, ack=True):
                run_command([BTRFS, 'subvolume', 'create', sub_vol])

        snap_fp = ('%s/%s' % (sub_vol, self.snap_name))
        # Give this step its own message; previously it reused whatever
        # `msg` the preceding step had left behind, so a failure here was
        # reported as a failure of that earlier operation.
        msg = ('Failed to verify snapshot(%s) or to notify the sender that '
               'it already exists. meta: %s. Aborting.' %
               (snap_fp, self.meta))
        with self._clean_exit_handler(msg):
            if (is_subvol(snap_fp)):
                ack = {'msg': 'snap_exists',
                       'id': self.meta['id'], }
                self.meta_push.send_json(ack)

        cmd = [BTRFS, 'receive', sub_vol]
        msg = ('Failed to start the low level btrfs receive command(%s)'
               '. Aborting.' % (cmd))
        with self._clean_exit_handler(msg, ack=True):
            rp = subprocess.Popen(cmd, shell=False, stdin=subprocess.PIPE,
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE)

        # Tell the sender it may start streaming.
        msg = ('Failed to send begin_ok to the sender for meta: %s' %
               self.meta)
        with self._clean_exit_handler(msg):
            ack = {'msg': 'begin_ok',
                   'id': self.meta['id'], }
            self.meta_push.send_json(ack)
        # Number of consecutive recv() timeouts; reset on every message.
        recv_timeout_counter = 0
        credit = settings.DEFAULT_SEND_CREDIT
        # Turned off once an END_* marker is seen so no further credit is
        # requested.
        check_credit = True
        while True:
            if (check_credit is True and credit < 5):
                ack = {'msg': 'send_more',
                       'id': self.meta['id'],
                       'credit': settings.DEFAULT_SEND_CREDIT, }
                self.meta_push.send_json(ack)
                credit = credit + settings.DEFAULT_SEND_CREDIT
                logger.debug('%d KB received for %s' %
                             (int(self.kb_received / 1024), sname))

            try:
                recv_data = recv_sub.recv()
                # Strip the subscription prefix (the transfer id).
                recv_data = recv_data[len(self.meta['id']):]
                credit = credit - 1
                recv_timeout_counter = 0
                # Despite the name this accumulates bytes; it is divided by
                # 1024 when reported.
                self.kb_received = self.kb_received + len(recv_data)
                if (self.rtid is None):
                    msg = ('Failed to create snapshot: %s. Aborting.' %
                           self.snap_name)
                    # create a snapshot only if it's not already from a
                    # previous failed attempt
                    with self._clean_exit_handler(msg, ack=True):
                        create_snapshot(sname, self.snap_name, logger,
                                        snap_type='receiver')

                    data = {'snap_name': self.snap_name}
                    msg = ('Failed to create receive trail for rid: %d'
                           '. meta: %s' % (self.rid, self.meta))
                    with self._clean_exit_handler(msg, ack=True):
                        self.rtid = create_receive_trail(self.rid, data,
                                                         logger)

                # End-of-stream markers sent by the sender.
                if (recv_data == 'END_SUCCESS' or recv_data == 'END_FAIL'):
                    check_credit = False
                    ts = datetime.utcnow().replace(tzinfo=utc)
                    data = {'kb_received': self.kb_received / 1024, }
                    if (recv_data == 'END_SUCCESS'):
                        data['receive_succeeded'] = ts.strftime(
                            settings.SNAP_TS_FORMAT)
                        # delete the share, move the oldest snap to share
                        oldest_snap = get_oldest_snap(sub_vol, 3)
                        if (oldest_snap is not None):
                            snap_path = ('%s/%s' % (sub_vol, oldest_snap))
                            share_path = ('%s%s/%s' %
                                          (settings.MNT_PT, self.dest_pool,
                                           sname))
                            msg = ('Failed to promote the oldest Snapshot(%s) '
                                   'to Share(%s)' % (snap_path, share_path))
                            try:
                                pool = Pool.objects.get(name=self.dest_pool)
                                remove_share(pool, sname)
                                set_property(snap_path, 'ro', 'false',
                                             mount=False)
                                run_command(['/usr/bin/rm', '-rf', share_path],
                                            throw=False)
                                shutil.move(snap_path, share_path)
                                set_property(share_path, 'ro', 'true',
                                             mount=False)
                                delete_snapshot(sname, oldest_snap, logger)
                            except Exception:
                                # Best effort: a failed promotion is logged
                                # (with traceback) but does not fail the
                                # transfer.
                                logger.error(msg)
                                logger.exception(msg)
                    else:
                        logger.error('END_FAIL received for meta: %s. '
                                     'Terminating.' % self.meta)
                        rp.terminate()
                        # Formatted like the other timestamps written in
                        # this method (it was a raw datetime before).
                        data['receive_failed'] = ts.strftime(
                            settings.SNAP_TS_FORMAT)
                        data['status'] = 'failed'

                    msg = ('Failed to update receive trail for rtid: %d'
                           '. meta: %s' % (self.rtid, self.meta))
                    with self._clean_exit_handler(msg, ack=True):
                        update_receive_trail(self.rtid, data, logger)
                    break
                # poll() returning None means btrfs receive is still running.
                if (rp.poll() is None):
                    rp.stdin.write(recv_data)
                    rp.stdin.flush()
                else:
                    logger.error('It seems the btrfs receive process died'
                                 ' unexpectedly.')
                    out, err = rp.communicate()
                    msg = ('Low level system error from btrfs receive '
                           'command. out: %s err: %s for rtid: %s meta: %s'
                           % (out, err, self.rtid, self.meta))
                    with self._clean_exit_handler(msg, ack=True):
                        ts = datetime.utcnow().replace(tzinfo=utc)
                        data = {'receive_failed': ts.strftime(
                                    settings.SNAP_TS_FORMAT),
                                'status': 'failed',
                                'error': msg, }
                        update_receive_trail(self.rtid, data, logger)
            except zmq.error.Again:
                # recv() timed out (see RCVTIMEO above).
                recv_timeout_counter = recv_timeout_counter + 1
                if (recv_timeout_counter > 600):
                    logger.error('Nothing received in the last 60 seconds '
                                 'from the sender for meta: %s. Aborting.'
                                 % self.meta)
                    self._sys_exit(3)
Exemplo n.º 5
0
    def run(self):
        """Receive one replication stream and pipe it into ``btrfs receive``.

        Steps, in order:

        1. Resolve the sender ip from ``self.meta['uuid']``.
        2. Connect a zmq SUB socket (data) and a zmq PUSH socket (meta acks)
           to the sender.
        3. Create the share and its replica metadata object (non-incremental)
           or look up the existing replica metadata id (incremental).
        4. Tell the sender ``snap_exists`` if the snapshot directory is
           already present.
        5. Start ``btrfs receive``, send ``begin_ok`` and write every received
           message to the child's stdin until ``END_SUCCESS`` or ``END_FAIL``
           arrives.

        Sets ``self.sender_ip``, ``self.meta_push``, ``self.rid`` and
        ``self.rtid``, and accumulates the received byte count in
        ``self.kb_received``.

        NOTE(review): ``_clean_exit_handler`` is defined elsewhere; it
        presumably logs ``msg`` and terminates when its body raises (and
        notifies the sender when ``ack=True``) -- confirm against its
        definition.
        """
        set_token()
        msg = ('Failed to get the sender ip from the uuid(%s) for meta: %s' %
               (self.meta['uuid'], self.meta))
        with self._clean_exit_handler(msg):
            self.sender_ip = get_sender_ip(self.meta['uuid'], logger)
        logger.debug('sender ip: %s' % self.sender_ip)

        # Data channel: SUB socket subscribed to messages prefixed with this
        # transfer's id.
        msg = ('Failed to connect to the sender(%s) on data_port(%s). meta: '
               '%s. Aborting.' % (self.sender_ip, self.data_port, self.meta))
        with self._clean_exit_handler(msg):
            #@todo: add validation
            recv_sub = self.ctx.socket(zmq.SUB)
            recv_sub.connect('tcp://%s:%d' % (self.sender_ip, self.data_port))
            # recv() gives up after this timeout and raises zmq.error.Again,
            # which the receive loop below counts.
            recv_sub.RCVTIMEO = 100
            recv_sub.setsockopt(zmq.SUBSCRIBE, str(self.meta['id']))

        # Meta channel: PUSH socket used to send acks back to the sender.
        msg = ('Failed to connect to the sender(%s) on '
               'meta_port(%d). meta: %s. Aborting.' %
               (self.sender_ip, self.meta_port, self.meta))
        with self._clean_exit_handler(msg):
            self.meta_push = self.ctx.socket(zmq.PUSH)
            url = ('tcp://%s:%d' % (self.sender_ip, self.meta_port))
            logger.debug('meta url: %s' % url)
            self.meta_push.connect('tcp://%s:%d' % (self.sender_ip,
                                                    self.meta_port))

        # Local share name is derived from the sender id, sender ip and the
        # source share name.
        sname = ('%s-%s-%s' % (self.sender_id, self.sender_ip, self.src_share))
        if (not self.incremental):
            # First (full) transfer: create the share and its replica
            # metadata object.
            msg = ('Failed to verify/create share: %s. meta: %s. '
                   'Aborting.' % (sname, self.meta))
            with self._clean_exit_handler(msg, ack=True):
                create_share(sname, self.dest_pool, logger)

            msg = ('Failed to create the replica metadata object '
                   'for share: %s. meta: %s. Aborting.' %
                   (sname, self.meta))
            with self._clean_exit_handler(msg, ack=True):
                data = {'share': sname,
                        'appliance': self.sender_ip,
                        'src_share': self.src_share,
                        'data_port': self.data_port,
                        'meta_port': self.meta_port, }
                self.rid = create_rshare(data, logger)

        else:
            # Incremental transfer: the replica metadata object should
            # already exist, so only its id is looked up.
            msg = ('Failed to retreive the replica metadata object for '
                   'share: %s. meta: %s. Aborting.' % (sname, self.meta))
            with self._clean_exit_handler(msg):
                self.rid = rshare_id(sname, logger)

        # Path handed to `btrfs receive` as the destination.
        sub_vol = ('%s%s/%s' % (settings.MNT_PT, self.meta['pool'],
                                sname))

        # Expected snapshot path: <sub_vol>/<first '_' field of snap_name>_<snap_name>
        snap_fp = ('%s/%s_%s' % (sub_vol, self.snap_name.split('_')[0],
                                 self.snap_name))
        logger.info('snap_fp: %s' % snap_fp)
        # This msg doubles as the handler's failure text and the debug line
        # logged when the snapshot is found.
        msg = ('Snaphost: %s already exists.' % snap_fp)
        with self._clean_exit_handler(msg):
            if (os.path.isdir(snap_fp)):
                ack = {'msg': 'snap_exists',
                       'id': self.meta['id'],}
                self.meta_push.send_json(ack)
                logger.debug(msg)
                # NOTE(review): execution continues after snap_exists is
                # sent; confirm that falling through to the receive is
                # intended.

        cmd = [BTRFS, 'receive', sub_vol]
        msg = ('Failed to start the low level btrfs receive command(%s)'
               '. Aborting.' % (cmd))
        with self._clean_exit_handler(msg, ack=True):
            rp = subprocess.Popen(cmd, shell=False, stdin=subprocess.PIPE,
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE)
            logger.debug('Btrfs receive started for snap: %s' % sub_vol)

        # Tell the sender it may start streaming.
        msg = ('Failed to send begin_ok to the sender for meta: %s' %
               self.meta)
        with self._clean_exit_handler(msg):
            ack = {'msg': 'begin_ok',
                   'id': self.meta['id'], }
            self.meta_push.send_json(ack)
            logger.debug('begin_ok sent for meta: %s' % self.meta)
        # Number of consecutive recv() timeouts; reset on every message.
        recv_timeout_counter = 0
        while True:
            try:
                recv_data = recv_sub.recv()
                # Strip the subscription prefix (the transfer id).
                recv_data = recv_data[len(self.meta['id']):]
                recv_timeout_counter = 0
                # Despite the name this accumulates bytes; it is divided by
                # 1024 when reported below.
                self.kb_received = self.kb_received + len(recv_data)
                if (self.rtid is None):
                    # First message of the transfer: record the snapshot and
                    # open a receive trail for it.
                    msg = ('Failed to create snapshot: %s. Aborting.' %
                           self.snap_name)
                    with self._clean_exit_handler(msg, ack=True):
                        create_snapshot(sname, self.snap_name, logger,
                                        snap_type='receiver')

                    data = {'snap_name': self.snap_name}
                    msg = ('Failed to create receive trail for rid: %d'
                           '. meta: %s' % (self.rid, self.meta))
                    with self._clean_exit_handler(msg, ack=True):
                        self.rtid = create_receive_trail(self.rid, data,
                                                         logger)

                # End-of-stream markers sent by the sender.
                if (recv_data == 'END_SUCCESS' or recv_data == 'END_FAIL'):
                    ts = datetime.utcnow().replace(tzinfo=utc)
                    data = {'kb_received': self.kb_received / 1024,}
                    if (recv_data == 'END_SUCCESS'):
                        logger.debug('END_SUCCESS received for meta: %s' %
                                     self.meta)
                        data['receive_succeeded'] = ts
                    else:
                        logger.error('END_FAIL received for meta: %s. '
                                     'Terminating.' % self.meta)
                        rp.terminate()
                        data['receive_failed'] = ts
                        data['status'] = 'failed'

                    msg = ('Failed to update receive trail for rtid: %d'
                               '. meta: %s' % (self.rtid, self.meta))
                    with self._clean_exit_handler(msg, ack=True):
                        update_receive_trail(self.rtid, data, logger)
                    break
                # poll() returning None means btrfs receive is still running.
                if (rp.poll() is None):
                    rp.stdin.write(recv_data)
                    rp.stdin.flush()
                else:
                    logger.error('It seems the btrfs receive process died'
                                 ' unexpectedly.')
                    out, err = rp.communicate()
                    logger.debug('btrfs receive out: %s err: %s' % (out, err))
                    msg = ('Low level system error from btrfs receive '
                           'command. out: %s err: %s for rtid: %s meta: %s'
                           % (out, err, self.rtid, self.meta))
                    with self._clean_exit_handler(msg, ack=True):
                        data = {'receive_failed': (
                            datetime.utcnow().replace(tzinfo=utc)),
                                'status': 'failed',
                                'error': msg,}
                        update_receive_trail(self.rtid, data, logger)
                    # NOTE(review): there is no break/exit in this branch;
                    # the loop continues unless the handler above terminates
                    # the process -- confirm.
            except zmq.error.Again:
                # recv() timed out (see RCVTIMEO above).
                recv_timeout_counter = recv_timeout_counter + 1
                if (recv_timeout_counter > 300):
                    logger.error('Nothing received in the last 30 seconds '
                                 'from the sender for meta: %s. Aborting.'
                                 % self.meta)
                    raise
            except Exception, e:
                msg = ('Exception occured while receiving fsdata')
                logger.error(msg)
                logger.exception(e)
                rp.terminate()
                out, err = rp.communicate()
                logger.debug('rc: %d out: %s err: %s' % (rp.returncode, out,
                                                         err))
                # NOTE(review): `data` is only bound earlier on some paths
                # (non-incremental setup, or after the first message); on an
                # incremental run that fails before then this would raise
                # NameError -- verify.
                data['receive_failed'] = datetime.utcnow().replace(tzinfo=utc)
                data['status'] = 'failed'
                data['error'] = msg

                # NOTE(review): '%d' raises TypeError if self.rtid is still
                # None at this point -- verify.
                msg = ('Failed to update receive trail for rtid: %d'
                       '. meta: %s' % (self.rtid, self.meta))
                with self._clean_exit_handler(msg, ack=True):
                    update_receive_trail(self.rtid, data, logger)
                self._sys_exit(3)
            finally:
                # NOTE(review): the body of this finally clause is not
                # present in this excerpt.
Exemplo n.º 6
0
    def run(self):
        """Prepare one replication attempt and launch ``btrfs send``.

        Steps, in order:

        1. Connect a zmq PUSH socket to the receiver's meta port.
        2. Create a replica trail for ``self.snap_name`` and keep it in
           ``self.rt2`` / ``self.rt2_id``.
        3. Create the snapshot ``self.snap_name`` of ``self.replica.share``.
        4. Send ``self.meta_begin`` to the receiver and wait for its answer
           through ``self._process_q()``. When the answer is ``snap_exists``
           the replica status is recorded as succeeded and
           ``self._sys_exit(0)`` is called.
        5. Build the ``btrfs send`` command (incremental against
           ``self.rt.snap_name`` when ``self.rt`` is set, full otherwise) and
           start it with its stdout pipe switched to non-blocking mode.

        Every step runs under ``self._clean_exit_handler(msg)`` or
        ``self._update_trail_and_quit(msg)``. Both are defined outside this
        block; presumably they log ``msg`` and abort when the wrapped code
        raises -- confirm against their definitions.
        """
        msg = ('Failed to connect to receiver(%s) on meta port'
               '(%d) for snap_name: %s. Aborting.' %
               (self.receiver_ip, self.rmeta_port, self.snap_name))
        with self._clean_exit_handler(msg):
            meta_push = self.ctx.socket(zmq.PUSH)
            meta_push.connect('tcp://%s:%d' %
                              (self.receiver_ip, self.rmeta_port))

        #  1. create a new replica trail if it's the very first time
        #  or if the last one succeeded
        msg = ('Failed to create local replica trail for snap_name:'
               ' %s. Aborting.' % self.snap_name)
        with self._clean_exit_handler(msg):
            self.rt2 = create_replica_trail(self.replica.id, self.snap_name,
                                            logger)
            self.rt2_id = self.rt2['id']

        #  2. create a snapshot only if it's not already from a previous
        #  failed attempt.
        msg = ('Failed to create snapshot: %s. Aborting.' % self.snap_name)
        with self._clean_exit_handler(msg):
            create_snapshot(self.replica.share, self.snap_name, logger)

        #  let the receiver know that following diff is coming
        msg = ('Failed to send initial metadata communication to the '
               'receiver(%s), most likely due to a network error. Aborting.' %
               self.receiver_ip)
        with self._update_trail_and_quit(msg):
            meta_push.send_json(self.meta_begin)

        msg = ('Timeout occured(60 seconds) while waiting for OK '
               'from the receiver(%s) to start sending data. Aborting.' %
               self.receiver_ip)
        with self._update_trail_and_quit(msg):
            ack = self._process_q()
            if (ack['msg'] == 'snap_exists'):
                # The receiver already holds this snapshot: nothing to send,
                # so close the trail as succeeded and exit with status 0.
                end_ts = datetime.utcnow().replace(tzinfo=utc).strftime(
                    settings.SNAP_TS_FORMAT)
                data = {
                    'status': 'succeeded',
                    'end_ts': end_ts,
                    'error': 'snapshot already exists on the receiver',
                }
                msg = ('Failed to update replica status for snap_name: %s. '
                       'Aborting.' % self.snap_name)
                with self._clean_exit_handler(msg):
                    update_replica_status(self.rt2_id, data, logger)
                    self._sys_exit(0)

        # Directory holding every snapshot of the replicated share; both the
        # new snapshot and the optional parent snapshot live under it.
        snap_dir = ('%s%s/.snapshots/%s' %
                    (settings.MNT_PT, self.replica.pool, self.replica.share))
        snap_path = ('%s/%s' % (snap_dir, self.snap_name))
        if (self.rt is not None):
            # A previous trail exists: send only the delta against it (-p).
            prev_snap = ('%s/%s' % (snap_dir, self.rt.snap_name))
            logger.info('Sending incremental replica between %s -- %s' %
                        (prev_snap, snap_path))
            cmd = [BTRFS, 'send', '-p', prev_snap, snap_path]
        else:
            logger.info('Sending full replica: %s' % snap_path)
            cmd = [BTRFS, 'send', snap_path]

        try:
            sp = subprocess.Popen(cmd,
                                  shell=False,
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE)
            # Non-blocking stdout so reads from the send stream never stall.
            fcntl.fcntl(sp.stdout.fileno(), fcntl.F_SETFL, os.O_NONBLOCK)
        except Exception as e:
            msg = ('Failed to start the low level btrfs send '
                   'command(%s). Aborting' % cmd)
            logger.error(msg)
            logger.exception(e)
            # Queue an END_FAIL marker for this snapshot on self.pub before
            # exiting with a failure status.
            with self._update_trail_and_quit(msg):
                self.pub.put('%sEND_FAIL' % self.snap_id)
            self._sys_exit(3)
Exemplo n.º 7
0
    def run(self):
        """Receive one snapshot stream from the sender into ``btrfs receive``.

        Visible flow:

        1. Resolve the sender ip from ``self.meta['uuid']`` and connect a zmq
           SUB socket (data, 100 ms receive timeout) and a PUSH socket (meta)
           to it.
        2. For a non-incremental transfer create the destination share and
           its replica metadata object; otherwise look up the existing
           replica id. The result is stored in ``self.rid``.
        3. Make sure the parent subvolume for received snapshots exists and
           tell the sender ``snap_exists`` if the snapshot is already there.
        4. Start ``btrfs receive``, send ``begin_ok`` and loop: pipe every
           received message into the process, topping up the sender's credit
           when it runs low, until ``END_SUCCESS`` / ``END_FAIL`` arrives or
           more than 600 consecutive receive timeouts happen.

        Most steps run under ``self._clean_exit_handler(msg[, ack=True])``,
        which is defined outside this block; presumably it logs ``msg`` and
        aborts when the wrapped code raises -- confirm against its definition.
        """
        msg = ('Failed to get the sender ip from the uuid(%s) for meta: %s' %
               (self.meta['uuid'], self.meta))
        with self._clean_exit_handler(msg):
            self.sender_ip = get_sender_ip(self.meta['uuid'], logger)

        msg = ('Failed to connect to the sender(%s) on data_port(%s). meta: '
               '%s. Aborting.' % (self.sender_ip, self.data_port, self.meta))
        with self._clean_exit_handler(msg):
            #@todo: add validation
            recv_sub = self.ctx.socket(zmq.SUB)
            recv_sub.connect('tcp://%s:%d' % (self.sender_ip, self.data_port))
            # recv() raises zmq.error.Again after 100 ms without a message.
            recv_sub.RCVTIMEO = 100
            # Only messages prefixed with this transfer's id are delivered.
            recv_sub.setsockopt(zmq.SUBSCRIBE, str(self.meta['id']))

        msg = ('Failed to connect to the sender(%s) on '
               'meta_port(%d). meta: %s. Aborting.' %
               (self.sender_ip, self.meta_port, self.meta))
        with self._clean_exit_handler(msg):
            self.meta_push = self.ctx.socket(zmq.PUSH)
            self.meta_push.connect('tcp://%s:%d' % (self.sender_ip,
                                                    self.meta_port))

        sname = ('%s_%s' % (self.sender_id, self.src_share))
        if (not self.incremental):
            msg = ('Failed to verify/create share: %s. meta: %s. '
                   'Aborting.' % (sname, self.meta))
            with self._clean_exit_handler(msg, ack=True):
                create_share(sname, self.dest_pool, logger)

            msg = ('Failed to create the replica metadata object '
                   'for share: %s. meta: %s. Aborting.' %
                   (sname, self.meta))
            with self._clean_exit_handler(msg, ack=True):
                data = {'share': sname,
                        'appliance': self.sender_ip,
                        'src_share': self.src_share,
                        'data_port': self.data_port,
                        'meta_port': self.meta_port, }
                self.rid = create_rshare(data, logger)

        else:
            msg = ('Failed to retreive the replica metadata object for '
                   'share: %s. meta: %s. Aborting.' % (sname, self.meta))
            with self._clean_exit_handler(msg):
                self.rid = rshare_id(sname, logger)

        sub_vol = ('%s%s/.snapshots/%s' % (settings.MNT_PT, self.meta['pool'],
                                           sname))
        if (not is_subvol(sub_vol)):
            msg = ('Failed to create parent subvolume %s' % sub_vol)
            with self._clean_exit_handler(msg, ack=True):
                run_command([BTRFS, 'subvolume', 'create', sub_vol])

        snap_fp = ('%s/%s' % (sub_vol, self.snap_name))
        # Give this step its own message; previously it reused whatever msg
        # the preceding step had left behind, mislabelling any failure here.
        msg = ('Failed to verify whether snapshot(%s) already exists or to '
               'notify the sender. meta: %s. Aborting.' %
               (snap_fp, self.meta))
        with self._clean_exit_handler(msg):
            if (is_subvol(snap_fp)):
                ack = {'msg': 'snap_exists',
                       'id': self.meta['id'], }
                self.meta_push.send_json(ack)
                # NOTE(review): execution continues after snap_exists is
                # sent and btrfs receive is still started below -- confirm
                # this is intended.

        cmd = [BTRFS, 'receive', sub_vol]
        msg = ('Failed to start the low level btrfs receive command(%s)'
               '. Aborting.' % (cmd))
        with self._clean_exit_handler(msg, ack=True):
            rp = subprocess.Popen(cmd, shell=False, stdin=subprocess.PIPE,
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE)

        msg = ('Failed to send begin_ok to the sender for meta: %s' %
               self.meta)
        with self._clean_exit_handler(msg):
            ack = {'msg': 'begin_ok',
                   'id': self.meta['id'], }
            self.meta_push.send_json(ack)
        # Consecutive receive timeouts; reset whenever a message arrives.
        recv_timeout_counter = 0
        # Number of messages the sender may still send before it needs a
        # 'send_more' ack from us.
        credit = settings.DEFAULT_SEND_CREDIT
        check_credit = True
        while True:
            if (check_credit is True and credit < 5):
                # Credit is running low: grant the sender another batch.
                ack = {'msg': 'send_more',
                       'id': self.meta['id'],
                       'credit': settings.DEFAULT_SEND_CREDIT, }
                self.meta_push.send_json(ack)
                credit = credit + settings.DEFAULT_SEND_CREDIT
                logger.debug('%d KB received for %s' %
                             (int(self.kb_received / 1024), sname))

            try:
                recv_data = recv_sub.recv()
                # Strip the subscription prefix (the transfer id).
                recv_data = recv_data[len(self.meta['id']):]
                credit = credit - 1
                recv_timeout_counter = 0
                # Despite its name this accumulates the payload length as
                # returned by len(); it is divided by 1024 when reported.
                self.kb_received = self.kb_received + len(recv_data)
                if (self.rtid is None):
                    # First message of the transfer: set up the snapshot
                    # record and the receive trail.
                    msg = ('Failed to create snapshot: %s. Aborting.' %
                           self.snap_name)
                    # create a snapshot only if it's not already from a previous failed attempt
                    with self._clean_exit_handler(msg, ack=True):
                        create_snapshot(sname, self.snap_name, logger,
                                        snap_type='receiver')

                    data = {'snap_name': self.snap_name}
                    msg = ('Failed to create receive trail for rid: %d'
                           '. meta: %s' % (self.rid, self.meta))
                    with self._clean_exit_handler(msg, ack=True):
                        self.rtid = create_receive_trail(self.rid, data,
                                                         logger)

                if (recv_data == 'END_SUCCESS' or recv_data == 'END_FAIL'):
                    # Terminal marker from the sender: stop granting credit
                    # and record the outcome in the receive trail.
                    check_credit = False
                    ts = datetime.utcnow().replace(tzinfo=utc)
                    data = {'kb_received': self.kb_received / 1024, }
                    if (recv_data == 'END_SUCCESS'):
                        data['receive_succeeded'] = ts
                        #delete the share, move the oldest snap to share
                        oldest_snap = get_oldest_snap(sub_vol, 3)
                        if (oldest_snap is not None):
                            snap_path = ('%s/%s' % (sub_vol, oldest_snap))
                            share_path = ('%s%s/%s' %
                                          (settings.MNT_PT, self.dest_pool,
                                           sname))
                            msg = ('Failed to promote the oldest Snapshot(%s) '
                                   'to Share(%s)' % (snap_path, share_path))
                            try:
                                pool = Pool.objects.get(name=self.dest_pool)
                                remove_share(pool, sname)
                                # Make the snapshot writable so it can be
                                # moved, then read-only again once in place.
                                set_property(snap_path, 'ro', 'false',
                                             mount=False)
                                run_command(['/usr/bin/rm', '-rf', share_path],
                                            throw=False)
                                shutil.move(snap_path, share_path)
                                set_property(share_path, 'ro', 'true',
                                             mount=False)
                                delete_snapshot(sname, oldest_snap, logger)
                            except Exception as e:
                                # Best effort: a failed promotion is logged
                                # but does not fail the receive itself.
                                logger.error(msg)
                                logger.exception(e)
                    else:
                        logger.error('END_FAIL received for meta: %s. '
                                     'Terminating.' % self.meta)
                        rp.terminate()
                        data['receive_failed'] = ts
                        data['status'] = 'failed'

                    msg = ('Failed to update receive trail for rtid: %d'
                           '. meta: %s' % (self.rtid, self.meta))
                    with self._clean_exit_handler(msg, ack=True):
                        update_receive_trail(self.rtid, data, logger)
                    break
                if (rp.poll() is None):
                    # btrfs receive is still running: feed it the payload.
                    rp.stdin.write(recv_data)
                    rp.stdin.flush()
                else:
                    logger.error('It seems the btrfs receive process died'
                                 ' unexpectedly.')
                    out, err = rp.communicate()
                    msg = ('Low level system error from btrfs receive '
                           'command. out: %s err: %s for rtid: %s meta: %s'
                           % (out, err, self.rtid, self.meta))
                    with self._clean_exit_handler(msg, ack=True):
                        ts = datetime.utcnow().replace(tzinfo=utc)
                        data = {'receive_failed': ts,
                                'status': 'failed',
                                'error': msg, }
                        update_receive_trail(self.rtid, data, logger)
                    # NOTE(review): no break/exit is visible on this path, so
                    # the loop appears to keep receiving after the process
                    # died -- confirm whether the handler exits here.
            except zmq.error.Again:
                # 100 ms receive timeout elapsed without a message.
                recv_timeout_counter = recv_timeout_counter + 1
                if (recv_timeout_counter > 600):
                    logger.error('Nothing received in the last 60 seconds '
                                 'from the sender for meta: %s. Aborting.'
                                 % self.meta)
                    self._sys_exit(3)