Exemplo n.º 1
0
 def _update_trail_and_quit(self, msg):
     """Record a failure of the wrapped block on the replica trail.

     Generator with a single ``yield``; presumably wrapped by
     ``contextlib.contextmanager`` outside this view -- TODO confirm.

     :param msg: text that is logged and stored as the trail's ``error``.

     Any ``Exception`` escaping the wrapped block is logged and swallowed,
     then the trail ``self.rt2_id`` is updated to ``failed`` with the
     current UTC time as ``end_ts``. A failure of that update is logged
     too and never propagates.

     NOTE(review): despite the name, nothing visible here ends the process;
     confirm whether an exit call is expected after the cleanup.
     """
     try:
         yield
     except Exception as e:
         logger.error(msg)
         logger.exception(e)
         try:
             data = {"status": "failed", "error": msg, "end_ts": datetime.utcnow().replace(tzinfo=utc)}
             update_replica_status(self.rt2_id, data, logger)
         except Exception as cleanup_err:
             # The status update is best effort: log the problem and carry on.
             logger.error("Exception occured in cleanup handler")
             logger.exception(cleanup_err)
Exemplo n.º 2
0
 def _update_trail_and_quit(self, msg):
     """Log a failure of the wrapped block and mark the replica trail failed.

     Generator with a single ``yield``; presumably wrapped by
     ``contextlib.contextmanager`` outside this view -- TODO confirm.

     :param msg: text that is logged (together with the exception text) and
         stored as the trail's ``error``.

     Any ``Exception`` escaping the wrapped block is swallowed after being
     logged. The trail ``self.rt2_id`` is then updated to ``failed`` with
     the current UTC time, rendered with ``settings.SNAP_TS_FORMAT``, as
     ``end_ts``. A failure of that update is only logged.

     NOTE(review): despite the name, nothing visible here ends the process;
     confirm whether an exit call is expected after the cleanup.
     """
     try:
         yield
     except Exception as e:
         logger.error('%s. Exception: %s' % (msg, e.__str__()))
         try:
             end_ts = datetime.utcnow().replace(tzinfo=utc)
             data = {
                 'status': 'failed',
                 'error': msg,
                 'end_ts': end_ts.strftime(settings.SNAP_TS_FORMAT),
             }
             update_replica_status(self.rt2_id, data, logger)
         except Exception as cleanup_err:
             # The status update is best effort: log the problem and carry on.
             logger.error('Exception occured in cleanup handler: %s'
                          % cleanup_err.__str__())
Exemplo n.º 3
0
 def _update_trail_and_quit(self, msg):
     """Mark the replica trail as failed when the wrapped block raises.

     Generator with a single ``yield``; presumably wrapped by
     ``contextlib.contextmanager`` outside this view -- TODO confirm.

     :param msg: text that is logged and stored as the trail's ``error``.

     An ``Exception`` from the wrapped block is logged (message plus
     traceback) and swallowed. The trail ``self.rt2_id`` is then updated
     with status ``failed`` and the current UTC time as ``end_ts``; if that
     update raises, the error is logged and suppressed as well.

     NOTE(review): despite the name, nothing visible here ends the process;
     confirm whether an exit call is expected after the cleanup.
     """
     try:
         yield
     except Exception as e:
         logger.error(msg)
         logger.exception(e)
         try:
             data = {'status': 'failed',
                     'error': msg,
                     'end_ts': datetime.utcnow().replace(tzinfo=utc), }
             update_replica_status(self.rt2_id, data, logger)
         except Exception as cleanup_err:
             # The status update is best effort: log the problem and carry on.
             logger.error('Exception occured in cleanup handler')
             logger.exception(cleanup_err)
Exemplo n.º 4
0
 def _update_trail_and_quit(self, msg):
     """Flag the replica trail as failed if the wrapped block raises.

     Generator with a single ``yield``; presumably wrapped by
     ``contextlib.contextmanager`` outside this view -- TODO confirm.

     :param msg: text that is logged and stored as the trail's ``error``.

     An ``Exception`` from the wrapped block is logged (message plus
     traceback) and not re-raised. Afterwards the trail ``self.rt2_id`` is
     updated with status ``failed`` and the current UTC time as ``end_ts``.
     An error during that update is logged and suppressed too.

     NOTE(review): despite the name, nothing visible here ends the process;
     confirm whether an exit call is expected after the cleanup.
     """
     try:
         yield
     except Exception as e:
         logger.error(msg)
         logger.exception(e)
         try:
             data = {
                 'status': 'failed',
                 'error': msg,
                 'end_ts': datetime.utcnow().replace(tzinfo=utc),
             }
             update_replica_status(self.rt2_id, data, logger)
         except Exception as cleanup_err:
             # The status update is best effort: log the problem and carry on.
             logger.error('Exception occured in cleanup handler')
             logger.exception(cleanup_err)
Exemplo n.º 5
0
    def run(self):
        """Run the replication scheduler loop until the parent process changes.

        Each iteration:

        * forwards everything queued in ``self.pubq`` over the PUB socket;
        * reads up to 1000 meta messages from the PULL socket, starting a
          ``Receiver`` for ``begin`` messages and putting any other message
          on the queue of the sender registered under its ``id``;
        * calls ``self._prune_workers`` and, when more than 3600 seconds have
          passed since ``self.prune_time``, prunes the receive/replica trails;
        * once ``total_sleep`` has reached 60 and fewer than 50 senders are
          tracked, walks the replicas and starts a ``Sender`` where the state
          of the latest trail calls for one.

        Returns ``None``. Returns immediately (after logging) when the
        replication interface or the uuid cannot be determined.
        """
        try:
            self.rep_ip = self._replication_interface()
            self.uuid = self._my_uuid()
        except Exception as e:
            # Nothing can be bound without these two values: log and give up.
            msg = ('Failed to get replication interface or uuid. '
                   'Aborting.')
            logger.error(msg)
            logger.exception(e)
            return

        ctx = zmq.Context()
        #  fs diffs are sent via this publisher.
        rep_pub = ctx.socket(zmq.PUB)
        rep_pub.bind('tcp://%s:%d' % (self.rep_ip, self.data_port))

        #  synchronization messages are received in this pull socket
        meta_pull = ctx.socket(zmq.PULL)
        #  bounded receive wait: an idle socket raises zmq.error.Again
        #  (handled below) instead of blocking the loop.
        meta_pull.RCVTIMEO = 100
        meta_pull.bind('tcp://%s:%d' % (self.rep_ip, self.meta_port))

        total_sleep = 0
        while True:
            if (os.getppid() != self.ppid):
                logger.error('Parent exited. Aborting.')
                break

            #  flush everything the workers queued for publishing
            while (not self.pubq.empty()):
                msg = self.pubq.get()
                rep_pub.send(msg)

            #  check for any recv's coming; capped at 1000 per iteration so
            #  the rest of the loop still gets to run.
            num_msgs = 0
            while (num_msgs < 1000):
                try:
                    self.recv_meta = meta_pull.recv_json()
                    num_msgs = num_msgs + 1
                    snap_id = self.recv_meta['id']
                    if (self.recv_meta['msg'] == 'begin'):
                        rw = Receiver(self.recv_meta)
                        self.receivers[snap_id] = rw
                        rw.start()
                    elif (snap_id not in self.senders):
                        logger.error('Unknown snap_id(%s) received. Ignoring'
                                     % snap_id)
                    else:
                        self.senders[snap_id].q.put(self.recv_meta)
                except zmq.error.Again:
                    #  nothing (more) to read within the receive timeout
                    break

            self._prune_workers((self.receivers, self.senders))

            if (int(time.time()) - self.prune_time > 3600):
                self.prune_time = int(time.time())
                for rs in ReplicaShare.objects.all():
                    prune_receive_trail(rs.id, logger)
                for r in Replica.objects.all():
                    prune_replica_trail(r.id, logger)

            if (total_sleep >= 60 and len(self.senders) < 50):

                try:
                    for r in get_replicas(logger):
                        rt = get_replica_trail(r.id, logger)
                        now = datetime.utcnow().replace(second=0,
                                                        microsecond=0,
                                                        tzinfo=utc)
                        sw = None
                        #  name the new snapshot after the next trail id
                        snap_name = 'replication'
                        rt2 = ReplicaTrail.objects.filter().order_by('-id')
                        if (len(rt2) != 0):
                            snap_name = ('%s_%d' % (snap_name, rt2[0].id + 1))
                        else:
                            snap_name = ('%s_1' % snap_name)
                        snap_id = ('%s_%s_%s_%s' %
                                   (self.uuid, r.pool, r.share, snap_name))
                        if (len(rt) == 0):
                            #  no trail yet: first, full send
                            logger.debug('new sender for snap: %s' % snap_id)
                            sw = Sender(r, self.rep_ip, self.pubq, Queue(),
                                        snap_name, self.meta_port,
                                        self.data_port, r.meta_port, self.uuid,
                                        snap_id)
                        elif (rt[0].status == 'succeeded'):
                            #  only send again once r.frequency * 60 seconds
                            #  have passed since the last trail ended
                            if (((now - rt[0].end_ts).total_seconds() >
                                 (r.frequency * 60))):
                                logger.debug('incremental sender for snap: %s'
                                             % snap_id)
                                sw = Sender(r, self.rep_ip, self.pubq, Queue(),
                                            snap_name, self.meta_port,
                                            self.data_port, r.meta_port,
                                            self.uuid, snap_id, rt[0])
                            else:
                                continue
                        elif (rt[0].status == 'pending'):
                            prev_snap_id = ('%s_%s_%s_%s' % (self.uuid,
                                            r.pool, r.share, rt[0].snap_name))
                            if (prev_snap_id in self.senders):
                                #  report the snap that is actually being
                                #  sent, not the one that would come next
                                logger.debug('send process ongoing for snap: '
                                             '%s' % prev_snap_id)
                                continue
                            logger.debug('%s not found in senders. Previous '
                                         'sender must have Aborted. Marking '
                                         'it as failed' % prev_snap_id)
                            msg = ('Sender process Aborted. See logs for '
                                   'more information')
                            data = {
                                'status': 'failed',
                                'end_ts': now.strftime(
                                    settings.SNAP_TS_FORMAT),
                                'error': msg,
                                'send_failed': now,
                            }
                            update_replica_status(rt[0].id, data, logger)
                            continue
                        elif (rt[0].status == 'failed'):
                            #  retry with the snapshot name of the failed trail
                            snap_name = rt[0].snap_name
                            #  count the leading failed trails (not older than
                            #  r.ts); disable the replica once
                            #  self.MAX_ATTEMPTS is reached
                            num_tries = 0
                            for rto in rt:
                                if (rto.status != 'failed' or
                                    num_tries >= self.MAX_ATTEMPTS or
                                    rto.end_ts < r.ts):
                                    break
                                num_tries = num_tries + 1
                            if (num_tries >= self.MAX_ATTEMPTS):
                                logger.info('Maximum attempts(%d) reached '
                                            'for snap: %s. Disabling the '
                                            'replica.' %
                                            (self.MAX_ATTEMPTS, snap_id))
                                disable_replica(r.id, logger)
                                continue
                            logger.info('previous backup failed for snap: '
                                        '%s. Starting a new one. Attempt '
                                        '%d/%d.' % (snap_id, num_tries,
                                                    self.MAX_ATTEMPTS))
                            #  first succeeded trail in rt, if any, is handed
                            #  to the Sender as its previous trail
                            prev_rt = None
                            for rto in rt:
                                if (rto.status == 'succeeded'):
                                    prev_rt = rto
                                    break
                            sw = Sender(r, self.rep_ip, self.pubq, Queue(),
                                        snap_name, self.meta_port,
                                        self.data_port, r.meta_port,
                                        self.uuid, snap_id, prev_rt)
                        else:
                            logger.error('unknown replica trail status: %s. '
                                         'ignoring snap: %s' %
                                         (rt[0].status, snap_id))
                            continue
                        self.senders[snap_id] = sw
                        sw.daemon = True
                        sw.start()
                    #  reset only after a complete scan; after a
                    #  DatabaseError the scan is retried on the next iteration
                    total_sleep = 0
                except DatabaseError as e:
                    e_msg = ('Error getting the list of enabled replica '
                             'tasks. Moving on')
                    logger.error(e_msg)
                    logger.exception(e)

            time.sleep(1)
            total_sleep = total_sleep + 1
Exemplo n.º 6
0
    def run(self):
        """Prepare a send to the receiver and launch ``btrfs send``.

        Steps, in order: connect a PUSH socket to the receiver's meta port,
        create the local replica trail, create the snapshot, push
        ``self.meta_begin`` and wait for the receiver's reply, then start the
        ``btrfs send`` subprocess (incremental when ``self.rt`` is set).

        If the receiver answers ``snap_exists`` the trail is marked
        ``succeeded`` and ``self._sys_exit(0)`` is called. If the subprocess
        cannot be started, an ``END_FAIL`` marker is queued on ``self.pub``
        and ``self._sys_exit(3)`` is called.
        """
        msg = "Failed to connect to receiver(%s) on meta port(%d) for snap_name: %s. Aborting." % (
            self.receiver_ip,
            self.rmeta_port,
            self.snap_name,
        )
        with self._clean_exit_handler(msg):
            meta_push = self.ctx.socket(zmq.PUSH)
            meta_push.connect("tcp://%s:%d" % (self.receiver_ip, self.rmeta_port))

        #  1. create a new replica trail if it's the very first time
        #  or if the last one succeeded
        msg = "Failed to create local replica trail for snap_name: %s. Aborting." % self.snap_name
        with self._clean_exit_handler(msg):
            self.rt2 = create_replica_trail(self.replica.id, self.snap_name, logger)
            self.rt2_id = self.rt2["id"]

        #  2. create a snapshot only if it's not already from a previous
        #  failed attempt.
        msg = "Failed to create snapshot: %s. Aborting." % self.snap_name
        with self._clean_exit_handler(msg):
            create_snapshot(self.replica.share, self.snap_name, logger)

        #  let the receiver know that following diff is coming
        msg = (
            "Failed to send initial metadata communication to the "
            "receiver(%s), most likely due to a network error. Aborting." % self.receiver_ip
        )
        with self._update_trail_and_quit(msg):
            meta_push.send_json(self.meta_begin)

        msg = (
            "Timeout occured(60 seconds) while waiting for OK "
            "from the receiver(%s) to start sending data. Aborting." % self.receiver_ip
        )
        with self._update_trail_and_quit(msg):
            ack = self._process_q()
            if ack["msg"] == "snap_exists":
                #  nothing to send: close the trail as succeeded and exit
                data = {
                    "status": "succeeded",
                    "end_ts": datetime.utcnow().replace(tzinfo=utc),
                    "error": "snapshot already exists on the receiver",
                }
                msg = "Failed to update replica status for snap_name: %s. Aborting." % self.snap_name
                with self._clean_exit_handler(msg):
                    update_replica_status(self.rt2_id, data, logger)
                    self._sys_exit(0)

        snap_path = "%s%s/.snapshots/%s/%s" % (settings.MNT_PT, self.replica.pool, self.replica.share, self.snap_name)
        cmd = [BTRFS, "send", snap_path]
        if self.rt is not None:
            #  a previous trail exists: send only the difference against its
            #  snapshot (-p names the parent)
            prev_snap = "%s%s/.snapshots/%s/%s" % (
                settings.MNT_PT,
                self.replica.pool,
                self.replica.share,
                self.rt.snap_name,
            )
            logger.info("Sending incremental replica between %s -- %s" % (prev_snap, snap_path))
            cmd = [BTRFS, "send", "-p", prev_snap, snap_path]
        else:
            logger.info("Sending full replica: %s" % snap_path)

        try:
            sp = subprocess.Popen(cmd, shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            #  make reads from the child's stdout non-blocking
            fcntl.fcntl(sp.stdout.fileno(), fcntl.F_SETFL, os.O_NONBLOCK)
        except Exception as e:
            msg = "Failed to start the low level btrfs send command(%s). Aborting" % cmd
            logger.error(msg)
            logger.exception(e)
            with self._update_trail_and_quit(msg):
                self.pub.put("%sEND_FAIL" % self.snap_id)
            self._sys_exit(3)
Exemplo n.º 7
0
                self._sys_exit(3)

        # Wait for the receiver's final reply; a failure inside the block is
        # handled by self._update_trail_and_quit with this message.
        msg = (
            "Timeout occured(60 seconds) while waiting for final "
            "send confirmation from the receiver(%s) for %s. Aborting." % (self.receiver_ip, self.snap_id)
        )
        with self._update_trail_and_quit(msg):
            ack = self._process_q()

        # Start from a "succeeded" record and downgrade it below if the
        # receiver reported an error.
        end_ts = datetime.utcnow().replace(tzinfo=utc)
        data = {"status": "succeeded", "kb_sent": self.kb_sent / 1024, "end_ts": end_ts}
        if ack["msg"] == "receive_error":
            msg = "Receiver(%s) returned a processing error for " " %s. Check it for more information." % (
                self.receiver_ip,
                self.snap_id,
            )
            data["status"] = "failed"
            data["error"] = msg
            data["send_failed"] = end_ts
        else:
            # Successful send: delete the snapshot returned by
            # get_oldest_snap, if there is one.
            # NOTE(review): the meaning of the literal 3 is not visible
            # here -- presumably the number of snapshots to keep; confirm.
            share_path = "%s%s/.snapshots/%s" % (settings.MNT_PT, self.replica.pool, self.replica.share)
            oldest_snap = get_oldest_snap(share_path, 3)
            if oldest_snap is not None:
                msg = "Failed to delete snapshot: %s. Aborting." % oldest_snap
                with self._clean_exit_handler(msg):
                    delete_snapshot(self.replica.share, oldest_snap, logger)

        # Persist the final outcome (succeeded or failed) on the trail.
        msg = "Failed to update final replica status for %s" ". Aborting." % self.snap_id
        with self._clean_exit_handler(msg):
            update_replica_status(self.rt2_id, data, logger)
Exemplo n.º 8
0
    def run(self):
        """Prepare a send to the receiver and launch ``btrfs send``.

        Steps, in order: connect a PUSH socket to the receiver's meta port,
        create the local replica trail, create the snapshot, push
        ``self.meta_begin`` and wait for the receiver's reply, then start the
        ``btrfs send`` subprocess (incremental when ``self.rt`` is set).

        If the receiver answers ``snap_exists`` the trail is marked
        ``succeeded`` and ``self._sys_exit(0)`` is called. If the subprocess
        cannot be started, an ``END_FAIL`` marker is queued on ``self.pub``
        and ``self._sys_exit(3)`` is called.
        """
        msg = ('Failed to connect to receiver(%s) on meta port'
               '(%d) for snap_name: %s. Aborting.' %
               (self.receiver_ip, self.rmeta_port, self.snap_name))
        with self._clean_exit_handler(msg):
            meta_push = self.ctx.socket(zmq.PUSH)
            meta_push.connect('tcp://%s:%d' % (self.receiver_ip,
                                               self.rmeta_port))

        #  1. create a new replica trail if it's the very first time
        #  or if the last one succeeded
        msg = ('Failed to create local replica trail for snap_name:'
               ' %s. Aborting.' % self.snap_name)
        with self._clean_exit_handler(msg):
            self.rt2 = create_replica_trail(self.replica.id,
                                            self.snap_name, logger)
            self.rt2_id = self.rt2['id']

        #  2. create a snapshot only if it's not already from a previous
        #  failed attempt.
        msg = ('Failed to create snapshot: %s. Aborting.' % self.snap_name)
        with self._clean_exit_handler(msg):
            create_snapshot(self.replica.share, self.snap_name, logger)

        #  let the receiver know that following diff is coming
        msg = ('Failed to send initial metadata communication to the '
               'receiver(%s), most likely due to a network error. Aborting.'
               % self.receiver_ip)
        with self._update_trail_and_quit(msg):
            meta_push.send_json(self.meta_begin)

        msg = ('Timeout occured(60 seconds) while waiting for OK '
               'from the receiver(%s) to start sending data. Aborting.'
               % self.receiver_ip)
        with self._update_trail_and_quit(msg):
            ack = self._process_q()
            if (ack['msg'] == 'snap_exists'):
                #  nothing to send: close the trail as succeeded and exit
                end_ts = datetime.utcnow().replace(tzinfo=utc)
                data = {
                    'status': 'succeeded',
                    'end_ts': end_ts.strftime(settings.SNAP_TS_FORMAT),
                    'error': 'snapshot already exists on the receiver',
                }
                msg = ('Failed to update replica status for snap_name: %s. '
                       'Aborting.' % self.snap_name)
                with self._clean_exit_handler(msg):
                    update_replica_status(self.rt2_id, data, logger)
                    self._sys_exit(0)

        snap_path = ('%s%s/.snapshots/%s/%s' %
                     (settings.MNT_PT, self.replica.pool, self.replica.share,
                      self.snap_name))
        cmd = [BTRFS, 'send', snap_path]
        if (self.rt is not None):
            #  a previous trail exists: send only the difference against its
            #  snapshot (-p names the parent)
            prev_snap = ('%s%s/.snapshots/%s/%s' %
                         (settings.MNT_PT, self.replica.pool,
                          self.replica.share, self.rt.snap_name))
            logger.info('Sending incremental replica between %s -- %s' %
                        (prev_snap, snap_path))
            cmd = [BTRFS, 'send', '-p', prev_snap, snap_path]
        else:
            logger.info('Sending full replica: %s' % snap_path)

        try:
            sp = subprocess.Popen(cmd, shell=False, stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE)
            #  make reads from the child's stdout non-blocking
            fcntl.fcntl(sp.stdout.fileno(), fcntl.F_SETFL, os.O_NONBLOCK)
        except Exception as e:
            #  two placeholders for two arguments; with a single %s the
            #  formatting itself raised TypeError inside this handler.
            msg = ('Failed to start the low level btrfs send '
                   'command(%s). Aborting. Exception: %s'
                   % (cmd, e.__str__()))
            logger.error(msg)
            with self._update_trail_and_quit(msg):
                self.pub.put('%sEND_FAIL' % self.snap_id)
            self._sys_exit(3)
Exemplo n.º 9
0
    def run(self):
        """Prepare a send to the receiver and launch ``btrfs send``.

        Steps, in order: connect a PUSH socket to the receiver's meta port,
        create the local replica trail, create the snapshot, push
        ``self.meta_begin`` and wait for the receiver's reply, then start the
        ``btrfs send`` subprocess (incremental when ``self.rt`` is set).

        If the receiver answers ``snap_exists`` the trail is marked
        ``succeeded`` and ``self._sys_exit(0)`` is called. If the subprocess
        cannot be started, an ``END_FAIL`` marker is queued on ``self.pub``
        and ``self._sys_exit(3)`` is called.
        """
        msg = ('Failed to connect to receiver(%s) on meta port'
               '(%d) for snap_name: %s. Aborting.' %
               (self.receiver_ip, self.rmeta_port, self.snap_name))
        with self._clean_exit_handler(msg):
            meta_push = self.ctx.socket(zmq.PUSH)
            meta_push.connect('tcp://%s:%d' %
                              (self.receiver_ip, self.rmeta_port))

        #  1. create a new replica trail if it's the very first time
        #  or if the last one succeeded
        msg = ('Failed to create local replica trail for snap_name:'
               ' %s. Aborting.' % self.snap_name)
        with self._clean_exit_handler(msg):
            self.rt2 = create_replica_trail(self.replica.id, self.snap_name,
                                            logger)
            self.rt2_id = self.rt2['id']

        #  2. create a snapshot only if it's not already from a previous
        #  failed attempt.
        msg = ('Failed to create snapshot: %s. Aborting.' % self.snap_name)
        with self._clean_exit_handler(msg):
            create_snapshot(self.replica.share, self.snap_name, logger)

        #  let the receiver know that following diff is coming
        msg = ('Failed to send initial metadata communication to the '
               'receiver(%s), most likely due to a network error. Aborting.' %
               self.receiver_ip)
        with self._update_trail_and_quit(msg):
            meta_push.send_json(self.meta_begin)

        msg = ('Timeout occured(60 seconds) while waiting for OK '
               'from the receiver(%s) to start sending data. Aborting.' %
               self.receiver_ip)
        with self._update_trail_and_quit(msg):
            ack = self._process_q()
            if (ack['msg'] == 'snap_exists'):
                #  nothing to send: close the trail as succeeded and exit
                end_ts = datetime.utcnow().replace(tzinfo=utc)
                data = {
                    'status': 'succeeded',
                    'end_ts': end_ts.strftime(settings.SNAP_TS_FORMAT),
                    'error': 'snapshot already exists on the receiver',
                }
                msg = ('Failed to update replica status for snap_name: %s. '
                       'Aborting.' % self.snap_name)
                with self._clean_exit_handler(msg):
                    update_replica_status(self.rt2_id, data, logger)
                    self._sys_exit(0)

        snap_path = ('%s%s/.snapshots/%s/%s' %
                     (settings.MNT_PT, self.replica.pool, self.replica.share,
                      self.snap_name))
        cmd = [BTRFS, 'send', snap_path]
        if (self.rt is not None):
            #  a previous trail exists: send only the difference against its
            #  snapshot (-p names the parent)
            prev_snap = ('%s%s/.snapshots/%s/%s' %
                         (settings.MNT_PT, self.replica.pool,
                          self.replica.share, self.rt.snap_name))
            logger.info('Sending incremental replica between %s -- %s' %
                        (prev_snap, snap_path))
            cmd = [BTRFS, 'send', '-p', prev_snap, snap_path]
        else:
            logger.info('Sending full replica: %s' % snap_path)

        try:
            sp = subprocess.Popen(cmd,
                                  shell=False,
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE)
            #  make reads from the child's stdout non-blocking
            fcntl.fcntl(sp.stdout.fileno(), fcntl.F_SETFL, os.O_NONBLOCK)
        except Exception as e:
            msg = ('Failed to start the low level btrfs send '
                   'command(%s). Aborting' % cmd)
            logger.error(msg)
            logger.exception(e)
            with self._update_trail_and_quit(msg):
                self.pub.put('%sEND_FAIL' % self.snap_id)
            self._sys_exit(3)
Exemplo n.º 10
0
        # Current UTC time rendered with settings.SNAP_TS_FORMAT; used as the
        # end (and, on error, send_failed) timestamp of the trail.
        end_ts = datetime.utcnow().replace(tzinfo=utc).strftime(
            settings.SNAP_TS_FORMAT)
        # Start from a "succeeded" record and downgrade it below if the
        # receiver reported an error.
        data = {
            'status': 'succeeded',
            'kb_sent': self.kb_sent / 1024,
            'end_ts': end_ts,
        }
        if (ack['msg'] == 'receive_error'):
            msg = ('Receiver(%s) returned a processing error for '
                   ' %s. Check it for more information.' %
                   (self.receiver_ip, self.snap_id))
            data['status'] = 'failed'
            data['error'] = msg
            data['send_failed'] = end_ts
        else:
            # Successful send: delete the snapshot returned by
            # get_oldest_snap, if there is one.
            # NOTE(review): the meaning of the literal 3 is not visible
            # here -- presumably the number of snapshots to keep; confirm.
            share_path = (
                '%s%s/.snapshots/%s' %
                (settings.MNT_PT, self.replica.pool, self.replica.share))
            oldest_snap = get_oldest_snap(share_path, 3)
            if (oldest_snap is not None):
                msg = ('Failed to delete snapshot: %s. Aborting.' %
                       oldest_snap)
                with self._clean_exit_handler(msg):
                    delete_snapshot(self.replica.share, oldest_snap, logger)

        # Persist the final outcome (succeeded or failed) on the trail.
        msg = ('Failed to update final replica status for %s'
               '. Aborting.' % self.snap_id)
        with self._clean_exit_handler(msg):
            update_replica_status(self.rt2_id, data, logger)
Exemplo n.º 11
0
    def run(self):
        """Run the replication scheduler loop until the parent process changes.

        Each iteration:

        * forwards everything queued in ``self.pubq`` over the PUB socket;
        * reads at most one meta message from the PULL socket, starting a
          ``Receiver`` for a ``begin`` message and putting any other message
          on the queue of the sender registered under its ``id``;
        * calls ``self._prune_workers``;
        * once ``total_sleep`` has reached 60 and fewer than 50 senders are
          tracked, walks the enabled replicas and starts a ``Sender`` where
          the state of the latest trail calls for one.

        Returns ``None``. Returns immediately (after logging) when the
        replication interface or the uuid cannot be determined.
        """
        try:
            self.rep_ip = self._replication_interface()
            self.uuid = self._my_uuid()
        except Exception as e:
            # Nothing can be bound without these two values: log and give up.
            msg = ('Failed to get replication interface or uuid. '
                   'Aborting.')
            logger.error(msg)
            logger.exception(e)
            return

        ctx = zmq.Context()
        #  fs diffs are sent via this publisher.
        rep_pub = ctx.socket(zmq.PUB)
        rep_pub.bind('tcp://%s:%d' % (self.rep_ip, self.data_port))

        #  synchronization messages are received in this pull socket
        meta_pull = ctx.socket(zmq.PULL)
        #  bounded receive wait: an idle socket raises zmq.error.Again
        #  (handled below) instead of blocking the loop.
        meta_pull.RCVTIMEO = 100
        meta_pull.bind('tcp://%s:%d' % (self.rep_ip, self.meta_port))

        total_sleep = 0
        while True:
            if (os.getppid() != self.ppid):
                logger.error('Parent exited. Aborting.')
                break

            #  flush everything the workers queued for publishing
            while (not self.pubq.empty()):
                msg = self.pubq.get()
                rep_pub.send(msg)

            #  check for any recv's coming
            try:
                self.recv_meta = meta_pull.recv_json()
                snap_id = self.recv_meta['id']
                logger.debug('meta received: %s' % self.recv_meta)
                if (self.recv_meta['msg'] == 'begin'):
                    logger.debug('begin received. meta: %s' % self.recv_meta)
                    rw = Receiver(self.recv_meta, Queue())
                    self.receivers[snap_id] = rw
                    rw.start()
                elif (snap_id not in self.senders):
                    logger.error('Unknown snap_id(%s) received. Ignoring' %
                                 snap_id)
                else:
                    self.senders[snap_id].q.put(self.recv_meta)
            except zmq.error.Again:
                #  nothing to read within the receive timeout
                pass

            self._prune_workers((self.receivers, self.senders))

            if (total_sleep >= 60 and len(self.senders) < 50):
                logger.debug('scanning for replicas')

                try:
                    for r in Replica.objects.filter(enabled=True):
                        #  trails of this replica, newest snapshot first
                        rt = ReplicaTrail.objects.filter(
                            replica=r).order_by('-snapshot_created')
                        now = datetime.utcnow().replace(second=0,
                                                        microsecond=0,
                                                        tzinfo=utc)
                        sw = None
                        snap_name = ('%s_replica_snap' % r.share)
                        if (len(rt) == 0):
                            #  no trail yet: first, full send
                            snap_name = ('%s_1' % snap_name)
                            logger.debug('new sender for snap: %s' % snap_name)
                            sw = Sender(r, self.rep_ip, self.pubq, Queue(),
                                        snap_name, self.meta_port,
                                        self.data_port, r.meta_port, self.uuid)
                        elif (rt[0].status == 'succeeded'):
                            snap_name = ('%s_%d' % (snap_name, rt[0].id + 1))
                            #  NOTE(review): r.frequency is compared to
                            #  seconds as-is; its unit is not visible here --
                            #  confirm.
                            if ((now - rt[0].end_ts).total_seconds() >
                                    r.frequency):
                                logger.debug(
                                    'incremental sender for snap: %s' %
                                    snap_name)
                                sw = Sender(r, self.rep_ip, self.pubq, Queue(),
                                            snap_name, self.meta_port,
                                            self.data_port, r.meta_port,
                                            self.uuid, rt[0])
                            else:
                                logger.debug('its not time yet for '
                                             'incremental sender for snap: '
                                             '%s' % snap_name)
                                continue
                        elif (rt[0].status == 'pending'):
                            prev_snap_id = ('%s_%s_%s_%s' %
                                            (self.rep_ip, r.pool, r.share,
                                             rt[0].snap_name))
                            if (prev_snap_id in self.senders):
                                logger.debug('send process ongoing for snap: '
                                             '%s' % snap_name)
                                continue
                            logger.debug('%s not found in senders. Previous '
                                         'sender must have Aborted. Marking '
                                         'it as failed' % prev_snap_id)
                            msg = ('Sender process Aborted. See logs for '
                                   'more information')
                            data = {
                                'status': 'failed',
                                'end_ts': now,
                                'error': msg,
                                'send_failed': now,
                            }
                            update_replica_status(rt[0].id, data, logger)
                            continue
                        elif (rt[0].status == 'failed'):
                            #  retry with the snapshot name of the failed trail
                            snap_name = rt[0].snap_name
                            #  count the leading failed trails (not older than
                            #  r.ts); disable the replica once MAX_ATTEMPTS
                            #  is reached
                            num_tries = 0
                            MAX_ATTEMPTS = 10
                            for rto in rt:
                                if (rto.status != 'failed'
                                        or num_tries >= MAX_ATTEMPTS
                                        or rto.end_ts < r.ts):
                                    break
                                num_tries = num_tries + 1
                            if (num_tries >= MAX_ATTEMPTS):
                                logger.info('Maximum attempts(%d) reached '
                                            'for snap: %s. Disabling the '
                                            'replica.' %
                                            (MAX_ATTEMPTS, snap_name))
                                disable_replica(r.id, logger)
                                continue
                            logger.info('previous backup failed for snap: '
                                        '%s. Starting a new one. Attempt '
                                        '%d/%d.' %
                                        (snap_name, num_tries, MAX_ATTEMPTS))
                            #  first succeeded trail in rt, if any, is handed
                            #  to the Sender as its previous trail
                            prev_rt = None
                            for rto in rt:
                                if (rto.status == 'succeeded'):
                                    prev_rt = rto
                                    break
                            sw = Sender(r, self.rep_ip, self.pubq, Queue(),
                                        snap_name, self.meta_port,
                                        self.data_port, r.meta_port, self.uuid,
                                        prev_rt)
                        else:
                            logger.error('unknown replica trail status: %s. '
                                         'ignoring snap: %s' %
                                         (rt[0].status, snap_name))
                            continue
                        #  register the new sender under its snap id and
                        #  start it as a daemon
                        snap_id = ('%s_%s_%s_%s' %
                                   (self.rep_ip, r.pool, r.share, snap_name))
                        self.senders[snap_id] = sw
                        sw.daemon = True
                        sw.start()
                    #  reset only after a complete scan; after a
                    #  DatabaseError the scan is retried on the next iteration
                    total_sleep = 0
                except DatabaseError as e:
                    e_msg = ('Error getting the list of enabled replica '
                             'tasks. Moving on')
                    logger.error(e_msg)
                    logger.exception(e)

            time.sleep(1)
            total_sleep = total_sleep + 1
Exemplo n.º 12
0
     prev_snap_id = ('%s_%s_%s_%s' % (self.uuid,
                     r.pool, r.share, rt[0].snap_name))
     if (prev_snap_id in self.senders):
         logger.debug('send process ongoing for snap: '
                      '%s' % prev_snap_id)
         continue
     logger.debug('%s not found in senders. Previous '
                  'sender must have Aborted. Marking '
                  'it as failed' % prev_snap_id)
     msg = ('Sender process Aborted. See logs for '
            'more information')
     data = {'status': 'failed',
             'end_ts': now.strftime(settings.SNAP_TS_FORMAT),
             'error': msg,
             'send_failed': now, }
     update_replica_status(rt[0].id, data, logger)
     continue
 elif (rt[0].status == 'failed'):
     snap_name = rt[0].snap_name
     #  if num_failed attempts > 10, disable the replica
     num_tries = 0
     for rto in rt:
         if (rto.status != 'failed' or
             num_tries >= self.MAX_ATTEMPTS or
             rto.end_ts < r.ts):
             break
         num_tries = num_tries + 1
     if (num_tries >= self.MAX_ATTEMPTS):
         logger.info('Maximum attempts(%d) reached '
                     'for snap: %s. Disabling the '
                     'replica.' %
Exemplo n.º 13
0
    def run(self):
        """Run the replication broker loop until the parent process exits.

        Resolves the replication interface and this appliance's uuid, binds a
        PUB socket on ``data_port`` and a PULL socket on ``meta_port``, then
        loops roughly once per second doing the following:

        * forward everything queued in ``self.pubq`` to the PUB socket;
        * read up to 1000 pending meta messages: a ``begin`` message starts
          a new ``Receiver``, anything else is routed to the queue of the
          ``Sender`` registered under the message id;
        * prune finished workers, and once an hour prune the stored
          receive/replica trails;
        * once ``total_sleep`` reaches 60 and fewer than 50 senders are
          registered, walk the replicas returned by ``get_replicas`` and
          start a ``Sender`` for each one that needs it.

        Returns ``None``. If the interface or uuid cannot be determined, the
        failure is logged and the method returns before any socket is bound.
        """
        try:
            self.rep_ip = self._replication_interface()
            self.uuid = self._my_uuid()
        except Exception as e:
            # Without the interface and uuid nothing below can work, so log
            # the cause and give up.
            logger.error("Failed to get replication interface or uuid. Aborting.")
            logger.exception(e)
            return

        ctx = zmq.Context()
        #  fs diffs are sent via this publisher.
        rep_pub = ctx.socket(zmq.PUB)
        rep_pub.bind("tcp://%s:%d" % (self.rep_ip, self.data_port))

        #  synchronization messages are received in this pull socket
        meta_pull = ctx.socket(zmq.PULL)
        # Receive timeout, so recv_json() raises zmq.error.Again instead of
        # blocking the loop when no message is pending.
        meta_pull.RCVTIMEO = 100
        meta_pull.bind("tcp://%s:%d" % (self.rep_ip, self.meta_port))

        total_sleep = 0
        while True:
            if os.getppid() != self.ppid:
                logger.error("Parent exited. Aborting.")
                break

            # Drain everything the workers queued for publication.
            while not self.pubq.empty():
                msg = self.pubq.get()
                rep_pub.send(msg)

            #  check for any recv's coming; capped at 1000 per iteration so
            #  the rest of the loop still gets to run.
            num_msgs = 0
            while num_msgs < 1000:
                try:
                    self.recv_meta = meta_pull.recv_json()
                    num_msgs += 1
                    snap_id = self.recv_meta["id"]
                    if self.recv_meta["msg"] == "begin":
                        rw = Receiver(self.recv_meta)
                        self.receivers[snap_id] = rw
                        rw.start()
                    elif snap_id not in self.senders:
                        logger.error("Unknown snap_id(%s) received. Ignoring" % snap_id)
                    else:
                        self.senders[snap_id].q.put(self.recv_meta)
                except zmq.error.Again:
                    # Nothing (more) to read within the receive timeout.
                    break

            self._prune_workers((self.receivers, self.senders))

            # Trail pruning runs at most once an hour.
            if int(time.time()) - self.prune_time > 3600:
                self.prune_time = int(time.time())
                for rs in ReplicaShare.objects.all():
                    prune_receive_trail(rs.id, logger)
                for r in Replica.objects.all():
                    prune_replica_trail(r.id, logger)

            if total_sleep >= 60 and len(self.senders) < 50:
                try:
                    for r in get_replicas(logger):
                        rt = get_replica_trail(r.id, logger)
                        now = datetime.utcnow().replace(second=0, microsecond=0, tzinfo=utc)
                        sw = None
                        # The new snapshot name is derived from the highest
                        # existing ReplicaTrail id, plus one.
                        snap_name = "replication"
                        rt2 = ReplicaTrail.objects.filter().order_by("-id")
                        if len(rt2) != 0:
                            snap_name = "%s_%d" % (snap_name, rt2[0].id + 1)
                        else:
                            snap_name = "%s_1" % snap_name
                        snap_id = "%s_%s_%s_%s" % (self.uuid, r.pool, r.share, snap_name)
                        if len(rt) == 0:
                            # No trail yet: first send for this replica.
                            logger.debug("new sender for snap: %s" % snap_id)
                            sw = Sender(
                                r,
                                self.rep_ip,
                                self.pubq,
                                Queue(),
                                snap_name,
                                self.meta_port,
                                self.data_port,
                                r.meta_port,
                                self.uuid,
                                snap_id,
                            )
                        elif rt[0].status == "succeeded":
                            # Only send again once r.frequency minutes have
                            # passed since the last successful send ended.
                            if (now - rt[0].end_ts).total_seconds() > (r.frequency * 60):
                                logger.debug("incremental sender for snap: %s" % snap_id)
                                sw = Sender(
                                    r,
                                    self.rep_ip,
                                    self.pubq,
                                    Queue(),
                                    snap_name,
                                    self.meta_port,
                                    self.data_port,
                                    r.meta_port,
                                    self.uuid,
                                    snap_id,
                                    rt[0],
                                )
                            else:
                                continue
                        elif rt[0].status == "pending":
                            prev_snap_id = "%s_%s_%s_%s" % (self.uuid, r.pool, r.share, rt[0].snap_name)
                            if prev_snap_id in self.senders:
                                # Report the sender that is actually running,
                                # not the id generated for a new snapshot.
                                logger.debug("send process ongoing for snap: %s" % prev_snap_id)
                                continue
                            logger.debug(
                                "%s not found in senders. Previous "
                                "sender must have Aborted. Marking "
                                "it as failed" % prev_snap_id
                            )
                            msg = "Sender process Aborted. See logs for more information"
                            data = {
                                "status": "failed",
                                "end_ts": now.strftime(settings.SNAP_TS_FORMAT),
                                "error": msg,
                                "send_failed": now,
                            }
                            update_replica_status(rt[0].id, data, logger)
                            continue
                        elif rt[0].status == "failed":
                            # Retry under the name of the failed snapshot.
                            # NOTE(review): snap_id was built from the newly
                            # generated name above and is not recomputed
                            # here, so the key used in self.senders may not
                            # match the prev_snap_id computed by the
                            # "pending" branch on a later pass - confirm
                            # against Sender.
                            snap_name = rt[0].snap_name
                            #  Count the leading run of failed trail entries
                            #  that ended at or after r.ts; once it reaches
                            #  MAX_ATTEMPTS the replica is disabled.
                            num_tries = 0
                            for rto in rt:
                                if rto.status != "failed" or num_tries >= self.MAX_ATTEMPTS or rto.end_ts < r.ts:
                                    break
                                num_tries += 1
                            if num_tries >= self.MAX_ATTEMPTS:
                                logger.info(
                                    "Maximum attempts(%d) reached "
                                    "for snap: %s. Disabling the "
                                    "replica." % (self.MAX_ATTEMPTS, snap_id)
                                )
                                disable_replica(r.id, logger)
                                continue
                            logger.info(
                                "previous backup failed for snap: "
                                "%s. Starting a new one. Attempt "
                                "%d/%d." % (snap_id, num_tries, self.MAX_ATTEMPTS)
                            )
                            # First succeeded entry in the trail, if any, is
                            # handed to the Sender as the previous trail.
                            prev_rt = None
                            for rto in rt:
                                if rto.status == "succeeded":
                                    prev_rt = rto
                                    break
                            sw = Sender(
                                r,
                                self.rep_ip,
                                self.pubq,
                                Queue(),
                                snap_name,
                                self.meta_port,
                                self.data_port,
                                r.meta_port,
                                self.uuid,
                                snap_id,
                                prev_rt,
                            )
                        else:
                            logger.error(
                                "unknown replica trail status: %s. ignoring snap: %s" % (rt[0].status, snap_id)
                            )
                            continue
                        self.senders[snap_id] = sw
                        sw.daemon = True
                        sw.start()
                    # Reset only after a complete pass; on DatabaseError the
                    # scheduling pass is retried on the next iteration.
                    total_sleep = 0
                except DatabaseError as e:
                    e_msg = "Error getting the list of enabled replica tasks. Moving on"
                    logger.error(e_msg)
                    logger.exception(e)

            time.sleep(1)
            total_sleep += 1