def run(self):
    """Main loop of the replication scheduler.

    Resolves the replication interface and appliance uuid, binds the zmq
    PUB (fs diffs out) and PULL (sync messages in) sockets, then loops:
    relaying queued publishes, dispatching incoming messages to
    Receiver/Sender workers, pruning trail records hourly, and starting
    new/retry senders roughly once a minute. Exits when the parent
    process goes away or the interface/uuid cannot be determined.
    """
    while True:
        try:
            self.rep_ip = self._replication_interface()
            self.uuid = self._my_uuid()
            break
        except Exception as e:
            # Narrowed from a bare except: so SystemExit/KeyboardInterrupt
            # still propagate; also record the actual cause before aborting.
            logger.exception(e)
            msg = ('Failed to get replication interface or uuid. '
                   'Aborting.')
            return logger.error(msg)

    ctx = zmq.Context()
    # fs diffs are sent via this publisher.
    rep_pub = ctx.socket(zmq.PUB)
    rep_pub.bind('tcp://%s:%d' % (self.rep_ip, self.data_port))

    # synchronization messages are received in this pull socket
    meta_pull = ctx.socket(zmq.PULL)
    # 100ms receive timeout: recv_json raises zmq.error.Again when idle.
    meta_pull.RCVTIMEO = 100
    meta_pull.bind('tcp://%s:%d' % (self.rep_ip, self.meta_port))

    total_sleep = 0
    while True:
        if (os.getppid() != self.ppid):
            # Parent process died; no point running orphaned.
            logger.error('Parent exited. Aborting.')
            break
        # Drain fs-diff messages produced by sender workers.
        while (not self.pubq.empty()):
            msg = self.pubq.get()
            rep_pub.send(msg)
        # check for any recv's coming; cap at 1000 per pass so a chatty
        # peer cannot starve the rest of the loop.
        num_msgs = 0
        while (num_msgs < 1000):
            try:
                self.recv_meta = meta_pull.recv_json()
                num_msgs = num_msgs + 1
                snap_id = self.recv_meta['id']
                if (self.recv_meta['msg'] == 'begin'):
                    # Peer starts a transfer: spawn a Receiver for it.
                    rw = Receiver(self.recv_meta)
                    self.receivers[snap_id] = rw
                    rw.start()
                elif (snap_id not in self.senders):
                    logger.error('Unknown snap_id(%s) received. '
                                 'Ignoring' % snap_id)
                else:
                    # Route the message to the sender owning this snap.
                    self.senders[snap_id].q.put(self.recv_meta)
            except zmq.error.Again:
                # Receive timed out (RCVTIMEO): nothing pending right now.
                break
        self._prune_workers((self.receivers, self.senders))
        # Hourly: trim old trail records so they don't grow unbounded.
        if (int(time.time()) - self.prune_time > 3600):
            self.prune_time = int(time.time())
            for rs in ReplicaShare.objects.all():
                prune_receive_trail(rs.id, logger)
            for r in Replica.objects.all():
                prune_replica_trail(r.id, logger)
        # Roughly once a minute, and only below the 50-sender cap, look
        # for replicas that are due for a new, incremental or retry send.
        if (total_sleep >= 60 and len(self.senders) < 50):
            try:
                for r in get_replicas(logger):
                    rt = get_replica_trail(r.id, logger)
                    now = datetime.utcnow().replace(second=0,
                                                    microsecond=0,
                                                    tzinfo=utc)
                    sw = None
                    snap_name = 'replication'
                    rt2 = ReplicaTrail.objects.filter().order_by('-id')
                    if (len(rt2) != 0):
                        snap_name = ('%s_%d' % (snap_name, rt2[0].id + 1))
                    else:
                        snap_name = ('%s_1' % snap_name)
                    snap_id = ('%s_%s_%s_%s' %
                               (self.uuid, r.pool, r.share, snap_name))
                    if (len(rt) == 0):
                        # Never replicated before: full send.
                        logger.debug('new sender for snap: %s' % snap_id)
                        sw = Sender(r, self.rep_ip, self.pubq, Queue(),
                                    snap_name, self.meta_port,
                                    self.data_port, r.meta_port,
                                    self.uuid, snap_id)
                    elif (rt[0].status == 'succeeded'):
                        if (((now - rt[0].end_ts).total_seconds() >
                             (r.frequency * 60))):
                            logger.debug('incremental sender for snap: %s'
                                         % snap_id)
                            sw = Sender(r, self.rep_ip, self.pubq,
                                        Queue(), snap_name,
                                        self.meta_port, self.data_port,
                                        r.meta_port, self.uuid, snap_id,
                                        rt[0])
                        else:
                            # Not due yet.
                            continue
                    elif (rt[0].status == 'pending'):
                        prev_snap_id = ('%s_%s_%s_%s' %
                                        (self.uuid, r.pool, r.share,
                                         rt[0].snap_name))
                        if (prev_snap_id in self.senders):
                            logger.debug('send process ongoing for snap: '
                                         '%s' % snap_id)
                            continue
                        # Pending trail with no live sender: the previous
                        # sender died, so mark the trail failed.
                        logger.debug('%s not found in senders. Previous '
                                     'sender must have Aborted. Marking '
                                     'it as failed' % prev_snap_id)
                        msg = ('Sender process Aborted. See logs for '
                               'more information')
                        data = {'status': 'failed',
                                'end_ts': now.strftime(
                                    settings.SNAP_TS_FORMAT),
                                'error': msg,
                                'send_failed': now, }
                        update_replica_status(rt[0].id, data, logger)
                        continue
                    elif (rt[0].status == 'failed'):
                        snap_name = rt[0].snap_name
                        # if num_failed attempts > MAX_ATTEMPTS, disable
                        # the replica instead of retrying forever.
                        num_tries = 0
                        for rto in rt:
                            if (rto.status != 'failed' or
                                    num_tries >= self.MAX_ATTEMPTS or
                                    rto.end_ts < r.ts):
                                break
                            num_tries = num_tries + 1
                        if (num_tries >= self.MAX_ATTEMPTS):
                            logger.info('Maximum attempts(%d) reached '
                                        'for snap: %s. Disabling the '
                                        'replica.' % (self.MAX_ATTEMPTS,
                                                      snap_id))
                            disable_replica(r.id, logger)
                            continue
                        logger.info('previous backup failed for snap: '
                                    '%s. Starting a new one. Attempt '
                                    '%d/%d.' % (snap_id, num_tries,
                                                self.MAX_ATTEMPTS))
                        # Base the retry on the last successful trail.
                        prev_rt = None
                        for rto in rt:
                            if (rto.status == 'succeeded'):
                                prev_rt = rto
                                break
                        sw = Sender(r, self.rep_ip, self.pubq, Queue(),
                                    snap_name, self.meta_port,
                                    self.data_port, r.meta_port,
                                    self.uuid, snap_id, prev_rt)
                    else:
                        logger.error('unknown replica trail status: %s. '
                                     'ignoring snap: %s' %
                                     (rt[0].status, snap_id))
                        continue
                    self.senders[snap_id] = sw
                    sw.daemon = True
                    sw.start()
                    total_sleep = 0
            except DatabaseError as e:
                # Py3-compatible except syntax (was "except DatabaseError, e").
                e_msg = ('Error getting the list of enabled replica '
                         'tasks. Moving on')
                logger.error(e_msg)
                logger.exception(e)
        time.sleep(1)
        total_sleep = total_sleep + 1
def run(self):
    """Replication scheduler main loop.

    Determines the replication IP and appliance uuid, binds a zmq PUB
    socket (outgoing fs diffs) and a PULL socket (incoming sync
    messages), then services both continuously while pruning stale
    trail records hourly and launching new/incremental/retry Sender
    workers about once a minute. Aborts when the parent process exits.
    """
    while True:
        try:
            self.rep_ip = self._replication_interface()
            self.uuid = self._my_uuid()
            break
        except Exception as e:
            # Was a bare except: — narrowed so process-control exceptions
            # propagate, and the root cause is logged before aborting.
            logger.exception(e)
            msg = ('Failed to get replication interface or uuid. '
                   'Aborting.')
            return logger.error(msg)

    ctx = zmq.Context()
    # fs diffs are sent via this publisher.
    rep_pub = ctx.socket(zmq.PUB)
    rep_pub.bind('tcp://%s:%d' % (self.rep_ip, self.data_port))

    # synchronization messages are received in this pull socket
    meta_pull = ctx.socket(zmq.PULL)
    # 100ms timeout: recv_json raises zmq.error.Again when there is
    # nothing to read, letting the loop move on.
    meta_pull.RCVTIMEO = 100
    meta_pull.bind('tcp://%s:%d' % (self.rep_ip, self.meta_port))

    total_sleep = 0
    while True:
        if (os.getppid() != self.ppid):
            # Orphaned: the supervising process is gone.
            logger.error('Parent exited. Aborting.')
            break
        # Relay any queued fs-diff messages from sender workers.
        while (not self.pubq.empty()):
            msg = self.pubq.get()
            rep_pub.send(msg)
        # check for any recv's coming; bounded at 1000 per pass so
        # message processing cannot monopolize the loop.
        num_msgs = 0
        while (num_msgs < 1000):
            try:
                self.recv_meta = meta_pull.recv_json()
                num_msgs = num_msgs + 1
                snap_id = self.recv_meta['id']
                if (self.recv_meta['msg'] == 'begin'):
                    # A peer is starting a transfer: spawn a Receiver.
                    rw = Receiver(self.recv_meta)
                    self.receivers[snap_id] = rw
                    rw.start()
                elif (snap_id not in self.senders):
                    logger.error('Unknown snap_id(%s) received. '
                                 'Ignoring' % snap_id)
                else:
                    # Hand the message to the sender that owns this snap.
                    self.senders[snap_id].q.put(self.recv_meta)
            except zmq.error.Again:
                # Receive timed out: queue drained for now.
                break
        self._prune_workers((self.receivers, self.senders))
        # Once an hour, delete aged trail records.
        if (int(time.time()) - self.prune_time > 3600):
            self.prune_time = int(time.time())
            for rs in ReplicaShare.objects.all():
                prune_receive_trail(rs.id, logger)
            for r in Replica.objects.all():
                prune_replica_trail(r.id, logger)
        # About once a minute, and only while under the 50-sender cap,
        # decide which replicas need a sender started.
        if (total_sleep >= 60 and len(self.senders) < 50):
            try:
                for r in get_replicas(logger):
                    rt = get_replica_trail(r.id, logger)
                    now = datetime.utcnow().replace(second=0,
                                                    microsecond=0,
                                                    tzinfo=utc)
                    sw = None
                    snap_name = 'replication'
                    rt2 = ReplicaTrail.objects.filter().order_by('-id')
                    if (len(rt2) != 0):
                        snap_name = ('%s_%d' % (snap_name,
                                                rt2[0].id + 1))
                    else:
                        snap_name = ('%s_1' % snap_name)
                    snap_id = ('%s_%s_%s_%s' % (self.uuid, r.pool,
                                                r.share, snap_name))
                    if (len(rt) == 0):
                        # First replication of this share: full send.
                        logger.debug('new sender for snap: %s' % snap_id)
                        sw = Sender(r, self.rep_ip, self.pubq, Queue(),
                                    snap_name, self.meta_port,
                                    self.data_port, r.meta_port,
                                    self.uuid, snap_id)
                    elif (rt[0].status == 'succeeded'):
                        if (((now - rt[0].end_ts).total_seconds() >
                             (r.frequency * 60))):
                            logger.debug(
                                'incremental sender for snap: %s'
                                % snap_id)
                            sw = Sender(r, self.rep_ip, self.pubq,
                                        Queue(), snap_name,
                                        self.meta_port, self.data_port,
                                        r.meta_port, self.uuid,
                                        snap_id, rt[0])
                        else:
                            # Not due for another send yet.
                            continue
                    elif (rt[0].status == 'pending'):
                        prev_snap_id = (
                            '%s_%s_%s_%s' % (self.uuid, r.pool, r.share,
                                             rt[0].snap_name))
                        if (prev_snap_id in self.senders):
                            logger.debug('send process ongoing for snap: '
                                         '%s' % snap_id)
                            continue
                        # Pending with no live sender: previous sender
                        # died, so record the trail as failed.
                        logger.debug('%s not found in senders. Previous '
                                     'sender must have Aborted. Marking '
                                     'it as failed' % prev_snap_id)
                        msg = ('Sender process Aborted. See logs for '
                               'more information')
                        data = {
                            'status': 'failed',
                            'end_ts': now.strftime(
                                settings.SNAP_TS_FORMAT),
                            'error': msg,
                            'send_failed': now, }
                        update_replica_status(rt[0].id, data, logger)
                        continue
                    elif (rt[0].status == 'failed'):
                        snap_name = rt[0].snap_name
                        # if num_failed attempts > MAX_ATTEMPTS, disable
                        # the replica rather than retry indefinitely.
                        num_tries = 0
                        for rto in rt:
                            if (rto.status != 'failed' or
                                    num_tries >= self.MAX_ATTEMPTS or
                                    rto.end_ts < r.ts):
                                break
                            num_tries = num_tries + 1
                        if (num_tries >= self.MAX_ATTEMPTS):
                            logger.info('Maximum attempts(%d) reached '
                                        'for snap: %s. Disabling the '
                                        'replica.' % (self.MAX_ATTEMPTS,
                                                      snap_id))
                            disable_replica(r.id, logger)
                            continue
                        logger.info(
                            'previous backup failed for snap: '
                            '%s. Starting a new one. Attempt '
                            '%d/%d.' % (snap_id, num_tries,
                                        self.MAX_ATTEMPTS))
                        # Retry incrementally from the last success.
                        prev_rt = None
                        for rto in rt:
                            if (rto.status == 'succeeded'):
                                prev_rt = rto
                                break
                        sw = Sender(r, self.rep_ip, self.pubq, Queue(),
                                    snap_name, self.meta_port,
                                    self.data_port, r.meta_port,
                                    self.uuid, snap_id, prev_rt)
                    else:
                        logger.error('unknown replica trail status: %s. '
                                     'ignoring snap: %s' %
                                     (rt[0].status, snap_id))
                        continue
                    self.senders[snap_id] = sw
                    sw.daemon = True
                    sw.start()
                    total_sleep = 0
            except DatabaseError as e:
                # Python 3 compatible ("except DatabaseError, e" is Py2-only).
                e_msg = ('Error getting the list of enabled replica '
                         'tasks. Moving on')
                logger.error(e_msg)
                logger.exception(e)
        time.sleep(1)
        total_sleep = total_sleep + 1
# NOTE(review): incomplete fragment of a newer run() loop variant — it opens
# mid-suite (the `else:` belongs to a try/if whose start is not in view) and
# references attributes the other copies hard-code (t0, trail_prune_interval,
# sender_check_interval, max_senders). Left byte-identical; presumably the
# rest of this variant lives elsewhere — TODO confirm before reconciling.
else: self.senders[snap_id].q.put(self.recv_meta) except zmq.error.Again: #recv_json throws this exception if nothing is received #for 100 milliseconds. Break, do other stuff and come back here. break self._prune_workers((self.receivers, self.senders)) if (int(time.time()) - self.prune_time > self.trail_prune_interval): #trail objects keep accumulating and may grow to be quite large. #so once every trail_prune_interval seconds, deleted the old ones #by default the prune helper methods delete records older than 7 days self.prune_time = int(time.time()) #reset for rs in ReplicaShare.objects.all(): prune_receive_trail(rs.id, logger) for r in Replica.objects.all(): prune_replica_trail(r.id, logger) #seconds spent processing messages at once. should be counted as #part of sleep time so new senders are not stalled. total_sleep += int(time.time() - t0) if (total_sleep >= self.sender_check_interval and len(self.senders) < self.max_senders): total_sleep = 0 #reset #check to see if we can start any new senders. try: for r in get_replicas(logger): rt = get_replica_trail(r.id, logger) now = datetime.utcnow().replace(second=0, microsecond=0,
def run(self):
    """Drive the replication scheduler.

    Sets up the replication endpoint (interface IP + appliance uuid),
    binds zmq PUB/PULL sockets, then loops forever: forwarding queued
    fs-diff publishes, dispatching incoming sync messages to Receiver
    and Sender workers, pruning trail records hourly, and spawning
    new/incremental/retry senders about once a minute. Stops when the
    parent process disappears.
    """
    while True:
        try:
            self.rep_ip = self._replication_interface()
            self.uuid = self._my_uuid()
            break
        except Exception as e:
            # Narrowed from a bare except: — keeps SystemExit and
            # KeyboardInterrupt propagating and logs the actual error.
            logger.exception(e)
            msg = "Failed to get replication interface or uuid. " "Aborting."
            return logger.error(msg)
    ctx = zmq.Context()
    # fs diffs are sent via this publisher.
    rep_pub = ctx.socket(zmq.PUB)
    rep_pub.bind("tcp://%s:%d" % (self.rep_ip, self.data_port))
    # synchronization messages are received in this pull socket
    meta_pull = ctx.socket(zmq.PULL)
    # 100ms timeout: recv_json raises zmq.error.Again when idle.
    meta_pull.RCVTIMEO = 100
    meta_pull.bind("tcp://%s:%d" % (self.rep_ip, self.meta_port))
    total_sleep = 0
    while True:
        if os.getppid() != self.ppid:
            # Parent is gone; do not linger as an orphan.
            logger.error("Parent exited. Aborting.")
            break
        # Relay fs-diff messages queued by sender workers.
        while not self.pubq.empty():
            msg = self.pubq.get()
            rep_pub.send(msg)
        # check for any recv's coming; capped at 1000 per pass so the
        # rest of the loop still runs under heavy traffic.
        num_msgs = 0
        while num_msgs < 1000:
            try:
                self.recv_meta = meta_pull.recv_json()
                num_msgs = num_msgs + 1
                snap_id = self.recv_meta["id"]
                if self.recv_meta["msg"] == "begin":
                    # Peer initiates a transfer: start a Receiver.
                    rw = Receiver(self.recv_meta)
                    self.receivers[snap_id] = rw
                    rw.start()
                elif snap_id not in self.senders:
                    logger.error("Unknown snap_id(%s) received. " "Ignoring" % snap_id)
                else:
                    # Forward to the sender responsible for this snap.
                    self.senders[snap_id].q.put(self.recv_meta)
            except zmq.error.Again:
                # Receive timed out; nothing more pending right now.
                break
        self._prune_workers((self.receivers, self.senders))
        # Hourly cleanup of accumulated trail records.
        if int(time.time()) - self.prune_time > 3600:
            self.prune_time = int(time.time())
            for rs in ReplicaShare.objects.all():
                prune_receive_trail(rs.id, logger)
            for r in Replica.objects.all():
                prune_replica_trail(r.id, logger)
        # Roughly once a minute, while under the 50-sender cap, look for
        # replicas that are due to be sent.
        if total_sleep >= 60 and len(self.senders) < 50:
            try:
                for r in get_replicas(logger):
                    rt = get_replica_trail(r.id, logger)
                    now = datetime.utcnow().replace(second=0, microsecond=0, tzinfo=utc)
                    sw = None
                    snap_name = "replication"
                    rt2 = ReplicaTrail.objects.filter().order_by("-id")
                    if len(rt2) != 0:
                        snap_name = "%s_%d" % (snap_name, rt2[0].id + 1)
                    else:
                        snap_name = "%s_1" % snap_name
                    snap_id = "%s_%s_%s_%s" % (self.uuid, r.pool, r.share, snap_name)
                    if len(rt) == 0:
                        # No trail yet: full send.
                        logger.debug("new sender for snap: %s" % snap_id)
                        sw = Sender(
                            r,
                            self.rep_ip,
                            self.pubq,
                            Queue(),
                            snap_name,
                            self.meta_port,
                            self.data_port,
                            r.meta_port,
                            self.uuid,
                            snap_id,
                        )
                    elif rt[0].status == "succeeded":
                        if (now - rt[0].end_ts).total_seconds() > (r.frequency * 60):
                            logger.debug("incremental sender for snap: %s" % snap_id)
                            sw = Sender(
                                r,
                                self.rep_ip,
                                self.pubq,
                                Queue(),
                                snap_name,
                                self.meta_port,
                                self.data_port,
                                r.meta_port,
                                self.uuid,
                                snap_id,
                                rt[0],
                            )
                        else:
                            # Not due yet.
                            continue
                    elif rt[0].status == "pending":
                        prev_snap_id = "%s_%s_%s_%s" % (self.uuid, r.pool, r.share, rt[0].snap_name)
                        if prev_snap_id in self.senders:
                            logger.debug("send process ongoing for snap: " "%s" % snap_id)
                            continue
                        # Pending trail but no live sender: the previous
                        # sender died, so fail the trail.
                        logger.debug(
                            "%s not found in senders. Previous "
                            "sender must have Aborted. Marking "
                            "it as failed" % prev_snap_id
                        )
                        msg = "Sender process Aborted. See logs for " "more information"
                        data = {
                            "status": "failed",
                            "end_ts": now.strftime(settings.SNAP_TS_FORMAT),
                            "error": msg,
                            "send_failed": now,
                        }
                        update_replica_status(rt[0].id, data, logger)
                        continue
                    elif rt[0].status == "failed":
                        snap_name = rt[0].snap_name
                        # if num_failed attempts > MAX_ATTEMPTS, disable
                        # the replica instead of retrying forever.
                        num_tries = 0
                        for rto in rt:
                            if rto.status != "failed" or num_tries >= self.MAX_ATTEMPTS or rto.end_ts < r.ts:
                                break
                            num_tries = num_tries + 1
                        if num_tries >= self.MAX_ATTEMPTS:
                            logger.info(
                                "Maximum attempts(%d) reached "
                                "for snap: %s. Disabling the "
                                "replica." % (self.MAX_ATTEMPTS, snap_id)
                            )
                            disable_replica(r.id, logger)
                            continue
                        logger.info(
                            "previous backup failed for snap: "
                            "%s. Starting a new one. Attempt "
                            "%d/%d." % (snap_id, num_tries, self.MAX_ATTEMPTS)
                        )
                        # Retry based on the most recent success.
                        prev_rt = None
                        for rto in rt:
                            if rto.status == "succeeded":
                                prev_rt = rto
                                break
                        sw = Sender(
                            r,
                            self.rep_ip,
                            self.pubq,
                            Queue(),
                            snap_name,
                            self.meta_port,
                            self.data_port,
                            r.meta_port,
                            self.uuid,
                            snap_id,
                            prev_rt,
                        )
                    else:
                        logger.error(
                            "unknown replica trail status: %s. "
                            "ignoring snap: %s" % (rt[0].status, snap_id)
                        )
                        continue
                    self.senders[snap_id] = sw
                    sw.daemon = True
                    sw.start()
                    total_sleep = 0
            except DatabaseError as e:
                # Py3-compatible except syntax (was "except DatabaseError, e").
                e_msg = "Error getting the list of enabled replica " "tasks. Moving on"
                logger.error(e_msg)
                logger.exception(e)
        time.sleep(1)
        total_sleep = total_sleep + 1