self._sys_exit(3) msg = ( "Timeout occured(60 seconds) while waiting for final " "send confirmation from the receiver(%s) for %s. Aborting." % (self.receiver_ip, self.snap_id) ) with self._update_trail_and_quit(msg): ack = self._process_q() end_ts = datetime.utcnow().replace(tzinfo=utc) data = {"status": "succeeded", "kb_sent": self.kb_sent / 1024, "end_ts": end_ts} if ack["msg"] == "receive_error": msg = "Receiver(%s) returned a processing error for " " %s. Check it for more information." % ( self.receiver_ip, self.snap_id, ) data["status"] = "failed" data["error"] = msg data["send_failed"] = end_ts else: share_path = "%s%s/.snapshots/%s" % (settings.MNT_PT, self.replica.pool, self.replica.share) oldest_snap = get_oldest_snap(share_path, 3) if oldest_snap is not None: msg = "Failed to delete snapshot: %s. Aborting." % oldest_snap with self._clean_exit_handler(msg): delete_snapshot(self.replica.share, oldest_snap, logger) msg = "Failed to update final replica status for %s" ". Aborting." % self.snap_id with self._clean_exit_handler(msg): update_replica_status(self.rt2_id, data, logger)
end_ts = datetime.utcnow().replace(tzinfo=utc).strftime( settings.SNAP_TS_FORMAT) data = { 'status': 'succeeded', 'kb_sent': self.kb_sent / 1024, 'end_ts': end_ts, } if (ack['msg'] == 'receive_error'): msg = ('Receiver(%s) returned a processing error for ' ' %s. Check it for more information.' % (self.receiver_ip, self.snap_id)) data['status'] = 'failed' data['error'] = msg data['send_failed'] = end_ts else: share_path = ( '%s%s/.snapshots/%s' % (settings.MNT_PT, self.replica.pool, self.replica.share)) oldest_snap = get_oldest_snap(share_path, 3) if (oldest_snap is not None): msg = ('Failed to delete snapshot: %s. Aborting.' % oldest_snap) with self._clean_exit_handler(msg): delete_snapshot(self.replica.share, oldest_snap, logger) msg = ('Failed to update final replica status for %s' '. Aborting.' % self.snap_id) with self._clean_exit_handler(msg): update_replica_status(self.rt2_id, data, logger)
def run(self):
    """Receive one snapshot stream from the sender and pipe it to btrfs.

    Steps, in order:

    1. Resolve the sender ip from ``self.meta['uuid']``.
    2. Connect a SUB socket (data) and a PUSH socket (acks) to the sender.
    3. For a non-incremental transfer, create the share and its replica
       metadata object; otherwise look up the existing replica id.
    4. Make sure the parent subvolume exists, tell the sender if the
       snapshot is already present, start ``btrfs receive`` and send
       ``begin_ok``.
    5. Loop: top up the sender's credit, read a message, forward the
       payload to the ``btrfs receive`` stdin until ``END_SUCCESS`` or
       ``END_FAIL`` arrives, then record the outcome in the receive trail.

    Every step runs under ``self._clean_exit_handler(msg, ...)``.
    NOTE(review): that helper is not visible here; presumably it logs
    ``msg`` and terminates the process when the wrapped code raises, and
    with ``ack=True`` also notifies the sender -- confirm.

    Returns None. Terminates via ``self._sys_exit(3)`` when nothing is
    received for about 60 seconds or when ``btrfs receive`` dies.
    """
    msg = ('Failed to get the sender ip from the uuid(%s) for meta: %s' %
           (self.meta['uuid'], self.meta))
    with self._clean_exit_handler(msg):
        self.sender_ip = get_sender_ip(self.meta['uuid'], logger)

    msg = ('Failed to connect to the sender(%s) on data_port(%s). meta: '
           '%s. Aborting.' %
           (self.sender_ip, self.data_port, self.meta))
    with self._clean_exit_handler(msg):
        # @todo: add validation
        recv_sub = self.ctx.socket(zmq.SUB)
        recv_sub.connect('tcp://%s:%d' % (self.sender_ip, self.data_port))
        # 100 ms receive timeout; see the zmq.error.Again handler below.
        recv_sub.RCVTIMEO = 100
        recv_sub.setsockopt(zmq.SUBSCRIBE, str(self.meta['id']))

    msg = ('Failed to connect to the sender(%s) on '
           'meta_port(%d). meta: %s. Aborting.' %
           (self.sender_ip, self.meta_port, self.meta))
    with self._clean_exit_handler(msg):
        self.meta_push = self.ctx.socket(zmq.PUSH)
        self.meta_push.connect('tcp://%s:%d' %
                               (self.sender_ip, self.meta_port))

    sname = ('%s_%s' % (self.sender_id, self.src_share))
    if (not self.incremental):
        msg = ('Failed to verify/create share: %s. meta: %s. '
               'Aborting.' % (sname, self.meta))
        with self._clean_exit_handler(msg, ack=True):
            create_share(sname, self.dest_pool, logger)

        msg = ('Failed to create the replica metadata object '
               'for share: %s. meta: %s. Aborting.' %
               (sname, self.meta))
        with self._clean_exit_handler(msg, ack=True):
            data = {'share': sname,
                    'appliance': self.sender_ip,
                    'src_share': self.src_share,
                    'data_port': self.data_port,
                    'meta_port': self.meta_port, }
            self.rid = create_rshare(data, logger)
    else:
        msg = ('Failed to retreive the replica metadata object for '
               'share: %s. meta: %s. Aborting.' % (sname, self.meta))
        with self._clean_exit_handler(msg):
            self.rid = rshare_id(sname, logger)

    sub_vol = ('%s%s/%s' % (settings.MNT_PT, self.meta['pool'], sname))
    if (not is_subvol(sub_vol)):
        msg = ('Failed to create parent subvolume %s' % sub_vol)
        with self._clean_exit_handler(msg, ack=True):
            run_command([BTRFS, 'subvolume', 'create', sub_vol])

    snap_fp = ('%s/%s' % (sub_vol, self.snap_name))
    # Give this step its own message; previously it reused whatever msg
    # the preceding (unrelated) step had left behind.
    msg = ('Failed to check for or report the existing snapshot(%s) to '
           'the sender. meta: %s. Aborting.' % (snap_fp, self.meta))
    with self._clean_exit_handler(msg):
        if (is_subvol(snap_fp)):
            ack = {'msg': 'snap_exists',
                   'id': self.meta['id'], }
            self.meta_push.send_json(ack)

    cmd = [BTRFS, 'receive', sub_vol]
    msg = ('Failed to start the low level btrfs receive command(%s)'
           '. Aborting.' % (cmd))
    with self._clean_exit_handler(msg, ack=True):
        rp = subprocess.Popen(cmd, shell=False, stdin=subprocess.PIPE,
                              stdout=subprocess.PIPE,
                              stderr=subprocess.PIPE)

    msg = ('Failed to send begin_ok to the sender for meta: %s' %
           self.meta)
    with self._clean_exit_handler(msg):
        ack = {'msg': 'begin_ok',
               'id': self.meta['id'], }
        self.meta_push.send_json(ack)

    recv_timeout_counter = 0
    credit = settings.DEFAULT_SEND_CREDIT
    check_credit = True
    while True:
        # Credit based flow control: when fewer than 5 credits remain,
        # grant the sender another DEFAULT_SEND_CREDIT messages.
        if (check_credit is True and credit < 5):
            ack = {'msg': 'send_more',
                   'id': self.meta['id'],
                   'credit': settings.DEFAULT_SEND_CREDIT, }
            self.meta_push.send_json(ack)
            credit = credit + settings.DEFAULT_SEND_CREDIT
            logger.debug('%d KB received for %s' %
                         (int(self.kb_received / 1024), sname))

        try:
            recv_data = recv_sub.recv()
            # Strip the subscription prefix (the meta id) from the frame.
            recv_data = recv_data[len(self.meta['id']):]
            credit = credit - 1
            recv_timeout_counter = 0
            # Despite its name, kb_received accumulates bytes; it is
            # divided by 1024 wherever it is reported.
            self.kb_received = self.kb_received + len(recv_data)

            if (self.rtid is None):
                msg = ('Failed to create snapshot: %s. Aborting.' %
                       self.snap_name)
                # create a snapshot only if it's not already from a
                # previous failed attempt
                with self._clean_exit_handler(msg, ack=True):
                    create_snapshot(sname, self.snap_name, logger,
                                    snap_type='receiver')

                data = {'snap_name': self.snap_name}
                msg = ('Failed to create receive trail for rid: %d'
                       '. meta: %s' % (self.rid, self.meta))
                with self._clean_exit_handler(msg, ack=True):
                    self.rtid = create_receive_trail(self.rid, data,
                                                     logger)

            if (recv_data == 'END_SUCCESS' or recv_data == 'END_FAIL'):
                check_credit = False
                ts = datetime.utcnow().replace(tzinfo=utc)
                ts_str = ts.strftime(settings.SNAP_TS_FORMAT)
                data = {'kb_received': self.kb_received / 1024, }
                if (recv_data == 'END_SUCCESS'):
                    data['receive_succeeded'] = ts_str
                    # delete the share, move the oldest snap to share
                    oldest_snap = get_oldest_snap(sub_vol, 3)
                    if (oldest_snap is not None):
                        snap_path = ('%s/%s' % (sub_vol, oldest_snap))
                        share_path = ('%s%s/%s' %
                                      (settings.MNT_PT, self.dest_pool,
                                       sname))
                        msg = ('Failed to promote the oldest Snapshot(%s) '
                               'to Share(%s)' % (snap_path, share_path))
                        try:
                            pool = Pool.objects.get(name=self.dest_pool)
                            remove_share(pool, sname)
                            set_property(snap_path, 'ro', 'false',
                                         mount=False)
                            run_command(['/usr/bin/rm', '-rf', share_path],
                                        throw=False)
                            shutil.move(snap_path, share_path)
                            set_property(share_path, 'ro', 'true',
                                         mount=False)
                            delete_snapshot(sname, oldest_snap, logger)
                        except Exception:
                            # Best effort: a failed promotion is logged
                            # once, with traceback, and the trail update
                            # below still runs.
                            logger.exception(msg)
                else:
                    logger.error('END_FAIL received for meta: %s. '
                                 'Terminating.' % self.meta)
                    rp.terminate()
                    # Formatted like every other timestamp stored in the
                    # receive trail by this method.
                    data['receive_failed'] = ts_str
                    data['status'] = 'failed'

                msg = ('Failed to update receive trail for rtid: %d'
                       '. meta: %s' % (self.rtid, self.meta))
                with self._clean_exit_handler(msg, ack=True):
                    update_receive_trail(self.rtid, data, logger)
                break

            if (rp.poll() is None):
                rp.stdin.write(recv_data)
                rp.stdin.flush()
            else:
                logger.error('It seems the btrfs receive process died'
                             ' unexpectedly.')
                out, err = rp.communicate()
                msg = ('Low level system error from btrfs receive '
                       'command. out: %s err: %s for rtid: %s meta: %s'
                       % (out, err, self.rtid, self.meta))
                with self._clean_exit_handler(msg, ack=True):
                    ts = datetime.utcnow().replace(tzinfo=utc)
                    data = {'receive_failed':
                            ts.strftime(settings.SNAP_TS_FORMAT),
                            'status': 'failed',
                            'error': msg, }
                    update_receive_trail(self.rtid, data, logger)
                # The receive process is gone, so nothing more can be
                # written to it. Stop here instead of looping and calling
                # communicate() again on already closed pipes.
                self._sys_exit(3)
        except zmq.error.Again:
            # recv() timed out (RCVTIMEO is 100 ms); more than 600
            # consecutive timeouts means about 60 seconds of silence.
            recv_timeout_counter = recv_timeout_counter + 1
            if (recv_timeout_counter > 600):
                logger.error('Nothing received in the last 60 seconds '
                             'from the sender for meta: %s. Aborting.'
                             % self.meta)
                self._sys_exit(3)
def run(self):
    """Receive one snapshot stream into the share's .snapshots subvolume.

    Steps, in order:

    1. Resolve the sender ip from ``self.meta['uuid']``.
    2. Connect a SUB socket (data) and a PUSH socket (acks) to the sender.
    3. For a non-incremental transfer, create the share and its replica
       metadata object; otherwise look up the existing replica id.
    4. Make sure the parent subvolume (under ``.snapshots``) exists, tell
       the sender if the snapshot is already present, start
       ``btrfs receive`` and send ``begin_ok``.
    5. Loop: top up the sender's credit, read a message, forward the
       payload to the ``btrfs receive`` stdin until ``END_SUCCESS`` or
       ``END_FAIL`` arrives, then record the outcome in the receive trail.

    Every step runs under ``self._clean_exit_handler(msg, ...)``.
    NOTE(review): that helper is not visible here; presumably it logs
    ``msg`` and terminates the process when the wrapped code raises, and
    with ``ack=True`` also notifies the sender -- confirm.

    Returns None. Terminates via ``self._sys_exit(3)`` when nothing is
    received for about 60 seconds or when ``btrfs receive`` dies.
    """
    msg = ('Failed to get the sender ip from the uuid(%s) for meta: %s' %
           (self.meta['uuid'], self.meta))
    with self._clean_exit_handler(msg):
        self.sender_ip = get_sender_ip(self.meta['uuid'], logger)

    msg = ('Failed to connect to the sender(%s) on data_port(%s). meta: '
           '%s. Aborting.' %
           (self.sender_ip, self.data_port, self.meta))
    with self._clean_exit_handler(msg):
        # @todo: add validation
        recv_sub = self.ctx.socket(zmq.SUB)
        recv_sub.connect('tcp://%s:%d' % (self.sender_ip, self.data_port))
        # 100 ms receive timeout; see the zmq.error.Again handler below.
        recv_sub.RCVTIMEO = 100
        recv_sub.setsockopt(zmq.SUBSCRIBE, str(self.meta['id']))

    msg = ('Failed to connect to the sender(%s) on '
           'meta_port(%d). meta: %s. Aborting.' %
           (self.sender_ip, self.meta_port, self.meta))
    with self._clean_exit_handler(msg):
        self.meta_push = self.ctx.socket(zmq.PUSH)
        self.meta_push.connect('tcp://%s:%d' %
                               (self.sender_ip, self.meta_port))

    sname = ('%s_%s' % (self.sender_id, self.src_share))
    if (not self.incremental):
        msg = ('Failed to verify/create share: %s. meta: %s. '
               'Aborting.' % (sname, self.meta))
        with self._clean_exit_handler(msg, ack=True):
            create_share(sname, self.dest_pool, logger)

        msg = ('Failed to create the replica metadata object '
               'for share: %s. meta: %s. Aborting.' %
               (sname, self.meta))
        with self._clean_exit_handler(msg, ack=True):
            data = {'share': sname,
                    'appliance': self.sender_ip,
                    'src_share': self.src_share,
                    'data_port': self.data_port,
                    'meta_port': self.meta_port, }
            self.rid = create_rshare(data, logger)
    else:
        msg = ('Failed to retreive the replica metadata object for '
               'share: %s. meta: %s. Aborting.' % (sname, self.meta))
        with self._clean_exit_handler(msg):
            self.rid = rshare_id(sname, logger)

    sub_vol = ('%s%s/.snapshots/%s' %
               (settings.MNT_PT, self.meta['pool'], sname))
    if (not is_subvol(sub_vol)):
        msg = ('Failed to create parent subvolume %s' % sub_vol)
        with self._clean_exit_handler(msg, ack=True):
            run_command([BTRFS, 'subvolume', 'create', sub_vol])

    snap_fp = ('%s/%s' % (sub_vol, self.snap_name))
    # Give this step its own message; previously it reused whatever msg
    # the preceding (unrelated) step had left behind.
    msg = ('Failed to check for or report the existing snapshot(%s) to '
           'the sender. meta: %s. Aborting.' % (snap_fp, self.meta))
    with self._clean_exit_handler(msg):
        if (is_subvol(snap_fp)):
            ack = {'msg': 'snap_exists',
                   'id': self.meta['id'], }
            self.meta_push.send_json(ack)

    cmd = [BTRFS, 'receive', sub_vol]
    msg = ('Failed to start the low level btrfs receive command(%s)'
           '. Aborting.' % (cmd))
    with self._clean_exit_handler(msg, ack=True):
        rp = subprocess.Popen(cmd, shell=False, stdin=subprocess.PIPE,
                              stdout=subprocess.PIPE,
                              stderr=subprocess.PIPE)

    msg = ('Failed to send begin_ok to the sender for meta: %s' %
           self.meta)
    with self._clean_exit_handler(msg):
        ack = {'msg': 'begin_ok',
               'id': self.meta['id'], }
        self.meta_push.send_json(ack)

    recv_timeout_counter = 0
    credit = settings.DEFAULT_SEND_CREDIT
    check_credit = True
    while True:
        # Credit based flow control: when fewer than 5 credits remain,
        # grant the sender another DEFAULT_SEND_CREDIT messages.
        if (check_credit is True and credit < 5):
            ack = {'msg': 'send_more',
                   'id': self.meta['id'],
                   'credit': settings.DEFAULT_SEND_CREDIT, }
            self.meta_push.send_json(ack)
            credit = credit + settings.DEFAULT_SEND_CREDIT
            logger.debug('%d KB received for %s' %
                         (int(self.kb_received / 1024), sname))

        try:
            recv_data = recv_sub.recv()
            # Strip the subscription prefix (the meta id) from the frame.
            recv_data = recv_data[len(self.meta['id']):]
            credit = credit - 1
            recv_timeout_counter = 0
            # Despite its name, kb_received accumulates bytes; it is
            # divided by 1024 wherever it is reported.
            self.kb_received = self.kb_received + len(recv_data)

            if (self.rtid is None):
                msg = ('Failed to create snapshot: %s. Aborting.' %
                       self.snap_name)
                # create a snapshot only if it's not already from a
                # previous failed attempt
                with self._clean_exit_handler(msg, ack=True):
                    create_snapshot(sname, self.snap_name, logger,
                                    snap_type='receiver')

                data = {'snap_name': self.snap_name}
                msg = ('Failed to create receive trail for rid: %d'
                       '. meta: %s' % (self.rid, self.meta))
                with self._clean_exit_handler(msg, ack=True):
                    self.rtid = create_receive_trail(self.rid, data,
                                                     logger)

            if (recv_data == 'END_SUCCESS' or recv_data == 'END_FAIL'):
                check_credit = False
                # NOTE(review): ts is stored as a datetime object, not a
                # formatted string -- confirm update_receive_trail can
                # serialize it.
                ts = datetime.utcnow().replace(tzinfo=utc)
                data = {'kb_received': self.kb_received / 1024, }
                if (recv_data == 'END_SUCCESS'):
                    data['receive_succeeded'] = ts
                    # delete the share, move the oldest snap to share
                    oldest_snap = get_oldest_snap(sub_vol, 3)
                    if (oldest_snap is not None):
                        snap_path = ('%s/%s' % (sub_vol, oldest_snap))
                        share_path = ('%s%s/%s' %
                                      (settings.MNT_PT, self.dest_pool,
                                       sname))
                        msg = ('Failed to promote the oldest Snapshot(%s) '
                               'to Share(%s)' % (snap_path, share_path))
                        try:
                            pool = Pool.objects.get(name=self.dest_pool)
                            remove_share(pool, sname)
                            set_property(snap_path, 'ro', 'false',
                                         mount=False)
                            run_command(['/usr/bin/rm', '-rf', share_path],
                                        throw=False)
                            shutil.move(snap_path, share_path)
                            set_property(share_path, 'ro', 'true',
                                         mount=False)
                            delete_snapshot(sname, oldest_snap, logger)
                        except Exception:
                            # Best effort: a failed promotion is logged
                            # once, with traceback, and the trail update
                            # below still runs.
                            logger.exception(msg)
                else:
                    logger.error('END_FAIL received for meta: %s. '
                                 'Terminating.' % self.meta)
                    rp.terminate()
                    data['receive_failed'] = ts
                    data['status'] = 'failed'

                msg = ('Failed to update receive trail for rtid: %d'
                       '. meta: %s' % (self.rtid, self.meta))
                with self._clean_exit_handler(msg, ack=True):
                    update_receive_trail(self.rtid, data, logger)
                break

            if (rp.poll() is None):
                rp.stdin.write(recv_data)
                rp.stdin.flush()
            else:
                logger.error('It seems the btrfs receive process died'
                             ' unexpectedly.')
                out, err = rp.communicate()
                msg = ('Low level system error from btrfs receive '
                       'command. out: %s err: %s for rtid: %s meta: %s'
                       % (out, err, self.rtid, self.meta))
                with self._clean_exit_handler(msg, ack=True):
                    ts = datetime.utcnow().replace(tzinfo=utc)
                    data = {'receive_failed': ts,
                            'status': 'failed',
                            'error': msg, }
                    update_receive_trail(self.rtid, data, logger)
                # The receive process is gone, so nothing more can be
                # written to it. Stop here instead of looping and calling
                # communicate() again on already closed pipes.
                self._sys_exit(3)
        except zmq.error.Again:
            # recv() timed out (RCVTIMEO is 100 ms); more than 600
            # consecutive timeouts means about 60 seconds of silence.
            recv_timeout_counter = recv_timeout_counter + 1
            if (recv_timeout_counter > 600):
                logger.error('Nothing received in the last 60 seconds '
                             'from the sender for meta: %s. Aborting.'
                             % self.meta)
                self._sys_exit(3)