def _get_remote_appliance(self, request, ip, port, client_id, client_secret):
    """Return the ``uuid`` entry of the appliance reachable at ip:port.

    A token is set against the remote base URL using the supplied client
    credentials, ``/api/appliances/1`` is fetched from that remote, and the
    ``uuid`` key of the response is returned. All of it runs inside
    ``self._handle_exception(request)``.
    """
    with self._handle_exception(request):
        remote = 'https://%s:%s' % (ip, port)
        set_token(client_id=client_id, client_secret=client_secret,
                  url=remote)
        appliance_url = '%s/api/appliances/1' % remote
        appliance = api_call(appliance_url)
        logger.debug('remote appliance: %s' % appliance)
        return appliance['uuid']
def main():
    """Bootstrap the machine through the local REST API.

    Sleeps for ten seconds, sets a token and POSTs to the bootstrap
    command endpoint. This is best effort: any exception is logged and
    swallowed, so the function always returns ``None``.
    """
    url = 'https://localhost/api/commands/bootstrap'
    # NOTE(review): presumably gives the API service time to come up
    # before the first call -- confirm against the service start order.
    time.sleep(10)
    try:
        set_token()
        api_call(url, calltype='post')
    except Exception as e:
        # `except X as e` parses on Python 2.6+ and Python 3; the older
        # `except X, e` spelling is a SyntaxError on Python 3.
        logger.error('Unable to bootstrap the machine. Moving on..')
        logger.exception(e)
def __init__(self):
    """Set up scheduler state, set a token and initialise the base class."""
    # Pid of the process that constructs the scheduler.
    self.ppid = os.getpid()

    # Both registries start out empty.
    self.senders, self.receivers = {}, {}

    # Replication ports are read from settings.
    self.data_port, self.meta_port = (settings.REPLICA_DATA_PORT,
                                      settings.REPLICA_META_PORT)

    self.recv_meta = None
    self.pubq = Queue()
    self.uuid = None

    # Token is set before the base-class initialiser runs, as in the
    # original statement order.
    set_token()
    super(ReplicaScheduler, self).__init__()
def main():
    """Scan disks, read the network state and bootstrap the machine.

    Sleeps for ten seconds, sets a token and then issues, in order:
    a POST to ``/disks/scan``, a GET to ``/network`` and a POST to
    ``/commands/bootstrap`` on the local API. This is best effort: the
    first exception aborts the remaining calls, is logged and swallowed,
    so the function always returns ``None``.
    """
    api_url = 'https://localhost/api'
    bootstrap_url = '%s/commands/bootstrap' % api_url
    diskscan_url = '%s/disks/scan' % api_url
    netscan_url = '%s/network' % api_url
    # NOTE(review): presumably gives the API service time to come up
    # before the first call -- confirm against the service start order.
    time.sleep(10)
    try:
        set_token()
        api_call(diskscan_url, calltype='post')
        api_call(netscan_url, calltype='get')
        api_call(bootstrap_url, calltype='post')
    except Exception as e:
        # `except X as e` parses on Python 2.6+ and Python 3; the older
        # `except X, e` spelling is a SyntaxError on Python 3.
        logger.error('Unable to bootstrap the machine. Moving on..')
        logger.exception(e)
def _get_remote_appliance(self, request, ip, port, client_id, client_secret):
    """Return the ``uuid`` entry of the appliance reachable at ip:port.

    A token is set against the remote base URL using the supplied client
    credentials, then ``/api/appliances/1`` is fetched and the ``uuid``
    key of the response is returned.

    A ``RockStorAPIException`` propagates unchanged. Any other exception
    is logged and handed to ``handle_exception`` together with a generic
    message and the request.
    """
    try:
        base_url = 'https://%s:%s' % (ip, port)
        set_token(client_id=client_id, client_secret=client_secret,
                  url=base_url)
        ad = api_call('%s/api/appliances/1' % base_url)
        logger.debug('remote appliance: %s' % ad)
        return ad['uuid']
    except RockStorAPIException:
        raise
    except Exception as e:
        # `except X as e` parses on Python 2.6+ and Python 3; the older
        # `except X, e` spelling is a SyntaxError on Python 3.
        e_msg = 'Failed to get remote appliance uuid'
        logger.error(e_msg)
        logger.exception(e)
        # NOTE(review): handle_exception presumably raises; if it returns,
        # this method falls through and returns None.
        handle_exception(e_msg, request)
def _get_remote_appliance(self, request, ip, port, client_id, client_secret):
    """Return the ``uuid`` entry of the appliance reachable at ip:port.

    Works in two stages inside ``self._handle_exception(request)``:

    1. Set a token against the remote base URL with the supplied client
       credentials. On failure a user-facing "failed to authenticate"
       message is passed to ``handle_exception``.
    2. Fetch ``/api/appliances/1`` (with ``save_error=False``) and return
       the ``uuid`` key of the response. On failure a user-facing
       "failed to get remote appliance information" message is passed to
       ``handle_exception``.

    In both stages the underlying exception is logged before the generic
    message is reported.
    """
    with self._handle_exception(request):
        base_url = 'https://%s:%s' % (ip, port)
        try:
            set_token(client_id=client_id, client_secret=client_secret,
                      url=base_url)
        except Exception as e:
            # Log the real cause, matching the second handler below; the
            # user-facing message deliberately stays generic.
            logger.exception(e)
            e_msg = ('Failed to authenticate on remote appliance. Verify '
                     'port number, id and secret are correct and try '
                     'again.')
            # NOTE(review): handle_exception presumably raises; if it
            # returns, execution continues with the appliance lookup.
            handle_exception(Exception(e_msg), request)
        try:
            ad = api_call('%s/api/appliances/1' % base_url,
                          save_error=False)
            return ad['uuid']
        except Exception as e:
            logger.exception(e)
            e_msg = ('Failed to get remote appliance information. Verify '
                     'all inputs and try again.')
            handle_exception(Exception(e_msg), request)
def run(self): set_token() msg = ('Failed to get the sender ip from the uuid(%s) for meta: %s' % (self.meta['uuid'], self.meta)) with self._clean_exit_handler(msg): self.sender_ip = get_sender_ip(self.meta['uuid'], logger) logger.debug('sender ip: %s' % self.sender_ip) msg = ('Failed to connect to the sender(%s) on data_port(%s). meta: ' '%s. Aborting.' % (self.sender_ip, self.data_port, self.meta)) with self._clean_exit_handler(msg): #@todo: add validation recv_sub = self.ctx.socket(zmq.SUB) recv_sub.connect('tcp://%s:%d' % (self.sender_ip, self.data_port)) recv_sub.RCVTIMEO = 100 recv_sub.setsockopt(zmq.SUBSCRIBE, str(self.meta['id'])) msg = ('Failed to connect to the sender(%s) on ' 'meta_port(%d). meta: %s. Aborting.' % (self.sender_ip, self.meta_port, self.meta)) with self._clean_exit_handler(msg): self.meta_push = self.ctx.socket(zmq.PUSH) url = ('tcp://%s:%d' % (self.sender_ip, self.meta_port)) logger.debug('meta url: %s' % url) self.meta_push.connect('tcp://%s:%d' % (self.sender_ip, self.meta_port)) sname = ('%s-%s-%s' % (self.sender_id, self.sender_ip, self.src_share)) if (not self.incremental): msg = ('Failed to verify/create share: %s. meta: %s. ' 'Aborting.' % (sname, self.meta)) with self._clean_exit_handler(msg, ack=True): create_share(sname, self.dest_pool, logger) msg = ('Failed to create the replica metadata object ' 'for share: %s. meta: %s. Aborting.' % (sname, self.meta)) with self._clean_exit_handler(msg, ack=True): data = { 'share': sname, 'appliance': self.sender_ip, 'src_share': self.src_share, 'data_port': self.data_port, 'meta_port': self.meta_port, } self.rid = create_rshare(data, logger) else: msg = ('Failed to retreive the replica metadata object for ' 'share: %s. meta: %s. Aborting.' 
% (sname, self.meta)) with self._clean_exit_handler(msg): self.rid = rshare_id(sname, logger) sub_vol = ('%s%s/%s' % (settings.MNT_PT, self.meta['pool'], sname)) snap_fp = ('%s/%s_%s' % (sub_vol, self.snap_name.split('_')[0], self.snap_name)) logger.info('snap_fp: %s' % snap_fp) msg = ('Snaphost: %s already exists.' % snap_fp) with self._clean_exit_handler(msg): if (os.path.isdir(snap_fp)): ack = { 'msg': 'snap_exists', 'id': self.meta['id'], } self.meta_push.send_json(ack) logger.debug(msg) cmd = [BTRFS, 'receive', sub_vol] msg = ('Failed to start the low level btrfs receive command(%s)' '. Aborting.' % (cmd)) with self._clean_exit_handler(msg, ack=True): rp = subprocess.Popen(cmd, shell=False, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE) logger.debug('Btrfs receive started for snap: %s' % sub_vol) msg = ('Failed to send begin_ok to the sender for meta: %s' % self.meta) with self._clean_exit_handler(msg): ack = { 'msg': 'begin_ok', 'id': self.meta['id'], } self.meta_push.send_json(ack) logger.debug('begin_ok sent for meta: %s' % self.meta) recv_timeout_counter = 0 while True: try: recv_data = recv_sub.recv() recv_data = recv_data[len(self.meta['id']):] recv_timeout_counter = 0 self.kb_received = self.kb_received + len(recv_data) if (self.rtid is None): msg = ('Failed to create snapshot: %s. Aborting.' % self.snap_name) with self._clean_exit_handler(msg, ack=True): create_snapshot(sname, self.snap_name, logger, snap_type='receiver') data = {'snap_name': self.snap_name} msg = ('Failed to create receive trail for rid: %d' '. 
meta: %s' % (self.rid, self.meta)) with self._clean_exit_handler(msg, ack=True): self.rtid = create_receive_trail( self.rid, data, logger) if (recv_data == 'END_SUCCESS' or recv_data == 'END_FAIL'): ts = datetime.utcnow().replace(tzinfo=utc) data = { 'kb_received': self.kb_received / 1024, } if (recv_data == 'END_SUCCESS'): logger.debug('END_SUCCESS received for meta: %s' % self.meta) data['receive_succeeded'] = ts else: logger.error('END_FAIL received for meta: %s. ' 'Terminating.' % self.meta) rp.terminate() data['receive_failed'] = ts data['status'] = 'failed' msg = ('Failed to update receive trail for rtid: %d' '. meta: %s' % (self.rtid, self.meta)) with self._clean_exit_handler(msg, ack=True): update_receive_trail(self.rtid, data, logger) break if (rp.poll() is None): rp.stdin.write(recv_data) rp.stdin.flush() else: logger.error('It seems the btrfs receive process died' ' unexpectedly.') out, err = rp.communicate() logger.debug('btrfs receive out: %s err: %s' % (out, err)) msg = ('Low level system error from btrfs receive ' 'command. out: %s err: %s for rtid: %s meta: %s' % (out, err, self.rtid, self.meta)) with self._clean_exit_handler(msg, ack=True): data = { 'receive_failed': (datetime.utcnow().replace(tzinfo=utc)), 'status': 'failed', 'error': msg, } update_receive_trail(self.rtid, data, logger) except zmq.error.Again: recv_timeout_counter = recv_timeout_counter + 1 if (recv_timeout_counter > 300): logger.error('Nothing received in the last 30 seconds ' 'from the sender for meta: %s. Aborting.' % self.meta) raise except Exception, e: msg = ('Exception occured while receiving fsdata') logger.error(msg) logger.exception(e) rp.terminate() out, err = rp.communicate() logger.debug('rc: %d out: %s err: %s' % (rp.returncode, out, err)) data['receive_failed'] = datetime.utcnow().replace(tzinfo=utc) data['status'] = 'failed' data['error'] = msg msg = ('Failed to update receive trail for rtid: %d' '. 
meta: %s' % (self.rtid, self.meta)) with self._clean_exit_handler(msg, ack=True): update_receive_trail(self.rtid, data, logger) self._sys_exit(3) finally:
def run(self): set_token() msg = ('Failed to get the sender ip from the uuid(%s) for meta: %s' % (self.meta['uuid'], self.meta)) with self._clean_exit_handler(msg): self.sender_ip = get_sender_ip(self.meta['uuid'], logger) logger.debug('sender ip: %s' % self.sender_ip) msg = ('Failed to connect to the sender(%s) on data_port(%s). meta: ' '%s. Aborting.' % (self.sender_ip, self.data_port, self.meta)) with self._clean_exit_handler(msg): #@todo: add validation recv_sub = self.ctx.socket(zmq.SUB) recv_sub.connect('tcp://%s:%d' % (self.sender_ip, self.data_port)) recv_sub.RCVTIMEO = 100 recv_sub.setsockopt(zmq.SUBSCRIBE, str(self.meta['id'])) msg = ('Failed to connect to the sender(%s) on ' 'meta_port(%d). meta: %s. Aborting.' % (self.sender_ip, self.meta_port, self.meta)) with self._clean_exit_handler(msg): self.meta_push = self.ctx.socket(zmq.PUSH) url = ('tcp://%s:%d' % (self.sender_ip, self.meta_port)) logger.debug('meta url: %s' % url) self.meta_push.connect('tcp://%s:%d' % (self.sender_ip, self.meta_port)) sname = ('%s-%s-%s' % (self.sender_id, self.sender_ip, self.src_share)) if (not self.incremental): msg = ('Failed to verify/create share: %s. meta: %s. ' 'Aborting.' % (sname, self.meta)) with self._clean_exit_handler(msg, ack=True): create_share(sname, self.dest_pool, logger) msg = ('Failed to create the replica metadata object ' 'for share: %s. meta: %s. Aborting.' % (sname, self.meta)) with self._clean_exit_handler(msg, ack=True): data = {'share': sname, 'appliance': self.sender_ip, 'src_share': self.src_share, 'data_port': self.data_port, 'meta_port': self.meta_port, } self.rid = create_rshare(data, logger) else: msg = ('Failed to retreive the replica metadata object for ' 'share: %s. meta: %s. Aborting.' 
% (sname, self.meta)) with self._clean_exit_handler(msg): self.rid = rshare_id(sname, logger) sub_vol = ('%s%s/%s' % (settings.MNT_PT, self.meta['pool'], sname)) snap_fp = ('%s/%s_%s' % (sub_vol, self.snap_name.split('_')[0], self.snap_name)) logger.info('snap_fp: %s' % snap_fp) msg = ('Snaphost: %s already exists.' % snap_fp) with self._clean_exit_handler(msg): if (os.path.isdir(snap_fp)): ack = {'msg': 'snap_exists', 'id': self.meta['id'],} self.meta_push.send_json(ack) logger.debug(msg) cmd = [BTRFS, 'receive', sub_vol] msg = ('Failed to start the low level btrfs receive command(%s)' '. Aborting.' % (cmd)) with self._clean_exit_handler(msg, ack=True): rp = subprocess.Popen(cmd, shell=False, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE) logger.debug('Btrfs receive started for snap: %s' % sub_vol) msg = ('Failed to send begin_ok to the sender for meta: %s' % self.meta) with self._clean_exit_handler(msg): ack = {'msg': 'begin_ok', 'id': self.meta['id'], } self.meta_push.send_json(ack) logger.debug('begin_ok sent for meta: %s' % self.meta) recv_timeout_counter = 0 while True: try: recv_data = recv_sub.recv() recv_data = recv_data[len(self.meta['id']):] recv_timeout_counter = 0 self.kb_received = self.kb_received + len(recv_data) if (self.rtid is None): msg = ('Failed to create snapshot: %s. Aborting.' % self.snap_name) with self._clean_exit_handler(msg, ack=True): create_snapshot(sname, self.snap_name, logger, snap_type='receiver') data = {'snap_name': self.snap_name} msg = ('Failed to create receive trail for rid: %d' '. 
meta: %s' % (self.rid, self.meta)) with self._clean_exit_handler(msg, ack=True): self.rtid = create_receive_trail(self.rid, data, logger) if (recv_data == 'END_SUCCESS' or recv_data == 'END_FAIL'): ts = datetime.utcnow().replace(tzinfo=utc) data = {'kb_received': self.kb_received / 1024,} if (recv_data == 'END_SUCCESS'): logger.debug('END_SUCCESS received for meta: %s' % self.meta) data['receive_succeeded'] = ts else: logger.error('END_FAIL received for meta: %s. ' 'Terminating.' % self.meta) rp.terminate() data['receive_failed'] = ts data['status'] = 'failed' msg = ('Failed to update receive trail for rtid: %d' '. meta: %s' % (self.rtid, self.meta)) with self._clean_exit_handler(msg, ack=True): update_receive_trail(self.rtid, data, logger) break if (rp.poll() is None): rp.stdin.write(recv_data) rp.stdin.flush() else: logger.error('It seems the btrfs receive process died' ' unexpectedly.') out, err = rp.communicate() logger.debug('btrfs receive out: %s err: %s' % (out, err)) msg = ('Low level system error from btrfs receive ' 'command. out: %s err: %s for rtid: %s meta: %s' % (out, err, self.rtid, self.meta)) with self._clean_exit_handler(msg, ack=True): data = {'receive_failed': ( datetime.utcnow().replace(tzinfo=utc)), 'status': 'failed', 'error': msg,} update_receive_trail(self.rtid, data, logger) except zmq.error.Again: recv_timeout_counter = recv_timeout_counter + 1 if (recv_timeout_counter > 300): logger.error('Nothing received in the last 30 seconds ' 'from the sender for meta: %s. Aborting.' % self.meta) raise except Exception, e: msg = ('Exception occured while receiving fsdata') logger.error(msg) logger.exception(e) rp.terminate() out, err = rp.communicate() logger.debug('rc: %d out: %s err: %s' % (rp.returncode, out, err)) data['receive_failed'] = datetime.utcnow().replace(tzinfo=utc) data['status'] = 'failed' data['error'] = msg msg = ('Failed to update receive trail for rtid: %d' '. 
meta: %s' % (self.rtid, self.meta)) with self._clean_exit_handler(msg, ack=True): update_receive_trail(self.rtid, data, logger) self._sys_exit(3) finally:
def validate_src_share(sender_uuid, sname):
    """Issue a GET for share ``sname`` against the sender appliance.

    The appliance is looked up by ``sender_uuid``, a token is set against
    its management URL with its stored client credentials, and the share
    endpoint is called. Nothing is returned; any exception raised by the
    lookup or the calls propagates to the caller.
    """
    sender = Appliance.objects.get(uuid=sender_uuid)
    base_url = 'https://%s:%s' % (sender.ip, sender.mgmt_port)
    set_token(client_id=sender.client_id,
              client_secret=sender.client_secret,
              url=base_url)
    share_url = '%s/api/shares/%s' % (base_url, sname)
    api_call(url=share_url)
# run(): sender-side steps of one replication attempt, as visible here.
#   1. set_token(), then connect a zmq PUSH socket to the receiver's
#      meta port (self.receiver_ip / self.rmeta_port).
#   2. Create a replica trail for self.snap_name; keep it in self.rt2 and
#      its 'id' in self.rt2_id.
#   3. Create the snapshot unless is_snapshot() reports it already exists.
#   4. Send self.meta_begin to the receiver and read the ack from
#      self._process_q(). On a 'snap_exists' ack the replica status is
#      updated to 'succeeded' and self._sys_exit(0) is called.
#   5. Build the `btrfs send` command, adding `-p <previous snapshot>` when
#      self.rt is not None, start it with subprocess.Popen and switch its
#      stdout pipe to non-blocking mode.
#   6. If starting the process raises: log the failure, put
#      '<snap_id>END_FAIL' on self.pub and call self._sys_exit(3).
# Each step runs under self._clean_exit_handler(msg) or
# self._update_trail_and_quit(msg) with a step-specific failure message.
# NOTE(review): this definition is collapsed onto single lines, so block
# nesting (e.g. whether the _sys_exit calls sit inside the preceding `with`)
# cannot be confirmed from here. `except Exception, e` is Python 2-only syntax.
def run(self): set_token() msg = ('Failed to connect to receiver(%s) on meta port' '(%d) for snap_name: %s. Aborting.' % (self.receiver_ip, self.rmeta_port, self.snap_name)) with self._clean_exit_handler(msg): meta_push = self.ctx.socket(zmq.PUSH) meta_push.connect('tcp://%s:%d' % (self.receiver_ip, self.rmeta_port)) # 1. create a new replica trail if it's the very first time # of if the last one succeeded msg = ('Failed to create local replica trail for snap_name:' ' %s. Aborting.' % self.snap_name) with self._clean_exit_handler(msg): self.rt2 = create_replica_trail(self.replica.id, self.snap_name, logger) self.rt2_id = self.rt2['id'] # 2. create a snapshot only if it's not already from a previous # failed attempt. if (not is_snapshot(self.replica.share, self.snap_name, logger)): msg = ('Failed to create snapshot: %s. Aborting.' % self.snap_name) with self._clean_exit_handler(msg): create_snapshot(self.replica.share, self.snap_name, logger) # let the receiver know that following diff is coming msg = ('Failed to send initial metadata communication to the ' 'receiver(%s), most likely due to a network error. Aborting.' % self.receiver_ip) with self._update_trail_and_quit(msg): logger.debug('sending meta_begin: %s' % self.meta_begin) meta_push.send_json(self.meta_begin) logger.debug('meta_begin sent. waiting on get') msg = ('Timeout occured(60 seconds) while waiting for OK ' 'from the receiver(%s) to start sending data. Aborting.' % self.receiver_ip) with self._update_trail_and_quit(msg): ack = self._process_q() logger.info('suman ack = %s' % ack) if (ack['msg'] == 'snap_exists'): data = {'status': 'succeeded', 'end_ts': datetime.utcnow().replace(tzinfo=utc), 'error': 'snapshot already exists on the receiver', } msg = ('Failed to update replica status for snap_name: %s. ' 'Aborting.' 
% self.snap_name) with self._clean_exit_handler(msg): update_replica_status(self.rt2_id, data, logger) self._sys_exit(0) snap_path = ('%s%s/%s_%s' % (settings.MNT_PT, self.replica.pool, self.replica.share, self.snap_name)) logger.debug('current snap: %s' % snap_path) cmd = [BTRFS, 'send', snap_path] if (self.rt is not None): prev_snap = ('%s%s/%s_%s' % (settings.MNT_PT, self.replica.pool, self.replica.share, self.rt.snap_name)) logger.info('Sending incremental replica between %s -- %s' % (prev_snap, snap_path)) cmd = [BTRFS, 'send', '-p', prev_snap, snap_path] else: logger.info('Sending full replica: %s' % snap_path) try: sp = subprocess.Popen(cmd, shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE) fcntl.fcntl(sp.stdout.fileno(), fcntl.F_SETFL, os.O_NONBLOCK) logger.debug('send started. snap: %s' % snap_path) except Exception, e: msg = ('Failed to start the low level btrfs send ' 'command(%s). Aborting' % cmd) logger.error(msg) logger.exception(e) with self._update_trail_and_quit(msg): self.pub.put('%sEND_FAIL' % self.snap_id) self._sys_exit(3)
# run(): sender-side steps of one replication attempt, as visible here.
#   1. set_token(), then connect a zmq PUSH socket to the receiver's
#      meta port (self.receiver_ip / self.rmeta_port).
#   2. Create a replica trail for self.snap_name; keep it in self.rt2 and
#      its 'id' in self.rt2_id.
#   3. Create the snapshot unless is_snapshot() reports it already exists.
#   4. Send self.meta_begin to the receiver and read the ack from
#      self._process_q(). On a 'snap_exists' ack the replica status is
#      updated to 'succeeded' and self._sys_exit(0) is called.
#   5. Build the `btrfs send` command, adding `-p <previous snapshot>` when
#      self.rt is not None, start it with subprocess.Popen and switch its
#      stdout pipe to non-blocking mode.
#   6. If starting the process raises: log the failure, put
#      '<snap_id>END_FAIL' on self.pub and call self._sys_exit(3).
# Each step runs under self._clean_exit_handler(msg) or
# self._update_trail_and_quit(msg) with a step-specific failure message.
# NOTE(review): this definition is collapsed onto single lines, so block
# nesting (e.g. whether the _sys_exit calls sit inside the preceding `with`)
# cannot be confirmed from here. `except Exception, e` is Python 2-only syntax.
def run(self): set_token() msg = ('Failed to connect to receiver(%s) on meta port' '(%d) for snap_name: %s. Aborting.' % (self.receiver_ip, self.rmeta_port, self.snap_name)) with self._clean_exit_handler(msg): meta_push = self.ctx.socket(zmq.PUSH) meta_push.connect('tcp://%s:%d' % (self.receiver_ip, self.rmeta_port)) # 1. create a new replica trail if it's the very first time # of if the last one succeeded msg = ('Failed to create local replica trail for snap_name:' ' %s. Aborting.' % self.snap_name) with self._clean_exit_handler(msg): self.rt2 = create_replica_trail(self.replica.id, self.snap_name, logger) self.rt2_id = self.rt2['id'] # 2. create a snapshot only if it's not already from a previous # failed attempt. if (not is_snapshot(self.replica.share, self.snap_name, logger)): msg = ('Failed to create snapshot: %s. Aborting.' % self.snap_name) with self._clean_exit_handler(msg): create_snapshot(self.replica.share, self.snap_name, logger) # let the receiver know that following diff is coming msg = ('Failed to send initial metadata communication to the ' 'receiver(%s), most likely due to a network error. Aborting.' % self.receiver_ip) with self._update_trail_and_quit(msg): logger.debug('sending meta_begin: %s' % self.meta_begin) meta_push.send_json(self.meta_begin) logger.debug('meta_begin sent. waiting on get') msg = ('Timeout occured(60 seconds) while waiting for OK ' 'from the receiver(%s) to start sending data. Aborting.' % self.receiver_ip) with self._update_trail_and_quit(msg): ack = self._process_q() logger.info('suman ack = %s' % ack) if (ack['msg'] == 'snap_exists'): data = { 'status': 'succeeded', 'end_ts': datetime.utcnow().replace(tzinfo=utc), 'error': 'snapshot already exists on the receiver', } msg = ('Failed to update replica status for snap_name: %s. ' 'Aborting.' 
% self.snap_name) with self._clean_exit_handler(msg): update_replica_status(self.rt2_id, data, logger) self._sys_exit(0) snap_path = ('%s%s/%s_%s' % (settings.MNT_PT, self.replica.pool, self.replica.share, self.snap_name)) logger.debug('current snap: %s' % snap_path) cmd = [BTRFS, 'send', snap_path] if (self.rt is not None): prev_snap = ('%s%s/%s_%s' % (settings.MNT_PT, self.replica.pool, self.replica.share, self.rt.snap_name)) logger.info('Sending incremental replica between %s -- %s' % (prev_snap, snap_path)) cmd = [BTRFS, 'send', '-p', prev_snap, snap_path] else: logger.info('Sending full replica: %s' % snap_path) try: sp = subprocess.Popen(cmd, shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE) fcntl.fcntl(sp.stdout.fileno(), fcntl.F_SETFL, os.O_NONBLOCK) logger.debug('send started. snap: %s' % snap_path) except Exception, e: msg = ('Failed to start the low level btrfs send ' 'command(%s). Aborting' % cmd) logger.error(msg) logger.exception(e) with self._update_trail_and_quit(msg): self.pub.put('%sEND_FAIL' % self.snap_id) self._sys_exit(3)