def subcmd_delete(args): import logging import shutil import glob import sys from errno import ENOENT, ENODATA import struct from syncdutils import GsyncdError, Xattr, errno_wrap import gsyncdconfig as gconf logging.info('geo-replication delete') # remove the stime xattr from all the brick paths so that # a re-create of a session will start sync all over again stime_xattr_prefix = gconf.get('stime-xattr-prefix', None) # Delete pid file, status file, socket file cleanup_paths = [] cleanup_paths.append(gconf.get("pid-file")) # Cleanup Session dir try: shutil.rmtree(gconf.get("georep-session-working-dir")) except (IOError, OSError): if sys.exc_info()[1].errno == ENOENT: pass else: raise GsyncdError('Error while removing working dir: %s' % gconf.get("georep-session-working-dir")) # Cleanup changelog working dirs try: shutil.rmtree(gconf.get("working-dir")) except (IOError, OSError): if sys.exc_info()[1].errno == ENOENT: pass else: raise GsyncdError('Error while removing working dir: %s' % gconf.get("working-dir")) for path in cleanup_paths: # To delete temp files for f in glob.glob(path + "*"): _unlink(f) if args.reset_sync_time and stime_xattr_prefix: for p in args.paths: if p != "": # set stime to (0,0) to trigger full volume content resync # to slave on session recreation # look at master.py::Xcrawl hint: zero_zero errno_wrap(Xattr.lsetxattr, (p, stime_xattr_prefix + ".stime", struct.pack("!II", 0, 0)), [ENOENT, ENODATA]) errno_wrap(Xattr.lremovexattr, (p, stime_xattr_prefix + ".entry_stime"), [ENOENT, ENODATA]) return
def monitor(local, remote): # Check if gsyncd restarted in pause state. If # yes, send SIGSTOP to negative of monitor pid # to go back to pause state. if rconf.args.pause_on_start: errno_wrap(os.kill, [-os.getpid(), signal.SIGSTOP], [ESRCH]) """oh yeah, actually Monitor is used as singleton, too""" return Monitor().multiplex(*distribute(local, remote))
def wmon(w): cpid, _ = self.monitor(w, argv, cpids, slave_vol, slave_host, master, suuid, slavenodes) time.sleep(1) self.lock.acquire() for cpid in cpids: errno_wrap(os.kill, [cpid, signal.SIGKILL], [ESRCH]) self.lock.release() finalize(exval=1)
def monitor(*resources): # Check if gsyncd restarted in pause state. If # yes, send SIGSTOP to negative of monitor pid # to go back to pause state. if gconf.pause_on_start: errno_wrap(os.kill, [-os.getpid(), signal.SIGSTOP], [ESRCH]) """oh yeah, actually Monitor is used as singleton, too""" return Monitor().multiplex(*distribute(*resources))
def meta_ops(cls, meta_entries): logging.debug('Meta-entries: %s' % repr(meta_entries)) for e in meta_entries: mode = e['stat']['mode'] uid = e['stat']['uid'] gid = e['stat']['gid'] go = e['go'] errno_wrap(os.chmod, [go, mode], [ENOENT], [ESTALE, EINVAL]) errno_wrap(os.chown, [go, uid, gid], [ENOENT], [ESTALE, EINVAL])
def meta_ops(cls, meta_entries): logging.debug("Meta-entries: %s" % repr(meta_entries)) for e in meta_entries: mode = e["stat"]["mode"] uid = e["stat"]["uid"] gid = e["stat"]["gid"] go = e["go"] errno_wrap(os.chmod, [go, mode], [ENOENT], [ESTALE, EINVAL]) errno_wrap(os.chown, [go, uid, gid], [ENOENT], [ESTALE, EINVAL])
def wmon(w): cpid, _ = self.monitor(w, argv, cpids, agents, slave_vol, slave_host, master) time.sleep(1) self.lock.acquire() for cpid in cpids: errno_wrap(os.kill, [cpid, signal.SIGKILL], [ESRCH]) for apid in agents: errno_wrap(os.kill, [apid, signal.SIGKILL], [ESRCH]) self.lock.release() finalize(exval=1)
def entry_purge(entry, gfid): # This is an extremely racy code and needs to be fixed ASAP. # The GFID check here is to be sure that the pargfid/bname # to be purged is the GFID gotten from the changelog. # (a stat(changelog_gfid) would also be valid here) # The race here is between the GFID check and the purge. disk_gfid = cls.gfid(entry) if isinstance(disk_gfid, int): return if not gfid == disk_gfid: return er = errno_wrap(os.unlink, [entry], [ENOENT, EISDIR]) if isinstance(er, int): if er == EISDIR: er = errno_wrap(os.rmdir, [entry], [ENOENT, ENOTEMPTY]) if er == ENOTEMPTY: return er
def entry_purge(entry, gfid): # This is an extremely racy code and needs to be fixed ASAP. # The GFID check here is to be sure that the pargfid/bname # to be purged is the GFID gotten from the changelog. # (a stat(changelog_gfid) would also be valid here) # The race here is between the GFID check and the purge. disk_gfid = cls.gfid_mnt(entry) if isinstance(disk_gfid, int): return if not gfid == disk_gfid: return er = errno_wrap(os.unlink, [entry], [ENOENT, EISDIR]) if isinstance(er, int): if er == EISDIR: er = errno_wrap(os.rmdir, [entry], [ENOENT, ENOTEMPTY]) if er == ENOTEMPTY: return er
def entry_ops(cls, entries): pfx = gauxpfx() logging.debug('entries: %s' % repr(entries)) # regular file def entry_pack_reg(gf, bn, mo, uid, gid): blen = len(bn) return struct.pack(cls._fmt_mknod(blen), uid, gid, gf, mo, bn, stat.S_IMODE(mo), 0, umask()) def entry_pack_reg_stat(gf, bn, st): blen = len(bn) mo = st['mode'] return struct.pack(cls._fmt_mknod(blen), st['uid'], st['gid'], gf, mo, bn, stat.S_IMODE(mo), 0, umask()) # mkdir def entry_pack_mkdir(gf, bn, mo, uid, gid): blen = len(bn) return struct.pack(cls._fmt_mkdir(blen), uid, gid, gf, mo, bn, stat.S_IMODE(mo), umask()) # symlink def entry_pack_symlink(gf, bn, lnk, st): blen = len(bn) llen = len(lnk) return struct.pack(cls._fmt_symlink(blen, llen), st['uid'], st['gid'], gf, st['mode'], bn, lnk) def entry_purge(entry, gfid): # This is an extremely racy code and needs to be fixed ASAP. # The GFID check here is to be sure that the pargfid/bname # to be purged is the GFID gotten from the changelog. # (a stat(changelog_gfid) would also be valid here) # The race here is between the GFID check and the purge. disk_gfid = cls.gfid_mnt(entry) if isinstance(disk_gfid, int): return if not gfid == disk_gfid: return er = errno_wrap(os.unlink, [entry], [ENOENT, EISDIR]) if isinstance(er, int): if er == EISDIR: er = errno_wrap(os.rmdir, [entry], [ENOENT, ENOTEMPTY]) if er == ENOTEMPTY: return er for e in entries: blob = None op = e['op'] gfid = e['gfid'] entry = e['entry'] (pg, bname) = entry2pb(entry) if op in ['RMDIR', 'UNLINK']: while True: er = entry_purge(entry, gfid) if isinstance(er, int): time.sleep(1) else: break elif op in ['CREATE', 'MKNOD']: blob = entry_pack_reg( gfid, bname, e['mode'], e['uid'], e['uid']) elif op == 'MKDIR': blob = entry_pack_mkdir( gfid, bname, e['mode'], e['uid'], e['uid']) elif op == 'LINK': slink = os.path.join(pfx, gfid) st = lstat(slink) if isinstance(st, int): (pg, bname) = entry2pb(entry) blob = entry_pack_reg_stat(gfid, bname, e['stat']) else: errno_wrap(os.link, [slink, entry], [ENOENT, EEXIST]) elif op == 'SYMLINK': blob = entry_pack_symlink(gfid, bname, e['link'], e['stat']) elif op == 'RENAME': en = e['entry1'] st = lstat(entry) if isinstance(st, int): (pg, bname) = entry2pb(en) blob = entry_pack_reg_stat(gfid, bname, e['stat']) else: errno_wrap(os.rename, [entry, en], [ENOENT, EEXIST]) if blob: errno_wrap(Xattr.lsetxattr_l, [pg, 'glusterfs.gfid.newfile', blob], [EEXIST], [ENOENT, ESTALE, EINVAL])
def gfid_mnt(cls, gfidpath): return errno_wrap(Xattr.lgetxattr, [gfidpath, 'glusterfs.gfid.string', cls.GX_GFID_CANONICAL_LEN], [ENOENT])
def entry_ops(cls, entries): pfx = gauxpfx() logging.debug('entries: %s' % repr(entries)) # regular file def entry_pack_reg(gf, bn, st): blen = len(bn) mo = st['mode'] return struct.pack(cls._fmt_mknod(blen), st['uid'], st['gid'], gf, mo, bn, stat.S_IMODE(mo), 0, umask()) # mkdir def entry_pack_mkdir(gf, bn, st): blen = len(bn) mo = st['mode'] return struct.pack(cls._fmt_mkdir(blen), st['uid'], st['gid'], gf, mo, bn, stat.S_IMODE(mo), umask()) #symlink def entry_pack_symlink(gf, bn, lnk, st): blen = len(bn) llen = len(lnk) return struct.pack(cls._fmt_symlink(blen, llen), st['uid'], st['gid'], gf, st['mode'], bn, lnk) def entry_purge(entry, gfid): # This is an extremely racy code and needs to be fixed ASAP. # The GFID check here is to be sure that the pargfid/bname # to be purged is the GFID gotten from the changelog. # (a stat(changelog_gfid) would also be valid here) # The race here is between the GFID check and the purge. disk_gfid = cls.gfid(entry) if isinstance(disk_gfid, int): return if not gfid == disk_gfid: return er = errno_wrap(os.unlink, [entry], [ENOENT, EISDIR]) if isinstance(er, int): if er == EISDIR: er = errno_wrap(os.rmdir, [entry], [ENOENT, ENOTEMPTY]) if er == ENOTEMPTY: return er for e in entries: blob = None op = e['op'] gfid = e['gfid'] entry = e['entry'] (pg, bname) = entry2pb(entry) if op in ['RMDIR', 'UNLINK']: while True: er = entry_purge(entry, gfid) if isinstance(er, int): time.sleep(1) else: break elif op == 'CREATE': blob = entry_pack_reg(gfid, bname, e['stat']) elif op == 'MKDIR': blob = entry_pack_mkdir(gfid, bname, e['stat']) elif op == 'LINK': errno_wrap(os.link, [os.path.join(pfx, gfid), entry], [ENOENT, EEXIST]) elif op == 'SYMLINK': blob = entry_pack_symlink(gfid, bname, e['link'], e['stat']) elif op == 'RENAME': en = e['entry1'] errno_wrap(os.rename, [entry, en], [ENOENT, EEXIST]) if blob: errno_wrap(Xattr.lsetxattr_l, [pg, 'glusterfs.gfid.newfile', blob], [ENOENT, EEXIST])
def crawl(self, path='.', xtr=None, done=0): """ generate a CHANGELOG file consumable by process_change """ if path == '.': self.open() self.crawls += 1 if not xtr: # get the root stime and use it for all comparisons xtr = self.xtime('.', self.slave) if isinstance(xtr, int): if xtr != ENOENT: raise GsyncdError('slave is corrupt') xtr = self.minus_infinity xtl = self.xtime(path) if isinstance(xtl, int): raise GsyncdError('master is corrupt') if xtr == xtl: if path == '.': self.close() return self.xtime_reversion_hook(path, xtl, xtr) logging.debug("entering " + path) dem = self.master.server.entries(path) pargfid = self.master.server.gfid(path) if isinstance(pargfid, int): logging.warn('skipping directory %s' % (path)) for e in dem: bname = e e = os.path.join(path, e) st = lstat(e) if isinstance(st, int): logging.warn('%s got purged in the interim..' % e) continue gfid = self.master.server.gfid(e) if isinstance(gfid, int): logging.warn('skipping entry %s..' % (e)) continue xte = self.xtime(e) if isinstance(xte, int): raise GsyncdError('master is corrupt') if not self.need_sync(e, xte, xtr): continue mo = st.st_mode if stat.S_ISDIR(mo): self.write_entry_change("E", [gfid, 'MKDIR', escape(os.path.join(pargfid, bname))]) self.crawl(e, xtr) elif stat.S_ISLNK(mo): rl = errno_wrap(os.readlink, [en], [ENOENT]) if isinstance(rl, int): continue self.write_entry_change("E", [gfid, 'SYMLINK', escape(os.path.join(pargfid, bname)), rl]) else: # if a file has a hardlink, create a Changelog entry as 'LINK' so the slave # side will decide if to create the new entry, or to create link. if st.st_nlink == 1: self.write_entry_change("E", [gfid, 'MKNOD', escape(os.path.join(pargfid, bname))]) else: self.write_entry_change("E", [gfid, 'LINK', escape(os.path.join(pargfid, bname))]) if stat.S_ISREG(mo): self.write_entry_change("D", [gfid]) if path == '.': logging.info('processing xsync changelog %s' % self.fname()) self.close() self.process([self.fname()], done) self.upd_stime(xtl)
def terminate(): # relax one SIGTERM by setting a handler that sets back # standard handler set_term_handler(lambda *a: set_term_handler()) # give a chance to graceful exit errno_wrap(os.kill, [-os.getpid(), signal.SIGTERM], [ESRCH])
def subcmd_delete(args): import logging import shutil import glob import sys from errno import ENOENT, ENODATA import struct from syncdutils import GsyncdError, Xattr, errno_wrap import gsyncdconfig as gconf logging.info('geo-replication delete') # remove the stime xattr from all the brick paths so that # a re-create of a session will start sync all over again stime_xattr_prefix = gconf.get('stime-xattr-prefix', None) # Delete pid file, status file, socket file cleanup_paths = [] cleanup_paths.append(gconf.get("pid-file")) # Cleanup Session dir try: shutil.rmtree(gconf.get("georep-session-working-dir")) except (IOError, OSError): if sys.exc_info()[1].errno == ENOENT: pass else: raise GsyncdError( 'Error while removing working dir: %s' % gconf.get("georep-session-working-dir")) # Cleanup changelog working dirs try: shutil.rmtree(gconf.get("working-dir")) except (IOError, OSError): if sys.exc_info()[1].errno == ENOENT: pass else: raise GsyncdError( 'Error while removing working dir: %s' % gconf.get("working-dir")) for path in cleanup_paths: # To delete temp files for f in glob.glob(path + "*"): _unlink(f) if args.reset_sync_time and stime_xattr_prefix: for p in args.paths: if p != "": # set stime to (0,0) to trigger full volume content resync # to slave on session recreation # look at master.py::Xcrawl hint: zero_zero errno_wrap(Xattr.lsetxattr, (p, stime_xattr_prefix + ".stime", struct.pack("!II", 0, 0)), [ENOENT, ENODATA]) errno_wrap(Xattr.lremovexattr, (p, stime_xattr_prefix + ".entry_stime"), [ENOENT, ENODATA]) return
def monitor(self, w, argv, cpids, agents, slave_vol, slave_host, master): """the monitor loop Basic logic is a blantantly simple blunt heuristics: if spawned client survives 60 secs, it's considered OK. This servers us pretty well as it's not vulneralbe to any kind of irregular behavior of the child... ... well, except for one: if children is hung up on waiting for some event, it can survive aeons, still will be defunct. So we tweak the above logic to expect the worker to send us a signal within 60 secs (in the form of closing its end of a pipe). The worker does this when it's done with the setup stage ready to enter the service loop (note it's the setup stage which is vulnerable to hangs -- the full blown worker blows up on EPIPE if the net goes down, due to the keep-alive thread) """ if not self.status.get(w[0], None): self.status[w[0]] = GeorepStatus(gconf.state_file, w[0]) set_monitor_status(gconf.state_file, self.ST_STARTED) self.status[w[0]].set_worker_status(self.ST_INIT) ret = 0 def nwait(p, o=0): try: p2, r = waitpid(p, o) if not p2: return return r except OSError as e: # no child process, this happens if the child process # already died and has been cleaned up if e.errno == ECHILD: return -1 else: raise def exit_signalled(s): """ child teminated due to receipt of SIGUSR1 """ return (os.WIFSIGNALED(s) and (os.WTERMSIG(s) == signal.SIGUSR1)) def exit_status(s): if os.WIFEXITED(s): return os.WEXITSTATUS(s) return 1 conn_timeout = int(gconf.connection_timeout) while ret in (0, 1): remote_host = w[1] # Check the status of the connected slave node # If the connected slave node is down then try to connect to # different up node. m = re.match("(ssh|gluster|file):\/\/(.+)@([^:]+):(.+)", remote_host) if m: current_slave_host = m.group(3) slave_up_hosts = get_slave_bricks_status( slave_host, slave_vol) if current_slave_host not in slave_up_hosts: if len(slave_up_hosts) > 0: remote_host = "%s://%s@%s:%s" % (m.group(1), m.group(2), random.choice( slave_up_hosts), m.group(4)) # Spawn the worker and agent in lock to avoid fd leak self.lock.acquire() logging.info('-' * conn_timeout) logging.info('starting gsyncd worker') # Couple of pipe pairs for RPC communication b/w # worker and changelog agent. # read/write end for agent (ra, ww) = os.pipe() # read/write end for worker (rw, wa) = os.pipe() # spawn the agent process apid = os.fork() if apid == 0: os.close(rw) os.close(ww) os.execv(sys.executable, argv + ['--local-path', w[0], '--agent', '--rpc-fd', ','.join([str(ra), str(wa), str(rw), str(ww)])]) pr, pw = os.pipe() cpid = os.fork() if cpid == 0: os.close(pr) os.close(ra) os.close(wa) os.execv(sys.executable, argv + ['--feedback-fd', str(pw), '--local-path', w[0], '--local-id', '.' + escape(w[0]), '--rpc-fd', ','.join([str(rw), str(ww), str(ra), str(wa)]), '--subvol-num', str(w[2])] + (['--is-hottier'] if w[3] else []) + ['--resource-remote', remote_host]) cpids.add(cpid) agents.add(apid) os.close(pw) # close all RPC pipes in monitor os.close(ra) os.close(wa) os.close(rw) os.close(ww) self.lock.release() t0 = time.time() so = select((pr,), (), (), conn_timeout)[0] os.close(pr) if so: ret = nwait(cpid, os.WNOHANG) ret_agent = nwait(apid, os.WNOHANG) if ret_agent is not None: # Agent is died Kill Worker logging.info("Changelog Agent died, " "Aborting Worker(%s)" % w[0]) errno_wrap(os.kill, [cpid, signal.SIGKILL], [ESRCH]) nwait(cpid) nwait(apid) if ret is not None: logging.info("worker(%s) died before establishing " "connection" % w[0]) nwait(apid) # wait for agent else: logging.debug("worker(%s) connected" % w[0]) while time.time() < t0 + conn_timeout: ret = nwait(cpid, os.WNOHANG) ret_agent = nwait(apid, os.WNOHANG) if ret is not None: logging.info("worker(%s) died in startup " "phase" % w[0]) nwait(apid) # wait for agent break if ret_agent is not None: # Agent is died Kill Worker logging.info("Changelog Agent died, Aborting " "Worker(%s)" % w[0]) errno_wrap(os.kill, [cpid, signal.SIGKILL], [ESRCH]) nwait(cpid) nwait(apid) break time.sleep(1) else: logging.info("worker(%s) not confirmed in %d sec, " "aborting it" % (w[0], conn_timeout)) errno_wrap(os.kill, [cpid, signal.SIGKILL], [ESRCH]) nwait(apid) # wait for agent ret = nwait(cpid) if ret is None: self.status[w[0]].set_worker_status(self.ST_STABLE) # If worker dies, agent terminates on EOF. # So lets wait for agent first. nwait(apid) ret = nwait(cpid) if exit_signalled(ret): ret = 0 else: ret = exit_status(ret) if ret in (0, 1): self.status[w[0]].set_worker_status(self.ST_FAULTY) time.sleep(10) self.status[w[0]].set_worker_status(self.ST_INCON) return ret
def monitor(self, w, argv, cpids, agents, slave_vol, slave_host, master): """the monitor loop Basic logic is a blantantly simple blunt heuristics: if spawned client survives 60 secs, it's considered OK. This servers us pretty well as it's not vulneralbe to any kind of irregular behavior of the child... ... well, except for one: if children is hung up on waiting for some event, it can survive aeons, still will be defunct. So we tweak the above logic to expect the worker to send us a signal within 60 secs (in the form of closing its end of a pipe). The worker does this when it's done with the setup stage ready to enter the service loop (note it's the setup stage which is vulnerable to hangs -- the full blown worker blows up on EPIPE if the net goes down, due to the keep-alive thread) """ if not self.status.get(w[0]['dir'], None): self.status[w[0]['dir']] = GeorepStatus( gconf.state_file, w[0]['dir'], master, "%s::%s" % (slave_host, slave_vol)) set_monitor_status(gconf.state_file, self.ST_STARTED) self.status[w[0]['dir']].set_worker_status(self.ST_INIT) ret = 0 def nwait(p, o=0): try: p2, r = waitpid(p, o) if not p2: return return r except OSError as e: # no child process, this happens if the child process # already died and has been cleaned up if e.errno == ECHILD: return -1 else: raise def exit_signalled(s): """ child teminated due to receipt of SIGUSR1 """ return (os.WIFSIGNALED(s) and (os.WTERMSIG(s) == signal.SIGUSR1)) def exit_status(s): if os.WIFEXITED(s): return os.WEXITSTATUS(s) return 1 conn_timeout = int(gconf.connection_timeout) while ret in (0, 1): remote_host = w[1] # Check the status of the connected slave node # If the connected slave node is down then try to connect to # different up node. m = re.match("(ssh|gluster|file):\/\/(.+)@([^:]+):(.+)", remote_host) if m: current_slave_host = m.group(3) slave_up_hosts = get_slave_bricks_status(slave_host, slave_vol) if current_slave_host not in slave_up_hosts: if len(slave_up_hosts) > 0: remote_host = "%s://%s@%s:%s" % (m.group(1), m.group( 2), random.choice(slave_up_hosts), m.group(4)) # Spawn the worker and agent in lock to avoid fd leak self.lock.acquire() logging.info('starting gsyncd worker(%s). Slave node: %s' % (w[0]['dir'], remote_host)) # Couple of pipe pairs for RPC communication b/w # worker and changelog agent. # read/write end for agent (ra, ww) = os.pipe() # read/write end for worker (rw, wa) = os.pipe() # spawn the agent process apid = os.fork() if apid == 0: os.close(rw) os.close(ww) os.execv( sys.executable, argv + [ '--local-path', w[0]['dir'], '--agent', '--rpc-fd', ','.join([str(ra), str(wa), str(rw), str(ww)]) ]) pr, pw = os.pipe() cpid = os.fork() if cpid == 0: os.close(pr) os.close(ra) os.close(wa) os.execv( sys.executable, argv + [ '--feedback-fd', str(pw), '--local-path', w[0]['dir'], '--local-id', '.' + escape(w[0]['dir']), '--rpc-fd', ','.join( [str(rw), str(ww), str(ra), str(wa)]), '--subvol-num', str(w[2]) ] + (['--is-hottier'] if w[3] else []) + ['--resource-remote', remote_host]) cpids.add(cpid) agents.add(apid) os.close(pw) # close all RPC pipes in monitor os.close(ra) os.close(wa) os.close(rw) os.close(ww) self.lock.release() t0 = time.time() so = select((pr, ), (), (), conn_timeout)[0] os.close(pr) if so: ret = nwait(cpid, os.WNOHANG) ret_agent = nwait(apid, os.WNOHANG) if ret_agent is not None: # Agent is died Kill Worker logging.info("Changelog Agent died, " "Aborting Worker(%s)" % w[0]['dir']) errno_wrap(os.kill, [cpid, signal.SIGKILL], [ESRCH]) nwait(cpid) nwait(apid) if ret is not None: logging.info("worker(%s) died before establishing " "connection" % w[0]['dir']) nwait(apid) # wait for agent else: logging.debug("worker(%s) connected" % w[0]['dir']) while time.time() < t0 + conn_timeout: ret = nwait(cpid, os.WNOHANG) ret_agent = nwait(apid, os.WNOHANG) if ret is not None: logging.info("worker(%s) died in startup " "phase" % w[0]['dir']) nwait(apid) # wait for agent break if ret_agent is not None: # Agent is died Kill Worker logging.info("Changelog Agent died, Aborting " "Worker(%s)" % w[0]['dir']) errno_wrap(os.kill, [cpid, signal.SIGKILL], [ESRCH]) nwait(cpid) nwait(apid) break time.sleep(1) else: logging.info("worker(%s) not confirmed in %d sec, " "aborting it" % (w[0]['dir'], conn_timeout)) errno_wrap(os.kill, [cpid, signal.SIGKILL], [ESRCH]) nwait(apid) # wait for agent ret = nwait(cpid) if ret is None: self.status[w[0]['dir']].set_worker_status(self.ST_STABLE) # If worker dies, agent terminates on EOF. # So lets wait for agent first. nwait(apid) ret = nwait(cpid) if exit_signalled(ret): ret = 0 else: ret = exit_status(ret) if ret in (0, 1): self.status[w[0]['dir']].set_worker_status(self.ST_FAULTY) gf_event(EVENT_GEOREP_FAULTY, master_volume=master.volume, master_node=w[0]['host'], slave_host=slave_host, slave_volume=slave_vol, current_slave_host=current_slave_host, brick_path=w[0]['dir']) time.sleep(10) self.status[w[0]['dir']].set_worker_status(self.ST_INCON) return ret
def entry_ops(cls, entries): pfx = gauxpfx() logging.debug("entries: %s" % repr(entries)) # regular file def entry_pack_reg(gf, bn, mo, uid, gid): blen = len(bn) return struct.pack(cls._fmt_mknod(blen), uid, gid, gf, mo, bn, stat.S_IMODE(mo), 0, umask()) def entry_pack_reg_stat(gf, bn, st): blen = len(bn) mo = st["mode"] return struct.pack(cls._fmt_mknod(blen), st["uid"], st["gid"], gf, mo, bn, stat.S_IMODE(mo), 0, umask()) # mkdir def entry_pack_mkdir(gf, bn, mo, uid, gid): blen = len(bn) return struct.pack(cls._fmt_mkdir(blen), uid, gid, gf, mo, bn, stat.S_IMODE(mo), umask()) # symlink def entry_pack_symlink(gf, bn, lnk, st): blen = len(bn) llen = len(lnk) return struct.pack(cls._fmt_symlink(blen, llen), st["uid"], st["gid"], gf, st["mode"], bn, lnk) def entry_purge(entry, gfid): # This is an extremely racy code and needs to be fixed ASAP. # The GFID check here is to be sure that the pargfid/bname # to be purged is the GFID gotten from the changelog. # (a stat(changelog_gfid) would also be valid here) # The race here is between the GFID check and the purge. disk_gfid = cls.gfid_mnt(entry) if isinstance(disk_gfid, int): return if not gfid == disk_gfid: return er = errno_wrap(os.unlink, [entry], [ENOENT, EISDIR]) if isinstance(er, int): if er == EISDIR: er = errno_wrap(os.rmdir, [entry], [ENOENT, ENOTEMPTY]) if er == ENOTEMPTY: return er for e in entries: blob = None op = e["op"] gfid = e["gfid"] entry = e["entry"] (pg, bname) = entry2pb(entry) if op in ["RMDIR", "UNLINK"]: while True: er = entry_purge(entry, gfid) if isinstance(er, int): if er == ENOTEMPTY and op == "RMDIR": er1 = errno_wrap(shutil.rmtree, [os.path.join(pg, bname)], [ENOENT]) if not isinstance(er1, int): logging.info("Removed %s/%s recursively" % (pg, bname)) break logging.warn("Failed to remove %s => %s/%s. %s" % (gfid, pg, bname, os.strerror(er))) time.sleep(1) else: break elif op in ["CREATE", "MKNOD"]: blob = entry_pack_reg(gfid, bname, e["mode"], e["uid"], e["gid"]) elif op == "MKDIR": blob = entry_pack_mkdir(gfid, bname, e["mode"], e["uid"], e["gid"]) elif op == "LINK": slink = os.path.join(pfx, gfid) st = lstat(slink) if isinstance(st, int): (pg, bname) = entry2pb(entry) blob = entry_pack_reg_stat(gfid, bname, e["stat"]) else: errno_wrap(os.link, [slink, entry], [ENOENT, EEXIST]) elif op == "SYMLINK": blob = entry_pack_symlink(gfid, bname, e["link"], e["stat"]) elif op == "RENAME": en = e["entry1"] st = lstat(entry) if isinstance(st, int): if e["stat"] and not stat.S_ISDIR(e["stat"]["mode"]): (pg, bname) = entry2pb(en) blob = entry_pack_reg_stat(gfid, bname, e["stat"]) else: errno_wrap(os.rename, [entry, en], [ENOENT, EEXIST]) if blob: errno_wrap(Xattr.lsetxattr, [pg, "glusterfs.gfid.newfile", blob], [EEXIST], [ENOENT, ESTALE, EINVAL])
def process_change(self, change, done, retry): pfx = gauxpfx() clist = [] entries = [] datas = set() # basic crawl stats: files and bytes files_pending = {'count': 0, 'purge': 0, 'bytes': 0, 'files': []} try: f = open(change, "r") clist = f.readlines() f.close() except IOError: raise def edct(op, **ed): dct = {} dct['op'] = op for k in ed: if k == 'stat': st = ed[k] dst = dct['stat'] = {} dst['uid'] = st.st_uid dst['gid'] = st.st_gid dst['mode'] = st.st_mode else: dct[k] = ed[k] return dct # regular file update: bytes & count def _update_reg(entry, size): if not entry in files_pending['files']: files_pending['count'] += 1 files_pending['bytes'] += size files_pending['files'].append(entry) # updates for directories, symlinks etc.. def _update_rest(): files_pending['count'] += 1 # entry count def entry_update(entry, size, mode): if stat.S_ISREG(mode): _update_reg(entry, size) else: _update_rest() # purge count def purge_update(): files_pending['purge'] += 1 for e in clist: e = e.strip() et = e[self.IDX_START:self.IDX_END] ec = e[self.IDX_END:].split(' ') if et in self.TYPE_ENTRY: ty = ec[self.POS_TYPE] en = unescape(os.path.join(pfx, ec[self.POS_ENTRY1])) gfid = ec[self.POS_GFID] # definitely need a better way bucketize entry ops if ty in ['UNLINK', 'RMDIR']: purge_update() entries.append(edct(ty, gfid=gfid, entry=en)) continue go = os.path.join(pfx, gfid) st = lstat(go) if isinstance(st, int): if ty == 'RENAME': entries.append(edct('UNLINK', gfid=gfid, entry=en)) else: logging.debug('file %s got purged in the interim' % go) continue entry_update(go, st.st_size, st.st_mode) if ty in ['CREATE', 'MKDIR', 'MKNOD']: entries.append(edct(ty, stat=st, entry=en, gfid=gfid)) elif ty == 'LINK': entries.append(edct(ty, stat=st, entry=en, gfid=gfid)) elif ty == 'SYMLINK': rl = errno_wrap(os.readlink, [en], [ENOENT]) if isinstance(rl, int): continue entries.append( edct(ty, stat=st, entry=en, gfid=gfid, link=rl)) elif ty == 'RENAME': e2 = unescape(os.path.join(pfx, ec[self.POS_ENTRY2])) entries.append( edct(ty, gfid=gfid, entry=en, entry1=e2, stat=st)) else: logging.warn('ignoring %s [op %s]' % (gfid, ty)) elif et in self.TYPE_GFID: go = os.path.join(pfx, ec[0]) st = lstat(go) if isinstance(st, int): logging.debug('file %s got purged in the interim' % go) continue entry_update(go, st.st_size, st.st_mode) datas.update([go]) logging.debug('entries: %s' % repr(entries)) if not retry: self.update_cumulative_stats(files_pending) # sync namespace if (entries): self.slave.server.entry_ops(entries) # sync data if self.syncdata(datas): if done: self.master.server.changelog_done(change) return True
def entry_ops(cls, entries): pfx = gauxpfx() logging.debug("entries: %s" % repr(entries)) # regular file def entry_pack_reg(gf, bn, st): blen = len(bn) mo = st["mode"] return struct.pack(cls._fmt_mknod(blen), st["uid"], st["gid"], gf, mo, bn, stat.S_IMODE(mo), 0, umask()) # mkdir def entry_pack_mkdir(gf, bn, st): blen = len(bn) mo = st["mode"] return struct.pack(cls._fmt_mkdir(blen), st["uid"], st["gid"], gf, mo, bn, stat.S_IMODE(mo), umask()) # symlink def entry_pack_symlink(gf, bn, lnk, st): blen = len(bn) llen = len(lnk) return struct.pack(cls._fmt_symlink(blen, llen), st["uid"], st["gid"], gf, st["mode"], bn, lnk) def entry_purge(entry, gfid): # This is an extremely racy code and needs to be fixed ASAP. # The GFID check here is to be sure that the pargfid/bname # to be purged is the GFID gotten from the changelog. # (a stat(changelog_gfid) would also be valid here) # The race here is between the GFID check and the purge. disk_gfid = cls.gfid(entry) if isinstance(disk_gfid, int): return if not gfid == disk_gfid: return er = errno_wrap(os.unlink, [entry], [ENOENT, EISDIR]) if isinstance(er, int): if er == EISDIR: er = errno_wrap(os.rmdir, [entry], [ENOENT, ENOTEMPTY]) if er == ENOTEMPTY: return er for e in entries: blob = None op = e["op"] gfid = e["gfid"] entry = e["entry"] (pg, bname) = entry2pb(entry) if op in ["RMDIR", "UNLINK"]: while True: er = entry_purge(entry, gfid) if isinstance(er, int): time.sleep(1) else: break elif op == "CREATE": blob = entry_pack_reg(gfid, bname, e["stat"]) elif op == "MKDIR": blob = entry_pack_mkdir(gfid, bname, e["stat"]) elif op == "LINK": errno_wrap(os.link, [os.path.join(pfx, gfid), entry], [ENOENT, EEXIST]) elif op == "SYMLINK": blob = entry_pack_symlink(gfid, bname, e["link"], e["stat"]) elif op == "RENAME": en = e["entry1"] errno_wrap(os.rename, [entry, en], [ENOENT, EEXIST]) if blob: errno_wrap(Xattr.lsetxattr_l, [pg, "glusterfs.gfid.newfile", blob], [ENOENT, EEXIST])
def monitor(self, w, argv, cpids, agents, slave_vol, slave_host, master, suuid, slavenodes): """the monitor loop Basic logic is a blantantly simple blunt heuristics: if spawned client survives 60 secs, it's considered OK. This servers us pretty well as it's not vulneralbe to any kind of irregular behavior of the child... ... well, except for one: if children is hung up on waiting for some event, it can survive aeons, still will be defunct. So we tweak the above logic to expect the worker to send us a signal within 60 secs (in the form of closing its end of a pipe). The worker does this when it's done with the setup stage ready to enter the service loop (note it's the setup stage which is vulnerable to hangs -- the full blown worker blows up on EPIPE if the net goes down, due to the keep-alive thread) """ if not self.status.get(w[0]['dir'], None): self.status[w[0]['dir']] = GeorepStatus(gconf.get("state-file"), w[0]['host'], w[0]['dir'], w[0]['uuid'], master, "%s::%s" % (slave_host, slave_vol)) ret = 0 def nwait(p, o=0): try: p2, r = waitpid(p, o) if not p2: return return r except OSError as e: # no child process, this happens if the child process # already died and has been cleaned up if e.errno == ECHILD: return -1 else: raise def exit_signalled(s): """ child terminated due to receipt of SIGUSR1 """ return (os.WIFSIGNALED(s) and (os.WTERMSIG(s) == signal.SIGUSR1)) def exit_status(s): if os.WIFEXITED(s): return os.WEXITSTATUS(s) return 1 conn_timeout = gconf.get("connection-timeout") while ret in (0, 1): remote_user, remote_host = w[1][0].split("@") remote_id = w[1][1] # Check the status of the connected slave node # If the connected slave node is down then try to connect to # different up node. current_slave_host = remote_host slave_up_hosts = get_up_nodes(slavenodes, gconf.get("ssh-port")) if (current_slave_host, remote_id) not in slave_up_hosts: if len(slave_up_hosts) > 0: remote_new = random.choice(slave_up_hosts) remote_host = "%s@%s" % (remote_user, remote_new[0]) remote_id = remote_new[1] # Spawn the worker and agent in lock to avoid fd leak self.lock.acquire() self.status[w[0]['dir']].set_worker_status(self.ST_INIT) logging.info(lf('starting gsyncd worker', brick=w[0]['dir'], slave_node=remote_host)) # Couple of pipe pairs for RPC communication b/w # worker and changelog agent. # read/write end for agent (ra, ww) = pipe() # read/write end for worker (rw, wa) = pipe() # spawn the agent process apid = os.fork() if apid == 0: os.close(rw) os.close(ww) args_to_agent = argv + [ 'agent', rconf.args.master, rconf.args.slave, '--local-path', w[0]['dir'], '--local-node', w[0]['host'], '--local-node-id', w[0]['uuid'], '--slave-id', suuid, '--rpc-fd', ','.join([str(ra), str(wa), str(rw), str(ww)]) ] if rconf.args.config_file is not None: args_to_agent += ['-c', rconf.args.config_file] if rconf.args.debug: args_to_agent.append("--debug") os.execv(sys.executable, args_to_agent) pr, pw = pipe() cpid = os.fork() if cpid == 0: os.close(pr) os.close(ra) os.close(wa) args_to_worker = argv + [ 'worker', rconf.args.master, rconf.args.slave, '--feedback-fd', str(pw), '--local-path', w[0]['dir'], '--local-node', w[0]['host'], '--local-node-id', w[0]['uuid'], '--slave-id', suuid, '--rpc-fd', ','.join([str(rw), str(ww), str(ra), str(wa)]), '--subvol-num', str(w[2]), '--resource-remote', remote_host, '--resource-remote-id', remote_id ] if rconf.args.config_file is not None: args_to_worker += ['-c', rconf.args.config_file] if w[3]: args_to_worker.append("--is-hottier") if rconf.args.debug: args_to_worker.append("--debug") access_mount = gconf.get("access-mount") if access_mount: os.execv(sys.executable, args_to_worker) else: if unshare_propagation_supported(): logging.debug("Worker would mount volume privately") unshare_cmd = ['unshare', '-m', '--propagation', 'private'] cmd = unshare_cmd + args_to_worker os.execvp("unshare", cmd) else: logging.debug("Mount is not private. It would be lazy" " umounted") os.execv(sys.executable, args_to_worker) cpids.add(cpid) agents.add(apid) os.close(pw) # close all RPC pipes in monitor os.close(ra) os.close(wa) os.close(rw) os.close(ww) self.lock.release() t0 = time.time() so = select((pr,), (), (), conn_timeout)[0] os.close(pr) if so: ret = nwait(cpid, os.WNOHANG) ret_agent = nwait(apid, os.WNOHANG) if ret_agent is not None: # Agent is died Kill Worker logging.info(lf("Changelog Agent died, Aborting Worker", brick=w[0]['dir'])) errno_wrap(os.kill, [cpid, signal.SIGKILL], [ESRCH]) nwait(cpid) nwait(apid) if ret is not None: logging.info(lf("worker died before establishing " "connection", brick=w[0]['dir'])) nwait(apid) # wait for agent else: logging.debug("worker(%s) connected" % w[0]['dir']) while time.time() < t0 + conn_timeout: ret = nwait(cpid, os.WNOHANG) ret_agent = nwait(apid, os.WNOHANG) if ret is not None: logging.info(lf("worker died in startup phase", brick=w[0]['dir'])) nwait(apid) # wait for agent break if ret_agent is not None: # Agent is died Kill Worker logging.info(lf("Changelog Agent died, Aborting " "Worker", brick=w[0]['dir'])) errno_wrap(os.kill, [cpid, signal.SIGKILL], [ESRCH]) nwait(cpid) nwait(apid) break time.sleep(1) else: logging.info( lf("Worker not confirmed after wait, aborting it. " "Gsyncd invocation on remote slave via SSH or " "gluster master mount might have hung. Please " "check the above logs for exact issue and check " "master or slave volume for errors. Restarting " "master/slave volume accordingly might help.", brick=w[0]['dir'], timeout=conn_timeout)) errno_wrap(os.kill, [cpid, signal.SIGKILL], [ESRCH]) nwait(apid) # wait for agent ret = nwait(cpid) if ret is None: # If worker dies, agent terminates on EOF. # So lets wait for agent first. nwait(apid) ret = nwait(cpid) if exit_signalled(ret): ret = 0 else: ret = exit_status(ret) if ret in (0, 1): self.status[w[0]['dir']].set_worker_status(self.ST_FAULTY) gf_event(EVENT_GEOREP_FAULTY, master_volume=master.volume, master_node=w[0]['host'], master_node_id=w[0]['uuid'], slave_host=slave_host, slave_volume=slave_vol, current_slave_host=current_slave_host, brick_path=w[0]['dir']) time.sleep(10) self.status[w[0]['dir']].set_worker_status(self.ST_INCON) return ret
def gfid(cls, gfidpath): return errno_wrap( Xattr.lgetxattr, [gfidpath, 'glusterfs.gfid', cls.GX_GFID_CANONICAL_LEN], [ENOENT])
def monitor(self, w, argv, cpids, agents, slave_vol, slave_host, master, suuid, slavenodes): """the monitor loop Basic logic is a blantantly simple blunt heuristics: if spawned client survives 60 secs, it's considered OK. This servers us pretty well as it's not vulneralbe to any kind of irregular behavior of the child... ... well, except for one: if children is hung up on waiting for some event, it can survive aeons, still will be defunct. So we tweak the above logic to expect the worker to send us a signal within 60 secs (in the form of closing its end of a pipe). The worker does this when it's done with the setup stage ready to enter the service loop (note it's the setup stage which is vulnerable to hangs -- the full blown worker blows up on EPIPE if the net goes down, due to the keep-alive thread) """ if not self.status.get(w[0]['dir'], None): self.status[w[0]['dir']] = GeorepStatus( gconf.get("state-file"), w[0]['host'], w[0]['dir'], w[0]['uuid'], master, "%s::%s" % (slave_host, slave_vol)) set_monitor_status(gconf.get("state-file"), self.ST_STARTED) self.status[w[0]['dir']].set_worker_status(self.ST_INIT) ret = 0 def nwait(p, o=0): try: p2, r = waitpid(p, o) if not p2: return return r except OSError as e: # no child process, this happens if the child process # already died and has been cleaned up if e.errno == ECHILD: return -1 else: raise def exit_signalled(s): """ child teminated due to receipt of SIGUSR1 """ return (os.WIFSIGNALED(s) and (os.WTERMSIG(s) == signal.SIGUSR1)) def exit_status(s): if os.WIFEXITED(s): return os.WEXITSTATUS(s) return 1 conn_timeout = gconf.get("connection-timeout") while ret in (0, 1): remote_user, remote_host = w[1][0].split("@") remote_id = w[1][1] # Check the status of the connected slave node # If the connected slave node is down then try to connect to # different up node. current_slave_host = remote_host slave_up_hosts = get_up_nodes(slavenodes, gconf.get("ssh-port")) if (current_slave_host, remote_id) not in slave_up_hosts: if len(slave_up_hosts) > 0: remote_new = random.choice(slave_up_hosts) remote_host = "%s@%s" % (remote_user, remote_new[0]) remote_id = remote_new[1] # Spawn the worker and agent in lock to avoid fd leak self.lock.acquire() logging.info( lf('starting gsyncd worker', brick=w[0]['dir'], slave_node=remote_host)) # Couple of pipe pairs for RPC communication b/w # worker and changelog agent. # read/write end for agent (ra, ww) = os.pipe() # read/write end for worker (rw, wa) = os.pipe() # spawn the agent process apid = os.fork() if apid == 0: os.close(rw) os.close(ww) args_to_agent = argv + [ 'agent', rconf.args.master, rconf.args.slave, '--local-path', w[0]['dir'], '--local-node', w[0]['host'], '--local-node-id', w[0]['uuid'], '--slave-id', suuid, '--rpc-fd', ','.join( [str(ra), str(wa), str(rw), str(ww)]) ] if rconf.args.config_file is not None: args_to_agent += ['-c', rconf.args.config_file] if rconf.args.debug: args_to_agent.append("--debug") os.execv(sys.executable, args_to_agent) pr, pw = os.pipe() cpid = os.fork() if cpid == 0: os.close(pr) os.close(ra) os.close(wa) args_to_worker = argv + [ 'worker', rconf.args.master, rconf.args.slave, '--feedback-fd', str(pw), '--local-path', w[0]['dir'], '--local-node', w[0]['host'], '--local-node-id', w[0]['uuid'], '--slave-id', suuid, '--rpc-fd', ','.join( [str(rw), str(ww), str(ra), str(wa)]), '--subvol-num', str(w[2]), '--resource-remote', remote_host, '--resource-remote-id', remote_id ] if rconf.args.config_file is not None: args_to_worker += ['-c', rconf.args.config_file] if w[3]: args_to_worker.append("--is-hottier") if rconf.args.debug: args_to_worker.append("--debug") os.execv(sys.executable, args_to_worker) cpids.add(cpid) agents.add(apid) os.close(pw) # close all RPC pipes in monitor os.close(ra) os.close(wa) os.close(rw) os.close(ww) self.lock.release() t0 = time.time() so = select((pr, ), (), (), conn_timeout)[0] os.close(pr) if so: ret = nwait(cpid, os.WNOHANG) ret_agent = nwait(apid, os.WNOHANG) if ret_agent is not None: # Agent is died Kill Worker logging.info( lf("Changelog Agent died, Aborting Worker", brick=w[0]['dir'])) errno_wrap(os.kill, [cpid, signal.SIGKILL], [ESRCH]) nwait(cpid) nwait(apid) if ret is not None: logging.info( lf("worker died before establishing " "connection", brick=w[0]['dir'])) nwait(apid) # wait for agent else: logging.debug("worker(%s) connected" % w[0]['dir']) while time.time() < t0 + conn_timeout: ret = nwait(cpid, os.WNOHANG) ret_agent = nwait(apid, os.WNOHANG) if ret is not None: logging.info( lf("worker died in startup phase", brick=w[0]['dir'])) nwait(apid) # wait for agent break if ret_agent is not None: # Agent is died Kill Worker logging.info( lf("Changelog Agent died, Aborting " "Worker", brick=w[0]['dir'])) errno_wrap(os.kill, [cpid, signal.SIGKILL], [ESRCH]) nwait(cpid) nwait(apid) break time.sleep(1) else: logging.info( lf( "Worker not confirmed after wait, aborting it. " "Gsyncd invocation on remote slave via SSH or " "gluster master mount might have hung. Please " "check the above logs for exact issue and check " "master or slave volume for errors. Restarting " "master/slave volume accordingly might help.", brick=w[0]['dir'], timeout=conn_timeout)) errno_wrap(os.kill, [cpid, signal.SIGKILL], [ESRCH]) nwait(apid) # wait for agent ret = nwait(cpid) if ret is None: # If worker dies, agent terminates on EOF. # So lets wait for agent first. nwait(apid) ret = nwait(cpid) if exit_signalled(ret): ret = 0 else: ret = exit_status(ret) if ret in (0, 1): self.status[w[0]['dir']].set_worker_status(self.ST_FAULTY) gf_event(EVENT_GEOREP_FAULTY, master_volume=master.volume, master_node=w[0]['host'], master_node_id=w[0]['uuid'], slave_host=slave_host, slave_volume=slave_vol, current_slave_host=current_slave_host, brick_path=w[0]['dir']) time.sleep(10) self.status[w[0]['dir']].set_worker_status(self.ST_INCON) return ret
def process_change(self, change, done, retry): pfx = gauxpfx() clist = [] entries = [] datas = set() # basic crawl stats: files and bytes files_pending = {'count': 0, 'purge': 0, 'bytes': 0, 'files': []} try: f = open(change, "r") clist = f.readlines() f.close() except IOError: raise def edct(op, **ed): dct = {} dct['op'] = op for k in ed: if k == 'stat': st = ed[k] dst = dct['stat'] = {} dst['uid'] = st.st_uid dst['gid'] = st.st_gid dst['mode'] = st.st_mode else: dct[k] = ed[k] return dct # regular file update: bytes & count def _update_reg(entry, size): if not entry in files_pending['files']: files_pending['count'] += 1 files_pending['bytes'] += size files_pending['files'].append(entry) # updates for directories, symlinks etc.. def _update_rest(): files_pending['count'] += 1 # entry count def entry_update(entry, size, mode): if stat.S_ISREG(mode): _update_reg(entry, size) else: _update_rest() # purge count def purge_update(): files_pending['purge'] += 1 for e in clist: e = e.strip() et = e[self.IDX_START:self.IDX_END] ec = e[self.IDX_END:].split(' ') if et in self.TYPE_ENTRY: ty = ec[self.POS_TYPE] en = unescape(os.path.join(pfx, ec[self.POS_ENTRY1])) gfid = ec[self.POS_GFID] # definitely need a better way bucketize entry ops if ty in ['UNLINK', 'RMDIR']: purge_update() entries.append(edct(ty, gfid=gfid, entry=en)) continue go = os.path.join(pfx, gfid) st = lstat(go) if isinstance(st, int): if ty == 'RENAME': entries.append(edct('UNLINK', gfid=gfid, entry=en)) else: logging.debug('file %s got purged in the interim' % go) continue entry_update(go, st.st_size, st.st_mode) if ty in ['CREATE', 'MKDIR', 'MKNOD']: entries.append(edct(ty, stat=st, entry=en, gfid=gfid)) elif ty == 'LINK': entries.append(edct(ty, stat=st, entry=en, gfid=gfid)) elif ty == 'SYMLINK': rl = errno_wrap(os.readlink, [en], [ENOENT]) if isinstance(rl, int): continue entries.append(edct(ty, stat=st, entry=en, gfid=gfid, link=rl)) elif ty == 'RENAME': e2 = unescape(os.path.join(pfx, ec[self.POS_ENTRY2])) entries.append(edct(ty, gfid=gfid, entry=en, entry1=e2, stat=st)) else: logging.warn('ignoring %s [op %s]' % (gfid, ty)) elif et in self.TYPE_GFID: go = os.path.join(pfx, ec[0]) st = lstat(go) if isinstance(st, int): logging.debug('file %s got purged in the interim' % go) continue entry_update(go, st.st_size, st.st_mode) datas.update([go]) logging.debug('entries: %s' % repr(entries)) if not retry: self.update_cumulative_stats(files_pending) # sync namespace if (entries): self.slave.server.entry_ops(entries) # sync data if self.syncdata(datas): if done: self.master.server.changelog_done(change) return True
def crawl(self, path='.', xtr=None, done=0): """ generate a CHANGELOG file consumable by process_change """ if path == '.': self.open() self.crawls += 1 if not xtr: # get the root stime and use it for all comparisons xtr = self.xtime('.', self.slave) if isinstance(xtr, int): if xtr != ENOENT: raise GsyncdError('slave is corrupt') xtr = self.minus_infinity xtl = self.xtime(path) if isinstance(xtl, int): raise GsyncdError('master is corrupt') if xtr == xtl: if path == '.': self.close() return self.xtime_reversion_hook(path, xtl, xtr) logging.debug("entering " + path) dem = self.master.server.entries(path) pargfid = self.master.server.gfid(path) if isinstance(pargfid, int): logging.warn('skipping directory %s' % (path)) for e in dem: bname = e e = os.path.join(path, e) st = lstat(e) if isinstance(st, int): logging.warn('%s got purged in the interim..' % e) continue gfid = self.master.server.gfid(e) if isinstance(gfid, int): logging.warn('skipping entry %s..' % (e)) continue xte = self.xtime(e) if isinstance(xte, int): raise GsyncdError('master is corrupt') if not self.need_sync(e, xte, xtr): continue mo = st.st_mode if stat.S_ISDIR(mo): self.write_entry_change( "E", [gfid, 'MKDIR', escape(os.path.join(pargfid, bname))]) self.crawl(e, xtr) elif stat.S_ISLNK(mo): rl = errno_wrap(os.readlink, [en], [ENOENT]) if isinstance(rl, int): continue self.write_entry_change("E", [ gfid, 'SYMLINK', escape(os.path.join(pargfid, bname)), rl ]) else: # if a file has a hardlink, create a Changelog entry as 'LINK' so the slave # side will decide if to create the new entry, or to create link. if st.st_nlink == 1: self.write_entry_change( "E", [gfid, 'MKNOD', escape(os.path.join(pargfid, bname))]) else: self.write_entry_change( "E", [gfid, 'LINK', escape(os.path.join(pargfid, bname))]) if stat.S_ISREG(mo): self.write_entry_change("D", [gfid]) if path == '.': logging.info('processing xsync changelog %s' % self.fname()) self.close() self.process([self.fname()], done) self.upd_stime(xtl)
def gfid(cls, gfidpath): return errno_wrap(Xattr.lgetxattr, [gfidpath, "glusterfs.gfid", cls.GX_GFID_CANONICAL_LEN], [ENOENT])