Example #1
def subcmd_delete(args):
    import logging
    import shutil
    import glob
    import sys
    from errno import ENOENT, ENODATA
    import struct

    from syncdutils import GsyncdError, Xattr, errno_wrap
    import gsyncdconfig as gconf

    logging.info('geo-replication delete')
    # remove the stime xattr from all the brick paths so that
    # re-creating the session will start the sync all over again
    stime_xattr_prefix = gconf.get('stime-xattr-prefix', None)

    # Delete pid file, status file, socket file
    cleanup_paths = []
    cleanup_paths.append(gconf.get("pid-file"))

    # Cleanup Session dir
    try:
        shutil.rmtree(gconf.get("georep-session-working-dir"))
    except (IOError, OSError):
        if sys.exc_info()[1].errno == ENOENT:
            pass
        else:
            raise GsyncdError('Error while removing working dir: %s' %
                              gconf.get("georep-session-working-dir"))

    # Cleanup changelog working dirs
    try:
        shutil.rmtree(gconf.get("working-dir"))
    except (IOError, OSError):
        if sys.exc_info()[1].errno == ENOENT:
            pass
        else:
            raise GsyncdError('Error while removing working dir: %s' %
                              gconf.get("working-dir"))

    for path in cleanup_paths:
        # delete the file and any temp files sharing its prefix
        for f in glob.glob(path + "*"):
            _unlink(f)

    if args.reset_sync_time and stime_xattr_prefix:
        for p in args.paths:
            if p != "":
                # set stime to (0,0) to trigger full volume content resync
                # to slave on session recreation
                # look at master.py::Xcrawl   hint: zero_zero
                errno_wrap(Xattr.lsetxattr, (p, stime_xattr_prefix + ".stime",
                                             struct.pack("!II", 0, 0)),
                           [ENOENT, ENODATA])
                errno_wrap(Xattr.lremovexattr,
                           (p, stime_xattr_prefix + ".entry_stime"),
                           [ENOENT, ENODATA])

    return
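Every snippet on this page funnels through errno_wrap from syncdutils: it invokes a function, absorbs a whitelist of errnos by returning the errno as an int instead of raising, and optionally retries on a second whitelist. That convention is why callers test isinstance(result, int). A minimal sketch of those semantics, matching the call sites above but not GlusterFS's actual implementation:

import time

def errno_wrap(call, arg=[], errnos=[], retry_errnos=[]):
    # Invoke call(*arg); return the errno (an int) instead of raising
    # when it is listed in errnos, and retry briefly when it is listed
    # in retry_errnos.
    retries = 0
    while True:
        try:
            return call(*arg)
        except OSError as ex:
            if ex.errno in errnos:
                return ex.errno        # absorbed: caller inspects the int
            if ex.errno not in retry_errnos or retries >= 10:
                raise
            retries += 1
            time.sleep(0.25)           # brief back-off before retrying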
Example #2
def monitor(local, remote):
    # Check if gsyncd restarted in pause state. If
    # yes, send SIGSTOP to negative of monitor pid
    # to go back to pause state.
    if rconf.args.pause_on_start:
        errno_wrap(os.kill, [-os.getpid(), signal.SIGSTOP], [ESRCH])
    """oh yeah, actually Monitor is used as singleton, too"""
    return Monitor().multiplex(*distribute(local, remote))
Example #3
 def wmon(w):
     cpid, _ = self.monitor(w, argv, cpids, slave_vol,
                            slave_host, master, suuid, slavenodes)
     time.sleep(1)
     self.lock.acquire()
     for cpid in cpids:
         errno_wrap(os.kill, [cpid, signal.SIGKILL], [ESRCH])
     self.lock.release()
     finalize(exval=1)
Example #4
def monitor(*resources):
    # Check if gsyncd restarted in pause state. If
    # yes, send SIGSTOP to negative of monitor pid
    # to go back to pause state.
    if gconf.pause_on_start:
        errno_wrap(os.kill, [-os.getpid(), signal.SIGSTOP], [ESRCH])

    """oh yeah, actually Monitor is used as singleton, too"""
    return Monitor().multiplex(*distribute(*resources))
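Note the negative PID in Examples 2 and 4: passing -pid to os.kill signals the entire process group (assuming the monitor is its own group leader), which is how the whole gsyncd tree is paused at once, and ESRCH is absorbed so an already-gone group is not an error. A standalone illustration using a harmless signal:

import os
import signal

os.setpgrp()                      # become our own process-group leader
pgid = os.getpgid(0)
# reaches every process in the group; gsyncd wraps this in
# errno_wrap(..., [ESRCH]) so a vanished group is silently ignored
os.kill(-pgid, signal.SIGCONT)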
Example #5
 def meta_ops(cls, meta_entries):
     logging.debug('Meta-entries: %s' % repr(meta_entries))
     for e in meta_entries:
         mode = e['stat']['mode']
         uid = e['stat']['uid']
         gid = e['stat']['gid']
         go = e['go']
         errno_wrap(os.chmod, [go, mode], [ENOENT], [ESTALE, EINVAL])
         errno_wrap(os.chown, [go, uid, gid], [ENOENT], [ESTALE, EINVAL])
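Example 5 (repeated with minor quoting differences in Examples 6 and 7 below) uses the four-argument form: the first errno list ([ENOENT]) is absorbed, the second ([ESTALE, EINVAL]) is retried. Reusing the errno_wrap sketch from Example 1, applying metadata to a gfid-addressed path might look like this (the mount path and ids are hypothetical):

import os
from errno import ENOENT, ESTALE, EINVAL

path = "/mnt/gluster/.gfid/<gfid>"   # hypothetical aux-gfid mount path
# a vanished file (ENOENT) is ignored; stale-handle errors are retried
errno_wrap(os.chmod, [path, 0o644], [ENOENT], [ESTALE, EINVAL])
errno_wrap(os.chown, [path, 1000, 1000], [ENOENT], [ESTALE, EINVAL])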
Example #6
 def meta_ops(cls, meta_entries):
     logging.debug("Meta-entries: %s" % repr(meta_entries))
     for e in meta_entries:
         mode = e["stat"]["mode"]
         uid = e["stat"]["uid"]
         gid = e["stat"]["gid"]
         go = e["go"]
         errno_wrap(os.chmod, [go, mode], [ENOENT], [ESTALE, EINVAL])
         errno_wrap(os.chown, [go, uid, gid], [ENOENT], [ESTALE, EINVAL])
Example #7
 def meta_ops(cls, meta_entries):
     logging.debug('Meta-entries: %s' % repr(meta_entries))
     for e in meta_entries:
         mode = e['stat']['mode']
         uid = e['stat']['uid']
         gid = e['stat']['gid']
         go = e['go']
         errno_wrap(os.chmod, [go, mode], [ENOENT], [ESTALE, EINVAL])
         errno_wrap(os.chown, [go, uid, gid], [ENOENT], [ESTALE, EINVAL])
Example #8
 def wmon(w):
     cpid, _ = self.monitor(w, argv, cpids, agents, slave_vol,
                            slave_host, master)
     time.sleep(1)
     self.lock.acquire()
     for cpid in cpids:
         errno_wrap(os.kill, [cpid, signal.SIGKILL], [ESRCH])
     for apid in agents:
         errno_wrap(os.kill, [apid, signal.SIGKILL], [ESRCH])
     self.lock.release()
     finalize(exval=1)
Example #9
 def entry_purge(entry, gfid):
     # This is an extremely racy code and needs to be fixed ASAP.
     # The GFID check here is to be sure that the pargfid/bname
     # to be purged is the GFID gotten from the changelog.
     # (a stat(changelog_gfid) would also be valid here)
     # The race here is between the GFID check and the purge.
     disk_gfid = cls.gfid(entry)
     if isinstance(disk_gfid, int):
         return
     if not gfid == disk_gfid:
         return
     er = errno_wrap(os.unlink, [entry], [ENOENT, EISDIR])
     if isinstance(er, int):
         if er == EISDIR:
             er = errno_wrap(os.rmdir, [entry], [ENOENT, ENOTEMPTY])
             if er == ENOTEMPTY:
                 return er
Example #10
 def entry_purge(entry, gfid):
     # This is an extremely racy code and needs to be fixed ASAP.
     # The GFID check here is to be sure that the pargfid/bname
     # to be purged is the GFID gotten from the changelog.
     # (a stat(changelog_gfid) would also be valid here)
     # The race here is between the GFID check and the purge.
     disk_gfid = cls.gfid_mnt(entry)
     if isinstance(disk_gfid, int):
         return
     if not gfid == disk_gfid:
         return
     er = errno_wrap(os.unlink, [entry], [ENOENT, EISDIR])
     if isinstance(er, int):
         if er == EISDIR:
             er = errno_wrap(os.rmdir, [entry], [ENOENT, ENOTEMPTY])
             if er == ENOTEMPTY:
                 return er
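The entry_purge variants above lean on the int-means-errno convention: unlink on a directory fails with EISDIR, which triggers the rmdir fallback, and ENOTEMPTY is handed back to the caller's retry loop (see the RMDIR/UNLINK loop in Example 11 below) to wait for the directory to drain. The same control flow in isolation, again reusing the errno_wrap sketch from Example 1:

import os
from errno import ENOENT, EISDIR, ENOTEMPTY

def purge(path):
    er = errno_wrap(os.unlink, [path], [ENOENT, EISDIR])
    if er == EISDIR:                   # it was a directory: fall back
        er = errno_wrap(os.rmdir, [path], [ENOENT, ENOTEMPTY])
        if er == ENOTEMPTY:
            return er                  # caller retries until it empties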
Example #11
    def entry_ops(cls, entries):
        pfx = gauxpfx()
        logging.debug('entries: %s' % repr(entries))
        # regular file

        def entry_pack_reg(gf, bn, mo, uid, gid):
            blen = len(bn)
            return struct.pack(cls._fmt_mknod(blen),
                               uid, gid, gf, mo, bn,
                               stat.S_IMODE(mo), 0, umask())

        def entry_pack_reg_stat(gf, bn, st):
            blen = len(bn)
            mo = st['mode']
            return struct.pack(cls._fmt_mknod(blen),
                               st['uid'], st['gid'],
                               gf, mo, bn,
                               stat.S_IMODE(mo), 0, umask())
        # mkdir

        def entry_pack_mkdir(gf, bn, mo, uid, gid):
            blen = len(bn)
            return struct.pack(cls._fmt_mkdir(blen),
                               uid, gid, gf, mo, bn,
                               stat.S_IMODE(mo), umask())
        # symlink

        def entry_pack_symlink(gf, bn, lnk, st):
            blen = len(bn)
            llen = len(lnk)
            return struct.pack(cls._fmt_symlink(blen, llen),
                               st['uid'], st['gid'],
                               gf, st['mode'], bn, lnk)

        def entry_purge(entry, gfid):
            # This is an extremely racy code and needs to be fixed ASAP.
            # The GFID check here is to be sure that the pargfid/bname
            # to be purged is the GFID gotten from the changelog.
            # (a stat(changelog_gfid) would also be valid here)
            # The race here is between the GFID check and the purge.
            disk_gfid = cls.gfid_mnt(entry)
            if isinstance(disk_gfid, int):
                return
            if not gfid == disk_gfid:
                return
            er = errno_wrap(os.unlink, [entry], [ENOENT, EISDIR])
            if isinstance(er, int):
                if er == EISDIR:
                    er = errno_wrap(os.rmdir, [entry], [ENOENT, ENOTEMPTY])
                    if er == ENOTEMPTY:
                        return er
        for e in entries:
            blob = None
            op = e['op']
            gfid = e['gfid']
            entry = e['entry']
            (pg, bname) = entry2pb(entry)
            if op in ['RMDIR', 'UNLINK']:
                while True:
                    er = entry_purge(entry, gfid)
                    if isinstance(er, int):
                        time.sleep(1)
                    else:
                        break
            elif op in ['CREATE', 'MKNOD']:
                blob = entry_pack_reg(
                    gfid, bname, e['mode'], e['uid'], e['gid'])
            elif op == 'MKDIR':
                blob = entry_pack_mkdir(
                    gfid, bname, e['mode'], e['uid'], e['gid'])
            elif op == 'LINK':
                slink = os.path.join(pfx, gfid)
                st = lstat(slink)
                if isinstance(st, int):
                    (pg, bname) = entry2pb(entry)
                    blob = entry_pack_reg_stat(gfid, bname, e['stat'])
                else:
                    errno_wrap(os.link, [slink, entry], [ENOENT, EEXIST])
            elif op == 'SYMLINK':
                blob = entry_pack_symlink(gfid, bname, e['link'], e['stat'])
            elif op == 'RENAME':
                en = e['entry1']
                st = lstat(entry)
                if isinstance(st, int):
                    (pg, bname) = entry2pb(en)
                    blob = entry_pack_reg_stat(gfid, bname, e['stat'])
                else:
                    errno_wrap(os.rename, [entry, en], [ENOENT, EEXIST])
            if blob:
                errno_wrap(Xattr.lsetxattr_l, [pg, 'glusterfs.gfid.newfile',
                                               blob],
                           [EEXIST], [ENOENT, ESTALE, EINVAL])
Example #12
 def gfid_mnt(cls, gfidpath):
     return errno_wrap(Xattr.lgetxattr,
                       [gfidpath, 'glusterfs.gfid.string',
                        cls.GX_GFID_CANONICAL_LEN], [ENOENT])
Example #13
    def entry_ops(cls, entries):
        pfx = gauxpfx()
        logging.debug('entries: %s' % repr(entries))

        # regular file
        def entry_pack_reg(gf, bn, st):
            blen = len(bn)
            mo = st['mode']
            return struct.pack(cls._fmt_mknod(blen), st['uid'], st['gid'], gf,
                               mo, bn, stat.S_IMODE(mo), 0, umask())

        # mkdir
        def entry_pack_mkdir(gf, bn, st):
            blen = len(bn)
            mo = st['mode']
            return struct.pack(cls._fmt_mkdir(blen), st['uid'], st['gid'], gf,
                               mo, bn, stat.S_IMODE(mo), umask())

        # symlink
        def entry_pack_symlink(gf, bn, lnk, st):
            blen = len(bn)
            llen = len(lnk)
            return struct.pack(cls._fmt_symlink(blen, llen), st['uid'],
                               st['gid'], gf, st['mode'], bn, lnk)

        def entry_purge(entry, gfid):
            # This is an extremely racy code and needs to be fixed ASAP.
            # The GFID check here is to be sure that the pargfid/bname
            # to be purged is the GFID gotten from the changelog.
            # (a stat(changelog_gfid) would also be valid here)
            # The race here is between the GFID check and the purge.
            disk_gfid = cls.gfid(entry)
            if isinstance(disk_gfid, int):
                return
            if not gfid == disk_gfid:
                return
            er = errno_wrap(os.unlink, [entry], [ENOENT, EISDIR])
            if isinstance(er, int):
                if er == EISDIR:
                    er = errno_wrap(os.rmdir, [entry], [ENOENT, ENOTEMPTY])
                    if er == ENOTEMPTY:
                        return er

        for e in entries:
            blob = None
            op = e['op']
            gfid = e['gfid']
            entry = e['entry']
            (pg, bname) = entry2pb(entry)
            if op in ['RMDIR', 'UNLINK']:
                while True:
                    er = entry_purge(entry, gfid)
                    if isinstance(er, int):
                        time.sleep(1)
                    else:
                        break
            elif op == 'CREATE':
                blob = entry_pack_reg(gfid, bname, e['stat'])
            elif op == 'MKDIR':
                blob = entry_pack_mkdir(gfid, bname, e['stat'])
            elif op == 'LINK':
                errno_wrap(os.link, [os.path.join(pfx, gfid), entry],
                           [ENOENT, EEXIST])
            elif op == 'SYMLINK':
                blob = entry_pack_symlink(gfid, bname, e['link'], e['stat'])
            elif op == 'RENAME':
                en = e['entry1']
                errno_wrap(os.rename, [entry, en], [ENOENT, EEXIST])
            if blob:
                errno_wrap(Xattr.lsetxattr_l,
                           [pg, 'glusterfs.gfid.newfile', blob],
                           [ENOENT, EEXIST])
Example #14
    def crawl(self, path='.', xtr=None, done=0):
        """ generate a CHANGELOG file consumable by process_change """
        if path == '.':
            self.open()
            self.crawls += 1
        if not xtr:
            # get the root stime and use it for all comparisons
            xtr = self.xtime('.', self.slave)
            if isinstance(xtr, int):
                if xtr != ENOENT:
                    raise GsyncdError('slave is corrupt')
                xtr = self.minus_infinity
        xtl = self.xtime(path)
        if isinstance(xtl, int):
            raise GsyncdError('master is corrupt')
        if xtr == xtl:
            if path == '.':
                self.close()
            return
        self.xtime_reversion_hook(path, xtl, xtr)
        logging.debug("entering " + path)
        dem = self.master.server.entries(path)
        pargfid = self.master.server.gfid(path)
        if isinstance(pargfid, int):
            logging.warn('skipping directory %s' % (path))
            return
        for e in dem:
            bname = e
            e = os.path.join(path, e)
            st = lstat(e)
            if isinstance(st, int):
                logging.warn('%s got purged in the interim..' % e)
                continue
            gfid = self.master.server.gfid(e)
            if isinstance(gfid, int):
                logging.warn('skipping entry %s..' % (e))
                continue
            xte = self.xtime(e)
            if isinstance(xte, int):
                raise GsyncdError('master is corrupt')
            if not self.need_sync(e, xte, xtr):
                continue
            mo = st.st_mode
            if stat.S_ISDIR(mo):
                self.write_entry_change("E", [gfid, 'MKDIR', escape(os.path.join(pargfid, bname))])
                self.crawl(e, xtr)
            elif stat.S_ISLNK(mo):
                rl = errno_wrap(os.readlink, [e], [ENOENT])
                if isinstance(rl, int):
                    continue
                self.write_entry_change("E", [gfid, 'SYMLINK', escape(os.path.join(pargfid, bname)), rl])
            else:
                # if a file has a hardlink, create a changelog entry as
                # 'LINK' so the slave side can decide whether to create a
                # new entry or a hard link.
                if st.st_nlink == 1:
                    self.write_entry_change("E", [gfid, 'MKNOD', escape(os.path.join(pargfid, bname))])
                else:
                    self.write_entry_change("E", [gfid, 'LINK', escape(os.path.join(pargfid, bname))])
                if stat.S_ISREG(mo):
                    self.write_entry_change("D", [gfid])

        if path == '.':
            logging.info('processing xsync changelog %s' % self.fname())
            self.close()
            self.process([self.fname()], done)
            self.upd_stime(xtl)
Example #15
 def terminate():
     # relax one SIGTERM by setting a handler that sets back
     # standard handler
     set_term_handler(lambda *a: set_term_handler())
     # give a chance to graceful exit
     errno_wrap(os.kill, [-os.getpid(), signal.SIGTERM], [ESRCH])
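terminate (Example 15, repeated in Example 18 below) SIGTERMs its own process group, but first installs a one-shot handler so the monitor itself survives that single signal and only a subsequent SIGTERM takes the default action. The trick in isolation, with a hypothetical stand-in for syncdutils' set_term_handler:

import os
import signal

def set_term_handler(hook=signal.SIG_DFL):
    # hypothetical stand-in: install hook as the SIGTERM disposition
    signal.signal(signal.SIGTERM, hook)

# the first SIGTERM merely restores the default handler ("relax one
# SIGTERM"); the next one actually terminates the process
set_term_handler(lambda signum, frame: set_term_handler())
os.kill(-os.getpgid(0), signal.SIGTERM)   # group-wide graceful shutdown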
Example #16
def subcmd_delete(args):
    import logging
    import shutil
    import glob
    import sys
    from errno import ENOENT, ENODATA
    import struct

    from syncdutils import GsyncdError, Xattr, errno_wrap
    import gsyncdconfig as gconf

    logging.info('geo-replication delete')
    # remove the stime xattr from all the brick paths so that
    # re-creating the session will start the sync all over again
    stime_xattr_prefix = gconf.get('stime-xattr-prefix', None)

    # Delete pid file, status file, socket file
    cleanup_paths = []
    cleanup_paths.append(gconf.get("pid-file"))

    # Cleanup Session dir
    try:
        shutil.rmtree(gconf.get("georep-session-working-dir"))
    except (IOError, OSError):
        if sys.exc_info()[1].errno == ENOENT:
            pass
        else:
            raise GsyncdError(
                'Error while removing working dir: %s' %
                gconf.get("georep-session-working-dir"))

    # Cleanup changelog working dirs
    try:
        shutil.rmtree(gconf.get("working-dir"))
    except (IOError, OSError):
        if sys.exc_info()[1].errno == ENOENT:
            pass
        else:
            raise GsyncdError(
                'Error while removing working dir: %s' %
                gconf.get("working-dir"))

    for path in cleanup_paths:
        # delete the file and any temp files sharing its prefix
        for f in glob.glob(path + "*"):
            _unlink(f)

    if args.reset_sync_time and stime_xattr_prefix:
        for p in args.paths:
            if p != "":
                # set stime to (0,0) to trigger full volume content resync
                # to slave on session recreation
                # look at master.py::Xcrawl   hint: zero_zero
                errno_wrap(Xattr.lsetxattr,
                           (p, stime_xattr_prefix + ".stime",
                            struct.pack("!II", 0, 0)),
                           [ENOENT, ENODATA])
                errno_wrap(Xattr.lremovexattr,
                           (p, stime_xattr_prefix + ".entry_stime"),
                           [ENOENT, ENODATA])

    return
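The (0, 0) stime written in Examples 1 and 16 is two network-order unsigned 32-bit integers (seconds, nanoseconds), which is the zero_zero value the Xcrawl comparison in master.py treats as "sync everything". Packing and unpacking are symmetric:

import struct

zero_zero = struct.pack("!II", 0, 0)        # 8 bytes: sec=0, nsec=0
sec, nsec = struct.unpack("!II", zero_zero)
assert (sec, nsec) == (0, 0)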
Example #17
    def monitor(self, w, argv, cpids, agents, slave_vol, slave_host, master):
        """the monitor loop

        The basic logic is a blatantly simple, blunt heuristic:
        if the spawned client survives 60 secs, it's considered OK.
        This serves us pretty well, as it's not vulnerable to
        any kind of irregular behavior of the child...

        ... well, except for one: if a child hangs waiting for
        some event, it can survive aeons and still be defunct.
        So we tweak the above logic to expect the worker to send
        us a signal within 60 secs (in the form of closing its
        end of a pipe). The worker does this when it's done with
        the setup stage and is ready to enter the service loop
        (note it's the setup stage which is vulnerable to hangs --
        the full-blown worker blows up on EPIPE if the net goes
        down, due to the keep-alive thread)
        """
        if not self.status.get(w[0], None):
            self.status[w[0]] = GeorepStatus(gconf.state_file, w[0])

        set_monitor_status(gconf.state_file, self.ST_STARTED)
        self.status[w[0]].set_worker_status(self.ST_INIT)

        ret = 0

        def nwait(p, o=0):
            try:
                p2, r = waitpid(p, o)
                if not p2:
                    return
                return r
            except OSError as e:
                # no child process, this happens if the child process
                # already died and has been cleaned up
                if e.errno == ECHILD:
                    return -1
                else:
                    raise

        def exit_signalled(s):
            """ child terminated due to receipt of SIGUSR1 """
            return (os.WIFSIGNALED(s) and (os.WTERMSIG(s) == signal.SIGUSR1))

        def exit_status(s):
            if os.WIFEXITED(s):
                return os.WEXITSTATUS(s)
            return 1

        conn_timeout = int(gconf.connection_timeout)
        while ret in (0, 1):
            remote_host = w[1]
            # Check the status of the connected slave node
            # If the connected slave node is down, try to connect to a
            # different node that is up.
            m = re.match(r"(ssh|gluster|file)://(.+)@([^:]+):(.+)",
                         remote_host)
            if m:
                current_slave_host = m.group(3)
                slave_up_hosts = get_slave_bricks_status(
                    slave_host, slave_vol)

                if current_slave_host not in slave_up_hosts:
                    if len(slave_up_hosts) > 0:
                        remote_host = "%s://%s@%s:%s" % (m.group(1),
                                                         m.group(2),
                                                         random.choice(
                                                             slave_up_hosts),
                                                         m.group(4))

            # Spawn the worker and agent in lock to avoid fd leak
            self.lock.acquire()

            logging.info('-' * conn_timeout)
            logging.info('starting gsyncd worker')

            # Couple of pipe pairs for RPC communication b/w
            # worker and changelog agent.

            # read/write end for agent
            (ra, ww) = os.pipe()
            # read/write end for worker
            (rw, wa) = os.pipe()

            # spawn the agent process
            apid = os.fork()
            if apid == 0:
                os.close(rw)
                os.close(ww)
                os.execv(sys.executable, argv + ['--local-path', w[0],
                                                 '--agent',
                                                 '--rpc-fd',
                                                 ','.join([str(ra), str(wa),
                                                           str(rw), str(ww)])])
            pr, pw = os.pipe()
            cpid = os.fork()
            if cpid == 0:
                os.close(pr)
                os.close(ra)
                os.close(wa)
                os.execv(sys.executable, argv + ['--feedback-fd', str(pw),
                                                 '--local-path', w[0],
                                                 '--local-id',
                                                 '.' + escape(w[0]),
                                                 '--rpc-fd',
                                                 ','.join([str(rw), str(ww),
                                                           str(ra), str(wa)]),
                                                 '--subvol-num', str(w[2])] +
                         (['--is-hottier'] if w[3] else []) +
                         ['--resource-remote', remote_host])

            cpids.add(cpid)
            agents.add(apid)
            os.close(pw)

            # close all RPC pipes in monitor
            os.close(ra)
            os.close(wa)
            os.close(rw)
            os.close(ww)
            self.lock.release()

            t0 = time.time()
            so = select((pr,), (), (), conn_timeout)[0]
            os.close(pr)

            if so:
                ret = nwait(cpid, os.WNOHANG)
                ret_agent = nwait(apid, os.WNOHANG)

                if ret_agent is not None:
                    # Agent died; kill the worker
                    logging.info("Changelog Agent died, "
                                 "Aborting Worker(%s)" % w[0])
                    errno_wrap(os.kill, [cpid, signal.SIGKILL], [ESRCH])
                    nwait(cpid)
                    nwait(apid)

                if ret is not None:
                    logging.info("worker(%s) died before establishing "
                                 "connection" % w[0])
                    nwait(apid)  # wait for agent
                else:
                    logging.debug("worker(%s) connected" % w[0])
                    while time.time() < t0 + conn_timeout:
                        ret = nwait(cpid, os.WNOHANG)
                        ret_agent = nwait(apid, os.WNOHANG)

                        if ret is not None:
                            logging.info("worker(%s) died in startup "
                                         "phase" % w[0])
                            nwait(apid)  # wait for agent
                            break

                        if ret_agent is not None:
                            # Agent died; kill the worker
                            logging.info("Changelog Agent died, Aborting "
                                         "Worker(%s)" % w[0])
                            errno_wrap(os.kill, [cpid, signal.SIGKILL], [ESRCH])
                            nwait(cpid)
                            nwait(apid)
                            break

                        time.sleep(1)
            else:
                logging.info("worker(%s) not confirmed in %d sec, "
                             "aborting it" % (w[0], conn_timeout))
                errno_wrap(os.kill, [cpid, signal.SIGKILL], [ESRCH])
                nwait(apid)  # wait for agent
                ret = nwait(cpid)
            if ret is None:
                self.status[w[0]].set_worker_status(self.ST_STABLE)
                # If worker dies, agent terminates on EOF.
                # So let's wait for the agent first.
                nwait(apid)
                ret = nwait(cpid)
            if exit_signalled(ret):
                ret = 0
            else:
                ret = exit_status(ret)
                if ret in (0, 1):
                    self.status[w[0]].set_worker_status(self.ST_FAULTY)
            time.sleep(10)
        self.status[w[0]].set_worker_status(self.ST_INCON)
        return ret
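The monitor's liveness check (also in Examples 19, 23 and 25 below) is a pipe handshake: the worker inherits the write end via --feedback-fd and the monitor select()s on the read end, so the worker "reports in" simply by writing to or closing its end once setup is done. Stripped to its bones, with a trivial child standing in for the worker:

import os
import time
from select import select

pr, pw = os.pipe()
pid = os.fork()
if pid == 0:                  # child: pretend setup takes a moment
    os.close(pr)
    time.sleep(1)
    os.close(pw)              # EOF on pw makes pr readable in the parent
    os._exit(0)
os.close(pw)                  # parent must drop its copy or select hangs
ready = select((pr,), (), (), 60)[0]
os.close(pr)
print("worker confirmed" if ready else "worker hung in setup")
os.waitpid(pid, 0)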
Example #18
 def terminate():
     # relax one SIGTERM by setting a handler that sets back
     # standard handler
     set_term_handler(lambda *a: set_term_handler())
     # give a chance to graceful exit
     errno_wrap(os.kill, [-os.getpid(), signal.SIGTERM], [ESRCH])
Example #19
    def monitor(self, w, argv, cpids, agents, slave_vol, slave_host, master):
        """the monitor loop

        The basic logic is a blatantly simple, blunt heuristic:
        if the spawned client survives 60 secs, it's considered OK.
        This serves us pretty well, as it's not vulnerable to
        any kind of irregular behavior of the child...

        ... well, except for one: if a child hangs waiting for
        some event, it can survive aeons and still be defunct.
        So we tweak the above logic to expect the worker to send
        us a signal within 60 secs (in the form of closing its
        end of a pipe). The worker does this when it's done with
        the setup stage and is ready to enter the service loop
        (note it's the setup stage which is vulnerable to hangs --
        the full-blown worker blows up on EPIPE if the net goes
        down, due to the keep-alive thread)
        """
        if not self.status.get(w[0]['dir'], None):
            self.status[w[0]['dir']] = GeorepStatus(
                gconf.state_file, w[0]['dir'], master,
                "%s::%s" % (slave_host, slave_vol))

        set_monitor_status(gconf.state_file, self.ST_STARTED)
        self.status[w[0]['dir']].set_worker_status(self.ST_INIT)

        ret = 0

        def nwait(p, o=0):
            try:
                p2, r = waitpid(p, o)
                if not p2:
                    return
                return r
            except OSError as e:
                # no child process, this happens if the child process
                # already died and has been cleaned up
                if e.errno == ECHILD:
                    return -1
                else:
                    raise

        def exit_signalled(s):
            """ child terminated due to receipt of SIGUSR1 """
            return (os.WIFSIGNALED(s) and (os.WTERMSIG(s) == signal.SIGUSR1))

        def exit_status(s):
            if os.WIFEXITED(s):
                return os.WEXITSTATUS(s)
            return 1

        conn_timeout = int(gconf.connection_timeout)
        while ret in (0, 1):
            remote_host = w[1]
            # Check the status of the connected slave node
            # If the connected slave node is down, try to connect to a
            # different node that is up.
            m = re.match(r"(ssh|gluster|file)://(.+)@([^:]+):(.+)",
                         remote_host)
            if m:
                current_slave_host = m.group(3)
                slave_up_hosts = get_slave_bricks_status(slave_host, slave_vol)

                if current_slave_host not in slave_up_hosts:
                    if len(slave_up_hosts) > 0:
                        remote_host = "%s://%s@%s:%s" % (
                            m.group(1), m.group(2),
                            random.choice(slave_up_hosts), m.group(4))

            # Spawn the worker and agent in lock to avoid fd leak
            self.lock.acquire()

            logging.info('starting gsyncd worker(%s). Slave node: %s' %
                         (w[0]['dir'], remote_host))

            # Couple of pipe pairs for RPC communication b/w
            # worker and changelog agent.

            # read/write end for agent
            (ra, ww) = os.pipe()
            # read/write end for worker
            (rw, wa) = os.pipe()

            # spawn the agent process
            apid = os.fork()
            if apid == 0:
                os.close(rw)
                os.close(ww)
                os.execv(
                    sys.executable, argv + [
                        '--local-path', w[0]['dir'], '--agent', '--rpc-fd',
                        ','.join([str(ra), str(wa),
                                  str(rw), str(ww)])
                    ])
            pr, pw = os.pipe()
            cpid = os.fork()
            if cpid == 0:
                os.close(pr)
                os.close(ra)
                os.close(wa)
                os.execv(
                    sys.executable, argv + [
                        '--feedback-fd',
                        str(pw), '--local-path', w[0]['dir'], '--local-id',
                        '.' + escape(w[0]['dir']), '--rpc-fd', ','.join(
                            [str(rw), str(ww),
                             str(ra), str(wa)]), '--subvol-num',
                        str(w[2])
                    ] + (['--is-hottier'] if w[3] else []) +
                    ['--resource-remote', remote_host])

            cpids.add(cpid)
            agents.add(apid)
            os.close(pw)

            # close all RPC pipes in monitor
            os.close(ra)
            os.close(wa)
            os.close(rw)
            os.close(ww)
            self.lock.release()

            t0 = time.time()
            so = select((pr, ), (), (), conn_timeout)[0]
            os.close(pr)

            if so:
                ret = nwait(cpid, os.WNOHANG)
                ret_agent = nwait(apid, os.WNOHANG)

                if ret_agent is not None:
                    # Agent died; kill the worker
                    logging.info("Changelog Agent died, "
                                 "Aborting Worker(%s)" % w[0]['dir'])
                    errno_wrap(os.kill, [cpid, signal.SIGKILL], [ESRCH])
                    nwait(cpid)
                    nwait(apid)

                if ret is not None:
                    logging.info("worker(%s) died before establishing "
                                 "connection" % w[0]['dir'])
                    nwait(apid)  # wait for agent
                else:
                    logging.debug("worker(%s) connected" % w[0]['dir'])
                    while time.time() < t0 + conn_timeout:
                        ret = nwait(cpid, os.WNOHANG)
                        ret_agent = nwait(apid, os.WNOHANG)

                        if ret is not None:
                            logging.info("worker(%s) died in startup "
                                         "phase" % w[0]['dir'])
                            nwait(apid)  # wait for agent
                            break

                        if ret_agent is not None:
                            # Agent died; kill the worker
                            logging.info("Changelog Agent died, Aborting "
                                         "Worker(%s)" % w[0]['dir'])
                            errno_wrap(os.kill, [cpid, signal.SIGKILL],
                                       [ESRCH])
                            nwait(cpid)
                            nwait(apid)
                            break

                        time.sleep(1)
            else:
                logging.info("worker(%s) not confirmed in %d sec, "
                             "aborting it" % (w[0]['dir'], conn_timeout))
                errno_wrap(os.kill, [cpid, signal.SIGKILL], [ESRCH])
                nwait(apid)  # wait for agent
                ret = nwait(cpid)
            if ret is None:
                self.status[w[0]['dir']].set_worker_status(self.ST_STABLE)
                # If worker dies, agent terminates on EOF.
                # So let's wait for the agent first.
                nwait(apid)
                ret = nwait(cpid)
            if exit_signalled(ret):
                ret = 0
            else:
                ret = exit_status(ret)
                if ret in (0, 1):
                    self.status[w[0]['dir']].set_worker_status(self.ST_FAULTY)
                    gf_event(EVENT_GEOREP_FAULTY,
                             master_volume=master.volume,
                             master_node=w[0]['host'],
                             slave_host=slave_host,
                             slave_volume=slave_vol,
                             current_slave_host=current_slave_host,
                             brick_path=w[0]['dir'])
            time.sleep(10)
        self.status[w[0]['dir']].set_worker_status(self.ST_INCON)
        return ret
Example #20
    def entry_ops(cls, entries):
        pfx = gauxpfx()
        logging.debug("entries: %s" % repr(entries))
        # regular file

        def entry_pack_reg(gf, bn, mo, uid, gid):
            blen = len(bn)
            return struct.pack(cls._fmt_mknod(blen), uid, gid, gf, mo, bn, stat.S_IMODE(mo), 0, umask())

        def entry_pack_reg_stat(gf, bn, st):
            blen = len(bn)
            mo = st["mode"]
            return struct.pack(cls._fmt_mknod(blen), st["uid"], st["gid"], gf, mo, bn, stat.S_IMODE(mo), 0, umask())

        # mkdir

        def entry_pack_mkdir(gf, bn, mo, uid, gid):
            blen = len(bn)
            return struct.pack(cls._fmt_mkdir(blen), uid, gid, gf, mo, bn, stat.S_IMODE(mo), umask())

        # symlink

        def entry_pack_symlink(gf, bn, lnk, st):
            blen = len(bn)
            llen = len(lnk)
            return struct.pack(cls._fmt_symlink(blen, llen), st["uid"], st["gid"], gf, st["mode"], bn, lnk)

        def entry_purge(entry, gfid):
            # This is an extremely racy code and needs to be fixed ASAP.
            # The GFID check here is to be sure that the pargfid/bname
            # to be purged is the GFID gotten from the changelog.
            # (a stat(changelog_gfid) would also be valid here)
            # The race here is between the GFID check and the purge.
            disk_gfid = cls.gfid_mnt(entry)
            if isinstance(disk_gfid, int):
                return
            if not gfid == disk_gfid:
                return
            er = errno_wrap(os.unlink, [entry], [ENOENT, EISDIR])
            if isinstance(er, int):
                if er == EISDIR:
                    er = errno_wrap(os.rmdir, [entry], [ENOENT, ENOTEMPTY])
                    if er == ENOTEMPTY:
                        return er

        for e in entries:
            blob = None
            op = e["op"]
            gfid = e["gfid"]
            entry = e["entry"]
            (pg, bname) = entry2pb(entry)
            if op in ["RMDIR", "UNLINK"]:
                while True:
                    er = entry_purge(entry, gfid)
                    if isinstance(er, int):
                        if er == ENOTEMPTY and op == "RMDIR":
                            er1 = errno_wrap(shutil.rmtree, [os.path.join(pg, bname)], [ENOENT])
                            if not isinstance(er1, int):
                                logging.info("Removed %s/%s recursively" % (pg, bname))
                                break

                        logging.warn("Failed to remove %s => %s/%s. %s" % (gfid, pg, bname, os.strerror(er)))
                        time.sleep(1)
                    else:
                        break
            elif op in ["CREATE", "MKNOD"]:
                blob = entry_pack_reg(gfid, bname, e["mode"], e["uid"], e["gid"])
            elif op == "MKDIR":
                blob = entry_pack_mkdir(gfid, bname, e["mode"], e["uid"], e["gid"])
            elif op == "LINK":
                slink = os.path.join(pfx, gfid)
                st = lstat(slink)
                if isinstance(st, int):
                    (pg, bname) = entry2pb(entry)
                    blob = entry_pack_reg_stat(gfid, bname, e["stat"])
                else:
                    errno_wrap(os.link, [slink, entry], [ENOENT, EEXIST])
            elif op == "SYMLINK":
                blob = entry_pack_symlink(gfid, bname, e["link"], e["stat"])
            elif op == "RENAME":
                en = e["entry1"]
                st = lstat(entry)
                if isinstance(st, int):
                    if e["stat"] and not stat.S_ISDIR(e["stat"]["mode"]):
                        (pg, bname) = entry2pb(en)
                        blob = entry_pack_reg_stat(gfid, bname, e["stat"])
                else:
                    errno_wrap(os.rename, [entry, en], [ENOENT, EEXIST])
            if blob:
                errno_wrap(Xattr.lsetxattr, [pg, "glusterfs.gfid.newfile", blob], [EEXIST], [ENOENT, ESTALE, EINVAL])
Example #21
    def process_change(self, change, done, retry):
        pfx = gauxpfx()
        clist = []
        entries = []
        datas = set()

        # basic crawl stats: files and bytes
        files_pending = {'count': 0, 'purge': 0, 'bytes': 0, 'files': []}
        with open(change, "r") as f:
            clist = f.readlines()

        def edct(op, **ed):
            dct = {}
            dct['op'] = op
            for k in ed:
                if k == 'stat':
                    st = ed[k]
                    dst = dct['stat'] = {}
                    dst['uid'] = st.st_uid
                    dst['gid'] = st.st_gid
                    dst['mode'] = st.st_mode
                else:
                    dct[k] = ed[k]
            return dct

        # regular file update: bytes & count
        def _update_reg(entry, size):
            if entry not in files_pending['files']:
                files_pending['count'] += 1
                files_pending['bytes'] += size
                files_pending['files'].append(entry)

        # updates for directories, symlinks etc..
        def _update_rest():
            files_pending['count'] += 1

        # entry count
        def entry_update(entry, size, mode):
            if stat.S_ISREG(mode):
                _update_reg(entry, size)
            else:
                _update_rest()

        # purge count
        def purge_update():
            files_pending['purge'] += 1

        for e in clist:
            e = e.strip()
            et = e[self.IDX_START:self.IDX_END]
            ec = e[self.IDX_END:].split(' ')
            if et in self.TYPE_ENTRY:
                ty = ec[self.POS_TYPE]
                en = unescape(os.path.join(pfx, ec[self.POS_ENTRY1]))
                gfid = ec[self.POS_GFID]
                # definitely need a better way to bucketize entry ops
                if ty in ['UNLINK', 'RMDIR']:
                    purge_update()
                    entries.append(edct(ty, gfid=gfid, entry=en))
                    continue
                go = os.path.join(pfx, gfid)
                st = lstat(go)
                if isinstance(st, int):
                    if ty == 'RENAME':
                        entries.append(edct('UNLINK', gfid=gfid, entry=en))
                    else:
                        logging.debug('file %s got purged in the interim' % go)
                    continue
                entry_update(go, st.st_size, st.st_mode)
                if ty in ['CREATE', 'MKDIR', 'MKNOD']:
                    entries.append(edct(ty, stat=st, entry=en, gfid=gfid))
                elif ty == 'LINK':
                    entries.append(edct(ty, stat=st, entry=en, gfid=gfid))
                elif ty == 'SYMLINK':
                    rl = errno_wrap(os.readlink, [en], [ENOENT])
                    if isinstance(rl, int):
                        continue
                    entries.append(
                        edct(ty, stat=st, entry=en, gfid=gfid, link=rl))
                elif ty == 'RENAME':
                    e2 = unescape(os.path.join(pfx, ec[self.POS_ENTRY2]))
                    entries.append(
                        edct(ty, gfid=gfid, entry=en, entry1=e2, stat=st))
                else:
                    logging.warn('ignoring %s [op %s]' % (gfid, ty))
            elif et in self.TYPE_GFID:
                go = os.path.join(pfx, ec[0])
                st = lstat(go)
                if isinstance(st, int):
                    logging.debug('file %s got purged in the interim' % go)
                    continue
                entry_update(go, st.st_size, st.st_mode)
                datas.update([go])
        logging.debug('entries: %s' % repr(entries))
        if not retry:
            self.update_cumulative_stats(files_pending)
        # sync namespace
        if entries:
            self.slave.server.entry_ops(entries)
        # sync data
        if self.syncdata(datas):
            if done:
                self.master.server.changelog_done(change)
            return True
Example #22
    def entry_ops(cls, entries):
        pfx = gauxpfx()
        logging.debug("entries: %s" % repr(entries))
        # regular file
        def entry_pack_reg(gf, bn, st):
            blen = len(bn)
            mo = st["mode"]
            return struct.pack(cls._fmt_mknod(blen), st["uid"], st["gid"], gf, mo, bn, stat.S_IMODE(mo), 0, umask())

        # mkdir
        def entry_pack_mkdir(gf, bn, st):
            blen = len(bn)
            mo = st["mode"]
            return struct.pack(cls._fmt_mkdir(blen), st["uid"], st["gid"], gf, mo, bn, stat.S_IMODE(mo), umask())

        # symlink
        def entry_pack_symlink(gf, bn, lnk, st):
            blen = len(bn)
            llen = len(lnk)
            return struct.pack(cls._fmt_symlink(blen, llen), st["uid"], st["gid"], gf, st["mode"], bn, lnk)

        def entry_purge(entry, gfid):
            # This is an extremely racy code and needs to be fixed ASAP.
            # The GFID check here is to be sure that the pargfid/bname
            # to be purged is the GFID gotten from the changelog.
            # (a stat(changelog_gfid) would also be valid here)
            # The race here is between the GFID check and the purge.
            disk_gfid = cls.gfid(entry)
            if isinstance(disk_gfid, int):
                return
            if not gfid == disk_gfid:
                return
            er = errno_wrap(os.unlink, [entry], [ENOENT, EISDIR])
            if isinstance(er, int):
                if er == EISDIR:
                    er = errno_wrap(os.rmdir, [entry], [ENOENT, ENOTEMPTY])
                    if er == ENOTEMPTY:
                        return er

        for e in entries:
            blob = None
            op = e["op"]
            gfid = e["gfid"]
            entry = e["entry"]
            (pg, bname) = entry2pb(entry)
            if op in ["RMDIR", "UNLINK"]:
                while True:
                    er = entry_purge(entry, gfid)
                    if isinstance(er, int):
                        time.sleep(1)
                    else:
                        break
            elif op == "CREATE":
                blob = entry_pack_reg(gfid, bname, e["stat"])
            elif op == "MKDIR":
                blob = entry_pack_mkdir(gfid, bname, e["stat"])
            elif op == "LINK":
                errno_wrap(os.link, [os.path.join(pfx, gfid), entry], [ENOENT, EEXIST])
            elif op == "SYMLINK":
                blob = entry_pack_symlink(gfid, bname, e["link"], e["stat"])
            elif op == "RENAME":
                en = e["entry1"]
                errno_wrap(os.rename, [entry, en], [ENOENT, EEXIST])
            if blob:
                errno_wrap(Xattr.lsetxattr_l, [pg, "glusterfs.gfid.newfile", blob], [ENOENT, EEXIST])
Example #23
    def monitor(self, w, argv, cpids, agents, slave_vol, slave_host, master,
                suuid, slavenodes):
        """the monitor loop

        The basic logic is a blatantly simple, blunt heuristic:
        if the spawned client survives 60 secs, it's considered OK.
        This serves us pretty well, as it's not vulnerable to
        any kind of irregular behavior of the child...

        ... well, except for one: if a child hangs waiting for
        some event, it can survive aeons and still be defunct.
        So we tweak the above logic to expect the worker to send
        us a signal within 60 secs (in the form of closing its
        end of a pipe). The worker does this when it's done with
        the setup stage and is ready to enter the service loop
        (note it's the setup stage which is vulnerable to hangs --
        the full-blown worker blows up on EPIPE if the net goes
        down, due to the keep-alive thread)
        """
        if not self.status.get(w[0]['dir'], None):
            self.status[w[0]['dir']] = GeorepStatus(gconf.get("state-file"),
                                                    w[0]['host'],
                                                    w[0]['dir'],
                                                    w[0]['uuid'],
                                                    master,
                                                    "%s::%s" % (slave_host,
                                                                slave_vol))
        ret = 0

        def nwait(p, o=0):
            try:
                p2, r = waitpid(p, o)
                if not p2:
                    return
                return r
            except OSError as e:
                # no child process, this happens if the child process
                # already died and has been cleaned up
                if e.errno == ECHILD:
                    return -1
                else:
                    raise

        def exit_signalled(s):
            """ child terminated due to receipt of SIGUSR1 """
            return (os.WIFSIGNALED(s) and (os.WTERMSIG(s) == signal.SIGUSR1))

        def exit_status(s):
            if os.WIFEXITED(s):
                return os.WEXITSTATUS(s)
            return 1

        conn_timeout = gconf.get("connection-timeout")
        while ret in (0, 1):
            remote_user, remote_host = w[1][0].split("@")
            remote_id = w[1][1]
            # Check the status of the connected slave node
            # If the connected slave node is down, try to connect to a
            # different node that is up.
            current_slave_host = remote_host
            slave_up_hosts = get_up_nodes(slavenodes, gconf.get("ssh-port"))

            if (current_slave_host, remote_id) not in slave_up_hosts:
                if len(slave_up_hosts) > 0:
                    remote_new = random.choice(slave_up_hosts)
                    remote_host = "%s@%s" % (remote_user, remote_new[0])
                    remote_id = remote_new[1]

            # Spawn the worker and agent in lock to avoid fd leak
            self.lock.acquire()

            self.status[w[0]['dir']].set_worker_status(self.ST_INIT)
            logging.info(lf('starting gsyncd worker',
                            brick=w[0]['dir'],
                            slave_node=remote_host))

            # Couple of pipe pairs for RPC communication b/w
            # worker and changelog agent.

            # read/write end for agent
            (ra, ww) = pipe()
            # read/write end for worker
            (rw, wa) = pipe()

            # spawn the agent process
            apid = os.fork()
            if apid == 0:
                os.close(rw)
                os.close(ww)
                args_to_agent = argv + [
                    'agent',
                    rconf.args.master,
                    rconf.args.slave,
                    '--local-path', w[0]['dir'],
                    '--local-node', w[0]['host'],
                    '--local-node-id', w[0]['uuid'],
                    '--slave-id', suuid,
                    '--rpc-fd', ','.join([str(ra), str(wa), str(rw), str(ww)])
                ]

                if rconf.args.config_file is not None:
                    args_to_agent += ['-c', rconf.args.config_file]

                if rconf.args.debug:
                    args_to_agent.append("--debug")

                os.execv(sys.executable, args_to_agent)

            pr, pw = pipe()
            cpid = os.fork()
            if cpid == 0:
                os.close(pr)
                os.close(ra)
                os.close(wa)

                args_to_worker = argv + [
                    'worker',
                    rconf.args.master,
                    rconf.args.slave,
                    '--feedback-fd', str(pw),
                    '--local-path', w[0]['dir'],
                    '--local-node', w[0]['host'],
                    '--local-node-id', w[0]['uuid'],
                    '--slave-id', suuid,
                    '--rpc-fd',
                    ','.join([str(rw), str(ww), str(ra), str(wa)]),
                    '--subvol-num', str(w[2]),
                    '--resource-remote', remote_host,
                    '--resource-remote-id', remote_id
                ]

                if rconf.args.config_file is not None:
                    args_to_worker += ['-c', rconf.args.config_file]

                if w[3]:
                    args_to_worker.append("--is-hottier")

                if rconf.args.debug:
                    args_to_worker.append("--debug")

                access_mount = gconf.get("access-mount")
                if access_mount:
                    os.execv(sys.executable, args_to_worker)
                else:
                    if unshare_propagation_supported():
                        logging.debug("Worker would mount volume privately")
                        unshare_cmd = ['unshare', '-m', '--propagation',
                                       'private']
                        cmd = unshare_cmd + args_to_worker
                        os.execvp("unshare", cmd)
                    else:
                        logging.debug("Mount is not private. It would be lazy"
                                      " umounted")
                        os.execv(sys.executable, args_to_worker)

            cpids.add(cpid)
            agents.add(apid)
            os.close(pw)

            # close all RPC pipes in monitor
            os.close(ra)
            os.close(wa)
            os.close(rw)
            os.close(ww)
            self.lock.release()

            t0 = time.time()
            so = select((pr,), (), (), conn_timeout)[0]
            os.close(pr)

            if so:
                ret = nwait(cpid, os.WNOHANG)
                ret_agent = nwait(apid, os.WNOHANG)

                if ret_agent is not None:
                    # Agent died; kill the worker
                    logging.info(lf("Changelog Agent died, Aborting Worker",
                                    brick=w[0]['dir']))
                    errno_wrap(os.kill, [cpid, signal.SIGKILL], [ESRCH])
                    nwait(cpid)
                    nwait(apid)

                if ret is not None:
                    logging.info(lf("worker died before establishing "
                                    "connection",
                                    brick=w[0]['dir']))
                    nwait(apid)  # wait for agent
                else:
                    logging.debug("worker(%s) connected" % w[0]['dir'])
                    while time.time() < t0 + conn_timeout:
                        ret = nwait(cpid, os.WNOHANG)
                        ret_agent = nwait(apid, os.WNOHANG)

                        if ret is not None:
                            logging.info(lf("worker died in startup phase",
                                            brick=w[0]['dir']))
                            nwait(apid)  # wait for agent
                            break

                        if ret_agent is not None:
                            # Agent died; kill the worker
                            logging.info(lf("Changelog Agent died, Aborting "
                                            "Worker",
                                            brick=w[0]['dir']))
                            errno_wrap(os.kill, [cpid, signal.SIGKILL],
                                       [ESRCH])
                            nwait(cpid)
                            nwait(apid)
                            break

                        time.sleep(1)
            else:
                logging.info(
                    lf("Worker not confirmed after wait, aborting it. "
                       "Gsyncd invocation on remote slave via SSH or "
                       "gluster master mount might have hung. Please "
                       "check the above logs for exact issue and check "
                       "master or slave volume for errors. Restarting "
                       "master/slave volume accordingly might help.",
                       brick=w[0]['dir'],
                       timeout=conn_timeout))
                errno_wrap(os.kill, [cpid, signal.SIGKILL], [ESRCH])
                nwait(apid)  # wait for agent
                ret = nwait(cpid)
            if ret is None:
                # If worker dies, agent terminates on EOF.
                # So let's wait for the agent first.
                nwait(apid)
                ret = nwait(cpid)
            if exit_signalled(ret):
                ret = 0
            else:
                ret = exit_status(ret)
                if ret in (0, 1):
                    self.status[w[0]['dir']].set_worker_status(self.ST_FAULTY)
                    gf_event(EVENT_GEOREP_FAULTY,
                             master_volume=master.volume,
                             master_node=w[0]['host'],
                             master_node_id=w[0]['uuid'],
                             slave_host=slave_host,
                             slave_volume=slave_vol,
                             current_slave_host=current_slave_host,
                             brick_path=w[0]['dir'])
            time.sleep(10)
        self.status[w[0]['dir']].set_worker_status(self.ST_INCON)
        return ret
Example #24
 def gfid(cls, gfidpath):
     return errno_wrap(
         Xattr.lgetxattr,
         [gfidpath, 'glusterfs.gfid', cls.GX_GFID_CANONICAL_LEN], [ENOENT])
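Examples 12 and 24 read GlusterFS's virtual gfid xattrs through the Xattr wrapper, with GX_GFID_CANONICAL_LEN bounding the value size. On Linux, Python 3.3+ exposes the same primitive in the standard library; a hypothetical equivalent against a client mount:

import os

# 'glusterfs.gfid.string' is the virtual xattr used in Example 12;
# the path here is hypothetical
gfid = os.getxattr("/mnt/gluster/somefile", "glusterfs.gfid.string")
print(gfid.decode())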
Example #25
    def monitor(self, w, argv, cpids, agents, slave_vol, slave_host, master,
                suuid, slavenodes):
        """the monitor loop

        The basic logic is a blatantly simple, blunt heuristic:
        if the spawned client survives 60 secs, it's considered OK.
        This serves us pretty well, as it's not vulnerable to
        any kind of irregular behavior of the child...

        ... well, except for one: if a child hangs waiting for
        some event, it can survive aeons and still be defunct.
        So we tweak the above logic to expect the worker to send
        us a signal within 60 secs (in the form of closing its
        end of a pipe). The worker does this when it's done with
        the setup stage and is ready to enter the service loop
        (note it's the setup stage which is vulnerable to hangs --
        the full-blown worker blows up on EPIPE if the net goes
        down, due to the keep-alive thread)
        """
        if not self.status.get(w[0]['dir'], None):
            self.status[w[0]['dir']] = GeorepStatus(
                gconf.get("state-file"), w[0]['host'], w[0]['dir'],
                w[0]['uuid'], master, "%s::%s" % (slave_host, slave_vol))

        set_monitor_status(gconf.get("state-file"), self.ST_STARTED)
        self.status[w[0]['dir']].set_worker_status(self.ST_INIT)

        ret = 0

        def nwait(p, o=0):
            try:
                p2, r = waitpid(p, o)
                if not p2:
                    return
                return r
            except OSError as e:
                # no child process, this happens if the child process
                # already died and has been cleaned up
                if e.errno == ECHILD:
                    return -1
                else:
                    raise

        def exit_signalled(s):
            """ child teminated due to receipt of SIGUSR1 """
            return (os.WIFSIGNALED(s) and (os.WTERMSIG(s) == signal.SIGUSR1))

        def exit_status(s):
            if os.WIFEXITED(s):
                return os.WEXITSTATUS(s)
            return 1

        conn_timeout = gconf.get("connection-timeout")
        while ret in (0, 1):
            remote_user, remote_host = w[1][0].split("@")
            remote_id = w[1][1]
            # Check the status of the connected slave node.
            # If the connected slave node is down, try to connect
            # to a different node that is up.
            current_slave_host = remote_host
            slave_up_hosts = get_up_nodes(slavenodes, gconf.get("ssh-port"))

            if (current_slave_host, remote_id) not in slave_up_hosts:
                if len(slave_up_hosts) > 0:
                    remote_new = random.choice(slave_up_hosts)
                    remote_host = "%s@%s" % (remote_user, remote_new[0])
                    remote_id = remote_new[1]

            # Spawn the worker and agent under the lock to avoid fd leaks
            self.lock.acquire()

            logging.info(
                lf('starting gsyncd worker',
                   brick=w[0]['dir'],
                   slave_node=remote_host))

            # A couple of pipe pairs for RPC communication between
            # the worker and the changelog agent.

            # read/write end for agent
            (ra, ww) = os.pipe()
            # read/write end for worker
            (rw, wa) = os.pipe()

            # spawn the agent process
            apid = os.fork()
            if apid == 0:
                os.close(rw)
                os.close(ww)
                args_to_agent = argv + [
                    'agent', rconf.args.master, rconf.args.slave,
                    '--local-path', w[0]['dir'], '--local-node', w[0]['host'],
                    '--local-node-id', w[0]['uuid'], '--slave-id', suuid,
                    '--rpc-fd', ','.join(
                        [str(ra), str(wa), str(rw),
                         str(ww)])
                ]

                if rconf.args.config_file is not None:
                    args_to_agent += ['-c', rconf.args.config_file]

                if rconf.args.debug:
                    args_to_agent.append("--debug")

                os.execv(sys.executable, args_to_agent)

            pr, pw = os.pipe()
            cpid = os.fork()
            if cpid == 0:
                os.close(pr)
                os.close(ra)
                os.close(wa)

                args_to_worker = argv + [
                    'worker', rconf.args.master, rconf.args.slave,
                    '--feedback-fd',
                    str(pw), '--local-path', w[0]['dir'], '--local-node',
                    w[0]['host'], '--local-node-id', w[0]['uuid'],
                    '--slave-id', suuid, '--rpc-fd', ','.join(
                        [str(rw), str(ww), str(ra),
                         str(wa)]), '--subvol-num',
                    str(w[2]), '--resource-remote', remote_host,
                    '--resource-remote-id', remote_id
                ]

                if rconf.args.config_file is not None:
                    args_to_worker += ['-c', rconf.args.config_file]

                if w[3]:
                    args_to_worker.append("--is-hottier")

                if rconf.args.debug:
                    args_to_worker.append("--debug")

                os.execv(sys.executable, args_to_worker)

            cpids.add(cpid)
            agents.add(apid)
            os.close(pw)

            # close all RPC pipes in monitor
            os.close(ra)
            os.close(wa)
            os.close(rw)
            os.close(ww)
            self.lock.release()

            t0 = time.time()
            so = select((pr, ), (), (), conn_timeout)[0]
            os.close(pr)

            if so:
                ret = nwait(cpid, os.WNOHANG)
                ret_agent = nwait(apid, os.WNOHANG)

                if ret_agent is not None:
                    # Agent died, kill the worker
                    logging.info(
                        lf("Changelog Agent died, Aborting Worker",
                           brick=w[0]['dir']))
                    errno_wrap(os.kill, [cpid, signal.SIGKILL], [ESRCH])
                    nwait(cpid)
                    nwait(apid)

                if ret is not None:
                    logging.info(
                        lf("worker died before establishing "
                           "connection",
                           brick=w[0]['dir']))
                    nwait(apid)  # wait for agent
                else:
                    logging.debug("worker(%s) connected" % w[0]['dir'])
                    while time.time() < t0 + conn_timeout:
                        ret = nwait(cpid, os.WNOHANG)
                        ret_agent = nwait(apid, os.WNOHANG)

                        if ret is not None:
                            logging.info(
                                lf("worker died in startup phase",
                                   brick=w[0]['dir']))
                            nwait(apid)  # wait for agent
                            break

                        if ret_agent is not None:
                            # Agent died, kill the worker
                            logging.info(
                                lf("Changelog Agent died, Aborting "
                                   "Worker",
                                   brick=w[0]['dir']))
                            errno_wrap(os.kill, [cpid, signal.SIGKILL],
                                       [ESRCH])
                            nwait(cpid)
                            nwait(apid)
                            break

                        time.sleep(1)
            else:
                logging.info(
                    lf(
                        "Worker not confirmed after wait, aborting it. "
                        "Gsyncd invocation on remote slave via SSH or "
                        "gluster master mount might have hung. Please "
                        "check the above logs for exact issue and check "
                        "master or slave volume for errors. Restarting "
                        "master/slave volume accordingly might help.",
                        brick=w[0]['dir'],
                        timeout=conn_timeout))
                errno_wrap(os.kill, [cpid, signal.SIGKILL], [ESRCH])
                nwait(apid)  # wait for agent
                ret = nwait(cpid)
            if ret is None:
                # If the worker dies, the agent terminates on EOF.
                # So let's wait for the agent first.
                nwait(apid)
                ret = nwait(cpid)
            if exit_signalled(ret):
                ret = 0
            else:
                ret = exit_status(ret)
                if ret in (0, 1):
                    self.status[w[0]['dir']].set_worker_status(self.ST_FAULTY)
                    gf_event(EVENT_GEOREP_FAULTY,
                             master_volume=master.volume,
                             master_node=w[0]['host'],
                             master_node_id=w[0]['uuid'],
                             slave_host=slave_host,
                             slave_volume=slave_vol,
                             current_slave_host=current_slave_host,
                             brick_path=w[0]['dir'])
            time.sleep(10)
        self.status[w[0]['dir']].set_worker_status(self.ST_INCON)
        return ret
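
The 60-second heuristic from the docstring reduces to a pipe-based readiness handshake: the worker inherits the write end of a pipe (the --feedback-fd), closes it once setup succeeds, and the monitor select()s on the read end with a timeout, killing the child if no EOF arrives in time. A self-contained sketch of the pattern (the timeout and the sleep standing in for the service loop are illustrative):

import os
import sys
import time
import signal
from select import select

def spawn_with_readiness(timeout=60):
    pr, pw = os.pipe()
    pid = os.fork()
    if pid == 0:
        os.close(pr)
        # ... setup work would happen here ...
        os.close(pw)        # closing our end signals readiness (EOF)
        time.sleep(3600)    # stand-in for the service loop
        sys.exit(0)
    os.close(pw)
    # no event on the pipe within the timeout => child hung during setup
    ready = select((pr,), (), (), timeout)[0]
    os.close(pr)
    if not ready:
        os.kill(pid, signal.SIGKILL)
        os.waitpid(pid, 0)
        return None
    return pid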
Example No. 26
    def process_change(self, change, done, retry):
        pfx = gauxpfx()
        clist   = []
        entries = []
        datas = set()

        # basic crawl stats: files and bytes
        files_pending  = {'count': 0, 'purge': 0, 'bytes': 0, 'files': []}
        with open(change) as f:
            clist = f.readlines()

        def edct(op, **ed):
            dct = {}
            dct['op'] = op
            for k in ed:
                if k == 'stat':
                    st = ed[k]
                    dst = dct['stat'] = {}
                    dst['uid'] = st.st_uid
                    dst['gid'] = st.st_gid
                    dst['mode'] = st.st_mode
                else:
                    dct[k] = ed[k]
            return dct

        # regular file update: bytes & count
        def _update_reg(entry, size):
            if entry not in files_pending['files']:
                files_pending['count'] += 1
                files_pending['bytes'] += size
                files_pending['files'].append(entry)

        # updates for directories, symlinks, etc.
        def _update_rest():
            files_pending['count'] += 1

        # entry count
        def entry_update(entry, size, mode):
            if stat.S_ISREG(mode):
                _update_reg(entry, size)
            else:
                _update_rest()

        # purge count
        def purge_update():
            files_pending['purge'] += 1

        for e in clist:
            e = e.strip()
            et = e[self.IDX_START:self.IDX_END]
            ec = e[self.IDX_END:].split(' ')
            if et in self.TYPE_ENTRY:
                ty = ec[self.POS_TYPE]
                en = unescape(os.path.join(pfx, ec[self.POS_ENTRY1]))
                gfid = ec[self.POS_GFID]
                # definitely need a better way to bucketize entry ops
                if ty in ['UNLINK', 'RMDIR']:
                    purge_update()
                    entries.append(edct(ty, gfid=gfid, entry=en))
                    continue
                go = os.path.join(pfx, gfid)
                st = lstat(go)
                if isinstance(st, int):
                    if ty == 'RENAME':
                        entries.append(edct('UNLINK', gfid=gfid, entry=en))
                    else:
                        logging.debug('file %s got purged in the interim' % go)
                    continue
                entry_update(go, st.st_size, st.st_mode)
                if ty in ['CREATE', 'MKDIR', 'MKNOD']:
                    entries.append(edct(ty, stat=st, entry=en, gfid=gfid))
                elif ty == 'LINK':
                    entries.append(edct(ty, stat=st, entry=en, gfid=gfid))
                elif ty == 'SYMLINK':
                    rl = errno_wrap(os.readlink, [en], [ENOENT])
                    if isinstance(rl, int):
                        continue
                    entries.append(edct(ty, stat=st, entry=en, gfid=gfid, link=rl))
                elif ty == 'RENAME':
                    e2 = unescape(os.path.join(pfx, ec[self.POS_ENTRY2]))
                    entries.append(edct(ty, gfid=gfid, entry=en, entry1=e2, stat=st))
                else:
                    logging.warn('ignoring %s [op %s]' % (gfid, ty))
            elif et in self.TYPE_GFID:
                go = os.path.join(pfx, ec[0])
                st = lstat(go)
                if isinstance(st, int):
                    logging.debug('file %s got purged in the interim' % go)
                    continue
                entry_update(go, st.st_size, st.st_mode)
                datas.update([go])
        logging.debug('entries: %s' % repr(entries))
        if not retry:
            self.update_cumulative_stats(files_pending)
        # sync namespace
        if entries:
            self.slave.server.entry_ops(entries)
        # sync data
        if self.syncdata(datas):
            if done:
                self.master.server.changelog_done(change)
            return True
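
process_change hands slave.server.entry_ops() a list of plain dicts shaped by the edct() helper above. A hedged illustration of the record for a CREATE, with the GFID and aux-namespace path as obvious placeholders:

import os

st = os.lstat('.')
create_entry = {
    'op': 'CREATE',
    'gfid': '00000000-0000-0000-0000-000000000000',  # placeholder GFID
    'entry': '.gfid/<parent-gfid>/<basename>',       # placeholder aux path
    'stat': {'uid': st.st_uid, 'gid': st.st_gid, 'mode': st.st_mode},
}
print(create_entry)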
Example No. 27
    def crawl(self, path='.', xtr=None, done=0):
        """ generate a CHANGELOG file consumable by process_change """
        if path == '.':
            self.open()
            self.crawls += 1
        if not xtr:
            # get the root stime and use it for all comparisons
            xtr = self.xtime('.', self.slave)
            if isinstance(xtr, int):
                if xtr != ENOENT:
                    raise GsyncdError('slave is corrupt')
                xtr = self.minus_infinity
        xtl = self.xtime(path)
        if isinstance(xtl, int):
            raise GsyncdError('master is corrupt')
        if xtr == xtl:
            if path == '.':
                self.close()
            return
        self.xtime_reversion_hook(path, xtl, xtr)
        logging.debug("entering " + path)
        dem = self.master.server.entries(path)
        pargfid = self.master.server.gfid(path)
        if isinstance(pargfid, int):
            logging.warn('skipping directory %s' % (path))
        for e in dem:
            bname = e
            e = os.path.join(path, e)
            st = lstat(e)
            if isinstance(st, int):
                logging.warn('%s got purged in the interim..' % e)
                continue
            gfid = self.master.server.gfid(e)
            if isinstance(gfid, int):
                logging.warn('skipping entry %s..' % (e))
                continue
            xte = self.xtime(e)
            if isinstance(xte, int):
                raise GsyncdError('master is corrupt')
            if not self.need_sync(e, xte, xtr):
                continue
            mo = st.st_mode
            if stat.S_ISDIR(mo):
                self.write_entry_change(
                    "E", [gfid, 'MKDIR',
                          escape(os.path.join(pargfid, bname))])
                self.crawl(e, xtr)
            elif stat.S_ISLNK(mo):
                rl = errno_wrap(os.readlink, [e], [ENOENT])
                if isinstance(rl, int):
                    continue
                self.write_entry_change("E", [
                    gfid, 'SYMLINK',
                    escape(os.path.join(pargfid, bname)), rl
                ])
            else:
                # if a file has multiple hardlinks, record the changelog
                # entry as 'LINK' so the slave side can decide whether to
                # create a new entry or just a link
                if st.st_nlink == 1:
                    self.write_entry_change(
                        "E",
                        [gfid, 'MKNOD',
                         escape(os.path.join(pargfid, bname))])
                else:
                    self.write_entry_change(
                        "E",
                        [gfid, 'LINK',
                         escape(os.path.join(pargfid, bname))])
                if stat.S_ISREG(mo):
                    self.write_entry_change("D", [gfid])

        if path == '.':
            logging.info('processing xsync changelog %s' % self.fname())
            self.close()
            self.process([self.fname()], done)
            self.upd_stime(xtl)
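
The crawl prunes whole subtrees by xtime comparison: when a directory's master-side xtime equals the xtime recorded for the slave, nothing beneath it changed since the last sync, so recursion stops there. A minimal sketch of the pruning idea (get_xtime, list_dirs and visit are hypothetical stand-ins, not gsyncd APIs):

def prune_crawl(path, slave_xtime, get_xtime, list_dirs, visit):
    # whole subtree unchanged since the last sync: skip it
    if get_xtime(path) == slave_xtime:
        return
    visit(path)
    for child in list_dirs(path):
        prune_crawl(child, slave_xtime, get_xtime, list_dirs, visit)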
Example No. 28
    def gfid(cls, gfidpath):
        return errno_wrap(Xattr.lgetxattr,
                          [gfidpath, "glusterfs.gfid",
                           cls.GX_GFID_CANONICAL_LEN], [ENOENT])