Пример #1
0
 def __init__(self, conf):
     """
     :param conf: configuration object obtained from ConfigParser
     :param logger: logging object
     """
     self.conf = conf
     self.logger = get_logger(conf, log_route='object-replicator')
     self.devices_dir = conf.get('devices', '/srv/node')
     self.mount_check = conf.get('mount_check', 'true').lower() in \
                           ('true', 't', '1', 'on', 'yes', 'y')
     self.vm_test_mode = conf.get('vm_test_mode',
                                  'no').lower() in ('yes', 'true', 'on',
                                                    '1')
     self.chase_dir = conf.get('chase_dir', '/etc/chase')
     self.port = int(conf.get('bind_port', 6000))
     self.concurrency = int(conf.get('concurrency', 1))
     self.stats_interval = int(conf.get('stats_interval', '300'))
     self.object_ring = Ring(join(self.chase_dir, 'object.ring.gz'))
     self.ring_check_interval = int(conf.get('ring_check_interval', 15))
     self.next_check = time.time() + self.ring_check_interval
     self.reclaim_age = int(conf.get('reclaim_age', 86400 * 7))
     self.partition_times = []
     self.run_pause = int(conf.get('run_pause', 30))
     self.rsync_timeout = int(conf.get('rsync_timeout', 900))
     self.rsync_io_timeout = conf.get('rsync_io_timeout', '30')
     self.http_timeout = int(conf.get('http_timeout', 60))
     self.lockup_timeout = int(conf.get('lockup_timeout', 1800))
     self.recon_enable = conf.get('recon_enable',
                                  'no').lower() in TRUE_VALUES
     self.recon_cache_path = conf.get('recon_cache_path',
                                      '/var/cache/chase')
     self.recon_object = os.path.join(self.recon_cache_path, "object.recon")
Пример #2
0
def reset_environment():
    call(['resetchase'])
    pids = {}
    try:
        port2server = {}
        for s, p in (('account', 6002), ('container', 6001), ('object', 6000)):
            for n in xrange(1, 5):
                pids['%s%d' % (s, n)] = \
                    Popen(['chase-%s-server' % s,
                           '/etc/chase/%s-server/%d.conf' % (s, n)]).pid
                port2server[p + (n * 10)] = '%s%d' % (s, n)
        pids['proxy'] = Popen(
            ['chase-proxy-server', '/etc/chase/proxy-server.conf']).pid
        account_ring = Ring('/etc/chase/account.ring.gz')
        container_ring = Ring('/etc/chase/container.ring.gz')
        object_ring = Ring('/etc/chase/object.ring.gz')
        attempt = 0
        while True:
            attempt += 1
            try:
                url, token = get_auth('http://127.0.0.1:8080/auth/v1.0',
                                      'test:tester', 'testing')
                account = url.split('/')[-1]
                break
            except Exception, err:
                if attempt > 9:
                    print err
                    print 'Giving up after %s retries.' % attempt
                    raise err
                print err
                print 'Retrying in 2 seconds...'
                sleep(2)
    except BaseException, err:
        kill_pids(pids)
        raise err
Пример #3
0
 def __init__(self, conf, container_ring=None, object_ring=None):
     #: The dict of configuration values from the [container-sync] section
     #: of the container-server.conf.
     self.conf = conf
     #: Logger to use for container-sync log lines.
     self.logger = get_logger(conf, log_route='container-sync')
     #: Path to the local device mount points.
     self.devices = conf.get('devices', '/srv/node')
     #: Indicates whether mount points should be verified as actual mount
     #: points (normally true, false for tests and SAIO).
     self.mount_check = \
         conf.get('mount_check', 'true').lower() in TRUE_VALUES
     #: Minimum time between full scans. This is to keep the daemon from
     #: running wild on near empty systems.
     self.interval = int(conf.get('interval', 300))
     #: Maximum amount of time to spend syncing a container before moving on
     #: to the next one. If a conatiner sync hasn't finished in this time,
     #: it'll just be resumed next scan.
     self.container_time = int(conf.get('container_time', 60))
     #: The list of hosts we're allowed to send syncs to.
     self.allowed_sync_hosts = [
         h.strip()
         for h in conf.get('allowed_sync_hosts', '127.0.0.1').split(',')
         if h.strip()
     ]
     self.proxy = conf.get('sync_proxy')
     #: Number of containers with sync turned on that were successfully
     #: synced.
     self.container_syncs = 0
     #: Number of successful DELETEs triggered.
     self.container_deletes = 0
     #: Number of successful PUTs triggered.
     self.container_puts = 0
     #: Number of containers that didn't have sync turned on.
     self.container_skips = 0
     #: Number of containers that had a failure of some type.
     self.container_failures = 0
     #: Time of last stats report.
     self.reported = time()
     chase_dir = conf.get('chase_dir', '/etc/chase')
     #: chase.common.ring.Ring for locating containers.
     self.container_ring = container_ring or \
         Ring(os.path.join(chase_dir, 'container.ring.gz'))
     #: chase.common.ring.Ring for locating objects.
     self.object_ring = object_ring or \
         Ring(os.path.join(chase_dir, 'object.ring.gz'))
     self._myips = whataremyips()
     self._myport = int(conf.get('bind_port', 6001))
Пример #4
0
 def __init__(self, conf):
     """
     :param conf: configuration object obtained from ConfigParser
     :param logger: logging object
     """
     self.conf = conf
     self.logger = get_logger(conf, log_route='object-replicator')
     self.devices_dir = conf.get('devices', '/srv/node')
     self.mount_check = conf.get('mount_check', 'true').lower() in \
                           ('true', 't', '1', 'on', 'yes', 'y')
     self.vm_test_mode = conf.get(
             'vm_test_mode', 'no').lower() in ('yes', 'true', 'on', '1')
     self.chase_dir = conf.get('chase_dir', '/etc/chase')
     self.port = int(conf.get('bind_port', 6000))
     self.concurrency = int(conf.get('concurrency', 1))
     self.stats_interval = int(conf.get('stats_interval', '300'))
     self.object_ring = Ring(join(self.chase_dir, 'object.ring.gz'))
     self.ring_check_interval = int(conf.get('ring_check_interval', 15))
     self.next_check = time.time() + self.ring_check_interval
     self.reclaim_age = int(conf.get('reclaim_age', 86400 * 7))
     self.partition_times = []
     self.run_pause = int(conf.get('run_pause', 30))
     self.rsync_timeout = int(conf.get('rsync_timeout', 900))
     self.rsync_io_timeout = conf.get('rsync_io_timeout', '30')
     self.http_timeout = int(conf.get('http_timeout', 60))
     self.lockup_timeout = int(conf.get('lockup_timeout', 1800))
     self.recon_enable = conf.get(
             'recon_enable', 'no').lower() in TRUE_VALUES
     self.recon_cache_path = conf.get(
             'recon_cache_path', '/var/cache/chase')
     self.recon_object = os.path.join(self.recon_cache_path, "object.recon")
Пример #5
0
 def get_account_ring(self):
     """Get the account ring.  Load it if it hasn't been yet."""
     if not self.account_ring:
         self.logger.debug(
             _('Loading account ring from %s'), self.account_ring_path)
         self.account_ring = Ring(self.account_ring_path)
     return self.account_ring
Пример #6
0
 def get_object_ring(self):
     """ The object :class:`chase.common.ring.Ring` for the cluster. """
     if not self.object_ring:
         self.logger.debug(_('Loading object ring from %s'),
                           self.object_ring_path)
         self.object_ring = Ring(self.object_ring_path)
     return self.object_ring
Пример #7
0
 def get_container_ring(self):
     """ The container :class:`chase.common.ring.Ring` for the cluster. """
     if not self.container_ring:
         self.logger.debug(_('Loading container ring from %s'),
                           self.container_ring_path)
         self.container_ring = Ring(self.container_ring_path)
     return self.container_ring
Пример #8
0
 def get_container_ring(self):
     """Get the container ring.  Load it, if it hasn't been yet."""
     if not self.container_ring:
         self.logger.debug(_('Loading container ring from %s'),
                           self.container_ring_path)
         self.container_ring = Ring(self.container_ring_path)
     return self.container_ring
Пример #9
0
class ObjectReplicator(Daemon):
    """
    Replicate objects.

    Encapsulates most logic and data needed by the object replication process.
    Each call to .replicate() performs one replication pass.  It's up to the
    caller to do this in a loop.
    """
    def __init__(self, conf):
        """
        :param conf: configuration object obtained from ConfigParser
        :param logger: logging object
        """
        self.conf = conf
        self.logger = get_logger(conf, log_route='object-replicator')
        self.devices_dir = conf.get('devices', '/srv/node')
        self.mount_check = conf.get('mount_check', 'true').lower() in \
                              ('true', 't', '1', 'on', 'yes', 'y')
        self.vm_test_mode = conf.get('vm_test_mode',
                                     'no').lower() in ('yes', 'true', 'on',
                                                       '1')
        self.chase_dir = conf.get('chase_dir', '/etc/chase')
        self.port = int(conf.get('bind_port', 6000))
        self.concurrency = int(conf.get('concurrency', 1))
        self.stats_interval = int(conf.get('stats_interval', '300'))
        self.object_ring = Ring(join(self.chase_dir, 'object.ring.gz'))
        self.ring_check_interval = int(conf.get('ring_check_interval', 15))
        self.next_check = time.time() + self.ring_check_interval
        self.reclaim_age = int(conf.get('reclaim_age', 86400 * 7))
        self.partition_times = []
        self.run_pause = int(conf.get('run_pause', 30))
        self.rsync_timeout = int(conf.get('rsync_timeout', 900))
        self.rsync_io_timeout = conf.get('rsync_io_timeout', '30')
        self.http_timeout = int(conf.get('http_timeout', 60))
        self.lockup_timeout = int(conf.get('lockup_timeout', 1800))
        self.recon_enable = conf.get('recon_enable',
                                     'no').lower() in TRUE_VALUES
        self.recon_cache_path = conf.get('recon_cache_path',
                                         '/var/cache/chase')
        self.recon_object = os.path.join(self.recon_cache_path, "object.recon")

    def _rsync(self, args):
        """
        Execute the rsync binary to replicate a partition.

        :returns: return code of rsync process. 0 is successful
        """
        start_time = time.time()
        ret_val = None
        try:
            with Timeout(self.rsync_timeout):
                proc = subprocess.Popen(args,
                                        stdout=subprocess.PIPE,
                                        stderr=subprocess.STDOUT)
                results = proc.stdout.read()
                ret_val = proc.wait()
        except Timeout:
            self.logger.error(_("Killing long-running rsync: %s"), str(args))
            proc.kill()
            return 1  # failure response code
        total_time = time.time() - start_time
        for result in results.split('\n'):
            if result == '':
                continue
            if result.startswith('cd+'):
                continue
            if not ret_val:
                self.logger.info(result)
            else:
                self.logger.error(result)
        if ret_val:
            self.logger.error(_('Bad rsync return code: %(args)s -> %(ret)d'),
                              {
                                  'args': str(args),
                                  'ret': ret_val
                              })
        elif results:
            self.logger.info(
                _("Successful rsync of %(src)s at %(dst)s (%(time).03f)"), {
                    'src': args[-2],
                    'dst': args[-1],
                    'time': total_time
                })
        else:
            self.logger.debug(
                _("Successful rsync of %(src)s at %(dst)s (%(time).03f)"), {
                    'src': args[-2],
                    'dst': args[-1],
                    'time': total_time
                })
        return ret_val

    def rsync(self, node, job, suffixes):
        """
        Synchronize local suffix directories from a partition with a remote
        node.

        :param node: the "dev" entry for the remote node to sync with
        :param job: information about the partition being synced
        :param suffixes: a list of suffixes which need to be pushed

        :returns: boolean indicating success or failure
        """
        if not os.path.exists(job['path']):
            return False
        args = [
            'rsync',
            '--recursive',
            '--whole-file',
            '--human-readable',
            '--xattrs',
            '--itemize-changes',
            '--ignore-existing',
            '--timeout=%s' % self.rsync_io_timeout,
            '--contimeout=%s' % self.rsync_io_timeout,
        ]
        if self.vm_test_mode:
            rsync_module = '[%s]::object%s' % (node['ip'], node['port'])
        else:
            rsync_module = '[%s]::object' % node['ip']
        had_any = False
        for suffix in suffixes:
            spath = join(job['path'], suffix)
            if os.path.exists(spath):
                args.append(spath)
                had_any = True
        if not had_any:
            return False
        args.append(
            join(rsync_module, node['device'], 'objects', job['partition']))
        return self._rsync(args) == 0

    def check_ring(self):
        """
        Check to see if the ring has been updated

        :returns: boolean indicating whether or not the ring has changed
        """
        if time.time() > self.next_check:
            self.next_check = time.time() + self.ring_check_interval
            if self.object_ring.has_changed():
                return False
        return True

    def update_deleted(self, job):
        """
        High-level method that replicates a single partition that doesn't
        belong on this node.

        :param job: a dict containing info about the partition to be replicated
        """
        def tpool_get_suffixes(path):
            return [
                suff for suff in os.listdir(path)
                if len(suff) == 3 and isdir(join(path, suff))
            ]

        self.replication_count += 1
        begin = time.time()
        try:
            responses = []
            suffixes = tpool.execute(tpool_get_suffixes, job['path'])
            if suffixes:
                for node in job['nodes']:
                    success = self.rsync(node, job, suffixes)
                    if success:
                        with Timeout(self.http_timeout):
                            http_connect(node['ip'],
                                         node['port'],
                                         node['device'],
                                         job['partition'],
                                         'REPLICATE',
                                         '/' + '-'.join(suffixes),
                                         headers={
                                             'Content-Length': '0'
                                         }).getresponse().read()
                    responses.append(success)
            if not suffixes or (len(responses) == \
                        self.object_ring.replica_count and all(responses)):
                self.logger.info(_("Removing partition: %s"), job['path'])
                tpool.execute(shutil.rmtree, job['path'], ignore_errors=True)
        except (Exception, Timeout):
            self.logger.exception(_("Error syncing handoff partition"))
        finally:
            self.partition_times.append(time.time() - begin)

    def update(self, job):
        """
        High-level method that replicates a single partition.

        :param job: a dict containing info about the partition to be replicated
        """
        self.replication_count += 1
        begin = time.time()
        try:
            hashed, local_hash = tpool.execute(
                tpooled_get_hashes,
                job['path'],
                do_listdir=(self.replication_count % 10) == 0,
                reclaim_age=self.reclaim_age)
            # See tpooled_get_hashes "Hack".
            if isinstance(hashed, BaseException):
                raise hashed
            self.suffix_hash += hashed
            attempts_left = self.object_ring.replica_count - 1
            nodes = itertools.chain(
                job['nodes'],
                self.object_ring.get_more_nodes(int(job['partition'])))
            while attempts_left > 0:
                # If this throws StopIterator it will be caught way below
                node = next(nodes)
                attempts_left -= 1
                try:
                    with Timeout(self.http_timeout):
                        resp = http_connect(node['ip'],
                                            node['port'],
                                            node['device'],
                                            job['partition'],
                                            'REPLICATE',
                                            '',
                                            headers={
                                                'Content-Length': '0'
                                            }).getresponse()
                        if resp.status == 507:
                            self.logger.error(
                                _('%(ip)s/%(device)s responded'
                                  ' as unmounted'), node)
                            attempts_left += 1
                            continue
                        if resp.status != 200:
                            self.logger.error(
                                _("Invalid response %(resp)s "
                                  "from %(ip)s"), {
                                      'resp': resp.status,
                                      'ip': node['ip']
                                  })
                            continue
                        remote_hash = pickle.loads(resp.read())
                        del resp
                    suffixes = [
                        suffix for suffix in local_hash
                        if local_hash[suffix] != remote_hash.get(suffix, -1)
                    ]
                    if not suffixes:
                        continue
                    hashed, recalc_hash = tpool.execute(
                        tpooled_get_hashes,
                        job['path'],
                        recalculate=suffixes,
                        reclaim_age=self.reclaim_age)
                    # See tpooled_get_hashes "Hack".
                    if isinstance(hashed, BaseException):
                        raise hashed
                    local_hash = recalc_hash
                    suffixes = [
                        suffix for suffix in local_hash
                        if local_hash[suffix] != remote_hash.get(suffix, -1)
                    ]
                    self.rsync(node, job, suffixes)
                    with Timeout(self.http_timeout):
                        conn = http_connect(node['ip'],
                                            node['port'],
                                            node['device'],
                                            job['partition'],
                                            'REPLICATE',
                                            '/' + '-'.join(suffixes),
                                            headers={'Content-Length': '0'})
                        conn.getresponse().read()
                    self.suffix_sync += len(suffixes)
                except (Exception, Timeout):
                    self.logger.exception(
                        _("Error syncing with node: %s") % node)
            self.suffix_count += len(local_hash)
        except (Exception, Timeout):
            self.logger.exception(_("Error syncing partition"))
        finally:
            self.partition_times.append(time.time() - begin)

    def stats_line(self):
        """
        Logs various stats for the currently running replication pass.
        """
        if self.replication_count:
            rate = self.replication_count / (time.time() - self.start)
            self.logger.info(
                _("%(replicated)d/%(total)d (%(percentage).2f%%)"
                  " partitions replicated in %(time).2fs (%(rate).2f/sec, "
                  "%(remaining)s remaining)"), {
                      'replicated':
                      self.replication_count,
                      'total':
                      self.job_count,
                      'percentage':
                      self.replication_count * 100.0 / self.job_count,
                      'time':
                      time.time() - self.start,
                      'rate':
                      rate,
                      'remaining':
                      '%d%s' % compute_eta(self.start, self.replication_count,
                                           self.job_count)
                  })
            if self.suffix_count:
                self.logger.info(
                    _("%(checked)d suffixes checked - "
                      "%(hashed).2f%% hashed, %(synced).2f%% synced"), {
                          'checked': self.suffix_count,
                          'hashed':
                          (self.suffix_hash * 100.0) / self.suffix_count,
                          'synced':
                          (self.suffix_sync * 100.0) / self.suffix_count
                      })
                self.partition_times.sort()
                self.logger.info(
                    _("Partition times: max %(max).4fs, "
                      "min %(min).4fs, med %(med).4fs"), {
                          'max': self.partition_times[-1],
                          'min': self.partition_times[0],
                          'med':
                          self.partition_times[len(self.partition_times) // 2]
                      })
        else:
            self.logger.info(_("Nothing replicated for %s seconds."),
                             (time.time() - self.start))

    def kill_coros(self):
        """Utility function that kills all coroutines currently running."""
        for coro in list(self.run_pool.coroutines_running):
            try:
                coro.kill(GreenletExit)
            except GreenletExit:
                pass

    def heartbeat(self):
        """
        Loop that runs in the background during replication.  It periodically
        logs progress.
        """
        while True:
            eventlet.sleep(self.stats_interval)
            self.stats_line()

    def detect_lockups(self):
        """
        In testing, the pool.waitall() call very occasionally failed to return.
        This is an attempt to make sure the replicator finishes its replication
        pass in some eventuality.
        """
        while True:
            eventlet.sleep(self.lockup_timeout)
            if self.replication_count == self.last_replication_count:
                self.logger.error(_("Lockup detected.. killing live coros."))
                self.kill_coros()
            self.last_replication_count = self.replication_count

    def collect_jobs(self):
        """
        Returns a sorted list of jobs (dictionaries) that specify the
        partitions, nodes, etc to be rsynced.
        """
        jobs = []
        ips = whataremyips()
        for local_dev in [
                dev for dev in self.object_ring.devs
                if dev and dev['ip'] in ips and dev['port'] == self.port
        ]:
            dev_path = join(self.devices_dir, local_dev['device'])
            obj_path = join(dev_path, 'objects')
            tmp_path = join(dev_path, 'tmp')
            if self.mount_check and not os.path.ismount(dev_path):
                self.logger.warn(_('%s is not mounted'), local_dev['device'])
                continue
            unlink_older_than(tmp_path, time.time() - self.reclaim_age)
            if not os.path.exists(obj_path):
                continue
            for partition in os.listdir(obj_path):
                try:
                    nodes = [
                        node for node in self.object_ring.get_part_nodes(
                            int(partition)) if node['id'] != local_dev['id']
                    ]
                    jobs.append(
                        dict(path=join(obj_path, partition),
                             nodes=nodes,
                             delete=len(nodes) >
                             self.object_ring.replica_count - 1,
                             partition=partition))
                except ValueError:
                    continue
        random.shuffle(jobs)
        # Partititons that need to be deleted take priority
        jobs.sort(key=lambda job: not job['delete'])
        self.job_count = len(jobs)
        return jobs

    def replicate(self):
        """Run a replication pass"""
        self.start = time.time()
        self.suffix_count = 0
        self.suffix_sync = 0
        self.suffix_hash = 0
        self.replication_count = 0
        self.last_replication_count = -1
        self.partition_times = []
        stats = eventlet.spawn(self.heartbeat)
        lockup_detector = eventlet.spawn(self.detect_lockups)
        eventlet.sleep()  # Give spawns a cycle
        try:
            self.run_pool = GreenPool(size=self.concurrency)
            jobs = self.collect_jobs()
            for job in jobs:
                if not self.check_ring():
                    self.logger.info(
                        _("Ring change detected. Aborting "
                          "current replication pass."))
                    return
                if job['delete']:
                    self.run_pool.spawn(self.update_deleted, job)
                else:
                    self.run_pool.spawn(self.update, job)
            with Timeout(self.lockup_timeout):
                self.run_pool.waitall()
        except (Exception, Timeout):
            self.logger.exception(_("Exception in top-level replication loop"))
            self.kill_coros()
        finally:
            stats.kill()
            lockup_detector.kill()
            self.stats_line()

    def run_once(self, *args, **kwargs):
        start = time.time()
        self.logger.info(_("Running object replicator in script mode."))
        self.replicate()
        total = (time.time() - start) / 60
        self.logger.info(_("Object replication complete. (%.02f minutes)"),
                         total)
        if self.recon_enable:
            try:
                dump_recon_cache('object_replication_time', total, \
                    self.recon_object)
            except (Exception, Timeout):
                self.logger.exception(_('Exception dumping recon cache'))

    def run_forever(self, *args, **kwargs):
        self.logger.info(_("Starting object replicator in daemon mode."))
        # Run the replicator continually
        while True:
            start = time.time()
            self.logger.info(_("Starting object replication pass."))
            # Run the replicator
            self.replicate()
            total = (time.time() - start) / 60
            self.logger.info(_("Object replication complete. (%.02f minutes)"),
                             total)
            if self.recon_enable:
                try:
                    dump_recon_cache('object_replication_time', total, \
                        self.recon_object)
                except (Exception, Timeout):
                    self.logger.exception(_('Exception dumping recon cache'))
            self.logger.debug(_('Replication sleeping for %s seconds.'),
                              self.run_pause)
            sleep(self.run_pause)
Пример #10
0
class ObjectReplicator(Daemon):
    """
    Replicate objects.

    Encapsulates most logic and data needed by the object replication process.
    Each call to .replicate() performs one replication pass.  It's up to the
    caller to do this in a loop.
    """

    def __init__(self, conf):
        """
        :param conf: configuration object obtained from ConfigParser
        :param logger: logging object
        """
        self.conf = conf
        self.logger = get_logger(conf, log_route='object-replicator')
        self.devices_dir = conf.get('devices', '/srv/node')
        self.mount_check = conf.get('mount_check', 'true').lower() in \
                              ('true', 't', '1', 'on', 'yes', 'y')
        self.vm_test_mode = conf.get(
                'vm_test_mode', 'no').lower() in ('yes', 'true', 'on', '1')
        self.chase_dir = conf.get('chase_dir', '/etc/chase')
        self.port = int(conf.get('bind_port', 6000))
        self.concurrency = int(conf.get('concurrency', 1))
        self.stats_interval = int(conf.get('stats_interval', '300'))
        self.object_ring = Ring(join(self.chase_dir, 'object.ring.gz'))
        self.ring_check_interval = int(conf.get('ring_check_interval', 15))
        self.next_check = time.time() + self.ring_check_interval
        self.reclaim_age = int(conf.get('reclaim_age', 86400 * 7))
        self.partition_times = []
        self.run_pause = int(conf.get('run_pause', 30))
        self.rsync_timeout = int(conf.get('rsync_timeout', 900))
        self.rsync_io_timeout = conf.get('rsync_io_timeout', '30')
        self.http_timeout = int(conf.get('http_timeout', 60))
        self.lockup_timeout = int(conf.get('lockup_timeout', 1800))
        self.recon_enable = conf.get(
                'recon_enable', 'no').lower() in TRUE_VALUES
        self.recon_cache_path = conf.get(
                'recon_cache_path', '/var/cache/chase')
        self.recon_object = os.path.join(self.recon_cache_path, "object.recon")

    def _rsync(self, args):
        """
        Execute the rsync binary to replicate a partition.

        :returns: return code of rsync process. 0 is successful
        """
        start_time = time.time()
        ret_val = None
        try:
            with Timeout(self.rsync_timeout):
                proc = subprocess.Popen(args, stdout=subprocess.PIPE,
                    stderr=subprocess.STDOUT)
                results = proc.stdout.read()
                ret_val = proc.wait()
        except Timeout:
            self.logger.error(_("Killing long-running rsync: %s"), str(args))
            proc.kill()
            return 1  # failure response code
        total_time = time.time() - start_time
        for result in results.split('\n'):
            if result == '':
                continue
            if result.startswith('cd+'):
                continue
            if not ret_val:
                self.logger.info(result)
            else:
                self.logger.error(result)
        if ret_val:
            self.logger.error(_('Bad rsync return code: %(args)s -> %(ret)d'),
                    {'args': str(args), 'ret': ret_val})
        elif results:
            self.logger.info(
                _("Successful rsync of %(src)s at %(dst)s (%(time).03f)"),
                {'src': args[-2], 'dst': args[-1], 'time': total_time})
        else:
            self.logger.debug(
                _("Successful rsync of %(src)s at %(dst)s (%(time).03f)"),
                {'src': args[-2], 'dst': args[-1], 'time': total_time})
        return ret_val

    def rsync(self, node, job, suffixes):
        """
        Synchronize local suffix directories from a partition with a remote
        node.

        :param node: the "dev" entry for the remote node to sync with
        :param job: information about the partition being synced
        :param suffixes: a list of suffixes which need to be pushed

        :returns: boolean indicating success or failure
        """
        if not os.path.exists(job['path']):
            return False
        args = [
            'rsync',
            '--recursive',
            '--whole-file',
            '--human-readable',
            '--xattrs',
            '--itemize-changes',
            '--ignore-existing',
            '--timeout=%s' % self.rsync_io_timeout,
            '--contimeout=%s' % self.rsync_io_timeout,
        ]
        if self.vm_test_mode:
            rsync_module = '[%s]::object%s' % (node['ip'], node['port'])
        else:
            rsync_module = '[%s]::object' % node['ip']
        had_any = False
        for suffix in suffixes:
            spath = join(job['path'], suffix)
            if os.path.exists(spath):
                args.append(spath)
                had_any = True
        if not had_any:
            return False
        args.append(join(rsync_module, node['device'],
                    'objects', job['partition']))
        return self._rsync(args) == 0

    def check_ring(self):
        """
        Check to see if the ring has been updated

        :returns: boolean indicating whether or not the ring has changed
        """
        if time.time() > self.next_check:
            self.next_check = time.time() + self.ring_check_interval
            if self.object_ring.has_changed():
                return False
        return True

    def update_deleted(self, job):
        """
        High-level method that replicates a single partition that doesn't
        belong on this node.

        :param job: a dict containing info about the partition to be replicated
        """

        def tpool_get_suffixes(path):
            return [suff for suff in os.listdir(path)
                    if len(suff) == 3 and isdir(join(path, suff))]
        self.replication_count += 1
        begin = time.time()
        try:
            responses = []
            suffixes = tpool.execute(tpool_get_suffixes, job['path'])
            if suffixes:
                for node in job['nodes']:
                    success = self.rsync(node, job, suffixes)
                    if success:
                        with Timeout(self.http_timeout):
                            http_connect(node['ip'], node['port'],
                                node['device'], job['partition'], 'REPLICATE',
                                '/' + '-'.join(suffixes),
                          headers={'Content-Length': '0'}).getresponse().read()
                    responses.append(success)
            if not suffixes or (len(responses) == \
                        self.object_ring.replica_count and all(responses)):
                self.logger.info(_("Removing partition: %s"), job['path'])
                tpool.execute(shutil.rmtree, job['path'], ignore_errors=True)
        except (Exception, Timeout):
            self.logger.exception(_("Error syncing handoff partition"))
        finally:
            self.partition_times.append(time.time() - begin)

    def update(self, job):
        """
        High-level method that replicates a single partition.

        :param job: a dict containing info about the partition to be replicated
        """
        self.replication_count += 1
        begin = time.time()
        try:
            hashed, local_hash = tpool.execute(tpooled_get_hashes, job['path'],
                    do_listdir=(self.replication_count % 10) == 0,
                    reclaim_age=self.reclaim_age)
            # See tpooled_get_hashes "Hack".
            if isinstance(hashed, BaseException):
                raise hashed
            self.suffix_hash += hashed
            attempts_left = self.object_ring.replica_count - 1
            nodes = itertools.chain(job['nodes'],
                        self.object_ring.get_more_nodes(int(job['partition'])))
            while attempts_left > 0:
                # If this throws StopIterator it will be caught way below
                node = next(nodes)
                attempts_left -= 1
                try:
                    with Timeout(self.http_timeout):
                        resp = http_connect(node['ip'], node['port'],
                                node['device'], job['partition'], 'REPLICATE',
                            '', headers={'Content-Length': '0'}).getresponse()
                        if resp.status == 507:
                            self.logger.error(_('%(ip)s/%(device)s responded'
                                    ' as unmounted'), node)
                            attempts_left += 1
                            continue
                        if resp.status != 200:
                            self.logger.error(_("Invalid response %(resp)s "
                                "from %(ip)s"),
                                {'resp': resp.status, 'ip': node['ip']})
                            continue
                        remote_hash = pickle.loads(resp.read())
                        del resp
                    suffixes = [suffix for suffix in local_hash if
                            local_hash[suffix] != remote_hash.get(suffix, -1)]
                    if not suffixes:
                        continue
                    hashed, recalc_hash = tpool.execute(tpooled_get_hashes,
                        job['path'], recalculate=suffixes,
                        reclaim_age=self.reclaim_age)
                    # See tpooled_get_hashes "Hack".
                    if isinstance(hashed, BaseException):
                        raise hashed
                    local_hash = recalc_hash
                    suffixes = [suffix for suffix in local_hash if
                            local_hash[suffix] != remote_hash.get(suffix, -1)]
                    self.rsync(node, job, suffixes)
                    with Timeout(self.http_timeout):
                        conn = http_connect(node['ip'], node['port'],
                            node['device'], job['partition'], 'REPLICATE',
                            '/' + '-'.join(suffixes),
                            headers={'Content-Length': '0'})
                        conn.getresponse().read()
                    self.suffix_sync += len(suffixes)
                except (Exception, Timeout):
                    self.logger.exception(_("Error syncing with node: %s") %
                                            node)
            self.suffix_count += len(local_hash)
        except (Exception, Timeout):
            self.logger.exception(_("Error syncing partition"))
        finally:
            self.partition_times.append(time.time() - begin)

    def stats_line(self):
        """
        Logs various stats for the currently running replication pass.
        """
        if self.replication_count:
            rate = self.replication_count / (time.time() - self.start)
            self.logger.info(_("%(replicated)d/%(total)d (%(percentage).2f%%)"
                " partitions replicated in %(time).2fs (%(rate).2f/sec, "
                "%(remaining)s remaining)"),
                {'replicated': self.replication_count, 'total': self.job_count,
                 'percentage': self.replication_count * 100.0 / self.job_count,
                 'time': time.time() - self.start, 'rate': rate,
                 'remaining': '%d%s' % compute_eta(self.start,
                           self.replication_count, self.job_count)})
            if self.suffix_count:
                self.logger.info(_("%(checked)d suffixes checked - "
                    "%(hashed).2f%% hashed, %(synced).2f%% synced"),
                    {'checked': self.suffix_count,
                     'hashed': (self.suffix_hash * 100.0) / self.suffix_count,
                     'synced': (self.suffix_sync * 100.0) / self.suffix_count})
                self.partition_times.sort()
                self.logger.info(_("Partition times: max %(max).4fs, "
                    "min %(min).4fs, med %(med).4fs"),
                    {'max': self.partition_times[-1],
                     'min': self.partition_times[0],
                     'med': self.partition_times[
                                len(self.partition_times) // 2]})
        else:
            self.logger.info(_("Nothing replicated for %s seconds."),
                (time.time() - self.start))

    def kill_coros(self):
        """Utility function that kills all coroutines currently running."""
        for coro in list(self.run_pool.coroutines_running):
            try:
                coro.kill(GreenletExit)
            except GreenletExit:
                pass

    def heartbeat(self):
        """
        Loop that runs in the background during replication.  It periodically
        logs progress.
        """
        while True:
            eventlet.sleep(self.stats_interval)
            self.stats_line()

    def detect_lockups(self):
        """
        In testing, the pool.waitall() call very occasionally failed to return.
        This is an attempt to make sure the replicator finishes its replication
        pass in some eventuality.
        """
        while True:
            eventlet.sleep(self.lockup_timeout)
            if self.replication_count == self.last_replication_count:
                self.logger.error(_("Lockup detected.. killing live coros."))
                self.kill_coros()
            self.last_replication_count = self.replication_count

    def collect_jobs(self):
        """
        Returns a sorted list of jobs (dictionaries) that specify the
        partitions, nodes, etc to be rsynced.
        """
        jobs = []
        ips = whataremyips()
        for local_dev in [dev for dev in self.object_ring.devs
                if dev and dev['ip'] in ips and dev['port'] == self.port]:
            dev_path = join(self.devices_dir, local_dev['device'])
            obj_path = join(dev_path, 'objects')
            tmp_path = join(dev_path, 'tmp')
            if self.mount_check and not os.path.ismount(dev_path):
                self.logger.warn(_('%s is not mounted'), local_dev['device'])
                continue
            unlink_older_than(tmp_path, time.time() - self.reclaim_age)
            if not os.path.exists(obj_path):
                continue
            for partition in os.listdir(obj_path):
                try:
                    nodes = [node for node in
                        self.object_ring.get_part_nodes(int(partition))
                             if node['id'] != local_dev['id']]
                    jobs.append(dict(path=join(obj_path, partition),
                        nodes=nodes,
                        delete=len(nodes) > self.object_ring.replica_count - 1,
                        partition=partition))
                except ValueError:
                    continue
        random.shuffle(jobs)
        # Partititons that need to be deleted take priority
        jobs.sort(key=lambda job: not job['delete'])
        self.job_count = len(jobs)
        return jobs

    def replicate(self):
        """Run a replication pass"""
        self.start = time.time()
        self.suffix_count = 0
        self.suffix_sync = 0
        self.suffix_hash = 0
        self.replication_count = 0
        self.last_replication_count = -1
        self.partition_times = []
        stats = eventlet.spawn(self.heartbeat)
        lockup_detector = eventlet.spawn(self.detect_lockups)
        eventlet.sleep()  # Give spawns a cycle
        try:
            self.run_pool = GreenPool(size=self.concurrency)
            jobs = self.collect_jobs()
            for job in jobs:
                if not self.check_ring():
                    self.logger.info(_("Ring change detected. Aborting "
                            "current replication pass."))
                    return
                if job['delete']:
                    self.run_pool.spawn(self.update_deleted, job)
                else:
                    self.run_pool.spawn(self.update, job)
            with Timeout(self.lockup_timeout):
                self.run_pool.waitall()
        except (Exception, Timeout):
            self.logger.exception(_("Exception in top-level replication loop"))
            self.kill_coros()
        finally:
            stats.kill()
            lockup_detector.kill()
            self.stats_line()

    def run_once(self, *args, **kwargs):
        start = time.time()
        self.logger.info(_("Running object replicator in script mode."))
        self.replicate()
        total = (time.time() - start) / 60
        self.logger.info(
            _("Object replication complete. (%.02f minutes)"), total)
        if self.recon_enable:
            try:
                dump_recon_cache('object_replication_time', total, \
                    self.recon_object)
            except (Exception, Timeout):
                self.logger.exception(_('Exception dumping recon cache'))

    def run_forever(self, *args, **kwargs):
        self.logger.info(_("Starting object replicator in daemon mode."))
        # Run the replicator continually
        while True:
            start = time.time()
            self.logger.info(_("Starting object replication pass."))
            # Run the replicator
            self.replicate()
            total = (time.time() - start) / 60
            self.logger.info(
                _("Object replication complete. (%.02f minutes)"), total)
            if self.recon_enable:
                try:
                    dump_recon_cache('object_replication_time', total, \
                        self.recon_object)
                except (Exception, Timeout):
                    self.logger.exception(_('Exception dumping recon cache'))
            self.logger.debug(_('Replication sleeping for %s seconds.'),
                self.run_pause)
            sleep(self.run_pause)