def collect_jobs(self):
    """
    Returns a sorted list of jobs (dictionaries) that specify the
    partitions, nodes, etc to be rsynced.

    Each job dict carries: ``path`` (partition directory), ``nodes``
    (peer devices to sync to), ``delete`` (True when this is a handoff
    copy to remove after sync) and ``partition``.
    """
    jobs = []
    ips = whataremyips()
    # Only devices in the ring that live on this host/port are ours.
    for local_dev in [dev for dev in self.object_ring.devs
                      if dev and dev['ip'] in ips and
                      dev['port'] == self.port]:
        dev_path = join(self.devices_dir, local_dev['device'])
        obj_path = join(dev_path, 'objects')
        tmp_path = join(dev_path, 'tmp')
        if self.mount_check and not os.path.ismount(dev_path):
            self.logger.warn(_('%s is not mounted'), local_dev['device'])
            continue
        # Reap stale temp files left behind by earlier passes.
        unlink_older_than(tmp_path, time.time() - self.reclaim_age)
        if not os.path.exists(obj_path):
            continue
        for partition in os.listdir(obj_path):
            try:
                # All primaries for this partition except ourselves.
                nodes = [node for node in
                         self.object_ring.get_part_nodes(int(partition))
                         if node['id'] != local_dev['id']]
                jobs.append(dict(
                    path=join(obj_path, partition),
                    nodes=nodes,
                    # If we were excluded from no primary slot, this is
                    # a handoff partition and should be deleted later.
                    delete=len(nodes) > self.object_ring.replica_count - 1,
                    partition=partition))
            except ValueError:
                # Directory entry that is not a partition number.
                continue
    random.shuffle(jobs)
    # Partitions that need to be deleted take priority
    jobs.sort(key=lambda job: not job['delete'])
    self.job_count = len(jobs)
    return jobs
def run_once(self, *args, **kwargs):
    """Run a replication pass once.

    Collects the local datadirs for every ring device whose
    replication ip/port match this process, then replicates their
    contents on the coroutine pool and reports stats.
    """
    self._zero_stats()
    dirs = []
    ips = whataremyips()
    if not ips:
        self.logger.error(_('ERROR Failed to get my own IPs?'))
        return
    for node in self.ring.devs:
        # Select only devices served by this host/port.
        if (node and node['replication_ip'] in ips and
                node['replication_port'] == self.port):
            if self.mount_check and not os.path.ismount(
                    os.path.join(self.root, node['device'])):
                self.logger.warn(
                    _('Skipping %(device)s as it is not mounted') % node)
                continue
            # Reap stale temp files before scanning the datadir.
            unlink_older_than(
                os.path.join(self.root, node['device'], 'tmp'),
                time.time() - self.reclaim_age)
            datadir = os.path.join(self.root, node['device'], self.datadir)
            if os.path.isdir(datadir):
                dirs.append((datadir, node['id']))
    self.logger.info(_('Beginning replication run'))
    # Fan each datadir entry out onto the coroutine pool.
    for part, object_file, node_id in roundrobin_datadirs(dirs):
        self.cpool.spawn_n(self._replicate_object, part, object_file,
                           node_id)
    self.cpool.waitall()
    self.logger.info(_('Replication run OVER'))
    self._report_stats()
def run_once(self, *args, **kwargs):
    """Run a replication pass once.

    Variant keyed on the plain ``ip``/``port`` ring fields; datadir
    scheduling goes through ``self.roundrobin_datadirs``.
    """
    self._zero_stats()
    dirs = []
    ips = whataremyips()
    if not ips:
        self.logger.error(_('ERROR Failed to get my own IPs?'))
        return
    for node in self.ring.devs:
        # Select only devices served by this host/port.
        if node and node['ip'] in ips and node['port'] == self.port:
            if self.mount_check and not os.path.ismount(
                    os.path.join(self.root, node['device'])):
                self.logger.warn(
                    _('Skipping %(device)s as it is not mounted') % node)
                continue
            # Reap stale temp files before scanning the datadir.
            unlink_older_than(
                os.path.join(self.root, node['device'], 'tmp'),
                time.time() - self.reclaim_age)
            datadir = os.path.join(self.root, node['device'], self.datadir)
            if os.path.isdir(datadir):
                dirs.append((datadir, node['id']))
    self.logger.info(_('Beginning replication run'))
    # NOTE(review): here roundrobin_datadirs is a method on self,
    # unlike sibling variants that call a module-level helper — confirm
    # which binding this class actually defines.
    for part, object_file, node_id in self.roundrobin_datadirs(dirs):
        self.cpool.spawn_n(
            self._replicate_object, part, object_file, node_id)
    self.cpool.waitall()
    self.logger.info(_('Replication run OVER'))
    self._report_stats()
def collect_parts(self, override_devices=None, override_partitions=None):
    """
    Helper for yielding partitions in the top level reconstructor

    :param override_devices: optional list of device names; when
                             non-empty, only those devices are scanned
    :param override_partitions: optional list of partition numbers;
                                when non-empty, only those are yielded
    :returns: generator of dicts with keys ``local_dev``, ``policy``,
              ``partition`` and ``part_path``
    """
    override_devices = override_devices or []
    override_partitions = override_partitions or []
    ips = whataremyips()
    for policy in POLICIES:
        # The reconstructor only handles erasure-coded policies.
        if policy.policy_type != EC_POLICY:
            continue
        self._diskfile_mgr = self._df_router[policy]
        self.load_object_ring(policy)
        data_dir = get_data_dir(policy)
        local_devices = itertools.ifilter(
            lambda dev: dev and is_local_device(ips, self.port, dev[
                'replication_ip'], dev['replication_port']),
            policy.object_ring.devs)
        for local_dev in local_devices:
            if override_devices and (local_dev['device'] not in
                                     override_devices):
                continue
            dev_path = join(self.devices_dir, local_dev['device'])
            obj_path = join(dev_path, data_dir)
            tmp_path = join(dev_path, get_tmp_dir(int(policy)))
            if self.mount_check and not ismount(dev_path):
                self.logger.warn(_('%s is not mounted'),
                                 local_dev['device'])
                continue
            # Reap stale temp files left behind by earlier runs.
            unlink_older_than(tmp_path, time.time() - self.reclaim_age)
            if not os.path.exists(obj_path):
                try:
                    mkdirs(obj_path)
                except Exception:
                    self.logger.exception('Unable to create %s' % obj_path)
                continue
            try:
                partitions = os.listdir(obj_path)
            except OSError:
                self.logger.exception('Unable to list partitions in %r' %
                                      obj_path)
                continue
            for partition in partitions:
                part_path = join(obj_path, partition)
                # Anything that is not a numeric directory does not
                # belong here; log it and remove it.
                if not (partition.isdigit() and
                        os.path.isdir(part_path)):
                    self.logger.warning(
                        'Unexpected entity in data dir: %r' % part_path)
                    remove_file(part_path)
                    continue
                partition = int(partition)
                if override_partitions and (partition not in
                                            override_partitions):
                    continue
                part_info = {
                    'local_dev': local_dev,
                    'policy': policy,
                    'partition': partition,
                    'part_path': part_path,
                }
                yield part_info
def collect_jobs(self):
    """
    Returns a sorted list of jobs (dictionaries) that specify the
    partitions, nodes, etc to be synced.

    Jobs are shuffled, and when ``handoffs_first`` is set the handoff
    (``delete=True``) jobs are moved to the front.
    """
    jobs = []
    ips = whataremyips()
    # Only devices in the ring that live on this host/port are ours.
    for local_dev in [
        dev for dev in self.object_ring.devs
        if dev and dev['replication_ip'] in ips and
        dev['replication_port'] == self.port
    ]:
        dev_path = join(self.devices_dir, local_dev['device'])
        obj_path = join(dev_path, 'objects')
        tmp_path = join(dev_path, 'tmp')
        if self.mount_check and not ismount(dev_path):
            self.logger.warn(_('%s is not mounted'), local_dev['device'])
            continue
        # Reap stale temp files left behind by earlier passes.
        unlink_older_than(tmp_path, time.time() - self.reclaim_age)
        if not os.path.exists(obj_path):
            try:
                mkdirs(obj_path)
            except Exception:
                self.logger.exception('ERROR creating %s' % obj_path)
            continue
        for partition in os.listdir(obj_path):
            try:
                job_path = join(obj_path, partition)
                if isfile(job_path):
                    # Clean up any (probably zero-byte) files where a
                    # partition should be.
                    self.logger.warning(
                        'Removing partition directory '
                        'which was a file: %s', job_path)
                    os.remove(job_path)
                    continue
                part_nodes = \
                    self.object_ring.get_part_nodes(int(partition))
                # All primaries for this partition except ourselves.
                nodes = [
                    node for node in part_nodes
                    if node['id'] != local_dev['id']
                ]
                jobs.append(
                    dict(path=job_path,
                         device=local_dev['device'],
                         nodes=nodes,
                         # Not a primary for this partition => handoff.
                         delete=len(nodes) > len(part_nodes) - 1,
                         partition=partition))
            except (ValueError, OSError):
                # Non-numeric entry or a racing removal; skip it.
                continue
    random.shuffle(jobs)
    if self.handoffs_first:
        # Move the handoff parts to the front of the list
        jobs.sort(key=lambda job: not job['delete'])
    self.job_count = len(jobs)
    return jobs
def collect_parts(self, override_devices=None, override_partitions=None):
    """
    Helper for yielding partitions in the top level reconstructor

    :param override_devices: optional list of device names to restrict to
    :param override_partitions: optional list of partitions to restrict to
    :returns: generator of dicts with keys ``local_dev``, ``policy``,
              ``partition`` and ``part_path``
    """
    override_devices = override_devices or []
    override_partitions = override_partitions or []
    ips = whataremyips(self.bind_ip)
    for policy in POLICIES:
        # The reconstructor only handles erasure-coded policies.
        if policy.policy_type != EC_POLICY:
            continue
        self._diskfile_mgr = self._df_router[policy]
        self.load_object_ring(policy)
        data_dir = get_data_dir(policy)
        local_devices = itertools.ifilter(
            lambda dev: dev and is_local_device(ips, self.port, dev["replication_ip"], dev["replication_port"]),
            policy.object_ring.devs,
        )
        for local_dev in local_devices:
            if override_devices and (local_dev["device"] not in override_devices):
                continue
            # The diskfile manager performs the mount check and returns
            # a falsy value for an unusable device.
            dev_path = self._df_router[policy].get_dev_path(local_dev["device"])
            if not dev_path:
                self.logger.warn(_("%s is not mounted"), local_dev["device"])
                continue
            obj_path = join(dev_path, data_dir)
            tmp_path = join(dev_path, get_tmp_dir(int(policy)))
            # Reap stale temp files left behind by earlier runs.
            unlink_older_than(tmp_path, time.time() - self.reclaim_age)
            if not os.path.exists(obj_path):
                try:
                    mkdirs(obj_path)
                except Exception:
                    self.logger.exception("Unable to create %s" % obj_path)
                continue
            try:
                partitions = os.listdir(obj_path)
            except OSError:
                self.logger.exception("Unable to list partitions in %r" % obj_path)
                continue
            for partition in partitions:
                part_path = join(obj_path, partition)
                # Anything that is not a numeric directory does not
                # belong here; log it and remove it.
                if not (partition.isdigit() and os.path.isdir(part_path)):
                    self.logger.warning("Unexpected entity in data dir: %r" % part_path)
                    remove_file(part_path)
                    continue
                partition = int(partition)
                if override_partitions and (partition not in override_partitions):
                    continue
                part_info = {
                    "local_dev": local_dev,
                    "policy": policy,
                    "partition": partition,
                    "part_path": part_path,
                }
                yield part_info
def build_replication_jobs(self, policy, ips, override_devices=None,
                           override_partitions=None):
    """
    Helper function for collect_jobs to build jobs for replication
    using replication style storage policy

    :param policy: the storage policy whose ring is scanned
    :param ips: list of this host's IP addresses
    :param override_devices: optional device-name whitelist
    :param override_partitions: optional partition whitelist
    :returns: list of job dicts
    """
    jobs = []
    data_dir = get_data_dir(policy)
    # Only ring devices local to this host/port (optionally filtered
    # by override_devices) produce jobs.
    for local_dev in [
        dev for dev in policy.object_ring.devs
        if (dev and is_local_device(ips, self.port,
                                    dev['replication_ip'],
                                    dev['replication_port']) and
            (override_devices is None
             or dev['device'] in override_devices))
    ]:
        dev_path = join(self.devices_dir, local_dev['device'])
        obj_path = join(dev_path, data_dir)
        tmp_path = join(dev_path, get_tmp_dir(policy))
        if self.mount_check and not ismount(dev_path):
            self.logger.warn(_('%s is not mounted'), local_dev['device'])
            continue
        # Reap stale temp files left behind by earlier passes.
        unlink_older_than(tmp_path, time.time() - self.reclaim_age)
        if not os.path.exists(obj_path):
            try:
                mkdirs(obj_path)
            except Exception:
                self.logger.exception('ERROR creating %s' % obj_path)
            continue
        for partition in os.listdir(obj_path):
            if (override_partitions is not None
                    and partition not in override_partitions):
                continue
            try:
                job_path = join(obj_path, partition)
                part_nodes = policy.object_ring.get_part_nodes(
                    int(partition))
                # All primaries for this partition except ourselves.
                nodes = [
                    node for node in part_nodes
                    if node['id'] != local_dev['id']
                ]
                jobs.append(
                    dict(path=job_path,
                         device=local_dev['device'],
                         obj_path=obj_path,
                         nodes=nodes,
                         # Not a primary for this partition => handoff.
                         delete=len(nodes) > len(part_nodes) - 1,
                         policy=policy,
                         partition=partition,
                         region=local_dev['region']))
            except ValueError:
                # Non-numeric directory entry; skip it.
                continue
    return jobs
def collect_jobs(self):
    """
    Returns a sorted list of jobs (dictionaries) that specify the
    partitions, nodes, etc to be synced.

    LightSync variant: instead of syncing to every peer, the primary
    node list is rotated so replication starts with this device's
    successor.
    """
    jobs = []
    ips = whataremyips()
    # Only devices in the ring that live on this host/port are ours.
    for local_dev in [dev for dev in self.object_ring.devs
                      if dev and dev['replication_ip'] in ips and
                      dev['replication_port'] == self.port]:
        dev_path = join(self.devices_dir, local_dev['device'])
        obj_path = join(dev_path, 'objects')
        tmp_path = join(dev_path, 'tmp')
        if self.mount_check and not ismount(dev_path):
            self.logger.warn(_('%s is not mounted'), local_dev['device'])
            continue
        # Reap stale temp files left behind by earlier passes.
        unlink_older_than(tmp_path, time.time() - self.reclaim_age)
        if not os.path.exists(obj_path):
            try:
                mkdirs(obj_path)
            except Exception:
                self.logger.exception('ERROR creating %s' % obj_path)
            continue
        for partition in os.listdir(obj_path):
            try:
                job_path = join(obj_path, partition)
                if isfile(job_path):
                    # Clean up any (probably zero-byte) files where a
                    # partition should be.
                    self.logger.warning(
                        'Removing partition directory '
                        'which was a file: %s', job_path)
                    os.remove(job_path)
                    continue
                part_nodes = \
                    self.object_ring.get_part_nodes(int(partition))
                # MODIFIED LightSync: find our slot among the primaries.
                mypos = next(
                    (pos for pos, node in enumerate(part_nodes)
                     if node['id'] == local_dev['id']), None)
                if mypos is None:
                    # Handoff partition: this device is not a primary,
                    # so every primary is a sync target.  (The previous
                    # for/break scan fell through with mypos pointing
                    # at the last primary, silently dropping one node
                    # and making the 'delete' flag below come out
                    # False for handoffs.)
                    nodes = list(part_nodes)
                else:
                    # Rotate so replication starts at our successor.
                    nodes = part_nodes[mypos + 1:] + part_nodes[:mypos]
                jobs.append(
                    dict(path=job_path,
                         device=local_dev['device'],
                         nodes=nodes,
                         # Not a primary for this partition => handoff.
                         delete=len(nodes) > len(part_nodes) - 1,
                         partition=partition))
            except (ValueError, OSError):
                # Non-numeric entry or a racing removal; skip it.
                continue
    random.shuffle(jobs)
    if self.handoffs_first:
        # Move the handoff parts to the front of the list
        jobs.sort(key=lambda job: not job['delete'])
    self.job_count = len(jobs)
    return jobs
def run_once(self, *args, **kwargs):
    """Run a replication pass once.

    Supports ``handoffs_only`` mode, in which only handoff partitions
    are replicated; warnings are logged at the start and end of such a
    pass because the mode is not meant for normal operation.
    """
    self._zero_stats()
    dirs = []
    ips = whataremyips(self.bind_ip)
    if not ips:
        self.logger.error(_('ERROR Failed to get my own IPs?'))
        return
    if self.handoffs_only:
        self.logger.warning(
            'Starting replication pass with handoffs_only enabled. '
            'This mode is not intended for normal '
            'operation; use handoffs_only with care.')
    self._local_device_ids = set()
    found_local = False
    for node in self.ring.devs:
        if node and is_local_device(ips, self.port,
                                    node['replication_ip'],
                                    node['replication_port']):
            found_local = True
            if not check_drive(self.root, node['device'],
                               self.mount_check):
                # Count every ring device as a failure for this pass
                # since nothing on this drive can be replicated.
                self._add_failure_stats([
                    (failure_dev['replication_ip'],
                     failure_dev['device'])
                    for failure_dev in self.ring.devs if failure_dev
                ])
                self.logger.warning(
                    _('Skipping %(device)s as it is not mounted') % node)
                continue
            # Reap stale temp files before scanning the datadir.
            unlink_older_than(
                os.path.join(self.root, node['device'], 'tmp'),
                time.time() - self.reclaim_age)
            datadir = os.path.join(self.root, node['device'], self.datadir)
            if os.path.isdir(datadir):
                self._local_device_ids.add(node['id'])
                # In handoffs_only mode, filter the datadir walk down
                # to handoff partitions for this node.
                filt = (self.handoffs_only_filter(node['id'])
                        if self.handoffs_only else None)
                dirs.append((datadir, node['id'], filt))
    if not found_local:
        self.logger.error(
            "Can't find itself %s with port %s in ring "
            "file, not replicating", ", ".join(ips), self.port)
    self.logger.info(_('Beginning replication run'))
    for part, object_file, node_id in roundrobin_datadirs(dirs):
        self.cpool.spawn_n(self._replicate_object, part, object_file,
                           node_id)
    self.cpool.waitall()
    self.logger.info(_('Replication run OVER'))
    if self.handoffs_only:
        self.logger.warning(
            'Finished replication pass with handoffs_only enabled. '
            'If handoffs_only is no longer required, disable it.')
    self._report_stats()
def collect_jobs(self):
    """
    Returns a sorted list of jobs (dictionaries) that specify the
    partitions, nodes, etc to be synced.

    Jobs are shuffled, and when ``handoffs_first`` is set the handoff
    (``delete=True``) jobs are moved to the front.
    """
    jobs = []
    ips = whataremyips()
    # Only devices in the ring that live on this host/port are ours.
    for local_dev in [
        dev
        for dev in self.object_ring.devs
        if dev and dev["replication_ip"] in ips and dev["replication_port"] == self.port
    ]:
        dev_path = join(self.devices_dir, local_dev["device"])
        obj_path = join(dev_path, "objects")
        tmp_path = join(dev_path, "tmp")
        if self.mount_check and not os.path.ismount(dev_path):
            self.logger.warn(_("%s is not mounted"), local_dev["device"])
            continue
        # Reap stale temp files left behind by earlier passes.
        unlink_older_than(tmp_path, time.time() - self.reclaim_age)
        if not os.path.exists(obj_path):
            try:
                mkdirs(obj_path)
            except Exception:
                self.logger.exception("ERROR creating %s" % obj_path)
            continue
        for partition in os.listdir(obj_path):
            try:
                job_path = join(obj_path, partition)
                if isfile(job_path):
                    # Clean up any (probably zero-byte) files where a
                    # partition should be.
                    self.logger.warning("Removing partition directory "
                                        "which was a file: %s", job_path)
                    os.remove(job_path)
                    continue
                part_nodes = self.object_ring.get_part_nodes(int(partition))
                # All primaries for this partition except ourselves.
                nodes = [node for node in part_nodes if node["id"] != local_dev["id"]]
                jobs.append(
                    dict(
                        path=job_path,
                        device=local_dev["device"],
                        nodes=nodes,
                        # Not a primary for this partition => handoff.
                        delete=len(nodes) > len(part_nodes) - 1,
                        partition=partition,
                    )
                )
            except (ValueError, OSError):
                # Non-numeric entry or a racing removal; skip it.
                continue
    random.shuffle(jobs)
    if self.handoffs_first:
        # Move the handoff parts to the front of the list
        jobs.sort(key=lambda job: not job["delete"])
    self.job_count = len(jobs)
    return jobs
def process_repl(self, policy, ips, override_devices=None,
                 override_partitions=None):
    """
    Helper function for collect_jobs to build jobs for replication
    using replication style storage policy

    :param policy: storage policy whose object ring is scanned
    :param ips: list of this host's IP addresses
    :param override_devices: optional device-name whitelist
    :param override_partitions: optional partition whitelist
    :returns: list of job dicts
    """
    jobs = []
    obj_ring = self.get_object_ring(policy.idx)
    data_dir = get_data_dir(policy.idx)
    # Only ring devices local to this host/port (optionally filtered
    # by override_devices) produce jobs.
    for local_dev in [dev for dev in obj_ring.devs
                      if (dev
                          and is_local_device(ips,
                                              self.port,
                                              dev['replication_ip'],
                                              dev['replication_port'])
                          and (override_devices is None
                               or dev['device'] in override_devices))]:
        dev_path = join(self.devices_dir, local_dev['device'])
        obj_path = join(dev_path, data_dir)
        tmp_path = join(dev_path, get_tmp_dir(int(policy)))
        if self.mount_check and not ismount(dev_path):
            self.logger.warn(_('%s is not mounted'), local_dev['device'])
            continue
        # Reap stale temp files left behind by earlier passes.
        unlink_older_than(tmp_path, time.time() - self.reclaim_age)
        if not os.path.exists(obj_path):
            try:
                mkdirs(obj_path)
            except Exception:
                self.logger.exception('ERROR creating %s' % obj_path)
            continue
        for partition in os.listdir(obj_path):
            if (override_partitions is not None
                    and partition not in override_partitions):
                continue
            try:
                job_path = join(obj_path, partition)
                part_nodes = obj_ring.get_part_nodes(int(partition))
                # All primaries for this partition except ourselves.
                nodes = [node for node in part_nodes
                         if node['id'] != local_dev['id']]
                jobs.append(
                    dict(path=job_path,
                         device=local_dev['device'],
                         obj_path=obj_path,
                         nodes=nodes,
                         # Not a primary for this partition => handoff.
                         delete=len(nodes) > len(part_nodes) - 1,
                         policy_idx=policy.idx,
                         partition=partition,
                         object_ring=obj_ring,
                         region=local_dev['region']))
            except ValueError:
                # Non-numeric directory entry; skip it.
                continue
    return jobs
def process_repl(self, policy, jobs, ips):
    """
    Helper function for collect_jobs to build jobs for replication
    using replication style storage policy

    Appends job dicts to the caller-supplied ``jobs`` list in place;
    returns nothing.
    """
    obj_ring = self.get_object_ring(policy.idx)
    data_dir = get_data_dir(policy.idx)
    # Only devices in the ring that live on this host/port are ours.
    for local_dev in [
        dev for dev in obj_ring.devs
        if dev and dev['replication_ip'] in ips
        and dev['replication_port'] == self.port
    ]:
        dev_path = join(self.devices_dir, local_dev['device'])
        obj_path = join(dev_path, data_dir)
        # NOTE(review): this uses the plain 'tmp' dir while sibling
        # variants use get_tmp_dir(policy) — confirm this is intended
        # for policy-aware layouts.
        tmp_path = join(dev_path, 'tmp')
        if self.mount_check and not ismount(dev_path):
            self.logger.warn(_('%s is not mounted'), local_dev['device'])
            continue
        # Reap stale temp files left behind by earlier passes.
        unlink_older_than(tmp_path, time.time() - self.reclaim_age)
        if not os.path.exists(obj_path):
            try:
                mkdirs(obj_path)
            except Exception:
                self.logger.exception('ERROR creating %s' % obj_path)
            continue
        for partition in os.listdir(obj_path):
            try:
                job_path = join(obj_path, partition)
                if isfile(job_path):
                    # Clean up any (probably zero-byte) files where a
                    # partition should be.
                    self.logger.warning(
                        'Removing partition directory '
                        'which was a file: %s', job_path)
                    os.remove(job_path)
                    continue
                part_nodes = obj_ring.get_part_nodes(int(partition))
                # All primaries for this partition except ourselves.
                nodes = [
                    node for node in part_nodes
                    if node['id'] != local_dev['id']
                ]
                jobs.append(
                    dict(path=job_path,
                         device=local_dev['device'],
                         nodes=nodes,
                         # Not a primary for this partition => handoff.
                         delete=len(nodes) > len(part_nodes) - 1,
                         policy_idx=policy.idx,
                         partition=partition,
                         object_ring=obj_ring))
            except (ValueError, OSError):
                # Non-numeric entry or a racing removal; skip it.
                continue
def collect_jobs(self):
    """
    Assemble the list of replication jobs for every local device.

    Each job is a dict describing one partition directory: its path,
    the owning device, the peer primaries to rsync to, whether the
    local copy is a handoff to delete after sync, and the partition
    name.  The result is shuffled before being returned.
    """
    jobs = []
    my_ips = whataremyips()
    for candidate in self.object_ring.devs:
        # Guard clauses: skip holes in the ring and devices that are
        # not served by this host and port.
        if not candidate:
            continue
        if candidate['ip'] not in my_ips:
            continue
        if candidate['port'] != self.port:
            continue
        device_root = join(self.devices_dir, candidate['device'])
        partitions_root = join(device_root, 'objects')
        scratch_dir = join(device_root, 'tmp')
        if self.mount_check and not os.path.ismount(device_root):
            self.logger.warn(_('%s is not mounted'), candidate['device'])
            continue
        # Reap stale temp files left over from earlier passes.
        unlink_older_than(scratch_dir, time.time() - self.reclaim_age)
        if not os.path.exists(partitions_root):
            try:
                mkdirs(partitions_root)
            except Exception:
                self.logger.exception('ERROR creating %s' %
                                      partitions_root)
            continue
        for part_name in os.listdir(partitions_root):
            try:
                part_dir = join(partitions_root, part_name)
                if isfile(part_dir):
                    # Clean up any (probably zero-byte) files where a
                    # partition should be.
                    self.logger.warning('Removing partition directory '
                                        'which was a file: %s', part_dir)
                    os.remove(part_dir)
                    continue
                ring_nodes = self.object_ring.get_part_nodes(
                    int(part_name))
                peers = [n for n in ring_nodes
                         if n['id'] != candidate['id']]
                jobs.append(dict(
                    path=part_dir,
                    device=candidate['device'],
                    nodes=peers,
                    # No primary slot excluded us => handoff copy.
                    delete=len(peers) > len(ring_nodes) - 1,
                    partition=part_name))
            except (ValueError, OSError):
                # Non-numeric entry or a racing removal; skip it.
                continue
    random.shuffle(jobs)
    self.job_count = len(jobs)
    return jobs
def process_repl(self, policy, jobs, ips):
    """
    Helper function for collect_jobs to build jobs for replication
    using replication style storage policy

    Appends job dicts to the caller-supplied ``jobs`` list in place;
    returns nothing.
    """
    obj_ring = self.get_object_ring(policy.idx)
    data_dir = get_data_dir(policy.idx)
    # Only devices in the ring that live on this host/port are ours.
    for local_dev in [dev for dev in obj_ring.devs
                      if dev and dev['replication_ip'] in ips and
                      dev['replication_port'] == self.port]:
        dev_path = join(self.devices_dir, local_dev['device'])
        obj_path = join(dev_path, data_dir)
        tmp_path = join(dev_path, get_tmp_dir(int(policy)))
        if self.mount_check and not ismount(dev_path):
            self.logger.warn(_('%s is not mounted'), local_dev['device'])
            continue
        # Reap stale temp files left behind by earlier passes.
        unlink_older_than(tmp_path, time.time() - self.reclaim_age)
        if not os.path.exists(obj_path):
            try:
                mkdirs(obj_path)
            except Exception:
                self.logger.exception('ERROR creating %s' % obj_path)
            continue
        for partition in os.listdir(obj_path):
            try:
                job_path = join(obj_path, partition)
                if isfile(job_path):
                    # Clean up any (probably zero-byte) files where a
                    # partition should be.
                    self.logger.warning(
                        'Removing partition directory '
                        'which was a file: %s', job_path)
                    os.remove(job_path)
                    continue
                part_nodes = obj_ring.get_part_nodes(int(partition))
                # All primaries for this partition except ourselves.
                nodes = [node for node in part_nodes
                         if node['id'] != local_dev['id']]
                jobs.append(
                    dict(path=job_path,
                         device=local_dev['device'],
                         nodes=nodes,
                         # Not a primary for this partition => handoff.
                         delete=len(nodes) > len(part_nodes) - 1,
                         policy_idx=policy.idx,
                         partition=partition,
                         object_ring=obj_ring))
            except (ValueError, OSError):
                # Non-numeric entry or a racing removal; skip it.
                continue
def process_repl(self, policy, jobs, ips):
    """
    Helper function for collect_jobs to build jobs for replication
    using replication style storage policy

    Appends job dicts to the caller-supplied ``jobs`` list in place;
    returns nothing.
    """
    obj_ring = self.get_object_ring(policy.idx)
    data_dir = get_data_dir(policy.idx)
    # Only devices in the ring that live on this host/port are ours.
    for local_dev in [
        dev
        for dev in obj_ring.devs
        if dev and dev["replication_ip"] in ips and dev["replication_port"] == self.port
    ]:
        dev_path = join(self.devices_dir, local_dev["device"])
        obj_path = join(dev_path, data_dir)
        tmp_path = join(dev_path, get_tmp_dir(int(policy)))
        if self.mount_check and not ismount(dev_path):
            self.logger.warn(_("%s is not mounted"), local_dev["device"])
            continue
        # Reap stale temp files left behind by earlier passes.
        unlink_older_than(tmp_path, time.time() - self.reclaim_age)
        if not os.path.exists(obj_path):
            try:
                mkdirs(obj_path)
            except Exception:
                self.logger.exception("ERROR creating %s" % obj_path)
            continue
        for partition in os.listdir(obj_path):
            try:
                job_path = join(obj_path, partition)
                part_nodes = obj_ring.get_part_nodes(int(partition))
                # All primaries for this partition except ourselves.
                nodes = [node for node in part_nodes if node["id"] != local_dev["id"]]
                jobs.append(
                    dict(
                        path=job_path,
                        device=local_dev["device"],
                        nodes=nodes,
                        # Not a primary for this partition => handoff.
                        delete=len(nodes) > len(part_nodes) - 1,
                        policy_idx=policy.idx,
                        partition=partition,
                        object_ring=obj_ring,
                    )
                )
            except ValueError:
                # Non-numeric directory entry; skip it.
                continue
def run_once(self, *args, **kwargs):
    """Run a replication pass once.

    Records the ids of local devices in ``self._local_device_ids`` and
    logs an error if none of the ring devices map to this host.
    """
    self._zero_stats()
    dirs = []
    ips = whataremyips(self.bind_ip)
    if not ips:
        self.logger.error(_('ERROR Failed to get my own IPs?'))
        return
    self._local_device_ids = set()
    found_local = False
    # Iterate over every device in the ring, selecting local ones.
    for node in self.ring.devs:
        if node and is_local_device(ips, self.port,
                                    node['replication_ip'],
                                    node['replication_port']):
            found_local = True
            if self.mount_check and not ismount(
                    os.path.join(self.root, node['device'])):
                # Count every ring device as a failure for this pass
                # since nothing on this drive can be replicated.
                self._add_failure_stats(
                    [(failure_dev['replication_ip'],
                      failure_dev['device'])
                     for failure_dev in self.ring.devs if failure_dev])
                self.logger.warning(
                    _('Skipping %(device)s as it is not mounted') % node)
                continue
            # Reap stale temp files before scanning the datadir.
            unlink_older_than(
                os.path.join(self.root, node['device'], 'tmp'),
                time.time() - self.reclaim_age)
            datadir = os.path.join(self.root, node['device'], self.datadir)
            if os.path.isdir(datadir):
                self._local_device_ids.add(node['id'])
                dirs.append((datadir, node['id']))
    if not found_local:
        self.logger.error("Can't find itself %s with port %s in ring "
                          "file, not replicating",
                          ", ".join(ips), self.port)
    self.logger.info(_('Beginning replication run'))
    for part, object_file, node_id in roundrobin_datadirs(dirs):
        self.cpool.spawn_n(
            self._replicate_object, part, object_file, node_id)
    self.cpool.waitall()
    self.logger.info(_('Replication run OVER'))
    self._report_stats()
def run_once(self, *args, **kwargs):
    """Run a replication pass once.

    Records the ids of local devices in ``self._local_device_ids`` and
    logs an error if none of the ring devices map to this host.
    """
    self._zero_stats()
    dirs = []
    ips = whataremyips(self.bind_ip)
    if not ips:
        self.logger.error(_('ERROR Failed to get my own IPs?'))
        return
    self._local_device_ids = set()
    found_local = False
    for node in self.ring.devs:
        # Select only devices served by this host/port.
        if node and is_local_device(ips, self.port,
                                    node['replication_ip'],
                                    node['replication_port']):
            found_local = True
            if self.mount_check and not ismount(
                    os.path.join(self.root, node['device'])):
                # Count every ring device as a failure for this pass
                # since nothing on this drive can be replicated.
                self._add_failure_stats(
                    [(failure_dev['replication_ip'],
                      failure_dev['device'])
                     for failure_dev in self.ring.devs if failure_dev])
                self.logger.warning(
                    _('Skipping %(device)s as it is not mounted') % node)
                continue
            # Reap stale temp files before scanning the datadir.
            unlink_older_than(
                os.path.join(self.root, node['device'], 'tmp'),
                time.time() - self.reclaim_age)
            datadir = os.path.join(self.root, node['device'], self.datadir)
            if os.path.isdir(datadir):
                self._local_device_ids.add(node['id'])
                dirs.append((datadir, node['id']))
    if not found_local:
        self.logger.error("Can't find itself %s with port %s in ring "
                          "file, not replicating",
                          ", ".join(ips), self.port)
    self.logger.info(_('Beginning replication run'))
    for part, object_file, node_id in roundrobin_datadirs(dirs):
        self.cpool.spawn_n(
            self._replicate_object, part, object_file, node_id)
    self.cpool.waitall()
    self.logger.info(_('Replication run OVER'))
    self._report_stats()
def build_replication_jobs(self, policy, ips, override_devices=None, override_partitions=None):
    """
    Helper function for collect_jobs to build jobs for replication
    using replication style storage policy

    :param policy: storage policy whose object ring is scanned
    :param ips: list of this host's IP addresses
    :param override_devices: optional device-name whitelist
    :param override_partitions: optional partition whitelist
    :returns: list of job dicts
    """
    jobs = []
    # Remember every (ip, device) pair in the ring for later
    # failure accounting.
    self.all_devs_info.update([(dev["replication_ip"], dev["device"]) for dev in policy.object_ring.devs if dev])
    data_dir = get_data_dir(policy)
    found_local = False
    for local_dev in [
        dev
        for dev in policy.object_ring.devs
        if (
            dev
            and is_local_device(ips, self.port, dev["replication_ip"], dev["replication_port"])
            and (override_devices is None or dev["device"] in override_devices)
        )
    ]:
        found_local = True
        dev_path = join(self.devices_dir, local_dev["device"])
        obj_path = join(dev_path, data_dir)
        tmp_path = join(dev_path, get_tmp_dir(policy))
        if self.mount_check and not ismount(dev_path):
            # Count every ring device as a failure since nothing on
            # this drive can be replicated.
            self._add_failure_stats(
                [
                    (failure_dev["replication_ip"], failure_dev["device"])
                    for failure_dev in policy.object_ring.devs
                    if failure_dev
                ]
            )
            self.logger.warning(_("%s is not mounted"), local_dev["device"])
            continue
        # Reap stale temp files left behind by earlier passes.
        unlink_older_than(tmp_path, time.time() - self.reclaim_age)
        if not os.path.exists(obj_path):
            try:
                mkdirs(obj_path)
            except Exception:
                self.logger.exception("ERROR creating %s" % obj_path)
            continue
        for partition in os.listdir(obj_path):
            if override_partitions is not None and partition not in override_partitions:
                continue
            part_nodes = None
            try:
                job_path = join(obj_path, partition)
                part_nodes = policy.object_ring.get_part_nodes(int(partition))
                # All primaries for this partition except ourselves.
                nodes = [node for node in part_nodes if node["id"] != local_dev["id"]]
                jobs.append(
                    dict(
                        path=job_path,
                        device=local_dev["device"],
                        obj_path=obj_path,
                        nodes=nodes,
                        # Not a primary for this partition => handoff.
                        delete=len(nodes) > len(part_nodes) - 1,
                        policy=policy,
                        partition=partition,
                        region=local_dev["region"],
                    )
                )
            except ValueError:
                # int(partition) failed (part_nodes still None) or the
                # ring lookup raised; record failures accordingly.
                if part_nodes:
                    self._add_failure_stats(
                        [(failure_dev["replication_ip"], failure_dev["device"]) for failure_dev in nodes]
                    )
                else:
                    self._add_failure_stats(
                        [
                            (failure_dev["replication_ip"], failure_dev["device"])
                            for failure_dev in policy.object_ring.devs
                            if failure_dev
                        ]
                    )
                continue
    if not found_local:
        self.logger.error(
            "Can't find itself %s with port %s in ring " "file, not replicating", ", ".join(ips), self.port
        )
    return jobs
def build_replication_jobs(self, policy, ips, override_devices=None,
                           override_partitions=None):
    """
    Helper function for collect_jobs to build jobs for replication
    using replication style storage policy

    :param policy: storage policy whose object ring is scanned
    :param ips: list of this host's IP addresses
    :param override_devices: optional device-name whitelist
    :param override_partitions: optional partition whitelist
    :returns: list of job dicts
    """
    jobs = []
    # Remember every (ip, device) pair in the ring for later
    # failure accounting.
    self.all_devs_info.update([(dev['replication_ip'], dev['device'])
                               for dev in policy.object_ring.devs
                               if dev])
    data_dir = get_data_dir(policy)
    found_local = False
    for local_dev in [
        dev for dev in policy.object_ring.devs
        if (dev and is_local_device(ips, self.port,
                                    dev['replication_ip'],
                                    dev['replication_port'])
            and (override_devices is None
                 or dev['device'] in override_devices))
    ]:
        found_local = True
        dev_path = join(self.devices_dir, local_dev['device'])
        obj_path = join(dev_path, data_dir)
        tmp_path = join(dev_path, get_tmp_dir(policy))
        if self.mount_check and not ismount(dev_path):
            # Count every ring device as a failure since nothing on
            # this drive can be replicated.
            self._add_failure_stats([
                (failure_dev['replication_ip'], failure_dev['device'])
                for failure_dev in policy.object_ring.devs
                if failure_dev
            ])
            self.logger.warning(_('%s is not mounted'),
                                local_dev['device'])
            continue
        # Reap stale temp files left behind by earlier passes.
        unlink_older_than(tmp_path, time.time() - self.reclaim_age)
        if not os.path.exists(obj_path):
            try:
                mkdirs(obj_path)
            except Exception:
                self.logger.exception('ERROR creating %s' % obj_path)
            continue
        for partition in os.listdir(obj_path):
            if (override_partitions is not None
                    and partition not in override_partitions):
                continue
            if (partition.startswith('auditor_status_')
                    and partition.endswith('.json')):
                # ignore auditor status files
                continue
            part_nodes = None
            try:
                job_path = join(obj_path, partition)
                part_nodes = policy.object_ring.get_part_nodes(
                    int(partition))
                # All primaries for this partition except ourselves.
                nodes = [
                    node for node in part_nodes
                    if node['id'] != local_dev['id']
                ]
                jobs.append(
                    dict(path=job_path,
                         device=local_dev['device'],
                         obj_path=obj_path,
                         nodes=nodes,
                         # Not a primary for this partition => handoff.
                         delete=len(nodes) > len(part_nodes) - 1,
                         policy=policy,
                         partition=partition,
                         region=local_dev['region']))
            except ValueError:
                # int(partition) failed (part_nodes still None) or the
                # ring lookup raised; record failures accordingly.
                if part_nodes:
                    self._add_failure_stats([
                        (failure_dev['replication_ip'],
                         failure_dev['device'])
                        for failure_dev in nodes
                    ])
                else:
                    self._add_failure_stats([
                        (failure_dev['replication_ip'],
                         failure_dev['device'])
                        for failure_dev in policy.object_ring.devs
                        if failure_dev
                    ])
                continue
    if not found_local:
        self.logger.error(
            "Can't find itself in policy with index %d with"
            " ips %s and with port %s in ring file, not"
            " replicating", int(policy), ", ".join(ips), self.port)
    return jobs
def build_replication_jobs(self, policy, ips, override_devices=None,
                           override_partitions=None):
    """
    Helper function for collect_jobs to build jobs for replication
    using replication style storage policy

    :param policy: storage policy whose object ring is scanned
    :param ips: list of this host's IP addresses
    :param override_devices: optional device-name whitelist
    :param override_partitions: optional partition whitelist
    :returns: list of job dicts
    """
    jobs = []
    df_mgr = self._df_router[policy]
    # Remember every (ip, device) pair in the ring for later
    # failure accounting.
    self.all_devs_info.update(
        [(dev['replication_ip'], dev['device'])
         for dev in policy.object_ring.devs if dev])
    data_dir = get_data_dir(policy)
    found_local = False
    for local_dev in [dev for dev in policy.object_ring.devs
                      if (dev
                          and is_local_device(ips,
                                              self.port,
                                              dev['replication_ip'],
                                              dev['replication_port'])
                          and (override_devices is None
                               or dev['device'] in override_devices))]:
        found_local = True
        # check_drive performs the mount check; a falsy result here
        # means the device is unusable.
        dev_path = check_drive(self.devices_dir, local_dev['device'],
                               self.mount_check)
        if not dev_path:
            # Count every ring device as a failure since nothing on
            # this drive can be replicated.
            self._add_failure_stats(
                [(failure_dev['replication_ip'],
                  failure_dev['device'])
                 for failure_dev in policy.object_ring.devs
                 if failure_dev])
            self.logger.warning(
                _('%s is not mounted'), local_dev['device'])
            continue
        obj_path = join(dev_path, data_dir)
        tmp_path = join(dev_path, get_tmp_dir(policy))
        # Reap stale temp files using the diskfile manager's
        # reclaim_age for this policy.
        unlink_older_than(tmp_path, time.time() - df_mgr.reclaim_age)
        if not os.path.exists(obj_path):
            try:
                mkdirs(obj_path)
            except Exception:
                self.logger.exception('ERROR creating %s' % obj_path)
            continue
        for partition in os.listdir(obj_path):
            if (override_partitions is not None
                    and partition not in override_partitions):
                continue
            if (partition.startswith('auditor_status_')
                    and partition.endswith('.json')):
                # ignore auditor status files
                continue
            part_nodes = None
            try:
                job_path = join(obj_path, partition)
                part_nodes = policy.object_ring.get_part_nodes(
                    int(partition))
                # All primaries for this partition except ourselves.
                nodes = [node for node in part_nodes
                         if node['id'] != local_dev['id']]
                jobs.append(
                    dict(path=job_path,
                         device=local_dev['device'],
                         obj_path=obj_path,
                         nodes=nodes,
                         # Not a primary for this partition => handoff.
                         delete=len(nodes) > len(part_nodes) - 1,
                         policy=policy,
                         partition=partition,
                         region=local_dev['region']))
            except ValueError:
                # int(partition) failed (part_nodes still None) or the
                # ring lookup raised; record failures accordingly.
                if part_nodes:
                    self._add_failure_stats(
                        [(failure_dev['replication_ip'],
                          failure_dev['device'])
                         for failure_dev in nodes])
                else:
                    self._add_failure_stats(
                        [(failure_dev['replication_ip'],
                          failure_dev['device'])
                         for failure_dev in policy.object_ring.devs
                         if failure_dev])
                continue
    if not found_local:
        self.logger.error("Can't find itself in policy with index %d with"
                          " ips %s and with port %s in ring file, not"
                          " replicating",
                          int(policy), ", ".join(ips), self.port)
    return jobs
def run_once(self, *args, **kwargs):
    """
    Run a replication pass once.

    Honors device/partition overrides supplied via ``kwargs`` (parsed
    by ``parse_override_options``) and skips unmounted or excluded
    devices, then round-robins the eligible datadirs through the
    coroutine pool.
    """
    override_options = parse_override_options(once=True, **kwargs)
    # Empty overrides mean "everything": Everything() matches any member.
    devices_to_replicate = override_options.devices or Everything()
    partitions_to_replicate = override_options.partitions or Everything()
    self._zero_stats()
    dirs = []
    ips = whataremyips(self.bind_ip)
    if not ips:
        self.logger.error(_('ERROR Failed to get my own IPs?'))
        return
    if self.handoffs_only:
        self.logger.warning(
            'Starting replication pass with handoffs_only enabled. '
            'This mode is not intended for normal '
            'operation; use handoffs_only with care.')
    self._local_device_ids = set()
    found_local = False
    for node in self.ring.devs:
        if node and is_local_device(ips, self.port,
                                    node['replication_ip'],
                                    node['replication_port']):
            found_local = True
            try:
                # check_drive raises ValueError when the drive is
                # missing or fails the mount check.
                dev_path = check_drive(self.root, node['device'],
                                       self.mount_check)
            except ValueError as err:
                self._add_failure_stats(
                    [(failure_dev['replication_ip'],
                      failure_dev['device'])
                     for failure_dev in self.ring.devs if failure_dev])
                self.logger.warning('Skipping: %s', err)
                continue
            if node['device'] not in devices_to_replicate:
                self.logger.debug(
                    'Skipping device %s due to given arguments',
                    node['device'])
                continue
            # Reap stale temp files left behind by interrupted runs.
            unlink_older_than(
                os.path.join(dev_path, 'tmp'),
                time.time() - self.reclaim_age)
            datadir = os.path.join(self.root, node['device'], self.datadir)
            if os.path.isdir(datadir):
                self._local_device_ids.add(node['id'])
                part_filt = self._partition_dir_filter(
                    node['id'], partitions_to_replicate)
                dirs.append((datadir, node['id'], part_filt))
    if not found_local:
        self.logger.error("Can't find itself %s with port %s in ring "
                          "file, not replicating",
                          ", ".join(ips), self.port)
    self.logger.info(_('Beginning replication run'))
    for part, object_file, node_id in self.roundrobin_datadirs(dirs):
        self.cpool.spawn_n(
            self._replicate_object, part, object_file, node_id)
    self.cpool.waitall()
    self.logger.info(_('Replication run OVER'))
    if self.handoffs_only:
        self.logger.warning(
            'Finished replication pass with handoffs_only enabled. '
            'If handoffs_only is no longer required, disable it.')
    self._report_stats()
def collect_parts(self, override_devices=None, override_partitions=None):
    """
    Helper for getting partitions in the top level reconstructor

    :param override_devices: if given, only include these devices
    :param override_partitions: if given, only include these partitions
    :returns: a shuffled list of part_info dicts with keys
        ``local_dev``, ``policy``, ``partition`` and ``part_path``
    """
    override_devices = override_devices or []
    override_partitions = override_partitions or []
    ips = whataremyips(self.bind_ip)
    # Only EC policies are handled by the reconstructor.
    ec_policies = (policy for policy in POLICIES
                   if policy.policy_type == EC_POLICY)
    policy2devices = {}
    for policy in ec_policies:
        self.load_object_ring(policy)
        local_devices = list(
            six.moves.filter(
                lambda dev: dev and is_local_device(
                    ips, self.port, dev['replication_ip'],
                    dev['replication_port']),
                policy.object_ring.devs))
        if override_devices:
            local_devices = list(
                six.moves.filter(
                    lambda dev_info: dev_info['device'] in
                    override_devices,
                    local_devices))
        policy2devices[policy] = local_devices
        self.device_count += len(local_devices)
    all_parts = []
    for policy, local_devices in policy2devices.items():
        df_mgr = self._df_router[policy]
        for local_dev in local_devices:
            dev_path = df_mgr.get_dev_path(local_dev['device'])
            if not dev_path:
                self.logger.warning(_('%s is not mounted'),
                                    local_dev['device'])
                continue
            data_dir = get_data_dir(policy)
            obj_path = join(dev_path, data_dir)
            tmp_path = join(dev_path, get_tmp_dir(int(policy)))
            # Reap stale temp files left over from interrupted work.
            unlink_older_than(tmp_path, time.time() - df_mgr.reclaim_age)
            if not os.path.exists(obj_path):
                try:
                    mkdirs(obj_path)
                except Exception:
                    self.logger.exception('Unable to create %s' % obj_path)
                continue
            try:
                partitions = os.listdir(obj_path)
            except OSError:
                self.logger.exception('Unable to list partitions in %r' %
                                      obj_path)
                continue
            self.part_count += len(partitions)
            for partition in partitions:
                part_path = join(obj_path, partition)
                if partition in ('auditor_status_ALL.json',
                                 'auditor_status_ZBF.json'):
                    # auditor bookkeeping files live alongside the
                    # partition dirs; leave them alone
                    continue
                if not partition.isdigit():
                    # Anything non-numeric is not a valid partition;
                    # queue it for deletion and count it as handled.
                    self.logger.warning(
                        'Unexpected entity in data dir: %r' % part_path)
                    self.delete_partition(part_path)
                    self.reconstruction_part_count += 1
                    continue
                partition = int(partition)
                if override_partitions and (partition not in
                                            override_partitions):
                    continue
                part_info = {
                    'local_dev': local_dev,
                    'policy': policy,
                    'partition': partition,
                    'part_path': part_path,
                }
                all_parts.append(part_info)
    # Shuffle so concurrent reconstructors don't walk parts in the
    # same order.
    random.shuffle(all_parts)
    return all_parts
def collect_parts(self, override_devices=None, override_partitions=None):
    """
    Helper for yielding partitions in the top level reconstructor

    :param override_devices: if given, only include these devices
    :param override_partitions: if given, only include these partitions
    :yields: part_info dicts with keys ``local_dev``, ``policy``,
        ``partition`` and ``part_path``
    """
    override_devices = override_devices or []
    override_partitions = override_partitions or []
    ips = whataremyips(self.bind_ip)
    for policy in POLICIES:
        # Only EC policies are handled by the reconstructor.
        if policy.policy_type != EC_POLICY:
            continue
        self._diskfile_mgr = self._df_router[policy]
        self.load_object_ring(policy)
        data_dir = get_data_dir(policy)
        local_devices = list(six.moves.filter(
            lambda dev: dev and is_local_device(
                ips, self.port,
                dev['replication_ip'], dev['replication_port']),
            policy.object_ring.devs))
        if override_devices:
            self.device_count = len(override_devices)
        else:
            self.device_count = len(local_devices)
        for local_dev in local_devices:
            if override_devices and (local_dev['device'] not in
                                     override_devices):
                continue
            self.reconstruction_device_count += 1
            dev_path = self._df_router[policy].get_dev_path(
                local_dev['device'])
            if not dev_path:
                self.logger.warning(_('%s is not mounted'),
                                    local_dev['device'])
                continue
            obj_path = join(dev_path, data_dir)
            tmp_path = join(dev_path, get_tmp_dir(int(policy)))
            # Reap stale temp files left over from interrupted work.
            unlink_older_than(tmp_path, time.time() - self.reclaim_age)
            if not os.path.exists(obj_path):
                try:
                    mkdirs(obj_path)
                except Exception:
                    self.logger.exception(
                        'Unable to create %s' % obj_path)
                continue
            try:
                partitions = os.listdir(obj_path)
            except OSError:
                self.logger.exception(
                    'Unable to list partitions in %r' % obj_path)
                continue
            self.part_count += len(partitions)
            for partition in partitions:
                part_path = join(obj_path, partition)
                if not (partition.isdigit() and
                        os.path.isdir(part_path)):
                    # Anything that isn't a numeric directory is not a
                    # valid partition; remove it and count it handled.
                    self.logger.warning(
                        'Unexpected entity in data dir: %r' % part_path)
                    remove_file(part_path)
                    self.reconstruction_part_count += 1
                    continue
                partition = int(partition)
                if override_partitions and (partition not in
                                            override_partitions):
                    continue
                part_info = {
                    'local_dev': local_dev,
                    'policy': policy,
                    'partition': partition,
                    'part_path': part_path,
                }
                yield part_info
                # Count the part only after the consumer resumes us,
                # i.e. once the yielded part has actually been taken.
                self.reconstruction_part_count += 1
def build_replication_jobs(self, policy, ips, old_dict, new_dict, moving_map): """ Helper function for collect_jobs to build jobs for replication using replication style storage policy :param policy: swift policy object :param ips: the local server ips :param old_dict: dictionary with devices from old ring :param new_dict: dictionary with devices from new ring :param moving_map: the dictionary that contains all the partitions that should be moved, their sources and destinations """ jobs = [] data_dir = get_data_dir(policy) devices = Set(map(lambda x: x[1], moving_map.values())) partitions = Set(map(lambda x: x[0], moving_map.values())) for local_dev in [dev for dev in policy.object_ring.devs if (dev and is_local_device(ips, self.port, dev['replication_ip'], dev['replication_port']) )]: if self.test: print local_dev['id'] if unicode(local_dev['id']) not in devices: continue dev_path = join(self.devices_dir, local_dev['device']) obj_path = join(dev_path, data_dir) tmp_path = join(dev_path, get_tmp_dir(policy)) if self.mount_check and not ismount(dev_path): self.logger.warn('%s is not mounted' % local_dev['device']) continue unlink_older_than(tmp_path, time.time() - self.reclaim_age) for partition in os.listdir(obj_path): partition = unicode(partition) if (partition not in partitions): continue try: key = "%s_%s" % (local_dev['id'], partition) if key not in moving_map: continue job_path = join(obj_path, partition) _, source_id, dest_id = moving_map[key] if source_id != unicode(local_dev['id']): continue node = {} replication_ip, replication_device = new_dict[dest_id] node['replication_ip'] = replication_ip node['device'] = replication_device remote_path = os.path.join(self.devices_dir, node['device'], self.mover_tmp_dir) jobs.append( dict(path=job_path, device=local_dev['device'], obj_path=obj_path, node=node, policy=policy, partition=partition, remote_path=remote_path)) except ValueError: continue except Exception as e: self.logger.exception( "an %s exception accure 
at build_replication_jobs" % e) if self.test: print e return jobs
def build_replication_jobs(self, policy, ips, old_dict, new_dict, moving_map): """ Helper function for collect_jobs to build jobs for replication using replication style storage policy :param policy: swift policy object :param ips: the local server ips :param old_dict: dictionary with devices from old ring :param new_dict: dictionary with devices from new ring :param moving_map: the dictionary that contains all the partitions that should be moved, their sources and destinations """ jobs = [] data_dir = get_data_dir(policy) devices = Set(map(lambda x: x[1], moving_map.values())) partitions = Set(map(lambda x: x[0], moving_map.values())) for local_dev in [ dev for dev in policy.object_ring.devs if (dev and is_local_device(ips, self.port, dev['replication_ip'], dev['replication_port'])) ]: if self.test: print local_dev['id'] if unicode(local_dev['id']) not in devices: continue dev_path = join(self.devices_dir, local_dev['device']) obj_path = join(dev_path, data_dir) tmp_path = join(dev_path, get_tmp_dir(policy)) if self.mount_check and not ismount(dev_path): self.logger.warn('%s is not mounted' % local_dev['device']) continue unlink_older_than(tmp_path, time.time() - self.reclaim_age) for partition in os.listdir(obj_path): partition = unicode(partition) if (partition not in partitions): continue try: key = "%s_%s" % (local_dev['id'], partition) if key not in moving_map: continue job_path = join(obj_path, partition) _, source_id, dest_id = moving_map[key] if source_id != unicode(local_dev['id']): continue node = {} replication_ip, replication_device = new_dict[dest_id] node['replication_ip'] = replication_ip node['device'] = replication_device remote_path = os.path.join(self.devices_dir, node['device'], self.mover_tmp_dir) jobs.append( dict(path=job_path, device=local_dev['device'], obj_path=obj_path, node=node, policy=policy, partition=partition, remote_path=remote_path)) except ValueError: continue except Exception as e: self.logger.exception( "an %s exception accure 
at build_replication_jobs" % e) if self.test: print e return jobs
def process_repl(self, policy, ips, override_devices=None, override_partitions=None): """ Helper function for collect_jobs to build jobs for replication using replication style storage policy """ jobs = [] obj_ring = self.get_object_ring(policy.idx) data_dir = get_data_dir(policy.idx) for local_dev in [dev for dev in obj_ring.devs if (dev and is_local_device(ips, self.port, dev['replication_ip'], dev['replication_port']) and (override_devices is None or dev['device'] in override_devices))]: dev_path = join(self.devices_dir, local_dev['device']) obj_path = join(dev_path, data_dir) tmp_path = join(dev_path, get_tmp_dir(int(policy))) if self.mount_check and not ismount(dev_path): self.logger.warn(_('%s is not mounted'), local_dev['device']) continue unlink_older_than(tmp_path, time.time() - self.reclaim_age) if not os.path.exists(obj_path): try: mkdirs(obj_path) except Exception: self.logger.exception('ERROR creating %s' % obj_path) continue for partition in os.listdir(obj_path): if (override_partitions is not None and partition not in override_partitions): continue try: job_path = join(obj_path, partition) part_nodes = obj_ring.get_part_nodes(int(partition)) ###################################### CHANGED_CODE ######################################################## f = open("/home/hduser/swift/swift/proxy/controllers/spindowndevices") downlist = f.read().split("\n") f.close() nodes = [node for node in part_nodes if node['id'] != local_dev['id'] and node['device'] not in downlist] print("===Replication nodes===",nodes) ###################################### CHANGED_CODE ######################################################## jobs.append( dict(path=job_path, device=local_dev['device'], obj_path=obj_path, nodes=nodes, delete=len(nodes) > len(part_nodes) - 1, policy_idx=policy.idx, partition=partition, object_ring=obj_ring, region=local_dev['region'])) except ValueError: continue return jobs
def collect_parts(self, override_devices=None, override_partitions=None):
    """
    Helper for getting partitions in the top level reconstructor

    In handoffs_only mode, primary partitions will not be included in
    the returned (possibly empty) list.

    :param override_devices: if given, only include these devices
    :param override_partitions: if given, only include these partitions
    :returns: a shuffled list of part_info dicts with keys
        ``local_dev``, ``policy``, ``partition`` and ``part_path``
    """
    override_devices = override_devices or []
    override_partitions = override_partitions or []
    ips = whataremyips(self.bind_ip)
    # Only EC policies are handled by the reconstructor.
    ec_policies = (policy for policy in POLICIES
                   if policy.policy_type == EC_POLICY)
    policy2devices = {}
    for policy in ec_policies:
        self.load_object_ring(policy)
        local_devices = list(
            six.moves.filter(
                lambda dev: dev and is_local_device(
                    ips, self.port, dev['replication_ip'],
                    dev['replication_port']),
                policy.object_ring.devs))
        if override_devices:
            local_devices = list(
                six.moves.filter(
                    lambda dev_info: dev_info['device'] in
                    override_devices,
                    local_devices))
        policy2devices[policy] = local_devices
        self.device_count += len(local_devices)
    all_parts = []
    for policy, local_devices in policy2devices.items():
        # Skip replication if next_part_power is set. In this case
        # every object is hard-linked twice, but the replicator
        # can't detect them and would create a second copy of the
        # file if not yet existing - and this might double the
        # actual transferred and stored data
        next_part_power = getattr(
            policy.object_ring, 'next_part_power', None)
        if next_part_power is not None:
            self.logger.warning(
                _("next_part_power set in policy '%s'. Skipping"),
                policy.name)
            continue
        df_mgr = self._df_router[policy]
        for local_dev in local_devices:
            dev_path = df_mgr.get_dev_path(local_dev['device'])
            if not dev_path:
                self.logger.warning(_('%s is not mounted'),
                                    local_dev['device'])
                continue
            data_dir = get_data_dir(policy)
            obj_path = join(dev_path, data_dir)
            tmp_path = join(dev_path, get_tmp_dir(int(policy)))
            # Reap stale temp files left over from interrupted work.
            unlink_older_than(tmp_path, time.time() - df_mgr.reclaim_age)
            if not os.path.exists(obj_path):
                try:
                    mkdirs(obj_path)
                except Exception:
                    self.logger.exception('Unable to create %s' % obj_path)
                continue
            try:
                partitions = os.listdir(obj_path)
            except OSError:
                self.logger.exception('Unable to list partitions in %r' %
                                      obj_path)
                continue
            self.part_count += len(partitions)
            for partition in partitions:
                part_path = join(obj_path, partition)
                if partition in ('auditor_status_ALL.json',
                                 'auditor_status_ZBF.json'):
                    # auditor bookkeeping files live alongside the
                    # partition dirs; leave them alone
                    continue
                if not partition.isdigit():
                    # Anything non-numeric is not a valid partition;
                    # queue it for deletion and count it as handled.
                    self.logger.warning(
                        'Unexpected entity in data dir: %r' % part_path)
                    self.delete_partition(part_path)
                    self.reconstruction_part_count += 1
                    continue
                partition = int(partition)
                if override_partitions and (partition not in
                                            override_partitions):
                    continue
                # N.B. At a primary node in handoffs_only mode may skip to
                # sync misplaced (handoff) fragments in the primary
                # partition. That may happen while rebalancing several
                # times. (e.g. a node holding handoff fragment being a new
                # primary) Those fragments will be synced (and revert) once
                # handoffs_only mode turned off.
                if self.handoffs_only and any(
                        local_dev['id'] == n['id']
                        for n in policy.object_ring.get_part_nodes(
                            partition)):
                    self.logger.debug(
                        'Skipping %s job for %s '
                        'while in handoffs_only mode.',
                        SYNC, part_path)
                    continue
                part_info = {
                    'local_dev': local_dev,
                    'policy': policy,
                    'partition': partition,
                    'part_path': part_path,
                }
                all_parts.append(part_info)
    # Shuffle so concurrent reconstructors don't walk parts in the
    # same order.
    random.shuffle(all_parts)
    return all_parts
def collect_jobs(self):
    """
    Returns a sorted list of jobs (dictionaries) that specify the
    partitions, nodes, etc to be synced.
    """
    jobs = []
    ips = whataremyips()
    for local_dev in [dev for dev in self.object_ring.devs
                      if dev and dev['replication_ip'] in ips and
                      dev['replication_port'] == self.port]:
        dev_path = join(self.devices_dir, local_dev['device'])
        obj_path = join(dev_path, 'objects')
        tmp_path = join(dev_path, 'tmp')
        if self.mount_check and not ismount(dev_path):
            self.logger.warn(_('%s is not mounted'), local_dev['device'])
            continue
        # Reap stale temp files left over from interrupted rsyncs.
        unlink_older_than(tmp_path, time.time() - self.reclaim_age)
        if not os.path.exists(obj_path):
            try:
                mkdirs(obj_path)
            except Exception:
                self.logger.exception('ERROR creating %s' % obj_path)
            continue
        for partition in os.listdir(obj_path):
            try:
                job_path = join(obj_path, partition)
                if isfile(job_path):
                    # Clean up any (probably zero-byte) files where a
                    # partition should be.
                    self.logger.warning('Removing partition directory '
                                        'which was a file: %s', job_path)
                    os.remove(job_path)
                    continue
                part_nodes = \
                    self.object_ring.get_part_nodes(int(partition))
                # Sync targets: every other node holding this partition.
                # (Removed a large block of commented-out spin-down
                # experiment code; use self.logger instead of the root
                # logger, at DEBUG since this fires per partition.)
                nodes = [node for node in part_nodes
                         if node['id'] != local_dev['id']]
                self.logger.debug('Replication nodes: %s', nodes)
                jobs.append(
                    dict(path=job_path,
                         device=local_dev['device'],
                         nodes=nodes,
                         # True iff this device is not a primary for the
                         # partition, i.e. a handoff whose data can be
                         # deleted after a successful sync.
                         delete=len(nodes) > len(part_nodes) - 1,
                         partition=partition))
            except (ValueError, OSError):
                continue
    # Shuffle so concurrent replicators don't walk parts in the same
    # order, then optionally front-load handoff (delete) jobs.
    random.shuffle(jobs)
    if self.handoffs_first:
        # Move the handoff parts to the front of the list
        jobs.sort(key=lambda job: not job['delete'])
    self.job_count = len(jobs)
    return jobs