def get_file_locations(self, complete=1, sites=None, cloud=None):
    '''Helper function to resolve replica locations of files of the given datasets.

    NB: by default, it checks only sites with complete datasets.
    '''

    profiler = ElapsedTimeProfiler(logger=logger)

    # get the dataset locations
    ds_sites = self.get_locations(complete=complete, overlap=False)

    logger.info('resolving dataset file locations')
    profiler.start()

    replicas = {}
    for ds in ds_sites.keys():
        logger.debug('dataset: %s' % ds)
        logger.debug('sites: %s' % repr(ds_sites[ds]))
        # map each file (guid) in the dataset to the sites holding a replica
        replicas.update(resolve_file_locations(ds, sites=ds_sites[ds]))

    profiler.check('%d datasets %d files' % (len(ds_sites.keys()), len(replicas.keys())))

    return replicas
def get_datasets(self, name, filter=True):
    '''Get dataset names; the name may be given as a wildcard pattern.'''

    profiler = ElapsedTimeProfiler(logger=logger)
    profiler.start()

    datasets = listDatasets(name, filter)

    profiler.check('listing %d datasets/containers' % len(datasets))

    return datasets
def __resolve_containers(self, containers, nthreads=10):
    '''Resolve the datasets within the given dataset containers.'''

    datasets = {}

    # fill a work queue with the containers to be resolved
    wq = Queue(len(containers))
    for ds in containers:
        wq.put(ds)

    mylock = Lock()

    def worker(id):
        dq2 = DQ2()
        while not wq.empty():
            try:
                ds = wq.get(block=True, timeout=1)
                logger.debug('worker id: %d on dataset container: %s' % (id, ds))

                datasets[ds] = []
                ds_tmp = dq2.listDatasetsInContainer(ds)

                mylock.acquire()
                datasets[ds] = ds_tmp
                mylock.release()
            except DQException as err:
                logger.warning(str(err))
            except Empty:
                pass

    profiler = ElapsedTimeProfiler(logger=logger)
    profiler.start()

    threads = []
    for i in range(nthreads):
        t = GangaThread(name='stager_ds_w_%d' % i, target=worker, kwargs={'id': i})
        threads.append(t)

    for t in threads:
        t.start()

    for t in threads:
        t.join()

    profiler.check('resolving %d containers' % len(containers))

    return datasets
def get_complete_files_replicas(self, nthread=10, diskOnly=True):
    '''Gets comprehensive dataset information about the contents and the
    locations of COMPLETE replicas.'''

    if not self.complete_files_replicas:

        re_tapeSite = re.compile('.*TAPE$')

        ds_info = {}
        self.__expand_datasets()

        # fill a work queue with the datasets to be queried
        wq = Queue(len(self.dataset))
        for ds in self.dataset:
            wq.put(ds)

        mylock = Lock()

        def worker(id):
            dq2 = DQ2()
            while not wq.empty():
                try:
                    ds = wq.get(block=True, timeout=1)
                    logger.debug('worker id: %d on dataset: %s' % (id, ds))

                    # get contents (guids) of the complete dataset
                    contents = dq2.listFilesInDataset(ds)

                    # get locations of the complete dataset replicas
                    locations = dq2.listDatasetReplicas(ds, complete=1)

                    # take the first (and only) vuid key, if any
                    vuid = None
                    try:
                        vuid = list(locations.keys())[0]
                    except IndexError:
                        pass

                    mylock.acquire()

                    # update the ds_info hash table
                    if vuid:
                        ds_info[ds] = []
                        ds_sites = []

                        if diskOnly:
                            # skip tape sites when only disk replicas are wanted
                            for site in locations[vuid][1]:
                                if not re_tapeSite.match(site):
                                    ds_sites.append(site)
                        else:
                            ds_sites = locations[vuid][1]

                        ds_info[ds] += [contents[0], ds_sites]
                    else:
                        logger.warning('dataset not available: %s' % ds)

                    mylock.release()
                except DQException as err:
                    logger.warning(str(err))
                except Empty:
                    pass

        # prepare and run the query threads
        profiler = ElapsedTimeProfiler(logger=logger)
        profiler.start()

        threads = []
        for i in range(nthread):
            t = GangaThread(name='stager_ds_w_%d' % i, target=worker, kwargs={'id': i})
            threads.append(t)

        for t in threads:
            t.start()

        for t in threads:
            t.join()

        self.complete_files_replicas = ds_info

        profiler.check('information collected: %d datasets' % len(self.complete_files_replicas.keys()))
    else:
        logger.debug('using cached complete_files_replicas')

    return self.complete_files_replicas
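# ---------------------------------------------------------------------------
# Usage sketch (illustration only, not part of the original module): the
# methods above are meant to be called on an instance of the enclosing
# dataset/stager class, which is not shown in this section. Assuming such an
# instance named `stager`, a typical call sequence would be:
#
#     datasets = stager.get_datasets('user.*.myanalysis.*')  # wildcard lookup via listDatasets()
#     replicas = stager.get_file_locations(complete=1)       # file guid -> list of sites
#     ds_info  = stager.get_complete_files_replicas(nthread=10, diskOnly=True)
#
# `stager` and the wildcard pattern are hypothetical placeholders; the real
# class name and its construction are outside this section.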