def getDataset(self, **kwargs):
    '''Get all metadata of all datasets. Public method, not exported to GPI.'''

    db_view_column = ['dataset_id', 'creation_date', 'occupancy']
    sql = 'SELECT * FROM dataset_union WHERE true'
    kwargs['owner'] = kwargs.get('owner', ['official', utils.getOwner()])

    # add filter to query
    if len(kwargs) > 0:
        for key, value in kwargs.iteritems():
            if key in db_view_column:
                sql += " AND %s ILIKE '%s%%'" % (key, value)
            elif key == 'files':
                sql += " AND files > %s" % value
            elif key in ['status', 'session', 'owner']:
                if not isinstance(value, list):
                    value = [value]
                sql += " AND (false"
                for s in value:
                    sql += " OR %s ILIKE '%s%%'" % (key, s)
                sql += ")"
            else:
                sql += " AND parameters->'%s' ILIKE '%s%%'" % (key, value)

    # clean up the query
    sql = sql.replace('false OR ', '')
    sql = sql.replace('true AND ', '')

    # TODO: add control to prevent sql injection
    datasets = db.read(sql)

    if len(datasets) == 0:
        raise GangaException('No dataset found')

    i = 0
    for dataset in datasets:
        dataset['id'] = i
        i += 1

        dataset['occupancy_human'] = utils.sizeof_fmt_binary(dataset['occupancy'])

        if 'evt_file' in dataset['parameters'] and 'evt_tot' not in dataset['parameters']:
            evt_file = int(dataset['parameters']['evt_file'])
            if dataset['files'] is None:
                dataset['files'] = 0
            files = int(dataset['files'])
            dataset['parameters']['evt_tot'] = evt_file * files

        if 'evt_tot' in dataset['parameters']:
            dataset['parameters']['evt_tot_human'] = utils.sizeof_fmt_decimal(
                int(dataset['parameters']['evt_tot']))

    return datasets
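# A minimal sketch of the parameterised query hinted at by the TODO above:
# filter values are passed to the driver instead of being interpolated into
# the SQL string, so they cannot break out of their quotes. It assumes that
# db.read(sql, params) forwards the tuple to a DB-API cursor, as the
# dataset_id query in downloadDataset below suggests; the helper name and its
# exact shape are illustrative, not part of the existing API.
def _buildDatasetQuery(filters, db_view_column):
    '''Return (sql, params) selecting from dataset_union with the given filters.'''
    clauses = []
    params = []
    for key, value in filters.iteritems():
        if key in db_view_column:
            # column names are whitelisted above, only values are parameterised
            clauses.append(key + " ILIKE %s")
            params.append(value + '%')
        elif key == 'files':
            clauses.append("files > %s")
            params.append(value)
        elif key in ['status', 'session', 'owner']:
            values = value if isinstance(value, list) else [value]
            clauses.append('(' + ' OR '.join([key + " ILIKE %s"] * len(values)) + ')')
            params.extend([v + '%' for v in values])
        else:
            # hstore lookup: both the key and the pattern are parameters
            clauses.append("parameters -> %s ILIKE %s")
            params.extend([key, value + '%'])
    sql = 'SELECT * FROM dataset_union'
    if clauses:
        sql += ' WHERE ' + ' AND '.join(clauses)
    return sql, tuple(params)

# usage sketch: datasets = db.read(*_buildDatasetQuery(kwargs, db_view_column))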
def getDataset(self, **kwargs):
    '''Interactive method. It prints the datasets (the user can apply
    filters), the user chooses one of them and inserts the number of
    LFNs he wants.'''

    manager = SBDatasetManager.SBDatasetManager()

    def validateFilter(filter, allowed):
        kwargs[filter] = kwargs.get(filter, allowed)
        if not isinstance(kwargs[filter], list):
            kwargs[filter] = [kwargs[filter]]
        if not set(kwargs[filter]).issubset(set(allowed)):
            raise GangaException('%s must be %s' % (filter, allowed))

    validateFilter('status', ['open', 'closed'])
    validateFilter('session', ['analysis'])
    kwargs['files'] = 0

    datasets = manager.getDataset(**kwargs)
    dataset = manager.printDatasets(datasets)

    self.dataset_id = dataset['dataset_id']

    print('\nChosen dataset details:')
    manager.printDatasetDetail(dataset)

    print('\nInsert the minimum number of files that you need for your analysis (zero for all):')
    self.files_total = utils.getIndex(maxInclusive=int(dataset['files']))

    lfns = self.__getLFNs()

    tot_size = 0
    tot_files = len(lfns)

    for lfn in lfns:
        tot_size += int(lfn['size'])

    print('\nTotal job input size: ' + str(utils.sizeof_fmt_binary(tot_size)))
    print('Total number of involved lfns: ' + str(tot_files))

    print('\nInsert the maximum number of files for each subjob. Remember:')
    print('- maximum output size is 2GiB.')
    print('- suggested maximum job duration 18h.')
    print('- maximum job input size is 10GiB.')
    self.files_per_subjobs = utils.getIndex(minInclusive=1, maxInclusive=tot_files)

    job = self.__createInputPath(lfns)

    print('\nSubjobs details:')
    column_names = ['id', 'list_path', 'size', 'lfns']
    print(utils.format_dict_table(job, column_names))
def getDataset(self, **kwargs):
    '''Interactive method. It prints the datasets (the user can apply
    filters), the user chooses one of them and inserts the number of
    events he wants.'''

    manager = SBDatasetManager.SBDatasetManager()

    def validateFilter(filter, allowed):
        kwargs[filter] = kwargs.get(filter, allowed)
        if not isinstance(kwargs[filter], list):
            kwargs[filter] = [kwargs[filter]]
        if not set(kwargs[filter]).issubset(set(allowed)):
            raise GangaException('%s must be %s' % (filter, allowed))

    validateFilter('status', ['open', 'closed'])
    validateFilter('session', ['fastsim', 'fullsim'])

    datasets = manager.getDataset(**kwargs)
    dataset = manager.printDatasets(datasets)

    self.dataset_id = dataset['dataset_id']

    print('\nChosen dataset details:')
    manager.printDatasetDetail(dataset)

    print('\nInsert the minimum number of events that you need for your analysis (zero for all):')
    self.events_total = utils.getIndex(maxInclusive=int(dataset['parameters']['evt_tot']))

    lfns = self.__getLFNs(dataset['parameters']['evt_file'])

    tot_size = 0
    tot_files = len(lfns)
    tot_events = int(dataset['parameters']['evt_file']) * tot_files

    for lfn in lfns:
        tot_size += int(lfn['size'])

    print('\nTotal job input size: ' + str(utils.sizeof_fmt_binary(tot_size)))
    print('Total selected number of events: ' + str(utils.sizeof_fmt_decimal(tot_events)))
    print('Total number of involved lfns: ' + str(tot_files))

    print('\nInsert the maximum number of events for each subjob. Remember:')
    print('- maximum output size is 2GiB.')
    print('- suggested maximum job duration 18h.')
    print('- maximum job input size is 10GiB.')
    print('- at least %s (the number of events in one file).' % dataset['parameters']['evt_file'])
    self.events_per_subjobs = utils.getIndex(minInclusive=int(dataset['parameters']['evt_file']),
                                             maxInclusive=tot_events)

    job = self.__createInputPath(lfns, dataset['parameters']['evt_file'])

    print('\nSubjobs details:')
    column_names = ['id', 'list_path', 'size', 'events', 'lfns']
    print(utils.format_dict_table(job, column_names))
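# Illustrative arithmetic only (hypothetical helper, not called anywhere):
# since every LFN of these productions carries evt_file events, the events
# cap chosen above implies a cap on the number of whole files packed into
# each subjob, which is what the events-based splitter presumably works with.
def _filesPerSubjob(events_per_subjobs, evt_file):
    '''Events-per-subjob limit expressed as whole input files.'''
    return max(1, int(events_per_subjobs) // int(evt_file))

# e.g. 250000 events per subjob with 1000 events per file -> 250 files per subjob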
def downloadDataset(self, **kwargs):
    '''Retrieve all the files belonging to an owned dataset from the GRID
    to the submission machine.'''

    # TODO: create SURL file lists beside the LFN list to permit an lcg-cp
    # failover chain implementation and to permit direct plugin subjob
    # configuration from a user-given list

    kwargs['owner'] = utils.getOwner()
    kwargs['files'] = 0

    datasets = self.getDataset(**kwargs)
    dataset = self.printDatasets(datasets)

    dataset_id = dataset['dataset_id']
    files = dataset['files']
    occupancy_human = dataset['occupancy_human']

    home = os.path.expanduser('~')
    s = os.statvfs(home)
    free_disk = utils.sizeof_fmt_binary(s.f_bsize * s.f_bavail)
    #print('\nFree disk space: %s' % free_disk)

    print('\nTotal download size: %s\n' % occupancy_human)

    sql = 'SELECT lfn FROM analysis_output WHERE dataset_id = %s'
    lfns = db.read(sql, (r'\x' + dataset_id, ))

    localdir = os.path.join(home, dataset_id)
    os.mkdir(localdir)

    print('Downloading to %s ...' % localdir)

    i = 1
    for lfn in lfns:
        source = lfn['lfn']
        destination = os.path.join(localdir, source.split('/')[-1])

        process = subprocess.Popen(['lcg-cp', source, destination],
                                   stdout=subprocess.PIPE,
                                   close_fds=True)
        outData, errData = process.communicate()
        retCode = process.poll()

        if retCode != 0:
            raise Exception('lcg-cp failed with return code %d' % retCode)

        sys.stdout.write('\b' * 80 + '%s/%s' % (str(i), str(files)))
        sys.stdout.flush()
        i += 1
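# A minimal sketch of the failover chain mentioned in the TODO above: given
# the replica SURLs of one file (which the TODO proposes to store beside the
# LFN list), try lcg-cp on each replica in turn and stop at the first success.
# The surls argument and the helper name are illustrative assumptions.
def _downloadWithFailover(surls, destination):
    '''Copy the first reachable replica to destination; return the SURL used.'''
    for surl in surls:
        process = subprocess.Popen(['lcg-cp', surl, destination],
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE,
                                   close_fds=True)
        process.communicate()
        if process.poll() == 0:
            return surl
    raise Exception('lcg-cp failed for all %d replicas' % len(surls))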
def __createInputPath(self, lfns):
    '''This method splits the list of LFNs between subjobs and writes
    a text file for each one.'''

    # split all lfns between subjobs
    job = list()
    job.append(list())
    size = 0
    files = 0
    maxInput = 10 * (2**30)  # 10GiB
    minInput = 2 * (2**30)  # 2GiB

    # fill the subjobs as long as there are LFNs,
    # to determine the number of subjobs required
    for lfn in lfns:
        if (size + int(lfn['size'])) < maxInput and (files + 1) <= self.files_per_subjobs:
            size += int(lfn['size'])
            files += 1
        else:
            job.append(list())
            size = int(lfn['size'])
            files = 1

        job[-1].append(lfn)

    self.number_of_subjobs = len(job)

    # level the number of LFNs between the subjobs
    tot_files = len(lfns)
    balanced_number_lfn_per_subjob = int(
        math.ceil(float(tot_files) / self.number_of_subjobs))

    job = list()
    self.input_path = list()
    max_size = 0
    jobInputDir = self.getJobObject().inputdir
    lfns_index = 0

    for subjob_id in xrange(self.number_of_subjobs):
        subjob = dict()
        size = 0
        events = 0
        number_lfns = 0

        subjob['id'] = str(subjob_id)
        subjob['list_path'] = os.path.join(jobInputDir, "list_%d.txt" % subjob_id)

        f = open(subjob['list_path'], 'w')
        try:
            for lfn in lfns[lfns_index:lfns_index + balanced_number_lfn_per_subjob]:
                f.write(lfn['lfn'] + '\n')
                size += int(lfn['size'])
                number_lfns += 1
        finally:
            f.close()

        lfns_index += balanced_number_lfn_per_subjob

        self.input_path.append(File(f.name))
        subjob['size'] = utils.sizeof_fmt_binary(size)
        subjob['lfns'] = number_lfns
        job.append(subjob)

        if size > max_size:
            max_size = size

    if max_size < minInput:
        logger.warning('The input of these subjobs is very small; to improve'
                       ' efficiency you could increase the number of files per subjob.')

    return job
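# A tiny, self-contained illustration of the levelling performed above
# (hypothetical helper with toy numbers, assuming the 10GiB size cap is not
# the binding limit): the first pass only determines how many subjobs the
# per-subjob file cap allows, the second pass then spreads the LFNs evenly
# so the last subjob is not left almost empty.
def _levelled_split(n_lfns, files_per_subjob):
    '''Return the number of LFNs each subjob receives after levelling.'''
    n_subjobs = (n_lfns + files_per_subjob - 1) // files_per_subjob  # first pass
    per_subjob = (n_lfns + n_subjobs - 1) // n_subjobs               # levelled quota
    counts = []
    remaining = n_lfns
    while remaining > 0:
        counts.append(min(per_subjob, remaining))
        remaining -= per_subjob
    return counts

# _levelled_split(9, 8)  -> [5, 4]     instead of the naive [8, 1]
# _levelled_split(10, 4) -> [4, 4, 2]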