Example #1
    def getDataset(self, **kwargs):
        '''Get all metadata of all datasets.
        Public method, not exported to GPI.'''

        db_view_column = ['dataset_id', 'creation_date', 'occupancy']
        sql = 'SELECT * FROM dataset_union WHERE true'
        kwargs['owner'] = kwargs.get('owner', ['official', utils.getOwner()])

        # add filter to query
        if len(kwargs) > 0:
            for key, value in kwargs.iteritems():
                if key in db_view_column:
                    sql += " AND %s ILIKE '%s%%'" % (key, value)
                elif key == 'files':
                    sql += " AND files > %s" % value
                elif key in ['status', 'session', 'owner']:
                    if not isinstance(value, list):
                        value = [value]

                    sql += " AND (false"
                    for s in value:
                        sql += " OR %s ILIKE '%s%%'" % (key, s)
                    sql += ")"

                else:
                    sql += " AND parameters->'%s' ILIKE '%s%%'" % (key, value)

        # clean up the query
        sql = sql.replace('false OR ', '')
        sql = sql.replace('true AND ', '')

        # TODO: add control to prevent sql injection
        datasets = db.read(sql)

        if len(datasets) == 0:
            raise GangaException('No dataset found')

        for i, dataset in enumerate(datasets):
            dataset['id'] = i
            dataset['occupancy_human'] = utils.sizeof_fmt_binary(
                dataset['occupancy'])
            if ('evt_file' in dataset['parameters']
                    and 'evt_tot' not in dataset['parameters']):
                evt_file = int(dataset['parameters']['evt_file'])
                if dataset['files'] is None:
                    dataset['files'] = 0
                files = int(dataset['files'])
                dataset['parameters']['evt_tot'] = evt_file * files
            if 'evt_tot' in dataset['parameters']:
                dataset['parameters']['evt_tot_human'] = utils.sizeof_fmt_decimal(
                    int(dataset['parameters']['evt_tot']))

        return datasets
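
A note on the TODO above: the filter values are interpolated straight into the SQL string. Since db.read is called with a parameter tuple elsewhere in this document (see the downloadDataset example), one possible rework is to bind the values instead of formatting them in. This is only a sketch and assumes the underlying driver understands %s placeholders forwarded through db.read(sql, params):

    # Sketch only: bind filter values instead of interpolating them
    # (assumes db.read(sql, params) passes params to a %s-placeholder driver).
    sql = 'SELECT * FROM dataset_union WHERE true'
    params = []

    for key, value in kwargs.iteritems():
        if key in db_view_column:
            # the column name comes from the whitelist above, so it is safe to interpolate
            sql += " AND %s ILIKE %%s" % key
            params.append(value + '%')
        elif key == 'files':
            sql += " AND files > %s"
            params.append(value)
        # ... the list-valued and parameters-> branches follow the same pattern

    datasets = db.read(sql, tuple(params))
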
Example #2
    def getDataset(self, **kwargs):
        '''Interactive method. It prints the datasets (the user can apply filters),
        the user chooses one of them and inserts the number of LFNs he wants.'''

        manager = SBDatasetManager.SBDatasetManager()

        def validateFilter(filter, allowed):
            kwargs[filter] = kwargs.get(filter, allowed)
            if not isinstance(kwargs[filter], list):
                kwargs[filter] = [kwargs[filter]]
            if not set(kwargs[filter]).issubset(set(allowed)):
                raise GangaException('%s must be %s' % (filter, allowed))

        validateFilter('status', ['open', 'closed'])
        validateFilter('session', ['analysis'])
        kwargs['files'] = 0

        datasets = manager.getDataset(**kwargs)
        dataset = manager.printDatasets(datasets)

        self.dataset_id = dataset['dataset_id']

        print('\nChosen dataset details:')
        manager.printDatasetDetail(dataset)

        print(
            '\nInsert the minimum number of files that you need for your analysis (zero for all):'
        )
        self.files_total = utils.getIndex(maxInclusive=int(dataset['files']))

        lfns = self.__getLFNs()

        tot_size = 0
        tot_files = len(lfns)

        for lfn in lfns:
            tot_size += int(lfn['size'])

        print('\nTotal job input size: ' +
              str(utils.sizeof_fmt_binary(tot_size)))
        print('Total number of involved lfns: ' + str(tot_files))

        print(
            '\nInsert the maximum number of files for each subjob. Remember:')
        print('- maximum output size is 2GiB.')
        print('- suggested maximum job duration is 18h.')
        print('- maximum job input size is 10GiB.')

        self.files_per_subjobs = utils.getIndex(minInclusive=1,
                                                maxInclusive=tot_files)
        job = self.__createInputPath(lfns)

        print('\nSubjobs details:')
        column_names = ['id', 'list_path', 'size', 'lfns']
        print(utils.format_dict_table(job, column_names))
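
The validateFilter helper above both applies a default and normalises a scalar into a list before checking it against the whitelist. A standalone illustration of that behaviour (ValueError stands in for GangaException only to keep the snippet self-contained):

    kwargs = {'status': 'open'}  # caller passed a single string

    def validateFilter(filter, allowed):
        kwargs[filter] = kwargs.get(filter, allowed)
        if not isinstance(kwargs[filter], list):
            kwargs[filter] = [kwargs[filter]]
        if not set(kwargs[filter]).issubset(set(allowed)):
            raise ValueError('%s must be %s' % (filter, allowed))

    validateFilter('status', ['open', 'closed'])  # normalised to ['open']
    validateFilter('session', ['analysis'])       # default applied: ['analysis']
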
Example #3
 def getDataset(self, **kwargs):
     '''Interactive method. It prints the datasets (the user can apply filters),
     the user chooses one of them and inserts the number of events he wants.'''
     
     manager = SBDatasetManager.SBDatasetManager()
     
     def validateFilter(filter, allowed):
         kwargs[filter] = kwargs.get(filter, allowed)
         if not isinstance(kwargs[filter], list):
             kwargs[filter] = [kwargs[filter]]
         if not set(kwargs[filter]).issubset(set(allowed)):
             raise GangaException('%s must be %s' % (filter, allowed))
     
     validateFilter('status', ['open', 'closed'])
     validateFilter('session', ['fastsim', 'fullsim'])
     
     datasets = manager.getDataset(**kwargs)
     dataset = manager.printDatasets(datasets)
     
     self.dataset_id = dataset['dataset_id']
     
     print('\nChosen dataset details:')
     manager.printDatasetDetail(dataset)
     
     print('\nInsert the minimum number of events that you need for your analysis (zero for all):')
     self.events_total = utils.getIndex(maxInclusive=int(dataset['parameters']['evt_tot']))
     
     lfns = self.__getLFNs(dataset['parameters']['evt_file'])
     
     tot_size = 0
     tot_files = len(lfns)
     tot_events = int(dataset['parameters']['evt_file']) * tot_files
     
     for lfn in lfns:
         tot_size += int(lfn['size'])
     
     print('\nTotal job input size: ' + str(utils.sizeof_fmt_binary(tot_size)))
     print('Total selected number of events: ' + str(utils.sizeof_fmt_decimal(tot_events)))
     print('Total number of involved lfns: ' + str(tot_files))
     
     print('\nInsert the maximum number of events for each subjob. Remember:')
     print('- maximum output size is 2GiB.')
     print('- suggested maximum job duration is 18h.')
     print('- maximum job input size is 10GiB.')
     print('- at least %s (the number of events in one file).' % dataset['parameters']['evt_file'])
     
     self.events_per_subjobs = utils.getIndex(minInclusive=int(dataset['parameters']['evt_file']), maxInclusive=tot_events)
     job = self.__createInputPath(lfns, dataset['parameters']['evt_file'])
     
     print('\nSubjobs details:')
     column_names = ['id', 'list_path', 'size', 'events', 'lfns']
     print(utils.format_dict_table(job, column_names))
Example #4
    def downloadDataset(self, **kwargs):
        '''Retrieve all files belonging to an owned dataset from the GRID
        to the submission machine.'''
        # TODO: create SURL file lists beside the LFN list to permit an
        # lcg-cp failover chain implementation and to permit direct plugin
        # subjob configuration from a user-given list

        kwargs['owner'] = utils.getOwner()
        kwargs['files'] = 0

        datasets = self.getDataset(**kwargs)
        dataset = self.printDatasets(datasets)

        dataset_id = dataset['dataset_id']
        files = dataset['files']
        occupancy_human = dataset['occupancy_human']

        home = os.path.expanduser('~')
        s = os.statvfs(home)
        free_disk = utils.sizeof_fmt_binary(s.f_bsize * s.f_bavail)

        #print('\nFree disk space: %s' % free_disk)
        print('\nTotal download size: %s\n' % occupancy_human)

        sql = 'SELECT lfn FROM analysis_output WHERE dataset_id = %s'
        lfns = db.read(sql, (r'\x' + dataset_id, ))

        localdir = os.path.join(home, dataset_id)
        os.mkdir(localdir)

        print('Downloading to %s ...' % localdir)
        i = 1

        for lfn in lfns:
            source = lfn['lfn']
            destination = os.path.join(localdir, source.split('/')[-1])

            process = subprocess.Popen(['lcg-cp', source, destination],
                                       stdout=subprocess.PIPE,
                                       close_fds=True)
            outData, errData = process.communicate()
            retCode = process.poll()

            if retCode != 0:
                raise Exception('lcg-cp failed with return code %d' % retCode)

            sys.stdout.write('\b' * 80 + '%s/%s' % (str(i), str(files)))
            sys.stdout.flush()

            i += 1
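
The download loop above stops at the first lcg-cp failure and only captures stdout. A minimal retry wrapper, sketched under the assumption that simply re-issuing the same lcg-cp command is acceptable; the helper name is illustrative and not part of the original module:

    import subprocess

    def copy_with_retry(source, destination, attempts=3):
        '''Run lcg-cp, retrying a few times and surfacing stderr on failure (sketch).'''
        for attempt in range(1, attempts + 1):
            process = subprocess.Popen(['lcg-cp', source, destination],
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.PIPE,
                                       close_fds=True)
            outData, errData = process.communicate()
            if process.returncode == 0:
                return
            if attempt == attempts:
                raise Exception('lcg-cp failed with return code %d: %s'
                                % (process.returncode, errData))
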
Example #5
 def __createInputPath(self, lfns):
     '''This method splits the list of LFNs between subjobs and writes a 
     text file for each one.'''
     
     # split all lfns between subjobs
     job = list()
     job.append(list())
     size = 0
     files = 0
     maxInput = 10 * (2**30) # 10GiB
     minInput = 2 * (2**30) # 2GiB
     
     # fill the subjobs as long as there are LFNs,
     # to determine the number of subjobs required
     for lfn in lfns:
         if (size + int(lfn['size'])) < maxInput and (files + 1) <= self.files_per_subjobs:
             size += int(lfn['size'])
             files += 1
         else:
             job.append(list())
             size = int(lfn['size'])
             files = 1
         
         job[-1].append(lfn)
     
     self.number_of_subjobs = len(job)
     
     # level the number of LFNs between the subjobs.
     tot_files = len(lfns)
     balanced_number_lfn_per_subjob = int(math.ceil(float(tot_files)/self.number_of_subjobs))
     job = list()
     self.input_path = list()
     max_size = 0
     jobInputDir = self.getJobObject().inputdir
     lfns_index = 0
     
     for subjob_id in xrange(self.number_of_subjobs):
         subjob = dict()
         size = 0
         events = 0
         number_lfns = 0
         subjob['id'] = str(subjob_id)
         subjob['list_path'] = os.path.join(jobInputDir, "list_%d.txt" % subjob_id)
         
         with open(subjob['list_path'], 'w') as f:
             for lfn in lfns[lfns_index:lfns_index + balanced_number_lfn_per_subjob]:
                 f.write(lfn['lfn'] + '\n')
                 size += int(lfn['size'])
                 number_lfns += 1
         
         lfns_index += balanced_number_lfn_per_subjob
         self.input_path.append(File(f.name))
         subjob['size'] = utils.sizeof_fmt_binary(size)
         subjob['lfns'] = number_lfns
         job.append(subjob)
         
         if size > max_size:
             max_size = size
     
     if max_size < minInput:
         logger.warning('The input of these subjobs is very small; to improve '
                        'efficiency you could increase the number of files per subjob.')
     
     return job
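
The second pass above levels the split with a ceiling division: with, say, 10 LFNs and 3 subjobs it assigns 4 LFNs per subjob, so the slices come out as 4, 4 and 2. A standalone sketch of just that slicing step, with a plain list standing in for the LFN dictionaries:

    import math

    lfns = list(range(10))           # stand-in for the LFN dictionaries
    number_of_subjobs = 3
    per_subjob = int(math.ceil(float(len(lfns)) / number_of_subjobs))  # -> 4

    slices = [lfns[i:i + per_subjob]
              for i in range(0, len(lfns), per_subjob)]
    # -> [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]
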