    def importDatasetWithExistingParents(self, sourceDBS, sourceDatasetPath, targetDBS,
                                         onlyClosed=True):
        """
        _importDataset_

        Import a dataset into the local scope DBS.
        It complains if the parent dataset is not there!

        - *sourceDBS* : URL for input DBS instance
        - *sourceDatasetPath* : Dataset Path to be imported
        - *targetDBS* : URL for DBS to have dataset imported to
        """
        reader = DBSReader(sourceDBS)
        inputBlocks = reader.getFileBlocksInfo(sourceDatasetPath, onlyClosed)
        for inputBlock in inputBlocks:
            block = inputBlock["Name"]
            # Test whether the block already exists in the target DBS
            if self.reader.blockExists(block):
                # Block exists: if it is closed, don't attempt the transfer,
                # just refresh its replica locations and move on
                if str(inputBlock["OpenForWriting"]) != "1":
                    msg = "Block already exists in target DBS and is closed:\n"
                    msg += " ==> %s\n" % block
                    msg += "Skipping Import of that block"
                    logging.warning(msg)
                    locations = reader.listFileBlockLocation(block)
                    # only empty file blocks can have no location
                    if not locations and str(inputBlock["NumberOfFiles"]) != "0":
                        msg = "Error in DBSWriter.importDatasetWithExistingParents\n"
                        msg += "Block has no locations defined: %s" % block
                        raise DBSWriterError(msg)
                    logging.info("Update block locations to:")
                    for sename in locations:
                        self.dbs.addReplicaToBlock(block, sename)
                        logging.info(sename)
                    continue

            try:
                xferData = reader.dbs.listDatasetContents(sourceDatasetPath, block)
            except DbsException as ex:
                msg = "Error in DBSWriter.importDatasetWithExistingParents\n"
                msg += "Could not read content of dataset:\n ==> %s\n" % (sourceDatasetPath,)
                msg += "Block name:\n ==> %s\n" % block
                msg += "%s\n" % formatEx(ex)
                raise DBSWriterError(msg)
            try:
                self.dbs.insertDatasetContents(xferData)
            except DbsException as ex:
                msg = "Error in DBSWriter.importDatasetWithExistingParents\n"
                msg += "Could not write content of dataset:\n ==> %s\n" % (sourceDatasetPath,)
                msg += "Block name:\n ==> %s\n" % block
                msg += "%s\n" % formatEx(ex)
                raise DBSWriterError(msg)
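
# A minimal sketch of how importDatasetWithExistingParents might be driven.
# The DBSWriter constructor arguments and the target URL below are assumptions
# for illustration only; nothing in this excerpt confirms them.
if __name__ == '__main__':
    import logging
    logging.basicConfig(level=logging.INFO)

    sourceDBS = "https://cmsweb.cern.ch/dbs/prod/global/DBSReader"
    targetDBS = "https://dbs.example.invalid/local/DBSWriter"  # placeholder URL

    writer = DBSWriter(targetDBS)  # assumed constructor signature
    # copy every closed block of the dataset, reusing parents that are
    # already present in the target instance
    writer.importDatasetWithExistingParents(sourceDBS,
                                            "/HighPileUp/Run2011A-v1/RAW",
                                            targetDBS,
                                            onlyClosed=True)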
class DBSReaderTest(unittest.TestCase):

    def setUp(self):
        """
        _setUp_

        Initialize the PhEDEx API to point at the test server.
        """
        #self.endpoint = "http://cmsdbsprod.cern.ch/cms_dbs_prod_global/servlet/DBSServlet"
        self.endpoint = 'https://cmsweb.cern.ch/dbs/prod/global/DBSReader'
        self.dbs = None
        return

    @attr("integration")
    def testListPrimaryDatasets(self):
        """
        listPrimaryDatasets returns known primary datasets
        """
        self.dbs = DBSReader(self.endpoint)
        results = self.dbs.listPrimaryDatasets('Jet*')
        self.assertTrue('Jet' in results)
        self.assertTrue('JetMET' in results)
        self.assertTrue('JetMETTau' in results)
        self.assertFalse(self.dbs.listPrimaryDatasets('DoesntExist'))
        return

    @attr("integration")
    def testMatchProcessedDatasets(self):
        """
        matchProcessedDatasets returns known processed datasets
        """
        self.dbs = DBSReader(self.endpoint)
        dataset = self.dbs.matchProcessedDatasets('Jet', 'RAW', 'Run2011A-v1')
        self.assertEqual(1, len(dataset))
        self.assertEqual(['/Jet/Run2011A-v1/RAW'], dataset[0]['PathList'])
        self.assertEqual('Run2011A-v1', dataset[0]['Name'])
        self.assertFalse(self.dbs.matchProcessedDatasets('Jet', 'RAW', 'Run2011A-v666'))

    @attr("integration")
    def testlistRuns(self):
        """listRuns returns known runs"""
        self.dbs = DBSReader(self.endpoint)
        runs = self.dbs.listRuns(dataset=DATASET)
        self.assertEqual(46, len(runs))
        self.assertTrue(174074 in runs)
        runs = self.dbs.listRuns(dataset=DATASET, block=BLOCK)
        self.assertEqual([173657], runs)

    @attr("integration")
    def testlistRunLumis(self):
        """listRunLumis returns known runs and lumicounts"""
        self.dbs = DBSReader(self.endpoint)
        runs = self.dbs.listRunLumis(dataset=DATASET)
        self.assertEqual(46, len(runs))
        self.assertTrue(173692 in runs)
        self.assertEqual(runs[173692], 2782)
        # listRunLumis (not listRuns) is needed here: the assertion expects a
        # {run: lumi count} mapping
        runs = self.dbs.listRunLumis(dataset=DATASET, block=BLOCK)
        self.assertEqual({173657: 94}, runs)

    @attr("integration")
    def testListProcessedDatasets(self):
        """listProcessedDatasets returns known processed datasets"""
        self.dbs = DBSReader(self.endpoint)
        datasets = self.dbs.listProcessedDatasets('Jet', 'RAW')
        self.assertTrue('Run2011A-v1' in datasets)
        self.assertTrue('Run2011B-v1' in datasets)
        self.assertFalse(self.dbs.listProcessedDatasets('Jet', 'blah'))
        self.assertFalse(self.dbs.listProcessedDatasets('blah', 'RAW'))

    @attr("integration")
    def testlistDatasetFiles(self):
        """listDatasetFiles returns files in dataset"""
        self.dbs = DBSReader(self.endpoint)
        files = self.dbs.listDatasetFiles(DATASET)
        self.assertEqual(49, len(files))
        self.assertTrue(FILE in files)

    @attr("integration")
    def testlistDatasetFileDetails(self):
        """listDatasetFileDetails returns lumis, events, and parents of a dataset"""
        TESTFILE = '/store/data/Run2011A/HighPileUp/RAW/v1/000/173/658/56484BAB-CBCB-E011-AF00-BCAEC518FF56.root'
        for endpoint in [self.endpoint,
                         'test/python/WMCore_t/Services_t/DBS_t/DBSReader_t.py:']:
            self.dbs = DBSReader(endpoint)
            details = self.dbs.listDatasetFileDetails(DATASET)
            self.assertEqual(len(details), 49)
            self.assertTrue(TESTFILE in details)
            self.assertEqual(details[TESTFILE]['NumberOfEvents'], 545)
            self.assertEqual(details[TESTFILE]['Size'], 286021145)
            self.assertEqual(details[TESTFILE]['BlockName'],
                             '/HighPileUp/Run2011A-v1/RAW#dd6e0796-cbcc-11e0-80a9-003048caaace')
            self.assertEqual(details[TESTFILE]['Checksums'],
                             {'Checksum': '22218315', 'Adler32': 'a41a1446', 'Md5': 'NOTSET'})
            self.assertTrue(173658 in details[TESTFILE]['Lumis'])
            self.assertEqual(
                sorted(details[TESTFILE]['Lumis'][173658]),
                sorted(map(long,
                           [8, 12, 9, 14, 10, 6, 2, 1, 4, 3, 36, 49, 16, 11, 27, 35, 46, 39, 20, 24,
                            52, 23, 40, 42, 45, 21, 32, 37, 25, 22, 5, 33, 17, 15, 26, 50, 18, 29,
                            51, 44, 69, 43, 30, 73, 19, 41, 13, 38, 7, 31, 75, 48, 59, 65, 55, 57,
                            34, 28, 74, 47, 64, 61, 68, 77, 66, 71, 60, 76, 70, 67, 62, 78, 82, 79,
                            88, 56, 101, 92, 58, 72, 54, 63, 96, 53, 84, 95, 89, 85, 99, 81, 91,
                            102, 80, 100, 107, 94, 93, 90, 86, 87, 83, 97, 104, 110, 111, 106, 108,
                            98, 103, 109, 105])))

    @attr("integration")
    def testGetDBSSummaryInfo(self):
        """getDBSSummaryInfo returns summary of dataset and block"""
        self.dbs = DBSReader(self.endpoint)
        dataset = self.dbs.getDBSSummaryInfo(DATASET)
        self.assertEqual(dataset['path'], DATASET)
        self.assertEqual(dataset['block'], '')
        self.assertEqual(dataset['NumberOfEvents'], '22075')
        self.assertEqual(dataset['NumberOfBlocks'], '46')
        self.assertEqual(dataset['total_size'], '4001680824')
        self.assertEqual(dataset['NumberOfFiles'], '49')
        self.assertEqual(dataset['NumberOfLumis'], '7223')

        block = self.dbs.getDBSSummaryInfo(DATASET, BLOCK)
        self.assertEqual(block['path'], '')
        self.assertEqual(block['block'], BLOCK)
        self.assertEqual(block['NumberOfEvents'], '377')
        self.assertEqual(block['NumberOfBlocks'], '1')
        self.assertEqual(block['total_size'], '150780132')
        self.assertEqual(block['NumberOfFiles'], '2')
        self.assertEqual(block['NumberOfLumis'], '94')

        self.assertRaises(DBSReaderError, self.dbs.getDBSSummaryInfo, DATASET + 'blah')
        self.assertRaises(DBSReaderError, self.dbs.getDBSSummaryInfo, DATASET, BLOCK + 'asas')

    @attr("integration")
    def testGetFileBlocksInfo(self):
        """getFileBlocksInfo returns block info, including location lookup"""
        self.dbs = DBSReader(self.endpoint)
        blocks = self.dbs.getFileBlocksInfo(DATASET)
        block = self.dbs.getFileBlocksInfo(DATASET, blockName=BLOCK)
        self.assertEqual(1, len(block))
        block = block[0]
        self.assertEqual(46, len(blocks))
        self.assertTrue(block['Name'] in [x['Name'] for x in blocks])
        self.assertEqual(BLOCK, block['Name'])
        #self.assertEqual(377, block['NumberOfEvents'])
        self.assertEqual(150780132, block['BlockSize'])
        self.assertEqual(2, block['NumberOfFiles'])
        # possibly fragile but assume block located at least at cern
        sites = [x['Name'] for x in block['StorageElementList']
                 if x['Name'].find('cern.ch') > -1]
        self.assertTrue(sites)
        # weird error handling - depends on whether block or dataset is missing
        self.assertRaises(DBSReaderError, self.dbs.getFileBlocksInfo, DATASET + 'blah')
        self.assertFalse(self.dbs.getFileBlocksInfo(DATASET, blockName=BLOCK + 'asas'))

    @attr("integration")
    def testListFileBlocks(self):
        """listFileBlocks returns block names in dataset"""
        self.dbs = DBSReader(self.endpoint)
        blocks = self.dbs.listFileBlocks(DATASET)
        # block is closed
        block = self.dbs.listFileBlocks(DATASET, blockName=BLOCK, onlyClosedBlocks=True)[0]
        self.assertEqual(block, BLOCK)
        self.assertTrue(BLOCK in block)

    @attr("integration")
    def testListOpenFileBlocks(self):
        """listOpenFileBlocks finds open blocks"""
        # hard to find a dataset with open blocks, so don't bother
        self.dbs = DBSReader(self.endpoint)
        self.assertFalse(self.dbs.listOpenFileBlocks(DATASET))

    @attr("integration")
    def testBlockExists(self):
        """blockExists returns existence of blocks"""
        self.dbs = DBSReader(self.endpoint)
        self.assertTrue(self.dbs.blockExists(BLOCK))
        self.assertFalse(self.dbs.blockExists(DATASET + '#somethingelse'))

    @attr("integration")
    def testListFilesInBlock(self):
        """listFilesInBlock returns files in block"""
        self.dbs = DBSReader(self.endpoint)
        self.assertTrue(FILE in
                        [x['LogicalFileName'] for x in self.dbs.listFilesInBlock(BLOCK)])
        self.assertRaises(DBSReaderError, self.dbs.listFilesInBlock, DATASET + '#blah')

    @attr("integration")
    def testListFilesInBlockWithParents(self):
        """listFilesInBlockWithParents gets files with parents for a block"""
        # hope PromptReco doesn't get deleted
        self.dbs = DBSReader(self.endpoint)
        files = self.dbs.listFilesInBlockWithParents(
            '/Jet/Run2011A-PromptReco-v1/RECO#f8d36af3-4fb6-11e0-9d39-00151755cb60')
        self.assertEqual(1, len(files))
        self.assertEqual('/Jet/Run2011A-PromptReco-v1/RECO#f8d36af3-4fb6-11e0-9d39-00151755cb60',
                         files[0]['Block']['Name'])
        self.assertEqual('/store/data/Run2011A/Jet/RAW/v1/000/160/433/24B46223-0D4E-E011-B573-0030487C778E.root',
                         files[0]['ParentList'][0]['LogicalFileName'])
        self.assertRaises(DBSReaderError, self.dbs.listFilesInBlockWithParents, BLOCK + 'asas')

    @attr("integration")
    def testLfnsInBlock(self):
        """lfnsInBlock returns lfns in block"""
        self.dbs = DBSReader(self.endpoint)
        self.assertTrue(FILE in self.dbs.lfnsInBlock(BLOCK))
        self.assertRaises(DBSReaderError, self.dbs.lfnsInBlock, BLOCK + 'asas')

    @attr("integration")
    def testListFileBlockLocation(self):
        """listFileBlockLocation returns block location"""
        WRONG_BLOCK = BLOCK[:-4] + 'abcd'
        BLOCK2 = '/HighPileUp/Run2011A-v1/RAW#6021175e-cbfb-11e0-80a9-003048caaace'
        DBS_BLOCK = '/GenericTTbar/hernan-140317_231446_crab_JH_ASO_test_T2_ES_CIEMAT_5000_100_140318_0014-' + \
                    'ea0972193530f531086947d06eb0f121/USER#fb978442-a61b-413a-b4f4-526e6cdb142e'
        DBS_BLOCK2 = '/GenericTTbar/hernan-140317_231446_crab_JH_ASO_test_T2_ES_CIEMAT_5000_100_140318_0014-' + \
                     'ea0972193530f531086947d06eb0f121/USER#0b04d417-d734-4ef2-88b0-392c48254dab'
        self.dbs = DBSReader('https://cmsweb.cern.ch/dbs/prod/phys03/DBSReader/')
        # assume one site is cern
        sites = [x for x in self.dbs.listFileBlockLocation(BLOCK)
                 if x and x.find('cern.ch') > -1]
        self.assertTrue(sites)
        # this block is only found in DBS
        self.assertTrue(self.dbs.listFileBlockLocation(DBS_BLOCK))
        # doesn't raise on non-existent block
        self.assertFalse(self.dbs.listFileBlockLocation(WRONG_BLOCK))
        # test the bulk call:
        ## two blocks in phedex
        self.assertEqual(2, len(self.dbs.listFileBlockLocation([BLOCK, BLOCK2])))
        ## one block in phedex, one does not exist
        self.assertEqual(1, len(self.dbs.listFileBlockLocation([BLOCK, WRONG_BLOCK])))
        ## one in phedex, one in dbs
        self.assertEqual(2, len(self.dbs.listFileBlockLocation([BLOCK, DBS_BLOCK])))
        ## two in dbs
        self.assertEqual(2, len(self.dbs.listFileBlockLocation([DBS_BLOCK, DBS_BLOCK2])))
        ## one in DBS and one does not exist
        self.assertEqual(1, len(self.dbs.listFileBlockLocation([DBS_BLOCK, WRONG_BLOCK])))

    @attr("integration")
    def testGetFileBlock(self):
        """getFileBlock returns block"""
        self.dbs = DBSReader(self.endpoint)
        block = self.dbs.getFileBlock(BLOCK)
        self.assertEqual(len(block), 1)
        block = block[BLOCK]
        self.assertEqual(2, len(block['Files']))
        self.assertRaises(DBSReaderError, self.dbs.getFileBlock, BLOCK + 'asas')

    @attr("integration")
    def testGetFileBlockWithParents(self):
        """getFileBlockWithParents returns block and parents"""
        self.dbs = DBSReader(self.endpoint)
        block = self.dbs.getFileBlockWithParents(
            '/Jet/Run2011A-PromptReco-v1/RECO#f8d36af3-4fb6-11e0-9d39-00151755cb60')
        self.assertEqual(len(block), 1)
        block = block['/Jet/Run2011A-PromptReco-v1/RECO#f8d36af3-4fb6-11e0-9d39-00151755cb60']
        self.assertEqual('/store/data/Run2011A/Jet/RAW/v1/000/160/433/24B46223-0D4E-E011-B573-0030487C778E.root',
                         block['Files'][0]['ParentList'][0]['LogicalFileName'])
        self.assertRaises(DBSReaderError, self.dbs.getFileBlockWithParents, BLOCK + 'asas')

    @attr("integration")
    def testGetFiles(self):
        """getFiles returns files in dataset"""
        self.dbs = DBSReader(self.endpoint)
        files = self.dbs.getFiles(DATASET)
        self.assertEqual(len(files), 46)

    @attr("integration")
    def testListBlockParents(self):
        """listBlockParents returns block parents"""
        self.dbs = DBSReader(self.endpoint)
        parents = self.dbs.listBlockParents(
            '/Jet/Run2011A-PromptReco-v1/RECO#f8d36af3-4fb6-11e0-9d39-00151755cb60')
        self.assertEqual(1, len(parents))
        self.assertEqual('/Jet/Run2011A-v1/RAW#37cf2a40-4e0e-11e0-9833-00151755cb60',
                         parents[0]['Name'])
        sites = [x for x in parents[0]['StorageElementList'] if x.find("cern.ch") > -1]
        self.assertTrue(sites)
        self.assertFalse(self.dbs.listBlockParents(
            '/Jet/Run2011A-PromptReco-v1/RECO#f8d36af3-4fb6-11e0-9d39-00151755cb60dsl'))

    @attr("integration")
    def testBlockIsOpen(self):
        """blockIsOpen checks if a block is open"""
        self.dbs = DBSReader(self.endpoint)
        self.assertFalse(self.dbs.blockIsOpen(BLOCK))

    @attr("integration")
    def testBlockToDatasetPath(self):
        """blockToDatasetPath extracts path from block name"""
        self.dbs = DBSReader(self.endpoint)
        self.assertEqual(self.dbs.blockToDatasetPath(BLOCK), DATASET)
        self.assertFalse(self.dbs.blockToDatasetPath(BLOCK + 'asas'))
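
# These test classes assume a module preamble along these lines. The import
# paths follow WMCore conventions and the fixture values are inferred from
# the assertions above; the BLOCK/FILE suffixes are illustrative
# placeholders, not verified values.
import unittest

from nose.plugins.attrib import attr  # enables `nosetests -a integration`

from WMCore.Services.DBS.DBSReader import DBSReader
from WMCore.Services.DBS.DBSErrors import DBSReaderError

DATASET = '/HighPileUp/Run2011A-v1/RAW'
BLOCK = '/HighPileUp/Run2011A-v1/RAW#<block-uuid>'     # illustrative suffix
FILE = '/store/data/Run2011A/HighPileUp/RAW/v1/<lfn>'  # illustrative LFN

if __name__ == '__main__':
    unittest.main()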
class DBSReaderTest(unittest.TestCase):

    def setUp(self):
        """
        _setUp_

        Initialize the PhEDEx API to point at the test server.
        """
        #self.endpoint = "http://cmsdbsprod.cern.ch/cms_dbs_prod_global/servlet/DBSServlet"
        self.endpoint = 'https://cmsweb.cern.ch/dbs/prod/global/DBSReader'
        self.dbs = None
        return

    @attr("integration")
    def testListPrimaryDatasets(self):
        """
        listPrimaryDatasets returns known primary datasets
        """
        self.dbs = DBSReader(self.endpoint)
        results = self.dbs.listPrimaryDatasets('Jet*')
        self.assertTrue('Jet' in results)
        self.assertTrue('JetMET' in results)
        self.assertTrue('JetMETTau' in results)
        self.assertFalse(self.dbs.listPrimaryDatasets('DoesntExist'))
        return

    @attr("integration")
    def testMatchProcessedDatasets(self):
        """
        matchProcessedDatasets returns known processed datasets
        """
        self.dbs = DBSReader(self.endpoint)
        dataset = self.dbs.matchProcessedDatasets('Jet', 'RAW', 'Run2011A-v1')
        self.assertEqual(1, len(dataset))
        self.assertEqual(['/Jet/Run2011A-v1/RAW'], dataset[0]['PathList'])
        self.assertEqual('Run2011A-v1', dataset[0]['Name'])
        self.assertFalse(self.dbs.matchProcessedDatasets('Jet', 'RAW', 'Run2011A-v666'))

    @attr("integration")
    def testlistRuns(self):
        """listRuns returns known runs"""
        self.dbs = DBSReader(self.endpoint)
        runs = self.dbs.listRuns(dataset=DATASET)
        self.assertEqual(46, len(runs))
        self.assertTrue(174074 in runs)
        runs = self.dbs.listRuns(dataset=DATASET, block=BLOCK)
        self.assertEqual([173657], runs)

    @attr("integration")
    def testlistRunLumis(self):
        """listRunLumis returns known runs and lumicounts"""
        self.dbs = DBSReader(self.endpoint)
        runs = self.dbs.listRunLumis(dataset=DATASET)
        self.assertEqual(46, len(runs))
        self.assertTrue(173692 in runs)
        self.assertEqual(runs[173692], 2782)
        # listRunLumis (not listRuns) is needed here: the assertion expects a
        # {run: lumi count} mapping
        runs = self.dbs.listRunLumis(dataset=DATASET, block=BLOCK)
        self.assertEqual({173657: 94}, runs)

    @attr("integration")
    def testListProcessedDatasets(self):
        """listProcessedDatasets returns known processed datasets"""
        self.dbs = DBSReader(self.endpoint)
        datasets = self.dbs.listProcessedDatasets('Jet', 'RAW')
        self.assertTrue('Run2011A-v1' in datasets)
        self.assertTrue('Run2011B-v1' in datasets)
        self.assertFalse(self.dbs.listProcessedDatasets('Jet', 'blah'))
        self.assertFalse(self.dbs.listProcessedDatasets('blah', 'RAW'))

    @attr("integration")
    def testlistDatasetFiles(self):
        """listDatasetFiles returns files in dataset"""
        self.dbs = DBSReader(self.endpoint)
        files = self.dbs.listDatasetFiles(DATASET)
        self.assertEqual(49, len(files))
        self.assertTrue(FILE in files)

    @attr("integration")
    def testGetDBSSummaryInfo(self):
        """getDBSSummaryInfo returns summary of dataset and block"""
        self.dbs = DBSReader(self.endpoint)
        dataset = self.dbs.getDBSSummaryInfo(DATASET)
        self.assertEqual(dataset['path'], DATASET)
        self.assertEqual(dataset['block'], '')
        self.assertEqual(dataset['NumberOfEvents'], '22075')
        self.assertEqual(dataset['NumberOfBlocks'], '46')
        self.assertEqual(dataset['total_size'], '4001680824')
        self.assertEqual(dataset['NumberOfFiles'], '49')
        self.assertEqual(dataset['NumberOfLumis'], '7223')

        block = self.dbs.getDBSSummaryInfo(DATASET, BLOCK)
        self.assertEqual(block['path'], '')
        self.assertEqual(block['block'], BLOCK)
        self.assertEqual(block['NumberOfEvents'], '377')
        self.assertEqual(block['NumberOfBlocks'], '1')
        self.assertEqual(block['total_size'], '150780132')
        self.assertEqual(block['NumberOfFiles'], '2')
        self.assertEqual(block['NumberOfLumis'], '94')

        self.assertRaises(DBSReaderError, self.dbs.getDBSSummaryInfo, DATASET + 'blah')
        self.assertRaises(DBSReaderError, self.dbs.getDBSSummaryInfo, DATASET, BLOCK + 'asas')

    @attr("integration")
    def testGetFileBlocksInfo(self):
        """getFileBlocksInfo returns block info, including location lookup"""
        self.dbs = DBSReader(self.endpoint)
        blocks = self.dbs.getFileBlocksInfo(DATASET)
        block = self.dbs.getFileBlocksInfo(DATASET, blockName=BLOCK)
        self.assertEqual(1, len(block))
        block = block[0]
        self.assertEqual(46, len(blocks))
        self.assertTrue(block['Name'] in [x['Name'] for x in blocks])
        self.assertEqual(BLOCK, block['Name'])
        #self.assertEqual(377, block['NumberOfEvents'])
        self.assertEqual(150780132, block['BlockSize'])
        self.assertEqual(2, block['NumberOfFiles'])
        # possibly fragile but assume block located at least at cern
        sites = [x['Name'] for x in block['StorageElementList']
                 if x['Name'].find('cern.ch') > -1]
        self.assertTrue(sites)
        # weird error handling - depends on whether block or dataset is missing
        self.assertRaises(DBSReaderError, self.dbs.getFileBlocksInfo, DATASET + 'blah')
        self.assertFalse(self.dbs.getFileBlocksInfo(DATASET, blockName=BLOCK + 'asas'))

    @attr("integration")
    def testListFileBlocks(self):
        """listFileBlocks returns block names in dataset"""
        self.dbs = DBSReader(self.endpoint)
        blocks = self.dbs.listFileBlocks(DATASET)
        # block is closed
        block = self.dbs.listFileBlocks(DATASET, blockName=BLOCK, onlyClosedBlocks=True)[0]
        self.assertEqual(block, BLOCK)
        self.assertTrue(BLOCK in block)

    @attr("integration")
    def testListOpenFileBlocks(self):
        """listOpenFileBlocks finds open blocks"""
        # hard to find a dataset with open blocks, so don't bother
        self.dbs = DBSReader(self.endpoint)
        self.assertFalse(self.dbs.listOpenFileBlocks(DATASET))

    @attr("integration")
    def testBlockExists(self):
        """blockExists returns existence of blocks"""
        self.dbs = DBSReader(self.endpoint)
        self.assertTrue(self.dbs.blockExists(BLOCK))
        self.assertFalse(self.dbs.blockExists(DATASET + '#somethingelse'))

    @attr("integration")
    def testListFilesInBlock(self):
        """listFilesInBlock returns files in block"""
        self.dbs = DBSReader(self.endpoint)
        self.assertTrue(FILE in
                        [x['LogicalFileName'] for x in self.dbs.listFilesInBlock(BLOCK)])
        self.assertRaises(DBSReaderError, self.dbs.listFilesInBlock, DATASET + '#blah')

    @attr("integration")
    def testListFilesInBlockWithParents(self):
        """listFilesInBlockWithParents gets files with parents for a block"""
        # hope PromptReco doesn't get deleted
        self.dbs = DBSReader(self.endpoint)
        files = self.dbs.listFilesInBlockWithParents(
            '/Jet/Run2011A-PromptReco-v1/RECO#f8d36af3-4fb6-11e0-9d39-00151755cb60')
        self.assertEqual(1, len(files))
        self.assertEqual('/Jet/Run2011A-PromptReco-v1/RECO#f8d36af3-4fb6-11e0-9d39-00151755cb60',
                         files[0]['Block']['Name'])
        self.assertEqual('/store/data/Run2011A/Jet/RAW/v1/000/160/433/24B46223-0D4E-E011-B573-0030487C778E.root',
                         files[0]['ParentList'][0]['LogicalFileName'])
        self.assertRaises(DBSReaderError, self.dbs.listFilesInBlockWithParents, BLOCK + 'asas')

    @attr("integration")
    def testLfnsInBlock(self):
        """lfnsInBlock returns lfns in block"""
        self.dbs = DBSReader(self.endpoint)
        self.assertTrue(FILE in self.dbs.lfnsInBlock(BLOCK))
        self.assertRaises(DBSReaderError, self.dbs.lfnsInBlock, BLOCK + 'asas')

    @attr("integration")
    def testListFileBlockLocation(self):
        """listFileBlockLocation returns block location"""
        self.dbs = DBSReader(self.endpoint)
        # assume one site is cern
        sites = [x for x in self.dbs.listFileBlockLocation(BLOCK) if x.find('cern.ch') > -1]
        self.assertTrue(sites)
        # doesn't raise on non-existent block
        self.assertFalse(self.dbs.listFileBlockLocation(BLOCK + 'blah'))

    @attr("integration")
    def testGetFileBlock(self):
        """getFileBlock returns block"""
        self.dbs = DBSReader(self.endpoint)
        block = self.dbs.getFileBlock(BLOCK)
        self.assertEqual(len(block), 1)
        block = block[BLOCK]
        self.assertEqual(2, len(block['Files']))
        self.assertRaises(DBSReaderError, self.dbs.getFileBlock, BLOCK + 'asas')

    @attr("integration")
    def testGetFileBlockWithParents(self):
        """getFileBlockWithParents returns block and parents"""
        self.dbs = DBSReader(self.endpoint)
        block = self.dbs.getFileBlockWithParents(
            '/Jet/Run2011A-PromptReco-v1/RECO#f8d36af3-4fb6-11e0-9d39-00151755cb60')
        self.assertEqual(len(block), 1)
        block = block['/Jet/Run2011A-PromptReco-v1/RECO#f8d36af3-4fb6-11e0-9d39-00151755cb60']
        self.assertEqual('/store/data/Run2011A/Jet/RAW/v1/000/160/433/24B46223-0D4E-E011-B573-0030487C778E.root',
                         block['Files'][0]['ParentList'][0]['LogicalFileName'])
        self.assertRaises(DBSReaderError, self.dbs.getFileBlockWithParents, BLOCK + 'asas')

    @attr("integration")
    def testGetFiles(self):
        """getFiles returns files in dataset"""
        self.dbs = DBSReader(self.endpoint)
        files = self.dbs.getFiles(DATASET)
        self.assertEqual(len(files), 46)

    @attr("integration")
    def testListBlockParents(self):
        """listBlockParents returns block parents"""
        self.dbs = DBSReader(self.endpoint)
        parents = self.dbs.listBlockParents(
            '/Jet/Run2011A-PromptReco-v1/RECO#f8d36af3-4fb6-11e0-9d39-00151755cb60')
        self.assertEqual(1, len(parents))
        self.assertEqual('/Jet/Run2011A-v1/RAW#37cf2a40-4e0e-11e0-9833-00151755cb60',
                         parents[0]['Name'])
        sites = [x for x in parents[0]['StorageElementList'] if x.find("cern.ch") > -1]
        self.assertTrue(sites)
        self.assertFalse(self.dbs.listBlockParents(
            '/Jet/Run2011A-PromptReco-v1/RECO#f8d36af3-4fb6-11e0-9d39-00151755cb60dsl'))

    @attr("integration")
    def testBlockIsOpen(self):
        """blockIsOpen checks if a block is open"""
        self.dbs = DBSReader(self.endpoint)
        self.assertFalse(self.dbs.blockIsOpen(BLOCK))

    @attr("integration")
    def testBlockToDatasetPath(self):
        """blockToDatasetPath extracts path from block name"""
        self.dbs = DBSReader(self.endpoint)
        self.assertEqual(self.dbs.blockToDatasetPath(BLOCK), DATASET)
        self.assertFalse(self.dbs.blockToDatasetPath(BLOCK + 'asas'))
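
# listRunLumis returns a {run: lumi-count} map (with None counts on DBS3, as
# a later test class in this file notes). A small sketch of how such a map
# might be aggregated; totalLumis is a hypothetical helper, not part of the
# DBSReader API.
def totalLumis(runLumiMap):
    """Sum the lumi counts, skipping runs whose count is unknown (None)."""
    return sum(count for count in runLumiMap.values() if count is not None)

# e.g. totalLumis({173657: 94, 173692: 2782}) == 2876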
class DBSDataDiscovery(DataDiscovery):
    """Performing the data discovery through CMS DBS service.
    """

    def checkDatasetStatus(self, dataset, kwargs):
        res = self.dbs.dbs.listDatasets(dataset=dataset, detail=1, dataset_access_type='*')
        if not res:
            raise TaskWorkerException("Cannot find dataset %s in %s DBS instance" % (dataset, self.dbsInstance))
        if len(res) > 1:
            raise TaskWorkerException("Found more than one dataset while checking in DBS the status of %s" % dataset)
        res = res[0]
        #import pprint
        #self.logger.info("Input dataset details: %s", pprint.pformat(res))
        accessType = res['dataset_access_type']
        if accessType != 'VALID':
            # as per Dima's suggestion https://github.com/dmwm/CRABServer/issues/4739
            msgForDeprecDS = "Please contact your physics group if you think the dataset should not be deprecated."
            if kwargs['task']['tm_nonvalid_input_dataset'] != 'T':
                msg = "CRAB refuses to proceed in getting the details of the dataset %s from DBS, because the dataset is not 'VALID' but '%s'." % (dataset, accessType)
                if accessType == 'DEPRECATED':
                    msg += " (%s)" % (msgForDeprecDS)
                msg += " To allow CRAB to consider a dataset that is not 'VALID', set Data.allowNonValidInputDataset = True in the CRAB configuration."
                msg += " Notice that this will not force CRAB to run over all files in the dataset;"
                msg += " CRAB will still check if there are any valid files in the dataset and run only over those files."
                raise TaskWorkerException(msg)
            msg = "The input dataset %s is not 'VALID' but '%s'." % (dataset, accessType)
            msg += " CRAB will check if there are any valid files in the dataset and run only over those files."
            if accessType == 'DEPRECATED':
                msg += " %s" % (msgForDeprecDS)
            self.uploadWarning(msg, kwargs['task']['user_proxy'], kwargs['task']['tm_taskname'])
        return

    def keepOnlyDisks(self, locationsMap):
        phedex = PhEDEx()  # TODO use certs from the config!
        # get all the PNNs that are of kind 'Disk'
        try:
            diskLocations = set([pnn['name'] for pnn in phedex.getNodeMap()['phedex']['node']
                                 if pnn['kind'] == 'Disk'])
        except HTTPException as ex:
            self.logger.error(ex.headers)
            raise TaskWorkerException("The CRAB3 server backend could not contact phedex to get the list of site storages.\n" +
                                      "This could be a temporary phedex glitch, please try to submit a new task (resubmit will not work)" +
                                      " and contact the experts if the error persists.\nError reason: %s" % str(ex))
            # TODO: add the phedex nodes to the message so the user can check themselves
        diskLocationsMap = {}
        for block, locations in locationsMap.iteritems():
            # ignore OpenData until it is accessible by CRAB
            locations[:] = [x for x in locations if x != 'T3_CH_CERN_OpenData']
            if set(locations) & diskLocations:
                # at least some locations are disk
                diskLocationsMap[block] = locationsMap[block]
            else:
                # no locations are in the disk list, assume that they are tape
                self.tapeLocations = self.tapeLocations.union(set(locations) - diskLocations)
        locationsMap.clear()  # remove all blocks
        locationsMap.update(diskLocationsMap)  # add back only the blocks with disk locations

    def checkBlocksSize(self, blocks):
        """ Make sure no single block has more than 100k lumis.
            See https://hypernews.cern.ch/HyperNews/CMS/get/dmDevelopment/2022/1/1/1/1/1/1/2.html
        """
        MAX_LUMIS = 100000
        for block in blocks:
            blockInfo = self.dbs.getDBSSummaryInfo(block=block)
            if blockInfo.get('NumberOfLumis', 0) > MAX_LUMIS:
                msg = "Block %s contains more than %s lumis.\nThis blows up CRAB server memory" % (block, MAX_LUMIS)
                msg += "\nCRAB can only split this by ignoring lumi information. You can do this"
                msg += "\nusing the FileBased split algorithm and avoiding any additional request"
                msg += "\nwhich may cause lumi information to be looked up. See the CRAB FAQ for more info:"
                msg += "\nhttps://twiki.cern.ch/twiki/bin/view/CMSPublic/CRAB3FAQ"
                raise TaskWorkerException(msg)

    def execute(self, *args, **kwargs):
        """
        This is a convenience wrapper around the executeInternal function
        """
        # DBS3 requires X509_USER_CERT to be set - but we don't want to leak that
        # to other modules, so use a context manager to set an ad hoc env and
        # restore it as soon as executeInternal is over, even if it raises an exception
        with self.config.TaskWorker.envForCMSWEB:
            result = self.executeInternal(*args, **kwargs)
        return result

    def executeInternal(self, *args, **kwargs):
        self.logger.info("Data discovery with DBS")  ## to be changed into debug
        dbsurl = self.config.Services.DBSUrl
        if kwargs['task']['tm_dbs_url']:
            dbsurl = kwargs['task']['tm_dbs_url']
        self.dbs = DBSReader(dbsurl)
        self.dbsInstance = self.dbs.dbs.serverinfo()["dbs_instance"]
        isUserDataset = self.dbsInstance.split('/')[1] != 'global'
        # where to look up locations in the pre-Rucio world
        PhEDExOrDBS = 'PhEDEx' if not isUserDataset else 'DBS origin site'
        taskName = kwargs['task']['tm_taskname']
        userProxy = kwargs['task']['user_proxy']
        self.logger.debug("Data discovery through %s for %s", self.dbs, taskName)
        inputDataset = kwargs['task']['tm_input_dataset']
        secondaryDataset = kwargs['task'].get('tm_secondary_input_dataset', None)

        self.checkDatasetStatus(inputDataset, kwargs)
        if secondaryDataset:
            self.checkDatasetStatus(secondaryDataset, kwargs)

        try:
            # Get the list of blocks for the locations.
            # The WMCore DBS3 implementation makes one call to DBS for each block
            # when using locations=True, so we use locations=False and look up
            # the locations later
            blocks = [x['Name'] for x in self.dbs.getFileBlocksInfo(inputDataset, locations=False)]
            if secondaryDataset:
                secondaryBlocks = [x['Name'] for x in self.dbs.getFileBlocksInfo(secondaryDataset, locations=False)]
        except DBSReaderError as dbsexc:
            # dataset not found in DBS is a known use case
            if 'No matching data' in str(dbsexc):
                raise TaskWorkerException("CRAB could not find dataset %s in this DBS instance: %s" % (inputDataset, dbsurl))
            raise
        ## Create a map for block's locations: for each block get the list of locations.
        ## Note: listFileBlockLocation() gets first the locations from PhEDEx, and if no
        ## locations are found it gets the original locations from DBS. So it should
        ## never be the case at this point that some blocks have no locations.
        ## locationsMap is a dictionary, key=blockName, value=list of PhedexNodes, example:
        ## {'/JetHT/Run2016B-PromptReco-v2/AOD#b10179dc-3723-11e6-9aa5-001e67abf228': [u'T1_IT_CNAF_Buffer', u'T2_US_Wisconsin', u'T1_IT_CNAF_MSS', u'T2_BE_UCL'],
        ##  '/JetHT/Run2016B-PromptReco-v2/AOD#89b03ca6-1dc9-11e6-b567-001e67ac06a0': [u'T1_IT_CNAF_Buffer', u'T2_US_Wisconsin', u'T1_IT_CNAF_MSS', u'T2_BE_UCL']}

        # For now apply Rucio data location only to NANOAOD*;
        # in time useRucioForLocations may become a richer expression
        isNano = blocks[0].split("#")[0].split("/")[-1] in ["NANOAOD", "NANOAODSIM"]
        if isNano:
            self.logger.info("NANOAOD* dataset. Will use Rucio for data location")
        useRucioForLocations = isNano
        locationsFoundWithRucio = False

        if not useRucioForLocations:
            self.logger.info("Will not use Rucio for this dataset")
        # if locations should be in Rucio, try it first and fall back to the old ways
        # if the Rucio calls fail or if they return no locations (possible Rucio
        # teething pain). If Rucio returns a list, trust it.
        if useRucioForLocations:
            locationsMap = {}
            scope = "cms"
            # If the dataset is a USER one, use the Rucio user scope to find it
            # TODO: we need a way to enable users to indicate other user scopes as source
            if isUserDataset:
                scope = "user.%s" % kwargs['task']['tm_username']
            rucio_config_dict = {
                "phedexCompatible": True,
                "auth_type": "x509",
                "ca_cert": self.config.Services.Rucio_caPath,
                "logger": self.logger,
                "creds": {"client_cert": self.config.TaskWorker.cmscert,
                          "client_key": self.config.TaskWorker.cmskey}
            }
            try:
                self.logger.info("Initializing Rucio client")
                # WMCore is awfully verbose
                with tempSetLogLevel(logger=self.logger, level=logging.ERROR):
                    rucioClient = Rucio(self.config.Services.Rucio_account,
                                        hostUrl=self.config.Services.Rucio_host,
                                        authUrl=self.config.Services.Rucio_authUrl,
                                        configDict=rucio_config_dict)
                    rucioClient.whoAmI()
                self.logger.info("Looking up data location with Rucio in %s scope.", scope)
                with tempSetLogLevel(logger=self.logger, level=logging.ERROR):
                    locations = rucioClient.getReplicaInfoForBlocks(scope=scope, block=list(blocks))
            except Exception as exc:
                msg = "Rucio lookup failed with\n%s" % str(exc)
                # TODO when removing the fall-back to PhEDEx, this should be a fatal error
                # raise TaskWorkerException(msg)
                self.logger.warn(msg)
                locations = None

            # TODO when removing the fall-back to PhEDEx, the above code will raise if it
            # fails, therefore the following "if" must be removed and the code shifted left
            if locations:
                located_blocks = locations['phedex']['block']
                for element in located_blocks:
                    # only fill the map for blocks which have at least one location
                    if element['replica']:
                        locationsMap.update({element['name']: [x['node'] for x in element['replica']]})
                if locationsMap:
                    locationsFoundWithRucio = True
                else:
                    msg = "No locations found with Rucio for this dataset"
                    # since NANO* are not in PhEDEx, this should be a fatal error
                    if isNano:
                        raise TaskWorkerException(msg)
                    else:
                        # note it down and try with PhEDEx
                        self.logger.warn(msg)

        if not locationsFoundWithRucio:
            # fall back to pre-Rucio methods
            try:
                self.logger.info("Looking up data locations using %s", PhEDExOrDBS)
                locationsMap = self.dbs.listFileBlockLocation(list(blocks), dbsOnly=isUserDataset)
            except Exception as ex:
                raise TaskWorkerException("The CRAB3 server backend could not get the location of the files from dbs nor phedex nor rucio.\n" +
                                          "This could be a temporary phedex/rucio/dbs glitch, please try to submit a new task (resubmit will not work)" +
                                          " and contact the experts if the error persists.\nError reason: %s" % str(ex))
        # only keep blocks which have at least one location
        locationsMap = {key: value for key, value in locationsMap.iteritems() if value}

        if secondaryDataset:
            secondaryLocationsMap = {}
            # see https://github.com/dmwm/CRABServer/issues/6075#issuecomment-641569446
            self.logger.info("Trying data location of secondary blocks with Rucio")
            try:
                locations = rucioClient.getReplicaInfoForBlocks(scope=scope, block=list(secondaryBlocks))
            except Exception as exc:
                locations = None
                secondaryLocationsMap = {}
                self.logger.warn("Rucio lookup failed with. %s", exc)
            if locations:
                located_blocks = locations['phedex']['block']
                for element in located_blocks:
                    # only fill the map for blocks which have at least one location
                    if element['replica']:
                        secondaryLocationsMap.update({element['name']: [x['node'] for x in element['replica']]})
            if not secondaryLocationsMap:
                msg = "No locations found with Rucio for the secondary dataset."
                # TODO when removing the fall-back to PhEDEx, this should be a fatal error
                # raise TaskWorkerException(msg)
                self.logger.warn(msg)
                self.logger.info("Trying data location of secondary blocks with PhEDEx")
                try:
                    secondaryLocationsMap = self.dbs.listFileBlockLocation(list(secondaryBlocks), dbsOnly=isUserDataset)
                except Exception as ex:
                    raise TaskWorkerException("The CRAB3 server backend could not get the location of the secondary dataset files from dbs or phedex or rucio.\n" +
                                              "This could be a temporary phedex/rucio/dbs glitch, please try to submit a new task (resubmit will not work)" +
                                              " and contact the experts if the error persists.\nError reason: %s" % str(ex))
            # only keep blocks which have at least one location
            secondaryLocationsMap = {key: value for key, value in secondaryLocationsMap.iteritems() if value}

        # From now on the code does not depend on whether Rucio or PhEDEx was used
        blocksWithLocation = locationsMap.keys()
        if secondaryDataset:
            secondaryBlocksWithLocation = secondaryLocationsMap.keys()

        self.keepOnlyDisks(locationsMap)
        if not locationsMap:
            msg = "Task could not be submitted because there is no DISK replica for dataset %s" % inputDataset
            if self.tapeLocations:
                msg += "\nN.B.: the input dataset is stored at %s, but those are TAPE locations." % ', '.join(sorted(self.tapeLocations))
                # submit a request to DDM
                ddmRequest = None
                ddmServer = self.config.TaskWorker.DDMServer
                try:
                    ddmRequest = blocksRequest(blocksWithLocation, ddmServer,
                                               self.config.TaskWorker.cmscert,
                                               self.config.TaskWorker.cmskey, verbose=False)
                except HTTPException as hte:
                    self.logger.exception(hte)
                    msg += "\nThe automatic stage-out failed, please try again later. If the error persists contact the experts and provide this error message:"
                    msg += "\nHTTP Error while contacting the DDM server %s:\n%s" % (ddmServer, str(hte))
                    msg += "\nHTTP Headers are: %s" % hte.headers
                    msg += "\nYou might want to contact your physics group if you need a disk replica."
                    raise TaskWorkerException(msg, retry=True)

                self.logger.info("Contacted %s using %s and %s, got:\n%s",
                                 self.config.TaskWorker.DDMServer,
                                 self.config.TaskWorker.cmscert,
                                 self.config.TaskWorker.cmskey, ddmRequest)
                # The query above returns a JSON with a format
                # {"result": "OK", "message": "Copy requested",
                #  "data": [{"request_id": 18, "site": <site>, "item": [<list of blocks>],
                #            "group": "AnalysisOps", "n": 1, "status": "new",
                #            "first_request": "2018-02-26 23:57:37",
                #            "last_request": "2018-02-26 23:57:37", "request_count": 1}]}
                if ddmRequest["result"] == "OK":
                    # set status to TAPERECALL
                    tapeRecallStatus = 'TAPERECALL'
                    ddmReqId = ddmRequest["data"][0]["request_id"]
                    configreq = {'workflow': taskName,
                                 'taskstatus': tapeRecallStatus,
                                 'ddmreqid': ddmReqId,
                                 'subresource': 'addddmreqid'}
                    try:
                        tapeRecallStatusSet = self.server.post(self.restURInoAPI + '/task',
                                                               data=urllib.urlencode(configreq))
                    except HTTPException as hte:
                        self.logger.exception(hte)
                        msg = "HTTP Error while contacting the REST Interface %s:\n%s" % (self.config.TaskWorker.restHost, str(hte))
                        msg += "\nSetting %s status and DDM request ID (%d) failed for task %s" % (tapeRecallStatus, ddmReqId, taskName)
                        msg += "\nHTTP Headers are: %s" % hte.headers
                        raise TaskWorkerException(msg, retry=True)
                    msg += "\nA disk replica has been requested on %s to CMS DDM (request ID: %d)" % (ddmRequest["data"][0]["first_request"], ddmReqId)
                    if tapeRecallStatusSet[2] == "OK":
                        self.logger.info("Status for task %s set to '%s'", taskName, tapeRecallStatus)
                        msg += "\nThis task will be automatically submitted as soon as the stage-out is completed."
                        self.uploadWarning(msg, userProxy, taskName)
                        raise TapeDatasetException(msg)
                    else:
                        msg += ", please try again in two days."
                else:
                    msg += "\nThe disk replica request failed with this error:\n %s" % ddmRequest["message"]
            msg += "\nPlease, check DAS (https://cmsweb.cern.ch/das) and make sure the dataset is accessible on DISK."
            raise TaskWorkerException(msg)

        # will not need lumi info if the user has asked for split by file with no run/lumi mask
        splitAlgo = kwargs['task']['tm_split_algo']
        lumiMask = kwargs['task']['tm_split_args']['lumis']
        runRange = kwargs['task']['tm_split_args']['runs']
        needLumiInfo = splitAlgo != 'FileBased' or lumiMask != [] or runRange != []
        # secondary dataset access relies on run/lumi info
        if secondaryDataset:
            needLumiInfo = True
        if needLumiInfo:
            # interested only in blocks with locations; 'blocks' may contain
            # invalid ones and trigger an Exception
            self.checkBlocksSize(blocksWithLocation)
            if secondaryDataset:
                self.checkBlocksSize(secondaryBlocksWithLocation)
        try:
            filedetails = self.dbs.listDatasetFileDetails(inputDataset, getParents=True,
                                                          getLumis=needLumiInfo, validFileOnly=0)
            if secondaryDataset:
                moredetails = self.dbs.listDatasetFileDetails(secondaryDataset, getParents=False,
                                                              getLumis=needLumiInfo, validFileOnly=0)
                for secfilename, secinfos in moredetails.items():
                    secinfos['lumiobj'] = LumiList(runsAndLumis=secinfos['Lumis'])
                self.logger.info("Beginning to match files from secondary dataset")
                for dummyFilename, infos in filedetails.items():
                    infos['Parents'] = []
                    lumis = LumiList(runsAndLumis=infos['Lumis'])
                    for secfilename, secinfos in moredetails.items():
                        if lumis & secinfos['lumiobj']:
                            infos['Parents'].append(secfilename)
                self.logger.info("Done matching files from secondary dataset")
                kwargs['task']['tm_use_parent'] = 1
        except Exception as ex:  # TODO should we catch HTTPException instead?
            self.logger.exception(ex)
            raise TaskWorkerException("The CRAB3 server backend could not contact DBS to get the files details (Lumis, events, etc).\n" +
                                      "This could be a temporary DBS glitch. Please try to submit a new task (resubmit will not work)" +
                                      " and contact the experts if the error persists.\nError reason: %s" % str(ex))
            # TODO: add the phedex nodes to the message so the user can check themselves
        if not filedetails:
            raise TaskWorkerException(("Cannot find any file inside the dataset. Please, check your dataset in DAS, %s.\n" +
                                       "Aborting submission. Resubmitting your task will not help.") %
                                      ("https://cmsweb.cern.ch/das/request?instance=%s&input=dataset=%s") %
                                      (self.dbsInstance, inputDataset))

        ## Format the output creating the data structures required by WMCore. Filter out invalid files,
        ## files whose block has no location, and figure out the PSN
        result = self.formatOutput(task=kwargs['task'], requestname=taskName,
                                   datasetfiles=filedetails, locations=locationsMap,
                                   tempDir=kwargs['tempDir'])
        if not result.result:
            raise TaskWorkerException(("Cannot find any valid file inside the dataset. Please, check your dataset in DAS, %s.\n" +
                                       "Aborting submission. Resubmitting your task will not help.") %
                                      ("https://cmsweb.cern.ch/das/request?instance=%s&input=dataset=%s") %
                                      (self.dbsInstance, inputDataset))
        self.logger.debug("Got %s files", len(result.result.getFiles()))
        return result
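
# A self-contained sketch of the disk-only filtering idea implemented by
# keepOnlyDisks() above, with a hard-coded disk-node set standing in for the
# PhEDEx node-map lookup. All node and block names here are illustrative.
toyLocationsMap = {
    '/JetHT/Run2016B-PromptReco-v2/AOD#b10179dc': ['T1_IT_CNAF_Buffer', 'T2_US_Wisconsin', 'T1_IT_CNAF_MSS'],
    '/JetHT/Run2016B-PromptReco-v2/AOD#89b03ca6': ['T1_IT_CNAF_MSS'],
}
toyDiskNodes = {'T2_US_Wisconsin'}  # would come from phedex.getNodeMap(), kind == 'Disk'

toyTapeOnly = set()
for blockName, nodes in list(toyLocationsMap.items()):
    if set(nodes) & toyDiskNodes:
        pass  # keep the block: it has at least one disk replica
    else:
        toyTapeOnly.update(set(nodes) - toyDiskNodes)  # remember TAPE endpoints
        del toyLocationsMap[blockName]                 # drop tape-only blocks
# toyLocationsMap now holds only the first block; toyTapeOnly feeds the
# "no DISK replica" message, just as self.tapeLocations does above.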
class DBSReaderTest(unittest.TestCase):

    def setUp(self):
        """
        _setUp_

        Initialize the PhEDEx API to point at the test server.
        """
        #self.endpoint = "http://cmsdbsprod.cern.ch/cms_dbs_prod_global/servlet/DBSServlet"
        self.endpoint = 'https://cmsweb.cern.ch/dbs/prod/global/DBSReader'
        self.dbs = None
        return

    @attr("integration")
    def testListDatatiers(self):
        """
        listDatatiers returns all datatiers available
        """
        self.dbs = DBSReader(self.endpoint)
        results = self.dbs.listDatatiers()
        self.assertTrue('RAW' in results)
        self.assertTrue('GEN-SIM-RECO' in results)
        self.assertTrue('GEN-SIM' in results)
        self.assertFalse('RAW-ALAN' in results)
        return

    @attr("integration")
    def testListPrimaryDatasets(self):
        """
        listPrimaryDatasets returns known primary datasets
        """
        self.dbs = DBSReader(self.endpoint)
        results = self.dbs.listPrimaryDatasets('Jet*')
        self.assertTrue('Jet' in results)
        self.assertTrue('JetMET' in results)
        self.assertTrue('JetMETTau' in results)
        self.assertFalse(self.dbs.listPrimaryDatasets('DoesntExist'))
        return

    @attr("integration")
    def testMatchProcessedDatasets(self):
        """
        matchProcessedDatasets returns known processed datasets
        """
        self.dbs = DBSReader(self.endpoint)
        dataset = self.dbs.matchProcessedDatasets('Jet', 'RAW', 'Run2011A-v1')
        self.assertEqual(1, len(dataset))
        self.assertEqual(['/Jet/Run2011A-v1/RAW'], dataset[0]['PathList'])
        self.assertEqual('Run2011A-v1', dataset[0]['Name'])
        self.assertFalse(self.dbs.matchProcessedDatasets('Jet', 'RAW', 'Run2011A-v666'))

    def testlistRuns(self):
        """listRuns returns known runs"""
        self.dbs = DBSReader(self.endpoint)
        runs = self.dbs.listRuns(dataset=DATASET)
        self.assertEqual(46, len(runs))
        self.assertTrue(174074 in runs)
        runs = self.dbs.listRuns(block=BLOCK)
        self.assertEqual(1, len(runs))
        self.assertEqual([173657], runs)

    def testlistRunLumis(self):
        """listRunLumis returns known runs and lumicounts (None for DBS3)"""
        self.dbs = DBSReader(self.endpoint)
        runs = self.dbs.listRunLumis(dataset=DATASET)
        self.assertEqual(46, len(runs))
        self.assertTrue(173692 in runs)
        self.assertEqual(runs[173692], None)
        runs = self.dbs.listRunLumis(block=BLOCK)
        self.assertEqual(1, len(runs))
        self.assertTrue(173657 in runs)
        self.assertEqual(runs[173657], None)

    @attr("integration")
    def testListProcessedDatasets(self):
        """listProcessedDatasets returns known processed datasets"""
        self.dbs = DBSReader(self.endpoint)
        datasets = self.dbs.listProcessedDatasets('Jet', 'RAW')
        self.assertTrue('Run2011A-v1' in datasets)
        self.assertTrue('Run2011B-v1' in datasets)
        self.assertFalse(self.dbs.listProcessedDatasets('Jet', 'blah'))
        self.assertFalse(self.dbs.listProcessedDatasets('blah', 'RAW'))

    def testlistDatasetFiles(self):
        """listDatasetFiles returns files in dataset"""
        self.dbs = DBSReader(self.endpoint)
        files = self.dbs.listDatasetFiles(DATASET)
        self.assertEqual(49, len(files))
        self.assertTrue(FILE in files)

    def testlistDatasetFileDetails(self):
        """listDatasetFileDetails returns lumis, events, and parents of a dataset"""
        TESTFILE = '/store/data/Run2011A/HighPileUp/RAW/v1/000/173/658/56484BAB-CBCB-E011-AF00-BCAEC518FF56.root'
        self.dbs = DBSReader(self.endpoint)
        details = self.dbs.listDatasetFileDetails(DATASET)
        self.assertEqual(len(details), 49)
        self.assertTrue(TESTFILE in details)
        self.assertEqual(details[TESTFILE]['NumberOfEvents'], 545)
        self.assertEqual(details[TESTFILE]['file_size'], 286021145)
        self.assertEqual(details[TESTFILE]['BlockName'],
                         '/HighPileUp/Run2011A-v1/RAW#dd6e0796-cbcc-11e0-80a9-003048caaace')
        self.assertEqual(details[TESTFILE]['Md5'], 'NOTSET')
        self.assertEqual(details[TESTFILE]['md5'], 'NOTSET')
        self.assertEqual(details[TESTFILE]['Adler32'], 'a41a1446')
        self.assertEqual(details[TESTFILE]['adler32'], 'a41a1446')
        self.assertEqual(details[TESTFILE]['Checksum'], '22218315')
        self.assertEqual(details[TESTFILE]['check_sum'], '22218315')
        self.assertTrue(173658 in details[TESTFILE]['Lumis'])
        self.assertEqual(sorted(details[TESTFILE]['Lumis'][173658]),
                         [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
                          21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38,
                          39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56,
                          57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
                          75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92,
                          93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108,
                          109, 110, 111])

    def testGetDBSSummaryInfo(self):
        """getDBSSummaryInfo returns summary of dataset and block"""
        self.dbs = DBSReader(self.endpoint)
        dataset = self.dbs.getDBSSummaryInfo(DATASET)
        self.assertEqual(dataset['path'], DATASET)
        self.assertEqual(dataset['block'], '')
        self.assertEqual(dataset['NumberOfEvents'], 22075)
        self.assertEqual(dataset['NumberOfBlocks'], 46)
        self.assertEqual(dataset['FileSize'], 4001680824)
        self.assertEqual(dataset['file_size'], 4001680824)
        self.assertEqual(dataset['NumberOfFiles'], 49)
        self.assertEqual(dataset['NumberOfLumis'], 7223)

        block = self.dbs.getDBSSummaryInfo(DATASET, BLOCK)
        self.assertEqual(block['path'], '')
        self.assertEqual(block['block'], BLOCK)
        self.assertEqual(block['NumberOfEvents'], 377)
        self.assertEqual(block['NumberOfBlocks'], 1)
        self.assertEqual(block['FileSize'], 150780132)
        self.assertEqual(block['file_size'], 150780132)
        self.assertEqual(block['NumberOfFiles'], 2)
        self.assertEqual(block['NumberOfLumis'], 94)

        self.assertRaises(DBSReaderError, self.dbs.getDBSSummaryInfo, DATASET + 'blah')
        self.assertRaises(DBSReaderError, self.dbs.getDBSSummaryInfo, DATASET, BLOCK + 'asas')

    @attr("integration")
    def testGetFileBlocksInfo(self):
        """getFileBlocksInfo returns block info, including location lookup"""
        self.dbs = DBSReader(self.endpoint)
        blocks = self.dbs.getFileBlocksInfo(DATASET)
        block = self.dbs.getFileBlocksInfo(DATASET, blockName=BLOCK)
        self.assertEqual(1, len(block))
        block = block[0]
        self.assertEqual(46, len(blocks))
        self.assertTrue(block['Name'] in [x['Name'] for x in blocks])
        self.assertEqual(BLOCK, block['Name'])
        self.assertEqual(0, block['OpenForWriting'])
        self.assertEqual(150780132, block['BlockSize'])
        self.assertEqual(2, block['NumberOfFiles'])
        # possibly fragile but assume block located at least at cern
        sites = [x['Name'] for x in block['PhEDExNodeList']
                 if x['Name'].find('CH_CERN') > -1]
        self.assertTrue(sites)
        # weird error handling - depends on whether block or dataset is missing
        self.assertRaises(DBSReaderError, self.dbs.getFileBlocksInfo, DATASET + 'blah')
        self.assertRaises(DBSReaderError, self.dbs.getFileBlocksInfo, DATASET,
                          blockName=BLOCK + 'asas')

    def testListFileBlocks(self):
        """listFileBlocks returns block names in dataset"""
        self.dbs = DBSReader(self.endpoint)
        blocks = self.dbs.listFileBlocks(DATASET)
        self.assertTrue(BLOCK in blocks)
        # block is closed
        block = self.dbs.listFileBlocks(DATASET, blockName=BLOCK, onlyClosedBlocks=True)[0]
        self.assertEqual(block, BLOCK)
        self.assertTrue(BLOCK in block)

    def testListOpenFileBlocks(self):
        """listOpenFileBlocks finds open blocks"""
        # hard to find a dataset with open blocks, so don't bother
        self.dbs = DBSReader(self.endpoint)
        self.assertFalse(self.dbs.listOpenFileBlocks(DATASET))

    def testBlockExists(self):
        """blockExists returns existence of blocks"""
        self.dbs = DBSReader(self.endpoint)
        self.assertTrue(self.dbs.blockExists(BLOCK))
        self.assertRaises(DBSReaderError, self.dbs.blockExists, DATASET + '#somethingelse')

    def testListFilesInBlock(self):
        """listFilesInBlock returns files in block"""
        self.dbs = DBSReader(self.endpoint)
        self.assertTrue(FILE in
                        [x['LogicalFileName'] for x in self.dbs.listFilesInBlock(BLOCK)])
        self.assertRaises(DBSReaderError, self.dbs.listFilesInBlock, DATASET + '#blah')

    def testListFilesInBlockWithParents(self):
        """listFilesInBlockWithParents gets files with parents for a block"""
        self.dbs = DBSReader(self.endpoint)
        files = self.dbs.listFilesInBlockWithParents(
            '/Cosmics/Commissioning2015-PromptReco-v1/RECO#004ac3ba-d09e-11e4-afad-001e67ac06a0')
        self.assertEqual(4, len(files))
        self.assertEqual('/Cosmics/Commissioning2015-PromptReco-v1/RECO#004ac3ba-d09e-11e4-afad-001e67ac06a0',
                         files[0]['block_name'])
        self.assertEqual('/Cosmics/Commissioning2015-PromptReco-v1/RECO#004ac3ba-d09e-11e4-afad-001e67ac06a0',
                         files[0]['BlockName'])
        self.assertEqual('/store/data/Commissioning2015/Cosmics/RAW/v1/000/238/545/00000/1043E89F-2DCF-E411-9CAE-02163E013751.root',
                         files[0]['ParentList'][0]['LogicalFileName'])
        self.assertRaises(DBSReaderError, self.dbs.listFilesInBlockWithParents, BLOCK + 'asas')

    def testLfnsInBlock(self):
        """lfnsInBlock returns lfns in block"""
        self.dbs = DBSReader(self.endpoint)
        self.assertTrue(FILE in
                        [x['logical_file_name'] for x in self.dbs.lfnsInBlock(BLOCK)])
        self.assertRaises(DBSReaderError, self.dbs.lfnsInBlock, BLOCK + 'asas')

    @attr("integration")
    def testListFileBlockLocation(self):
        """listFileBlockLocation returns block location"""
        WRONG_BLOCK = BLOCK[:-4] + 'abcd'
        BLOCK2 = '/HighPileUp/Run2011A-v1/RAW#6021175e-cbfb-11e0-80a9-003048caaace'
        DBS_BLOCK = '/GenericTTbar/hernan-140317_231446_crab_JH_ASO_test_T2_ES_CIEMAT_5000_100_140318_0014-' + \
                    'ea0972193530f531086947d06eb0f121/USER#fb978442-a61b-413a-b4f4-526e6cdb142e'
        DBS_BLOCK2 = '/GenericTTbar/hernan-140317_231446_crab_JH_ASO_test_T2_ES_CIEMAT_5000_100_140318_0014-' + \
                     'ea0972193530f531086947d06eb0f121/USER#0b04d417-d734-4ef2-88b0-392c48254dab'
        self.dbs = DBSReader('https://cmsweb.cern.ch/dbs/prod/phys03/DBSReader/')
        # assume one site is cern
        sites = [x for x in self.dbs.listFileBlockLocation(BLOCK)
                 if x and x.find('CH_CERN') > -1]
        self.assertTrue(sites)
        # this block is only found in DBS
        self.assertTrue(self.dbs.listFileBlockLocation(DBS_BLOCK))
        # doesn't raise on non-existent block
        self.assertFalse(self.dbs.listFileBlockLocation(WRONG_BLOCK))
        # test the bulk call:
        ## two blocks in phedex
        self.assertEqual(2, len(self.dbs.listFileBlockLocation([BLOCK, BLOCK2])))
        ## one block in phedex, one does not exist
        self.assertEqual(1, len(self.dbs.listFileBlockLocation([BLOCK, WRONG_BLOCK])))
        ## one in phedex, one in dbs
        self.assertEqual(2, len(self.dbs.listFileBlockLocation([BLOCK, DBS_BLOCK])))
        ## two in dbs
        self.assertEqual(2, len(self.dbs.listFileBlockLocation([DBS_BLOCK, DBS_BLOCK2])))
        ## one in DBS and one does not exist
        self.assertEqual(1, len(self.dbs.listFileBlockLocation([DBS_BLOCK, WRONG_BLOCK])))

    def testGetFileBlock(self):
        """getFileBlock returns block"""
        self.dbs = DBSReader(self.endpoint)
        block = self.dbs.getFileBlock(BLOCK)
        self.assertEqual(len(block), 1)
        block = block[BLOCK]
        self.assertEqual(2, len(block['Files']))
        self.assertRaises(DBSReaderError, self.dbs.getFileBlock, BLOCK + 'asas')

    def testGetFileBlockWithParents(self):
        """getFileBlockWithParents returns block and parents"""
        self.dbs = DBSReader(self.endpoint)
        block = self.dbs.getFileBlockWithParents(
            '/Cosmics/Commissioning2015-PromptReco-v1/RECO#004ac3ba-d09e-11e4-afad-001e67ac06a0')
        self.assertEqual(len(block), 1)
        block = block['/Cosmics/Commissioning2015-PromptReco-v1/RECO#004ac3ba-d09e-11e4-afad-001e67ac06a0']
        self.assertEqual('/store/data/Commissioning2015/Cosmics/RAW/v1/000/238/545/00000/1043E89F-2DCF-E411-9CAE-02163E013751.root',
                         block['Files'][0]['ParentList'][0]['LogicalFileName'])
        self.assertRaises(DBSReaderError, self.dbs.getFileBlockWithParents, BLOCK + 'asas')

    def testGetFiles(self):
        """getFiles returns files in dataset"""
        self.dbs = DBSReader(self.endpoint)
        files = self.dbs.getFiles(DATASET)
        self.assertEqual(len(files), 46)

    def testListBlockParents(self):
        """listBlockParents returns block parents"""
        self.dbs = DBSReader(self.endpoint)
        parents = self.dbs.listBlockParents(
            '/Cosmics/Commissioning2015-PromptReco-v1/RECO#004ac3ba-d09e-11e4-afad-001e67ac06a0')
        self.assertEqual(1, len(parents))
        self.assertEqual('/Cosmics/Commissioning2015-v1/RAW#942d76fe-cf0e-11e4-afad-001e67ac06a0',
                         parents[0]['Name'])
        sites = [x for x in parents[0]['PhEDExNodeList'] if x.find("CH_CERN") > -1]
        self.assertTrue(sites)
        self.assertFalse(self.dbs.listBlockParents(
            '/Cosmics/Commissioning2015-v1/RAW#942d76fe-cf0e-11e4-afad-001e67ac06a0'))

    def testBlockIsOpen(self):
        """blockIsOpen checks if a block is open"""
        self.dbs = DBSReader(self.endpoint)
        self.assertFalse(self.dbs.blockIsOpen(BLOCK))

    def testBlockToDatasetPath(self):
        """blockToDatasetPath extracts path from block name"""
        self.dbs = DBSReader(self.endpoint)
        self.assertEqual(self.dbs.blockToDatasetPath(BLOCK), DATASET)
        self.assertRaises(DBSReaderError, self.dbs.blockToDatasetPath, BLOCK + 'asas')
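
# The per-file map returned by listDatasetFileDetails (DBS3 flavour, with the
# 'file_size' and 'NumberOfEvents' keys asserted above) is easy to aggregate.
# summarizeDetails is a hypothetical helper sketched here, not a DBSReader API.
def summarizeDetails(details):
    """Reduce a {lfn: info} map to whole-dataset counters."""
    return {'files': len(details),
            'events': sum(info['NumberOfEvents'] for info in details.values()),
            'size': sum(info['file_size'] for info in details.values())}

# For the HighPileUp dataset used in these tests this should match the
# summary assertions: 49 files, 22075 events, 4001680824 bytes.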
class DBSDataDiscovery(DataDiscovery): """Performing the data discovery through CMS DBS service. """ def checkDatasetStatus(self, dataset, kwargs): res = self.dbs.dbs.listDatasets(dataset=dataset, detail=1, dataset_access_type='*') if len(res) > 1: raise TaskWorkerException( "Found more than one dataset while checking in DBS the status of %s" % dataset) if len(res) == 0: raise TaskWorkerException( "Cannot find dataset %s in %s DBS instance" % (dataset, self.dbsInstance)) res = res[0] self.logger.info("Input dataset details: %s", pprint.pformat(res)) accessType = res['dataset_access_type'] if accessType != 'VALID': # as per Dima's suggestion https://github.com/dmwm/CRABServer/issues/4739 msgForDeprecDS = "Please contact your physics group if you think the dataset should not be deprecated." if kwargs['task']['tm_nonvalid_input_dataset'] != 'T': msg = "CRAB refuses to proceed in getting the details of the dataset %s from DBS, because the dataset is not 'VALID' but '%s'." % ( dataset, accessType) if accessType == 'DEPRECATED': msg += " (%s)" % (msgForDeprecDS) msg += " To allow CRAB to consider a dataset that is not 'VALID', set Data.allowNonValidInputDataset = True in the CRAB configuration." msg += " Notice that this will not force CRAB to run over all files in the dataset;" msg += " CRAB will still check if there are any valid files in the dataset and run only over those files." raise TaskWorkerException(msg) msg = "The input dataset %s is not 'VALID' but '%s'." % ( dataset, accessType) msg += " CRAB will check if there are any valid files in the dataset and run only over those files." if accessType == 'DEPRECATED': msg += " %s" % (msgForDeprecDS) self.uploadWarning(msg, kwargs['task']['user_proxy'], kwargs['task']['tm_taskname']) return def keepOnlyDisks(self, locationsMap): self.otherLocations = set() phedex = PhEDEx() #TODO use certs from the config! #get all the PNN that are of kind disk try: diskLocations = set([ pnn['name'] for pnn in phedex.getNodeMap()['phedex']['node'] if pnn['kind'] == 'Disk' ]) except HTTPException as ex: self.logger.error(ex.headers) raise TaskWorkerException("The CRAB3 server backend could not contact phedex to get the list of site storages.\n"+\ "This is could be a temporary phedex glitch, please try to submit a new task (resubmit will not work)"+\ " and contact the experts if the error persists.\nError reason: %s" % str(ex)) #TODO addo the nodes phedex so the user can check themselves for block, locations in locationsMap.iteritems(): locationsMap[block] = set(locations) & diskLocations self.otherLocations = self.otherLocations.union( set(locations) - diskLocations) #remove any key with value that has set([]) for key, value in locationsMap.items(): #wont work in python3! if value == set([]): locationsMap.pop(key) def checkBlocksSize(self, blocks): """ Make sure no single blocks has more than 100k lumis. See https://hypernews.cern.ch/HyperNews/CMS/get/dmDevelopment/2022/1/1/1/1/1/1/2.html """ MAX_LUMIS = 100000 for block in blocks: blockInfo = self.dbs.getDBSSummaryInfo(block=block) if blockInfo['NumberOfLumis'] > MAX_LUMIS: msg = "Block %s contains more than %s lumis.\nThis blows up CRAB server memory" % ( block, MAX_LUMIS) msg += "\nCRAB can only split this by ignoring lumi information. You can do this" msg += "\nusing FileBased split algorithm and avoiding any additional request" msg += "\nwich may cause lumi information to be looked up. 
See CRAB FAQ for more info:" msg += "\nhttps://twiki.cern.ch/twiki/bin/view/CMSPublic/CRAB3FAQ" raise TaskWorkerException(msg) def execute(self, *args, **kwargs): """ This is a convenience wrapper around the executeInternal function """ # DBS3 requires X509_USER_CERT to be set - but we don't want to leak that to other modules # so use a context manager to set an ad hoc env and restore as soon as # executeInternal is over, even if it raises exception with self.config.TaskWorker.envForCMSWEB: result = self.executeInternal(*args, **kwargs) return result def executeInternal(self, *args, **kwargs): self.logger.info( "Data discovery with DBS") ## to be changed into debug dbsurl = self.config.Services.DBSUrl if kwargs['task']['tm_dbs_url']: dbsurl = kwargs['task']['tm_dbs_url'] self.dbs = DBSReader(dbsurl) self.dbsInstance = self.dbs.dbs.serverinfo()["dbs_instance"] taskName = kwargs['task']['tm_taskname'] self.logger.debug("Data discovery through %s for %s", self.dbs, taskName) inputDataset = kwargs['task']['tm_input_dataset'] secondaryDataset = kwargs['task'].get('tm_secondary_input_dataset', None) self.checkDatasetStatus(inputDataset, kwargs) if secondaryDataset: self.checkDatasetStatus(secondaryDataset, kwargs) try: # Get the list of blocks for the locations and then call dls. # The WMCore DBS3 implementation makes one call to dls for each block # with locations = True so we are using locations=False and looking up location later blocks = [ x['Name'] for x in self.dbs.getFileBlocksInfo(inputDataset, locations=False) ] if secondaryDataset: secondaryBlocks = [ x['Name'] for x in self.dbs.getFileBlocksInfo(secondaryDataset, locations=False) ] except DBSReaderError as dbsexc: #dataset not found in DBS is a known use case if str(dbsexc).find('No matching data'): raise TaskWorkerException( "CRAB could not find dataset %s in this DBS instance: %s" % inputDataset, dbsurl) raise ## Create a map for block's locations: for each block get the list of locations. ## Note: listFileBlockLocation() gets first the locations from PhEDEx, and if no ## locations are found it gets the original locations from DBS. So it should ## never be the case at this point that some blocks have no locations. ## locationsMap is a dictionary, key=blockName, value=list of PhedexNodes, example: ## {'/JetHT/Run2016B-PromptReco-v2/AOD#b10179dc-3723-11e6-9aa5-001e67abf228': [u'T1_IT_CNAF_Buffer', u'T2_US_Wisconsin', u'T1_IT_CNAF_MSS', u'T2_BE_UCL'], ## '/JetHT/Run2016B-PromptReco-v2/AOD#89b03ca6-1dc9-11e6-b567-001e67ac06a0': [u'T1_IT_CNAF_Buffer', u'T2_US_Wisconsin', u'T1_IT_CNAF_MSS', u'T2_BE_UCL'} try: dbsOnly = self.dbsInstance.split('/')[1] != 'global' locationsMap = self.dbs.listFileBlockLocation(list(blocks), dbsOnly=dbsOnly) except Exception as ex: #TODO should we catch HttpException instead? self.logger.exception(ex) raise TaskWorkerException("The CRAB3 server backend could not get the location of the files from dbs or phedex.\n"+\ "This is could be a temporary phedex/dbs glitch, please try to submit a new task (resubmit will not work)"+\ " and contact the experts if the error persists.\nError reason: %s" % str(ex)) self.keepOnlyDisks(locationsMap) if not locationsMap: msg = "Task could not be submitted because there is no DISK replica for dataset %s" % inputDataset if self.otherLocations: msg += "\nN.B.: the input dataset is stored at %s, but those are TAPE locations." 
% ', '.join( sorted(self.otherLocations)) # submit request to DDM ddmRequest = blocksRequest(blocks, self.config.TaskWorker.DDMServer, self.config.TaskWorker.cmscert, self.config.TaskWorker.cmskey, verbose=False) self.logger.info("Contacted %s using %s and %s, got:\n%s", self.config.TaskWorker.DDMServer, self.config.TaskWorker.cmscert, self.config.TaskWorker.cmskey, ddmRequest) # The query above returns a JSON with a format {"result": "OK", "message": "Copy requested", "data": [{"request_id": 18, "site": <site>, "item": [<list of blocks>], "group": "AnalysisOps", "n": 1, "status": "new", "first_request": "2018-02-26 23:57:37", "last_request": "2018-02-26 23:57:37", "request_count": 1}]} if ddmRequest["result"] == "OK": msg += "\nA disk replica has been requested on %s" % ddmRequest[ "data"][0]["first_request"] # set status to TAPERECALL tapeRecallStatus = 'TAPERECALL' ddmReqId = ddmRequest["data"][0]["request_id"] server = HTTPRequests( url=self.config.TaskWorker.resturl, localcert=kwargs['task']['user_proxy'], localkey=kwargs['task']['user_proxy'], verbose=False) configreq = { 'workflow': taskName, 'taskstatus': tapeRecallStatus, 'ddmreqid': ddmReqId, 'subresource': 'addddmreqid' } try: tapeRecallStatusSet = server.post( self.config.TaskWorker.restURInoAPI + 'task', data=urllib.urlencode(configreq)) except HTTPException as hte: msg = "HTTP Error while contacting the REST Interface %s:\n%s" % ( self.config.TaskWorker.resturl, str(hte)) msg += "\nSetting %s status and DDM request ID (%d) failed for task %s" % ( tapeRecallStatus, ddmReqId, taskName) msg += "\nHTTP Headers are: %s" % hte.headers raise TaskWorkerException(msg, retry=True) if tapeRecallStatusSet[2] == "OK": self.logger.info("Status for task %s set to '%s'", taskName, tapeRecallStatus) msg += " and the task will be submitted as soon as it is completed." self.uploadWarning(msg, kwargs['task']['user_proxy'], taskName) raise TapeDatasetException(msg) else: msg += ", please try again in two days." msg += "\nPlease, check DAS (https://cmsweb.cern.ch/das) and make sure the dataset is accessible on DISK." msg += " You might want to contact your physics group if you need a disk replica." 
raise TaskWorkerException(msg) if len(blocks) != len(locationsMap): self.logger.warning( "The locations of some blocks have not been found: %s", set(blocks) - set(locationsMap)) # will not need lumi info if user has asked for split by file with no run/lumi mask splitAlgo = kwargs['task']['tm_split_algo'] lumiMask = kwargs['task']['tm_split_args']['lumis'] runRange = kwargs['task']['tm_split_args']['runs'] needLumiInfo = splitAlgo != 'FileBased' or lumiMask != [] or runRange != [] # secondary dataset access relies on run/lumi info if secondaryDataset: needLumiInfo = True if needLumiInfo: self.checkBlocksSize(blocks) if secondaryDataset: self.checkBlocksSize(secondaryBlocks) try: filedetails = self.dbs.listDatasetFileDetails( inputDataset, getParents=True, getLumis=needLumiInfo, validFileOnly=0) if secondaryDataset: moredetails = self.dbs.listDatasetFileDetails( secondaryDataset, getParents=False, getLumis=needLumiInfo, validFileOnly=0) for secfilename, secinfos in moredetails.items(): secinfos['lumiobj'] = LumiList( runsAndLumis=secinfos['Lumis']) self.logger.info( "Beginning to match files from secondary dataset") for dummyFilename, infos in filedetails.items(): infos['Parents'] = [] lumis = LumiList(runsAndLumis=infos['Lumis']) for secfilename, secinfos in moredetails.items(): if (lumis & secinfos['lumiobj']): infos['Parents'].append(secfilename) self.logger.info("Done matching files from secondary dataset") kwargs['task']['tm_use_parent'] = 1 except Exception as ex: #TODO should we catch HttpException instead? self.logger.exception(ex) raise TaskWorkerException("The CRAB3 server backend could not contact DBS to get the files details (Lumis, events, etc).\n"+\ "This could be a temporary DBS glitch. Please try to submit a new task (resubmit will not work)"+\ " and contact the experts if the error persists.\nError reason: %s" % str(ex)) #TODO add the phedex nodes so the user can check themselves if not filedetails: raise TaskWorkerException(( "Cannot find any file inside the dataset. Please, check your dataset in DAS, %s.\n" "Aborting submission. Resubmitting your task will not help." ) % ( "https://cmsweb.cern.ch/das/request?instance=%s&input=dataset=%s" ) % (self.dbsInstance, inputDataset)) ## Format the output creating the data structures required by wmcore. Filters out invalid files, ## files whose block has no location, and figures out the PSN result = self.formatOutput(task=kwargs['task'], requestname=taskName, datasetfiles=filedetails, locations=locationsMap, tempDir=kwargs['tempDir']) if not result.result: raise TaskWorkerException(( "Cannot find any valid file inside the dataset. Please, check your dataset in DAS, %s.\n" "Aborting submission. Resubmitting your task will not help." ) % ( "https://cmsweb.cern.ch/das/request?instance=%s&input=dataset=%s" ) % (self.dbsInstance, inputDataset)) self.logger.debug("Got %s files", len(result.result.getFiles())) return result
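# A minimal, self-contained sketch of the secondary-dataset matching performed in
# executeInternal() above, assuming WMCore's LumiList API (LumiList(runsAndLumis=...)
# and the & intersection operator, exactly as the code uses them). The file names
# and lumi numbers below are illustrative only.
from WMCore.DataStructs.LumiList import LumiList

primary = {'primary_file.root': {'Lumis': {173658: [1, 2, 3]}}}      # toy stand-in for filedetails
secondary = {'secondary_file.root': {'Lumis': {173658: [3, 4, 5]}}}  # toy stand-in for moredetails

# precompute a LumiList per secondary file, as the code above does
for secinfos in secondary.values():
    secinfos['lumiobj'] = LumiList(runsAndLumis=secinfos['Lumis'])

for infos in primary.values():
    infos['Parents'] = []
    lumis = LumiList(runsAndLumis=infos['Lumis'])
    for secfilename, secinfos in secondary.items():
        # a non-empty run/lumi overlap makes the secondary file a "parent"
        if lumis & secinfos['lumiobj']:
            infos['Parents'].append(secfilename)

print(primary['primary_file.root']['Parents'])  # expect: ['secondary_file.root']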
class DBSReaderTest(unittest.TestCase): def setUp(self): """ _setUp_ Initialize the PhEDEx API to point at the test server. """ #endpoint = "http://cmsdbsprod.cern.ch/cms_dbs_prod_global/servlet/DBSServlet" endpoint = 'https://cmsweb.cern.ch/dbs/prod/global/DBSReader' self.dbs = DBSReader(endpoint) return @attr("integration") def testListPrimaryDatasets(self): """ listPrimaryDatasets returns known primary datasets """ results = self.dbs.listPrimaryDatasets('Jet*') self.assertTrue('Jet' in results) self.assertTrue('JetMET' in results) self.assertTrue('JetMETTau' in results) self.assertFalse(self.dbs.listPrimaryDatasets('DoesntExist')) return @attr("integration") def testMatchProcessedDatasets(self): """ matchProcessedDatasets returns known processed datasets """ dataset = self.dbs.matchProcessedDatasets('Jet', 'RAW', 'Run2011A-v1') self.assertEqual(1, len(dataset)) self.assertEqual(['/Jet/Run2011A-v1/RAW'], dataset[0]['PathList']) self.assertEqual('Run2011A-v1', dataset[0]['Name']) self.assertFalse(self.dbs.matchProcessedDatasets('Jet', 'RAW', 'Run2011A-v666')) @attr("integration") def testlistRuns(self): """listRuns returns known runs""" runs = self.dbs.listRuns(dataset = DATASET) self.assertEqual(46, len(runs)) self.assertTrue(174074 in runs) runs = self.dbs.listRuns(dataset = DATASET, block = BLOCK) self.assertEqual([173657], runs) @attr("integration") def testListProcessedDatasets(self): """listProcessedDatasets returns known processed datasets""" datasets = self.dbs.listProcessedDatasets('Jet', 'RAW') self.assertTrue('Run2011A-v1' in datasets) self.assertTrue('Run2011B-v1' in datasets) self.assertFalse(self.dbs.listProcessedDatasets('Jet', 'blah')) self.assertFalse(self.dbs.listProcessedDatasets('blah', 'RAW')) @attr("integration") def testlistDatasetFiles(self): """listDatasetFiles returns files in dataset""" files = self.dbs.listDatasetFiles(DATASET) self.assertEqual(49, len(files)) self.assertTrue(FILE in files) @attr("integration") def testGetDBSSummaryInfo(self): """getDBSSummaryInfo returns summary of dataset and block""" dataset = self.dbs.getDBSSummaryInfo(DATASET) self.assertEqual(dataset['path'], DATASET) self.assertEqual(dataset['block'], '') self.assertEqual(dataset['NumberOfEvents'], '22075') self.assertEqual(dataset['NumberOfBlocks'], '46') self.assertEqual(dataset['total_size'], '4001680824') self.assertEqual(dataset['NumberOfFiles'], '49') self.assertEqual(dataset['NumberOfLumis'], '7223') block = self.dbs.getDBSSummaryInfo(DATASET, BLOCK) self.assertEqual(block['path'], '') self.assertEqual(block['block'], BLOCK) self.assertEqual(block['NumberOfEvents'], '377') self.assertEqual(block['NumberOfBlocks'], '1') self.assertEqual(block['total_size'], '150780132') self.assertEqual(block['NumberOfFiles'], '2') self.assertEqual(block['NumberOfLumis'], '94') self.assertRaises(DBSReaderError, self.dbs.getDBSSummaryInfo, DATASET + 'blah') self.assertRaises(DBSReaderError, self.dbs.getDBSSummaryInfo, DATASET, BLOCK + 'asas') @attr("integration") def testGetFileBlocksInfo(self): """getFileBlocksInfo returns block info, including location lookup""" blocks = self.dbs.getFileBlocksInfo(DATASET) block = self.dbs.getFileBlocksInfo(DATASET, blockName = BLOCK) self.assertEqual(1, len(block)) block = block[0] self.assertEqual(46, len(blocks)) self.assertTrue(block['Name'] in [x['Name'] for x in blocks]) self.assertEqual(BLOCK, block['Name']) #self.assertEqual(377, block['NumberOfEvents']) self.assertEqual(150780132, block['BlockSize']) self.assertEqual(2, block['NumberOfFiles']) # possibly 
fragile but assume block located at least at cern sites = [x['Name'] for x in block['StorageElementList'] if x['Name'].find('cern.ch') > -1] self.assertTrue(sites) # weird error handling - depends on whether block or dataset is missing self.assertRaises(DBSReaderError, self.dbs.getFileBlocksInfo, DATASET + 'blah') self.assertFalse(self.dbs.getFileBlocksInfo(DATASET, blockName = BLOCK + 'asas')) @attr("integration") def testListFileBlocks(self): """listFileBlocks returns block names in dataset""" blocks = self.dbs.listFileBlocks(DATASET) # block is closed block = self.dbs.listFileBlocks(DATASET, blockName = BLOCK, onlyClosedBlocks = True)[0] self.assertEqual(block, BLOCK) self.assertTrue(BLOCK in block) @attr("integration") def testListOpenFileBlocks(self): """listOpenFileBlocks finds open blocks""" # hard to find a dataset with open blocks, so don't bother self.assertFalse(self.dbs.listOpenFileBlocks(DATASET)) @attr("integration") def testBlockExists(self): """blockExists returns existence of blocks""" self.assertTrue(self.dbs.blockExists(BLOCK)) self.assertFalse(self.dbs.blockExists(DATASET + '#somethingelse')) @attr("integration") def testListFilesInBlock(self): """listFilesInBlock returns files in block""" self.assertTrue(FILE in [x['LogicalFileName'] for x in self.dbs.listFilesInBlock(BLOCK)]) self.assertRaises(DBSReaderError, self.dbs.listFilesInBlock, DATASET + '#blah') @attr("integration") def testListFilesInBlockWithParents(self): """listFilesInBlockWithParents gets files with parents for a block""" # hope PromptReco doesn't get deleted files = self.dbs.listFilesInBlockWithParents('/Jet/Run2011A-PromptReco-v1/RECO#f8d36af3-4fb6-11e0-9d39-00151755cb60') self.assertEqual(1, len(files)) self.assertEqual('/Jet/Run2011A-PromptReco-v1/RECO#f8d36af3-4fb6-11e0-9d39-00151755cb60', files[0]['Block']['Name']) self.assertEqual('/store/data/Run2011A/Jet/RAW/v1/000/160/433/24B46223-0D4E-E011-B573-0030487C778E.root', files[0]['ParentList'][0]['LogicalFileName']) self.assertRaises(DBSReaderError, self.dbs.listFilesInBlockWithParents, BLOCK + 'asas') @attr("integration") def testLfnsInBlock(self): """lfnsInBlock returns lfns in block""" self.assertTrue(FILE in self.dbs.lfnsInBlock(BLOCK)) self.assertRaises(DBSReaderError, self.dbs.lfnsInBlock, BLOCK + 'asas') @attr("integration") def testListFileBlockLocation(self): """listFileBlockLocation returns block location""" # assume one site is cern sites = [x for x in self.dbs.listFileBlockLocation(BLOCK) if x.find('cern.ch') > -1] self.assertTrue(sites) # doesn't raise on non-existent block self.assertFalse(self.dbs.listFileBlockLocation(BLOCK + 'blah')) @attr("integration") def testGetFileBlock(self): """getFileBlock returns block""" block = self.dbs.getFileBlock(BLOCK) self.assertEqual(len(block), 1) block = block[BLOCK] self.assertEqual(2, len(block['Files'])) self.assertRaises(DBSReaderError, self.dbs.getFileBlock, BLOCK + 'asas') @attr("integration") def testGetFileBlockWithParents(self): """getFileBlockWithParents returns block and parents""" block = self.dbs.getFileBlockWithParents('/Jet/Run2011A-PromptReco-v1/RECO#f8d36af3-4fb6-11e0-9d39-00151755cb60') self.assertEqual(len(block), 1) block = block['/Jet/Run2011A-PromptReco-v1/RECO#f8d36af3-4fb6-11e0-9d39-00151755cb60'] self.assertEqual('/store/data/Run2011A/Jet/RAW/v1/000/160/433/24B46223-0D4E-E011-B573-0030487C778E.root', block['Files'][0]['ParentList'][0]['LogicalFileName']) self.assertRaises(DBSReaderError, self.dbs.getFileBlockWithParents, BLOCK + 'asas') @attr("integration") def 
testGetFiles(self): """getFiles returns files in dataset""" files = self.dbs.getFiles(DATASET) self.assertEqual(len(files), 46) @attr("integration") def testListBlockParents(self): """listBlockParents returns block parents""" parents = self.dbs.listBlockParents('/Jet/Run2011A-PromptReco-v1/RECO#f8d36af3-4fb6-11e0-9d39-00151755cb60') self.assertEqual(1, len(parents)) self.assertEqual('/Jet/Run2011A-v1/RAW#37cf2a40-4e0e-11e0-9833-00151755cb60', parents[0]['Name']) sites = [x for x in parents[0]['StorageElementList'] if x.find("cern.ch") > -1] self.assertTrue(sites) self.assertFalse(self.dbs.listBlockParents('/Jet/Run2011A-PromptReco-v1/RECO#f8d36af3-4fb6-11e0-9d39-00151755cb60dsl')) @attr("integration") def testBlockIsOpen(self): """blockIsOpen checks if a block is open""" self.assertFalse(self.dbs.blockIsOpen(BLOCK)) @attr("integration") def testBlockToDatasetPath(self): """blockToDatasetPath extracts path from block name""" self.assertEqual(self.dbs.blockToDatasetPath(BLOCK), DATASET) self.assertFalse(self.dbs.blockToDatasetPath(BLOCK + 'asas'))
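# The DATASET/BLOCK/FILE constants exercised by the tests above are module-level
# fixtures that this fragment does not show. A hedged sketch of their shape (the
# dataset path is inferred from the block names in the assertions; the block uuid
# and file name below are placeholders, not the real fixture values):
DATASET = '/HighPileUp/Run2011A-v1/RAW'
BLOCK = DATASET + '#00000000-0000-0000-0000-000000000000'  # placeholder uuid
FILE = '/store/data/Run2011A/HighPileUp/RAW/v1/000/000/000/PLACEHOLDER.root'
# Assuming the @attr decorator comes from nose's attrib plugin (the import is not
# shown here), only the integration-tagged tests would run via:
#   nosetests -a integration DBSReader_t.py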
class DBSReaderTest(EmulatedUnitTestCase): def setUp(self): """ _setUp_ Initialize the PhEDEx API to point at the test server. """ #self.endpoint = "http://cmsdbsprod.cern.ch/cms_dbs_prod_global/servlet/DBSServlet" self.endpoint = 'https://cmsweb.cern.ch/dbs/prod/global/DBSReader' self.dbs = None super(DBSReaderTest, self).setUp() return def tearDown(self): """ _tearDown_ :return: """ super(DBSReaderTest, self).tearDown() return @attr("integration") def testListDatatiers(self): """ listDatatiers returns all datatiers available """ self.dbs = DBSReader(self.endpoint) results = self.dbs.listDatatiers() self.assertTrue('RAW' in results) self.assertTrue('GEN-SIM-RECO' in results) self.assertTrue('GEN-SIM' in results) self.assertFalse('RAW-ALAN' in results) return @attr("integration") def testListPrimaryDatasets(self): """ listPrimaryDatasets returns known primary datasets """ self.dbs = DBSReader(self.endpoint) results = self.dbs.listPrimaryDatasets('Jet*') self.assertTrue('Jet' in results) self.assertTrue('JetMET' in results) self.assertTrue('JetMETTau' in results) self.assertFalse(self.dbs.listPrimaryDatasets('DoesntExist')) return @attr("integration") def testMatchProcessedDatasets(self): """ matchProcessedDatasets returns known processed datasets """ self.dbs = DBSReader(self.endpoint) dataset = self.dbs.matchProcessedDatasets('Jet', 'RAW', 'Run2011A-v1') self.assertEqual(1, len(dataset)) self.assertEqual(['/Jet/Run2011A-v1/RAW'], dataset[0]['PathList']) self.assertEqual('Run2011A-v1', dataset[0]['Name']) self.assertFalse(self.dbs.matchProcessedDatasets('Jet', 'RAW', 'Run2011A-v666')) def testlistRuns(self): """listRuns returns known runs""" self.dbs = DBSReader(self.endpoint) runs = self.dbs.listRuns(dataset=DATASET) self.assertEqual(46, len(runs)) self.assertTrue(174074 in runs) runs = self.dbs.listRuns(block=BLOCK) self.assertEqual(1, len(runs)) self.assertEqual([173657], runs) def testlistRunLumis(self): """listRunLumis returns known runs and lumicounts (None for DBS3)""" self.dbs = DBSReader(self.endpoint) runs = self.dbs.listRunLumis(dataset=DATASET) self.assertEqual(46, len(runs)) self.assertTrue(173692 in runs) self.assertEqual(runs[173692], None) runs = self.dbs.listRunLumis(block=BLOCK) self.assertEqual(1, len(runs)) self.assertTrue(173657 in runs) self.assertEqual(runs[173657], None) @attr("integration") def testListProcessedDatasets(self): """listProcessedDatasets returns known processed datasets""" self.dbs = DBSReader(self.endpoint) datasets = self.dbs.listProcessedDatasets('Jet', 'RAW') self.assertTrue('Run2011A-v1' in datasets) self.assertTrue('Run2011B-v1' in datasets) self.assertFalse(self.dbs.listProcessedDatasets('Jet', 'blah')) self.assertFalse(self.dbs.listProcessedDatasets('blah', 'RAW')) def testlistDatasetFiles(self): """listDatasetFiles returns files in dataset""" self.dbs = DBSReader(self.endpoint) files = self.dbs.listDatasetFiles(DATASET) self.assertEqual(49, len(files)) self.assertTrue(FILE in files) def testlistDatasetFileDetails(self): """testlistDatasetFilesDetails returns lumis, events, and parents of a dataset""" TESTFILE = '/store/data/Run2011A/HighPileUp/RAW/v1/000/173/658/56484BAB-CBCB-E011-AF00-BCAEC518FF56.root' self.dbs = DBSReader(self.endpoint) details = self.dbs.listDatasetFileDetails(DATASET) self.assertEqual(len(details), 49) self.assertTrue(TESTFILE in details) self.assertEqual(details[TESTFILE]['NumberOfEvents'], 545) self.assertEqual(details[TESTFILE]['file_size'], 286021145) self.assertEqual(details[TESTFILE]['BlockName'], 
'/HighPileUp/Run2011A-v1/RAW#dd6e0796-cbcc-11e0-80a9-003048caaace') self.assertEqual(details[TESTFILE]['Md5'], 'NOTSET') self.assertEqual(details[TESTFILE]['md5'], 'NOTSET') self.assertEqual(details[TESTFILE]['Adler32'], 'a41a1446') self.assertEqual(details[TESTFILE]['adler32'], 'a41a1446') self.assertEqual(details[TESTFILE]['Checksum'], '22218315') self.assertEqual(details[TESTFILE]['check_sum'], '22218315') self.assertTrue(173658 in details[TESTFILE]['Lumis']) self.assertEqual(sorted(details[TESTFILE]['Lumis'][173658]), [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111]) def testGetDBSSummaryInfo(self): """getDBSSummaryInfo returns summary of dataset and block""" self.dbs = DBSReader(self.endpoint) dataset = self.dbs.getDBSSummaryInfo(DATASET) self.assertEqual(dataset['path'], DATASET) self.assertEqual(dataset['block'], '') self.assertEqual(dataset['NumberOfEvents'], 22075) self.assertEqual(dataset['NumberOfBlocks'], 46) self.assertEqual(dataset['FileSize'], 4001680824) self.assertEqual(dataset['file_size'], 4001680824) self.assertEqual(dataset['NumberOfFiles'], 49) self.assertEqual(dataset['NumberOfLumis'], 7223) block = self.dbs.getDBSSummaryInfo(DATASET, BLOCK) self.assertEqual(block['path'], '') self.assertEqual(block['block'], BLOCK) self.assertEqual(block['NumberOfEvents'], 377) self.assertEqual(block['NumberOfBlocks'], 1) self.assertEqual(block['FileSize'], 150780132) self.assertEqual(block['file_size'], 150780132) self.assertEqual(block['NumberOfFiles'], 2) self.assertEqual(block['NumberOfLumis'], 94) self.assertRaises(DBSReaderError, self.dbs.getDBSSummaryInfo, DATASET + 'blah') self.assertRaises(DBSReaderError, self.dbs.getDBSSummaryInfo, DATASET, BLOCK + 'asas') @attr("integration") def testGetFileBlocksInfo(self): """getFileBlocksInfo returns block info, including location lookup""" self.dbs = DBSReader(self.endpoint) blocks = self.dbs.getFileBlocksInfo(DATASET) block = self.dbs.getFileBlocksInfo(DATASET, blockName=BLOCK) self.assertEqual(1, len(block)) block = block[0] self.assertEqual(46, len(blocks)) self.assertTrue(block['Name'] in [x['Name'] for x in blocks]) self.assertEqual(BLOCK, block['Name']) self.assertEqual(0, block['OpenForWriting']) self.assertEqual(150780132, block['BlockSize']) self.assertEqual(2, block['NumberOfFiles']) # possibly fragile but assume block located at least at cern sites = [x['Name'] for x in block['PhEDExNodeList'] if x['Name'].find('CH_CERN') > -1] self.assertTrue(sites) # weird error handling - depends on whether block or dataset is missing self.assertRaises(DBSReaderError, self.dbs.getFileBlocksInfo, DATASET + 'blah') self.assertRaises(DBSReaderError, self.dbs.getFileBlocksInfo, DATASET, blockName=BLOCK + 'asas') def testListFileBlocks(self): """listFileBlocks returns block names in dataset""" self.dbs = DBSReader(self.endpoint) blocks = self.dbs.listFileBlocks(DATASET) self.assertTrue(BLOCK in blocks) # block is closed block = self.dbs.listFileBlocks(DATASET, blockName=BLOCK, onlyClosedBlocks=True)[0] self.assertEqual(block, BLOCK) self.assertTrue(BLOCK in block) def testListOpenFileBlocks(self): """listOpenFileBlocks finds open blocks""" # hard to 
find a dataset with open blocks, so don't bother self.dbs = DBSReader(self.endpoint) self.assertFalse(self.dbs.listOpenFileBlocks(DATASET)) def testBlockExists(self): """blockExists returns existence of blocks""" self.dbs = DBSReader(self.endpoint) self.assertTrue(self.dbs.blockExists(BLOCK)) self.assertRaises(DBSReaderError, self.dbs.blockExists, DATASET + '#somethingelse') def testListFilesInBlock(self): """listFilesInBlock returns files in block""" self.dbs = DBSReader(self.endpoint) self.assertTrue(FILE in [x['LogicalFileName'] for x in self.dbs.listFilesInBlock(BLOCK)]) self.assertRaises(DBSReaderError, self.dbs.listFilesInBlock, DATASET + '#blah') def testListFilesInBlockWithParents(self): """listFilesInBlockWithParents gets files with parents for a block""" self.dbs = DBSReader(self.endpoint) files = self.dbs.listFilesInBlockWithParents('/Cosmics/Commissioning2015-PromptReco-v1/RECO#004ac3ba-d09e-11e4-afad-001e67ac06a0') self.assertEqual(4, len(files)) self.assertEqual('/Cosmics/Commissioning2015-PromptReco-v1/RECO#004ac3ba-d09e-11e4-afad-001e67ac06a0', files[0]['block_name']) self.assertEqual('/Cosmics/Commissioning2015-PromptReco-v1/RECO#004ac3ba-d09e-11e4-afad-001e67ac06a0', files[0]['BlockName']) self.assertEqual('/store/data/Commissioning2015/Cosmics/RAW/v1/000/238/545/00000/1043E89F-2DCF-E411-9CAE-02163E013751.root', files[0]['ParentList'][0]['LogicalFileName']) self.assertRaises(DBSReaderError, self.dbs.listFilesInBlockWithParents, BLOCK + 'asas') def testLfnsInBlock(self): """lfnsInBlock returns lfns in block""" self.dbs = DBSReader(self.endpoint) self.assertTrue(FILE in [x['logical_file_name'] for x in self.dbs.lfnsInBlock(BLOCK)]) self.assertRaises(DBSReaderError, self.dbs.lfnsInBlock, BLOCK + 'asas') @attr("integration") def testListFileBlockLocation(self): """listFileBlockLocation returns block location""" WRONG_BLOCK = BLOCK[:-4]+'abcd' BLOCK2 = '/HighPileUp/Run2011A-v1/RAW#6021175e-cbfb-11e0-80a9-003048caaace' DBS_BLOCK = '/GenericTTbar/hernan-140317_231446_crab_JH_ASO_test_T2_ES_CIEMAT_5000_100_140318_0014-'+\ 'ea0972193530f531086947d06eb0f121/USER#fb978442-a61b-413a-b4f4-526e6cdb142e' DBS_BLOCK2 = '/GenericTTbar/hernan-140317_231446_crab_JH_ASO_test_T2_ES_CIEMAT_5000_100_140318_0014-'+\ 'ea0972193530f531086947d06eb0f121/USER#0b04d417-d734-4ef2-88b0-392c48254dab' self.dbs = DBSReader('https://cmsweb.cern.ch/dbs/prod/phys03/DBSReader/') # assume one site is cern sites = [x for x in self.dbs.listFileBlockLocation(BLOCK) if x and x.find('CH_CERN') > -1] self.assertTrue(sites) #This block is only found on DBS self.assertTrue(self.dbs.listFileBlockLocation(DBS_BLOCK)) # doesn't raise on non-existent block self.assertFalse(self.dbs.listFileBlockLocation(WRONG_BLOCK)) #test bulk call: ## two blocks in phedex self.assertEqual(2, len(self.dbs.listFileBlockLocation([BLOCK, BLOCK2]))) ## one block in phedex one does not exist self.assertEqual(1, len(self.dbs.listFileBlockLocation([BLOCK, WRONG_BLOCK]))) ## one in phedex one in dbs self.assertEqual(2, len(self.dbs.listFileBlockLocation([BLOCK, DBS_BLOCK]))) ## two in dbs self.assertEqual(2, len(self.dbs.listFileBlockLocation([DBS_BLOCK, DBS_BLOCK2]))) ## one in DBS and one does not exist self.assertEqual(1, len(self.dbs.listFileBlockLocation([DBS_BLOCK, WRONG_BLOCK]))) def testGetFileBlock(self): """getFileBlock returns block""" self.dbs = DBSReader(self.endpoint) block = self.dbs.getFileBlock(BLOCK) self.assertEqual(len(block), 1) block = block[BLOCK] self.assertEqual(2, len(block['Files'])) self.assertRaises(DBSReaderError, 
self.dbs.getFileBlock, BLOCK + 'asas') def testGetFileBlockWithParents(self): """getFileBlockWithParents returns block and parents""" self.dbs = DBSReader(self.endpoint) block = self.dbs.getFileBlockWithParents('/Cosmics/Commissioning2015-PromptReco-v1/RECO#004ac3ba-d09e-11e4-afad-001e67ac06a0') self.assertEqual(len(block), 1) block = block['/Cosmics/Commissioning2015-PromptReco-v1/RECO#004ac3ba-d09e-11e4-afad-001e67ac06a0'] self.assertEqual('/store/data/Commissioning2015/Cosmics/RAW/v1/000/238/545/00000/1043E89F-2DCF-E411-9CAE-02163E013751.root', block['Files'][0]['ParentList'][0]['LogicalFileName']) self.assertRaises(DBSReaderError, self.dbs.getFileBlockWithParents, BLOCK + 'asas') def testGetFiles(self): """getFiles returns files in dataset""" self.dbs = DBSReader(self.endpoint) files = self.dbs.getFiles(DATASET) self.assertEqual(len(files), 46) def testListBlockParents(self): """listBlockParents returns block parents""" self.dbs = DBSReader(self.endpoint) parents = self.dbs.listBlockParents('/Cosmics/Commissioning2015-PromptReco-v1/RECO#004ac3ba-d09e-11e4-afad-001e67ac06a0') self.assertEqual(1, len(parents)) self.assertEqual('/Cosmics/Commissioning2015-v1/RAW#942d76fe-cf0e-11e4-afad-001e67ac06a0', parents[0]['Name']) sites = [x for x in parents[0]['PhEDExNodeList'] if x.find("CH_CERN") > -1] self.assertTrue(sites) self.assertFalse(self.dbs.listBlockParents('/Cosmics/Commissioning2015-v1/RAW#942d76fe-cf0e-11e4-afad-001e67ac06a0')) def testBlockIsOpen(self): """blockIsOpen checks if a block is open""" self.dbs = DBSReader(self.endpoint) self.assertFalse(self.dbs.blockIsOpen(BLOCK)) def testBlockToDatasetPath(self): """blockToDatasetPath extracts path from block name""" self.dbs = DBSReader(self.endpoint) self.assertEqual(self.dbs.blockToDatasetPath(BLOCK), DATASET) self.assertRaises(DBSReaderError, self.dbs.blockToDatasetPath, BLOCK + 'asas')
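# A minimal interactive sketch mirroring the calls exercised by the tests above,
# assuming the WMCore import path for the DBS3 reader (the import is not shown in
# this fragment):
from WMCore.Services.DBS.DBSReader import DBSReader

dbs = DBSReader('https://cmsweb.cern.ch/dbs/prod/global/DBSReader')
blocks = dbs.listFileBlocks('/HighPileUp/Run2011A-v1/RAW')      # all '<dataset>#<uuid>' names
summary = dbs.getDBSSummaryInfo('/HighPileUp/Run2011A-v1/RAW')  # dataset-level counters
print(len(blocks), summary['NumberOfFiles'], summary['NumberOfLumis'])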
class DBSDataDiscovery(DataDiscovery): """Performing the data discovery through CMS DBS service. """ def checkDatasetStatus(self, dataset, kwargs): res = self.dbs.dbs.listDatasets(dataset=dataset, detail=1, dataset_access_type='*') if len(res) > 1: raise TaskWorkerException( "Found more than one dataset while checking in DBS the status of %s" % dataset) if len(res) == 0: raise TaskWorkerException( "Cannot find dataset %s in %s DBS instance" % (dataset, self.dbsInstance)) res = res[0] self.logger.info("Input dataset details: %s" % pprint.pformat(res)) accessType = res['dataset_access_type'] if accessType != 'VALID': #as per Dima's suggestion https://github.com/dmwm/CRABServer/issues/4739 msgForDeprecDS = "Please contact your physics group if you think the dataset should not be deprecated." if kwargs['task']['tm_nonvalid_input_dataset'] != 'T': msg = "CRAB refuses to proceed in getting the details of the dataset %s from DBS, because the dataset is not 'VALID' but '%s'." % ( dataset, accessType) if accessType == 'DEPRECATED': msg += " (%s)" % (msgForDeprecDS) msg += " To allow CRAB to consider a dataset that is not 'VALID', set Data.allowNonValidInputDataset = True in the CRAB configuration." msg += " Notice that this will not force CRAB to run over all files in the dataset;" msg += " CRAB will still check if there are any valid files in the dataset and run only over those files." raise TaskWorkerException(msg) msg = "The input dataset %s is not 'VALID' but '%s'." % ( dataset, accessType) msg += " CRAB will check if there are any valid files in the dataset and run only over those files." if accessType == 'DEPRECATED': msg += " %s" % (msgForDeprecDS) self.uploadWarning(msg, kwargs['task']['user_proxy'], kwargs['task']['tm_taskname']) return def keepOnlyDisks(self, locationsMap): self.otherLocations = set() phedex = PhEDEx() #TODO use certs from the config! #get all the PNN that are of kind disk try: diskLocations = set([ pnn['name'] for pnn in phedex.getNodeMap()['phedex']['node'] if pnn['kind'] == 'Disk' ]) except HTTPException as ex: self.logger.error(ex.headers) raise TaskWorkerException("The CRAB3 server backend could not contact phedex to get the list of site storages.\n"+\ "This could be a temporary phedex glitch, please try to submit a new task (resubmit will not work)"+\ " and contact the experts if the error persists.\nError reason: %s" % str(ex)) #TODO add the phedex nodes so the user can check themselves for block, locations in locationsMap.iteritems(): locationsMap[block] = set(locations) & diskLocations self.otherLocations = self.otherLocations.union( set(locations) - diskLocations) #remove any key with value that has set([]) for key, value in locationsMap.items(): #wont work in python3! if value == set([]): locationsMap.pop(key) def checkBlocksSize(self, blocks): """ Make sure no single block has more than 100k lumis. See https://hypernews.cern.ch/HyperNews/CMS/get/dmDevelopment/2022/1/1/1/1/1/1/2.html """ MAX_LUMIS = 100000 for block in blocks: blockInfo = self.dbs.getDBSSummaryInfo(block=block) if blockInfo['NumberOfLumis'] > MAX_LUMIS: msg = "Block %s contains more than %s lumis and cannot be processed for splitting. " % ( block, MAX_LUMIS) msg += "For memory/time constraints, big blocks are not allowed. Use another dataset as input." 
raise TaskWorkerException(msg) def execute(self, *args, **kwargs): self.logger.info( "Data discovery with DBS") ## to be changed into debug old_cert_val = os.getenv("X509_USER_CERT") old_key_val = os.getenv("X509_USER_KEY") try: os.environ['X509_USER_CERT'] = self.config.TaskWorker.cmscert os.environ['X509_USER_KEY'] = self.config.TaskWorker.cmskey # DBS3 requires X509_USER_CERT to be set - but we don't want to leak that to other modules dbsurl = self.config.Services.DBSUrl if kwargs['task']['tm_dbs_url']: dbsurl = kwargs['task']['tm_dbs_url'] self.dbs = DBSReader(dbsurl) self.dbsInstance = self.dbs.dbs.serverinfo()["dbs_instance"] finally: if old_cert_val is not None: os.environ['X509_USER_CERT'] = old_cert_val else: del os.environ['X509_USER_CERT'] if old_key_val is not None: os.environ['X509_USER_KEY'] = old_key_val else: del os.environ['X509_USER_KEY'] self.logger.debug("Data discovery through %s for %s" % (self.dbs, kwargs['task']['tm_taskname'])) self.checkDatasetStatus(kwargs['task']['tm_input_dataset'], kwargs) try: # Get the list of blocks for the locations and then call dls. # The WMCore DBS3 implementation makes one call to dls for each block # with locations = True so we are using locations=False and looking up location later blocks = [ x['Name'] for x in self.dbs.getFileBlocksInfo( kwargs['task']['tm_input_dataset'], locations=False) ] except DBSReaderError as dbsexc: #dataset not found in DBS is a known use case if str(dbsexc).find('No matching data') != -1: raise TaskWorkerException( "The CRAB3 server backend could not find dataset %s in this DBS instance: %s" % (kwargs['task']['tm_input_dataset'], dbsurl)) raise self.checkBlocksSize(blocks) ## Create a map for block's locations: for each block get the list of locations. ## Note: listFileBlockLocation() gets first the locations from PhEDEx, and if no ## locations are found it gets the original locations from DBS. So it should ## never be the case at this point that some blocks have no locations. ## locationsMap is a dictionary, key=blockName, value=list of PhedexNodes, example: ## {'/JetHT/Run2016B-PromptReco-v2/AOD#b10179dc-3723-11e6-9aa5-001e67abf228': [u'T1_IT_CNAF_Buffer', u'T2_US_Wisconsin', u'T1_IT_CNAF_MSS', u'T2_BE_UCL'], ## '/JetHT/Run2016B-PromptReco-v2/AOD#89b03ca6-1dc9-11e6-b567-001e67ac06a0': [u'T1_IT_CNAF_Buffer', u'T2_US_Wisconsin', u'T1_IT_CNAF_MSS', u'T2_BE_UCL']} try: dbsOnly = self.dbsInstance.split('/')[1] != 'global' locationsMap = self.dbs.listFileBlockLocation(list(blocks), dbsOnly=dbsOnly) except Exception as ex: #TODO should we catch HttpException instead? self.logger.exception(ex) raise TaskWorkerException("The CRAB3 server backend could not get the location of the files from dbs or phedex.\n"+\ "This could be a temporary phedex/dbs glitch, please try to submit a new task (resubmit will not work)"+\ " and contact the experts if the error persists.\nError reason: %s" % str(ex)) self.keepOnlyDisks(locationsMap) if not locationsMap: msg = "Task could not be submitted because there is no DISK replica for dataset %s." % ( kwargs['task']['tm_input_dataset']) msg += " Please, check DAS, https://cmsweb.cern.ch/das, and make sure the dataset is accessible on DISK." msg += " You might want to contact your physics group if you need a disk replica." if self.otherLocations: msg += "\nN.B.: your dataset is stored at %s, but those are TAPE locations." 
% ','.join( sorted(self.otherLocations)) raise TaskWorkerException(msg) if len(blocks) != len(locationsMap): self.logger.warning( "The locations of some blocks have not been found: %s" % (set(blocks) - set(locationsMap))) try: filedetails = self.dbs.listDatasetFileDetails( kwargs['task']['tm_input_dataset'], getParents=True, validFileOnly=0) secondary = kwargs['task'].get('tm_secondary_input_dataset', None) if secondary: moredetails = self.dbs.listDatasetFileDetails(secondary, getParents=False, validFileOnly=0) for secfilename, secinfos in moredetails.items(): secinfos['lumiobj'] = LumiList( runsAndLumis=secinfos['Lumis']) self.logger.info( "Beginning to match files from secondary dataset") for dummyFilename, infos in filedetails.items(): infos['Parents'] = [] lumis = LumiList(runsAndLumis=infos['Lumis']) for secfilename, secinfos in moredetails.items(): if (lumis & secinfos['lumiobj']): infos['Parents'].append(secfilename) self.logger.info("Done matching files from secondary dataset") kwargs['task']['tm_use_parent'] = 1 except Exception as ex: #TODO should we catch HttpException instead? self.logger.exception(ex) raise TaskWorkerException("The CRAB3 server backend could not contact DBS to get the files details (Lumis, events, etc).\n"+\ "This could be a temporary DBS glitch. Please try to submit a new task (resubmit will not work)"+\ " and contact the experts if the error persists.\nError reason: %s" % str(ex)) #TODO add the phedex nodes so the user can check themselves if not filedetails: raise TaskWorkerException(( "Cannot find any file inside the dataset. Please, check your dataset in DAS, %s.\n" "Aborting submission. Resubmitting your task will not help." ) % ( "https://cmsweb.cern.ch/das/request?instance=%s&input=dataset=%s" ) % (self.dbsInstance, kwargs['task']['tm_input_dataset'])) ## Format the output creating the data structures required by wmcore. Filters out invalid files, ## files whose block has no location, and figures out the PSN result = self.formatOutput(task=kwargs['task'], requestname=kwargs['task']['tm_taskname'], datasetfiles=filedetails, locations=locationsMap, tempDir=kwargs['tempDir']) if not result.result: raise TaskWorkerException(( "Cannot find any valid file inside the dataset. Please, check your dataset in DAS, %s.\n" "Aborting submission. Resubmitting your task will not help." ) % ( "https://cmsweb.cern.ch/das/request?instance=%s&input=dataset=%s" ) % (self.dbsInstance, kwargs['task']['tm_input_dataset'])) self.logger.debug("Got %s files" % len(result.result.getFiles())) return result
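# The newer DBSDataDiscovery.execute() earlier in this file replaces the manual
# X509_USER_CERT/X509_USER_KEY save/restore above with a context manager
# (config.TaskWorker.envForCMSWEB). A minimal stdlib-only sketch of such a manager;
# the name tempSetEnv is illustrative, not the real implementation:
import os
from contextlib import contextmanager

@contextmanager
def tempSetEnv(cert, key):
    """Temporarily point X509_USER_CERT/KEY at service credentials, then restore."""
    saved = dict((var, os.environ.get(var)) for var in ('X509_USER_CERT', 'X509_USER_KEY'))
    os.environ['X509_USER_CERT'] = cert
    os.environ['X509_USER_KEY'] = key
    try:
        yield
    finally:
        # restore the previous values, or unset what we introduced
        for var, val in saved.items():
            if val is None:
                os.environ.pop(var, None)
            else:
                os.environ[var] = val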
def importDataset(self, sourceDBS, sourceDatasetPath, targetDBS, onlyClosed=True): """ _importDataset_ Import a dataset into the local scope DBS with full parentage hierarchy (at least not slow, because branches info is dropped) - *sourceDBS* : URL for input DBS instance - *sourceDatasetPath* : Dataset Path to be imported - *targetDBS* : URL for DBS to have dataset imported to """ reader = DBSReader(sourceDBS) inputBlocks = reader.getFileBlocksInfo(sourceDatasetPath, onlyClosed) blkCounter = 0 for inputBlock in inputBlocks: block = inputBlock["Name"] # // # // Test block does not exist in target # // blkCounter = blkCounter + 1 msg = "Importing block %s of %s: %s " % (blkCounter, len(inputBlocks), block) logging.debug(msg) if self.reader.blockExists(block): # // # // block exists # // If block is closed dont attempt transfer if str(inputBlock["OpenForWriting"]) != "1": msg = "Block already exists in target DBS and is closed:\n" msg += " ==> %s\n" % block msg += "Skipping Import of that block" logging.warning(msg) locations = reader.listFileBlockLocation(block) # only empty file blocks can have no location if not locations and str(inputBlock["NumberOfFiles"]) != "0": msg = "Error in DBSWriter.importDataset\n" msg += "Block has no locations defined: %s" % block raise DBSWriterError(msg) logging.info("Update block locations to:") for sename in locations: self.dbs.addReplicaToBlock(block, sename) logging.info(sename) continue try: self.dbs.migrateDatasetContents( sourceDBS, targetDBS, sourceDatasetPath, block_name=block, noParentsReadOnly=False ) except DbsException as ex: msg = "Error in DBSWriter.importDataset\n" msg += "Could not write content of dataset:\n ==> %s\n" % (sourceDatasetPath,) msg += "Block name:\n ==> %s\n" % block msg += "%s\n" % formatEx(ex) raise DBSWriterError(msg) locations = reader.listFileBlockLocation(block) # only empty file blocks can have no location if not locations and str(inputBlock["NumberOfFiles"]) != "0": msg = "Error in DBSWriter.importDataset\n" msg += "Block has no locations defined: %s" % block raise DBSWriterError(msg) for sename in locations: self.dbs.addReplicaToBlock(block, sename)
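# A hypothetical call site for importDataset() above, assuming a DBSWriter wrapper
# that owns self.dbs (target writer API) and self.reader (target-scope DBSReader) as
# the method body requires; the URLs and dataset path are placeholders:
writer = DBSWriter('https://target-dbs.example.cern.ch/servlet/DBSServlet')
writer.importDataset(sourceDBS='https://source-dbs.example.cern.ch/servlet/DBSServlet',
                     sourceDatasetPath='/Primary/Processed-v1/RAW',
                     targetDBS='https://target-dbs.example.cern.ch/servlet/DBSServlet',
                     onlyClosed=True)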
class DBSReaderTest(unittest.TestCase): def setUp(self): """ _setUp_ Initialize the PhEDEx API to point at the test server. """ #self.endpoint = "http://cmsdbsprod.cern.ch/cms_dbs_prod_global/servlet/DBSServlet" self.endpoint = 'https://cmsweb.cern.ch/dbs/prod/global/DBSReader' self.dbs = None return @attr("integration") def testListPrimaryDatasets(self): """ listPrimaryDatasets returns known primary datasets """ self.dbs = DBSReader(self.endpoint) results = self.dbs.listPrimaryDatasets('Jet*') self.assertTrue('Jet' in results) self.assertTrue('JetMET' in results) self.assertTrue('JetMETTau' in results) self.assertFalse(self.dbs.listPrimaryDatasets('DoesntExist')) return @attr("integration") def testMatchProcessedDatasets(self): """ matchProcessedDatasets returns known processed datasets """ self.dbs = DBSReader(self.endpoint) dataset = self.dbs.matchProcessedDatasets('Jet', 'RAW', 'Run2011A-v1') self.assertEqual(1, len(dataset)) self.assertEqual(['/Jet/Run2011A-v1/RAW'], dataset[0]['PathList']) self.assertEqual('Run2011A-v1', dataset[0]['Name']) self.assertFalse(self.dbs.matchProcessedDatasets('Jet', 'RAW', 'Run2011A-v666')) @attr("integration") def testlistRuns(self): """listRuns returns known runs""" self.dbs = DBSReader(self.endpoint) runs = self.dbs.listRuns(dataset = DATASET) self.assertEqual(46, len(runs)) self.assertTrue(174074 in runs) runs = self.dbs.listRuns(dataset = DATASET, block = BLOCK) self.assertEqual([173657], runs) @attr("integration") def testlistRunLumis(self): """listRunLumis returns known runs and lumicounts""" self.dbs = DBSReader(self.endpoint) runs = self.dbs.listRunLumis(dataset = DATASET) self.assertEqual(46, len(runs)) self.assertTrue(173692 in runs) self.assertEqual(runs[173692], 2782) runs = self.dbs.listRuns(dataset = DATASET, block = BLOCK) self.assertEqual({173657 : 94}, runs) @attr("integration") def testListProcessedDatasets(self): """listProcessedDatasets returns known processed datasets""" self.dbs = DBSReader(self.endpoint) datasets = self.dbs.listProcessedDatasets('Jet', 'RAW') self.assertTrue('Run2011A-v1' in datasets) self.assertTrue('Run2011B-v1' in datasets) self.assertFalse(self.dbs.listProcessedDatasets('Jet', 'blah')) self.assertFalse(self.dbs.listProcessedDatasets('blah', 'RAW')) @attr("integration") def testlistDatasetFiles(self): """listDatasetFiles returns files in dataset""" self.dbs = DBSReader(self.endpoint) files = self.dbs.listDatasetFiles(DATASET) self.assertEqual(49, len(files)) self.assertTrue(FILE in files) @attr("integration") def testlistDatasetFileDetails(self): """testlistDatasetFilesDetails returns lumis, events, and parents of a dataset""" TESTFILE = '/store/data/Run2011A/HighPileUp/RAW/v1/000/173/658/56484BAB-CBCB-E011-AF00-BCAEC518FF56.root' for endpoint in [self.endpoint, 'test/python/WMCore_t/Services_t/DBS_t/DBSReader_t.py:']: self.dbs = DBSReader(endpoint) details = self.dbs.listDatasetFileDetails(DATASET) self.assertEqual(len(details), 49) self.assertTrue(TESTFILE in details) self.assertEqual(details[TESTFILE]['NumberOfEvents'], 545) self.assertEqual(details[TESTFILE]['Size'], 286021145) self.assertEqual(details[TESTFILE]['BlockName'], '/HighPileUp/Run2011A-v1/RAW#dd6e0796-cbcc-11e0-80a9-003048caaace') self.assertEqual(details[TESTFILE]['Checksums'], {'Checksum': '22218315', 'Adler32': 'a41a1446', 'Md5': 'NOTSET'} ) self.assertTrue( 173658 in details[TESTFILE]['Lumis']) self.assertEqual( sorted(details[TESTFILE]['Lumis'][173658]), sorted( map( long, [8, 12, 9, 14, 10, 6, 2, 1, 4, 3, 36, 49, 16, 11, 27, 35, 46, 39, 
20, 24, 52, 23, 40, 42, 45, 21, 32, 37, \ 25, 22, 5, 33, 17, 15, 26, 50, 18, 29, 51, 44, 69, 43, 30, 73, 19, 41, 13, 38, 7, 31, 75, 48, 59, 65, 55, \ 57, 34, 28, 74, 47, 64, 61, 68, 77, 66, 71, 60, 76, 70, 67, 62, 78, 82, 79, 88, 56, 101, 92, 58, 72, 54, \ 63, 96, 53, 84, 95, 89, 85, 99, 81, 91, 102, 80, 100, 107, 94, 93, 90, 86, 87, 83, 97, 104, 110, 111, 106,\ 108, 98, 103, 109, 105])) ) @attr("integration") def testGetDBSSummaryInfo(self): """getDBSSummaryInfo returns summary of dataset and block""" self.dbs = DBSReader(self.endpoint) dataset = self.dbs.getDBSSummaryInfo(DATASET) self.assertEqual(dataset['path'], DATASET) self.assertEqual(dataset['block'], '') self.assertEqual(dataset['NumberOfEvents'], '22075') self.assertEqual(dataset['NumberOfBlocks'], '46') self.assertEqual(dataset['total_size'], '4001680824') self.assertEqual(dataset['NumberOfFiles'], '49') self.assertEqual(dataset['NumberOfLumis'], '7223') block = self.dbs.getDBSSummaryInfo(DATASET, BLOCK) self.assertEqual(block['path'], '') self.assertEqual(block['block'], BLOCK) self.assertEqual(block['NumberOfEvents'], '377') self.assertEqual(block['NumberOfBlocks'], '1') self.assertEqual(block['total_size'], '150780132') self.assertEqual(block['NumberOfFiles'], '2') self.assertEqual(block['NumberOfLumis'], '94') self.assertRaises(DBSReaderError, self.dbs.getDBSSummaryInfo, DATASET + 'blah') self.assertRaises(DBSReaderError, self.dbs.getDBSSummaryInfo, DATASET, BLOCK + 'asas') @attr("integration") def testGetFileBlocksInfo(self): """getFileBlocksInfo returns block info, including location lookup""" self.dbs = DBSReader(self.endpoint) blocks = self.dbs.getFileBlocksInfo(DATASET) block = self.dbs.getFileBlocksInfo(DATASET, blockName = BLOCK) self.assertEqual(1, len(block)) block = block[0] self.assertEqual(46, len(blocks)) self.assertTrue(block['Name'] in [x['Name'] for x in blocks]) self.assertEqual(BLOCK, block['Name']) #self.assertEqual(377, block['NumberOfEvents']) self.assertEqual(150780132, block['BlockSize']) self.assertEqual(2, block['NumberOfFiles']) # possibly fragile but assume block located at least at cern sites = [x['Name'] for x in block['StorageElementList'] if x['Name'].find('cern.ch') > -1] self.assertTrue(sites) # weird error handling - depends on whether block or dataset is missing self.assertRaises(DBSReaderError, self.dbs.getFileBlocksInfo, DATASET + 'blah') self.assertFalse(self.dbs.getFileBlocksInfo(DATASET, blockName = BLOCK + 'asas')) @attr("integration") def testListFileBlocks(self): """listFileBlocks returns block names in dataset""" self.dbs = DBSReader(self.endpoint) blocks = self.dbs.listFileBlocks(DATASET) # block is closed block = self.dbs.listFileBlocks(DATASET, blockName = BLOCK, onlyClosedBlocks = True)[0] self.assertEqual(block, BLOCK) self.assertTrue(BLOCK in block) @attr("integration") def testListOpenFileBlocks(self): """listOpenFileBlocks finds open blocks""" # hard to find a dataset with open blocks, so don't bother self.dbs = DBSReader(self.endpoint) self.assertFalse(self.dbs.listOpenFileBlocks(DATASET)) @attr("integration") def testBlockExists(self): """blockExists returns existence of blocks""" self.dbs = DBSReader(self.endpoint) self.assertTrue(self.dbs.blockExists(BLOCK)) self.assertFalse(self.dbs.blockExists(DATASET + '#somethingelse')) @attr("integration") def testListFilesInBlock(self): """listFilesInBlock returns files in block""" self.dbs = DBSReader(self.endpoint) self.assertTrue(FILE in [x['LogicalFileName'] for x in self.dbs.listFilesInBlock(BLOCK)]) 
self.assertRaises(DBSReaderError, self.dbs.listFilesInBlock, DATASET + '#blah') @attr("integration") def testListFilesInBlockWithParents(self): """listFilesInBlockWithParents gets files with parents for a block""" # hope PromptReco doesn't get deleted self.dbs = DBSReader(self.endpoint) files = self.dbs.listFilesInBlockWithParents('/Jet/Run2011A-PromptReco-v1/RECO#f8d36af3-4fb6-11e0-9d39-00151755cb60') self.assertEqual(1, len(files)) self.assertEqual('/Jet/Run2011A-PromptReco-v1/RECO#f8d36af3-4fb6-11e0-9d39-00151755cb60', files[0]['Block']['Name']) self.assertEqual('/store/data/Run2011A/Jet/RAW/v1/000/160/433/24B46223-0D4E-E011-B573-0030487C778E.root', files[0]['ParentList'][0]['LogicalFileName']) self.assertRaises(DBSReaderError, self.dbs.listFilesInBlockWithParents, BLOCK + 'asas') @attr("integration") def testLfnsInBlock(self): """lfnsInBlock returns lfns in block""" self.dbs = DBSReader(self.endpoint) self.assertTrue(FILE in self.dbs.lfnsInBlock(BLOCK)) self.assertRaises(DBSReaderError, self.dbs.lfnsInBlock, BLOCK + 'asas') @attr("integration") def testListFileBlockLocation(self): """listFileBlockLocation returns block location""" WRONG_BLOCK = BLOCK[:-4]+'abcd' BLOCK2 = '/HighPileUp/Run2011A-v1/RAW#6021175e-cbfb-11e0-80a9-003048caaace' DBS_BLOCK = '/GenericTTbar/hernan-140317_231446_crab_JH_ASO_test_T2_ES_CIEMAT_5000_100_140318_0014-'+\ 'ea0972193530f531086947d06eb0f121/USER#fb978442-a61b-413a-b4f4-526e6cdb142e' DBS_BLOCK2 = '/GenericTTbar/hernan-140317_231446_crab_JH_ASO_test_T2_ES_CIEMAT_5000_100_140318_0014-'+\ 'ea0972193530f531086947d06eb0f121/USER#0b04d417-d734-4ef2-88b0-392c48254dab' self.dbs = DBSReader('https://cmsweb.cern.ch/dbs/prod/phys03/DBSReader/') # assume one site is cern sites = [x for x in self.dbs.listFileBlockLocation(BLOCK) if x and x.find('cern.ch') > -1] self.assertTrue(sites) #This block is only found on DBS self.assertTrue(self.dbs.listFileBlockLocation(DBS_BLOCK)) # doesn't raise on non-existent block self.assertFalse(self.dbs.listFileBlockLocation(WRONG_BLOCK)) #test bulk call: ## two blocks in phedex self.assertEqual(2, len(self.dbs.listFileBlockLocation([BLOCK, BLOCK2]))) ## one block in phedex one does not exist self.assertEqual(1, len(self.dbs.listFileBlockLocation([BLOCK, WRONG_BLOCK]))) ## one in phedex one in dbs self.assertEqual(2, len(self.dbs.listFileBlockLocation([BLOCK, DBS_BLOCK]))) ## two in dbs self.assertEqual(2, len(self.dbs.listFileBlockLocation([DBS_BLOCK, DBS_BLOCK2]))) ## one in DBS and one does not exist self.assertEqual(1, len(self.dbs.listFileBlockLocation([DBS_BLOCK, WRONG_BLOCK]))) @attr("integration") def testGetFileBlock(self): """getFileBlock returns block""" self.dbs = DBSReader(self.endpoint) block = self.dbs.getFileBlock(BLOCK) self.assertEqual(len(block), 1) block = block[BLOCK] self.assertEqual(2, len(block['Files'])) self.assertRaises(DBSReaderError, self.dbs.getFileBlock, BLOCK + 'asas') @attr("integration") def testGetFileBlockWithParents(self): """getFileBlockWithParents returns block and parents""" self.dbs = DBSReader(self.endpoint) block = self.dbs.getFileBlockWithParents('/Jet/Run2011A-PromptReco-v1/RECO#f8d36af3-4fb6-11e0-9d39-00151755cb60') self.assertEqual(len(block), 1) block = block['/Jet/Run2011A-PromptReco-v1/RECO#f8d36af3-4fb6-11e0-9d39-00151755cb60'] self.assertEqual('/store/data/Run2011A/Jet/RAW/v1/000/160/433/24B46223-0D4E-E011-B573-0030487C778E.root', block['Files'][0]['ParentList'][0]['LogicalFileName']) self.assertRaises(DBSReaderError, self.dbs.getFileBlockWithParents, BLOCK + 'asas') 
@attr("integration") def testGetFiles(self): """getFiles returns files in dataset""" self.dbs = DBSReader(self.endpoint) files = self.dbs.getFiles(DATASET) self.assertEqual(len(files), 46) @attr("integration") def testListBlockParents(self): """listBlockParents returns block parents""" self.dbs = DBSReader(self.endpoint) parents = self.dbs.listBlockParents('/Jet/Run2011A-PromptReco-v1/RECO#f8d36af3-4fb6-11e0-9d39-00151755cb60') self.assertEqual(1, len(parents)) self.assertEqual('/Jet/Run2011A-v1/RAW#37cf2a40-4e0e-11e0-9833-00151755cb60', parents[0]['Name']) sites = [x for x in parents[0]['StorageElementList'] if x.find("cern.ch") > -1] self.assertTrue(sites) self.assertFalse(self.dbs.listBlockParents('/Jet/Run2011A-PromptReco-v1/RECO#f8d36af3-4fb6-11e0-9d39-00151755cb60dsl')) @attr("integration") def testBlockIsOpen(self): """blockIsOpen checks if a block is open""" self.dbs = DBSReader(self.endpoint) self.assertFalse(self.dbs.blockIsOpen(BLOCK)) @attr("integration") def testBlockToDatasetPath(self): """blockToDatasetPath extracts path from block name""" self.dbs = DBSReader(self.endpoint) self.assertEqual(self.dbs.blockToDatasetPath(BLOCK), DATASET) self.assertFalse(self.dbs.blockToDatasetPath(BLOCK + 'asas'))
def importDataset(self, sourceDBS, sourceDatasetPath, targetDBS, onlyClosed=True): """ _importDataset_ Import a dataset into the local scope DBS with full parentage hierarchy (at least not slow, because branches info is dropped) - *sourceDBS* : URL for input DBS instance - *sourceDatasetPath* : Dataset Path to be imported - *targetDBS* : URL for DBS to have dataset imported to """ reader = DBSReader(sourceDBS) inputBlocks = reader.getFileBlocksInfo(sourceDatasetPath, onlyClosed, locations=False) blkCounter = 0 for inputBlock in inputBlocks: block = inputBlock['Name'] # // # // Test block does not exist in target #// blkCounter = blkCounter + 1 msg = "Importing block %s of %s: %s " % (blkCounter, len(inputBlocks), block) logging.debug(msg) if self.reader.blockExists(block): # // # // block exists #// If block is closed dont attempt transfer if str(inputBlock['OpenForWriting']) != '1': msg = "Block already exists in target DBS and is closed:\n" msg += " ==> %s\n" % block msg += "Skipping Import of that block" logging.warning(msg) locations = reader.listFileBlockLocation(block) # only empty file blocks can have no location if not locations and str( inputBlock['NumberOfFiles']) != "0": msg = "Error in DBSWriter.importDataset\n" msg += "Block has no locations defined: %s" % block raise DBSWriterError(msg) logging.info("Update block locations to:") for sename in locations: self.dbs.addReplicaToBlock(block, sename) logging.info(sename) continue try: self.dbs.migrateDatasetContents(sourceDBS, targetDBS, sourceDatasetPath, block_name=block, noParentsReadOnly=False) except DbsException as ex: msg = "Error in DBSWriter.importDataset\n" msg += "Could not write content of dataset:\n ==> %s\n" % ( sourceDatasetPath, ) msg += "Block name:\n ==> %s\n" % block msg += "%s\n" % formatEx(ex) raise DBSWriterError(msg) locations = reader.listFileBlockLocation(block) # only empty file blocks can have no location if not locations and str(inputBlock['NumberOfFiles']) != "0": msg = "Error in DBSWriter.importDataset\n" msg += "Block has no locations defined: %s" % block raise DBSWriterError(msg) for sename in locations: self.dbs.addReplicaToBlock(block, sename) return
def importDatasetWithExistingParents(self, sourceDBS, sourceDatasetPath, targetDBS, onlyClosed=True): """ _importDataset_ Import a dataset into the local scope DBS. It complains if the parent dataset are not there!! - *sourceDBS* : URL for input DBS instance - *sourceDatasetPath* : Dataset Path to be imported - *targetDBS* : URL for DBS to have dataset imported to """ reader = DBSReader(sourceDBS) inputBlocks = reader.getFileBlocksInfo(sourceDatasetPath, onlyClosed, locations=False) for inputBlock in inputBlocks: block = inputBlock['Name'] # // # // Test block does not exist in target #// if self.reader.blockExists(block): # // # // block exists #// If block is closed dont attempt transfer if str(inputBlock['OpenForWriting']) != '1': msg = "Block already exists in target DBS and is closed:\n" msg += " ==> %s\n" % block msg += "Skipping Import of that block" logging.warning(msg) locations = reader.listFileBlockLocation(block) # only empty file blocks can have no location if not locations and str( inputBlock['NumberOfFiles']) != "0": msg = "Error in DBSWriter.importDatasetWithExistingParents\n" msg += "Block has no locations defined: %s" % block raise DBSWriterError(msg) logging.info("Update block locations to:") for sename in locations: self.dbs.addReplicaToBlock(block, sename) logging.info(sename) continue try: xferData = reader.dbs.listDatasetContents( sourceDatasetPath, block) except DbsException as ex: msg = "Error in DBSWriter.importDatasetWithExistingParents\n" msg += "Could not read content of dataset:\n ==> %s\n" % ( sourceDatasetPath, ) msg += "Block name:\n ==> %s\n" % block msg += "%s\n" % formatEx(ex) raise DBSWriterError(msg) try: self.dbs.insertDatasetContents(xferData) except DbsException as ex: msg = "Error in DBSWriter.importDatasetWithExistingParents\n" msg += "Could not write content of dataset:\n ==> %s\n" % ( sourceDatasetPath, ) msg += "Block name:\n ==> %s\n" % block msg += "%s\n" % formatEx(ex) raise DBSWriterError(msg) del xferData locations = reader.listFileBlockLocation(block) # only empty file blocks can have no location if not locations and str(inputBlock['NumberOfFiles']) != "0": msg = "Error in DBSWriter.importDatasetWithExistingParents\n" msg += "Block has no locations defined: %s" % block raise DBSWriterError(msg) for sename in locations: self.dbs.addReplicaToBlock(block, sename) return
def importDatasetWithoutParentage(self, sourceDBS, sourceDatasetPath, targetDBS, onlyClosed=True): """ _importDataset_ Import a dataset into the local scope DBS with one level parentage, however it has severe limitation on its use due to the "ReadOnly" concept. - *sourceDBS* : URL for input DBS instance - *sourceDatasetPath* : Dataset Path to be imported - *targetDBS* : URL for DBS to have dataset imported to """ reader = DBSReader(sourceDBS) inputBlocks = reader.getFileBlocksInfo(sourceDatasetPath, onlyClosed) for inputBlock in inputBlocks: block = inputBlock['Name'] # // # // Test block does not exist in target #// if self.reader.blockExists(block): # // # // block exists #// If block is closed dont attempt transfer if str(inputBlock['OpenForWriting']) != '1': msg = "Block already exists in target DBS and is closed:\n" msg += " ==> %s\n" % block msg += "Skipping Import of that block" logging.warning(msg) locations = reader.listFileBlockLocation(block) # only empty file blocks can have no location if not locations and str( inputBlock['NumberOfFiles']) != "0": msg = "Error in DBSWriter.importDatasetWithoutParentage\n" msg += "Block has no locations defined: %s" % block raise DBSWriterError(msg) logging.info("Update block locations to:") for sename in locations: self.dbs.addReplicaToBlock(block, sename) logging.info(sename) continue try: self.dbs.migrateDatasetContents(sourceDBS, targetDBS, sourceDatasetPath, block_name=block, noParentsReadOnly=True) except DbsException as ex: msg = "Error in DBSWriter.importDatasetWithoutParentage\n" msg += "Could not write content of dataset:\n ==> %s\n" % ( sourceDatasetPath, ) msg += "Block name:\n ==> %s\n" % block msg += "%s\n" % formatEx(ex) raise DBSWriterError(msg) locations = reader.listFileBlockLocation(block) # only empty file blocks can have no location if not locations and str(inputBlock['NumberOfFiles']) != "0": msg = "Error in DBSWriter.importDatasetWithoutParentage\n" msg += "Block has no locations defined: %s" % block raise DBSWriterError(msg) for sename in locations: self.dbs.addReplicaToBlock(block, sename)
def importDatasetWithoutParentage(self, sourceDBS, sourceDatasetPath, targetDBS, onlyClosed = True): """ _importDataset_ Import a dataset into the local scope DBS with one level parentage, however it has severe limitation on its use due to the "ReadOnly" concept. - *sourceDBS* : URL for input DBS instance - *sourceDatasetPath* : Dataset Path to be imported - *targetDBS* : URL for DBS to have dataset imported to """ reader = DBSReader(sourceDBS) inputBlocks = reader.getFileBlocksInfo(sourceDatasetPath, onlyClosed, locations = False) for inputBlock in inputBlocks: block = inputBlock['Name'] # // # // Test block does not exist in target #// if self.reader.blockExists(block): # // # // block exists #// If block is closed dont attempt transfer if str(inputBlock['OpenForWriting']) != '1': msg = "Block already exists in target DBS and is closed:\n" msg += " ==> %s\n" % block msg += "Skipping Import of that block" logging.warning(msg) locations = reader.listFileBlockLocation(block) # only empty file blocks can have no location if not locations and str(inputBlock['NumberOfFiles']) != "0": msg = "Error in DBSWriter.importDatasetWithoutParentage\n" msg += "Block has no locations defined: %s" % block raise DBSWriterError(msg) logging.info("Update block locations to:") for sename in locations: self.dbs.addReplicaToBlock(block,sename) logging.info(sename) continue try: self.dbs.migrateDatasetContents(sourceDBS, targetDBS, sourceDatasetPath, block_name=block, noParentsReadOnly = True ) except DbsException as ex: msg = "Error in DBSWriter.importDatasetWithoutParentage\n" msg += "Could not write content of dataset:\n ==> %s\n" % ( sourceDatasetPath,) msg += "Block name:\n ==> %s\n" % block msg += "%s\n" % formatEx(ex) raise DBSWriterError(msg) locations = reader.listFileBlockLocation(block) # only empty file blocks can have no location if not locations and str(inputBlock['NumberOfFiles']) != "0": msg = "Error in DBSWriter.importDatasetWithoutParentage\n" msg += "Block has no locations defined: %s" % block raise DBSWriterError(msg) for sename in locations: self.dbs.addReplicaToBlock(block,sename)
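# A sketch of how a caller might choose among the three import flavours defined
# above, based only on their docstrings (the method names are real; the chooser
# itself is hypothetical):
def importWithParentagePolicy(writer, sourceDBS, path, targetDBS, parentsInTarget, fullParentage):
    if parentsInTarget:
        # parents must already exist in the target scope, else DBS complains
        writer.importDatasetWithExistingParents(sourceDBS, path, targetDBS)
    elif fullParentage:
        # migrates the full parentage hierarchy (branch info is dropped)
        writer.importDataset(sourceDBS, path, targetDBS)
    else:
        # one level of parentage only, limited by the "ReadOnly" concept
        writer.importDatasetWithoutParentage(sourceDBS, path, targetDBS)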
@attr("integration")
def testGetDBSSummaryInfo(self):
    """getDBSSummaryInfo returns summary of dataset and block"""
    self.dbs = DBSReader(self.endpoint)
    dataset = self.dbs.getDBSSummaryInfo(DATASET)
    self.assertEqual(dataset["path"], DATASET)
    self.assertEqual(dataset["block"], "")
    self.assertEqual(dataset["NumberOfEvents"], "22075")
    self.assertEqual(dataset["NumberOfBlocks"], "46")
    self.assertEqual(dataset["total_size"], "4001680824")
    self.assertEqual(dataset["NumberOfFiles"], "49")
    self.assertEqual(dataset["NumberOfLumis"], "7223")

    block = self.dbs.getDBSSummaryInfo(DATASET, BLOCK)
    self.assertEqual(block["path"], "")
    self.assertEqual(block["block"], BLOCK)
    self.assertEqual(block["NumberOfEvents"], "377")
    self.assertEqual(block["NumberOfBlocks"], "1")
    self.assertEqual(block["total_size"], "150780132")
    self.assertEqual(block["NumberOfFiles"], "2")
    self.assertEqual(block["NumberOfLumis"], "94")

    self.assertRaises(DBSReaderError, self.dbs.getDBSSummaryInfo, DATASET + "blah")
    self.assertRaises(DBSReaderError, self.dbs.getDBSSummaryInfo, DATASET, BLOCK + "asas")
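# Note: the summary service returns every counter as a string ("22075", "46",
# ...), which is why the assertions above compare against string literals.
# A hypothetical helper (not part of DBSReader) to get integer counts:
#
# def summaryCounts(summary):
#     """Cast the string-valued counters of a DBS summary dict to int."""
#     keys = ("NumberOfEvents", "NumberOfBlocks", "NumberOfFiles", "NumberOfLumis")
#     return dict((key, int(summary[key])) for key in keys)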
BLOCK + "asas") @attr("integration") def testGetFileBlocksInfo(self): """getFileBlocksInfo returns block info, including location lookup""" self.dbs = DBSReader(self.endpoint) blocks = self.dbs.getFileBlocksInfo(DATASET) block = self.dbs.getFileBlocksInfo(DATASET, blockName=BLOCK) self.assertEqual(1, len(block)) block = block[0] self.assertEqual(46, len(blocks)) self.assertTrue(block["Name"] in [x["Name"] for x in blocks]) self.assertEqual(BLOCK, block["Name"]) # self.assertEqual(377, block['NumberOfEvents']) self.assertEqual(150780132, block["BlockSize"]) self.assertEqual(2, block["NumberOfFiles"]) # possibly fragile but assume block located at least at cern sites = [x["Name"] for x in block["StorageElementList"] if x["Name"].find("cern.ch") > -1] self.assertTrue(sites) # weird error handling - depends on whether block or dataset is missing self.assertRaises(DBSReaderError, self.dbs.getFileBlocksInfo, DATASET + "blah") self.assertFalse(self.dbs.getFileBlocksInfo(DATASET, blockName=BLOCK + "asas")) @attr("integration") def testListFileBlocks(self): """listFileBlocks returns block names in dataset""" self.dbs = DBSReader(self.endpoint) blocks = self.dbs.listFileBlocks(DATASET) # block is closed block = self.dbs.listFileBlocks(DATASET, blockName=BLOCK, onlyClosedBlocks=True)[0] self.assertEqual(block, BLOCK) self.assertTrue(BLOCK in block) @attr("integration") def testListOpenFileBlocks(self): """listOpenFileBlocks finds open blocks""" # hard to find a dataset with open blocks, so don't bother self.dbs = DBSReader(self.endpoint) self.assertFalse(self.dbs.listOpenFileBlocks(DATASET)) @attr("integration") def testBlockExists(self): """blockExists returns existence of blocks""" self.dbs = DBSReader(self.endpoint) self.assertTrue(self.dbs.blockExists(BLOCK)) self.assertFalse(self.dbs.blockExists(DATASET + "#somethingelse")) @attr("integration") def testListFilesInBlock(self): """listFilesInBlock returns files in block""" self.dbs = DBSReader(self.endpoint) self.assertTrue(FILE in [x["LogicalFileName"] for x in self.dbs.listFilesInBlock(BLOCK)]) self.assertRaises(DBSReaderError, self.dbs.listFilesInBlock, DATASET + "#blah") @attr("integration") def testListFilesInBlockWithParents(self): """listFilesInBlockWithParents gets files with parents for a block""" # hope PromptReco doesn't get deleted self.dbs = DBSReader(self.endpoint) files = self.dbs.listFilesInBlockWithParents( "/Jet/Run2011A-PromptReco-v1/RECO#f8d36af3-4fb6-11e0-9d39-00151755cb60" ) self.assertEqual(1, len(files)) self.assertEqual( "/Jet/Run2011A-PromptReco-v1/RECO#f8d36af3-4fb6-11e0-9d39-00151755cb60", files[0]["Block"]["Name"] ) self.assertEqual( "/store/data/Run2011A/Jet/RAW/v1/000/160/433/24B46223-0D4E-E011-B573-0030487C778E.root", files[0]["ParentList"][0]["LogicalFileName"], ) self.assertRaises(DBSReaderError, self.dbs.listFilesInBlockWithParents, BLOCK + "asas") @attr("integration") def testLfnsInBlock(self): """lfnsInBlock returns lfns in block""" self.dbs = DBSReader(self.endpoint) self.assertTrue(FILE in self.dbs.lfnsInBlock(BLOCK)) self.assertRaises(DBSReaderError, self.dbs.lfnsInBlock, BLOCK + "asas") @attr("integration") def testListFileBlockLocation(self): """listFileBlockLocation returns block location""" self.dbs = DBSReader(self.endpoint) # assume one site is cern sites = [x for x in self.dbs.listFileBlockLocation(BLOCK) if x.find("cern.ch") > -1] self.assertTrue(sites) # doesn't raise on non-existant block self.assertFalse(self.dbs.listFileBlockLocation(BLOCK + "blah")) @attr("integration") def 
@attr("integration")
def testGetFileBlock(self):
    """getFileBlock returns block"""
    self.dbs = DBSReader(self.endpoint)
    block = self.dbs.getFileBlock(BLOCK)
    self.assertEqual(len(block), 1)
    block = block[BLOCK]
    self.assertEqual(2, len(block["Files"]))
    self.assertRaises(DBSReaderError, self.dbs.getFileBlock, BLOCK + "asas")

@attr("integration")
def testGetFileBlockWithParents(self):
    """getFileBlockWithParents returns block and parents"""
    self.dbs = DBSReader(self.endpoint)
    block = self.dbs.getFileBlockWithParents(
        "/Jet/Run2011A-PromptReco-v1/RECO#f8d36af3-4fb6-11e0-9d39-00151755cb60")
    self.assertEqual(len(block), 1)
    block = block["/Jet/Run2011A-PromptReco-v1/RECO#f8d36af3-4fb6-11e0-9d39-00151755cb60"]
    self.assertEqual("/store/data/Run2011A/Jet/RAW/v1/000/160/433/24B46223-0D4E-E011-B573-0030487C778E.root",
                     block["Files"][0]["ParentList"][0]["LogicalFileName"])
    self.assertRaises(DBSReaderError, self.dbs.getFileBlockWithParents, BLOCK + "asas")

@attr("integration")
def testGetFiles(self):
    """getFiles returns files in dataset"""
    self.dbs = DBSReader(self.endpoint)
    files = self.dbs.getFiles(DATASET)
    self.assertEqual(len(files), 46)

@attr("integration")
def testListBlockParents(self):
    """listBlockParents returns block parents"""
    self.dbs = DBSReader(self.endpoint)
    parents = self.dbs.listBlockParents(
        "/Jet/Run2011A-PromptReco-v1/RECO#f8d36af3-4fb6-11e0-9d39-00151755cb60")
    self.assertEqual(1, len(parents))
    self.assertEqual("/Jet/Run2011A-v1/RAW#37cf2a40-4e0e-11e0-9833-00151755cb60",
                     parents[0]["Name"])
    sites = [x for x in parents[0]["StorageElementList"] if x.find("cern.ch") > -1]
    self.assertTrue(sites)
    self.assertFalse(self.dbs.listBlockParents(
        "/Jet/Run2011A-PromptReco-v1/RECO#f8d36af3-4fb6-11e0-9d39-00151755cb60dsl"))

@attr("integration")
def testBlockIsOpen(self):
    """blockIsOpen checks if a block is open"""
    self.dbs = DBSReader(self.endpoint)
    self.assertFalse(self.dbs.blockIsOpen(BLOCK))

@attr("integration")
def testBlockToDatasetPath(self):
    """blockToDatasetPath extracts path from block name"""
    self.dbs = DBSReader(self.endpoint)
    self.assertEqual(self.dbs.blockToDatasetPath(BLOCK), DATASET)
    self.assertFalse(self.dbs.blockToDatasetPath(BLOCK + "asas"))
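# Conventional entry point so the integration tests can be run directly; added
# here as a sketch, assuming the original module does not already provide one.
if __name__ == '__main__':
    unittest.main()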