def test_5_harvest_multi(self):
    '''Harvest all datasets in parallel with two worker processes.'''
    ds_infos, _ = harvest.get_ds_infos('.*')
    dest = '/gridgroup/cms/cbernet/unittests/multi'
    harvest.harvest(ds_infos, dest, ntgzs=2, nworkers=2, delete='y')
    # both datasets must end up in the destination directory
    produced = os.listdir(dest)
    self.assertEqual(len(produced), 2)
def test_6_harvest_sequential(self):
    '''Harvest the datasets one after the other, in two separate calls.'''
    ds_infos, _ = harvest.get_ds_infos('.*')
    dest = '/gridgroup/cms/cbernet/unittests/multi'
    first_batch, second_batch = ds_infos[:1], ds_infos[1:2]
    # first pass wipes the destination, second pass must not
    harvest.harvest(first_batch, dest, ntgzs=2, nworkers=1, delete='y')
    harvest.harvest(second_batch, dest, ntgzs=2, nworkers=1, delete='n')
    produced = os.listdir(dest)
    self.assertEqual(len(produced), 2)
def test_1_ds_info(self):
    '''test that dataset info can be readout and filtered'''
    infos, done = harvest.get_ds_infos('.*DY1Jets.*')
    self.assertEqual(len(infos), 2)
    self.assertEqual(len(done), 0)
    for info in infos:
        self.assertTrue('path' in info)
        self.assertTrue(len(info['tgzs']['0000']) > 1)
    # test that datasets already harvested are masked:
    # create dummy harvesting information for the second dataset
    harv_info = {
        'time': time.time(),
        'parent': None,
        'dir': 'foo',
        'tgzs': infos[1]['tgzs']
    }
    infos[1]['harvesting'] = harv_info
    harvest.datasetdb.insert('se', infos[1])
    infos, done = harvest.get_ds_infos('.*DY1Jets.*')
    print('selected')
    pprint.pprint(infos)
    print('done')
    pprint.pprint(done)
    self.assertEqual(len(infos), 1)
    self.assertEqual(len(done), 1)
    # add one more tgz to a sample that has already been harvested:
    # the dataset must be considered "to harvest" again
    newinfo = copy.copy(done[0])
    newinfo['tgzs']['0000'].append('heppyOutput_666.tgz')
    harvest.datasetdb.insert('se', newinfo)
    infos2, done2 = harvest.get_ds_infos('.*DY1Jets.*')
    self.assertEqual(len(infos2), 2)
    self.assertEqual(len(done2), 0)
    # add one more subdir: must also trigger re-harvesting
    newinfo = copy.copy(done[0])
    newinfo['tgzs']['0001'] = []
    # FIX: the modified info was never written back to the db, so the
    # assertions below were re-testing the previous state instead of
    # the new-subdir case.
    harvest.datasetdb.insert('se', newinfo)
    infos2, done2 = harvest.get_ds_infos('.*DY1Jets.*')
    self.assertEqual(len(infos2), 2)
    self.assertEqual(len(done2), 0)
def test_4_harvest_one(self):
    '''Harvest a single dataset and record the result in the db.'''
    ds_infos, _ = harvest.get_ds_infos('.*DY1Jets.*_ext')
    dest = '/gridgroup/cms/cbernet/unittests/single'
    ds = ds_infos[0]
    start = time.time()
    harvested = harvest.harvest_one(ds, dest, ntgzs=2)
    harvest.datasetdb.insert('se', harvested)
    # exactly one entry, named after the dataset, must exist at the destination
    listing = os.listdir(dest)
    self.assertEqual(len(listing), 1)
    self.assertEqual(listing[0], ds['name'])
    # the db record must carry a harvesting timestamp later than our start time
    records = harvest.datasetdb.find('se', {'name': ds['name']})
    self.assertEqual(len(records), 1)
    self.assertTrue(records[0]['harvesting']['time'] > start)
def test_2_fetch(self):
    '''Fetch, unpack, and hadd two tgz chunks of one dataset.'''
    ds_infos, _ = harvest.get_ds_infos('.*DY1Jets.*_ext')
    workdir = tempfile.mkdtemp()
    n_tgzs = 2
    harvest.fetch(ds_infos[0], workdir, n_tgzs)
    harvest.unpack(ds_infos[0], workdir, n_tgzs)
    # unpacking must yield exactly these two chunk directories
    unpacked = os.listdir(workdir)
    self.assertListEqual(unpacked, [
        '190503%DY1JetsToLL_M50_LO_ext%tt_DY_nominal_Chunk10',
        '190503%DY1JetsToLL_M50_LO_ext%tt_DY_nominal_Chunk1'
    ])
    # hadd merges the chunks into a single output directory
    harvest.hadd(workdir)
    merged = os.listdir(workdir)
    self.assertEqual(len(merged), 1)
    self.assertEqual(merged[0],
                     '190503%DY1JetsToLL_M50_LO_ext%tt_DY_nominal')
    shutil.rmtree(workdir)