def test_structure_from_path(self):
    s = DRSFile.find_structure_from_path(self.fn)
    self.assertEqual(s, 'cmip5')
    s = DRSFile.find_structure_from_path(self.fn, allow_multiples=True)
    self.assertEqual(s, ['cmip5'])
    self.assertRaises(Exception, DRSFile.find_structure_from_path,
                      '/no/valid/file_path')
def test_find_structure_in_path(self):
    s = DRSFile.find_structure_in_path('/tmp/some_temp_solr_core/cmip5')
    self.assertEqual(s, 'cmip5')
    s = DRSFile.find_structure_in_path('/tmp/some_temp_solr_core/cmip5',
                                       allow_multiples=True)
    self.assertEqual(s, ['cmip5'])
    self.assertRaises(Exception, DRSFile.find_structure_in_path,
                      '/no/valid/path')
def test_compare(self):
    fn2 = os.path.join(self.tmpdir, self.files[1])
    drs2 = DRSFile.from_path(fn2)
    self.assertTrue(self.drs == self.drs)
    self.assertFalse(self.drs == drs2)
    self.assertFalse(drs2 == fn2)
def test_solr_search(self):
    # test path_only search
    res = DRSFile.solr_search(path_only=True, variable='tauu')
    self.assertEqual(list(res), [
        u'/tmp/some_temp_solr_core/cmip5/output1/MOHC/HadCM3/decadal2008/mon/atmos/Amon/r9i3p1/v20120523/tauu/tauu_Amon_HadCM3_decadal2008_r9i3p1_200811-201812.nc'
    ])

    # test drs search (results are DRSFile instances)
    res = DRSFile.solr_search(variable='ua')
    for i in res:
        self.assertTrue(isinstance(i, DRSFile))

    # use drs_structure
    res = DRSFile.solr_search(drs_structure=CMIP5)
    for j, i in enumerate(res):
        self.assertTrue(isinstance(i, DRSFile))
    # exactly three results are expected
    self.assertEqual(j + 1, 3)
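# Illustrative sketch (not part of the test suite): how DRSFile.solr_search,
# exercised above, is typically used from client code. Only the facet already
# present in the fixtures ('variable') is used; any other facet name would be
# an assumption.
def _example_solr_search_usage():
    # path_only=True yields plain file paths, which is cheaper when only the
    # location is needed
    paths = list(DRSFile.solr_search(path_only=True, variable='ua'))
    # the default mode yields DRSFile instances with the parsed DRS attributes
    drs_paths = [f.to_path() for f in DRSFile.solr_search(variable='ua')]
    return paths, drs_paths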
def setUp(self):
    os.environ['EVALUATION_SYSTEM_CONFIG_FILE'] = os.path.dirname(
        __file__) + '/test.conf'
    config.reloadConfiguration()
    self.solr_port = config.get('solr.port')
    self.solr_host = config.get('solr.host')

    # test instances, check they are as expected
    self.all_files = SolrCore(core='files', host=self.solr_host,
                              port=self.solr_port)
    self.latest = SolrCore(core='latest', host=self.solr_host,
                           port=self.solr_port)
    self.assertEquals(self.all_files.status()['index']['numDocs'], 0)
    self.assertEquals(self.latest.status()['index']['numDocs'], 0)

    # add some files to the cores
    supermakedirs('/tmp/some_temp_solr_core/', 0777)
    self.tmpdir = '/tmp/some_temp_solr_core'
    self.orig_dir = DRSFile.DRS_STRUCTURE[CMIP5]['root_dir']
    DRSFile.DRS_STRUCTURE[CMIP5]['root_dir'] = self.tmpdir

    self.files = [
        'cmip5/output1/MOHC/HadCM3/historical/mon/aerosol/aero/r2i1p1/v20110728/wetso2/wetso2_aero_HadCM3_historical_r2i1p1_190912-193411.nc',
        'cmip5/output1/MOHC/HadCM3/decadal2008/mon/atmos/Amon/r9i3p1/v20120523/tauu/tauu_Amon_HadCM3_decadal2008_r9i3p1_200811-201812.nc',
        'cmip5/output1/MOHC/HadCM3/decadal2009/mon/atmos/Amon/r7i2p1/v20110719/ua/ua_Amon_HadCM3_decadal2009_r7i2p1_200911-201912.nc',
        'cmip5/output1/MOHC/HadCM3/decadal2009/mon/atmos/Amon/r7i2p1/v20110819/ua/ua_Amon_HadCM3_decadal2009_r7i2p1_200911-201912.nc',
        'cmip5/output1/MOHC/HadCM3/decadal2009/mon/atmos/Amon/r7i2p1/v20110419/ua/ua_Amon_HadCM3_decadal2009_r7i2p1_200911-201912.nc'
    ]
    for f in self.files:
        abs_path = os.path.abspath(os.path.join(self.tmpdir, f))
        try:
            os.makedirs(os.path.dirname(abs_path))
        except:  # pragma nocover
            pass
        with open(abs_path, 'w') as f_out:
            f_out.write(' ')

    dump_file = self.tmpdir + '/dump1.csv'
    # add the files to solr
    SolrCore.dump_fs_to_file(self.tmpdir + '/cmip5', dump_file)
    SolrCore.load_fs_from_file(dump_file, abort_on_errors=True,
                               core_all_files=self.all_files,
                               core_latest=self.latest)

    self.fn = os.path.join(self.tmpdir, self.files[0])
    self.drs = DRSFile.from_path(self.fn)
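# Hedged sketch of a matching tearDown (not shown in this section): it undoes
# the DRS_STRUCTURE patch and removes the temporary tree created in setUp.
# Whether SolrCore.delete() accepts the catch-all query '*:*' is an assumption
# about its delete-by-query semantics; only the query syntax differs from the
# delete() calls used in load_fs_from_file below.
def tearDown(self):
    import shutil
    # restore the original CMIP5 root directory patched in setUp
    DRSFile.DRS_STRUCTURE[CMIP5]['root_dir'] = self.orig_dir
    # remove the temporary fixture files (including the dump file)
    if os.path.isdir(self.tmpdir):
        shutil.rmtree(self.tmpdir)
    # empty the test cores so the next setUp sees numDocs == 0 again
    # (assumes '*:*' is passed through to Solr as a delete-by-query)
    self.all_files.delete('*:*')
    self.latest.delete('*:*')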
@staticmethod
def load_fs_from_file(dump_file, batch_size=10000, abort_on_errors=False,
                      core_all_files=None, core_latest=None):
    """This is the opposite method of :class:`SolrCore.dump_fs_to_file`. It
    loads the file system information into Solr from the given file. The
    syntax is defined in the mentioned dump method. Unlike what was previously
    done, this method loads the information from a file and decides whether
    each entry should be added only to the common file core, holding the index
    of all files, or also to the *latest* core, holding information about the
    latest version of all files (remember that in CMIP5 not all files are
    versioned, just the datasets).

    :param dump_file: the path to the file that contains the dump. If the
        file ends with '.gz' it is assumed to be gzipped.
    :param batch_size: number of entries that will be written to the Solr
        main core at once (the latest core will be flushed at the same time
        and is guaranteed to hold at most as many entries as the main one).
    :param abort_on_errors: if the ingestion should be aborted as soon as an
        error is found, i.e. a file that can't be ingested. Most of the time
        the dump file contains many entries that are no data at all.
    :param core_all_files: if desired you can pass the SolrCore managing all
        files (if not, the one named 'files' will be used, taking the
        configuration from the config file).
    :param core_latest: if desired you can pass the SolrCore managing the
        latest file versions (if not, the one named 'latest' will be used,
        taking the configuration from the config file).
    """
    if dump_file.endswith('.gz'):
        import gzip
        # gzip gained "with" statement support only in Python 2.7
        # (http://docs.python.org/2/library/gzip.html).
        # Let's leave this Python 2.6 compatible...
        f = gzip.open(dump_file, 'rb')
    else:
        f = open(dump_file, 'r')

    if core_latest is None:
        core_latest = SolrCore(core='latest')
    if core_all_files is None:
        core_all_files = SolrCore(core='files')

    try:
        batch_count = 0
        batch = []
        batch_latest = []
        batch_latest_new = {}
        latest_versions = {}
        header = True
        import re
        meta = re.compile('[^ \t]{1,}[ \t]{1,}(.*)$')
        for line in f:
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            if header:
                if line.startswith(META_DATA.CRAWL_DIR):
                    crawl_dir = meta.match(line).group(1).strip()
                    # Everything below the crawled directory gets replaced, so
                    # delete those entries first. The first slash has to be
                    # escaped, since Solr would otherwise expect a regexp
                    # (new in Solr 4.0).
                    core_all_files.delete('file:\\%s*' % crawl_dir)
                    core_latest.delete('file:\\%s*' % crawl_dir)
                elif line.startswith(META_DATA.DATA):
                    header = False
                continue
            else:
                file_path, timestamp = line.split(',')
                try:
                    drs_file = DRSFile.from_path(file_path)
                    metadata = SolrCore.to_solr_dict(drs_file)
                    ts = float(timestamp)
                    metadata['timestamp'] = ts
                    metadata['creation_time'] = timestamp_to_solr_date(ts)
                    batch.append(metadata)

                    if drs_file.is_versioned():
                        version = latest_versions.get(
                            drs_file.to_dataset(versioned=False), None)
                        if version is None or drs_file.get_version() > version:
                            # unknown or new version, update
                            version = drs_file.get_version()
                            latest_versions[
                                drs_file.to_dataset(versioned=False)] = version
                            batch_latest_new[
                                drs_file.to_dataset(versioned=False)] = metadata
                        if not drs_file.get_version() < version:
                            batch_latest.append(metadata)
                    else:
                        # if not versioned, always add to latest
                        batch_latest_new[
                            drs_file.to_dataset(versioned=False)] = metadata
                        batch_latest.append(metadata)

                    if len(batch) >= batch_size:
                        print "Sending entries %s-%s" % (
                            batch_count * batch_size,
                            (batch_count + 1) * batch_size)
                        core_all_files.post(batch)
                        batch = []
                        batch_count += 1
                        if batch_latest:
                            core_latest.post(batch_latest)
                            batch_latest = []
                            batch_latest_new = {}
                except:
                    print "Can't ingest file %s" % file_path
                    if abort_on_errors:
                        raise

        # flush whatever is left over
        if len(batch) > 0:
            print "Sending last %s entries and %s entries to latest core" % (
                len(batch), len(batch_latest))
            core_all_files.post(batch)
            batch = []
            batch_count += 1
            if batch_latest:
                core_latest.post(batch_latest)
                batch_latest = []
    finally:
        f.close()
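# Illustrative sketch (not part of the original module): loading an existing
# dump into explicitly passed cores, as the test setUp above does. The dump
# path shown here is a placeholder for this example only.
def _example_load_dump():
    all_files = SolrCore(core='files')
    latest = SolrCore(core='latest')
    SolrCore.load_fs_from_file('/tmp/some_crawl/dump1.csv.gz',
                               abort_on_errors=False,
                               core_all_files=all_files,
                               core_latest=latest)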
@staticmethod
def dump_fs_to_file(start_dir, dump_file, batch_size=1000, check=False,
                    abort_on_errors=False):
    """This is the method currently used for ingestion. It generates a file
    with a listing of paths and timestamps taken from the file system. The
    syntax of the file looks like this::

        crawl_dir	/path/to/some/directory

        data
        /path/to/a/file,1239879.0
        /path/to/another/file,1239879.0
        ...

    The crawl_dir indicates the directory being crawled and results in the
    deletion of all entries whose path starts with it (i.e. everything under
    that path will be *replaced*). Generating this file takes at least 8
    hours for the whole /miklip/integration/data4miklip directory. It would
    be nice to generate it in a different manner (e.g. using the GPFS policy
    API).

    :param start_dir: the directory from which the file system will be
        crawled.
    :param dump_file: the path to the file that will contain the dump. If the
        file ends with '.gz' the resulting file will be gzipped (preferred).
    :param batch_size: number of entries that will be written to disk at
        once. This might help pin-pointing crashes.
    :param check: if the paths should be checked. Checked paths are
        guaranteed to be accepted later on, but normally this is too slow for
        this phase, so the default is False.
    :param abort_on_errors: if dumping should be aborted as soon as an error
        is found, i.e. a file that can't be ingested. Most of the time many
        files are found that are no data at all."""
    log.debug('starting sequential ingest')

    if dump_file.endswith('.gz'):
        import gzip
        # gzip gained "with" statement support only in Python 2.7
        # (http://docs.python.org/2/library/gzip.html).
        # Let's leave this Python 2.6 compatible...
        f = gzip.open(dump_file, 'wb')
    else:
        f = open(dump_file, 'w')

    try:
        batch_count = 0

        # store metadata
        f.write('%s\t%s\n' % (META_DATA.CRAWL_DIR, start_dir))
        # store data
        f.write('\n%s\n' % META_DATA.DATA)
        for path in dir_iter(start_dir):
            if check:
                try:
                    DRSFile.from_path(path)
                except:
                    if abort_on_errors:
                        raise
                    else:
                        print "Error ingesting %s" % path
                        continue
            ts = os.path.getmtime(path)
            f.write('%s,%s\n' % (path, ts))
            batch_count += 1
            if batch_count >= batch_size:
                f.flush()
                # reset so the file is flushed every batch_size entries
                batch_count = 0
    finally:
        f.close()
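# Illustrative sketch (not part of the original module): the dump/load round
# trip performed in the test setUp above, but writing a gzipped dump and
# ingesting with the default cores ('files' and 'latest' from the config
# file). The crawl directory is an assumed placeholder.
def _example_dump_and_ingest():
    dump_file = '/tmp/some_crawl/dump1.csv.gz'  # '.gz' suffix selects gzip output
    SolrCore.dump_fs_to_file('/tmp/some_crawl/cmip5', dump_file)
    SolrCore.load_fs_from_file(dump_file, abort_on_errors=True)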
def test_from_json(self):
    j = self.drs.to_json()
    t = DRSFile.from_json(j, CMIP5)
    self.assertTrue(isinstance(t, DRSFile))
    self.assertEqual(self.drs.to_path(), t.to_path())
def test_from_dict(self):
    d = self.drs.dict
    t = DRSFile.from_dict(d, CMIP5)
    self.assertTrue(isinstance(t, DRSFile))
    self.assertEqual(self.drs.to_path(), t.to_path())