Example #1
    def test_structure_from_path(self):

        s = DRSFile.find_structure_from_path(self.fn)
        self.assertEqual(s, 'cmip5')
        s = DRSFile.find_structure_from_path(self.fn, allow_multiples=True)
        self.assertEqual(s, ['cmip5'])
        self.assertRaises(Exception, DRSFile.find_structure_from_path,
                          '/no/valid/file_path')
Example #2
    def test_find_structure_in_path(self):

        s = DRSFile.find_structure_in_path('/tmp/some_temp_solr_core/cmip5')
        self.assertEqual(s, 'cmip5')
        s = DRSFile.find_structure_in_path('/tmp/some_temp_solr_core/cmip5',
                                           allow_multiples=True)
        self.assertEqual(s, ['cmip5'])
        self.assertRaises(Exception, DRSFile.find_structure_in_path,
                          '/no/valid/path')
Example #3
    def test_compare(self):
        fn2 = os.path.join(self.tmpdir, self.files[1])
        drs2 = DRSFile.from_path(fn2)

        self.assertTrue(self.drs == self.drs)
        self.assertFalse(self.drs == drs2)
        self.assertFalse(drs2 == fn2)
Example #4
    def test_solr_search(self):

        # test path_only search
        res = DRSFile.solr_search(path_only=True, variable='tauu')
        self.assertEqual(list(res), [
            u'/tmp/some_temp_solr_core/cmip5/output1/MOHC/HadCM3/decadal2008/mon/atmos/Amon/r9i3p1/v20120523/tauu/tauu_Amon_HadCM3_decadal2008_r9i3p1_200811-201812.nc'
        ])

        # test drs search
        res = DRSFile.solr_search(variable='ua')
        for i in res:
            self.assertTrue(isinstance(i, DRSFile))

        # use drs_structure
        res = DRSFile.solr_search(drs_structure=CMIP5)
        for j, i in enumerate(res):
            self.assertTrue(isinstance(i, DRSFile))
        self.assertEqual(j + 1, 3)
Example #5
    def setUp(self):
        os.environ['EVALUATION_SYSTEM_CONFIG_FILE'] = os.path.dirname(
            __file__) + '/test.conf'
        config.reloadConfiguration()
        self.solr_port = config.get('solr.port')
        self.solr_host = config.get('solr.host')
        # test instances, check they are as expected
        self.all_files = SolrCore(core='files',
                                  host=self.solr_host,
                                  port=self.solr_port)
        self.latest = SolrCore(core='latest',
                               host=self.solr_host,
                               port=self.solr_port)
        self.assertEqual(self.all_files.status()['index']['numDocs'], 0)
        self.assertEqual(self.latest.status()['index']['numDocs'], 0)

        # add some files to the cores
        supermakedirs('/tmp/some_temp_solr_core/', 0777)
        self.tmpdir = '/tmp/some_temp_solr_core'
        self.orig_dir = DRSFile.DRS_STRUCTURE[CMIP5]['root_dir']
        DRSFile.DRS_STRUCTURE[CMIP5]['root_dir'] = self.tmpdir

        self.files = [
            'cmip5/output1/MOHC/HadCM3/historical/mon/aerosol/aero/r2i1p1/v20110728/wetso2/wetso2_aero_HadCM3_historical_r2i1p1_190912-193411.nc',
            'cmip5/output1/MOHC/HadCM3/decadal2008/mon/atmos/Amon/r9i3p1/v20120523/tauu/tauu_Amon_HadCM3_decadal2008_r9i3p1_200811-201812.nc',
            'cmip5/output1/MOHC/HadCM3/decadal2009/mon/atmos/Amon/r7i2p1/v20110719/ua/ua_Amon_HadCM3_decadal2009_r7i2p1_200911-201912.nc',
            'cmip5/output1/MOHC/HadCM3/decadal2009/mon/atmos/Amon/r7i2p1/v20110819/ua/ua_Amon_HadCM3_decadal2009_r7i2p1_200911-201912.nc',
            'cmip5/output1/MOHC/HadCM3/decadal2009/mon/atmos/Amon/r7i2p1/v20110419/ua/ua_Amon_HadCM3_decadal2009_r7i2p1_200911-201912.nc'
        ]
        for f in self.files:
            abs_path = os.path.abspath(os.path.join(self.tmpdir, f))
            try:
                os.makedirs(os.path.dirname(abs_path))
            except:  # pragma nocover
                pass
            with open(abs_path, 'w') as f_out:
                f_out.write(' ')
        dump_file = self.tmpdir + '/dump1.csv'
        # add the files to solr
        SolrCore.dump_fs_to_file(self.tmpdir + '/cmip5', dump_file)
        SolrCore.load_fs_from_file(dump_file,
                                   abort_on_errors=True,
                                   core_all_files=self.all_files,
                                   core_latest=self.latest)

        self.fn = os.path.join(self.tmpdir, self.files[0])
        self.drs = DRSFile.from_path(self.fn)
Example #6
    def load_fs_from_file(dump_file,
                          batch_size=10000,
                          abort_on_errors=False,
                          core_all_files=None,
                          core_latest=None):
        """This is the opposite method of :class:`SolrCore.dump_fs_to_file`. It loads the files system information to Solr
from the given file. The syntax is defined already in the mentioned dump method.
Contrary to what was previously done, this method loads the information from a file and decides if it should be added
to just the common file core, holding the index of all files, or also to the *latest* core, holding information
about the latest version of all files (remember that in CMIP5 not all files are version, just the datasets).

:param dump_file: the path to the file that contains the dump. if the file ends with '.gz' the file is assumed to
be gziped.
:param batch_size: number of entries that will be written to the Solr main core (the latest core will be flushed at
the same time and is guaranteed to have at most as many as the other.
:param abort_on_errors: If dumping should get aborted as soon as an error is found, i.e. a file that can't be ingested.
 Most of the times there are many files being found in the dump file that are no data at all
:param core_all_files: if desired you can pass the SolrCore managing all the files (if not the one named 'files'will
be used, using the configuration from the config file).
:param core_latest: if desired you can pass the SolrCore managing the latest file versions (if not the one named
'latest' will be used, using the configuration from the config file).
"""

        if dump_file.endswith('.gz'):
            # print "Using gzip"
            import gzip
            # the with statement support started with python 2.7 (http://docs.python.org/2/library/gzip.html)
            # Let's leave this python 2.6 compatible...
            f = gzip.open(dump_file, 'rb')
        else:
            f = open(dump_file, 'r')

        if core_latest is None:
            core_latest = SolrCore(core='latest')
        if core_all_files is None:
            core_all_files = SolrCore(core='files')

        try:
            batch_count = 0
            batch = []
            batch_latest = []
            batch_latest_new = {}

            latest_versions = {}

            header = True

            import re
            meta = re.compile(r'[^ \t]+[ \t]+(.*)$')

            for line in f:
                line = line.strip()
                if not line or line.startswith('#'):
                    continue
                if header:
                    if line.startswith(META_DATA.CRAWL_DIR):
                        crawl_dir = meta.match(line).group(1).strip()
                        # we should delete these. We need to escape the first slash since Solr
                        # will otherwise expect a regexp (new in Solr 4.0)
                        core_all_files.delete('file:\\%s*' % crawl_dir)
                        core_latest.delete('file:\\%s*' % crawl_dir)
                    elif line.startswith(META_DATA.DATA):
                        header = False
                    continue
                else:
                    file_path, timestamp = line.split(',')
                    try:
                        drs_file = DRSFile.from_path(file_path)

                        metadata = SolrCore.to_solr_dict(drs_file)
                        ts = float(timestamp)
                        metadata['timestamp'] = ts
                        metadata['creation_time'] = timestamp_to_solr_date(ts)

                        batch.append(metadata)

                        if drs_file.is_versioned():
                            version = latest_versions.get(
                                drs_file.to_dataset(versioned=False), None)
                            if version is None or drs_file.get_version(
                            ) > version:
                                # unknown or new version, update
                                version = drs_file.get_version()
                                latest_versions[drs_file.to_dataset(
                                    versioned=False)] = version
                                batch_latest_new[drs_file.to_dataset(
                                    versioned=False)] = metadata
                                #batch_latest = batch_latest_new.values()

                            if not drs_file.get_version() < version:
                                # print latest_versions
                                #print drs_file.get_version(), version, metadata
                                batch_latest.append(metadata)
                        else:
                            # if not version always add to latest
                            batch_latest_new[drs_file.to_dataset(
                                versioned=False)] = metadata
                            batch_latest.append(metadata)

                        if len(batch) >= batch_size:
                            print "Sending entries %s-%s" % (
                                batch_count * batch_size,
                                (batch_count + 1) * batch_size)
                            core_all_files.post(batch)
                            batch = []
                            batch_count += 1
                            if batch_latest:
                                core_latest.post(batch_latest)
                                batch_latest = []
                                batch_latest_new = {}
                    except:
                        print "Can't ingest file %s" % file_path
                        if abort_on_errors:
                            raise

            # flush
            if len(batch) > 0:
                print "Sending last %s entries and %s entries to latest core" % (
                    len(batch), len(batch_latest))
                #print len(batch_latest)
                core_all_files.post(batch)
                batch = []
                batch_count += 1
                if batch_latest:
                    core_latest.post(batch_latest)
                    batch_latest = []

        finally:
            f.close()
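
For orientation, the setUp in Example #5 drives this method end to end. The sketch below is only illustrative: the dump path is the one from that test fixture, and the cores are the same 'files' and 'latest' cores the method would construct from the config file if none were passed.

    # Cores as in the test fixture; host/port come from the config file defaults.
    all_files = SolrCore(core='files')
    latest = SolrCore(core='latest')
    # Ingest a dump previously written by dump_fs_to_file; abort_on_errors=True
    # makes a single unparsable path raise instead of being skipped.
    SolrCore.load_fs_from_file('/tmp/some_temp_solr_core/dump1.csv',
                               abort_on_errors=True,
                               core_all_files=all_files,
                               core_latest=latest)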
Example #7
    def dump_fs_to_file(start_dir,
                        dump_file,
                        batch_size=1000,
                        check=False,
                        abort_on_errors=False):
        """This is the currently used method for ingestion. This method generates a file with
a listing of paths and timestamps from the file system. The syntax of the file looks like this::

  crawl_dir    /path/to/some/directory
  
  data
  /path/to/a/file,1239879.0
  /path/to/another/file,1239879.0
  ...

The crawl_dir indicates the directory being crawled and results in the deletion of all files whose path starts with
that one (i.e. everything under that path will be *replaced*).

Generating this file takes at least 8 hours for the whole /miklip/integration/data4miklip directory. It would be
nice to generate it in a different manner (e.g. using the gpfs policy API).

:param start_dir: The directory from which the file system will be crawled
:param dump_file: the path to the file that will contain the dump. if the file ends with '.gz' the resulting file
will be gziped (preferred)
:param batch_size: number of entries that will be written to disk at once. This might help pin-pointing crashes.
:param check: if the paths should be checked. While checking path the resulting paths are guaranteed to be accepted
later on normally this is too slow for this phase, so the default is False.
:param abort_on_errors: If dumping should get aborted as soon as an error is found, i.e. a file that can't be ingested.
 Most of the times there are many files being found that are no data at all."""

        log.debug('starting sequential ingest')

        if dump_file.endswith('.gz'):
            # print "Using gzip"
            import gzip
            # the with statement support started with python 2.7 (http://docs.python.org/2/library/gzip.html)
            # Let's leave this python 2.6 compatible...
            f = gzip.open(dump_file, 'wb')
        else:
            f = open(dump_file, 'w')

        try:
            batch_count = 0

            # store metadata
            f.write('%s\t%s\n' % (META_DATA.CRAWL_DIR, start_dir))

            # store data
            f.write('\n%s\n' % META_DATA.DATA)
            for path in dir_iter(start_dir):
                if check:
                    try:
                        DRSFile.from_path(path)
                    except:
                        if abort_on_errors:
                            raise
                        else:
                            print "Error ingesting %s" % path
                            continue
                ts = os.path.getmtime(path)
                f.write('%s,%s\n' % (path, ts))
                batch_count += 1
                if batch_count >= batch_size:
                    f.flush()
                    batch_count = 0  # reset so we flush once per batch, not on every write
        finally:
            f.close()
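
A minimal usage sketch, mirroring the call made in the setUp of Example #5 (the paths are the ones from that fixture and purely illustrative; a '.gz' suffix on the dump file would produce a gzipped dump instead):

    # Crawl the CMIP5 tree and write one 'path,mtime' line per file found.
    SolrCore.dump_fs_to_file('/tmp/some_temp_solr_core/cmip5',
                             '/tmp/some_temp_solr_core/dump1.csv')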
Example #8
    def test_from_json(self):
        j = self.drs.to_json()
        t = DRSFile.from_json(j, CMIP5)
        self.assertTrue(isinstance(t, DRSFile))
        self.assertEqual(self.drs.to_path(), t.to_path())
Example #9
    def test_from_dict(self):
        d = self.drs.dict
        t = DRSFile.from_dict(d, CMIP5)
        self.assertTrue(isinstance(t, DRSFile))
        self.assertEqual(self.drs.to_path(), t.to_path())