Example #1
    def fetch_dataset(self, var, d_key):
        """Copy files to temporary directory.
        (GCP can't copy to home dir, so always copy to a temp dir)
        """
        tmpdir = core.TempDirManager().make_tempdir()
        self.log.debug("Created GCP fetch temp dir at %s.", tmpdir)
        (cp_command, smartsite) = self._get_fetch_method(self._fetch_method)

        paths = d_key.remote_data()
        if isinstance(paths, pd.Series):
            paths = paths.to_list()
        if not util.is_iterable(paths):
            paths = (paths, )

        local_paths = []
        for path in paths:
            # exceptions caught in parent loop in data_manager.DataSourceBase
            local_path = os.path.join(tmpdir, os.path.basename(path))
            self.log.info(f"\tFetching {path[len(self.attrs.CASE_ROOT_DIR):]}")
            util.run_command(
                cp_command + [
                    smartsite + path,
                    # gcp requires trailing slash, ln ignores it
                    smartsite + tmpdir + os.sep
                ],
                timeout=self.timeout,
                dry_run=self.dry_run,
                log=self.log)
            local_paths.append(local_path)
        d_key.local_data = local_paths
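
Every example in this listing funnels through `util.run_command` with the same keyword arguments (`timeout`, `dry_run`, `cwd`, `log`), and Example #10 below shows it returning stdout as a list of lines. A minimal sketch of that contract, built only on the standard library (this is an assumption for illustration; the real MDTF helper may differ):

import subprocess

def run_command(command, timeout=0, dry_run=False, cwd=None, log=None):
    # Hypothetical sketch of the contract these examples rely on: run the
    # command without a shell, raise on a nonzero exit code, and return
    # stdout split into lines.
    if dry_run:
        if log is not None:
            log.info('DRY RUN: %s', ' '.join(command))
        return []
    proc = subprocess.run(
        command, cwd=cwd, timeout=(timeout or None),
        capture_output=True, text=True, check=True)
    return proc.stdout.splitlines()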
Example #2
def gcp_wrapper(source_path, dest_dir, timeout=None, dry_run=None):
    """Wrapper for file and recursive directory copying using the GFDL 
    site-specific General Copy Program (`https://gitlab.gfdl.noaa.gov/gcp/gcp`__.)
    Assumes GCP environment module has been loaded beforehand, and calls GCP in
    a subprocess.
    """
    modMgr = ModuleManager()
    modMgr.load('gcp')
    config = core.ConfigManager()
    if timeout is None:
        timeout = config.get('file_transfer_timeout', 0)
    if dry_run is None:
        dry_run = config.get('dry_run', False)

    source_path = os.path.normpath(source_path)
    dest_dir = os.path.normpath(dest_dir)
    # gcp requires trailing slash, ln ignores it
    if os.path.isdir(source_path):
        source = ['-r', 'gfdl:' + source_path + os.sep]
        # gcp /A/B/ /C/D/ will result in /C/D/B, so need to specify parent dir
        dest = ['gfdl:' + os.path.dirname(dest_dir) + os.sep]
    else:
        source = ['gfdl:' + source_path]
        dest = ['gfdl:' + dest_dir + os.sep]
    _log.info('\tGCP {} -> {}'.format(source[-1], dest[-1]))
    util.run_command(['gcp', '--sync', '-v', '-cd'] + source + dest,
                     timeout=timeout,
                     dry_run=dry_run)
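
A typical call, assuming the GCP environment module is available on the host (both paths below are hypothetical):

# fetch one archived file into a scratch directory (hypothetical paths)
gcp_wrapper('/archive/user/run1/atmos.19800101.nc', '/work/tmp/fetch',
            timeout=600)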
Example #3
    def pre_fetch_hook(self, vars_to_fetch):
        """Issue dmget for all files we're about to fetch, if those files are
        on a tape filesystem.
        """
        if self.tape_filesystem:
            paths = set([])
            for var in vars_to_fetch:
                for d_key in var.iter_data_keys(status=core.ObjectStatus.ACTIVE):
                    paths.update(d_key.remote_data())

            self.log.info(f"Start dmget of {len(paths)} files...")
            util.run_command(['dmget', '-t', '-v'] + list(paths),
                timeout=len(paths) * self.timeout,
                dry_run=self.dry_run, log=self.log
            )
            self.log.info("Successful exit of dmget.")
Example #4
    def pre_fetch_hook(self, vars_to_fetch):
        """Issue dmget for all files we're about to fetch, if those files are
        on a tape filesystem.
        """
        if self.tape_filesystem:
            paths = set([])
            for var in vars_to_fetch:
                for data_key in self.iter_data_keys(var):
                    paths.update(self.remote_data(data_key))

            _log.info(f"Start dmget of {len(paths)} files.")
            util.run_command(['dmget', '-t', '-v'] + list(paths),
                timeout=len(paths) * self.timeout,
                dry_run=self.dry_run
            )
            _log.info("Successful exit of dmget.")
Example #5
def gcp_wrapper(source_path, dest_dir, timeout=0, dry_run=False):
    """Variant of the GCP wrapper in Example #2 with hard-coded defaults."""
    modMgr = ModuleManager()
    modMgr.load('gcp')
    source_path = os.path.normpath(source_path)
    dest_dir = os.path.normpath(dest_dir)
    # gcp requires trailing slash, ln ignores it
    if os.path.isdir(source_path):
        source = ['-r', 'gfdl:' + source_path + os.sep]
        # gcp /A/B/ /C/D/ will result in /C/D/B, so need to specify parent dir
        dest = ['gfdl:' + os.path.dirname(dest_dir) + os.sep]
    else:
        source = ['gfdl:' + source_path]
        dest = ['gfdl:' + dest_dir + os.sep]
    print('\tDEBUG: GCP {} -> {}'.format(source[-1], dest[-1]))
    util.run_command(['gcp', '--sync', '-v', '-cd'] + source + dest,
                     timeout=timeout,
                     dry_run=dry_run)
Example #6
    def plan_data_fetch_hook(self):
        """Filter files on model component and chunk frequency.
        """
        d_to_u_dict = self._decide_allowed_components()
        for data_key in self.data_keys:
            u_key = d_to_u_dict[data_key]
            print("Selected {} for {} @ {}".format(u_key,
                                                   data_key.name_in_model,
                                                   data_key.date_freq))
            # check we didn't eliminate everything:
            assert self._component_map[u_key, data_key]
            self.data_files[data_key] = self._component_map[u_key, data_key]

        paths = set()
        for data_key in self.data_keys:
            for f in self.data_files[data_key]:
                paths.add(f._remote_data)
        if self.tape_filesystem:
            print("start dmget of {} files".format(len(paths)))
            util.run_command(['dmget', '-t', '-v'] + list(paths),
                             timeout=len(paths) * self.file_transfer_timeout,
                             dry_run=self.dry_run)
            print("end dmget")
Example #7
    def ncdump_h(cls, in_file=None, cwd=None, dry_run=False):
        """Return header information for all variables in a file.
        """
        def _parse_xml_wrapper(xml_):
            # strips namespaces; https://stackoverflow.com/a/25920989
            it = ET.iterparse(io.StringIO(xml_))
            for _, el in it:
                _, _, el.tag = el.tag.rpartition('}')  # strip namespaces
                # strip namespaces of attributes too (iterate over a copy of
                # the keys, since attrib is mutated inside the loop)
                for at in list(el.attrib):
                    if '}' in at:
                        newat = at.split('}', 1)[1]
                        el.attrib[newat] = el.attrib[at]
                        del el.attrib[at]
            return it.root

        d = {'dimensions': dict(), 'variables': dict()}
        if dry_run:
            return d  # dummy answer
        # JSON output for -m is malformed in NCO <=4.5.4, verified OK for 4.7.6
        xml_out = util.run_command(['ncks', '--xml', '-m', in_file],
                                   cwd=cwd,
                                   dry_run=dry_run)
        root = _parse_xml_wrapper('\n'.join(xml_out))
        for dim in root.iter('dimension'):
            d['dimensions'][dim.attrib['name']] = int(dim.attrib['length'])
        dv = d['variables']
        for var in root.iter('variable'):
            k = var.attrib['name']
            dv[k] = var.attrib.copy()
            del dv[k]['name']
            for att in var:
                if 'name' not in att.attrib or 'value' not in att.attrib:
                    continue
                dv[k][att.attrib['name']] = att.attrib['value']
            if dv[k].get('shape', None):
                dv[k]['shape'] = dv[k]['shape'].split(' ')
        return d
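
The `_parse_xml_wrapper` closure is the namespace-stripping recipe from the linked Stack Overflow answer; stripping matters because `ncks --xml` emits namespaced NcML, which would otherwise force every lookup to spell out the namespace URI. A self-contained demonstration on a toy document (the XML below is made up for illustration):

import io
import xml.etree.ElementTree as ET

xml_ = ('<netcdf xmlns="http://www.unidata.ucar.edu/namespaces/netcdf/ncml-2.2">'
        '<dimension name="time" length="12"/></netcdf>')
it = ET.iterparse(io.StringIO(xml_))
for _, el in it:
    _, _, el.tag = el.tag.rpartition('}')  # '{uri}dimension' -> 'dimension'
root = it.root  # iterparse exposes the root element after iteration completes
print(root.find('dimension').attrib)  # {'name': 'time', 'length': '12'}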
Example #8
    def fetch_dataset(self, d_key, method='auto'):
        """Copy files to temporary directory and combine chunks.
        """
        # pylint: disable=maybe-no-member
        (cp_command, smartsite) = self._determine_fetch_method(method)
        dest_path = self.local_path(d_key)
        dest_dir = os.path.dirname(dest_path)
        # ncrcat will error instead of creating destination directories
        if not os.path.exists(dest_dir):
            os.makedirs(dest_dir)
        # GCP can't copy to home dir, so always copy to temp
        tmpdirs = util_mdtf.TempDirManager()
        work_dir = tmpdirs.make_tempdir(hash_obj=d_key)
        remote_files = list(self.data_files[d_key])

        # copy remote files
        # TODO: Do something intelligent with logging, caught OSErrors
        for f in remote_files:
            print("\tcopying ...{} to {}".format(
                f._remote_data[len(self.root_dir):], work_dir))
            util.run_command(
                cp_command + [
                    smartsite + f._remote_data,
                    # gcp requires trailing slash, ln ignores it
                    smartsite + work_dir + os.sep
                ],
                timeout=self.file_transfer_timeout,
                dry_run=self.dry_run)

        # ----------------------------------------
        # Processing of copied files: TODO: refactor individual steps into
        # separate functions

        # set axis names from header info
        # only look at first file; if other chunks for same var differ, NCO will
        # raise error when we try to concat them
        file_name = os.path.basename(remote_files[0]._remote_data)
        var_name = remote_files[0].name_in_model
        file_axes = self.nc_get_axes_attributes(var_name,
                                                in_file=file_name,
                                                cwd=work_dir,
                                                dry_run=self.dry_run)
        for fax, fax_attrs in iter(file_axes.items()):
            # update DataSets with axis info - need to loop since multiple PODs
            # may reference this file (warning will be repeated; TODO fix that)
            error_flag = 0
            for var in self.data_keys[d_key]:
                if fax in var.axes:
                    # file's axis in list of case's axis names; check their
                    # axis attributes match if they're both defined
                    if 'axis' in fax_attrs and 'axis' in var.axes[fax] \
                        and fax_attrs['axis'].lower() != var.axes[fax]['axis'].lower() \
                        and error_flag != 1:
                        print(
                            ("\tWarning: unexpected axis attribute for {0} in "
                             "{1} (found {2}, {3} convention is {4})").format(
                                 fax, file_name, fax_attrs['axis'],
                                 self.convention, var.axes[fax]['axis']))
                        error_flag = 1
                    var.axes[fax]['MDTF_set_from_axis'] = False
                else:
                    # file has different axis name, try to match by attribute
                    for vax, vax_attrs in iter(var.axes.items()):
                        if 'axis' not in fax_attrs or 'axis' not in vax_attrs:
                            continue
                        elif vax_attrs['axis'].lower(
                        ) == fax_attrs['axis'].lower():
                            # matched axis attributes: log warning & reassign
                            if error_flag != 2:
                                print((
                                    "\tWarning: unexpected {0} axis name in {1} "
                                    "(found {2}, {3} convention is {4})"
                                ).format(fax_attrs['axis'], file_name, fax,
                                         self.convention, vax))
                                error_flag = 2
                            # only update so we don't overwrite the envvar name
                            var.axes[fax] = vax_attrs.copy()
                            var.axes[fax].update(fax_attrs)
                            var.axes[fax]['MDTF_set_from_axis'] = True
                            del var.axes[vax]
                            break
                    else:
                        # get here if we didn't hit 'break' above -- give up
                        if error_flag != 3:
                            print(("\tWarning: unable to assign {0} axis "
                                   "in {1}.").format(fax, file_name))
                            error_flag = 3

        # crop time axis to requested range
        # do this *before* combining chunks to reduce disk activity
        for vax, vax_attrs in iter(var.axes.items()):
            if 'axis' not in vax_attrs or vax_attrs['axis'].lower() != 't':
                continue
            else:
                time_var_name = vax
                break
        else:
            print("\tCan't determine time axis for {}.".format(file_name))
            time_var_name = 'time'  # will probably give KeyError
        trim_count = 0
        for f in remote_files:
            file_name = os.path.basename(f._remote_data)
            if f.date_range.is_static:
                # skip date trimming logic for time-independent files
                continue
            if not self.date_range.overlaps(f.date_range):
                print(("\tWarning: {} has dates {} outside of requested "
                       "range {}.").format(file_name, f.date_range,
                                           self.date_range))
                continue
            if not self.date_range.contains(f.date_range):
                # file overlaps the analysis range but isn't strictly contained
                # in it, so we need to trim the start, the end, or both
                trimmed_range = f.date_range.intersection(
                    self.date_range, precision=f.date_range.precision)
                print("\ttrimming '{}' of {} from {} to {}".format(
                    time_var_name, file_name, f.date_range, trimmed_range))
                trim_count = trim_count + 1
                self.nc_crop_time_axis(time_var_name,
                                       trimmed_range,
                                       in_file=file_name,
                                       cwd=work_dir,
                                       dry_run=self.dry_run)
        if trim_count > 2:
            # sanity check: at most the first and last chunks should need trimming
            print("trimmed {} files!".format(trim_count))
            raise AssertionError()

        # cat chunks to destination, if more than one
        if len(remote_files) > 1:
            # not running in shell, so can't use glob expansion.
            print("\tcatting {} chunks to {}".format(d_key.name_in_model,
                                                     dest_path))
            chunks = [os.path.basename(f._remote_data) for f in remote_files]
            self.nc_cat_chunks(chunks,
                               dest_path,
                               cwd=work_dir,
                               dry_run=self.dry_run)
        else:
            f = util.coerce_from_iter(remote_files)
            file_name = os.path.basename(f._remote_data)
            print("\tsymlinking {} to {}".format(d_key.name_in_model,
                                                 dest_path))
            util.run_command(
                ['ln', '-fs', os.path.join(work_dir, file_name), dest_path],
                dry_run=self.dry_run
            )
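
`nc_cat_chunks` and `nc_crop_time_axis` aren't shown in this example. Given the ncrcat comment above, the concatenation step is plausibly another thin `util.run_command` wrapper; a hypothetical sketch (the method body and flags are assumptions, not the project's actual code):

    def nc_cat_chunks(self, chunk_list, out_file, cwd=None, dry_run=False):
        # Assumed implementation: concatenate chunks along the record (time)
        # dimension with NCO's ncrcat; -O overwrites out_file if it exists.
        util.run_command(['ncrcat', '-O'] + chunk_list + [out_file],
                         cwd=cwd, dry_run=dry_run)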
Example #9
    def test_run_command_exitcode(self):
        cmd = ['exit', '1']
        with self.assertRaises(Exception):
            # I couldn't get this to catch CalledProcessError specifically,
            # maybe because it takes args?
            util.run_command(cmd)
Example #10
    def test_run_command_stdout1(self):
        out = util.run_command(['echo', '"foo"'])
        self.assertEqual(len(out), 1)
        self.assertEqual(out[0], '"foo"')
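
On the comment in Example #9: `exit` is a shell builtin rather than an executable, so a shell-less `subprocess` call to it raises `FileNotFoundError` instead of `CalledProcessError`, which would explain why only the broad `Exception` assertion passed. Using a real binary such as `false` should allow the narrower assertion, assuming `run_command` propagates the error unchanged (a hypothetical variant):

import subprocess

    def test_run_command_exitcode_specific(self):
        # `false` is a real executable that exits with status 1
        with self.assertRaises(subprocess.CalledProcessError):
            util.run_command(['false'])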