# Example #1
def main(argv=None):
    """Build, validate and write a pyReshaper specifier from CLI options."""
    opts, args = cli(argv)

    # Reshaper input object
    spec = specification.create_specifier()

    # Expand any globs/wildcards in the positional arguments into one flat
    # list of input files and attach it to the specifier
    spec.input_file_list = [
        path for pattern in args for path in glob.glob(pattern)
    ]

    # Transfer the remaining command-line options onto the specifier
    spec.io_backend = opts.backend
    spec.compression_level = opts.compression_level
    spec.least_significant_digit = opts.least_significant_digit
    spec.netcdf_format = opts.netcdf_format
    spec.output_file_prefix = opts.output_prefix
    spec.output_file_suffix = opts.output_suffix
    spec.time_series = opts.time_series
    spec.time_variant_metadata = opts.metadata
    spec.metadata_filename = opts.metafile
    spec.exclude_list = opts.exclude
    spec.assume_1d_time_variant_metadata = opts.meta1d

    # Validate before saving, then persist the specfile
    spec.validate()
    spec.write(opts.specfile)
def readArchiveXML(caseroot, input_rootdir, output_rootdir, casename,
                   standalone, completechunk, debug, debugMsg):
    """Read the $CASEROOT/env_timeseries.xml file and build a fully defined
    list of reshaper specifications to be passed to the pyReshaper tool.

    Arguments:
    caseroot (string) - case root path
    input_rootdir (string) - rootdir to input raw history files
    output_rootdir (string) - rootdir to output single variable time series files
    casename (string) - casename
    standalone (boolean) - logical to indicate if postprocessing case is stand-alone or not
    completechunk (boolean) - end on a ragged boundary if True.  Otherwise, do not create incomplete chunks if False
    debug (boolean) - if True, pretty-print each specifier as it is built
    debugMsg (callable) - diagnostic logger, called as debugMsg(msg, header=True)

    Returns:
    (specifiers, log) - the list of populated reshaper specifier objects and
    the updated chunking status log (dict keyed by comp+stream).

    Raises:
    OSError - if env_timeseries.xml does not exist
    TypeError - if a tseries XML element is missing or invalid
    """
    specifiers = list()
    xml_tree = ET.ElementTree()

    # get path to env_timeseries.xml file
    env_timeseries = '{0}/env_timeseries.xml'.format(caseroot)

    # read tseries log file to see if we've already started converting files, if so, where did we leave off
    log = chunking.read_log('{0}/logs/ts_status.log'.format(caseroot))

    # check if the env_timeseries.xml file exists
    if (not os.path.isfile(env_timeseries)):
        err_msg = "cesm_tseries_generator.py ERROR: {0} does not exist.".format(
            env_timeseries)
        raise OSError(err_msg)
    else:
        # parse the xml
        xml_tree.parse(env_timeseries)

        # loop through all the comp_archive_spec elements to find the tseries related elements
        for comp_archive_spec in xml_tree.findall(
                "components/comp_archive_spec"):
            comp = comp_archive_spec.get("name")
            rootdir = comp_archive_spec.find("rootdir").text
            multi_instance = comp_archive_spec.find("multi_instance").text
            default_calendar = comp_archive_spec.find("default_calendar").text
            debugMsg("default_calendar = {0}".format(default_calendar),
                     header=True)

            # for now, set instance value to empty string implying only 1 instance
            instance = ""

            # loop through all the files/file_spec elements
            for file_spec in comp_archive_spec.findall("files/file_extension"):
                file_extension = file_spec.get("suffix")
                subdir = file_spec.find("subdir").text

                # check if tseries_create is an element for this file_spec
                if file_spec.find("tseries_create") is not None:
                    tseries_create = file_spec.find("tseries_create").text

                    # check if the tseries_create element is set to TRUE
                    if tseries_create.upper() in ["T", "TRUE"]:

                        # check if tseries_format is an element for this file_spec and if it is valid
                        if file_spec.find("tseries_output_format") is not None:
                            tseries_output_format = file_spec.find(
                                "tseries_output_format").text
                            if tseries_output_format not in [
                                    "netcdf", "netcdf4", "netcdf4c",
                                    "netcdfLarge"
                            ]:
                                err_msg = "cesm_tseries_generator.py error: tseries_output_format invalid for data stream {0}.*.{1}".format(
                                    comp, file_extension)
                                raise TypeError(err_msg)
                        else:
                            err_msg = "cesm_tseries_generator.py error: tseries_output_format undefined for data stream {0}.*.{1}".format(
                                comp, file_extension)
                            raise TypeError(err_msg)

                        # load the tseries_time_variant_variables into a list
                        variable_list = list()
                        if comp_archive_spec.find(
                                "tseries_time_variant_variables") is not None:
                            for variable in comp_archive_spec.findall(
                                    "tseries_time_variant_variables/variable"):
                                variable_list.append(variable.text)

                        # location of the raw history files for this stream
                        in_file_path = '/'.join(
                            [input_rootdir, rootdir, subdir])

                        # get XML tseries elements for chunking
                        # NOTE(review): tseries_tper, tper and size remain
                        # undefined when the corresponding XML elements are
                        # missing, which would raise a NameError below --
                        # confirm the schema guarantees their presence
                        if file_spec.find("tseries_tper") is not None:
                            tseries_tper = file_spec.find("tseries_tper").text
                        if file_spec.find("tseries_filecat_tper") is not None:
                            tper = file_spec.find("tseries_filecat_tper").text
                        if file_spec.find("tseries_filecat_n") is not None:
                            size = file_spec.find("tseries_filecat_n").text
                        comp_name = comp
                        stream = file_extension.split('.[')[0]

                        stream_dates, file_slices, cal, units, time_period_freq = chunking.get_input_dates(
                            in_file_path + '/*' + file_extension + '*.nc')
                        # fall back to the component default when the files
                        # carry no calendar attribute
                        if cal is None or cal == "none":
                            cal = default_calendar
                        debugMsg("calendar = {0}".format(cal), header=True)

                        # the tseries_tper should be set using the time_period_freq global file attribute if it exists
                        if time_period_freq is not None:
                            tseries_tper = time_period_freq
                        tseries_output_dir = '/'.join([
                            output_rootdir, rootdir, 'proc/tseries',
                            tseries_tper
                        ])
                        debugMsg("tseries_output_dir = {0}".format(
                            tseries_output_dir),
                                 header=True)

                        if not os.path.exists(tseries_output_dir):
                            os.makedirs(tseries_output_dir)

                        # initialize the log entry for a stream we have not
                        # seen before (direct membership test; no keys() copy)
                        if comp + stream not in log:
                            log[comp + stream] = {'slices': [], 'index': 0}
                        ts_log_dates = log[comp + stream]['slices']
                        index = log[comp + stream]['index']
                        files, dates, index = chunking.get_chunks(
                            tper, index, size, stream_dates, ts_log_dates, cal,
                            units, completechunk)
                        for d in dates:
                            log[comp + stream]['slices'].append(float(d))
                        log[comp + stream]['index'] = index

                        # build one specifier per chunk
                        # (dict.items() -- iteritems() is Python 2 only)
                        for cn, cf in files.items():

                            history_files = cf['fn']
                            start_time_parts = cf['start']
                            last_time_parts = cf['end']

                            # create the tseries output prefix - needs to end with a "."
                            tseries_output_prefix = "{0}/{1}.{2}{3}.".format(
                                tseries_output_dir, casename, comp_name,
                                stream)
                            debugMsg("tseries_output_prefix = {0}".format(
                                tseries_output_prefix),
                                     header=True)

                            # format the time series variable output suffix
                            # based on the tseries_tper setting - suffix needs
                            # to start with a ".".  The date resolution
                            # (number of time parts kept) depends on the
                            # period: year=1, month=2, day=3, sub-daily=4.
                            freq_array = ["week", "day", "hour", "min"]
                            if "year" in tseries_tper:
                                nparts = 1
                            elif "month" in tseries_tper:
                                nparts = 2
                            elif "day" in tseries_tper:
                                nparts = 3
                            elif any(freq_string in tseries_tper
                                     for freq_string in freq_array):
                                nparts = 4
                            else:
                                err_msg = "cesm_tseries_generator.py error: invalid tseries_tper = {0}.".format(
                                    tseries_tper)
                                raise TypeError(err_msg)
                            tseries_output_suffix = (
                                "." + "".join(start_time_parts[:nparts]) +
                                "-" + "".join(last_time_parts[:nparts]) +
                                ".nc")
                            debugMsg("tseries_output_suffix = {0}".format(
                                tseries_output_suffix),
                                     header=True)

                            # get a reshaper specification object
                            spec = specification.create_specifier()

                            # populate the spec object with data for this history stream
                            spec.input_file_list = history_files
                            spec.netcdf_format = tseries_output_format
                            spec.output_file_prefix = tseries_output_prefix
                            spec.output_file_suffix = tseries_output_suffix
                            spec.time_variant_metadata = variable_list

                            # print the specifier
                            if debug:
                                pp = pprint.PrettyPrinter(indent=5)
                                dbg = [
                                    comp_name, spec.input_file_list,
                                    spec.netcdf_format,
                                    spec.output_file_prefix,
                                    spec.output_file_suffix,
                                    spec.time_variant_metadata
                                ]
                                pp.pprint(dbg)

                            # append this spec to the list of specifiers
                            specifiers.append(spec)
    return specifiers, log
def readArchiveXML(cesmEnv):
    '''
    Returns a fully defined list of reshaper specifications built from the
    ./env_archive.xml file.

    Arguments:
    cesmEnv (dict) - CESM environment settings; reads DOUT_S_ROOT and CASE.

    Raises:
    OSError - if ./env_archive.xml does not exist
    TypeError - if a required tseries XML element is missing or invalid
    '''
    from pyreshaper import specification

    specifiers = list()
    xml_tree = ET.ElementTree()
    # check if the env_archive.xml file exists
    if (not os.path.isfile('./env_archive.xml')):
        err_msg = "cesm_tseries_generator.py ERROR: env_archive.xml does not exist."
        raise OSError(err_msg)
    else:
        # parse the xml
        xml_tree.parse('./env_archive.xml')

        # loop through all the comp_archive_spec elements to find the tseries related elements
        for comp_archive_spec in xml_tree.findall("components/comp_archive_spec"):
            comp = comp_archive_spec.get("name")
            rootdir = comp_archive_spec.find("rootdir").text
            multi_instance = comp_archive_spec.find("multi_instance").text

            # for now, set instance value to empty string implying only 1 instance
            instance = ""

            # loop through all the files/file_spec elements
            for file_spec in comp_archive_spec.findall("files/file_extension"):
                file_extension = file_spec.get("suffix")
                subdir = file_spec.find("subdir").text

                # check if tseries_create is an element for this file_spec
                if file_spec.find("tseries_create") is not None:
                    tseries_create = file_spec.find("tseries_create").text

                    # check if the tseries_create element is set to TRUE
                    if tseries_create.upper() in ["T","TRUE"]:

                        # check if tseries_format is an element for this file_spec and if it is valid
                        if file_spec.find("tseries_output_format") is not None:
                            tseries_output_format = file_spec.find("tseries_output_format").text
                            if tseries_output_format not in ["netcdf","netcdf4","netcdf4c"]:
                                err_msg = "cesm_tseries_generator.py error: tseries_output_format invalid for data stream {0}.*.{1}".format(comp,file_extension)
                                raise TypeError(err_msg)
                        else:
                            err_msg = "cesm_tseries_generator.py error: tseries_output_format undefined for data stream {0}.*.{1}".format(comp,file_extension)
                            raise TypeError(err_msg)

                        # check if the tseries_output_subdir is specified and create the tseries_output_dir
                        if file_spec.find("tseries_output_subdir") is not None:
                            tseries_output_subdir = file_spec.find("tseries_output_subdir").text
                            tseries_output_dir = '/'.join( [cesmEnv["DOUT_S_ROOT"], rootdir,tseries_output_subdir] )
                            if not os.path.exists(tseries_output_dir):
                                os.makedirs(tseries_output_dir)
                        else:
                            err_msg = "cesm_tseries_generator.py error: tseries_output_subdir undefined for data stream {0}.*.{1}".format(comp,file_extension)
                            raise TypeError(err_msg)

                        # check if tseries_tper is specified and is valid
                        if file_spec.find("tseries_tper") is not None:
                            tseries_tper = file_spec.find("tseries_tper").text
                            if tseries_tper not in ["yearly","monthly","weekly","daily","hourly6","hourly3","hourly1","min30"]:
                                err_msg = "cesm_tseries_generator.py error: tseries_tper invalid for data stream {0}.*.{1}".format(comp,file_extension)
                                raise TypeError(err_msg)
                        else:
                            err_msg = "cesm_tseries_generator.py error: tseries_tper undefined for data stream {0}.*.{1}".format(comp,file_extension)
                            raise TypeError(err_msg)

                        # load the tseries_time_variant_variables into a list
                        # (initialized unconditionally so the spec assignment
                        # below cannot raise NameError when the element is absent)
                        variable_list = list()
                        if comp_archive_spec.find("tseries_time_variant_variables") is not None:
                            for variable in comp_archive_spec.findall("tseries_time_variant_variables/variable"):
                                variable_list.append(variable.text)

                        # get a list of all the input files for this stream from the archive location
                        history_files = list()
                        in_file_path = '/'.join( [cesmEnv["DOUT_S_ROOT"],rootdir,subdir] )
                        all_in_files = os.listdir(in_file_path)

                        # keep only the files matching this stream's extension
                        # NOTE(review): file_extension is used as a regex
                        # pattern, so '.' matches any character -- confirm intended
                        for in_file in all_in_files:
                            if re.search(file_extension, in_file):
                                history_files.append(in_file_path+"/"+in_file)

                        # sort the list of input history files in order to get the output suffix from the first and last file
                        if len(history_files) > 0:
                            history_files.sort()

                            start_file = history_files[0]
                            start_file_parts = start_file.split( "." )
                            start_file_time = start_file_parts[-2]

                            last_file = history_files[-1]
                            last_file_parts = last_file.split( "." )
                            last_file_time = last_file_parts[-2]

                            # get the actual component name from the history file - will also need to deal with the instance numbers based on the comp_name
                            comp_name = last_file_parts[-4]
                            stream = last_file_parts[-3]

                            # check for pop.h nday1 and nyear1 history streams
                            if last_file_parts[-3] in ["nday1","nyear1"]:
                                comp_name = last_file_parts[-5]
                                stream = last_file_parts[-4]+"."+last_file_parts[-3]

                            # create the tseries output prefix - needs to end with a "."
                            tseries_output_prefix = tseries_output_dir+"/"+cesmEnv["CASE"]+"."+comp_name+"."+stream+"."

                            # format the time series variable output suffix based on the tseries_tper setting - suffix needs to start with a "."
                            if tseries_tper == "yearly":
                                tseries_output_suffix = "."+start_file_time+"-"+last_file_time+".nc"
                            elif tseries_tper == "monthly":
                                start_time_parts = start_file_time.split( "-" )
                                last_time_parts = last_file_time.split( "-" )
                                tseries_output_suffix = "."+start_time_parts[0]+start_time_parts[1]+"-"+last_time_parts[0]+last_time_parts[1]+".nc"
                            elif tseries_tper in ["weekly","daily","hourly6","hourly3","hourly1","min30"]:
                                start_time_parts = start_file_time.split( "-" )
                                last_time_parts = last_file_time.split( "-" )
                                tseries_output_suffix = "."+start_time_parts[0]+start_time_parts[1]+start_time_parts[2]+"-"+last_time_parts[0]+last_time_parts[1]+last_time_parts[2]+".nc"

                            # TODO: create specifiers based on the tseries_filecat_years spec

                            # get a reshaper specification object
                            spec = specification.create_specifier()

                            # populate the spec object with data for this history stream
                            spec.input_file_list = history_files
                            spec.netcdf_format = tseries_output_format
                            spec.output_file_prefix = tseries_output_prefix
                            spec.output_file_suffix = tseries_output_suffix
                            spec.time_variant_metadata = variable_list

                            # pretty-print the specifier contents for inspection
                            pp = pprint.PrettyPrinter(indent=5)
                            dbg = [comp_name, spec.input_file_list, spec.netcdf_format, spec.output_file_prefix, spec.output_file_suffix, spec.time_variant_metadata]
                            pp.pprint(dbg)

                            # append this spec to the list of specifiers
                            specifiers.append(spec)

    return specifiers
# Example #4
def readArchiveXML(cesmEnv):
    '''
    Returns a fully defined list of reshaper specifications built from the
    ../env_archive.xml file.

    Arguments:
    cesmEnv (dict) - CESM environment settings; reads DOUT_S_ROOT and CASE.

    Raises:
    OSError - if ../env_archive.xml does not exist
    TypeError - if a required tseries XML element is missing or invalid
    '''
    from pyreshaper import specification

    specifiers = list()
    xml_tree = ET.ElementTree()
    # check if the env_archive.xml file exists
    if (not os.path.isfile('../env_archive.xml')):
        err_msg = "cesm_tseries_generator.py ERROR: env_archive.xml does not exist."
        raise OSError(err_msg)
    else:
        # parse the xml
        xml_tree.parse('../env_archive.xml')

        # loop through all the comp_archive_spec elements to find the tseries related elements
        for comp_archive_spec in xml_tree.findall(
                "components/comp_archive_spec"):
            comp = comp_archive_spec.get("name")
            rootdir = comp_archive_spec.find("rootdir").text
            multi_instance = comp_archive_spec.find("multi_instance").text

            # for now, set instance value to empty string implying only 1 instance
            instance = ""

            # loop through all the files/file_spec elements
            for file_spec in comp_archive_spec.findall("files/file_extension"):
                file_extension = file_spec.get("suffix")
                subdir = file_spec.find("subdir").text

                # check if tseries_create is an element for this file_spec
                if file_spec.find("tseries_create") is not None:
                    tseries_create = file_spec.find("tseries_create").text

                    # check if the tseries_create element is set to TRUE
                    if tseries_create.upper() in ["T", "TRUE"]:

                        # check if tseries_format is an element for this file_spec and if it is valid
                        if file_spec.find("tseries_output_format") is not None:
                            tseries_output_format = file_spec.find(
                                "tseries_output_format").text
                            if tseries_output_format not in [
                                    "netcdf", "netcdf4", "netcdf4c"
                            ]:
                                err_msg = "cesm_tseries_generator.py error: tseries_output_format invalid for data stream {0}.*.{1}".format(
                                    comp, file_extension)
                                raise TypeError(err_msg)
                        else:
                            err_msg = "cesm_tseries_generator.py error: tseries_output_format undefined for data stream {0}.*.{1}".format(
                                comp, file_extension)
                            raise TypeError(err_msg)

                        # check if the tseries_output_subdir is specified and create the tseries_output_dir
                        if file_spec.find("tseries_output_subdir") is not None:
                            tseries_output_subdir = file_spec.find(
                                "tseries_output_subdir").text
                            tseries_output_dir = '/'.join([
                                cesmEnv["DOUT_S_ROOT"], rootdir,
                                tseries_output_subdir
                            ])
                            if not os.path.exists(tseries_output_dir):
                                os.makedirs(tseries_output_dir)
                        else:
                            err_msg = "cesm_tseries_generator.py error: tseries_output_subdir undefined for data stream {0}.*.{1}".format(
                                comp, file_extension)
                            raise TypeError(err_msg)

                        # check if tseries_tper is specified and is valid
                        if file_spec.find("tseries_tper") is not None:
                            tseries_tper = file_spec.find("tseries_tper").text
                            if tseries_tper not in [
                                    "yearly", "monthly", "weekly", "daily",
                                    "hourly6", "hourly3", "hourly1", "min30"
                            ]:
                                err_msg = "cesm_tseries_generator.py error: tseries_tper invalid for data stream {0}.*.{1}".format(
                                    comp, file_extension)
                                raise TypeError(err_msg)
                        else:
                            err_msg = "cesm_tseries_generator.py error: tseries_tper undefined for data stream {0}.*.{1}".format(
                                comp, file_extension)
                            raise TypeError(err_msg)

                        # load the tseries_time_variant_variables into a list
                        # (initialized unconditionally so the spec assignment
                        # below cannot raise NameError when the element is absent)
                        variable_list = list()
                        if comp_archive_spec.find(
                                "tseries_time_variant_variables") is not None:
                            for variable in comp_archive_spec.findall(
                                    "tseries_time_variant_variables/variable"):
                                variable_list.append(variable.text)

                        # get a list of all the input files for this stream from the archive location
                        history_files = list()
                        in_file_path = '/'.join(
                            [cesmEnv["DOUT_S_ROOT"], rootdir, subdir])
                        all_in_files = os.listdir(in_file_path)

                        # keep only the files matching this stream's extension
                        # NOTE(review): file_extension is used as a regex
                        # pattern, so '.' matches any character -- confirm intended
                        for in_file in all_in_files:
                            if re.search(file_extension, in_file):
                                history_files.append(in_file_path + "/" +
                                                     in_file)

                        # sort the list of input history files in order to get the output suffix from the first and last file
                        if len(history_files) > 0:
                            history_files.sort()

                            start_file = history_files[0]
                            start_file_parts = start_file.split(".")
                            start_file_time = start_file_parts[-2]

                            last_file = history_files[-1]
                            last_file_parts = last_file.split(".")
                            last_file_time = last_file_parts[-2]

                            # get the actual component name from the history file - will also need to deal with the instance numbers based on the comp_name
                            comp_name = last_file_parts[-4]
                            stream = last_file_parts[-3]

                            # check for pop.h nday1 and nyear1 history streams
                            if last_file_parts[-3] in ["nday1", "nyear1"]:
                                comp_name = last_file_parts[-5]
                                stream = last_file_parts[
                                    -4] + "." + last_file_parts[-3]

                            # create the tseries output prefix - needs to end with a "."
                            tseries_output_prefix = tseries_output_dir + "/" + cesmEnv[
                                "CASE"] + "." + comp_name + "." + stream + "."

                            # format the time series variable output suffix based on the tseries_tper setting - suffix needs to start with a "."
                            if tseries_tper == "yearly":
                                tseries_output_suffix = "." + start_file_time + "-" + last_file_time + ".nc"
                            elif tseries_tper == "monthly":
                                start_time_parts = start_file_time.split("-")
                                last_time_parts = last_file_time.split("-")
                                tseries_output_suffix = "." + start_time_parts[
                                    0] + start_time_parts[
                                        1] + "-" + last_time_parts[
                                            0] + last_time_parts[1] + ".nc"
                            elif tseries_tper in [
                                    "weekly", "daily", "hourly6", "hourly3",
                                    "hourly1", "min30"
                            ]:
                                start_time_parts = start_file_time.split("-")
                                last_time_parts = last_file_time.split("-")
                                tseries_output_suffix = "." + start_time_parts[
                                    0] + start_time_parts[1] + start_time_parts[
                                        2] + "-" + last_time_parts[
                                            0] + last_time_parts[
                                                1] + last_time_parts[2] + ".nc"

                            # TODO: create specifiers based on the tseries_filecat_years spec

                            # get a reshaper specification object
                            spec = specification.create_specifier()

                            # populate the spec object with data for this history stream
                            spec.input_file_list = history_files
                            spec.netcdf_format = tseries_output_format
                            spec.output_file_prefix = tseries_output_prefix
                            spec.output_file_suffix = tseries_output_suffix
                            spec.time_variant_metadata = variable_list

                            # pretty-print the specifier contents for inspection
                            pp = pprint.PrettyPrinter(indent=5)
                            dbg = [
                                comp_name, spec.input_file_list,
                                spec.netcdf_format, spec.output_file_prefix,
                                spec.output_file_suffix,
                                spec.time_variant_metadata
                            ]
                            pp.pprint(dbg)

                            # append this spec to the list of specifiers
                            specifiers.append(spec)

    return specifiers
def readArchiveXML(caseroot, input_rootdir, output_rootdir, casename, standalone, completechunk, 
                   generate_all, debug, debugMsg, comm, rank, size):
    """ reads the $CASEROOT/env_timeseries.xml file and builds a fully defined list of 
         reshaper specifications to be passed to the pyReshaper tool.

    Arguments:
    caseroot (string) - case root path
    input_rootdir (string) - rootdir to input raw history files
    output_rootdir (string) - rootdir to output single variable time series files
    casename (string) - casename
    standalone (boolean) - logical to indicate if postprocessing case is stand-alone or not
    completechunk (boolean) - end on a ragged boundary if True.  Otherwise, do not create incomplete chunks if False
    generate_all (string) - flag string; "T"/"TRUE" (any case) generates timeseries for all
                            streams, otherwise each stream's tseries_create setting is honored.
                            NOTE: the code calls generate_all.upper(), so this must be a
                            string flag, not a bool.
    debug (boolean) - debug flag
    debugMsg (callable) - debug output function accepting header/verbosity keyword arguments
    comm (object) - communicator object providing sync() for barrier synchronization
    rank (integer) - parallel task rank; rank 0 performs directory creation and debug output
    size (integer) - parallel communicator size

    Returns:
    (specifiers, log) - the list of pyReshaper specification objects and the updated
                        chunking status log dict keyed by comp+stream
    """
    specifiers = list()
    xml_tree = ET.ElementTree()

    # get path to env_timeseries.xml file
    env_timeseries = '{0}/env_timeseries.xml'.format(caseroot)

    # read tseries log file to see if we've already started converting files, if so, where did we leave off
    log = chunking.read_log('{0}/logs/ts_status.log'.format(caseroot))

    # check if the env_timeseries.xml file exists
    if ( not os.path.isfile(env_timeseries) ):
        err_msg = "cesm_tseries_generator.py ERROR: {0} does not exist.".format(env_timeseries)
        raise OSError(err_msg)
    else:
        # parse the xml
        xml_tree.parse(env_timeseries)

        # loop through all the comp_archive_spec elements to find the tseries related elements
        for comp_archive_spec in xml_tree.findall("components/comp_archive_spec"):
            comp = comp_archive_spec.get("name")
            rootdir = comp_archive_spec.find("rootdir").text
            multi_instance = comp_archive_spec.find("multi_instance").text
            default_calendar = comp_archive_spec.find("default_calendar").text
            if rank == 0:
                debugMsg("default_calendar = {0}".format(default_calendar), header=True, verbosity=1)

            # for now, set instance value to empty string implying only 1 instance
            instance = ""

            # loop through all the files/file_spec elements
            for file_spec in comp_archive_spec.findall("files/file_extension"):
                file_extension = file_spec.get("suffix")
                subdir = file_spec.find("subdir").text

                # check if tseries_create is an element for this file_spec
                if file_spec.find("tseries_create") is not None:
                    tseries_create = file_spec.find("tseries_create").text

                    # check if the tseries_create element is set to TRUE            
                    if tseries_create.upper() in ["T","TRUE"] or generate_all.upper() in ["T","TRUE"]:

                        # check if tseries_format is an element for this file_spec and if it is valid
                        if file_spec.find("tseries_output_format") is not None:
                            tseries_output_format = file_spec.find("tseries_output_format").text
                            if tseries_output_format not in ["netcdf","netcdf4","netcdf4c","netcdfLarge"]:
                                err_msg = "cesm_tseries_generator.py error: tseries_output_format invalid for data stream {0}.*.{1}".format(comp,file_extension)
                                raise TypeError(err_msg)
                        else:
                            err_msg = "cesm_tseries_generator.py error: tseries_output_format undefined for data stream {0}.*.{1}".format(comp,file_extension)
                            raise TypeError(err_msg)

                        # load the tseries_time_variant_variables into a list
                        variable_list = list()
                        if comp_archive_spec.find("tseries_time_variant_variables") is not None:
                            for variable in comp_archive_spec.findall("tseries_time_variant_variables/variable"):
                                variable_list.append(variable.text)

                        # load the tseries_exclude_variables into a list
                        exclude_list = list()
                        if comp_archive_spec.find("tseries_exclude_variables") is not None:
                            for variable in comp_archive_spec.findall("tseries_exclude_variables/variable"):
                                exclude_list.append(variable.text)

                        # get a list of all the input files for this stream from the archive location
                        history_files = list()
                        in_file_path = '/'.join( [input_rootdir,rootdir,subdir] )

                        # get XML tseries elements for chunking
                        if file_spec.find("tseries_tper") is not None:
                            tseries_tper = file_spec.find("tseries_tper").text
                        if file_spec.find("tseries_filecat_tper") is not None:
                            tper = file_spec.find("tseries_filecat_tper").text
                        if file_spec.find("tseries_filecat_n") is not None:
                            size_n = file_spec.find("tseries_filecat_n").text
                        comp_name = comp
                        stream = file_extension.split('.[')[0]

                        stream_dates,file_slices,cal,units,time_period_freq = chunking.get_input_dates(in_file_path+'/*'+file_extension+'*.nc', comm, rank, size)
                        # check if the calendar attribute was read or not
                        if cal is None or cal == "none":
                            cal = default_calendar
                        if rank == 0:
                            debugMsg("calendar = {0}".format(cal), header=True, verbosity=1)

                        # the tseries_tper should be set in using the time_period_freq global file attribute if it exists
                        if time_period_freq is not None:
                            tseries_tper = time_period_freq
                        tseries_output_dir = '/'.join( [output_rootdir, rootdir, 'proc/tseries', tseries_tper] )
                        if rank == 0:
                            debugMsg("tseries_output_dir = {0}".format(tseries_output_dir), header=True, verbosity=1)

                        # initialize this stream's entry in the chunking log on first encounter
                        if comp+stream not in log:
                            log[comp+stream] = {'slices':[],'index':0}
                        ts_log_dates = log[comp+stream]['slices']
                        index = log[comp+stream]['index']
                        files,dates,index = chunking.get_chunks(tper, index, size_n, stream_dates, ts_log_dates, cal, units, completechunk, tseries_tper)
                        for d in dates:
                            log[comp+stream]['slices'].append(float(d))
                        log[comp+stream]['index']=index
                        # one specifier is generated per chunk returned by chunking.get_chunks
                        # (items() rather than iteritems() so this runs under both Python 2 and 3)
                        for cn,cf in files.items():

                            # rank 0 creates the output directory; all ranks wait at the barrier
                            if rank == 0:
                                if not os.path.exists(tseries_output_dir):
                                    os.makedirs(tseries_output_dir)
                            comm.sync()

                            history_files = cf['fn']
                            start_time_parts = cf['start']
                            last_time_parts = cf['end']

                            # create the tseries output prefix needs to end with a "."
                            tseries_output_prefix = "{0}/{1}.{2}{3}.".format(tseries_output_dir,casename,comp_name,stream)
                            if rank == 0:
                                debugMsg("tseries_output_prefix = {0}".format(tseries_output_prefix), header=True, verbosity=1)

                            # format the time series variable output suffix based on the 
                            # tseries_tper setting suffix needs to start with a "."
                            freq_array = ["week","day","hour","min"]
                            if "year" in tseries_tper:
                                tseries_output_suffix = "."+start_time_parts[0]+"-"+last_time_parts[0]+".nc"
                            elif "month" in tseries_tper:
                                tseries_output_suffix = "."+start_time_parts[0]+start_time_parts[1]+"-"+last_time_parts[0]+last_time_parts[1]+".nc"
                            elif "day" in tseries_tper:
                                tseries_output_suffix = "."+start_time_parts[0]+start_time_parts[1]+start_time_parts[2]+"-"+last_time_parts[0]+last_time_parts[1]+last_time_parts[2]+".nc"
                            elif any(freq_string in tseries_tper for freq_string in freq_array):
                                tseries_output_suffix = "."+start_time_parts[0]+start_time_parts[1]+start_time_parts[2]+start_time_parts[3]+"-"+last_time_parts[0]+last_time_parts[1]+last_time_parts[2]+last_time_parts[3]+".nc"
                            else:
                                err_msg = "cesm_tseries_generator.py error: invalid tseries_tper = {0}.".format(tseries_tper)
                                raise TypeError(err_msg)
                            if rank == 0:
                                debugMsg("tseries_output_suffix = {0}".format(tseries_output_suffix), header=True, verbosity=1)

                            # get a reshaper specification object/
                            spec = specification.create_specifier()

                            # populate the spec object with data for this history stream
                            spec.input_file_list = history_files
                            spec.netcdf_format = tseries_output_format
                            spec.output_file_prefix = tseries_output_prefix
                            spec.output_file_suffix = tseries_output_suffix
                            spec.time_variant_metadata = variable_list
                            spec.exclude_list = exclude_list
                            # setting the default backend; netCDF4 or pynio
                            spec.backend = 'netCDF4'

                            if rank == 0:
                                debugMsg("specifier: comp_name = {0}".format(comp_name), header=True, verbosity=1)
                                debugMsg("    input_file_list = {0}".format(spec.input_file_list), header=True, verbosity=1)
                                debugMsg("    netcdf_format = {0}".format(spec.netcdf_format), header=True, verbosity=1)
                                debugMsg("    output_file_prefix = {0}".format(spec.output_file_prefix), header=True, verbosity=1)
                                debugMsg("    output_file_suffix = {0}".format(spec.output_file_suffix), header=True, verbosity=1)
                                debugMsg("    time_variant_metadata = {0}".format(spec.time_variant_metadata), header=True, verbosity=1)
                                debugMsg("    exclude_list = {0}".format(spec.exclude_list), header=True, verbosity=1)

                            # append this spec to the list of specifiers
                            specifiers.append(spec)
    return specifiers,log
# ----- Example #6 (예제 #6) -----
# Build the list of raw CAM history files to convert, e.g.:
# input_fnames = ['FAMIPC5.cam.h0.2010-09.nc', 'FAMIPC5.cam.h0.2010-10.nc']
input_fnames = conversion_utils.fetch_fnames(dirs.INPUT, 'FAMIPC5', 'cam')
input_files  = [os.path.join(dirs.INPUT, f) for f in input_fnames]

# Create output directory if it doesn't already exist
if not os.path.isdir(dirs.OUTPUT):
    print('Creating directory {}'.format(dirs.OUTPUT))
    os.makedirs(dirs.OUTPUT)

# Converted time-series file prefix & suffix
# Output files are named <OUTPUT>/<prefix><varname><suffix>; the suffix is the
# time range parsed from the first/last input file names by conversion_utils.
prefix = 'FAMIPC5.cam.'
output_prefix = os.path.join(dirs.OUTPUT, prefix)
output_suffix = conversion_utils.parse_output_suffix(input_fnames)

# --- Create PyReshaper specifier object ---------------------------------------
specifier = specification.create_specifier()

# Define specifier input needed perform the conversion
specifier.input_file_list = input_files
specifier.netcdf_format = "netcdf4"          # NetCDF4 output with compression enabled below
specifier.compression_level = 1              # light deflate; higher levels cost more CPU
specifier.output_file_prefix = output_prefix
specifier.output_file_suffix = output_suffix
specifier.time_variant_metadata = ["time", "time_bounds"]
# specifier.exclude_list = ['HKSAT','ZLAKE']

# Create the PyReshaper object
# serial=False -> run under MPI; wmode='s' skips output files that already exist
rshpr = reshaper.create_reshaper(specifier,
                                 serial=False,
                                 verbosity=1,
                                 wmode='s')
def readArchiveXML(caseroot, dout_s_root, casename, standalone, debug):
    """ reads the $CASEROOT/env_timeseries.xml file and builds a fully defined list of 
         reshaper specifications to be passed to the pyReshaper tool.

    Arguments:
    caseroot (string) - case root path
    dout_s_root (string) - short term archive root path
    casename (string) - casename
    standalone (boolean) - logical to indicate if postprocessing case is stand-alone or not
    debug (boolean) - if True, pretty-print each generated specifier

    Returns:
    specifiers (list) - list of pyReshaper specification objects, one per history stream

    Raises:
    OSError - if the env_timeseries.xml file does not exist
    TypeError - if a required tseries XML element is missing or has an invalid value
    """
    specifiers = list()
    xml_tree = ET.ElementTree()

    # get path to env_timeseries.xml file
    env_timeseries = '{0}/postprocess/env_timeseries.xml'.format(caseroot)
    if standalone:
        env_timeseries = '{0}/env_timeseries.xml'.format(caseroot)

    # check if the env_timeseries.xml file exists
    if (not os.path.isfile(env_timeseries)):
        err_msg = "cesm_tseries_generator.py ERROR: {0} does not exist.".format(
            env_timeseries)
        raise OSError(err_msg)
    else:
        # parse the xml
        xml_tree.parse(env_timeseries)

        # loop through all the comp_archive_spec elements to find the tseries related elements
        for comp_archive_spec in xml_tree.findall(
                "components/comp_archive_spec"):
            comp = comp_archive_spec.get("name")
            rootdir = comp_archive_spec.find("rootdir").text
            multi_instance = comp_archive_spec.find("multi_instance").text

            # for now, set instance value to empty string implying only 1 instance
            instance = ""

            # loop through all the files/file_spec elements
            for file_spec in comp_archive_spec.findall("files/file_extension"):
                file_extension = file_spec.get("suffix")
                subdir = file_spec.find("subdir").text

                # check if tseries_create is an element for this file_spec
                if file_spec.find("tseries_create") is not None:
                    tseries_create = file_spec.find("tseries_create").text

                    # check if the tseries_create element is set to TRUE
                    if tseries_create.upper() in ["T", "TRUE"]:

                        # check if tseries_format is an element for this file_spec and if it is valid
                        if file_spec.find("tseries_output_format") is not None:
                            tseries_output_format = file_spec.find(
                                "tseries_output_format").text
                            if tseries_output_format not in [
                                    "netcdf", "netcdf4", "netcdf4c"
                            ]:
                                err_msg = "cesm_tseries_generator.py error: tseries_output_format invalid for data stream {0}.*.{1}".format(
                                    comp, file_extension)
                                raise TypeError(err_msg)
                        else:
                            err_msg = "cesm_tseries_generator.py error: tseries_output_format undefined for data stream {0}.*.{1}".format(
                                comp, file_extension)
                            raise TypeError(err_msg)

                        # check if the tseries_output_subdir is specified and create the tseries_output_dir
                        if file_spec.find("tseries_output_subdir") is not None:
                            tseries_output_subdir = file_spec.find(
                                "tseries_output_subdir").text
                            tseries_output_dir = '/'.join(
                                [dout_s_root, rootdir, tseries_output_subdir])
                            if not os.path.exists(tseries_output_dir):
                                os.makedirs(tseries_output_dir)
                        else:
                            err_msg = "cesm_tseries_generator.py error: tseries_output_subdir undefined for data stream {0}.*.{1}".format(
                                comp, file_extension)
                            raise TypeError(err_msg)

                        # check if tseries_tper is specified and is valid
                        if file_spec.find("tseries_tper") is not None:
                            tseries_tper = file_spec.find("tseries_tper").text
                            if tseries_tper not in [
                                    "annual", "yearly", "monthly", "weekly",
                                    "daily", "hourly6", "hourly3", "hourly1",
                                    "min30"
                            ]:
                                err_msg = "cesm_tseries_generator.py error: tseries_tper invalid for data stream {0}.*.{1}".format(
                                    comp, file_extension)
                                raise TypeError(err_msg)
                        else:
                            err_msg = "cesm_tseries_generator.py error: tseries_tper undefined for data stream {0}.*.{1}".format(
                                comp, file_extension)
                            raise TypeError(err_msg)

                        # load the tseries_time_variant_variables into a list
                        # NOTE: initialize unconditionally so a stream without a
                        # tseries_time_variant_variables element neither raises a
                        # NameError below nor silently reuses the previous
                        # stream's variable list
                        variable_list = list()
                        if comp_archive_spec.find(
                                "tseries_time_variant_variables") is not None:
                            for variable in comp_archive_spec.findall(
                                    "tseries_time_variant_variables/variable"):
                                variable_list.append(variable.text)

                        # get a list of all the input files for this stream from the archive location
                        history_files = list()
                        in_file_path = '/'.join([dout_s_root, rootdir, subdir])
                        all_in_files = os.listdir(in_file_path)

                        # check that there are actually a list of history files to work with
                        for in_file in all_in_files:
                            if re.search(file_extension, in_file):
                                # check to make sure this file ends in .nc and not something else
                                if in_file.endswith('.nc'):
                                    history_files.append(in_file_path + "/" +
                                                         in_file)
                                else:
                                    print(
                                        'cesm_tseries_generator.py WARNING - unable to operate on file {0}/{1}'
                                        .format(in_file_path, in_file))

                        # sort the list of input history files in order to get the output suffix
                        # from the first and last file
                        if len(history_files) > 0:
                            history_files.sort()

                            start_file = history_files[0]
                            start_file_parts = list()
                            start_file_parts = start_file.split(".")
                            start_file_time = start_file_parts[-2]

                            last_file = history_files[-1]
                            last_file_parts = list()
                            last_file_parts = last_file.split(".")
                            last_file_time = last_file_parts[-2]

                            # get the actual component name from the history file
                            # will also need to deal with the instance numbers based on the comp_name
                            comp_name = last_file_parts[-4]
                            stream = last_file_parts[-3]

                            # check for pop.h nday1 and nyear1 history streams
                            if last_file_parts[-3] in ["nday1", "nyear1"]:
                                comp_name = last_file_parts[-5]
                                stream = last_file_parts[
                                    -4] + "." + last_file_parts[-3]

                            # create the tseries output prefix needs to end with a "."
                            tseries_output_prefix = tseries_output_dir + "/" + casename + "." + comp_name + "." + stream + "."

                            # format the time series variable output suffix based on the
                            # tseries_tper setting suffix needs to start with a "."
                            # NOTE: "annual" passes the tseries_tper validation above,
                            # so it must be handled here alongside "yearly" or
                            # tseries_output_suffix would be left undefined
                            if tseries_tper in ["annual", "yearly"]:
                                tseries_output_suffix = "." + start_file_time + "-" + last_file_time + ".nc"
                            elif tseries_tper == "monthly":
                                start_time_parts = start_file_time.split("-")
                                last_time_parts = last_file_time.split("-")
                                tseries_output_suffix = "." + start_time_parts[
                                    0] + start_time_parts[
                                        1] + "-" + last_time_parts[
                                            0] + last_time_parts[1] + ".nc"
                            elif tseries_tper in [
                                    "weekly", "daily", "hourly6", "hourly3",
                                    "hourly1", "min30"
                            ]:
                                start_time_parts = start_file_time.split("-")
                                last_time_parts = last_file_time.split("-")
                                tseries_output_suffix = "." + start_time_parts[
                                    0] + start_time_parts[1] + start_time_parts[
                                        2] + "-" + last_time_parts[
                                            0] + last_time_parts[
                                                1] + last_time_parts[2] + ".nc"

                            # get a reshaper specification object
                            spec = specification.create_specifier()

                            # populate the spec object with data for this history stream
                            spec.input_file_list = history_files
                            spec.netcdf_format = tseries_output_format
                            spec.output_file_prefix = tseries_output_prefix
                            spec.output_file_suffix = tseries_output_suffix
                            spec.time_variant_metadata = variable_list

                            # print the specifier
                            if debug:
                                dbg = list()
                                pp = pprint.PrettyPrinter(indent=5)
                                dbg = [
                                    comp_name, spec.input_file_list,
                                    spec.netcdf_format,
                                    spec.output_file_prefix,
                                    spec.output_file_suffix,
                                    spec.time_variant_metadata
                                ]
                                pp.pprint(dbg)

                            # append this spec to the list of specifiers
                            specifiers.append(spec)

    return specifiers