def readArchiveXML(caseroot, input_rootdir, output_rootdir, casename,
                   standalone, completechunk, debug, debugMsg):
    """ reads the $CASEROOT/env_timeseries.xml file and builds a fully defined list of 
         reshaper specifications to be passed to the pyReshaper tool.

    Arguments:
    caseroot (string) - case root path
    input_rootdir (string) - rootdir to input raw history files
    output_rootdir (string) - rootdir to output single variable time series files
    casename (string) - casename
    standalone (boolean) - logical to indicate if postprocessing case is stand-alone or not
    completechunk (boolean) - end on a ragged boundary if True.  Otherwise, do not create incomplete chunks if False
    debug (boolean) - if True, pretty-print each specifier as it is built
    debugMsg (function) - diagnostic message callback

    Returns:
    specifiers (list) - pyReshaper specifier objects, one per output file chunk
    log (dict) - updated chunking status log keyed by comp+stream

    Raises:
    OSError - if $CASEROOT/env_timeseries.xml does not exist
    TypeError - if tseries_output_format is missing/invalid or tseries_tper is invalid
    """
    specifiers = list()
    xml_tree = ET.ElementTree()

    # get path to env_timeseries.xml file
    env_timeseries = '{0}/env_timeseries.xml'.format(caseroot)

    # read tseries log file to see if we've already started converting files,
    # if so, where did we leave off
    log = chunking.read_log('{0}/logs/ts_status.log'.format(caseroot))

    # check if the env_timeseries.xml file exists
    if not os.path.isfile(env_timeseries):
        err_msg = "cesm_tseries_generator.py ERROR: {0} does not exist.".format(
            env_timeseries)
        raise OSError(err_msg)

    # parse the xml
    xml_tree.parse(env_timeseries)

    # loop through all the comp_archive_spec elements to find the tseries related elements
    for comp_archive_spec in xml_tree.findall(
            "components/comp_archive_spec"):
        comp = comp_archive_spec.get("name")
        rootdir = comp_archive_spec.find("rootdir").text
        multi_instance = comp_archive_spec.find("multi_instance").text
        default_calendar = comp_archive_spec.find("default_calendar").text
        debugMsg("default_calendar = {0}".format(default_calendar),
                 header=True)

        # for now, set instance value to empty string implying only 1 instance
        instance = ""

        # loop through all the files/file_spec elements
        for file_spec in comp_archive_spec.findall("files/file_extension"):
            file_extension = file_spec.get("suffix")
            subdir = file_spec.find("subdir").text

            # skip streams that do not request time-series generation
            if file_spec.find("tseries_create") is None:
                continue
            tseries_create = file_spec.find("tseries_create").text
            if tseries_create.upper() not in ["T", "TRUE"]:
                continue

            # check if tseries_format is an element for this file_spec and if it is valid
            if file_spec.find("tseries_output_format") is None:
                err_msg = "cesm_tseries_generator.py error: tseries_output_format undefined for data stream {0}.*.{1}".format(
                    comp, file_extension)
                raise TypeError(err_msg)
            tseries_output_format = file_spec.find(
                "tseries_output_format").text
            if tseries_output_format not in [
                    "netcdf", "netcdf4", "netcdf4c", "netcdfLarge"
            ]:
                err_msg = "cesm_tseries_generator.py error: tseries_output_format invalid for data stream {0}.*.{1}".format(
                    comp, file_extension)
                raise TypeError(err_msg)

            # load the tseries_time_variant_variables into a list
            variable_list = list()
            if comp_archive_spec.find(
                    "tseries_time_variant_variables") is not None:
                for variable in comp_archive_spec.findall(
                        "tseries_time_variant_variables/variable"):
                    variable_list.append(variable.text)

            # get a list of all the input files for this stream from the archive location
            history_files = list()
            in_file_path = '/'.join([input_rootdir, rootdir, subdir])

            # get XML tseries elements for chunking
            # NOTE(review): tseries_tper/tper/size stay unbound (or keep the
            # value from a previous stream) when these elements are missing
            # from the XML - confirm every stream defines them
            if file_spec.find("tseries_tper") is not None:
                tseries_tper = file_spec.find("tseries_tper").text
            if file_spec.find("tseries_filecat_tper") is not None:
                tper = file_spec.find("tseries_filecat_tper").text
            if file_spec.find("tseries_filecat_n") is not None:
                size = file_spec.find("tseries_filecat_n").text
            comp_name = comp
            stream = file_extension.split('.[')[0]

            stream_dates, file_slices, cal, units, time_period_freq = chunking.get_input_dates(
                in_file_path + '/*' + file_extension + '*.nc')
            # fall back to the component default when the calendar attribute
            # was not read from the input files
            if cal is None or cal == "none":
                cal = default_calendar
            debugMsg("calendar = {0}".format(cal), header=True)

            # the tseries_tper should be set in using the time_period_freq global file attribute if it exists
            if time_period_freq is not None:
                tseries_tper = time_period_freq
            tseries_output_dir = '/'.join([
                output_rootdir, rootdir, 'proc/tseries', tseries_tper
            ])
            debugMsg("tseries_output_dir = {0}".format(tseries_output_dir),
                     header=True)

            if not os.path.exists(tseries_output_dir):
                os.makedirs(tseries_output_dir)

            # resume from the chunking log if this stream was already started
            if comp + stream not in log:
                log[comp + stream] = {'slices': [], 'index': 0}
            ts_log_dates = log[comp + stream]['slices']
            index = log[comp + stream]['index']
            files, dates, index = chunking.get_chunks(
                tper, index, size, stream_dates, ts_log_dates, cal,
                units, completechunk)
            for d in dates:
                log[comp + stream]['slices'].append(float(d))
            log[comp + stream]['index'] = index
            # .items() instead of Python-2-only .iteritems()
            for cn, cf in files.items():
                history_files = cf['fn']
                start_time_parts = cf['start']
                last_time_parts = cf['end']

                # create the tseries output prefix needs to end with a "."
                tseries_output_prefix = "{0}/{1}.{2}{3}.".format(
                    tseries_output_dir, casename, comp_name, stream)
                debugMsg("tseries_output_prefix = {0}".format(
                    tseries_output_prefix), header=True)

                # format the time series variable output suffix based on the
                # tseries_tper setting; the suffix needs to start with a "."
                # number of date parts used: year -> 1, month -> 2, day -> 3,
                # sub-daily (week/hour/min) -> 4
                if "year" in tseries_tper:
                    nparts = 1
                elif "month" in tseries_tper:
                    nparts = 2
                elif "day" in tseries_tper:
                    nparts = 3
                elif any(freq_string in tseries_tper
                         for freq_string in ["week", "day", "hour", "min"]):
                    nparts = 4
                else:
                    err_msg = "cesm_tseries_generator.py error: invalid tseries_tper = {0}.".format(
                        tseries_tper)
                    raise TypeError(err_msg)
                tseries_output_suffix = "." + "".join(
                    start_time_parts[:nparts]) + "-" + "".join(
                        last_time_parts[:nparts]) + ".nc"
                debugMsg("tseries_output_suffix = {0}".format(
                    tseries_output_suffix), header=True)

                # get a reshaper specification object
                spec = specification.create_specifier()

                # populate the spec object with data for this history stream
                spec.input_file_list = history_files
                spec.netcdf_format = tseries_output_format
                spec.output_file_prefix = tseries_output_prefix
                spec.output_file_suffix = tseries_output_suffix
                spec.time_variant_metadata = variable_list

                # print the specifier
                if debug:
                    pp = pprint.PrettyPrinter(indent=5)
                    dbg = [
                        comp_name, spec.input_file_list,
                        spec.netcdf_format,
                        spec.output_file_prefix,
                        spec.output_file_suffix,
                        spec.time_variant_metadata
                    ]
                    pp.pprint(dbg)

                # append this spec to the list of specifiers
                specifiers.append(spec)
    return specifiers, log
def readArchiveXML(caseroot, input_rootdir, output_rootdir, casename, standalone, completechunk, 
                   generate_all, debug, debugMsg, comm, rank, size):
    """ reads the $CASEROOT/env_timeseries.xml file and builds a fully defined list of 
         reshaper specifications to be passed to the pyReshaper tool.

    Arguments:
    caseroot (string) - case root path
    input_rootdir (string) - rootdir to input raw history files
    output_rootdir (string) - rootdir to output single variable time series files
    casename (string) - casename
    standalone (boolean) - logical to indicate if postprocessing case is stand-alone or not
    completechunk (boolean) - end on a ragged boundary if True.  Otherwise, do not create incomplete chunks if False
    generate_all (boolean or "T"/"TRUE" string) - generate timeseries for all streams if True.  Otherwise, use the tseries_create setting.
    debug (boolean) - retained for interface compatibility; diagnostic output here is driven by debugMsg verbosity
    debugMsg (function) - diagnostic message callback (invoked on rank 0 only)
    comm - communicator used to synchronize tasks (provides sync())
    rank (int) - this task's rank in comm
    size (int) - number of tasks in comm

    Returns:
    specifiers (list) - pyReshaper specifier objects, one per output file chunk
    log (dict) - updated chunking status log keyed by comp+stream

    Raises:
    OSError - if $CASEROOT/env_timeseries.xml does not exist
    TypeError - if tseries_output_format is missing/invalid or tseries_tper is invalid
    """
    specifiers = list()
    xml_tree = ET.ElementTree()

    # get path to env_timeseries.xml file
    env_timeseries = '{0}/env_timeseries.xml'.format(caseroot)

    # read tseries log file to see if we've already started converting files, if so, where did we leave off
    log = chunking.read_log('{0}/logs/ts_status.log'.format(caseroot))

    # check if the env_timeseries.xml file exists
    if not os.path.isfile(env_timeseries):
        err_msg = "cesm_tseries_generator.py ERROR: {0} does not exist.".format(env_timeseries)
        raise OSError(err_msg)

    # parse the xml
    xml_tree.parse(env_timeseries)

    # loop through all the comp_archive_spec elements to find the tseries related elements
    for comp_archive_spec in xml_tree.findall("components/comp_archive_spec"):
        comp = comp_archive_spec.get("name")
        rootdir = comp_archive_spec.find("rootdir").text
        multi_instance = comp_archive_spec.find("multi_instance").text
        default_calendar = comp_archive_spec.find("default_calendar").text
        if rank == 0:
            debugMsg("default_calendar = {0}".format(default_calendar), header=True, verbosity=1)

        # for now, set instance value to empty string implying only 1 instance
        instance = ""

        # loop through all the files/file_spec elements
        for file_spec in comp_archive_spec.findall("files/file_extension"):
            file_extension = file_spec.get("suffix")
            subdir = file_spec.find("subdir").text

            # check if tseries_create is an element for this file_spec
            if file_spec.find("tseries_create") is None:
                continue
            tseries_create = file_spec.find("tseries_create").text

            # check if the tseries_create element is set to TRUE
            # str() tolerates generate_all arriving as the boolean documented
            # in the docstring as well as a "T"/"TRUE" string; a bare
            # generate_all.upper() would raise AttributeError on a bool
            if not (tseries_create.upper() in ["T", "TRUE"]
                    or str(generate_all).upper() in ["T", "TRUE"]):
                continue

            # check if tseries_format is an element for this file_spec and if it is valid
            if file_spec.find("tseries_output_format") is None:
                err_msg = "cesm_tseries_generator.py error: tseries_output_format undefined for data stream {0}.*.{1}".format(comp,file_extension)
                raise TypeError(err_msg)
            tseries_output_format = file_spec.find("tseries_output_format").text
            if tseries_output_format not in ["netcdf","netcdf4","netcdf4c","netcdfLarge"]:
                err_msg = "cesm_tseries_generator.py error: tseries_output_format invalid for data stream {0}.*.{1}".format(comp,file_extension)
                raise TypeError(err_msg)

            # load the tseries_time_variant_variables into a list
            variable_list = list()
            if comp_archive_spec.find("tseries_time_variant_variables") is not None:
                for variable in comp_archive_spec.findall("tseries_time_variant_variables/variable"):
                    variable_list.append(variable.text)

            # load the tseries_exclude_variables into a list
            exclude_list = list()
            if comp_archive_spec.find("tseries_exclude_variables") is not None:
                for variable in comp_archive_spec.findall("tseries_exclude_variables/variable"):
                    exclude_list.append(variable.text)

            # get a list of all the input files for this stream from the archive location
            history_files = list()
            in_file_path = '/'.join( [input_rootdir,rootdir,subdir] )

            # get XML tseries elements for chunking
            # NOTE(review): tseries_tper/tper/size_n stay unbound (or keep the
            # value from a previous stream) when these elements are missing
            # from the XML - confirm every stream defines them
            if file_spec.find("tseries_tper") is not None:
                tseries_tper = file_spec.find("tseries_tper").text
            if file_spec.find("tseries_filecat_tper") is not None:
                tper = file_spec.find("tseries_filecat_tper").text
            if file_spec.find("tseries_filecat_n") is not None:
                size_n = file_spec.find("tseries_filecat_n").text
            comp_name = comp
            stream = file_extension.split('.[')[0]

            stream_dates,file_slices,cal,units,time_period_freq = chunking.get_input_dates(in_file_path+'/*'+file_extension+'*.nc', comm, rank, size)
            # fall back to the component default when the calendar attribute
            # was not read from the input files
            if cal is None or cal == "none":
                cal = default_calendar
            if rank == 0:
                debugMsg("calendar = {0}".format(cal), header=True, verbosity=1)

            # the tseries_tper should be set in using the time_period_freq global file attribute if it exists
            if time_period_freq is not None:
                tseries_tper = time_period_freq
            tseries_output_dir = '/'.join( [output_rootdir, rootdir, 'proc/tseries', tseries_tper] )
            if rank == 0:
                debugMsg("tseries_output_dir = {0}".format(tseries_output_dir), header=True, verbosity=1)

            # resume from the chunking log if this stream was already started
            if comp+stream not in log:
                log[comp+stream] = {'slices':[],'index':0}
            ts_log_dates = log[comp+stream]['slices']
            index = log[comp+stream]['index']
            files,dates,index = chunking.get_chunks(tper, index, size_n, stream_dates, ts_log_dates, cal, units, completechunk, tseries_tper)
            for d in dates:
                log[comp+stream]['slices'].append(float(d))
            log[comp+stream]['index']=index
            # .items() instead of Python-2-only .iteritems()
            for cn,cf in files.items():

                # only rank 0 creates the output directory; all ranks wait
                # until it exists before continuing
                if rank == 0:
                    if not os.path.exists(tseries_output_dir):
                        os.makedirs(tseries_output_dir)
                comm.sync()

                history_files = cf['fn']
                start_time_parts = cf['start']
                last_time_parts = cf['end']

                # create the tseries output prefix needs to end with a "."
                tseries_output_prefix = "{0}/{1}.{2}{3}.".format(tseries_output_dir,casename,comp_name,stream)
                if rank == 0:
                    debugMsg("tseries_output_prefix = {0}".format(tseries_output_prefix), header=True, verbosity=1)

                # format the time series variable output suffix based on the 
                # tseries_tper setting; the suffix needs to start with a "."
                # number of date parts used: year -> 1, month -> 2, day -> 3,
                # sub-daily (week/hour/min) -> 4
                if "year" in tseries_tper:
                    nparts = 1
                elif "month" in tseries_tper:
                    nparts = 2
                elif "day" in tseries_tper:
                    nparts = 3
                elif any(freq_string in tseries_tper for freq_string in ["week","day","hour","min"]):
                    nparts = 4
                else:
                    err_msg = "cesm_tseries_generator.py error: invalid tseries_tper = {0}.".format(tseries_tper)
                    raise TypeError(err_msg)
                tseries_output_suffix = "."+"".join(start_time_parts[:nparts])+"-"+"".join(last_time_parts[:nparts])+".nc"
                if rank == 0:
                    debugMsg("tseries_output_suffix = {0}".format(tseries_output_suffix), header=True, verbosity=1)

                # get a reshaper specification object
                spec = specification.create_specifier()

                # populate the spec object with data for this history stream
                spec.input_file_list = history_files
                spec.netcdf_format = tseries_output_format
                spec.output_file_prefix = tseries_output_prefix
                spec.output_file_suffix = tseries_output_suffix
                spec.time_variant_metadata = variable_list
                spec.exclude_list = exclude_list
                # setting the default backend; netCDF4 or pynio
                spec.backend = 'netCDF4'

                if rank == 0:
                    debugMsg("specifier: comp_name = {0}".format(comp_name), header=True, verbosity=1)
                    debugMsg("    input_file_list = {0}".format(spec.input_file_list), header=True, verbosity=1)
                    debugMsg("    netcdf_format = {0}".format(spec.netcdf_format), header=True, verbosity=1)
                    debugMsg("    output_file_prefix = {0}".format(spec.output_file_prefix), header=True, verbosity=1)
                    debugMsg("    output_file_suffix = {0}".format(spec.output_file_suffix), header=True, verbosity=1)
                    debugMsg("    time_variant_metadata = {0}".format(spec.time_variant_metadata), header=True, verbosity=1)
                    debugMsg("    exclude_list = {0}".format(spec.exclude_list), header=True, verbosity=1)

                # append this spec to the list of specifiers
                specifiers.append(spec)
    return specifiers,log