Пример #1
0
def get_idurls_from_dataset( dataset ):
    """Return the (id, url) pairs of the files of a dataset held at PCMDI.

    Input is a dataset name, e.g.
    'cmip5.output1.NCC.NorESM1-M.historical.mon.atmos.Amon.r1i1p1'.

    Each PCMDI data node (pcmdi7.llnl.gov, then pcmdi9.llnl.gov) is queried
    through esgquery_index.preoutput for the files whose drs_id is `dataset`.
    The first element of what preoutput returns is read as a sequence of
    3-element rows; every row whose middle element is 'url' contributes the
    pair (first element, third element).

    Returns the list of pairs gathered from both nodes (empty if neither
    node knows the dataset).  If a query fails with
    lxml.etree.XMLSyntaxError, diagnostics are printed and None is returned,
    dropping whatever had already been collected from an earlier node.
    """
    idurls = []
    for server in ('pcmdi7.llnl.gov', 'pcmdi9.llnl.gov'):
        query_argv = ['--type', 'f', '-q', 'drs_id='+dataset+',data_node='+server, '--fields', 'url', '-p']
        try:
            # Only the first element (the result rows) of preoutput's return
            # value is needed; the remaining ones are ignored.
            full_results = esgquery_index.preoutput(query_argv)[0]
        except lxml.etree.XMLSyntaxError as e:
            # Single-argument print(...) produces the same text under both
            # Python 2 and Python 3.
            print("in get_idurls_from_dataset( %s )" % (dataset,))
            print("with query %s" % (query_argv,))
            print("caught exception %s" % (e,))
            return None
        idurls.extend( (a,c) for a,b,c in full_results if b=='url' )
    return idurls
Пример #2
0
def try3_pubpath2version(pubpath):
    """Return the version string of a file published at PCMDI, or None.

    Input is a path of data as published at PCMDI, e.g.
    /cmip5/data/cmip5/output1/INM/inmcm4/1pctCO2/mon/atmos/Amon/r1i1p1/zg/1/zg_Amon_inmcm4_1pctCO2_r1i1p1_210001-210912.nc
    The third '/'-separated component must be exactly "data".  The nine
    components after it, joined with '.', form the dataset name (drs_id)
    that is looked up; everything after "data/" is the path fragment that
    must occur in a file's url for it to count as a match.

    The PCMDI data nodes are queried in turn (pcmdi7.llnl.gov, then
    pcmdi9.llnl.gov), stopping at the first node that yields a match.
    Output is the tenth '.'-separated field of the matching file id.

    None is returned, after printing a message, when the path is not
    recognized (including paths too short to have a "data" component), when
    no file matches, or when more than one file matches.

    Make sure you have p2p.py in your PYTHONPATH:
    export PYTHONPATH=$PYTHONPATH:$HOME/src/esgf-contrib/estani/esgf-replication-61f7311/replication/model/
    and
    export PYTHONPATH=$PYTHONPATH:/export/home/painter/pytools:/export/home/painter/src/esgf-contrib/estani/replicas/
    """
    sppath = pubpath.split('/')
    # A path with fewer than three components cannot contain "data" in the
    # expected position; report it instead of failing with IndexError.
    if len(sppath)<3 or sppath[2]!='data':
        print("don't recognize path as for published data")
        return None

    # Imported only once the path has been validated, so that rejecting a
    # malformed path does not require these packages to be installed.
    # NOTE(review): p2p is not referenced directly in this function;
    # presumably esgquery_index depends on it -- confirm before removing.
    import p2p
    import esgquery_index

    abs_path = '/'.join(sppath[3:])
    dataset = '.'.join(sppath[3:12])
    idurl = []
    for server in ('pcmdi7.llnl.gov', 'pcmdi9.llnl.gov'):
        query_argv = ['--type', 'f', '-q', 'drs_id='+dataset+',data_node='+server, '--fields', 'url', '-p']
        # Only the first element (the result rows) of preoutput's return
        # value is needed.
        full_results = esgquery_index.preoutput(query_argv)[0]
        idurl = [(a,c) for a,b,c in full_results if b=='url' and abs_path in c]
        if idurl:
            break
    versions = [a.split('.')[9] for a,_ in idurl]
    if not versions:
        print("No match found")
        return None
    if len(versions)>1:
        print("Too many matches!  Version cannot be determined. idurl= %s" % (idurl,))
        return None
    return versions[0]
Пример #3
0
def try2_pubpath2version(pubpath):
    """Return the version string of a file published at PCMDI, or None.

    Input is a path of data as published at PCMDI, e.g.
    /cmip5/data/cmip5/output1/INM/inmcm4/1pctCO2/mon/atmos/Amon/r1i1p1/zg/1/zg_Amon_inmcm4_1pctCO2_r1i1p1_210001-210912.nc
    Hardly anything about the path can look different from this!  The format is, in full generality,
    /root/data/project/product/institute/model/experiment/time_frequency/realm/table/ensemble/variable/file_version/filename
    where "data" is exactly that string, and other names have the usual meanings.
    Only the roots "cmip5" and "css02-cmip5" are recognized.

    The file is looked up by its full THREDDS url, trying up to three url
    spellings in order and stopping at the first one that returns results:
      1. pcmdi9.llnl.gov with the fileServer prefix chosen from the root;
      2. pcmdi7.llnl.gov with the same prefix;
      3. pcmdi7.llnl.gov with the '/thredds/fileServer//cmip5_css02/data/'
         prefix and an unescaped '|application/netcdf|HTTPServer' suffix
         (this attempt also prints the url and the query for debugging).

    Output is a version string: the tenth '.'-separated field of the id of
    the single matching result.  None is returned, after printing a message,
    when the path is not recognized (including paths too short to have a
    "data" component), when a query fails with lxml.etree.XMLSyntaxError,
    when nothing is found, or when results with more than one id are found.

    Make sure you have p2p.py in your PYTHONPATH:
    export PYTHONPATH=$PYTHONPATH:$HOME/src/esgf-contrib/estani/esgf-replication-61f7311/replication/model/
    and
    export PYTHONPATH=$PYTHONPATH:/export/home/painter/pytools:/export/home/painter/src/esgf-contrib/estani/replicas/
    """
    sppath = pubpath.split('/')
    # A path with fewer than three components cannot contain "data" in the
    # expected position; report it instead of failing with IndexError.
    if len(sppath)<3 or sppath[2]!='data':
        print("don't recognize path as for published data")
        return None
    if sppath[1]=='cmip5':
        url2 = '/thredds/fileServer/cmip5_data/'
    elif sppath[1]=='css02-cmip5':
        url2 = '/thredds/fileServer/cmip5_css02_data/'
    else:
        print("don't recognize path as for published data")
        return None

    # Imported only once the path has been validated, so that rejecting a
    # malformed path does not require these packages to be installed.
    # NOTE(review): p2p is not referenced directly in this function;
    # presumably esgquery_index depends on it -- confirm before removing.
    import p2p
    import esgquery_index

    url3 = '/'.join(sppath[3:])
    # Raw string: the backslashes are literally part of the query text, and
    # a bare '\|' is an invalid escape sequence in a normal string literal.
    escaped_suffix = r'\|application/netcdf\|HTTPServer'
    # A typical first candidate is
    # 'http://pcmdi9.llnl.gov/thredds/fileServer/cmip5_data/cmip5/output1/INM/inmcm4/1pctCO2/mon/atmos/Amon/r1i1p1/zg/1/zg_Amon_inmcm4_1pctCO2_r1i1p1_210001-210912.nc\|application/netcdf\|HTTPServer'
    # Each entry is (full url, whether to print debugging output).
    candidates = (
        ('http://pcmdi9.llnl.gov'+url2+url3+escaped_suffix, False),
        ('http://pcmdi7.llnl.gov'+url2+url3+escaped_suffix, False),
        ('http://pcmdi7.llnl.gov'+'/thredds/fileServer//cmip5_css02/data/'+url3
         +'|application/netcdf|HTTPServer', True),
    )

    full_results = []
    for urlfull, verbose in candidates:
        if verbose:
            print("urlfull= %s" % (urlfull,))
        query_argv = ['--type', 'f', '-q', 'url='+urlfull, '--fields', 'url,version', '-p']
        if verbose:
            print("query_argv3= %s" % (query_argv,))
        try:
            # Only the first element (the result rows) of preoutput's return
            # value is needed.
            full_results = esgquery_index.preoutput(query_argv)[0]
        except lxml.etree.XMLSyntaxError as e:
            print("in pubpath2version( %s )" % (pubpath,))
            print("and query %s" % (query_argv,))
            print("exception %s" % (e,))
            return None
        if len(full_results)>0:
            # This server/url spelling knows the file; skip the fallbacks.
            break

    # Rows are (id, field, value); several rows may share one id.
    results_ids = set(a for a,b,c in full_results)
    if len(results_ids)==0:
        print("No dataset found")
        return None
    if len(results_ids)>1:
        print("Too many datasets!  Version cannot be determined. ids= %s" % (results_ids,))
        return None
    return next(iter(results_ids)).split('.')[9]