Example #1
def crawl(url):
    c = Crawl(url, select=['.*meta.nc'], skip=None, debug=None)
    locsite = []

    for jr in c.datasets:
        locsite.append(str(jr.id))
    rdro = [
        'http://tds0.ifremer.fr/thredds/catalog/' +
        '/'.join(jj.split('/')[:-1]) + '/catalog.html' for jj in locsite
    ]
    print(rdro)

    #print ('the new data is',rdro[0:1])
    gd = 0
    sitenamerd = []

    #for kk in rdro:
    for kk in rdro[0:4]:
        # Only the first few catalogs are crawled here to reduce run time; iterate over the full `rdro` list (see the commented loop above) for a complete crawl.
        urd = rdro[gd]
        crr = Crawl(urd,
                    select=None,
                    skip=['.*meta.nc', '.*Rtraj.nc', '.*tech.nc'],
                    debug=None)
        gd += 1

        for pp in crr.datasets:
            #print (pp.id)
            sitenamerd.append(
                ['http://tds0.ifremer.fr/thredds/dodsC/' + str(pp.id)])

    return sitenamerd
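A minimal sketch of how the crawler above might be driven; the Ifremer DAC sub-catalog URL below is an assumption used purely for illustration.

# Hypothetical driver for crawl(); the catalog URL is an assumption, not taken from the example.
if __name__ == '__main__':
    dac_catalog = 'http://tds0.ifremer.fr/thredds/catalog/CORIOLIS-ARGO-GDAC-OBS/aoml/catalog.xml'
    opendap_urls = crawl(dac_catalog)
    print('Collected %d OPeNDAP endpoints' % len(opendap_urls))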
Example #2
    def test_modified_time(self):
        # after with timezone
        af = datetime(2015, 12, 30, 0, 0, tzinfo=pytz.utc)
        c = Crawl(
            "http://tds.maracoos.org/thredds/catalog/MODIS-Chesapeake-Salinity/raw/2015/catalog.xml",
            after=af)
        assert len(c.datasets) == 3

        # after without timezone
        af = datetime(2015, 12, 30, 0, 0)
        c = Crawl(
            "http://tds.maracoos.org/thredds/catalog/MODIS-Chesapeake-Salinity/raw/2015/catalog.xml",
            after=af)
        assert len(c.datasets) == 3

        # before
        bf = datetime(2016, 1, 8, 0, 0)
        c = Crawl(
            "http://tds.maracoos.org/thredds/catalog/MODIS-Chesapeake-Salinity/raw/2016/catalog.xml",
            before=bf)
        assert len(c.datasets) == 3

        # both
        af = datetime(2016, 1, 20, 0, 0)
        bf = datetime(2016, 2, 1, 0, 0)
        c = Crawl(
            "http://tds.maracoos.org/thredds/catalog/MODIS-Chesapeake-Salinity/raw/2016/catalog.xml",
            before=bf,
            after=af)
        assert len(c.datasets) == 11
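Outside the test suite, the same before/after filtering can be used to keep only datasets modified inside a time window; a small sketch, reusing the MARACOOS catalog from the test above.

from datetime import datetime
from thredds_crawler.crawl import Crawl

# Sketch of the modified-time filter exercised by the test above; catalog URL taken from the test.
catalog = "http://tds.maracoos.org/thredds/catalog/MODIS-Chesapeake-Salinity/raw/2016/catalog.xml"
recent = Crawl(catalog,
               after=datetime(2016, 1, 20, 0, 0),
               before=datetime(2016, 2, 1, 0, 0))
for d in recent.datasets:
    print(d.id, d.modified)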
Example #3
 def url_trawler(self, url, expr):
     if url.endswith(".xml"):
         c = Crawl(url, select=[expr])
     elif url.endswith("/"):
         # try appending catalog.xml, since the user may have provided a directory
         c = Crawl(url + "catalog.xml", select=[expr])
     else:
         # try appending /catalog.xml, since the user may have provided a directory
         c = Crawl(url + "/catalog.xml", select=[expr])
     urls = [
         s.get("url") for d in c.datasets for s in d.services
         if s.get("service").lower() == "opendap"
     ]
     return urls
Example #4
    def test_root_finder(self):
        urls = [
            ('http://data.eol.ucar.edu/jedi/catalog/ucar.ncar.eol.dataset.106_224.thredds.xml',
                 'http://data.eol.ucar.edu/jedi/catalog/ucar.ncar.eol.thredds.xml', False),
            ('http://www.esrl.noaa.gov/psd/thredds/catalog/Datasets/noaa.oisst.v2.derived/catalog.xml',
                 'http://www.esrl.noaa.gov/psd/thredds/catalog.xml', True),
            ('https://rsg.pml.ac.uk/thredds/catalog/cnr/3b42-3h/1998/01/01/catalog.xml',
                 'https://rsg.pml.ac.uk/thredds/catalog.xml', True)
        ]

        for url, expected, output in urls:
            crawler = Crawl(url)
            found_url = crawler._find_root_url()

            assert (found_url == expected) == output
Example #5
def loadGliders(loader, stride=1):
    '''
    Crawl the IOOS Glider TDS for OPeNDAP links of Time aggregated files and load into STOQS
    '''

    c = Crawl("http://tds.gliders.ioos.us/thredds/catalog.xml",
              select=[".*_Time$"])
    urls = [
        s.get("url") for d in c.datasets for s in d.services
        if s.get("service").lower() == "opendap"
    ]
    colors = loader.colors.values()

    for url in urls:
        aName = url.split('/')[-1].split('.')[0]
        pName = aName.replace('_Time', '')
        if pName.find('-') != -1:
            logger.warn(
                "Replacing '-' characters in platform name %s with '_'s",
                pName)
            pName = pName.replace('-', '_')

        logger.info("Executing runGliderLoader with url = %s", url)
        try:
            runGliderLoader(url, loader.campaignName, il.campaignDescription,
                            aName, pName, colors.pop(), 'glider',
                            'Glider Mission', loader.parms, loader.dbAlias,
                            stride, loader.startDatetime, loader.endDatetime,
                            il.grdTerrain)
        except Exception as e:
            logger.error('%s. Skipping this dataset.', e)
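A stripped-down sketch of just the crawl step above, without the STOQS loading machinery; it lists the Time-aggregated glider OPeNDAP endpoints from the same IOOS DAC catalog.

from thredds_crawler.crawl import Crawl

# Sketch: list Time-aggregated glider OPeNDAP endpoints from the IOOS glider DAC catalog used above.
c = Crawl("http://tds.gliders.ioos.us/thredds/catalog.xml", select=[".*_Time$"])
for d in c.datasets:
    for s in d.services:
        if s.get("service").lower() == "opendap":
            print(d.id, s.get("url"))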
Example #6
    def test_root_finder(self):
        urls = [
            ('http://data.eol.ucar.edu/jedi/catalog/ucar.ncar.eol.dataset.106_224.thredds.xml',
             'http://data.eol.ucar.edu/jedi/catalog/ucar.ncar.eol.thredds.xml',
             False),
            ('http://www.esrl.noaa.gov/psd/thredds/catalog/Datasets/noaa.oisst.v2.derived/catalog.xml',
             'http://www.esrl.noaa.gov/psd/thredds/catalog.xml', True),
            ('https://rsg.pml.ac.uk/thredds/catalog/cnr/3b42-3h/1998/01/01/catalog.xml',
             'https://rsg.pml.ac.uk/thredds/catalog.xml', True)
        ]

        for url, expected, output in urls:
            crawler = Crawl(url)
            found_url = crawler._find_root_url()

            assert (found_url == expected) == output
Example #7
    def test_regex_selects(self):
        c = Crawl("http://tds.maracoos.org/thredds/MODIS.xml", select=[".*-Agg"])
        assert len(c.datasets) == 9

        # Get all DAP links:
        services = [s.get("url") for d in c.datasets for s in d.services if s.get("service").lower() == "opendap"]
        assert len(services) == 9
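The services list comprehension above recurs throughout these examples; below is a small reusable helper, offered as a sketch, that returns the URLs for any named THREDDS service type.

from thredds_crawler.crawl import Crawl

def service_urls(catalog_url, service_name="opendap", **crawl_kwargs):
    # Sketch of a generic helper: crawl a catalog and keep URLs for one service type.
    c = Crawl(catalog_url, **crawl_kwargs)
    return [s.get("url")
            for d in c.datasets
            for s in d.services
            if s.get("service").lower() == service_name.lower()]

# Hypothetical usage, reusing the catalog and pattern from the test above:
# dap_links = service_urls("http://tds.maracoos.org/thredds/MODIS.xml", "opendap", select=[".*-Agg"])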
Example #8
def main(
        url='http://opendap-devel.ooi.rutgers.edu:8090/thredds/catalog/first-in-class/catalog.xml',
        stmt='.*ncml'):
    C = Crawl(url, select=[stmt])
    tds = 'http://opendap-devel.ooi.rutgers.edu:8090/thredds/dodsC/'
    reg_ex = re.compile('|'.join(['config', 'meta', 'engine', 'diag']))

    data = []
    for dataset in C.datasets:
        if reg_ex.search(dataset.id) is not None:
            continue
        file = tds + dataset.id
        with xr.open_dataset(file) as ds:
            ds_disk = ds.swap_dims(
                {'obs': 'time'})  # change dimensions from 'obs' to 'time'
            # ds_variables = ds.data_vars.keys()  # List of dataset variables
            refdes = ds.subsite + '-' + ds.node + '-' + ds.sensor
            stream = ds_disk.stream  # List stream name associated with the data
            delivery = ds.collection_method
            start = ds.time_coverage_start
            end = ds.time_coverage_end
            # data.append((refdes, stream, delivery, 'Yes', 'Yes', start + ' to ' + end))
    df = pd.DataFrame(data,
                      columns=[
                          'RefDes', 'Stream', 'Delivery Method', 'Data Downloaded',
                          'Data range - MIO', 'Time Range'
                      ])
    df.to_csv('/Users/michaesm/Documents/Summary.csv')
Example #9
 def test_single_dataset(self):
     c = Crawl("http://tds.maracoos.org/thredds/MODIS.xml", select=["MODIS-Agg"])
     assert len(c.datasets) == 1
     assert c.datasets[0].id == "MODIS-Agg"
     assert len(c.datasets[0].services) == 2
     service_names = sorted(map(lambda x: x.get('service'), c.datasets[0].services))
     assert service_names == ["ISO", "OPENDAP"]
Example #10
def thredds_find_glob(base_catalog: str,
                      skips: List[str],
                      select: List[str],
                      workers: int = 8) -> List[str]:
    """Glob YAML's from base Thredds Catalog recursively
    Arguments:
        base_catalog {str} -- Base of the catlog to crawl from
        user_skips {list} -- Paths to skip in addition to NCI specific defaults
        select {list} -- Paths to select (useful YAML's)
        workers {int} -- Number of workers to use for Thredds Crawling
    Returns:
        list -- List of Thredds hosted dataset YAML url's to Index
    """
    user_skips = Crawl.SKIPS
    user_skips = user_skips.extend(skips)

    results = Crawl(base_catalog + "/catalog.xml",
                    select=select,
                    skip=user_skips,
                    workers=workers).datasets

    urls = [
        service["url"] for dataset in results for service in dataset.services
        if service["service"].lower() == "httpserver"
    ]

    return urls
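A sketch of calling thredds_find_glob; the NCI catalog path and the select/skip patterns are assumptions borrowed from the CLI docstring in Example #27 below.

# Hypothetical invocation of thredds_find_glob(); catalog and patterns are assumptions.
yaml_urls = thredds_find_glob(
    base_catalog="http://dapds00.nci.org.au/thredds/catalog/if87/2018-11-29",
    skips=[".*NBAR.*", ".*SUPPLEMENTARY.*"],
    select=[".*ARD-METADATA.yaml"],
    workers=8,
)
print("Found {} YAML files".format(len(yaml_urls)))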
Example #11
def get_metadata(thredds_servers,
                 save_dir,
                 skips=Crawl.SKIPS,
                 select=None,
                 debug=True,
                 logger_name=None):
    logger = logging.getLogger(logger_name)
    tsi = thredds_servers.items()
    local_metadata_paths = []
    for subfolder, thredds_url in tsi:
        logger.info("Crawling {0} ({1})".format(subfolder, thredds_url))
        crawler = Crawl(thredds_url, skip=skips, select=select, debug=debug)
        filefolder = os.path.join(save_dir, subfolder)
        if not os.path.exists(filefolder):
            os.makedirs(filefolder)
        isos = [(d.id, s.get("url")) for d in crawler.datasets
                for s in d.services if s.get("service").lower() == "iso"]
        for iso in isos:
            filename = '{0}{1}'.format(iso[0].replace('/', '_'), '.iso.xml')
            filepath = os.path.join(filefolder, filename)
            try:
                urlretrieve(iso[1], filepath)
            except BaseException:
                logger.exception("Error!")
            else:
                local_metadata_paths.append(filepath)
    return local_metadata_paths
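A sketch of how get_metadata might be called; the server mapping, output directory, and select pattern below are assumptions.

# Hypothetical usage of get_metadata(); server mapping and paths are assumptions.
servers = {
    "maracoos_modis": "http://tds.maracoos.org/thredds/MODIS.xml",
}
iso_files = get_metadata(servers, save_dir="/tmp/iso_waf", select=[".*-Agg"])
print("Downloaded {} ISO records".format(len(iso_files)))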
Example #12
def find_urls(base, select, startdate, enddate):
    url = os.path.join(base, 'catalog.xml')
    print("Crawling: {}".format(url))
    skips = Crawl.SKIPS + [
        ".*Courier*", ".*Express*", ".*Normal*", ".*Priority*", ".*.cfg$"
    ]
    u = urlparse(url)
    name, ext = os.path.splitext(u.path)
    if ext == ".html":
        u = urlparse(url.replace(".html", ".xml"))
    url = u.geturl()
    urls = []
    try:
        c = Crawl(url, select=[".*dlist"])

        # Crawl the catalogRefs:
        for dataset in c.datasets:

            try:
                # get the mission directory name and extract the start and ending dates
                dlist = os.path.basename(dataset.id)
                mission_dir_name = dlist.split('.')[0]
                dts = mission_dir_name.split('_')
                dir_start = datetime.strptime(dts[0], '%Y%m%d')
                dir_end = datetime.strptime(dts[1], '%Y%m%d')

                # if within a valid range, grab the valid urls
                if dir_start >= startdate and dir_end <= enddate:
                    catalog = '{}_{}/catalog.xml'.format(
                        dir_start.strftime('%Y%m%d'),
                        dir_end.strftime('%Y%m%d'))
                    c = Crawl(os.path.join(base, catalog),
                              select=[select],
                              skip=skips)
                    d = [
                        s.get("url") for d in c.datasets for s in d.services
                        if s.get("service").lower() == "opendap"
                    ]
                    for url in d:
                        urls.append(url)
            except Exception as ex:
                print("Error reading mission directory name {}".format(ex))

    except BaseException:
        print("Skipping {} (error parsing the XML XML)".format(url))

    return urls
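A sketch of calling find_urls with a date window; the base catalog and select pattern are placeholders, since the function only assumes mission directories named YYYYMMDD_YYYYMMDD.

from datetime import datetime

# Hypothetical call to find_urls(); base catalog and select pattern are assumptions.
urls = find_urls(
    base="http://dods.mbari.org/thredds/catalog/LRAUV/daphne/missionlogs/2017",
    select=".*_2S_scieng.nc",
    startdate=datetime(2017, 9, 1),
    enddate=datetime(2017, 9, 30),
)
print("{} OPeNDAP urls in range".format(len(urls)))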
Example #13
def drifters(drifter_id, projection, resolution, extent):
    buoy_id = []
    lat = []
    lon = []
    status = []

    if drifter_id in ['all', 'active', 'inactive', 'not responding']:
        c = Crawl(app.config['DRIFTER_CATALOG_URL'], select=[".*.nc$"])
        drifters = [d.name[:-3] for d in c.datasets]
    else:
        drifters = drifter_id.split(",")

    for d in drifters:
        with Dataset(app.config["DRIFTER_URL"] % d, 'r') as ds:
            if drifter_id == 'active' and ds.status != 'normal':
                continue
            elif drifter_id == 'inactive' and ds.status != 'inactive':
                continue
            elif drifter_id == 'not responding' and \
                    ds.status != 'not responding':
                continue
            buoy_id.append(ds.buoyid)
            lat.append(ds['latitude'][:])
            lon.append(ds['longitude'][:])
            status.append(ds.status)

    proj = pyproj.Proj(init=projection)
    view = _get_view(extent)

    res = []
    for i, bid in enumerate(buoy_id):
        x, y = proj(lon[i], lat[i])

        ls = LineString(zip(y, x))
        if view.envelope.intersects(ls):
            path = np.array(ls.simplify(resolution * 1.5).coords)
            path = np.array(
                proj(path[:, 1], path[:, 0], inverse=True)).transpose()

            res.append({
                'type': "Feature",
                'geometry': {
                    'type': "LineString",
                    'coordinates': path.astype(float).tolist()
                },
                'properties': {
                    'name': bid,
                    'status': status[i],
                    'type': "drifter",
                    'resolution': resolution,
                }
            })

    result = {
        'type': "FeatureCollection",
        'features': res,
    }

    return result
Example #14
 def get(self):
     catalog_url = self.get_argument('catalog_url')
     self.set_header('Content-Type', 'application/json')
     c = ThreddsConfig(config=self.config)
     crawl = Crawl(catalog_url, workers=c.workers)
     datasets = sorted([flatten_dataset(d) for d in crawl.datasets],
                       key=lambda d: d['id'])
     self.finish(json.dumps(datasets))
Example #15
 def test_iso_links(self):
     c = Crawl("http://thredds.axiomdatascience.com/thredds/global.html")
     isos = [
         s.get("url") for d in c.datasets for s in d.services
         if s.get("service").lower() == "iso"
     ]
     assert "?dataset=" in isos[0]
     assert "&catalog=" in isos[0]
Example #16
def find_urls(base, search_str):
    INV_NS = "http://www.unidata.ucar.edu/namespaces/thredds/InvCatalog/v1.0"
    url = os.path.join(base, 'catalog.xml')
    print "Crawling: %s" % url
    skips = Crawl.SKIPS + [
        ".*Courier*", ".*Express*", ".*Normal*", ".*Priority*", ".*.cfg$"
    ]
    u = urlparse.urlsplit(url)
    name, ext = os.path.splitext(u.path)
    if ext == ".html":
        u = urlparse.urlsplit(url.replace(".html", ".xml"))
    url = u.geturl()
    urls = []
    # Get an etree object
    try:
        r = requests.get(url)
        tree = etree.XML(r.text.encode('utf-8'))

        # Crawl the catalogRefs:
        for ref in tree.findall('.//{%s}catalogRef' % INV_NS):

            try:
                # get the mission directory name and extract the start and ending dates
                mission_dir_name = ref.attrib[
                    '{http://www.w3.org/1999/xlink}title']
                dts = mission_dir_name.split('_')
                dir_start = datetime.datetime.strptime(dts[0], '%Y%m%d')
                dir_end = datetime.datetime.strptime(dts[1], '%Y%m%d')

                # if within a valid range, grab the valid urls
                if dir_start >= startdate and dir_end <= enddate:

                    print 'Found mission directory ' + dts[0]
                    print 'Searching if within range %s and %s  %s %s' % (
                        startdate, enddate, dir_start, dir_end)
                    catalog = ref.attrib['{http://www.w3.org/1999/xlink}href']
                    c = Crawl(os.path.join(base, catalog),
                              select=[search_str],
                              skip=skips)
                    d = [
                        s.get("url") for d in c.datasets for s in d.services
                        if s.get("service").lower() == "opendap"
                    ]
                    for url in d:
                        urls.append(url)
            except Exception as ex:
                print "Error reading mission directory name %s" % ex

    except BaseException:
        print "Skipping %s (error parsing the XML)" % url

    if not urls:
        raise FileNotFound('No urls matching "{}" found in {}'.format(
            search_str, os.path.join(base, 'catalog.html')))

    return urls
Example #17
def parse_opendap(crawl_path):

    if verbose > 0:
        print('crawl', crawl_path)

    skips = Crawl.SKIPS
    #skips = Crawl.SKIPS + [".*FV00", ".*realtime", ".*Real-time", ".*daily", ".*REAL_TIME", ".*regridded", ".*burst", ".*gridded", ".*long-timeseries"]
    #skips = Crawl.SKIPS + [".*realtime", ".*Real-time", ".*daily", ".*REAL_TIME", ".*regridded"]
    #skips = Crawl.SKIPS + [".*regridded"]

    #crawl_path = 'http://thredds.aodn.org.au/thredds/catalog/IMOS/' + path + '/catalog.xml'
    #crawl_path='http://thredds.aodn.org.au/thredds/catalog/IMOS/ANMN/NRS/NRSKAI/Biogeochem_profiles/catalog.html'

    c = Crawl(crawl_path, select=['.*'], skip=skips)

    #c = Crawl('http://dods.ndbc.noaa.gov/thredds/catalog/oceansites/DATA/IMOS-EAC/catalog.xml', select=['.*'])
    #c = Crawl('http://dods.ndbc.noaa.gov/thredds/catalog/oceansites/DATA/IMOS-ITF/catalog.xml', select=['.*'])
    #c = Crawl('http://dods.ndbc.noaa.gov/thredds/catalog/oceansites/DATA/SOTS/catalog.xml', select=['.*'])

    # print(c.datasets)

    # the service filter can be 'httpserver' or 'opendap'
    urls = [
        s.get("url") for d in c.datasets for s in d.services
        if s.get("service").lower() == "httpserver"
    ]  # httpserver or opendap

    if verbose > 1:
        for url in urls:
            print(url)

    for d in c.datasets:
        if verbose > 2:
            print('datasets', d)

        for s in d.services:
            if verbose > 2:
                print('services', s)

            if s.get("service").lower() == 'opendap':

                url = s.get("url")

                if verbose > 1:
                    print('url', s.get("url"))

                #get_nc_dataset(s.get("url"), 'TEMP')
                nc = None
                try:
                    nc = Dataset(url, mode="r")
                    postgres_insert(nc, url=url)
                except Exception as e:
                    print(url, e)
                if nc:
                    nc.close()
Example #18
def web_crawler_mooring(beginDT,
                        endDT,
                        location='shelf',
                        method='telemetered'):
    USERNAME = '******'
    TOKEN = 'TEMP-TOKEN-A3STSZK6P6ULST'
    #Sensor Inventory
    SENSOR_BASE_URL = 'https://ooinet.oceanobservatories.org/api/m2m/12576/sensor/inv/'
    # Instrument Information
    if location == 'shelf':
        site = 'CE02SHSM'
    elif location == 'offshore':
        site = 'CE04OSSM'
    node = 'SBD11'
    instrument = '06-METBKA000'
    if method == 'telemetered':
        stream = 'metbk_a_dcl_instrument'
    elif method == 'recovered_host':
        stream = 'metbk_a_dcl_instrument_recovered'

    data_request_url = '/'.join(
        (SENSOR_BASE_URL, site, node, instrument, method, stream))

    params = {
        'beginDT': beginDT,
        'endDT': endDT,
        'format': 'application/csv',
        'include_provenance': 'false',
        'include_annotations': 'false',
    }
    r = requests.get(data_request_url, params=params, auth=(USERNAME, TOKEN))

    dataraw = r.json()
    print(method)
    print(dataraw)

    # This part checks that the requested data link is ready to go.
    check_complete = dataraw['allURLs'][1] + '/status.txt'
    for i in range(10000):
        r = requests.get(check_complete)
        if r.status_code == requests.codes.ok:
            print('request completed')
            break
        else:
            time.sleep(.5)

    # This part then finds and downloads the requested csv file.
    url = dataraw['allURLs'][0]
    c = Crawl(url, select=['.*\.csv$'], debug=False)
    urls = [
        s.get("url") for d in c.datasets for s in d.services
        if s.get("service").lower() == "httpserver"
    ]
    urlsrev = [url for url in reversed(urls)]
    return urlsrev
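A sketch of invoking web_crawler_mooring; the begin/end timestamps below are placeholders (OOI m2m expects ISO-8601 strings) and a valid API username/token must be set inside the function.

# Hypothetical call; timestamps are placeholders and a real OOI token is required.
csv_urls = web_crawler_mooring('2017-01-01T00:00:00.000Z',
                               '2017-02-01T00:00:00.000Z',
                               location='shelf',
                               method='telemetered')
print(csv_urls[:3])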
Example #19
def crawl(url):
    c = Crawl(url, select=['.*meta.nc'], skip=None, debug=None)
    locsite = []

    for jr in c.datasets:
        locsite.append(str(jr.id))
    rdro = [
        'http://tds0.ifremer.fr/thredds/catalog/' +
        '/'.join(jj.split('/')[:-1]) + '/catalog.html' for jj in locsite
    ]
    #    print (rdro)

    #print ('the new data is',rdro[0:1])
    gd = 0
    sitenamerd = []
    added = 0
    for kk in rdro:
        #for kk in rdro[0:4]:
        urd = rdro[gd]
        crr = Crawl(urd,
                    select=None,
                    skip=['.*meta.nc', '.*Rtraj.nc', '.*tech.nc'],
                    debug=None)
        gd += 1

        for pp in crr.datasets:
            #print (pp.id)
            sitenamerd.append(
                ['http://tds0.ifremer.fr/thredds/dodsC/' + str(pp.id)])
            uricorrect = 'http://tds0.ifremer.fr/thredds/dodsC/' + str(pp.id)
            ds0, cr0 = try_add_argo_float(uricorrect)

            if cr0:
                print('Added %s, no. %d/%d' % (uricorrect, added, len(crr.datasets)))
                added += 1
                print('Added', added)


#            import ipdb
#            ipdb.set_trace()
#    return sitenamerd
    return added
Example #20
def main(url, out):
    now = dt.datetime.now().strftime('%Y.%m.%dT%H.%M.00')
    C = Crawl(url, select=[".*ncml"])
    tds = 'https://opendap.oceanobservatories.org/thredds/dodsC/'
    cf.create_dir(out)
    fopen = open(out + '/' + now + '-nc-links.txt', 'w')

    for dataset in C.datasets:
        fopen.write(tds + dataset.id + '\n')

    fopen.close()
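A sketch of running main above; the OOI THREDDS catalog URL and output directory are assumptions.

# Hypothetical invocation of main(); catalog URL and output directory are assumptions.
main('https://opendap.oceanobservatories.org/thredds/catalog/ooi/example-user/catalog.xml',
     out='./nc_link_lists')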
Example #21
def list_class4_files(class4_catalog_url):
    # Taken from Ocean Navigator source file "misc.py".
    c = Crawl(class4_catalog_url, select=[".*_GIOPS_.*.nc$"])

    result = []
    for dataset in c.datasets:
        value = dataset.name[:-3]
        date = datetime.datetime.strptime(value.split("_")[1], "%Y%m%d")
        result.append({'name': date.strftime("%Y-%m-%d"), 'id': value})

    return result
Example #22
def list_class4_models(class4_id):
    select = ["(.*/)?%s.*_profile.nc$" % class4_id[:16]]
    c = Crawl(current_app.config["CLASS4_CATALOG_URL"], select=select)

    result = []
    for dataset in c.datasets:
        value = dataset.name[:-3]
        model = value.split("_")[2]
        if model != "GIOPS":
            result.append({'value': value.split("_")[2], 'id': value})

    return result
Example #23
    def test_coawst_parse(self):
        selects = ['.*\.ncd']
        skips = Crawl.SKIPS + ['.*MATLAB.*']
        c = Crawl(
            'http://gamone.whoi.edu/thredds/catalog/coawst_4/use/fmrc/catalog.xml',
            select=selects,
            skip=skips)

        assert len(c.datasets) > 0
        isos = [(d.id, s.get("url")) for d in c.datasets for s in d.services
                if s.get("service").lower() == "iso"]
        assert len(isos) > 0
Example #24
def list_class4_files_slowly():
    # This function has poor performance; only use as a fallback.
    c = Crawl(current_app.config["CLASS4_CATALOG_URL"],
              select=[".*_GIOPS_.*.nc$"],
              workers=16)

    result = []
    for dataset in c.datasets:
        value = dataset.name[:-3]
        date = datetime.datetime.strptime(value.split("_")[1], "%Y%m%d")
        result.append({'name': date.strftime("%Y-%m-%d"), 'id': value})

    return result
Example #25
def list_class4_files():
    c = Crawl(app.config["CLASS4_CATALOG_URL"], select=[".*_GIOPS_.*.nc$"])

    result = []
    for dataset in c.datasets:
        value = dataset.name[:-3]
        date = datetime.datetime.strptime(value.split("_")[1], "%Y%m%d")
        result.append({
            'name': date.strftime("%Y-%m-%d"),
            'id': value
        })

    return result
Example #26
def get_thredds_waf(url, destination_path, suffix=None):
    '''
    Scrapes the available ISO files at the specified THREDDS instance.
    The URL must point to the catalog.xml
    '''
    c = Crawl(url)
    datasets = c.datasets
    suffix = suffix or ''
    for dataset in datasets:
        services = { row['name'] : row for row in dataset.services }
        iso_url = services['iso']['url']
        if iso_url:
            get_iso_doc(iso_url, destination_path, dataset.id + suffix + '.xml')
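A sketch of calling get_thredds_waf; the catalog URL and destination path are assumptions, and the function presumes each dataset advertises a service literally named 'iso'.

# Hypothetical usage; catalog URL and destination directory are assumptions.
get_thredds_waf("http://tds.maracoos.org/thredds/catalog.xml",
                destination_path="/tmp/iso_waf",
                suffix=".iso")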
Example #27
def cli(thredds_catalogue, skips, select, workers, outfile):
    """ Download Metadata from THREDDS server to tarball

    Example:

       \b
       Download files in directory that match `*yaml` and store them as a tar
        > thredds-to-tar -c "http://dapds00.nci.org.au/thredds/catalog/if87/2018-11-29/"
        -t ".*ARD-METADATA.yaml" -s '.*NBAR.*' -s '.*SUPPLEMENTARY.*'
         -s '.*NBART.*' -s '.*/QA/.*' -w 8 --outfile 2018-11-29.tar.gz

    """

    user_skips = Crawl.SKIPS
    for skip in skips:
        user_skips = user_skips + [skip]

    print("Searching {thredds_catalogue} for matching files".format(
        thredds_catalogue=thredds_catalogue))
    results = Crawl(thredds_catalogue + '/catalog.xml',
                    select=[select],
                    skip=user_skips,
                    workers=workers).datasets

    print("Found {0} metadata files".format(str(len(results))))

    # construct (guess) the fileserver url based on
    # https://www.unidata.ucar.edu/software/thredds/v4.6/tds/reference/Services.html#HTTP

    parsed_uri = urlparse(thredds_catalogue)

    split_path = parsed_uri.path.split('/')
    fileserver_path = parsed_uri.scheme + '://' + parsed_uri.netloc + '/'.join(
        split_path[:(split_path.index('thredds') + 1)] + ['fileServer', ''])

    parsed_uri = urlparse(fileserver_path)

    # use a threadpool to download from thredds
    pool = ThreadPool(workers)
    yamls = pool.map(partial(download, parsed_uri=parsed_uri), results)
    pool.close()
    pool.join()

    # jam it all in a tar
    tar_opts = dict(name=outfile,
                    mode='w' + tar_mode(gzip=True, xz=True, is_pipe=False))
    with tarfile.open(**tar_opts) as tar:
        for yaml in yamls:
            add_txt_file(tar=tar, content=yaml[0], fname=yaml[1])

    print("Done!")
Example #28
def crawl(url, **options):
    validate_uri(url)

    skips = Crawl.SKIPS + ['.*ncml']
    c = Crawl(url, skip=skips, debug=True)
    added = 0
    for ds in c.datasets:
        url = [s.get('url') for s in ds.services if
                s.get('service').lower()=='opendap'][0]
        metno_obs_stat, cr = MetObsStation.objects.get_or_create(url)
        if cr:
            added += 1
            print('Added %s, no. %d/%d'%(url, added, len(c.datasets)))
    return added
Example #29
    def __init__(self,
                 catalog_url,
                 out_dir,
                 log_file=None,
                 select=None,
                 skip=None,
                 clean=True):

        self.logger = logging.getLogger('thredds_crawler')
        self.logger.setLevel(logging.DEBUG)
        self.logger.handlers = []
        self.__add_stream_logger()

        if log_file is not None:
            self.__add_file_logger(log_file)
        if skip is None:
            skip = Crawl.SKIPS
        else:
            skip.extend(Crawl.SKIPS)

        if not os.path.exists(out_dir):
            os.makedirs(out_dir)

        found_isos = []
        catalog = Crawl(catalog_url, select=select, skip=skip)
        isos = [(d.id, s.get("url")) for d in catalog.datasets
                for s in d.services if s.get("service").lower() == "iso"]
        for iso in isos:
            try:
                filename = iso[0].replace("/", "_") + ".iso.xml"
                found_isos.append(filename)
                filepath = os.path.join(out_dir, filename)
                self.logger.info("Downloading/Saving %s" % filepath)

                r = requests.get(iso[1], stream=True)
                if r.ok:
                    with open(filepath, 'wb') as f:
                        for chunk in r.iter_content():
                            if chunk:
                                f.write(chunk)
                else:
                    self.logger.info("Got a non-200 status code (%s) from %s" %
                                     (r.status_code, iso[1]))
            except KeyboardInterrupt:
                self.logger.info("Caught interrupt, exiting")
                sys.exit(0)
            except BaseException:
                self.logger.exception("Error!")
        if clean:
            self.__clean_not_found_files(out_dir, found_isos)
Example #30
    def test_unidata_parse(self):
        selects = [".*Best.*"]
        skips = Crawl.SKIPS + [
            ".*grib2", ".*grib1", ".*GrbF.*", ".*ncx2", "Radar Data",
            "Station Data", "Point Feature Collections", "Satellite Data",
            "Unidata NEXRAD Composites \(GINI\)", "Unidata case studies",
            ".*Reflectivity-[0-9]{8}"
        ]
        c = Crawl('http://thredds.ucar.edu/thredds/catalog.xml',
                  select=selects,
                  skip=skips)

        assert len(c.datasets) > 0

        isos = [(d.id, s.get("url")) for d in c.datasets for s in d.services
                if s.get("service").lower() == "iso"]
        assert len(isos) > 0
Example #31
def cli(thredds_catalogue, skips, select, workers, outfile):
    """ Download Metadata from THREDDS server to tarball

    Example:

       \b
       Download files in directory that match `*yaml` and store them as a tar
        > thredds-to-tar -c "http://dapds00.nci.org.au/thredds/catalog/if87/2018-11-29/"
        -t ".*ARD-METADATA.yaml" -s '.*NBAR.*' -s '.*SUPPLEMENTARY.*'
         -s '.*NBART.*' -s '.*/QA/.*' -w 8 --outfile 2018-11-29.tar.gz

    """

    user_skips = Crawl.SKIPS
    for skip in skips:
        user_skips = user_skips + [skip]

    print("Searching {thredds_catalogue} for matching files".format(
        thredds_catalogue=thredds_catalogue))
    results = Crawl(thredds_catalogue + '/catalog.xml',
                    select=[select],
                    skip=user_skips,
                    workers=workers).datasets

    print("Found {0} metadata files".format(str(len(results))))

    urls = [
        service['url'] for dataset in results for service in dataset.services
        if service['service'].lower() == 'httpserver'
    ]

    # use a threadpool to download from thredds
    pool = ThreadPool(workers)
    yamls = pool.map(partial(download), urls)
    pool.close()
    pool.join()

    # jam it all in a tar
    tar_opts = dict(name=outfile,
                    mode='w' + tar_mode(gzip=True, xz=True, is_pipe=False))
    with tarfile.open(**tar_opts) as tar:
        for yaml in yamls:
            add_txt_file(tar=tar, content=yaml[0], fname=yaml[1])

    print("Done!")
Example #32
    if args.post:
        token = os.environ['SLACKTOKEN']
        slack = Slacker(token)

    # Assume that the database has already been created with description and terrain information, so use minimal arguments in constructor
    cl = CANONLoader(args.database, args.campaign)
    cl.dbAlias = args.database
    cl.campaignName = args.campaign
   
    # Get directory list from sites
    s = args.inUrl.rsplit('/',1)
    files = s[1]
    url = s[0]
    logger.info("Crawling %s for %s files" % (url, files))
    c = Crawl(os.path.join(url, 'catalog.xml'), select=[files], debug=False)

    for d in c.datasets:
        logger.debug('Found %s' % d.id)
    
    urls = [s.get("url") for d in c.datasets for s in d.services if s.get("service").lower() == "opendap"]

    pw = lrauvNc4ToNetcdf.InterpolatorWriter()

    # If parameter names contains any group forward slash '/' delimiters
    # replace them with underscores. This is because pydap automatically renames slashes as underscores
    # and needs to reference the parameter correctly in the DAPloader
    parm_list = []
    plot_group = []
    parm_process = []
    coord = {}