def queue_harvest_tasks():
    """
    Generate a number of harvest tasks. Meant to be called via cron.

    Only queues services that are active.
    """
    with app.app_context():
        # Some hosts don't like successive repeated connections, so by
        # shuffling our list of services we reduce the likelihood that we'll
        # harvest from the same host enough times to cause a service problem.
        # This should reduce timeouts and unresponsive datasets.
        services = list(db.Service.find({'active': True}))
        services = distinct_services(services)
        shuffle(services)
        for s in services:
            service_id = s._id
            if service_id in LARGER_SERVICES:
                continue
            # count all the datasets associated with this particular service
            datalen = db.datasets.find({'services.service_id': service_id}).count()
            # handle timeouts for services with large numbers of datasets
            if datalen <= 36:
                timeout_secs = 180
            else:
                # for large numbers of requests, allow 60 seconds per
                # request, on average
                timeout_secs = datalen * 60
            queue.enqueue_call(harvest, args=(service_id,),
                               timeout=timeout_secs)
        # record dataset/service metrics after harvest
        add_counts()
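# A quick check of the timeout rule used when enqueuing harvests: small
# services get a flat three minutes, everything else gets a per-dataset
# budget. The helper name below is hypothetical; only the constants come
# from the queueing code above.
def harvest_timeout(dataset_count):
    """Hypothetical helper mirroring the enqueue timeout logic."""
    return 180 if dataset_count <= 36 else dataset_count * 60

assert harvest_timeout(10) == 180     # small service: flat 3 minutes
assert harvest_timeout(100) == 6000   # 100 datasets * 60 seconds each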
def migrate():
    """Changes 'Glider_DAC_2' services to 'Glider_DAC'"""
    with app.app_context():
        db.Service.collection.update({"data_provider": u"Glider_DAC_2"},
                                     {'$set': {"data_provider": u"Glider_DAC"}},
                                     multi=True)
        app.logger.info("Migration 2015-01-20 complete")
def queue_harvest_tasks():
    """
    Generate a number of harvest tasks. Meant to be called via cron.

    Only queues services that are active.
    """
    with app.app_context():
        for s in db.Service.find({'active': True}, {'_id': True}):
            service_id = s._id
            if service_id in LARGER_SERVICES:
                continue
            # count all the datasets associated with this particular service
            datalen = db.datasets.find({'services.service_id': service_id}).count()
            # handle timeouts for services with large numbers of datasets
            if datalen <= 36:
                timeout_secs = 180
            else:
                # for large numbers of requests, allow 60 seconds per
                # request, on average
                timeout_secs = datalen * 60
            queue.enqueue_call(harvest, args=(service_id,),
                               timeout=timeout_secs)
        # record dataset/service metrics after harvest
        add_counts()
def queue_harvest_tasks():
    """
    Generate a number of harvest tasks. Meant to be called via cron.

    Only queues services that are active.
    """
    with app.app_context():
        # Some hosts don't like successive repeated connections, so by
        # shuffling our list of services we reduce the likelihood that we'll
        # harvest from the same host enough times to cause a service problem.
        # This should reduce timeouts and unresponsive datasets.
        services = list(db.Service.find({'active': True}, {'_id': True}))
        shuffle(services)
        for s in services:
            service_id = s._id
            if service_id in LARGER_SERVICES:
                continue
            # count all the datasets associated with this particular service
            datalen = db.datasets.find({'services.service_id': service_id}).count()
            # handle timeouts for services with large numbers of datasets
            if datalen <= 36:
                timeout_secs = 180
            else:
                # for large numbers of requests, allow 60 seconds per
                # request, on average
                timeout_secs = datalen * 60
            queue.enqueue_call(harvest, args=(service_id,),
                               timeout=timeout_secs)
        # record dataset/service metrics after harvest
        add_counts()
def cleanup_datasets():
    with app.app_context():
        datasets = db.Dataset.find({'active': True})
        for d in datasets:
            services = d['services']  # a list of services
            service_ids = [s['service_id'] for s in services]
            if not service_ids:
                app.logger.info('Deactivating %s', d['uid'])
                d['active'] = False
                d.save()
                continue

            # Go through each of the services
            #
            # if we don't find at least one service that is active, set
            # dataset.active to False
            for service_id in service_ids:
                related_services = db.Service.find({'_id': service_id})
                for service in related_services:
                    if service['active']:
                        break
                else:
                    # reached the end of the loop
                    app.logger.info('Deactivating %s', d['uid'])
                    d['active'] = False
                    d.save()
                    break
def send_service_down_email(service_id):
    with app.app_context():
        kwargs = {
            'service': db.Service.find_one({'_id': service_id}),
            'stat': db.PingLatest.find_one({'service_id': service_id})
        }
        kwargs['status'] = kwargs['stat'].last_operational_status

        subject = "[ioos] Service Status Alert (%s): %s (%s)" % (
            "UP" if kwargs['status'] else "DOWN",
            kwargs['service'].name,
            kwargs['service'].service_type)

        text_template = render_template("service_status_changed.txt", **kwargs)
        html_template = render_template("service_status_changed.html", **kwargs)

        to_addresses = ([app.config.get("MAIL_DEFAULT_LIST")]
                        if app.config.get('MAILER_DEBUG') == False
                        else [app.config.get("MAIL_DEFAULT_TO")])

        # Don't send these until Anna updates the ISO document in GeoPortal
        # with the correct service contacts
        #if app.config.get('MAILER_DEBUG') == False and kwargs['service'].contact is not None:
        #    to_addresses = kwargs['service'].contact.split(",")

        cc_addresses = [app.config.get("MAIL_DEFAULT_TO")]

        send(subject, to_addresses, cc_addresses, text_template, html_template)
def send_daily_report_email(end_time=None, start_time=None):
    with app.app_context():
        failed_services, services, end_time, start_time = \
            db.Service.get_failures_in_time_range(end_time, start_time)

        text_template = render_template("daily_service_report.txt",
                                        services=services,
                                        failed_services=failed_services,
                                        start_time=start_time,
                                        end_time=end_time)
        html_template = render_template("daily_service_report_email.html",
                                        services=services,
                                        failed_services=failed_services,
                                        start_time=start_time,
                                        end_time=end_time)

        to_addresses = ([app.config.get("MAIL_DEFAULT_LIST")]
                        if app.config.get('MAILER_DEBUG') == False
                        else [app.config.get("MAIL_DEFAULT_TO")])
        cc_addresses = [app.config.get("MAIL_DEFAULT_TO")]

        subject = "[ioos] Service Daily Downtime Report"

        send(subject, to_addresses, cc_addresses, text_template, html_template)
def harvest(service_id):
    with app.app_context():
        service = db.Service.find_one({'_id': ObjectId(service_id)})

        # make sure service is active before we harvest
        if not service.active:
            #service.cancel_harvest()
            return "Service %s is not active, not harvesting" % service_id

        # ping it first to see if alive
        try:
            _, response_code = service.ping(timeout=15)
            operational_status = True if response_code in [200, 400] else False
        except (requests.ConnectionError, requests.HTTPError, requests.Timeout):
            operational_status = False

        if not operational_status:
            # not a failure
            # @TODO: record last attempt time/this message
            return "Aborted harvest due to service down"

        if service.service_type == "DAP":
            return DapHarvest(service).harvest()
        elif service.service_type == "SOS":
            return SosHarvest(service).harvest()
        elif service.service_type == "WMS":
            return WmsHarvest(service).harvest()
        elif service.service_type == "WCS":
            return WcsHarvest(service).harvest()
def queue_large_service_harvest_tasks():
    larger_services = [ObjectId('53d34aed8c0db37e0b538fda'),
                       ObjectId('53d49c8d8c0db37ff1370308')]
    with app.app_context():
        for s in db.Service.find({'_id': {'$in': larger_services}}):
            service_id = s._id
            # count all the datasets associated with this particular service
            datalen = db.datasets.find({'services.service_id': service_id}).count()
            # handle timeouts for services with large numbers of datasets
            if datalen <= 36:
                timeout_secs = 180
            else:
                # for large numbers of requests, allow 60 seconds per
                # request, on average
                timeout_secs = datalen * 60
            queue.enqueue_call(harvest, args=(service_id,),
                               timeout=timeout_secs)
        # record dataset/service metrics after harvest
        add_counts()
def migrate():
    """Adds min and max time to datasets"""
    with app.app_context():
        datasets = db.Dataset.find()
        for d in datasets:
            for i, s in enumerate(d['services']):
                d['services'][i]['time_min'] = None
                d['services'][i]['time_max'] = None
            d.save()
def queue_harvest_tasks():
    """
    Generate a number of harvest tasks. Meant to be called via cron.

    Only queues services that are active.
    """
    with app.app_context():
        sids = [s._id for s in db.Service.find({'active': True}, {'_id': True})]
        for sid in sids:
            queue.enqueue(harvest, sid)
def save_ccheck_and_metadata(self, service_id, checker_name, ref_id, ref_type,
                             scores, metamap):
    """
    Saves the results of a compliance checker run and a metamap document.

    Called by the service/station derived methods.
    """
    if not (scores or metamap):
        return

    with app.app_context():
        def res2dict(r):
            cl = []
            if r.children:
                cl = map(res2dict, r.children)
            return {'name': unicode(r.name),
                    'score': float(r.value[0]),
                    'maxscore': float(r.value[1]),
                    'weight': int(r.weight),
                    'children': cl}

        metadata = db.Metadata.find_one({'ref_id': ref_id})
        if metadata is None:
            metadata = db.Metadata()
            metadata.ref_id = ref_id
            metadata.ref_type = unicode(ref_type)

        cc_results = map(res2dict, scores)

        # @TODO: srsly need to decouple from cchecker
        score = sum(((float(r.value[0]) / r.value[1]) * r.weight for r in scores))
        max_score = sum((r.weight for r in scores))

        score_doc = {'score': float(score),
                     'max_score': float(max_score),
                     'pct': float(score) / max_score}

        update_doc = {'cc_score': score_doc,
                      'cc_results': cc_results,
                      'metamap': metamap}

        for mr in metadata.metadata:
            if mr['service_id'] == service_id and mr['checker'] == checker_name:
                mr.update(update_doc)
                break
        else:
            metarecord = {'service_id': service_id,
                          'checker': unicode(checker_name)}
            metarecord.update(update_doc)
            metadata.metadata.append(metarecord)

        metadata.updated = datetime.utcnow()
        metadata.save()

        return metadata
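# The rolled-up 'cc_score' document above is a weight-normalized sum of the
# per-check results. A minimal sketch of the arithmetic using made-up
# (points earned, points possible, weight) triples -- not output from a real
# Compliance Checker run:
results = [(3.0, 4.0, 2), (1.0, 2.0, 1)]

score = sum((earned / possible) * weight for earned, possible, weight in results)  # 2.0
max_score = sum(weight for _, _, weight in results)                                # 3
score_doc = {'score': score, 'max_score': float(max_score),
             'pct': score / max_score}                                              # pct ~= 0.667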
def ccheck_dataset(self, ncdataset):
    with app.app_context():
        scores = None

        try:
            cs = ComplianceCheckerCheckSuite()
            groups = cs.run(ncdataset, 'ioos')
            scores = groups['ioos']
        except Exception as e:
            app.logger.warn("Caught exception doing Compliance Checker on Dataset: %s", e)

        return scores
def ccheck_station(self, sensor_ml):
    with app.app_context():
        scores = None

        try:
            cs = ComplianceCheckerCheckSuite()
            groups = cs.run(sensor_ml, 'ioos')
            scores = groups['ioos']
        except Exception as e:
            app.logger.warn("Caught exception doing Compliance Checker on SOS station: %s", e)

        return scores
def harvest(service_id):
    with app.app_context():
        service = db.Service.find_one({'_id': ObjectId(service_id)})

        if service.service_type == "DAP":
            return DapHarvest(service).harvest()
        elif service.service_type == "SOS":
            return SosHarvest(service).harvest()
        elif service.service_type == "WMS":
            return WmsHarvest(service).harvest()
        elif service.service_type == "WCS":
            return WcsHarvest(service).harvest()
def ping_service_task(service_id):
    with app.app_context():
        # get last for this service
        last_stat = db.Stat.find_one({"service_id": ObjectId(service_id)},
                                     sort=[("created", -1)])

        stat = db.Stat()
        stat.service_id = ObjectId(service_id)
        stat.ping_service()
        stat.save()

        if last_stat and last_stat.operational_status != stat.operational_status:
            queue.enqueue(send_service_down_email, ObjectId(service_id))
def queue_ping_tasks():
    """
    Generate a number of ping tasks. Meant to be called via cron.

    Only queues services that are active.
    """
    with app.app_context():
        sids = [s._id for s in db.Service.find({'active': True}, {'_id': True})]
        for sid in sids:
            queue.enqueue(ping_service_task, sid)
def ccheck_service(self):
    assert self.sos

    with app.app_context():
        scores = None

        try:
            cs = ComplianceCheckerCheckSuite()
            groups = cs.run(self.sos, 'ioos')
            scores = groups['ioos']
        except Exception as e:
            app.logger.warn("Caught exception doing Compliance Checker on SOS service: %s", e)

        return scores
def metamap_station(self, sensor_ml):
    with app.app_context():
        # gets a metamap document of this service using wicken
        beliefs = IOOSSOSDSCheck.beliefs()
        doc = MultipleXmlDogma('sos-ds', beliefs, sensor_ml._root,
                               namespaces=get_namespaces())

        # now make a map out of this
        # @TODO wicken should make this easier
        metamap = {}
        for k in beliefs:
            try:
                metamap[k] = getattr(doc, doc._fixup_belief(k)[0])
            except Exception as e:
                pass

        return metamap
def metamap_dataset(self, ncdataset):
    with app.app_context():
        # gets a metamap document of this service using wicken
        beliefs = IOOSNCCheck.beliefs()
        ncnamespaces = {'nc': pb_namespaces['ncml']}
        doc = NetCDFDogma('nc', beliefs, ncdataset, namespaces=ncnamespaces)

        # now make a map out of this
        # @TODO wicken should make this easier
        m_names, m_units = ['Variable Names*', 'Variable Units*']
        metamap = {}
        for k in beliefs:
            try:
                metamap[k] = getattr(doc, doc._fixup_belief(k)[0])
            except Exception as e:
                app.logger.exception("Problem setting belief (%s)", k)

        metamap[m_names] = []  # Override the Wicken return to preserve the order
        metamap[m_units] = []  # Override the Wicken return to preserve the order
        # Wicken doesn't preserve the order between the names and the units,
        # so what you wind up with is two lists that can't be related, but we
        # want to keep the relationship between the name and the units
        for k in ncdataset.variables.iterkeys():
            var_name = k
            standard_name = getattr(ncdataset.variables[k], 'standard_name', '')
            units = getattr(ncdataset.variables[k], 'units', '')
            # Only map metadata where we have all three
            if var_name and standard_name and units:
                metamap[m_names].append('%s (%s)' % (var_name, standard_name))
                metamap[m_units].append(units)

        return metamap
def metamap_service(self):
    assert self.sos

    with app.app_context():
        # gets a metamap document of this service using wicken
        beliefs = IOOSSOSGCCheck.beliefs()
        doc = MultipleXmlDogma('sos-gc', beliefs, self.sos._capabilities,
                               namespaces=get_namespaces())

        # now make a map out of this
        # @TODO wicken should make this easier
        metamap = {}
        for k in beliefs:
            try:
                metamap[k] = getattr(doc, doc._fixup_belief(k)[0])
            except Exception as e:
                pass

        return metamap
def ping_service_task(service_id):
    with app.app_context():
        pl = db.PingLatest.get_for_service(ObjectId(service_id))
        wasnew, flip = pl.ping_service()
        pl.save()

        # save to WeeklyArchive
        if wasnew:
            utcnow = datetime.utcnow()
            pa = db.PingArchive.get_for_service(ObjectId(service_id), utcnow)
            pa.add_ping_data(pl.last_response_time, pl.last_operational_status)
            pa.updated = utcnow
            pa.save()

        if flip:
            queue.enqueue(send_service_down_email, ObjectId(service_id))

        return pl.last_response_time
def metamap_dataset(self, ncdataset):
    with app.app_context():
        # gets a metamap document of this service using wicken
        beliefs = IOOSNCCheck.beliefs()
        ncnamespaces = {'nc': pb_namespaces['ncml']}
        doc = NetCDFDogma('nc', beliefs, ncdataset, namespaces=ncnamespaces)

        # now make a map out of this
        # @TODO wicken should make this easier
        metamap = {}
        for k in beliefs:
            try:
                metamap[k] = getattr(doc, doc._fixup_belief(k)[0])
            except Exception as e:
                print k, e

        return metamap
def queue_provider(provider):
    with app.app_context():
        for s in db.Service.find({'data_provider': provider, 'active': True}):
            service_id = s._id
            if service_id in LARGER_SERVICES:
                continue
            # count all the datasets associated with this particular service
            datalen = db.datasets.find({'services.service_id': service_id}).count()
            # handle timeouts for services with large numbers of datasets
            if datalen <= 36:
                timeout_secs = 180
            else:
                # for large numbers of requests, allow 60 seconds per
                # request, on average
                timeout_secs = datalen * 60
            queue.enqueue_call(harvest, args=(service_id,),
                               timeout=timeout_secs)
        # record dataset/service metrics after harvest
        add_counts()
def reindex_services(provider=None):
    '''
    Downloads all records from CKAN and creates service records for the
    appropriate resources defined in those records.
    '''
    region_map = get_region_map()
    if provider is not None:
        region_map = [org for org in region_map if org['name'] == provider]

    with app.app_context():
        for organization in region_map:
            index_organization(organization)

        # Deactivate any service older than 7 days
        old = datetime.utcnow() - timedelta(days=7)
        db.services.update({"updated": {"$lt": old}},
                           {"$set": {"active": False,
                                     "updated": datetime.utcnow()}},
                           multi=True, upsert=False)
    return
def send_service_down_email(service_id):
    with app.app_context():
        kwargs = {
            'service': db.Service.find_one({'_id': service_id}),
            'stat': db.Stat.find_one({'service_id': service_id},
                                     sort=[('created', -1)]),
            'last_success_stat': db.Stat.find_one({'service_id': service_id,
                                                   'operational_status': 1},
                                                  sort=[('created', -1)])
        }
        kwargs['status'] = kwargs['stat'].operational_status

        subject = "[ioos] Service Status Alert (%s): %s (%s)" % (
            "UP" if kwargs['status'] else "DOWN",
            kwargs['service'].name,
            kwargs['service'].service_type)

        text_template = render_template("service_status_changed.txt", **kwargs)
        html_template = render_template("service_status_changed.html", **kwargs)

        to_addresses = ([app.config.get("MAIL_DEFAULT_LIST")]
                        if app.config.get('MAILER_DEBUG') == False
                        else [app.config.get("MAIL_DEFAULT_TO")])

        # Don't send these until Anna updates the ISO document in GeoPortal
        # with the correct service contacts
        #if app.config.get('MAILER_DEBUG') == False and kwargs['service'].contact is not None:
        #    to_addresses = kwargs['service'].contact.split(",")

        cc_addresses = [app.config.get("MAIL_DEFAULT_TO")]

        send(subject, to_addresses, cc_addresses, text_template, html_template)
def harvest(self):
    """
    Identify the type of CF dataset this is:
      * UGRID
      * CGRID
      * RGRID
      * DSG
    """
    METADATA_VAR_NAMES = [u'crs', u'projection']

    # CF standard names for Axis
    STD_AXIS_NAMES = [u'latitude', u'longitude', u'time',
                      u'forecast_reference_time', u'forecast_period',
                      u'ocean_sigma', u'ocean_s_coordinate_g1',
                      u'ocean_s_coordinate_g2', u'ocean_s_coordinate',
                      u'ocean_double_sigma', u'ocean_sigma_over_z',
                      u'projection_y_coordinate', u'projection_x_coordinate']

    # Some datasets don't define standard_names on axis variables.  This is
    # used to weed them out based on the actual variable name.
    COMMON_AXIS_NAMES = [u'x', u'y', u'lat', u'latitude', u'lon', u'longitude',
                         u'time', u'time_run', u'time_offset', u'ntimes',
                         u'lat_u', u'lon_u', u'lat_v', u'lon_v ', u'lat_rho',
                         u'lon_rho', u'lat_psi']

    cd = CommonDataset.open(self.service.get('url'))

    # For DAP, the unique ID is the URL
    unique_id = self.service.get('url')

    with app.app_context():
        dataset = db.Dataset.find_one({'uid': unicode(unique_id)})
        if dataset is None:
            dataset = db.Dataset()
            dataset.uid = unicode(unique_id)

    # Find service reference in Dataset.services and remove it (to replace it)
    tmp = dataset.services[:]
    for d in tmp:
        if d['service_id'] == self.service.get('_id'):
            dataset.services.remove(d)

    # Parsing messages
    messages = []

    # NAME
    name = None
    try:
        name = unicode_or_none(cd.nc.getncattr('title'))
    except AttributeError:
        messages.append(u"Could not get dataset name. No global attribute named 'title'.")

    # DESCRIPTION
    description = None
    try:
        description = unicode_or_none(cd.nc.getncattr('summary'))
    except AttributeError:
        messages.append(u"Could not get dataset description. No global attribute named 'summary'.")

    # KEYWORDS
    keywords = []
    try:
        keywords = sorted(map(lambda x: unicode(x.strip()),
                              cd.nc.getncattr('keywords').split(",")))
    except AttributeError:
        messages.append(u"Could not get dataset keywords. No global attribute named 'keywords' or it was not a comma separated list.")

    # VARIABLES
    prefix = ""
    # Add additional prefix mappings as they become available.
    try:
        standard_name_vocabulary = unicode(cd.nc.getncattr("standard_name_vocabulary"))
        cf_regex = [re.compile("CF-"),
                    re.compile('http://www.cgd.ucar.edu/cms/eaton/cf-metadata/standard_name.html')]
        for reg in cf_regex:
            if reg.match(standard_name_vocabulary) is not None:
                prefix = "http://mmisw.org/ont/cf/parameter/"
                break
    except AttributeError:
        pass

    # Get variables with a standard_name
    std_variables = [cd.get_varname_from_stdname(x)[0]
                     for x in self.get_standard_variables(cd.nc)
                     if x not in STD_AXIS_NAMES and
                     len(cd.nc.variables[cd.get_varname_from_stdname(x)[0]].shape) > 0]

    # Get variables that are not axis variables or metadata variables and are
    # not already in 'std_variables'
    non_std_variables = list(set([x for x in cd.nc.variables
                                  if x not in itertools.chain(_possibley, _possiblex,
                                                              _possiblez, _possiblet,
                                                              METADATA_VAR_NAMES,
                                                              COMMON_AXIS_NAMES)
                                  and len(cd.nc.variables[x].shape) > 0
                                  and x not in std_variables]))

    """
    var_to_get_geo_from = None
    if len(std_names) > 0:
        var_to_get_geo_from = cd.get_varname_from_stdname(std_names[-1])[0]
        messages.append(u"Variable '%s' with standard name '%s' was used to calculate geometry." % (var_to_get_geo_from, std_names[-1]))
    else:
        # No idea which variable to generate geometry from... try to factor variables with a shape > 1.
        try:
            var_to_get_geo_from = [x for x in variables if len(cd.nc.variables[x].shape) > 1][-1]
        except IndexError:
            messages.append(u"Could not find any non-axis variables to compute geometry from.")
        else:
            messages.append(u"No 'standard_name' attributes were found on non-axis variables. Variable '%s' was used to calculate geometry." % var_to_get_geo_from)
    """

    # LOCATION (from Paegan)
    # Try POLYGON and fall back to BBOX
    gj = None
    for v in itertools.chain(std_variables, non_std_variables):
        try:
            gj = mapping(cd.getboundingpolygon(var=v))
        except (AttributeError, AssertionError, ValueError):
            try:
                # Returns a tuple of four coordinates, but box takes four
                # separate positional arguments.
                # Asterisk magic to expand the tuple into positional arguments.
                gj = mapping(box(*cd.get_bbox(var=v)))
            except (AttributeError, AssertionError, ValueError):
                pass

        if gj is not None:
            # We computed something, break out of loop.
            messages.append(u"Variable %s was used to calculate geometry." % v)
            break

    if gj is None:
        messages.append(u"The underlying 'Paegan' data access library could not determine a bounding BOX for this dataset.")
        messages.append(u"The underlying 'Paegan' data access library could not determine a bounding POLYGON for this dataset.")
        messages.append(u"Failed to calculate geometry using all of the following variables: %s" %
                        ", ".join(itertools.chain(std_variables, non_std_variables)))

    # TODO: compute bounding box using global attributes

    final_var_names = []
    if prefix == "":
        messages.append(u"Could not find a standard name vocabulary. No global attribute named 'standard_name_vocabulary'. Variable list may be incorrect or contain non-measured quantities.")
        final_var_names = non_std_variables + std_variables
    else:
        final_var_names = non_std_variables + list(map(unicode,
            ["%s%s" % (prefix, cd.nc.variables[x].getncattr("standard_name"))
             for x in std_variables]))

    service = {
        'name': name,
        'description': description,
        'service_type': self.service.get('service_type'),
        'service_id': ObjectId(self.service.get('_id')),
        'data_provider': self.service.get('data_provider'),
        'metadata_type': u'ncml',
        'metadata_value': unicode(dataset2ncml(cd.nc, url=self.service.get('url'))),
        'messages': map(unicode, messages),
        'keywords': keywords,
        'variables': map(unicode, final_var_names),
        'asset_type': unicode(cd._datasettype).upper(),
        'geojson': gj,
        'updated': datetime.utcnow()
    }

    with app.app_context():
        dataset.services.append(service)
        dataset.updated = datetime.utcnow()
        dataset.save()

    return "Harvested"
def wrapper(*args, **kwargs):
    with app.app_context():
        return f(*args, **kwargs)
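# 'wrapper' above reads like the inner function of a decorator that pushes a
# Flask application context before calling the wrapped function. A minimal
# sketch of what the enclosing decorator might look like; the name
# 'with_app_context' is an assumption, only 'app' and the wrapper body come
# from the source.
import functools

def with_app_context(f):
    @functools.wraps(f)
    def wrapper(*args, **kwargs):
        with app.app_context():
            return f(*args, **kwargs)
    return wrapper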
def migrate():
    with app.app_context():
        migrate_names()
        migrate_active_datasets()
        migrate_active_metadata()
        app.logger.info("Migration 2014-08-27 complete")
def harvest(self):
    """
    Identify the type of CF dataset this is:
      * UGRID
      * CGRID
      * RGRID
      * DSG
    """
    try:
        cd = CommonDataset.open(self.service.get('url'))
    except Exception as e:
        app.logger.error("Could not open DAP dataset from '%s'\n"
                         "Exception %s: %s" %
                         (self.service.get('url'), type(e).__name__, e))
        return 'Not harvested'

    # rely on times in the file first over global atts for calculating
    # start/end times of the dataset
    tmin, tmax = self.get_min_max_time(cd)

    # if nothing was returned, try to get from global atts
    if (tmin == None and tmax == None and
            'time_coverage_start' in cd.metadata and
            'time_coverage_end' in cd.metadata):
        try:
            tmin, tmax = (parse(cd.metadata[t]) for t in
                          ('time_coverage_start', 'time_coverage_end'))
        except ValueError:
            tmin, tmax = None, None

    # For DAP, the unique ID is the URL
    unique_id = self.service.get('url')

    with app.app_context():
        dataset = db.Dataset.find_one({'uid': unicode(unique_id)})
        if dataset is None:
            dataset = db.Dataset()
            dataset.uid = unicode(unique_id)
            dataset['active'] = True

    # Find service reference in Dataset.services and remove it (to replace it)
    tmp = dataset.services[:]
    for d in tmp:
        if d['service_id'] == self.service.get('_id'):
            dataset.services.remove(d)

    # Parsing messages
    messages = []

    # NAME
    name = None
    try:
        name = unicode_or_none(cd.nc.getncattr('title'))
    except AttributeError:
        messages.append(u"Could not get dataset name. No global attribute named 'title'.")

    # DESCRIPTION
    description = None
    try:
        description = unicode_or_none(cd.nc.getncattr('summary'))
    except AttributeError:
        messages.append(u"Could not get dataset description. No global attribute named 'summary'.")

    # KEYWORDS
    keywords = []
    try:
        keywords = sorted(map(lambda x: unicode(x.strip()),
                              cd.nc.getncattr('keywords').split(",")))
    except AttributeError:
        messages.append(u"Could not get dataset keywords. No global attribute named 'keywords' or it was not a comma separated list.")

    # VARIABLES
    prefix = ""
    # Add additional prefix mappings as they become available.
    try:
        standard_name_vocabulary = unicode(cd.nc.getncattr("standard_name_vocabulary"))
        cf_regex = [re.compile("CF-"),
                    re.compile('http://www.cgd.ucar.edu/cms/eaton/cf-metadata/standard_name.html')]
        for reg in cf_regex:
            if reg.match(standard_name_vocabulary) is not None:
                prefix = "http://mmisw.org/ont/cf/parameter/"
                break
    except AttributeError:
        pass

    # Get variables with a standard_name
    std_variables = [cd.get_varname_from_stdname(x)[0]
                     for x in self.get_standard_variables(cd.nc)
                     if x not in self.STD_AXIS_NAMES and
                     len(cd.nc.variables[cd.get_varname_from_stdname(x)[0]].shape) > 0]

    # Get variables that are not axis variables or metadata variables and are
    # not already in 'std_variables'
    non_std_variables = list(set([x for x in cd.nc.variables
                                  if x not in itertools.chain(_possibley, _possiblex,
                                                              _possiblez, _possiblet,
                                                              self.METADATA_VAR_NAMES,
                                                              self.COMMON_AXIS_NAMES)
                                  and len(cd.nc.variables[x].shape) > 0
                                  and x not in std_variables]))

    axis_names = DapHarvest.get_axis_variables(cd.nc)

    """
    var_to_get_geo_from = None
    if len(std_names) > 0:
        var_to_get_geo_from = cd.get_varname_from_stdname(std_names[-1])[0]
        messages.append(u"Variable '%s' with standard name '%s' was used to calculate geometry." % (var_to_get_geo_from, std_names[-1]))
    else:
        # No idea which variable to generate geometry from... try to factor variables with a shape > 1.
        try:
            var_to_get_geo_from = [x for x in variables if len(cd.nc.variables[x].shape) > 1][-1]
        except IndexError:
            messages.append(u"Could not find any non-axis variables to compute geometry from.")
        else:
            messages.append(u"No 'standard_name' attributes were found on non-axis variables. Variable '%s' was used to calculate geometry." % var_to_get_geo_from)
    """

    # LOCATION (from Paegan)
    # Try POLYGON and fall back to BBOX

    # paegan does not support ugrid, so try to detect this condition and skip
    is_ugrid = False
    is_trajectory = False
    for vname, v in cd.nc.variables.iteritems():
        if 'cf_role' in v.ncattrs():
            if v.getncattr('cf_role') == 'mesh_topology':
                is_ugrid = True
                break
            elif v.getncattr('cf_role') == 'trajectory_id':
                is_trajectory = True
                break

    gj = None

    if is_ugrid:
        messages.append(u"The underlying 'Paegan' data access library does not support UGRID and cannot parse geometry.")
    elif is_trajectory:
        coord_names = {}
        # try to get info for x, y, z, t axes
        for v in itertools.chain(std_variables, non_std_variables):
            try:
                coord_names = cd.get_coord_names(v, **axis_names)
                if (coord_names['xname'] is not None and
                        coord_names['yname'] is not None):
                    break
            except (AssertionError, AttributeError, ValueError, KeyError):
                pass
        else:
            messages.append(u"Trajectory discovered but could not detect coordinate variables using the underlying 'Paegan' data access library.")

        if 'xname' in coord_names:
            try:
                xvar = cd.nc.variables[coord_names['xname']]
                yvar = cd.nc.variables[coord_names['yname']]

                # one less order of magnitude, e.g. 390000 -> 10000
                slice_factor = 10 ** (int(math.log10(xvar.size)) - 1)
                if slice_factor < 1:
                    slice_factor = 1

                # TODO: don't split x/y as separate arrays.  Refactor to
                # use a single numpy array instead with both lon/lat

                # tabledap datasets must be treated differently than
                # standard DAP endpoints.  Retrieve geojson instead of
                # trying to access as a DAP endpoint
                if 'erddap/tabledap' in unique_id:
                    # take off 's.' from erddap
                    gj = self.erddap_geojson_url(coord_names)
                    # type defaults to MultiPoint, change to LineString
                    coords = np.array(gj['coordinates'][::slice_factor] +
                                      gj['coordinates'][-1:])
                    xs = coords[:, 0]
                    ys = coords[:, 1]
                else:
                    xs = np.concatenate((xvar[::slice_factor], xvar[-1:]))
                    ys = np.concatenate((yvar[::slice_factor], yvar[-1:]))

                # both coords must be valid to have a valid vertex;
                # get rid of any nans and unreasonable lon/lats
                valid_idx = ((~np.isnan(xs)) & (np.absolute(xs) <= 180) &
                             (~np.isnan(ys)) & (np.absolute(ys) <= 90))
                xs = xs[valid_idx]
                ys = ys[valid_idx]

                # Shapely seems to require float64 values or incorrect
                # values will propagate for the generated LineString
                # if the array is not numpy's float64 dtype
                lineCoords = np.array([xs, ys]).T.astype('float64')

                gj = mapping(asLineString(lineCoords))

                messages.append(u"Variable %s was used to calculate "
                                u"trajectory geometry, and is a "
                                u"naive sampling." % v)
            except (AssertionError, AttributeError, ValueError,
                    KeyError, IndexError) as e:
                app.logger.warn("Trajectory error occurred: %s", e)
                messages.append(u"Trajectory discovered but could not create a geometry.")
    else:
        for v in itertools.chain(std_variables, non_std_variables):
            try:
                gj = mapping(cd.getboundingpolygon(var=v, **axis_names).simplify(0.5))
            except (AttributeError, AssertionError, ValueError, KeyError, IndexError):
                try:
                    app.logger.exception("Error calculating bounding box")
                    # handles "points" aka single position NCELLs
                    bbox = cd.getbbox(var=v, **axis_names)
                    gj = self.get_bbox_or_point(bbox)
                except (AttributeError, AssertionError, ValueError, KeyError, IndexError):
                    pass

            if gj is not None:
                # We computed something, break out of loop.
                messages.append(u"Variable %s was used to calculate geometry." % v)
                break

    if gj is None:
        # Try the globals
        gj = self.global_bounding_box(cd.nc)
        messages.append(u"Bounding Box calculated using global attributes")

    if gj is None:
        messages.append(u"The underlying 'Paegan' data access library could not determine a bounding BOX for this dataset.")
        messages.append(u"The underlying 'Paegan' data access library could not determine a bounding POLYGON for this dataset.")
        messages.append(u"Failed to calculate geometry using all of the following variables: %s" %
                        ", ".join(itertools.chain(std_variables, non_std_variables)))

    # TODO: compute bounding box using global attributes

    final_var_names = []
    if prefix == "":
        messages.append(u"Could not find a standard name vocabulary. No global attribute named 'standard_name_vocabulary'. Variable list may be incorrect or contain non-measured quantities.")
        final_var_names = non_std_variables + std_variables
    else:
        final_var_names = non_std_variables + list(map(unicode,
            ["%s%s" % (prefix, cd.nc.variables[x].getncattr("standard_name"))
             for x in std_variables]))

    service = {
        'name': name,
        'description': description,
        'service_type': self.service.get('service_type'),
        'service_id': ObjectId(self.service.get('_id')),
        'data_provider': self.service.get('data_provider'),
        'metadata_type': u'ncml',
        'metadata_value': unicode(dataset2ncml(cd.nc, url=self.service.get('url'))),
        'time_min': tmin,
        'time_max': tmax,
        'messages': map(unicode, messages),
        'keywords': keywords,
        'variables': map(unicode, final_var_names),
        'asset_type': get_common_name(DapHarvest.get_asset_type(cd)),
        'geojson': gj,
        'updated': datetime.utcnow()
    }

    with app.app_context():
        dataset.services.append(service)
        dataset.updated = datetime.utcnow()
        dataset.save()

    ncdataset = Dataset(self.service.get('url'))
    scores = self.ccheck_dataset(ncdataset)
    metamap = self.metamap_dataset(ncdataset)

    try:
        metadata_rec = self.save_ccheck_dataset('ioos', dataset._id, scores, metamap)
    except Exception as e:
        metadata_rec = None
        app.logger.error("could not save compliancecheck/metamap information",
                         exc_info=True)

    return "Harvested"
def process_station(self, uid, offering):
    """ Makes a DescribeSensor request based on a 'uid' parameter being a
        station procedure.  Also pass along an offering with getCapabilities
        information for items such as temporal extent. """

    GML_NS = "http://www.opengis.net/gml"
    XLINK_NS = "http://www.w3.org/1999/xlink"

    with app.app_context():
        app.logger.info("process_station: %s", uid)
        desc_sens = self._describe_sensor(uid, timeout=1200)
        # FIXME: add some kind of notice saying the station failed
        if desc_sens is None:
            app.logger.warn("Could not get a valid describeSensor response")
            return
        metadata_value = etree.fromstring(desc_sens)
        sensor_ml = SensorML(metadata_value)

        try:
            station_ds = IoosDescribeSensor(metadata_value)
        # if this doesn't conform to the IOOS SensorML sub-profile, fall back
        # to manually picking apart the SensorML
        except ows.ExceptionReport:
            station_ds = process_sensorml(sensor_ml.members[0])

        unique_id = station_ds.id
        if unique_id is None:
            app.logger.warn("Could not get a 'stationID' from the SensorML identifiers. Looking for a definition of 'http://mmisw.org/ont/ioos/definition/stationID'")
            return

        dataset = db.Dataset.find_one({'uid': unicode(unique_id)})
        if dataset is None:
            dataset = db.Dataset()
            dataset.uid = unicode(unique_id)
            dataset['active'] = True

        # Find service reference in Dataset.services and remove it (to replace it)
        tmp = dataset.services[:]
        for d in tmp:
            if d['service_id'] == self.service.get('_id'):
                dataset.services.remove(d)

        # Parsing messages
        messages = []

        # NAME
        name = unicode_or_none(station_ds.shortName)
        if name is None:
            messages.append(u"Could not get a 'shortName' from the SensorML identifiers. Looking for a definition of 'http://mmisw.org/ont/ioos/definition/shortName'")

        # DESCRIPTION
        description = unicode_or_none(station_ds.longName)
        if description is None:
            messages.append(u"Could not get a 'longName' from the SensorML identifiers. Looking for a definition of 'http://mmisw.org/ont/ioos/definition/longName'")

        # PLATFORM TYPE
        asset_type = unicode_or_none(getattr(station_ds, 'platformType', None))
        if asset_type is None:
            messages.append(u"Could not get a 'platformType' from the SensorML identifiers. Looking for a definition of 'http://mmisw.org/ont/ioos/definition/platformType'")

        # LOCATION is in GML
        gj = None
        loc = station_ds.location
        if loc is not None and loc.tag == "{%s}Point" % GML_NS:
            pos_element = loc.find("{%s}pos" % GML_NS)
            # some older responses may use the deprecated coordinates element
            if pos_element is None:
                # if pos not found, use deprecated coordinates element
                pos_element = loc.find("{%s}coordinates" % GML_NS)
            # strip out points
            positions = map(float, pos_element.text.split(" "))

            for el in [pos_element, loc]:
                srs_name = testXMLAttribute(el, "srsName")
                if srs_name:
                    crs = Crs(srs_name)
                    if crs.axisorder == "yx":
                        gj = json.loads(geojson.dumps(
                            geojson.Point([positions[1], positions[0]])))
                    else:
                        gj = json.loads(geojson.dumps(
                            geojson.Point([positions[0], positions[1]])))
                    break
            else:
                if positions:
                    messages.append(u"Position(s) found but could not parse SRS: %s, %s" % (positions, srs_name))
        else:
            messages.append(u"Found an unrecognized child of the sml:location element and did not attempt to process it: %s" % loc)

        meta_str = unicode(etree.tostring(metadata_value)).strip()
        if len(meta_str) > 4000000:
            messages.append(u'Metadata document was too large to store (len: %s)' % len(meta_str))
            meta_str = u''

        service = {
            # Reset service
            'name': name,
            'description': description,
            'service_type': self.service.get('service_type'),
            'service_id': ObjectId(self.service.get('_id')),
            'data_provider': self.service.get('data_provider'),
            'metadata_type': u'sensorml',
            'metadata_value': u'',
            'time_min': getattr(offering, 'begin_position', None),
            'time_max': getattr(offering, 'end_position', None),
            'messages': map(unicode, messages),
            'keywords': map(unicode, sorted(station_ds.keywords)),
            'variables': map(unicode, sorted(station_ds.variables)),
            'asset_type': get_common_name(asset_type),
            'geojson': gj,
            'updated': datetime.utcnow()
        }

        dataset.services.append(service)
        dataset.updated = datetime.utcnow()
        dataset.save()

        # do compliance checker / metadata now
        scores = self.ccheck_station(sensor_ml)
        metamap = self.metamap_station(sensor_ml)

        try:
            self.save_ccheck_station('ioos', dataset._id, scores, metamap)
        except Exception as e:
            app.logger.warn("could not save compliancecheck/metamap information: %s", e)

        return "Harvest Successful"
def process_station(self, uid):
    """ Makes a DescribeSensor request based on a 'uid' parameter being a
        station procedure """

    GML_NS = "http://www.opengis.net/gml"
    XLINK_NS = "http://www.w3.org/1999/xlink"

    with app.app_context():
        metadata_value = etree.fromstring(self.sos.describe_sensor(
            outputFormat='text/xml;subtype="sensorML/1.0.1/profiles/ioos_sos/1.0"',
            procedure=uid))
        station_ds = IoosDescribeSensor(metadata_value)

        unique_id = station_ds.id
        if unique_id is None:
            app.logger.warn("Could not get a 'stationID' from the SensorML identifiers. Looking for a definition of 'http://mmisw.org/ont/ioos/definition/stationID'")
            return

        dataset = db.Dataset.find_one({'uid': unicode(unique_id)})
        if dataset is None:
            dataset = db.Dataset()
            dataset.uid = unicode(unique_id)

        # Find service reference in Dataset.services and remove it (to replace it)
        tmp = dataset.services[:]
        for d in tmp:
            if d['service_id'] == self.service.get('_id'):
                dataset.services.remove(d)

        # Parsing messages
        messages = []

        # NAME
        name = unicode_or_none(station_ds.shortName)
        if name is None:
            messages.append(u"Could not get a 'shortName' from the SensorML identifiers. Looking for a definition of 'http://mmisw.org/ont/ioos/definition/shortName'")

        # DESCRIPTION
        description = unicode_or_none(station_ds.longName)
        if description is None:
            messages.append(u"Could not get a 'longName' from the SensorML identifiers. Looking for a definition of 'http://mmisw.org/ont/ioos/definition/longName'")

        # PLATFORM TYPE
        asset_type = unicode_or_none(station_ds.platformType)
        if asset_type is None:
            messages.append(u"Could not get a 'platformType' from the SensorML identifiers. Looking for a definition of 'http://mmisw.org/ont/ioos/definition/platformType'")

        # LOCATION is in GML
        gj = None
        loc = station_ds.location
        if loc is not None and loc.tag == "{%s}Point" % GML_NS:
            pos_element = loc.find("{%s}pos" % GML_NS)
            # strip out points
            positions = map(float, testXMLValue(pos_element).split(" "))
            crs = Crs(testXMLAttribute(pos_element, "srsName"))
            if crs.axisorder == "yx":
                gj = json.loads(geojson.dumps(geojson.Point([positions[1], positions[0]])))
            else:
                gj = json.loads(geojson.dumps(geojson.Point([positions[0], positions[1]])))
        else:
            messages.append(u"Found an unrecognized child of the sml:location element and did not attempt to process it: %s" % etree.tostring(loc).strip())

        service = {
            # Reset service
            'name': name,
            'description': description,
            'service_type': self.service.get('service_type'),
            'service_id': ObjectId(self.service.get('_id')),
            'data_provider': self.service.get('data_provider'),
            'metadata_type': u'sensorml',
            'metadata_value': unicode(etree.tostring(metadata_value)).strip(),
            'messages': map(unicode, messages),
            'keywords': map(unicode, sorted(station_ds.keywords)),
            'variables': map(unicode, sorted(station_ds.variables)),
            'asset_type': asset_type,
            'geojson': gj,
            'updated': datetime.utcnow()
        }

        dataset.services.append(service)
        dataset.updated = datetime.utcnow()
        dataset.save()

        return "Harvested"
def regulate():
    with app.app_context():
        # Get services that have not been updated in two weeks and remove them.
        # The reindex job sets the 'updated' field.  The logic below should
        # effectively remove services that the reindex task has not seen in
        # two weeks.
        two_weeks_ago = (datetime.utcnow() - timedelta(weeks=2)).replace(tzinfo=pytz.utc)
        deletes = [s for s in db.Service.find()
                   if s.updated.replace(tzinfo=pytz.utc).astimezone(pytz.utc) < two_weeks_ago]
        for d in deletes:
            d.cancel_ping()
            d.cancel_harvest()
            # I don't think we want to delete these.
            # Let's make deletion a manual process.
            #d.delete()
            # TODO: Now delete the stats that were collected for this service.

        # Get function and args of the scheduled jobs
        jobs = scheduler.get_jobs()

        # Make sure a daily report job is running
        daily_email_jobs = [job for job in jobs if job.func == send_daily_report_email]
        if len(daily_email_jobs) > 1:
            # Cancel all but the first daily email job
            for j in daily_email_jobs[1:]:
                scheduler.cancel(j)
        elif len(daily_email_jobs) < 1:
            # Run today at 3am (7am UTC) if it is between midnight and 3am
            runat = datetime.utcnow().replace(hour=7, minute=0, second=0, microsecond=0)
            if datetime.utcnow() > runat:
                # Run tomorrow at 3am (7am UTC) because it is already past that time.
                runat = runat + timedelta(days=1)
            scheduler.schedule(
                scheduled_time=runat,          # Time for first execution
                func=send_daily_report_email,  # Function to be queued
                interval=86400,                # Time before the function is called again, in seconds (86400 == 1 day)
                repeat=None,                   # Repeat this number of times (None means repeat forever)
                result_ttl=100000              # How long to keep the results, in seconds
            )

        # Make sure a service update job is running
        reindex_services_jobs = [job for job in jobs if job.func == reindex_services]
        if len(reindex_services_jobs) < 1:
            scheduler.schedule(
                scheduled_time=datetime.utcnow(),  # Time for first execution
                func=reindex_services,             # Function to be queued
                interval=21600,                    # Time before the function is called again, in seconds (21600 == 1/4 of a day)
                repeat=None,                       # Repeat this number of times (None means repeat forever)
                result_ttl=40000,                  # How long to keep the results, in seconds
                timeout=1200                       # Default timeout of 180 seconds may not be enough
            )

        # Make sure each service has a ping job
        stat_jobs = [unicode(job.args[0]) for job in jobs if job.func == ping_service_task]

        # Get services that don't have jobs
        need_ping = [s for s in db.Service.find() if unicode(s._id) not in stat_jobs]

        # Schedule the ones that do not
        for s in need_ping:
            s.schedule_ping(cancel=False)

        # Make sure each service has a harvest job
        harvest_jobs = [unicode(job.args[0]) for job in jobs if job.func == harvest]

        # Get services that don't have jobs
        need_harvest = [s for s in db.Service.find() if unicode(s._id) not in harvest_jobs]

        # Schedule the ones that do not
        for s in need_harvest:
            s.schedule_harvest(cancel=False)

        return "Regulated %s reindex jobs, %s ping jobs, %s harvest jobs, and deleted %s old services" % \
            (len(reindex_services_jobs), len(need_ping), len(need_harvest), len(deletes))
        self.update = {'$set': {'ping_job_id': None}}


# Stats
from ioos_catalog.models import stat


class StatMigration(DocumentMigration):
    # add any migrations here named "allmigration_*"
    pass


# Datasets
from ioos_catalog.models import dataset


class DatasetMigration(DocumentMigration):
    # add any migrations here named "allmigration_*"
    pass


with app.app_context():
    migration = ServiceMigration(service.Service)
    migration.migrate_all(collection=db['services'])

    migration = StatMigration(stat.Stat)
    migration.migrate_all(collection=db['stats'])

    migration = DatasetMigration(dataset.Dataset)
    migration.migrate_all(collection=db['datasets'])
def initialize_captcha_db():
    with app.app_context():
        captcha.ext_db.create_all()
        app.logger.info("Captcha DB Initialized")
    def allmigration07__add_extra_url_field(self):
        self.target = {'extra_url': {'$exists': False}}
        self.update = {'$set': {'extra_url': None}}


# Datasets
from ioos_catalog.models import dataset


class DatasetMigration(DocumentMigration):
    # add any migrations here named "allmigration_*"
    def allmigration01__add_active_field(self):
        self.target = {'active': {'$exists': False}}
        self.update = {'$set': {'active': False}}


# Metadatas
from ioos_catalog.models import metadata


class MetadataMigration(DocumentMigration):
    def allmigration01__add_active_field(self):
        self.target = {'active': {'$exists': False}}
        self.update = {'$set': {'active': False}}


with app.app_context():
    migration = ServiceMigration(service.Service)
    migration.migrate_all(collection=db['services'])

    migration = DatasetMigration(dataset.Dataset)
    migration.migrate_all(collection=db['datasets'])

    migration = MetadataMigration(metadata.Metadata)
    migration.migrate_all(collection=db['metadatas'])
def reindex_services():
    region_map = {'AOOS':     '1706F520-2647-4A33-B7BF-592FAFDE4B45',
                  'CARICOOS': '117F1684-A5E3-400E-98D8-A270BDBA1603',
                  'CENCOOS':  '4BA5624D-A61F-4C7E-BAEE-7F8BDDB8D9C4',
                  'GCOOS':    '003747E7-4818-43CD-937D-44D5B8E2F4E9',
                  'GLOS':     'B664427E-6953-4517-A874-78DDBBD3893E',
                  'MARACOOS': 'C664F631-6E53-4108-B8DD-EFADF558E408',
                  'NANOOS':   '254CCFC0-E408-4E13-BD62-87567E7586BB',
                  'NERACOOS': 'E41F4FCD-0297-415D-AC53-967B970C3A3E',
                  'PacIOOS':  '68FF11D8-D66B-45EE-B33A-21919BB26421',
                  'SCCOOS':   'B70B3E3C-3851-4BA9-8E9B-C9F195DCEAC7',
                  'SECOORA':  'B3EA8869-B726-4E39-898A-299E53ABBC98'}
                  #'NOS/CO-OPS': '72E748DF-23B1-4E80-A2C4-81E70783094A',
                  #'USACE':      '73019DFF-2E01-4800-91CD-0B3F812256A7',
                  #'NAVY':       '3B94DAAE-B7E9-4789-993B-0045AD9149D9',
                  #'NDBC':       '828981B0-0039-4360-9788-E788FA6B0875',
                  #'USGS/CMGP':  'C6F11F00-C2BD-4AC6-8E2C-013E16F4932E' }

    services = {'SOS': 'urn:x-esri:specification:ServiceType:sos:url',
                'WMS': 'urn:x-esri:specification:ServiceType:wms:url',
                'WCS': 'urn:x-esri:specification:ServiceType:wcs:url',
                'DAP': 'urn:x-esri:specification:ServiceType:odp:url'}

    endpoint = 'http://www.ngdc.noaa.gov/geoportal/csw'  # NGDC Geoportal

    c = csw.CatalogueServiceWeb(endpoint, timeout=120)

    ns = Namespaces()

    with app.app_context():
        for region, uuid in region_map.iteritems():
            # Setup uuid filter
            uuid_filter = fes.PropertyIsEqualTo(propertyname='sys.siteuuid',
                                                literal="{%s}" % uuid)

            # Make CSW request
            c.getrecords2([uuid_filter], esn='full', maxrecords=999999)

            for name, record in c.records.iteritems():
                # @TODO: unfortunately CSW does not provide us with contact
                # info, so we must request it manually
                contact_email = ""
                metadata_url = None

                iso_ref = [x['url'] for x in record.references
                           if x['scheme'] == 'urn:x-esri:specification:ServiceType:ArcIMS:Metadata:Document']
                if len(iso_ref):
                    metadata_url = iso_ref[0]

                # Don't query for contact info right now.  It takes WAY too long.
                #r = requests.get(iso_ref[0])
                #r.raise_for_status()
                #node = ET.fromstring(r.content)
                #safe = nspath_eval("gmd:CI_ResponsibleParty/gmd:contactInfo/gmd:CI_Contact/gmd:address/gmd:CI_Address/gmd:electronicMailAddress/gco:CharacterString", ns.get_namespaces())
                #contact_node = node.find(".//" + safe)
                #if contact_node is not None and contact_node.text != "":
                #    contact_email = contact_node.text
                #    if " or " in contact_email:
                #        contact_email = ",".join(contact_email.split(" or "))

                for ref in record.references:
                    # We are only interested in the 'services'
                    if ref["scheme"] in services.values():
                        url = unicode(ref["url"])
                        s = db.Service.find_one({'data_provider': unicode(region),
                                                 'url': url})
                        if s is None:
                            s = db.Service()
                            s.url = url
                            s.data_provider = unicode(region)
                            s.service_id = unicode(name)

                        s.name = unicode(record.title)
                        s.service_type = unicode(next((k for k, v in services.items()
                                                       if v == ref["scheme"])))
                        s.interval = 3600  # 1 hour
                        s.tld = unicode(urlparse(url).netloc)
                        s.updated = datetime.utcnow()
                        s.contact = unicode(contact_email)
                        s.metadata_url = unicode(metadata_url)
                        s.save()

                        s.schedule_harvest()
def harvest(self): """ Identify the type of CF dataset this is: * UGRID * CGRID * RGRID * DSG """ try: cd = CommonDataset.open(self.service.get('url')) except Exception as e: app.logger.error("Could not open DAP dataset from '%s'\n" "Exception %s: %s" % (self.service.get('url'), type(e).__name__, e)) return 'Not harvested' # For DAP, the unique ID is the URL unique_id = self.service.get('url') with app.app_context(): dataset = db.Dataset.find_one( { 'uid' : unicode(unique_id) } ) if dataset is None: dataset = db.Dataset() dataset.uid = unicode(unique_id) dataset['active'] = True # Find service reference in Dataset.services and remove (to replace it) tmp = dataset.services[:] for d in tmp: if d['service_id'] == self.service.get('_id'): dataset.services.remove(d) # Parsing messages messages = [] # NAME name = None try: name = unicode_or_none(cd.nc.getncattr('title')) except AttributeError: messages.append(u"Could not get dataset name. No global attribute named 'title'.") # DESCRIPTION description = None try: description = unicode_or_none(cd.nc.getncattr('summary')) except AttributeError: messages.append(u"Could not get dataset description. No global attribute named 'summary'.") # KEYWORDS keywords = [] try: keywords = sorted(map(lambda x: unicode(x.strip()), cd.nc.getncattr('keywords').split(","))) except AttributeError: messages.append(u"Could not get dataset keywords. No global attribute named 'keywords' or it was not a comma-separated list.") # VARIABLES prefix = "" # Add additional prefix mappings as they become available. try: standard_name_vocabulary = unicode(cd.nc.getncattr("standard_name_vocabulary")) cf_regex = [re.compile("CF-"), re.compile('http://www.cgd.ucar.edu/cms/eaton/cf-metadata/standard_name.html')] for reg in cf_regex: if reg.match(standard_name_vocabulary) is not None: prefix = "http://mmisw.org/ont/cf/parameter/" break except AttributeError: pass # Get variables with a standard_name std_variables = [cd.get_varname_from_stdname(x)[0] for x in self.get_standard_variables(cd.nc) if x not in self.STD_AXIS_NAMES and len(cd.nc.variables[cd.get_varname_from_stdname(x)[0]].shape) > 0] # Get variables that are not axis variables or metadata variables and are not already in the 'std_variables' variable non_std_variables = list(set([x for x in cd.nc.variables if x not in itertools.chain(_possibley, _possiblex, _possiblez, _possiblet, self.METADATA_VAR_NAMES, self.COMMON_AXIS_NAMES) and len(cd.nc.variables[x].shape) > 0 and x not in std_variables])) axis_names = DapHarvest.get_axis_variables(cd.nc) """ var_to_get_geo_from = None if len(std_names) > 0: var_to_get_geo_from = cd.get_varname_from_stdname(std_names[-1])[0] messages.append(u"Variable '%s' with standard name '%s' was used to calculate geometry." % (var_to_get_geo_from, std_names[-1])) else: # No idea which variable to generate geometry from... try to factor variables with a shape > 1. try: var_to_get_geo_from = [x for x in variables if len(cd.nc.variables[x].shape) > 1][-1] except IndexError: messages.append(u"Could not find any non-axis variables to compute geometry from.") else: messages.append(u"No 'standard_name' attributes were found on non-axis variables. Variable '%s' was used to calculate geometry."
% var_to_get_geo_from) """ # LOCATION (from Paegan) # Try POLYGON and fall back to BBOX # paegan does not support ugrid, so try to detect this condition and skip is_ugrid = False is_trajectory = False for vname, v in cd.nc.variables.iteritems(): if 'cf_role' in v.ncattrs(): if v.getncattr('cf_role') == 'mesh_topology': is_ugrid = True break elif v.getncattr('cf_role') == 'trajectory_id': is_trajectory = True break gj = None if is_ugrid: messages.append(u"The underlying 'Paegan' data access library does not support UGRID and cannot parse geometry.") elif is_trajectory: coord_names = {} # try to get info for x, y, z, t axes for v in itertools.chain(std_variables, non_std_variables): try: coord_names = cd.get_coord_names(v, **axis_names) if coord_names['xname'] is not None and \ coord_names['yname'] is not None: break except (AssertionError, AttributeError, ValueError, KeyError): pass else: messages.append(u"Trajectory discovered but could not detect coordinate variables using the underlying 'Paegan' data access library.") if 'xname' in coord_names: try: xvar = cd.nc.variables[coord_names['xname']] yvar = cd.nc.variables[coord_names['yname']] # one less order of magnitude, e.g. 390000 -> 10000 slice_factor = 10 ** (int(math.log10(xvar.size)) - 1) xs = np.concatenate((xvar[::slice_factor], xvar[-1:])) ys = np.concatenate((yvar[::slice_factor], yvar[-1:])) # both coords must be valid to have a valid vertex # get rid of any nans and unreasonable lon/lats valid_idx = ((~np.isnan(xs)) & (np.absolute(xs) <= 180) & (~np.isnan(ys)) & (np.absolute(ys) <= 90)) xs = xs[valid_idx] ys = ys[valid_idx] # Shapely seems to require float64 values or incorrect # values will propagate for the generated lineString # if the array is not numpy's float64 dtype lineCoords = np.array([xs, ys]).T.astype('float64') gj = mapping(asLineString(lineCoords)) messages.append(u"Variable %s was used to calculate " u"trajectory geometry, and is a " u"naive sampling." % v) except (AssertionError, AttributeError, ValueError, KeyError, IndexError) as e: app.logger.warn("Trajectory error occurred: %s", e) messages.append(u"Trajectory discovered but could not create a geometry.") else: for v in itertools.chain(std_variables, non_std_variables): try: gj = mapping(cd.getboundingpolygon(var=v, **axis_names ).simplify(0.5)) except (AttributeError, AssertionError, ValueError, KeyError, IndexError): try: # Returns a tuple of four coordinates, but box takes in four separate positional arguments # Asterisk magic to expand the tuple into positional arguments app.logger.exception("Error calculating bounding box") # handles "points" aka single position NCELLs bbox = cd.getbbox(var=v, **axis_names) gj = self.get_bbox_or_point(bbox) except (AttributeError, AssertionError, ValueError, KeyError, IndexError): pass if gj is not None: # We computed something, break out of loop. messages.append(u"Variable %s was used to calculate geometry."
% v) break if gj is None: # Try the globals gj = self.global_bounding_box(cd.nc) messages.append(u"Bounding Box calculated using global attributes") if gj is None: messages.append(u"The underlying 'Paegan' data access library could not determine a bounding BOX for this dataset.") messages.append(u"The underlying 'Paegan' data access library could not determine a bounding POLYGON for this dataset.") messages.append(u"Failed to calculate geometry using all of the following variables: %s" % ", ".join(itertools.chain(std_variables, non_std_variables))) # TODO: compute bounding box using global attributes final_var_names = [] if prefix == "": messages.append(u"Could not find a standard name vocabulary. No global attribute named 'standard_name_vocabulary'. Variable list may be incorrect or contain non-measured quantities.") final_var_names = non_std_variables + std_variables else: final_var_names = non_std_variables + list(map(unicode, ["%s%s" % (prefix, cd.nc.variables[x].getncattr("standard_name")) for x in std_variables])) service = { 'name': name, 'description': description, 'service_type': self.service.get('service_type'), 'service_id': ObjectId(self.service.get('_id')), 'data_provider': self.service.get('data_provider'), 'metadata_type': u'ncml', 'metadata_value': unicode(dataset2ncml(cd.nc, url=self.service.get('url'))), 'messages': map(unicode, messages), 'keywords': keywords, 'variables': map(unicode, final_var_names), 'asset_type': get_common_name(DapHarvest.get_asset_type(cd)), 'geojson': gj, 'updated': datetime.utcnow() } with app.app_context(): dataset.services.append(service) dataset.updated = datetime.utcnow() dataset.save() ncdataset = Dataset(self.service.get('url')) scores = self.ccheck_dataset(ncdataset) metamap = self.metamap_dataset(ncdataset) try: metadata_rec = self.save_ccheck_dataset('ioos', dataset._id, scores, metamap) except Exception as e: metadata_rec = None app.logger.error("could not save compliancecheck/metamap information", exc_info=True) return "Harvested"
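The trajectory branch above thins very long coordinate arrays before building a LineString geometry. Below is a self-contained sketch of that subsampling with synthetic lon/lat arrays (sizes and values are made up); it uses shapely's LineString constructor in place of the asLineString adapter, which behaves the same for this purpose.

import math
import numpy as np
from shapely.geometry import LineString, mapping

# synthetic trajectory: 5000 points along a straight track
lons = np.linspace(-150.0, -140.0, 5000)
lats = np.linspace(20.0, 25.0, 5000)

# keep roughly one point per slice_factor, always re-appending the final
# position so the end of the track is preserved
slice_factor = 10 ** (int(math.log10(lons.size)) - 1)
xs = np.concatenate((lons[::slice_factor], lons[-1:]))
ys = np.concatenate((lats[::slice_factor], lats[-1:]))

# both members of a pair must be valid: drop NaNs and impossible lon/lats
valid = (~np.isnan(xs)) & (np.absolute(xs) <= 180) & \
        (~np.isnan(ys)) & (np.absolute(ys) <= 90)
coords = np.array([xs[valid], ys[valid]]).T.astype('float64')

geojson_geom = mapping(LineString(coords))
print('%s with %d vertices' % (geojson_geom['type'], len(geojson_geom['coordinates'])))  # LineString with 51 vertices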
def process_station(self, uid): """ Makes a DescribeSensor request based on a 'uid' parameter being a station procedure """ GML_NS = "http://www.opengis.net/gml" XLINK_NS = "http://www.w3.org/1999/xlink" with app.app_context(): app.logger.info("process_station: %s", uid) metadata_value = etree.fromstring(self._describe_sensor(uid)) sensor_ml = SensorML(metadata_value) station_ds = IoosDescribeSensor(metadata_value) unique_id = station_ds.id if unique_id is None: app.logger.warn("Could not get a 'stationID' from the SensorML identifiers. Looking for a definition of 'http://mmisw.org/ont/ioos/definition/stationID'") return dataset = db.Dataset.find_one( { 'uid' : unicode(unique_id) } ) if dataset is None: dataset = db.Dataset() dataset.uid = unicode(unique_id) dataset['active'] = True # Find service reference in Dataset.services and remove (to replace it) tmp = dataset.services[:] for d in tmp: if d['service_id'] == self.service.get('_id'): dataset.services.remove(d) # Parsing messages messages = [] # NAME name = unicode_or_none(station_ds.shortName) if name is None: messages.append(u"Could not get a 'shortName' from the SensorML identifiers. Looking for a definition of 'http://mmisw.org/ont/ioos/definition/shortName'") # DESCRIPTION description = unicode_or_none(station_ds.longName) if description is None: messages.append(u"Could not get a 'longName' from the SensorML identifiers. Looking for a definition of 'http://mmisw.org/ont/ioos/definition/longName'") # PLATFORM TYPE asset_type = unicode_or_none(station_ds.platformType) if asset_type is None: messages.append(u"Could not get a 'platformType' from the SensorML identifiers. Looking for a definition of 'http://mmisw.org/ont/ioos/definition/platformType'") # LOCATION is in GML gj = None loc = station_ds.location if loc is not None and loc.tag == "{%s}Point" % GML_NS: pos_element = loc.find("{%s}pos" % GML_NS) # strip out points positions = map(float, testXMLValue(pos_element).split(" ")) for el in [pos_element, loc]: srs_name = testXMLAttribute(el, "srsName") if srs_name: crs = Crs(srs_name) if crs.axisorder == "yx": gj = json.loads(geojson.dumps(geojson.Point([positions[1], positions[0]]))) else: gj = json.loads(geojson.dumps(geojson.Point([positions[0], positions[1]]))) break else: if positions: messages.append(u"Position(s) found but could not parse SRS: %s, %s" % (positions, srs_name)) else: messages.append(u"Found an unrecognized child of the sml:location element and did not attempt to process it: %s" % loc) meta_str = unicode(etree.tostring(metadata_value)).strip() if len(meta_str) > 4000000: messages.append(u'Metadata document was too large to store (len: %s)' % len(meta_str)) meta_str = u'' service = { # Reset service 'name' : name, 'description' : description, 'service_type' : self.service.get('service_type'), 'service_id' : ObjectId(self.service.get('_id')), 'data_provider' : self.service.get('data_provider'), 'metadata_type' : u'sensorml', 'metadata_value' : u'', 'messages' : map(unicode, messages), 'keywords' : map(unicode, sorted(station_ds.keywords)), 'variables' : map(unicode, sorted(station_ds.variables)), 'asset_type' : get_common_name(asset_type), 'geojson' : gj, 'updated' : datetime.utcnow() } dataset.services.append(service) dataset.updated = datetime.utcnow() dataset.save() # do compliance checker / metadata now scores = self.ccheck_station(sensor_ml) metamap = self.metamap_station(sensor_ml) try: self.save_ccheck_station('ioos', dataset._id, scores, metamap) except Exception as e: app.logger.warn("could not save compliancecheck/metamap information: %s", e) return "Harvested"
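The GML position handling above flips coordinate order when the declared CRS is latitude-first. Here is a small sketch of that branch with a hard-coded position and SRS URN; it assumes owslib's Crs reports 'yx' axis order for EPSG:4326, as it does for the geographic codes in its axis-order table.

import json
import geojson
from owslib.crs import Crs

positions = [32.7157, -117.1611]          # values as they would appear in gml:pos (lat lon)
crs = Crs("urn:ogc:def:crs:EPSG::4326")   # EPSG:4326 is declared latitude-first

if crs.axisorder == "yx":
    # first value is latitude, so swap to build an (x, y) GeoJSON point
    point = geojson.Point([positions[1], positions[0]])
else:
    point = geojson.Point([positions[0], positions[1]])

gj = json.loads(geojson.dumps(point))
print(gj)  # {'type': 'Point', 'coordinates': [-117.1611, 32.7157]}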
def reindex_services(filter_regions=None, filter_service_types=None): c = csw.CatalogueServiceWeb(endpoint, timeout=120) ns = Namespaces() filter_regions = filter_regions or region_map.keys() filter_service_types = filter_service_types or services.keys() with app.app_context(): new_services = [] update_services = [] # get a set of all non-manual, active services for possible deactivation later current_services = set((s._id for s in db.Service.find( { 'manual': False, 'active': True, 'data_provider': { '$in': filter_regions } }, {'_id': True}))) # FIXME: find a more robust mechanism for detecting ERDDAP instances # this would fail if behind a URL rewriting/proxying mechanism which # removes the 'erddap' portion from the URL. May want to have GeoPortal # use a separate 'scheme' dedicated to ERDDAP for CSW record # 'references' # workaround for matching ERDDAP endpoints # match griddap or tabledap endpoints with html or graph # discarding any query string parameters (i.e. some datasets on PacIOOS) re_string = r'(^.*erddap/(?:grid|table)dap.*)\.(?:html|graph)(:?\?.*)?$' erddap_re = re.compile(re_string) erddap_all_re = re.compile(r'(^.*erddap/(?:(?:grid|table|)dap|wms).*)' r'\.(?:html|graph)(:?\?.*)?$') for region, uuid in region_map.iteritems(): if region not in filter_regions: app.logger.info("Skipping region %s due to filter", region) continue app.logger.info("Requesting region %s", region) # Setup uuid filter uuid_filter = fes.PropertyIsEqualTo(propertyname='sys.siteuuid', literal="{%s}" % uuid) # Make CSW request c.getrecords2([uuid_filter], esn='full', maxrecords=999999) for name, record in c.records.iteritems(): try: # @TODO: unfortunately CSW does not provide us with contact info, so # we must request it manually contact_email = "" metadata_url = None for ref in record.references: try: # TODO: Use a more robust mechanism for detecting # ERDDAP instances aside from relying on the url erddap_match = erddap_re.search(ref['url']) # We are only interested in the 'services' if (ref["scheme"] in services.values()): metadata_url = next(( r['url'] for r in record.references if r['scheme'] == 'urn:x-esri:specification:ServiceType:ArcIMS:Metadata:Document' ), None) # strip extension if erddap endpoint url = unicode(ref['url']) elif erddap_match: test_url = (erddap_match.group(1) + '.iso19115') req = requests.get(test_url) # if we have a valid ERDDAP metadata endpoint, # store it.
if req.status_code == 200: metadata_url = unicode(test_url) else: app.logger.error('Invalid service URL %s', ref['url']) continue url = get_erddap_url_from_iso(req.content) if url is None: app.logger.error(ref['url']) app.logger.error( "Failed to parse Erddap ISO for %s", test_url) continue # Either not a valid ISO or there's not a valid endpoint # next record if not one of the previously mentioned else: continue # end metadata find block s = db.Service.find_one({ 'data_provider': unicode(region), 'url': url }) if s is None: s = db.Service() s.url = unicode(url) s.data_provider = unicode(region) s.manual = False s.active = True new_services.append(s) else: # will run twice if erddap services have # both .html and .graph, but resultant # data should be the same update_services.append(s) s.service_id = unicode(name) s.name = unicode(record.title) s.service_type = unicode( 'DAP' if erddap_match else next(( k for k, v in services.items() if v == ref["scheme"]))) s.interval = 3600 # 1 hour s.tld = unicode(urlparse(url).netloc) s.updated = datetime.utcnow() s.contact = unicode(contact_email) s.metadata_url = metadata_url # grab opendap form url if present if s.service_type == 'DAP': possible_refs = [ r['url'] for r in record.references if r['scheme'] == opendap_form_schema ] if len(possible_refs): # this is bad, it can grab any associated # record from the dataset s.extra_url = unicode(possible_refs[0]) # if we see the service, this is "Active", unless we've set manual (then we don't touch) if not s.manual: s.active = True s.save() except Exception as e: app.logger.warn("Could not save service: %s", e) except Exception as e: app.logger.warn("Could not save region info: %s", e) # DEACTIVATE KNOWN SERVICES updated_ids = set((s._id for s in update_services)) deactivate = list(current_services.difference(updated_ids)) # bulk update (using pymongo syntax) db.services.update({'_id': { '$in': deactivate }}, {'$set': { 'active': False, 'updated': datetime.utcnow() }}, multi=True, upsert=False) return "New services: %s, updated services: %s, deactivated services: %s" % ( len(new_services), len(update_services), len(deactivate))
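A quick standalone check of the ERDDAP endpoint regex used above, run against made-up URLs; it shows which references would be rewritten into an .iso19115 metadata request and which are ignored.

import re

re_string = r'(^.*erddap/(?:grid|table)dap.*)\.(?:html|graph)(:?\?.*)?$'
erddap_re = re.compile(re_string)

urls = [
    'http://example.org/erddap/tabledap/some_dataset.html',
    'http://example.org/erddap/griddap/some_dataset.graph?var=temp',
    'http://example.org/erddap/tabledap/some_dataset',          # no .html/.graph suffix
    'http://example.org/thredds/dodsC/some_dataset.html',       # not an ERDDAP endpoint
]

for url in urls:
    m = erddap_re.search(url)
    # group(1) is the endpoint with the extension and query string stripped
    print('%s -> %s' % (url, (m.group(1) + '.iso19115') if m else None))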
def harvest(self): """ Identify the type of CF dataset this is: * UGRID * CGRID * RGRID * DSG """ METADATA_VAR_NAMES = [u'crs', u'projection'] # CF standard names for Axis STD_AXIS_NAMES = [u'latitude', u'longitude', u'time', u'forecast_reference_time', u'forecast_period', u'ocean_sigma', u'ocean_s_coordinate_g1', u'ocean_s_coordinate_g2', u'ocean_s_coordinate', u'ocean_double_sigma', u'ocean_sigma_over_z', u'projection_y_coordinate', u'projection_x_coordinate'] # Some datasets don't define standard_names on axis variables. This is used to weed them out based on the # actual variable name COMMON_AXIS_NAMES = [u'x', u'y', u'lat', u'latitude', u'lon', u'longitude', u'time', u'time_run', u'time_offset', u'ntimes', u'lat_u', u'lon_u', u'lat_v', u'lon_v', u'lat_rho', u'lon_rho', u'lat_psi'] cd = CommonDataset.open(self.service.get('url')) # For DAP, the unique ID is the URL unique_id = self.service.get('url') with app.app_context(): dataset = db.Dataset.find_one( { 'uid' : unicode(unique_id) } ) if dataset is None: dataset = db.Dataset() dataset.uid = unicode(unique_id) # Find service reference in Dataset.services and remove (to replace it) tmp = dataset.services[:] for d in tmp: if d['service_id'] == self.service.get('_id'): dataset.services.remove(d) # Parsing messages messages = [] # NAME name = None try: name = unicode_or_none(cd.nc.getncattr('title')) except AttributeError: messages.append(u"Could not get dataset name. No global attribute named 'title'.") # DESCRIPTION description = None try: description = unicode_or_none(cd.nc.getncattr('summary')) except AttributeError: messages.append(u"Could not get dataset description. No global attribute named 'summary'.") # KEYWORDS keywords = [] try: keywords = sorted(map(lambda x: unicode(x.strip()), cd.nc.getncattr('keywords').split(","))) except AttributeError: messages.append(u"Could not get dataset keywords. No global attribute named 'keywords' or it was not a comma-separated list.") # VARIABLES prefix = "" # Add additional prefix mappings as they become available. try: standard_name_vocabulary = unicode(cd.nc.getncattr("standard_name_vocabulary")) cf_regex = [re.compile("CF-"), re.compile('http://www.cgd.ucar.edu/cms/eaton/cf-metadata/standard_name.html')] for reg in cf_regex: if reg.match(standard_name_vocabulary) is not None: prefix = "http://mmisw.org/ont/cf/parameter/" break except AttributeError: pass # Get variables with a standard_name std_variables = [cd.get_varname_from_stdname(x)[0] for x in self.get_standard_variables(cd.nc) if x not in STD_AXIS_NAMES and len(cd.nc.variables[cd.get_varname_from_stdname(x)[0]].shape) > 0] # Get variables that are not axis variables or metadata variables and are not already in the 'std_variables' variable non_std_variables = list(set([x for x in cd.nc.variables if x not in itertools.chain(_possibley, _possiblex, _possiblez, _possiblet, METADATA_VAR_NAMES, COMMON_AXIS_NAMES) and len(cd.nc.variables[x].shape) > 0 and x not in std_variables])) """ var_to_get_geo_from = None if len(std_names) > 0: var_to_get_geo_from = cd.get_varname_from_stdname(std_names[-1])[0] messages.append(u"Variable '%s' with standard name '%s' was used to calculate geometry." % (var_to_get_geo_from, std_names[-1])) else: # No idea which variable to generate geometry from... try to factor variables with a shape > 1.
try: var_to_get_geo_from = [x for x in variables if len(cd.nc.variables[x].shape) > 1][-1] except IndexError: messages.append(u"Could not find any non-axis variables to compute geometry from.") else: messages.append(u"No 'standard_name' attributes were found on non-axis variables. Variable '%s' was used to calculate geometry." % var_to_get_geo_from) """ # LOCATION (from Paegan) # Try POLYGON and fall back to BBOX gj = None for v in itertools.chain(std_variables, non_std_variables): try: gj = mapping(cd.getboundingpolygon(var=v)) except (AttributeError, AssertionError, ValueError): try: # Returns a tuple of four coordinates, but box takes in four separate positional arguments # Asterisk magic to expand the tuple into positional arguments gj = mapping(box(*cd.get_bbox(var=v))) except (AttributeError, AssertionError, ValueError): pass if gj is not None: # We computed something, break out of loop. messages.append(u"Variable %s was used to calculate geometry." % v) break if gj is None: messages.append(u"The underlying 'Paegan' data access library could not determine a bounding BOX for this dataset.") messages.append(u"The underlying 'Paegan' data access library could not determine a bounding POLYGON for this dataset.") messages.append(u"Failed to calculate geometry using all of the following variables: %s" % ", ".join(itertools.chain(std_variables, non_std_variables))) # TODO: compute bounding box using global attributes final_var_names = [] if prefix == "": messages.append(u"Could not find a standard name vocabulary. No global attribute named 'standard_name_vocabulary'. Variable list may be incorrect or contain non-measured quantities.") final_var_names = non_std_variables + std_variables else: final_var_names = non_std_variables + list(map(unicode, ["%s%s" % (prefix, cd.nc.variables[x].getncattr("standard_name")) for x in std_variables])) service = { 'name' : name, 'description' : description, 'service_type' : self.service.get('service_type'), 'service_id' : ObjectId(self.service.get('_id')), 'data_provider' : self.service.get('data_provider'), 'metadata_type' : u'ncml', 'metadata_value' : unicode(dataset2ncml(cd.nc, url=self.service.get('url'))), 'messages' : map(unicode, messages), 'keywords' : keywords, 'variables' : map(unicode, final_var_names), 'asset_type' : unicode(cd._datasettype).upper(), 'geojson' : gj, 'updated' : datetime.utcnow() } with app.app_context(): dataset.services.append(service) dataset.updated = datetime.utcnow() dataset.save() return "Harvested"
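When a CF standard-name vocabulary is declared, the harvester above reports variables that carry a standard_name as MMI CF parameter URIs. A minimal sketch of that prefixing with hypothetical attribute values (the variable names and standard names below are made up):

import re

standard_name_vocabulary = "CF-1.6"  # hypothetical global attribute value
cf_regex = [re.compile("CF-"),
            re.compile('http://www.cgd.ucar.edu/cms/eaton/cf-metadata/standard_name.html')]

prefix = ""
for reg in cf_regex:
    if reg.match(standard_name_vocabulary) is not None:
        prefix = "http://mmisw.org/ont/cf/parameter/"
        break

# in the harvester these come from the dataset's variables; hypothetical here
standard_names = {'temp': 'sea_water_temperature', 'salt': 'sea_water_salinity'}
final_var_names = ["%s%s" % (prefix, standard_names[v]) for v in ['temp', 'salt']]
print(final_var_names)
# ['http://mmisw.org/ont/cf/parameter/sea_water_temperature',
#  'http://mmisw.org/ont/cf/parameter/sea_water_salinity']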