Example #1
def queue_harvest_tasks():
    """
    Generate a number of harvest tasks.

    Meant to be called via cron. Only queues services that are active.
    """

    with app.app_context():
        # Some hosts don't like successive repeated connections, so by
        # shuffling our list of services we reduce the likelihood that we'll
        # harvest from the same host enough times to cause a service problem.
        # This should reduce timeouts and unresponsive datasets
        services = list(db.Service.find({'active': True}))
        services = distinct_services(services)
        shuffle(services)
        for s in services:
            service_id = s._id
            if service_id in LARGER_SERVICES:
                continue
            # count all the datasets associated with this particular service
            datalen = db.datasets.find({'services.service_id':
                                        service_id}).count()
            # handle timeouts for services with large numbers of datasets
            if datalen <= 36:
                timeout_secs = 180
            else:
                # for services with many datasets, allow roughly 60 seconds
                # per dataset, on average
                timeout_secs = datalen * 60
            queue.enqueue_call(harvest, args=(service_id,),
                               timeout=timeout_secs)

    # record dataset/service metrics after harvest
    add_counts()
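
Note: these examples use the legacy PyMongo cursor API, where find(...).count() returns the number of matching documents. Cursor.count() was deprecated in PyMongo 3.7 and removed in 4.0; on a newer driver the same count could be obtained with count_documents (a sketch, assuming db.datasets resolves to a plain PyMongo collection):

datalen = db.datasets.count_documents({'services.service_id': service_id})
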
Example #2
def migrate():
    """Changes 'Glider_DAC_2' services to 'Glider_DAC'"""
    with app.app_context():
        db.Service.collection.update({"data_provider": u"Glider_DAC_2"},
                                     {'$set': {"data_provider": u"Glider_DAC"}},
                                     multi=True)
        app.logger.info("Migration 2015-01-20 complete")
Example #3
def queue_harvest_tasks():
    """
    Generate a number of harvest tasks.

    Meant to be called via cron. Only queues services that are active.
    """

    with app.app_context():
        for s in db.Service.find({'active':True}, {'_id':True}):
            service_id = s._id
            if service_id in LARGER_SERVICES:
                continue
            # count all the datasets associated with this particular service
            datalen = db.datasets.find({'services.service_id':
                                         service_id}).count()
            # handle timeouts for services with large numbers of datasets
            if datalen <= 36:
                timeout_secs = 180
            else:
                # for services with many datasets, allow roughly 60 seconds
                # per dataset, on average
                timeout_secs = datalen * 60
            queue.enqueue_call(harvest, args=(service_id,),
                               timeout=timeout_secs)


    # record dataset/service metrics after harvest
    add_counts()
Example #4
def queue_harvest_tasks():
    """
    Generate a number of harvest tasks.

    Meant to be called via cron. Only queues services that are active.
    """

    with app.app_context():
        # Some hosts don't like successive repeated connections, so by
        # shuffling our list of services we reduce the likelihood that we'll
        # harvest from the same host enough times to cause a service problem.
        # This should reduce timeouts and unresponsive datasets
        services = list(db.Service.find({'active': True}, {'_id': True}))
        shuffle(services)
        for s in services:
            service_id = s._id
            if service_id in LARGER_SERVICES:
                continue
            # count all the datasets associated with this particular service
            datalen = db.datasets.find({
                'services.service_id': service_id
            }).count()
            # handle timeouts for services with large numbers of datasets
            if datalen <= 36:
                timeout_secs = 180
            else:
                # for services with many datasets, allow roughly 60 seconds
                # per dataset, on average
                timeout_secs = datalen * 60
            queue.enqueue_call(harvest,
                               args=(service_id, ),
                               timeout=timeout_secs)

    # record dataset/service metrics after harvest
    add_counts()
Example #5
def cleanup_datasets():
    with app.app_context():
        datasets = db.Dataset.find({'active':True})
        for d in datasets:
            services = d['services'] # a list of services
            service_ids = [s['service_id'] for s in services]
            if not service_ids:
                app.logger.info('Deactivating %s', d['uid'])
                d['active'] = False
                d.save()
                continue

            # Go through each of the services
            #
            # if we don't find at least one service that is active, set
            # dataset.active to False
            for service_id in service_ids:
                related_services = db.Service.find({'_id':service_id})
                for service in related_services:
                    if service['active']:
                        break
                else: # reached the end of the loop
                    app.logger.info('Deactivating %s', d['uid'])
                    d['active'] = False
                    d.save()
                    break
Example #6
def send_service_down_email(service_id):
    with app.app_context():
        kwargs = {
            'service': db.Service.find_one({'_id': service_id}),
            'stat': db.PingLatest.find_one({'service_id': service_id})
        }

        kwargs['status'] = kwargs['stat'].last_operational_status

        subject = "[ioos] Service Status Alert (%s): %s (%s)" % (
            "UP" if kwargs['status'] else "DOWN", kwargs['service'].name,
            kwargs['service'].service_type)

        text_template = render_template("service_status_changed.txt", **kwargs)
        html_template = render_template("service_status_changed.html",
                                        **kwargs)

        to_addresses = ([app.config.get("MAIL_DEFAULT_LIST")]
                        if app.config.get('MAILER_DEBUG') == False
                        else [app.config.get("MAIL_DEFAULT_TO")])
        # Don't send these until Anna updates the ISO document in GeoPortal with the correct service contacts
        #if app.config.get('MAILER_DEBUG') == False and kwargs['service'].contact is not None:
        #    to_addresses = kwargs['service'].contact.split(",")
        cc_addresses = [app.config.get("MAIL_DEFAULT_TO")]

        send(subject, to_addresses, cc_addresses, text_template, html_template)
Example #7
def send_daily_report_email(end_time=None, start_time=None):
    with app.app_context():

        failed_services, services, end_time, start_time = db.Service.get_failures_in_time_range(end_time, start_time)

        text_template = render_template("daily_service_report.txt",
                                        services=services,
                                        failed_services=failed_services,
                                        start_time=start_time,
                                        end_time=end_time)
        html_template = render_template("daily_service_report_email.html",
                                        services=services,
                                        failed_services=failed_services,
                                        start_time=start_time,
                                        end_time=end_time)

        to_addresses = [app.config.get("MAIL_DEFAULT_LIST")] if app.config.get('MAILER_DEBUG') == False else [app.config.get("MAIL_DEFAULT_TO")]
        cc_addresses = [app.config.get("MAIL_DEFAULT_TO")]
        subject      = "[ioos] Service Daily Downtime Report"

        send(subject,
             to_addresses,
             cc_addresses,
             text_template,
             html_template)
Example #8
def harvest(service_id):
    with app.app_context():
        service = db.Service.find_one( { '_id' : ObjectId(service_id) } )

        # make sure service is active before we harvest
        if not service.active:
            #service.cancel_harvest()
            return "Service %s is not active, not harvesting" % service_id

        # ping it first to see if alive
        try:
            _, response_code = service.ping(timeout=15)
            operational_status = True if response_code in [200,400] else False
        except (requests.ConnectionError, requests.HTTPError, requests.Timeout):
            operational_status = False

        if not operational_status:
            # not a failure
            # @TODO: record last attempt time/this message
            return "Aborted harvest due to service down"

        if service.service_type == "DAP":
            return DapHarvest(service).harvest()
        elif service.service_type == "SOS":
            return SosHarvest(service).harvest()
        elif service.service_type == "WMS":
            return WmsHarvest(service).harvest()
        elif service.service_type == "WCS":
            return WcsHarvest(service).harvest()
Example #9
def send_daily_report_email(end_time=None, start_time=None):
    with app.app_context():

        failed_services, services, end_time, start_time = db.Service.get_failures_in_time_range(
            end_time, start_time)

        text_template = render_template("daily_service_report.txt",
                                        services=services,
                                        failed_services=failed_services,
                                        start_time=start_time,
                                        end_time=end_time)
        html_template = render_template("daily_service_report_email.html",
                                        services=services,
                                        failed_services=failed_services,
                                        start_time=start_time,
                                        end_time=end_time)

        to_addresses = ([app.config.get("MAIL_DEFAULT_LIST")]
                        if app.config.get('MAILER_DEBUG') == False
                        else [app.config.get("MAIL_DEFAULT_TO")])
        cc_addresses = [app.config.get("MAIL_DEFAULT_TO")]
        subject = "[ioos] Service Daily Downtime Report"

        send(subject, to_addresses, cc_addresses, text_template, html_template)
Example #10
def cleanup_datasets():
    with app.app_context():
        datasets = db.Dataset.find({'active': True})
        for d in datasets:
            services = d['services']  # a list of services
            service_ids = [s['service_id'] for s in services]
            if not service_ids:
                app.logger.info('Deactivating %s', d['uid'])
                d['active'] = False
                d.save()
                continue

            # Go through each of the services
            #
            # if we don't find at least one service that is active, set
            # dataset.active to False
            for service_id in service_ids:
                related_services = db.Service.find({'_id': service_id})
                for service in related_services:
                    if service['active']:
                        break
                else:  # reached the end of the loop
                    app.logger.info('Deactivating %s', d['uid'])
                    d['active'] = False
                    d.save()
                    break
Example #11
def queue_large_service_harvest_tasks():
    larger_services = [
        ObjectId('53d34aed8c0db37e0b538fda'),
        ObjectId('53d49c8d8c0db37ff1370308')
    ]
    with app.app_context():
        for s in db.Service.find({'_id': {'$in': larger_services}}):
            service_id = s._id
            # count all the datasets associated with this particular service
            datalen = db.datasets.find({
                'services.service_id': service_id
            }).count()
            # handle timeouts for services with large numbers of datasets
            if datalen <= 36:
                timeout_secs = 180
            else:
                # for services with many datasets, allow roughly 60 seconds
                # per dataset, on average
                timeout_secs = datalen * 60
            queue.enqueue_call(harvest,
                               args=(service_id, ),
                               timeout=timeout_secs)

    # record dataset/service metrics after harvest
    add_counts()
Example #12
def migrate():
    """Adds min and max time to datasets"""
    with app.app_context():
        datasets = db.Dataset.find()
        for d in datasets:
            for i, s in enumerate(d['services']):
                d['services'][i]['time_min'] = None
                d['services'][i]['time_max'] = None
            d.save()
Example #14
def queue_harvest_tasks():
    """
    Generate a number of harvest tasks.

    Meant to be called via cron. Only queues services that are active.
    """
    with app.app_context():
        sids = [s._id for s in db.Service.find({'active':True}, {'_id':True})]
        for sid in sids:
            queue.enqueue(harvest, sid)
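
The queue object used by these tasks is not shown in the excerpts. Judging by the enqueue and enqueue_call calls, it is presumably an RQ (Redis Queue) queue; a typical setup would look something like the sketch below (the connection details are placeholders, not taken from the source):

from redis import Redis
from rq import Queue

# Hypothetical setup: back the task queue with a local Redis instance.
redis_conn = Redis(host='localhost', port=6379)
queue = Queue('default', connection=redis_conn)
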
Example #15
    def save_ccheck_and_metadata(self, service_id, checker_name, ref_id, ref_type, scores, metamap):
        """
        Saves the result of a compliance checker scores and metamap document.

        Will be called by service/station derived methods.
        """
        if not (scores or metamap):
            return

        with app.app_context():
            def res2dict(r):
                cl = []
                if r.children:
                    cl = map(res2dict, r.children)

                return {'name'     : unicode(r.name),
                        'score'    : float(r.value[0]),
                        'maxscore' : float(r.value[1]),
                        'weight'   : int(r.weight),
                        'children' : cl}

            metadata = db.Metadata.find_one({'ref_id': ref_id})
            if metadata is None:
                metadata             = db.Metadata()
                metadata.ref_id      = ref_id
                metadata.ref_type    = unicode(ref_type)

            cc_results = map(res2dict, scores)

            # @TODO: srsly need to decouple from cchecker
            score     = sum(((float(r.value[0])/r.value[1]) * r.weight for r in scores))
            max_score = sum((r.weight for r in scores))

            score_doc = {'score'     : float(score),
                         'max_score' : float(max_score),
                         'pct'       : float(score) / max_score}

            update_doc = {'cc_score'   : score_doc,
                          'cc_results' : cc_results,
                          'metamap'    : metamap}

            for mr in metadata.metadata:
                if mr['service_id'] == service_id and mr['checker'] == checker_name:
                    mr.update(update_doc)
                    break
            else:
                metarecord = {'service_id': service_id,
                              'checker'   : unicode(checker_name)}
                metarecord.update(update_doc)
                metadata.metadata.append(metarecord)

            metadata.updated = datetime.utcnow()
            metadata.save()

            return metadata
Example #16
    def ccheck_dataset(self, ncdataset):
        with app.app_context():
            scores = None
            try:
                cs = ComplianceCheckerCheckSuite()
                groups = cs.run(ncdataset, 'ioos')
                scores = groups['ioos']
            except Exception as e:
                app.logger.warn("Caught exception doing Compliance Checker on Dataset: %s", e)

            return scores
Example #18
    def ccheck_station(self, sensor_ml):
        with app.app_context():
            scores = None
            try:
                cs = ComplianceCheckerCheckSuite()
                groups = cs.run(sensor_ml, 'ioos')
                scores = groups['ioos']
            except Exception as e:
                app.logger.warn("Caught exception doing Compliance Checker on SOS station: %s", e)

            return scores
Example #20
def harvest(service_id):
    with app.app_context():
        service = db.Service.find_one( { '_id' : ObjectId(service_id) } )

        if service.service_type == "DAP":
            return DapHarvest(service).harvest()
        elif service.service_type == "SOS":
            return SosHarvest(service).harvest()
        elif service.service_type == "WMS":
            return WmsHarvest(service).harvest()
        elif service.service_type == "WCS":
            return WcsHarvest(service).harvest()
Example #21
def ping_service_task(service_id):
    with app.app_context():
        # get last for this service
        last_stat = db.Stat.find_one({"service_id": ObjectId(service_id)}, sort=[("created", -1)])

        stat = db.Stat()
        stat.service_id = ObjectId(service_id)
        stat.ping_service()
        stat.save()

        if last_stat and last_stat.operational_status != stat.operational_status:
            queue.enqueue(send_service_down_email, ObjectId(service_id))
Example #22
def harvest(service_id):
    with app.app_context():
        service = db.Service.find_one({'_id': ObjectId(service_id)})

        if service.service_type == "DAP":
            return DapHarvest(service).harvest()
        elif service.service_type == "SOS":
            return SosHarvest(service).harvest()
        elif service.service_type == "WMS":
            return WmsHarvest(service).harvest()
        elif service.service_type == "WCS":
            return WcsHarvest(service).harvest()
Example #23
def queue_ping_tasks():
    """
    Generate a number of ping tasks.

    Meant to be called via cron. Only queues services that are active.
    """
    with app.app_context():
        sids = [
            s._id for s in db.Service.find({'active': True}, {'_id': True})
        ]
        for sid in sids:
            queue.enqueue(ping_service_task, sid)
Example #24
def ping_service_task(service_id):
    with app.app_context():
        # get last for this service
        last_stat = db.Stat.find_one({'service_id': ObjectId(service_id)},
                                     sort=[('created', -1)])

        stat = db.Stat()
        stat.service_id = ObjectId(service_id)
        stat.ping_service()
        stat.save()

        if last_stat and last_stat.operational_status != stat.operational_status:
            queue.enqueue(send_service_down_email, ObjectId(service_id))
Example #25
    def ccheck_service(self):
        assert self.sos

        with app.app_context():

            scores = None

            try:
                cs = ComplianceCheckerCheckSuite()
                groups = cs.run(self.sos, 'ioos')
                scores = groups['ioos']
            except Exception as e:
                app.logger.warn("Caught exception doing Compliance Checker on SOS service: %s", e)

            return scores
Example #27
    def metamap_station(self, sensor_ml):
        with app.app_context():
            # gets a metamap document of this service using wicken
            beliefs = IOOSSOSDSCheck.beliefs()
            doc = MultipleXmlDogma('sos-ds', beliefs, sensor_ml._root, namespaces=get_namespaces())

            # now make a map out of this
            # @TODO wicken should make this easier
            metamap = {}
            for k in beliefs:
                try:
                    metamap[k] = getattr(doc, doc._fixup_belief(k)[0])
                except Exception as e:
                    pass

            return metamap
Example #29
    def metamap_dataset(self, ncdataset):
        with app.app_context():

            # gets a metamap document of this service using wicken
            beliefs = IOOSNCCheck.beliefs()
            ncnamespaces = {'nc': pb_namespaces['ncml']}

            doc = NetCDFDogma('nc',
                              beliefs,
                              ncdataset,
                              namespaces=ncnamespaces)

            # now make a map out of this
            # @TODO wicken should make this easier

            m_names, m_units = ['Variable Names*', 'Variable Units*']
            metamap = {}
            for k in beliefs:
                try:
                    metamap[k] = getattr(doc, doc._fixup_belief(k)[0])
                except Exception as e:
                    app.logger.exception("Problem setting belief (%s)", k)

            # Override the Wicken return to preserve the order
            metamap[m_names] = []
            metamap[m_units] = []

            # Wicken doesn't preserve the order between the names and the units,
            # so what you wind up with is two lists that can't be related, but we
            # want to keep the relationship between the name and the units

            for k in ncdataset.variables.iterkeys():
                var_name = k
                standard_name = getattr(ncdataset.variables[k],
                                        'standard_name', '')
                units = getattr(ncdataset.variables[k], 'units', '')

                # Only map metadata where we have all three
                if var_name and standard_name and units:
                    metamap[m_names].append('%s (%s)' %
                                            (var_name, standard_name))
                    metamap[m_units].append(units)

            return metamap
Example #30
    def metamap_service(self):
        assert self.sos

        with app.app_context():
            # gets a metamap document of this service using wicken
            beliefs = IOOSSOSGCCheck.beliefs()
            doc = MultipleXmlDogma('sos-gc', beliefs, self.sos._capabilities, namespaces=get_namespaces())

            # now make a map out of this
            # @TODO wicken should make this easier
            metamap = {}
            for k in beliefs:
                try:
                    metamap[k] = getattr(doc, doc._fixup_belief(k)[0])
                except Exception as e:
                    pass

            return metamap
Example #32
def ping_service_task(service_id):
    with app.app_context():

        pl = db.PingLatest.get_for_service(ObjectId(service_id))
        wasnew, flip = pl.ping_service()
        pl.save()

        # save to WeeklyArchive
        if wasnew:
            utcnow = datetime.utcnow()
            pa = db.PingArchive.get_for_service(ObjectId(service_id), utcnow)
            pa.add_ping_data(pl.last_response_time, pl.last_operational_status)
            pa.updated = utcnow
            pa.save()

        if flip:
            queue.enqueue(send_service_down_email, ObjectId(service_id))

        return pl.last_response_time
Example #33
    def metamap_dataset(self, ncdataset):
        with app.app_context():

            # gets a metamap document of this service using wicken
            beliefs = IOOSNCCheck.beliefs()
            ncnamespaces = {'nc': pb_namespaces['ncml']}

            doc = NetCDFDogma('nc', beliefs, ncdataset,
                              namespaces=ncnamespaces)

            # now make a map out of this
            # @TODO wicken should make this easier

            m_names, m_units = ['Variable Names*', 'Variable Units*']
            metamap = {}
            for k in beliefs:
                try:
                    metamap[k] = getattr(doc, doc._fixup_belief(k)[0])
                except Exception:
                    app.logger.exception("Problem setting belief (%s)", k)

            # Override the Wicken return to preserve the order
            metamap[m_names] = []
            # Override the Wicken return to preserve the order
            metamap[m_units] = []

            # Wicken doesn't preserve the order between the names and the units,
            # so what you wind up with is two lists that can't be related, but we
            # want to keep the relationship between the name and the units

            for k in ncdataset.variables.iterkeys():
                var_name = k
                standard_name = getattr(
                    ncdataset.variables[k], 'standard_name', '')
                units = getattr(ncdataset.variables[k], 'units', '')

                # Only map metadata where we have all three
                if var_name and standard_name and units:
                    metamap[m_names].append('%s (%s)' %
                                            (var_name, standard_name))
                    metamap[m_units].append(units)

            return metamap
Example #34
    def metamap_dataset(self, ncdataset):
        with app.app_context():

            # gets a metamap document of this service using wicken
            beliefs = IOOSNCCheck.beliefs()
            ncnamespaces = {'nc':pb_namespaces['ncml']}

            doc = NetCDFDogma('nc', beliefs, ncdataset, namespaces=ncnamespaces)

            # now make a map out of this
            # @TODO wicken should make this easier
            metamap = {}
            for k in beliefs:
                try:
                    metamap[k] = getattr(doc, doc._fixup_belief(k)[0])
                except Exception as e:
                    print k, e

            return metamap
Example #36
def queue_provider(provider):
    with app.app_context():
        for s in db.Service.find({'data_provider':provider, 'active':True}):
            service_id = s._id
            if service_id in LARGER_SERVICES:
                continue
            # count all the datasets associated with this particular service
            datalen = db.datasets.find({'services.service_id':
                                         service_id}).count()
            # handle timeouts for services with large numbers of datasets
            if datalen <= 36:
                timeout_secs = 180
            else:
                # for services with many datasets, allow roughly 60 seconds
                # per dataset, on average
                timeout_secs = datalen * 60
            queue.enqueue_call(harvest, args=(service_id,),
                               timeout=timeout_secs)

    # record dataset/service metrics after harvest
    add_counts()
Example #37
def reindex_services(provider=None):
    '''
    Downloads all records from CKAN and creates service records for the
    appropriate resources defined in those records.
    '''
    region_map = get_region_map()
    if provider is not None:
        region_map = [org for org in region_map if org['name'] == provider]

    with app.app_context():

        for organization in region_map:
            index_organization(organization)

        # Deactivate any service older than 7 days
        old = datetime.utcnow() - timedelta(days=7)
        db.services.update({"updated": {"$lt": old}},
                           {"$set": {"active": False, "updated": datetime.utcnow()}},
                           multi=True,
                           upsert=False)

        return
Example #38
def queue_large_service_harvest_tasks():
    larger_services = [
        ObjectId('53d34aed8c0db37e0b538fda'),
        ObjectId('53d49c8d8c0db37ff1370308')
    ]
    with app.app_context():
        for s in db.Service.find({'_id':{'$in':larger_services}}):
            service_id = s._id
            # count all the datasets associated with this particular service
            datalen = db.datasets.find({'services.service_id':
                                         service_id}).count()
            # handle timeouts for services with large numbers of datasets
            if datalen <= 36:
                timeout_secs = 180
            else:
                # for services with many datasets, allow roughly 60 seconds
                # per dataset, on average
                timeout_secs = datalen * 60
            queue.enqueue_call(harvest, args=(service_id,),
                               timeout=timeout_secs)

    # record dataset/service metrics after harvest
    add_counts()
Example #39
def send_service_down_email(service_id):
    with app.app_context():
        kwargs = {'service' : db.Service.find_one({'_id':service_id}),
                  'stat'    : db.Stat.find_one({'service_id':service_id}, sort=[('created',-1)]),
                  'last_success_stat' : db.Stat.find_one({'service_id':service_id, 'operational_status':1}, sort=[('created',-1)]) }
        kwargs['status'] = kwargs['stat'].operational_status

        subject = "[ioos] Service Status Alert (%s): %s (%s)" % ("UP" if kwargs['status'] else "DOWN", kwargs['service'].name, kwargs['service'].service_type)

        text_template = render_template("service_status_changed.txt", **kwargs)
        html_template = render_template("service_status_changed.html", **kwargs)

        to_addresses = [app.config.get("MAIL_DEFAULT_LIST")] if app.config.get('MAILER_DEBUG') == False else [app.config.get("MAIL_DEFAULT_TO")]
        # Don't send these until Anna updates the ISO document in GeoPortal with the correct service contacts
        #if app.config.get('MAILER_DEBUG') == False and kwargs['service'].contact is not None:
        #    to_addresses = kwargs['service'].contact.split(",")
        cc_addresses = [app.config.get("MAIL_DEFAULT_TO")]

        send(subject,
             to_addresses,
             cc_addresses,
             text_template,
             html_template)
Example #40
    def harvest(self):
        """
        Identify the type of CF dataset this is:
          * UGRID
          * CGRID
          * RGRID
          * DSG
        """

        METADATA_VAR_NAMES = [u'crs', u'projection']

        # CF standard names for Axis
        STD_AXIS_NAMES = [
            u'latitude', u'longitude', u'time', u'forecast_reference_time',
            u'forecast_period', u'ocean_sigma', u'ocean_s_coordinate_g1',
            u'ocean_s_coordinate_g2', u'ocean_s_coordinate',
            u'ocean_double_sigma', u'ocean_sigma_over_z',
            u'projection_y_coordinate', u'projection_x_coordinate'
        ]

        # Some datasets don't define standard_names on axis variables.  This is used to weed them out based on the
        # actual variable name
        COMMON_AXIS_NAMES = [
            u'x', u'y', u'lat', u'latitude', u'lon', u'longitude', u'time',
            u'time_run', u'time_offset', u'ntimes', u'lat_u', u'lon_u',
            u'lat_v', u'lon_v  ', u'lat_rho', u'lon_rho', u'lat_psi'
        ]

        cd = CommonDataset.open(self.service.get('url'))

        # For DAP, the unique ID is the URL
        unique_id = self.service.get('url')

        with app.app_context():
            dataset = db.Dataset.find_one({'uid': unicode(unique_id)})
            if dataset is None:
                dataset = db.Dataset()
                dataset.uid = unicode(unique_id)

        # Find service reference in Dataset.services and remove (to replace it)
        tmp = dataset.services[:]
        for d in tmp:
            if d['service_id'] == self.service.get('_id'):
                dataset.services.remove(d)

        # Parsing messages
        messages = []

        # NAME
        name = None
        try:
            name = unicode_or_none(cd.nc.getncattr('title'))
        except AttributeError:
            messages.append(
                u"Could not get dataset name.  No global attribute named 'title'."
            )

        # DESCRIPTION
        description = None
        try:
            description = unicode_or_none(cd.nc.getncattr('summary'))
        except AttributeError:
            messages.append(
                u"Could not get dataset description.  No global attribute named 'summary'."
            )

        # KEYWORDS
        keywords = []
        try:
            keywords = sorted(
                map(lambda x: unicode(x.strip()),
                    cd.nc.getncattr('keywords').split(",")))
        except AttributeError:
            messages.append(
                u"Could not get dataset keywords.  No global attribute named 'keywords' or was not comma seperated list."
            )

        # VARIABLES
        prefix = ""
        # Add additional prefix mappings as they become available.
        try:
            standard_name_vocabulary = unicode(
                cd.nc.getncattr("standard_name_vocabulary"))

            cf_regex = [
                re.compile("CF-"),
                re.compile(
                    'http://www.cgd.ucar.edu/cms/eaton/cf-metadata/standard_name.html'
                )
            ]

            for reg in cf_regex:
                if reg.match(standard_name_vocabulary) is not None:
                    prefix = "http://mmisw.org/ont/cf/parameter/"
                    break
        except AttributeError:
            pass

        # Get variables with a standard_name
        std_variables = [
            cd.get_varname_from_stdname(x)[0]
            for x in self.get_standard_variables(cd.nc)
            if x not in STD_AXIS_NAMES and
            len(cd.nc.variables[cd.get_varname_from_stdname(x)[0]].shape) > 0
        ]

        # Get variables that are not axis variables or metadata variables and are not already in the 'std_variables' variable
        non_std_variables = list(
            set([
                x for x in cd.nc.variables if x not in itertools.chain(
                    _possibley, _possiblex, _possiblez, _possiblet,
                    METADATA_VAR_NAMES, COMMON_AXIS_NAMES) and
                len(cd.nc.variables[x].shape) > 0 and x not in std_variables
            ]))
        """
        var_to_get_geo_from = None
        if len(std_names) > 0:
            var_to_get_geo_from = cd.get_varname_from_stdname(std_names[-1])[0]
            messages.append(u"Variable '%s' with standard name '%s' was used to calculate geometry." % (var_to_get_geo_from, std_names[-1]))
        else:
            # No idea which variable to generate geometry from... try to factor variables with a shape > 1.
            try:
                var_to_get_geo_from = [x for x in variables if len(cd.nc.variables[x].shape) > 1][-1]
            except IndexError:
                messages.append(u"Could not find any non-axis variables to compute geometry from.")
            else:
                messages.append(u"No 'standard_name' attributes were found on non-axis variables.  Variable '%s' was used to calculate geometry." % var_to_get_geo_from)
        """

        # LOCATION (from Paegan)
        # Try POLYGON and fall back to BBOX
        gj = None
        for v in itertools.chain(std_variables, non_std_variables):
            try:
                gj = mapping(cd.getboundingpolygon(var=v))
            except (AttributeError, AssertionError, ValueError):
                try:
                    # Returns a tuple of four coordinates, but box takes four separate positional arguments
                    # Asterisk magic expands the tuple into positional arguments
                    gj = mapping(box(*cd.get_bbox(var=v)))
                except (AttributeError, AssertionError, ValueError):
                    pass

            if gj is not None:
                # We computed something, break out of loop.
                messages.append(
                    u"Variable %s was used to calculate geometry." % v)
                break

        if gj is None:
            messages.append(
                u"The underlying 'Paegan' data access library could not determine a bounding BOX for this dataset."
            )
            messages.append(
                u"The underlying 'Paegan' data access library could not determine a bounding POLYGON for this dataset."
            )
            messages.append(
                u"Failed to calculate geometry using all of the following variables: %s"
                % ", ".join(itertools.chain(std_variables, non_std_variables)))

        # TODO: compute bounding box using global attributes

        final_var_names = []
        if prefix == "":
            messages.append(
                u"Could not find a standard name vocabulary.  No global attribute named 'standard_name_vocabulary'.  Variable list may be incorrect or contain non-measured quantities."
            )
            final_var_names = non_std_variables + std_variables
        else:
            final_var_names = non_std_variables + list(
                map(unicode, [
                    "%s%s" %
                    (prefix, cd.nc.variables[x].getncattr("standard_name"))
                    for x in std_variables
                ]))

        service = {
            'name': name,
            'description': description,
            'service_type': self.service.get('service_type'),
            'service_id': ObjectId(self.service.get('_id')),
            'data_provider': self.service.get('data_provider'),
            'metadata_type': u'ncml',
            'metadata_value': unicode(dataset2ncml(cd.nc, url=self.service.get('url'))),
            'messages': map(unicode, messages),
            'keywords': keywords,
            'variables': map(unicode, final_var_names),
            'asset_type': unicode(cd._datasettype).upper(),
            'geojson': gj,
            'updated': datetime.utcnow()
        }

        with app.app_context():
            dataset.services.append(service)
            dataset.updated = datetime.utcnow()
            dataset.save()

        return "Harvested"
Example #41
    def wrapper(*args, **kwargs):
        with app.app_context():
            return f(*args, **kwargs)
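
This fragment looks like the inner function of a decorator that pushes a Flask application context around the wrapped call. A minimal sketch of the full pattern it presumably belongs to (the decorator name with_app_context is an assumption, and app is the same module-level Flask application used throughout these examples):

import functools

def with_app_context(f):
    """Hypothetical decorator: run f inside app.app_context() so that
    extensions such as the database and logger are available."""
    @functools.wraps(f)
    def wrapper(*args, **kwargs):
        with app.app_context():
            return f(*args, **kwargs)
    return wrapper
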
Example #42
def migrate():
    with app.app_context():
        migrate_names()
        migrate_active_datasets()
        migrate_active_metadata()
        app.logger.info("Migration 2014-08-27 complete")
Example #43
    def harvest(self):
        """
        Identify the type of CF dataset this is:
          * UGRID
          * CGRID
          * RGRID
          * DSG
        """

        try:
            cd = CommonDataset.open(self.service.get('url'))
        except Exception as e:
            app.logger.error("Could not open DAP dataset from '%s'\n"
                             "Exception %s: %s" %
                             (self.service.get('url'), type(e).__name__, e))
            return 'Not harvested'

        # rely on times in the file first over global atts for calculating
        # start/end times of dataset.
        tmin, tmax = self.get_min_max_time(cd)
        # if nothing was returned, try to get from global atts
        if (tmin == None and tmax == None
                and 'time_coverage_start' in cd.metadata
                and 'time_coverage_end' in cd.metadata):
            try:
                tmin, tmax = (parse(cd.metadata[t])
                              for t in ('time_coverage_start',
                                        'time_coverage_end'))
            except ValueError:
                tmin, tmax = None, None
        # For DAP, the unique ID is the URL
        unique_id = self.service.get('url')

        with app.app_context():
            dataset = db.Dataset.find_one({'uid': unicode(unique_id)})
            if dataset is None:
                dataset = db.Dataset()
                dataset.uid = unicode(unique_id)
                dataset['active'] = True

        # Find service reference in Dataset.services and remove (to replace it)
        tmp = dataset.services[:]
        for d in tmp:
            if d['service_id'] == self.service.get('_id'):
                dataset.services.remove(d)

        # Parsing messages
        messages = []

        # NAME
        name = None
        try:
            name = unicode_or_none(cd.nc.getncattr('title'))
        except AttributeError:
            messages.append(
                u"Could not get dataset name.  No global attribute named 'title'."
            )

        # DESCRIPTION
        description = None
        try:
            description = unicode_or_none(cd.nc.getncattr('summary'))
        except AttributeError:
            messages.append(
                u"Could not get dataset description.  No global attribute named 'summary'."
            )

        # KEYWORDS
        keywords = []
        try:
            keywords = sorted(
                map(lambda x: unicode(x.strip()),
                    cd.nc.getncattr('keywords').split(",")))
        except AttributeError:
            messages.append(
                u"Could not get dataset keywords.  No global attribute named 'keywords' or was not comma seperated list."
            )

        # VARIABLES
        prefix = ""
        # Add additional prefix mappings as they become available.
        try:
            standard_name_vocabulary = unicode(
                cd.nc.getncattr("standard_name_vocabulary"))

            cf_regex = [
                re.compile("CF-"),
                re.compile(
                    'http://www.cgd.ucar.edu/cms/eaton/cf-metadata/standard_name.html'
                )
            ]

            for reg in cf_regex:
                if reg.match(standard_name_vocabulary) is not None:
                    prefix = "http://mmisw.org/ont/cf/parameter/"
                    break
        except AttributeError:
            pass

        # Get variables with a standard_name
        std_variables = [
            cd.get_varname_from_stdname(x)[0]
            for x in self.get_standard_variables(cd.nc)
            if x not in self.STD_AXIS_NAMES and
            len(cd.nc.variables[cd.get_varname_from_stdname(x)[0]].shape) > 0
        ]

        # Get variables that are not axis variables or metadata variables and are not already in the 'std_variables' variable
        non_std_variables = list(
            set([
                x for x in cd.nc.variables if x not in itertools.chain(
                    _possibley, _possiblex, _possiblez, _possiblet,
                    self.METADATA_VAR_NAMES, self.COMMON_AXIS_NAMES) and
                len(cd.nc.variables[x].shape) > 0 and x not in std_variables
            ]))

        axis_names = DapHarvest.get_axis_variables(cd.nc)
        """
        var_to_get_geo_from = None
        if len(std_names) > 0:
            var_to_get_geo_from = cd.get_varname_from_stdname(std_names[-1])[0]
            messages.append(u"Variable '%s' with standard name '%s' was used to calculate geometry." % (var_to_get_geo_from, std_names[-1]))
        else:
            # No idea which variable to generate geometry from... try to factor variables with a shape > 1.
            try:
                var_to_get_geo_from = [x for x in variables if len(cd.nc.variables[x].shape) > 1][-1]
            except IndexError:
                messages.append(u"Could not find any non-axis variables to compute geometry from.")
            else:
                messages.append(u"No 'standard_name' attributes were found on non-axis variables.  Variable '%s' was used to calculate geometry." % var_to_get_geo_from)
        """

        # LOCATION (from Paegan)
        # Try POLYGON and fall back to BBOX

        # paegan does not support ugrid, so try to detect this condition and skip
        is_ugrid = False
        is_trajectory = False
        for vname, v in cd.nc.variables.iteritems():
            if 'cf_role' in v.ncattrs():
                if v.getncattr('cf_role') == 'mesh_topology':
                    is_ugrid = True
                    break
                elif v.getncattr('cf_role') == 'trajectory_id':
                    is_trajectory = True
                    break

        gj = None

        if is_ugrid:
            messages.append(
                u"The underlying 'Paegan' data access library does not support UGRID and cannot parse geometry."
            )
        elif is_trajectory:
            coord_names = {}
            # try to get info for x, y, z, t axes
            for v in itertools.chain(std_variables, non_std_variables):
                try:
                    coord_names = cd.get_coord_names(v, **axis_names)

                    if coord_names['xname'] is not None and \
                       coord_names['yname'] is not None:
                        break
                except (AssertionError, AttributeError, ValueError, KeyError):
                    pass
            else:
                messages.append(
                    u"Trajectory discovered but could not detect coordinate variables using the underlying 'Paegan' data access library."
                )

            if 'xname' in coord_names:
                try:
                    xvar = cd.nc.variables[coord_names['xname']]
                    yvar = cd.nc.variables[coord_names['yname']]

                    # one less order of magnitude eg 390000 -> 10000
                    slice_factor = 10**(int(math.log10(xvar.size)) - 1)
                    if slice_factor < 1:
                        slice_factor = 1

                    # TODO: don't split x/y as separate arrays.  Refactor to
                    # use single numpy array instead with both lon/lat

                    # tabledap datasets must be treated differently than
                    # standard DAP endpoints.  Retrieve geojson instead of
                    # trying to access as a DAP endpoint
                    if 'erddap/tabledap' in unique_id:
                        # take off 's.' from erddap
                        gj = self.erddap_geojson_url(coord_names)
                        # type defaults to MultiPoint, change to LineString
                        coords = np.array(gj['coordinates'][::slice_factor] +
                                          gj['coordinates'][-1:])
                        xs = coords[:, 0]
                        ys = coords[:, 1]
                    else:
                        xs = np.concatenate((xvar[::slice_factor], xvar[-1:]))
                        ys = np.concatenate((yvar[::slice_factor], yvar[-1:]))
                    # both coords must be valid to have a valid vertex
                    # get rid of any nans and unreasonable lon/lats
                    valid_idx = ((~np.isnan(xs)) & (np.absolute(xs) <= 180) &
                                 (~np.isnan(ys)) & (np.absolute(ys) <= 90))

                    xs = xs[valid_idx]
                    ys = ys[valid_idx]
                    # Shapely seems to require float64 values or incorrect
                    # values will propagate for the generated lineString
                    # if the array is not numpy's float64 dtype
                    lineCoords = np.array([xs, ys]).T.astype('float64')

                    gj = mapping(asLineString(lineCoords))

                    messages.append(u"Variable %s was used to calculate "
                                    u"trajectory geometry, and is a "
                                    u"naive sampling." % v)

                except (AssertionError, AttributeError, ValueError, KeyError,
                        IndexError) as e:
                    app.logger.warn("Trajectory error occured: %s", e)
                    messages.append(
                        u"Trajectory discovered but could not create a geometry."
                    )

        else:
            for v in itertools.chain(std_variables, non_std_variables):
                try:
                    gj = mapping(
                        cd.getboundingpolygon(var=v,
                                              **axis_names).simplify(0.5))
                except (AttributeError, AssertionError, ValueError, KeyError,
                        IndexError):
                    try:
                        # Returns a tuple of four coordinates, but box takes four separate positional arguments
                        # Asterisk magic expands the tuple into positional arguments
                        app.logger.exception("Error calculating bounding box")

                        # handles "points" aka single position NCELLs
                        bbox = cd.getbbox(var=v, **axis_names)
                        gj = self.get_bbox_or_point(bbox)

                    except (AttributeError, AssertionError, ValueError,
                            KeyError, IndexError):
                        pass

                if gj is not None:
                    # We computed something, break out of loop.
                    messages.append(
                        u"Variable %s was used to calculate geometry." % v)
                    break

            if gj is None:  # Try the globals
                gj = self.global_bounding_box(cd.nc)
                messages.append(
                    u"Bounding Box calculated using global attributes")
            if gj is None:
                messages.append(
                    u"The underlying 'Paegan' data access library could not determine a bounding BOX for this dataset."
                )
                messages.append(
                    u"The underlying 'Paegan' data access library could not determine a bounding POLYGON for this dataset."
                )
                messages.append(
                    u"Failed to calculate geometry using all of the following variables: %s"
                    % ", ".join(
                        itertools.chain(std_variables, non_std_variables)))

        # TODO: compute bounding box using global attributes

        final_var_names = []
        if prefix == "":
            messages.append(
                u"Could not find a standard name vocabulary.  No global attribute named 'standard_name_vocabulary'.  Variable list may be incorrect or contain non-measured quantities."
            )
            final_var_names = non_std_variables + std_variables
        else:
            final_var_names = non_std_variables + list(
                map(unicode, [
                    "%s%s" %
                    (prefix, cd.nc.variables[x].getncattr("standard_name"))
                    for x in std_variables
                ]))

        service = {
            'name': name,
            'description': description,
            'service_type': self.service.get('service_type'),
            'service_id': ObjectId(self.service.get('_id')),
            'data_provider': self.service.get('data_provider'),
            'metadata_type': u'ncml',
            'metadata_value': unicode(dataset2ncml(cd.nc, url=self.service.get('url'))),
            'time_min': tmin,
            'time_max': tmax,
            'messages': map(unicode, messages),
            'keywords': keywords,
            'variables': map(unicode, final_var_names),
            'asset_type': get_common_name(DapHarvest.get_asset_type(cd)),
            'geojson': gj,
            'updated': datetime.utcnow()
        }

        with app.app_context():
            dataset.services.append(service)
            dataset.updated = datetime.utcnow()
            dataset.save()

        ncdataset = Dataset(self.service.get('url'))
        scores = self.ccheck_dataset(ncdataset)
        metamap = self.metamap_dataset(ncdataset)

        try:
            metadata_rec = self.save_ccheck_dataset('ioos', dataset._id,
                                                    scores, metamap)
        except Exception as e:
            metadata_rec = None
            app.logger.error(
                "could not save compliancecheck/metamap information",
                exc_info=True)

        return "Harvested"
Example #44
    def process_station(self, uid, offering):
        """ Makes a DescribeSensor request based on a 'uid' parameter being a
            station procedure.  Also pass along an offering with
            getCapabilities information for items such as temporal extent"""

        GML_NS = "http://www.opengis.net/gml"
        XLINK_NS = "http://www.w3.org/1999/xlink"

        with app.app_context():

            app.logger.info("process_station: %s", uid)
            desc_sens = self._describe_sensor(uid, timeout=1200)
            # FIXME: add some kind of notice saying the station failed
            if desc_sens is None:
                app.logger.warn(
                    "Could not get a valid describeSensor response")
                return
            metadata_value = etree.fromstring(desc_sens)
            sensor_ml = SensorML(metadata_value)
            try:
                station_ds = IoosDescribeSensor(metadata_value)
            # if this doesn't conform to IOOS SensorML sub, fall back to
            # manually picking apart the SensorML
            except ows.ExceptionReport:
                station_ds = process_sensorml(sensor_ml.members[0])

            unique_id = station_ds.id
            if unique_id is None:
                app.logger.warn(
                    "Could not get a 'stationID' from the SensorML identifiers.  Looking for a definition of 'http://mmisw.org/ont/ioos/definition/stationID'"
                )
                return

            dataset = db.Dataset.find_one({'uid': unicode(unique_id)})
            if dataset is None:
                dataset = db.Dataset()
                dataset.uid = unicode(unique_id)
                dataset['active'] = True

            # Find service reference in Dataset.services and remove (to replace it)
            tmp = dataset.services[:]
            for d in tmp:
                if d['service_id'] == self.service.get('_id'):
                    dataset.services.remove(d)

            # Parsing messages
            messages = []

            # NAME
            name = unicode_or_none(station_ds.shortName)
            if name is None:
                messages.append(
                    u"Could not get a 'shortName' from the SensorML identifiers.  Looking for a definition of 'http://mmisw.org/ont/ioos/definition/shortName'"
                )

            # DESCRIPTION
            description = unicode_or_none(station_ds.longName)
            if description is None:
                messages.append(
                    u"Could not get a 'longName' from the SensorML identifiers.  Looking for a definition of 'http://mmisw.org/ont/ioos/definition/longName'"
                )

            # PLATFORM TYPE
            asset_type = unicode_or_none(
                getattr(station_ds, 'platformType', None))
            if asset_type is None:
                messages.append(
                    u"Could not get a 'platformType' from the SensorML identifiers.  Looking for a definition of 'http://mmisw.org/ont/ioos/definition/platformType'"
                )

            # LOCATION is in GML
            gj = None
            loc = station_ds.location
            if loc is not None and loc.tag == "{%s}Point" % GML_NS:
                pos_element = loc.find("{%s}pos" % GML_NS)
                # some older responses may use the deprecated coordinates
                # element
                if pos_element is None:
                    # if pos not found use deprecated coordinates element
                    pos_element = loc.find("{%s}coordinates" % GML_NS)
                # strip out points
                positions = map(float, pos_element.text.split(" "))

                for el in [pos_element, loc]:
                    srs_name = testXMLAttribute(el, "srsName")
                    if srs_name:
                        crs = Crs(srs_name)
                        if crs.axisorder == "yx":
                            gj = json.loads(
                                geojson.dumps(
                                    geojson.Point([positions[1],
                                                   positions[0]])))
                        else:
                            gj = json.loads(
                                geojson.dumps(
                                    geojson.Point([positions[0],
                                                   positions[1]])))
                        break
                else:
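                    # for/else: this block runs only when the loop above never
                    # hit 'break', i.e. no srsName attribute was found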
                    if positions:
                        messages.append(
                            u"Position(s) found but could not parse SRS: %s, %s"
                            % (positions, srs_name))

            else:
                messages.append(
                    u"Found an unrecognized child of the sml:location element and did not attempt to process it: %s"
                    % loc)

            meta_str = unicode(etree.tostring(metadata_value)).strip()
            if len(meta_str) > 4000000:
                messages.append(
                    u'Metadata document was too large to store (len: %s)' %
                    len(meta_str))
                meta_str = u''

            service = {
                # Reset service
                'name': name,
                'description': description,
                'service_type': self.service.get('service_type'),
                'service_id': ObjectId(self.service.get('_id')),
                'data_provider': self.service.get('data_provider'),
                'metadata_type': u'sensorml',
                'metadata_value': u'',
                'time_min': getattr(offering, 'begin_position', None),
                'time_max': getattr(offering, 'end_position', None),
                'messages': map(unicode, messages),
                'keywords': map(unicode, sorted(station_ds.keywords)),
                'variables': map(unicode, sorted(station_ds.variables)),
                'asset_type': get_common_name(asset_type),
                'geojson': gj,
                'updated': datetime.utcnow()
            }

            dataset.services.append(service)
            dataset.updated = datetime.utcnow()
            dataset.save()

            # do compliance checker / metadata now
            scores = self.ccheck_station(sensor_ml)
            metamap = self.metamap_station(sensor_ml)

            try:
                self.save_ccheck_station('ioos', dataset._id, scores, metamap)
            except Exception as e:
                app.logger.warn(
                    "could not save compliancecheck/metamap information: %s",
                    e)

            return "Harvest Successful"
Пример #45
0
    def process_station(self, uid):
        """ Makes a DescribeSensor request based on a 'uid' parameter being a station procedure """

        GML_NS   = "http://www.opengis.net/gml"
        XLINK_NS = "http://www.w3.org/1999/xlink"

        with app.app_context():

            metadata_value = etree.fromstring(self.sos.describe_sensor(outputFormat='text/xml;subtype="sensorML/1.0.1/profiles/ioos_sos/1.0"', procedure=uid))
            station_ds     = IoosDescribeSensor(metadata_value)

            unique_id = station_ds.id
            if unique_id is None:
                app.logger.warn("Could not get a 'stationID' from the SensorML identifiers.  Looking for a definition of 'http://mmisw.org/ont/ioos/definition/stationID'")
                return

            dataset = db.Dataset.find_one( { 'uid' : unicode(unique_id) } )
            if dataset is None:
                dataset = db.Dataset()
                dataset.uid = unicode(unique_id)

            # Find service reference in Dataset.services and remove (to replace it)
            tmp = dataset.services[:]
            for d in tmp:
                if d['service_id'] == self.service.get('_id'):
                    dataset.services.remove(d)

            # Parsing messages
            messages = []

            # NAME
            name = unicode_or_none(station_ds.shortName)
            if name is None:
                messages.append(u"Could not get a 'shortName' from the SensorML identifiers.  Looking for a definition of 'http://mmisw.org/ont/ioos/definition/shortName'")

            # DESCRIPTION
            description = unicode_or_none(station_ds.longName)
            if description is None:
                messages.append(u"Could not get a 'longName' from the SensorML identifiers.  Looking for a definition of 'http://mmisw.org/ont/ioos/definition/longName'")

            # PLATFORM TYPE
            asset_type = unicode_or_none(station_ds.platformType)
            if asset_type is None:
                messages.append(u"Could not get a 'platformType' from the SensorML identifiers.  Looking for a definition of 'http://mmisw.org/ont/ioos/definition/platformType'")

            # LOCATION is in GML
            gj = None
            loc = station_ds.location
            if loc is not None and loc.tag == "{%s}Point" % GML_NS:
                pos_element = loc.find("{%s}pos" % GML_NS)
                # strip out points
                positions = map(float, testXMLValue(pos_element).split(" "))
                crs = Crs(testXMLAttribute(pos_element, "srsName"))
                if crs.axisorder == "yx":
                    gj = json.loads(geojson.dumps(geojson.Point([positions[1], positions[0]])))
                else:
                    gj = json.loads(geojson.dumps(geojson.Point([positions[0], positions[1]])))
            else:
                messages.append(u"Found an unrecognized child of the sml:location element and did not attempt to process it: %s" % etree.tostring(loc).strip())

            service = {
                # Reset service
                'name'              : name,
                'description'       : description,
                'service_type'      : self.service.get('service_type'),
                'service_id'        : ObjectId(self.service.get('_id')),
                'data_provider'     : self.service.get('data_provider'),
                'metadata_type'     : u'sensorml',
                'metadata_value'    : unicode(etree.tostring(metadata_value)).strip(),
                'messages'          : map(unicode, messages),
                'keywords'          : map(unicode, sorted(station_ds.keywords)),
                'variables'         : map(unicode, sorted(station_ds.variables)),
                'asset_type'        : asset_type,
                'geojson'           : gj,
                'updated'           : datetime.utcnow()
            }

            dataset.services.append(service)
            dataset.updated = datetime.utcnow()
            dataset.save()
            return "Harvested"
Пример #46
0
def regulate():
    with app.app_context():

        # Get services that have not been updated in two weeks.
        # The reindex job sets the 'updated' field, so the logic below catches
        # services that the reindex task has not seen in two weeks; their jobs
        # are cancelled (actual deletion is left as a manual step below).
        two_weeks_ago = (datetime.utcnow() - timedelta(weeks=2)).replace(tzinfo=pytz.utc)
        deletes = [s for s in db.Service.find() if s.updated.replace(tzinfo=pytz.utc).astimezone(pytz.utc) < two_weeks_ago]
        for d in deletes:
            d.cancel_ping()
            d.cancel_harvest()
            # I don't think we want to delete these.
            # Lets make deletion a manual process.
            #d.delete()
            # TODO: Now delete the stats that were collected for this service.

        # Get the currently scheduled jobs so we can inspect their functions and args
        jobs = scheduler.get_jobs()

        # Make sure a daily report job is running
        daily_email_jobs = [job for job in jobs if job.func == send_daily_report_email]
        if len(daily_email_jobs) > 1:
            # Cancel all but the first daily email job
            for j in daily_email_jobs[1:]:
                scheduler.cancel(j)
        elif len(daily_email_jobs) < 1:
            # Run today at 3am (7am UTC) if it is between midnight and 3am
            runat = datetime.utcnow().replace(hour=7, minute=0, second=0, microsecond=0)
            if datetime.utcnow() > runat:
                # Run tomorrow at 3am (7am UTC) because it is already past that time.
                runat = runat + timedelta(days=1)

            scheduler.schedule(
                scheduled_time=runat,           # Time for first execution
                func=send_daily_report_email,   # Function to be queued
                interval=86400,                 # Time before the function is called again, in seconds (86400 == 1 day)
                repeat=None,                    # Repeat this number of times (None means repeat forever)
                result_ttl=100000               # How long to keep the results, in seconds
            )

        # Make sure a service update job is running
        reindex_services_jobs = [job for job in jobs if job.func == reindex_services]
        if len(reindex_services_jobs) < 1:
            scheduler.schedule(
                scheduled_time=datetime.utcnow(),  # Time for first execution
                func=reindex_services,             # Function to be queued
                interval=21600,                    # Time before the function is called again, in seconds (21600 == 1/4 of a day)
                repeat=None,                       # Repeat this number of times (None means repeat forever)
                result_ttl=40000,                  # How long to keep the results, in seconds
                timeout=1200                       # Default timeout of 180 seconds may not be enough
            )

        # Make sure each service has a ping job
        stat_jobs = [unicode(job.args[0]) for job in jobs if job.func == ping_service_task]
        # Get services that don't have jobs
        need_ping = [s for s in db.Service.find() if unicode(s._id) not in stat_jobs]
        # Schedule the ones that do not
        for s in need_ping:
            s.schedule_ping(cancel=False)

        # Make sure each service has a harvest job
        harvest_jobs = [unicode(job.args[0]) for job in jobs if job.func == harvest]
        # Get services that don't have jobs
        need_harvest = [s for s in db.Service.find() if unicode(s._id) not in harvest_jobs]
        # Schedule the ones that do not
        for s in need_harvest:
            s.schedule_harvest(cancel=False)


    return "Regulated %s reindex jobs, %s ping jobs, %s harvest jobs, and deleted %s old services" % (len(reindex_services_jobs), len(need_ping), len(need_harvest), len(deletes))
Пример #47
0
        self.update = {'$set': {'ping_job_id': None}}


# Stats
from ioos_catalog.models import stat


class StatMigration(DocumentMigration):
    # add any migrations here named "allmigration_*"
    pass


# Datasets
from ioos_catalog.models import dataset


class DatasetMigration(DocumentMigration):
    # add any migrations here named "allmigration_*"
    pass


with app.app_context():
    migration = ServiceMigration(service.Service)
    migration.migrate_all(collection=db['services'])

    migration = StatMigration(stat.Stat)
    migration.migrate_all(collection=db['stats'])

    migration = DatasetMigration(dataset.Dataset)
    migration.migrate_all(collection=db['datasets'])
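The StatMigration and DatasetMigration classes above are intentionally empty placeholders. When a migration is needed, the convention (seen in the other migration examples in this collection) is to add a method named "allmigration_*" that sets a target query and an update modifier. A hypothetical sketch, reusing the DocumentMigration base already imported above; the 'harvest_job_id' field is made up purely for illustration:

class ExampleDatasetMigration(DocumentMigration):
    def allmigration01__add_harvest_job_id_field(self):
        # documents that do not have the field yet...
        self.target = {'harvest_job_id': {'$exists': False}}
        # ...get it added with a default value
        self.update = {'$set': {'harvest_job_id': None}}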
Пример #48
0
    def process_station(self, uid):
        """ Makes a DescribeSensor request based on a 'uid' parameter being a station procedure """

        GML_NS = "http://www.opengis.net/gml"
        XLINK_NS = "http://www.w3.org/1999/xlink"

        with app.app_context():

            metadata_value = etree.fromstring(
                self.sos.describe_sensor(
                    outputFormat=
                    'text/xml;subtype="sensorML/1.0.1/profiles/ioos_sos/1.0"',
                    procedure=uid))
            station_ds = IoosDescribeSensor(metadata_value)

            unique_id = station_ds.id
            if unique_id is None:
                app.logger.warn(
                    "Could not get a 'stationID' from the SensorML identifiers.  Looking for a definition of 'http://mmisw.org/ont/ioos/definition/stationID'"
                )
                return

            dataset = db.Dataset.find_one({'uid': unicode(unique_id)})
            if dataset is None:
                dataset = db.Dataset()
                dataset.uid = unicode(unique_id)

            # Find service reference in Dataset.services and remove (to replace it)
            tmp = dataset.services[:]
            for d in tmp:
                if d['service_id'] == self.service.get('_id'):
                    dataset.services.remove(d)

            # Parsing messages
            messages = []

            # NAME
            name = unicode_or_none(station_ds.shortName)
            if name is None:
                messages.append(
                    u"Could not get a 'shortName' from the SensorML identifiers.  Looking for a definition of 'http://mmisw.org/ont/ioos/definition/shortName'"
                )

            # DESCRIPTION
            description = unicode_or_none(station_ds.longName)
            if description is None:
                messages.append(
                    u"Could not get a 'longName' from the SensorML identifiers.  Looking for a definition of 'http://mmisw.org/ont/ioos/definition/longName'"
                )

            # PLATFORM TYPE
            asset_type = unicode_or_none(station_ds.platformType)
            if asset_type is None:
                messages.append(
                    u"Could not get a 'platformType' from the SensorML identifiers.  Looking for a definition of 'http://mmisw.org/ont/ioos/definition/platformType'"
                )

            # LOCATION is in GML
            gj = None
            loc = station_ds.location
            if loc is not None and loc.tag == "{%s}Point" % GML_NS:
                pos_element = loc.find("{%s}pos" % GML_NS)
                # strip out points
                positions = map(float, testXMLValue(pos_element).split(" "))
                crs = Crs(testXMLAttribute(pos_element, "srsName"))
                if crs.axisorder == "yx":
                    gj = json.loads(
                        geojson.dumps(
                            geojson.Point([positions[1], positions[0]])))
                else:
                    gj = json.loads(
                        geojson.dumps(
                            geojson.Point([positions[0], positions[1]])))
            else:
                messages.append(
                    u"Found an unrecognized child of the sml:location element and did not attempt to process it: %s"
                    % etree.tostring(loc).strip())

            service = {
                # Reset service
                'name': name,
                'description': description,
                'service_type': self.service.get('service_type'),
                'service_id': ObjectId(self.service.get('_id')),
                'data_provider': self.service.get('data_provider'),
                'metadata_type': u'sensorml',
                'metadata_value':
                unicode(etree.tostring(metadata_value)).strip(),
                'messages': map(unicode, messages),
                'keywords': map(unicode, sorted(station_ds.keywords)),
                'variables': map(unicode, sorted(station_ds.variables)),
                'asset_type': asset_type,
                'geojson': gj,
                'updated': datetime.utcnow()
            }

            dataset.services.append(service)
            dataset.updated = datetime.utcnow()
            dataset.save()
            return "Harvested"
Пример #49
0
def initialize_captcha_db():
    with app.app_context():
        captcha.ext_db.create_all()
        app.logger.info("Captcha DB Initialized")
Пример #50
0
    def allmigration07__add_extra_url_field(self):
        self.target = {'extra_url':{'$exists': False}}
        self.update = {'$set':{'extra_url': None}}

# Datasets
from ioos_catalog.models import dataset
class DatasetMigration(DocumentMigration):
    # add any migrations here named "allmigration_*"
    def allmigration01__add_active_field(self):
        self.target = {'active' : {'$exists' : False}}
        self.update = {'$set' : {'active' : False}}

# Metadatas
from ioos_catalog.models import metadata
class MetadataMigration(DocumentMigration):
    def allmigration01__add_active_field(self):
        self.target = {'active' : {'$exists' : False}}
        self.update = {'$set' : {'active' : False}}


with app.app_context():
    migration = ServiceMigration(service.Service)
    migration.migrate_all(collection=db['services'])

    migration = DatasetMigration(dataset.Dataset)
    migration.migrate_all(collection=db['datasets'])

    migration = MetadataMigration(metadata.Metadata)
    migration.migrate_all(collection=db['metadatas'])

Пример #51
0
def reindex_services():
    region_map =    {   'AOOS':         '1706F520-2647-4A33-B7BF-592FAFDE4B45',
                        'CARICOOS':     '117F1684-A5E3-400E-98D8-A270BDBA1603',
                        'CENCOOS':      '4BA5624D-A61F-4C7E-BAEE-7F8BDDB8D9C4',
                        'GCOOS':        '003747E7-4818-43CD-937D-44D5B8E2F4E9',
                        'GLOS':         'B664427E-6953-4517-A874-78DDBBD3893E',
                        'MARACOOS':     'C664F631-6E53-4108-B8DD-EFADF558E408',
                        'NANOOS':       '254CCFC0-E408-4E13-BD62-87567E7586BB',
                        'NERACOOS':     'E41F4FCD-0297-415D-AC53-967B970C3A3E',
                        'PacIOOS':      '68FF11D8-D66B-45EE-B33A-21919BB26421',
                        'SCCOOS':       'B70B3E3C-3851-4BA9-8E9B-C9F195DCEAC7',
                        'SECOORA':      'B3EA8869-B726-4E39-898A-299E53ABBC98' }
                        #'NOS/CO-OPS':   '72E748DF-23B1-4E80-A2C4-81E70783094A',
                        #'USACE':        '73019DFF-2E01-4800-91CD-0B3F812256A7',
                        #'NAVY':         '3B94DAAE-B7E9-4789-993B-0045AD9149D9',
                        #'NDBC':         '828981B0-0039-4360-9788-E788FA6B0875',
                        #'USGS/CMGP':    'C6F11F00-C2BD-4AC6-8E2C-013E16F4932E' }

    services =      {   'SOS'       :   'urn:x-esri:specification:ServiceType:sos:url',
                        'WMS'       :   'urn:x-esri:specification:ServiceType:wms:url',
                        'WCS'       :   'urn:x-esri:specification:ServiceType:wcs:url',
                        'DAP'       :   'urn:x-esri:specification:ServiceType:odp:url' }

    endpoint = 'http://www.ngdc.noaa.gov/geoportal/csw' # NGDC Geoportal

    c = csw.CatalogueServiceWeb(endpoint, timeout=120)

    ns = Namespaces()

    with app.app_context():
        for region,uuid in region_map.iteritems():
            # Setup uuid filter
            uuid_filter = fes.PropertyIsEqualTo(propertyname='sys.siteuuid', literal="{%s}" % uuid)

            # Make CSW request
            c.getrecords2([uuid_filter], esn='full', maxrecords=999999)

            for name, record in c.records.iteritems():

                # @TODO: unfortunately CSW does not provide us with contact info, so
                # we must request it manually
                contact_email = ""
                metadata_url = None

                iso_ref = [x['url'] for x in record.references if x['scheme'] == 'urn:x-esri:specification:ServiceType:ArcIMS:Metadata:Document']
                if len(iso_ref):
                    metadata_url = iso_ref[0]

                    # Don't query for contact info right now.  It takes WAY too long.
                    #r = requests.get(iso_ref[0])
                    #r.raise_for_status()
                    #node = ET.fromstring(r.content)
                    #safe = nspath_eval("gmd:CI_ResponsibleParty/gmd:contactInfo/gmd:CI_Contact/gmd:address/gmd:CI_Address/gmd:electronicMailAddress/gco:CharacterString", ns.get_namespaces())
                    #contact_node = node.find(".//" + safe)
                    #if contact_node is not None and contact_node.text != "":
                    #    contact_email = contact_node.text
                    #    if " or " in contact_email:
                    #        contact_email = ",".join(contact_email.split(" or "))

                for ref in record.references:

                    # We are only interested in the 'services'
                    if ref["scheme"] in services.values():
                        url = unicode(ref["url"])
                        s =   db.Service.find_one({ 'data_provider' : unicode(region), 'url' : url })
                        if s is None:
                            s               = db.Service()
                            s.url           = url
                            s.data_provider = unicode(region)

                        s.service_id        = unicode(name)
                        s.name              = unicode(record.title)
                        s.service_type      = unicode(next((k for k,v in services.items() if v == ref["scheme"])))
                        s.interval          = 3600 # 1 hour
                        s.tld               = unicode(urlparse(url).netloc)
                        s.updated           = datetime.utcnow()
                        s.contact           = unicode(contact_email)
                        s.metadata_url      = unicode(metadata_url)
                        s.save()
                        s.schedule_harvest()
Пример #52
0
def reindex_services():
    region_map = {
        'AOOS': '1706F520-2647-4A33-B7BF-592FAFDE4B45',
        'CARICOOS': '117F1684-A5E3-400E-98D8-A270BDBA1603',
        'CENCOOS': '4BA5624D-A61F-4C7E-BAEE-7F8BDDB8D9C4',
        'GCOOS': '003747E7-4818-43CD-937D-44D5B8E2F4E9',
        'GLOS': 'B664427E-6953-4517-A874-78DDBBD3893E',
        'MARACOOS': 'C664F631-6E53-4108-B8DD-EFADF558E408',
        'NANOOS': '254CCFC0-E408-4E13-BD62-87567E7586BB',
        'NERACOOS': 'E41F4FCD-0297-415D-AC53-967B970C3A3E',
        'PacIOOS': '68FF11D8-D66B-45EE-B33A-21919BB26421',
        'SCCOOS': 'B70B3E3C-3851-4BA9-8E9B-C9F195DCEAC7',
        'SECOORA': 'B3EA8869-B726-4E39-898A-299E53ABBC98'
    }
    #'NOS/CO-OPS':   '72E748DF-23B1-4E80-A2C4-81E70783094A',
    #'USACE':        '73019DFF-2E01-4800-91CD-0B3F812256A7',
    #'NAVY':         '3B94DAAE-B7E9-4789-993B-0045AD9149D9',
    #'NDBC':         '828981B0-0039-4360-9788-E788FA6B0875',
    #'USGS/CMGP':    'C6F11F00-C2BD-4AC6-8E2C-013E16F4932E' }

    services = {
        'SOS': 'urn:x-esri:specification:ServiceType:sos:url',
        'WMS': 'urn:x-esri:specification:ServiceType:wms:url',
        'WCS': 'urn:x-esri:specification:ServiceType:wcs:url',
        'DAP': 'urn:x-esri:specification:ServiceType:odp:url'
    }

    endpoint = 'http://www.ngdc.noaa.gov/geoportal/csw'  # NGDC Geoportal

    c = csw.CatalogueServiceWeb(endpoint, timeout=120)

    ns = Namespaces()

    with app.app_context():
        for region, uuid in region_map.iteritems():
            # Setup uuid filter
            uuid_filter = fes.PropertyIsEqualTo(propertyname='sys.siteuuid',
                                                literal="{%s}" % uuid)

            # Make CSW request
            c.getrecords2([uuid_filter], esn='full', maxrecords=999999)

            for name, record in c.records.iteritems():

                # @TODO: unfortunately CSW does not provide us with contact info, so
                # we must request it manually
                contact_email = ""
                metadata_url = None

                iso_ref = [
                    x['url'] for x in record.references if x['scheme'] ==
                    'urn:x-esri:specification:ServiceType:ArcIMS:Metadata:Document'
                ]
                if len(iso_ref):
                    metadata_url = iso_ref[0]

                    # Don't query for contact info right now.  It takes WAY too long.
                    #r = requests.get(iso_ref[0])
                    #r.raise_for_status()
                    #node = ET.fromstring(r.content)
                    #safe = nspath_eval("gmd:CI_ResponsibleParty/gmd:contactInfo/gmd:CI_Contact/gmd:address/gmd:CI_Address/gmd:electronicMailAddress/gco:CharacterString", ns.get_namespaces())
                    #contact_node = node.find(".//" + safe)
                    #if contact_node is not None and contact_node.text != "":
                    #    contact_email = contact_node.text
                    #    if " or " in contact_email:
                    #        contact_email = ",".join(contact_email.split(" or "))

                for ref in record.references:

                    # We are only interested in the 'services'
                    if ref["scheme"] in services.values():
                        url = unicode(ref["url"])
                        s = db.Service.find_one({
                            'data_provider': unicode(region),
                            'url': url
                        })
                        if s is None:
                            s = db.Service()
                            s.url = url
                            s.data_provider = unicode(region)

                        s.service_id = unicode(name)
                        s.name = unicode(record.title)
                        s.service_type = unicode(
                            next((k for k, v in services.items()
                                  if v == ref["scheme"])))
                        s.interval = 3600  # 1 hour
                        s.tld = unicode(urlparse(url).netloc)
                        s.updated = datetime.utcnow()
                        s.contact = unicode(contact_email)
                        s.metadata_url = unicode(metadata_url)
                        s.save()
                        s.schedule_harvest()
Пример #53
0
def migrate():
    with app.app_context():
        migrate_names()
        migrate_active_datasets()
        migrate_active_metadata()
        app.logger.info("Migration 2014-08-27 complete")
Пример #54
0
    def harvest(self):
        """
        Identify the type of CF dataset this is:
          * UGRID
          * CGRID
          * RGRID
          * DSG
        """

        try:
            cd = CommonDataset.open(self.service.get('url'))
        except Exception as e:
            app.logger.error("Could not open DAP dataset from '%s'\n"
                             "Exception %s: %s" % (self.service.get('url'),
                                                   type(e).__name__, e))
            return 'Not harvested'


        # For DAP, the unique ID is the URL
        unique_id = self.service.get('url')

        with app.app_context():
            dataset = db.Dataset.find_one( { 'uid' : unicode(unique_id) } )
            if dataset is None:
                dataset = db.Dataset()
                dataset.uid = unicode(unique_id)
                dataset['active'] = True

        # Find service reference in Dataset.services and remove (to replace it)
        tmp = dataset.services[:]
        for d in tmp:
            if d['service_id'] == self.service.get('_id'):
                dataset.services.remove(d)

        # Parsing messages
        messages = []

        # NAME
        name = None
        try:
            name = unicode_or_none(cd.nc.getncattr('title'))
        except AttributeError:
            messages.append(u"Could not get dataset name.  No global attribute named 'title'.")

        # DESCRIPTION
        description = None
        try:
            description = unicode_or_none(cd.nc.getncattr('summary'))
        except AttributeError:
            messages.append(u"Could not get dataset description.  No global attribute named 'summary'.")

        # KEYWORDS
        keywords = []
        try:
            keywords = sorted(map(lambda x: unicode(x.strip()), cd.nc.getncattr('keywords').split(",")))
        except AttributeError:
            messages.append(u"Could not get dataset keywords.  No global attribute named 'keywords' or was not comma seperated list.")

        # VARIABLES
        prefix    = ""
        # Add additional prefix mappings as they become available.
        try:
            standard_name_vocabulary = unicode(cd.nc.getncattr("standard_name_vocabulary"))

            cf_regex = [re.compile("CF-"), re.compile('http://www.cgd.ucar.edu/cms/eaton/cf-metadata/standard_name.html')]

            for reg in cf_regex:
                if reg.match(standard_name_vocabulary) is not None:
                    prefix = "http://mmisw.org/ont/cf/parameter/"
                    break
        except AttributeError:
            pass

        # Get variables with a standard_name
        std_variables = [cd.get_varname_from_stdname(x)[0] for x in self.get_standard_variables(cd.nc) if x not in self.STD_AXIS_NAMES and len(cd.nc.variables[cd.get_varname_from_stdname(x)[0]].shape) > 0]

        # Get variables that are not axis variables or metadata variables and are not already in the 'std_variables' variable
        non_std_variables = list(set([x for x in cd.nc.variables if x not in itertools.chain(_possibley, _possiblex, _possiblez, _possiblet, self.METADATA_VAR_NAMES, self.COMMON_AXIS_NAMES) and len(cd.nc.variables[x].shape) > 0 and x not in std_variables]))

        axis_names = DapHarvest.get_axis_variables(cd.nc)
        """
        var_to_get_geo_from = None
        if len(std_names) > 0:
            var_to_get_geo_from = cd.get_varname_from_stdname(std_names[-1])[0]
            messages.append(u"Variable '%s' with standard name '%s' was used to calculate geometry." % (var_to_get_geo_from, std_names[-1]))
        else:
            # No idea which variable to generate geometry from... try to factor variables with a shape > 1.
            try:
                var_to_get_geo_from = [x for x in variables if len(cd.nc.variables[x].shape) > 1][-1]
            except IndexError:
                messages.append(u"Could not find any non-axis variables to compute geometry from.")
            else:
                messages.append(u"No 'standard_name' attributes were found on non-axis variables.  Variable '%s' was used to calculate geometry." % var_to_get_geo_from)
        """

        # LOCATION (from Paegan)
        # Try POLYGON and fall back to BBOX

        # paegan does not support ugrid, so try to detect this condition and skip
        is_ugrid = False
        is_trajectory = False
        for vname, v in cd.nc.variables.iteritems():
            if 'cf_role' in v.ncattrs():
                if v.getncattr('cf_role') == 'mesh_topology':
                    is_ugrid = True
                    break
                elif v.getncattr('cf_role') == 'trajectory_id':
                    is_trajectory = True
                    break

        gj = None

        if is_ugrid:
            messages.append(u"The underlying 'Paegan' data access library does not support UGRID and cannot parse geometry.")
        elif is_trajectory:
            coord_names = {}
            # try to get info for x, y, z, t axes
            for v in itertools.chain(std_variables, non_std_variables):
                try:
                    coord_names = cd.get_coord_names(v, **axis_names)

                    if coord_names['xname'] is not None and \
                       coord_names['yname'] is not None:
                        break
                except (AssertionError, AttributeError, ValueError, KeyError):
                    pass
            else:
                messages.append(u"Trajectory discovered but could not detect coordinate variables using the underlying 'Paegan' data access library.")

            if 'xname' in coord_names:
                try:
                    xvar = cd.nc.variables[coord_names['xname']]
                    yvar = cd.nc.variables[coord_names['yname']]

                    # one less order of magnitude eg 390000 -> 10000
                    slice_factor = 10 ** (int(math.log10(xvar.size)) - 1)

                    xs = np.concatenate((xvar[::slice_factor], xvar[-1:]))
                    ys = np.concatenate((yvar[::slice_factor], yvar[-1:]))
                    # both coords must be valid to have a valid vertex
                    # get rid of any nans and unreasonable lon/lats
                    valid_idx = ((~np.isnan(xs)) & (np.absolute(xs) <= 180) &
                                 (~np.isnan(ys)) & (np.absolute(ys) <= 90))

                    xs = xs[valid_idx]
                    ys = ys[valid_idx]
                    # Shapely seems to require float64 values or incorrect
                    # values will propagate for the generated lineString
                    # if the array is not numpy's float64 dtype
                    lineCoords = np.array([xs, ys]).T.astype('float64')

                    gj = mapping(asLineString(lineCoords))

                    messages.append(u"Variable %s was used to calculate "
                                    u"trajectory geometry, and is a "
                                    u"naive sampling." % v)

                except (AssertionError, AttributeError,
                        ValueError, KeyError, IndexError) as e:
                    app.logger.warn("Trajectory error occured: %s", e)
                    messages.append(u"Trajectory discovered but could not create a geometry.")

        else:
            for v in itertools.chain(std_variables, non_std_variables):
                try:
                    gj = mapping(cd.getboundingpolygon(var=v, **axis_names
                                                       ).simplify(0.5))
                except (AttributeError, AssertionError, ValueError,
                        KeyError, IndexError):
                    try:
                        # Polygon calculation failed; log it and fall back to
                        # a bounding box (or a single point) from get_bbox
                        app.logger.exception("Error calculating bounding box")

                        # handles "points" aka single position NCELLs
                        bbox = cd.getbbox(var=v, **axis_names)
                        gj = self.get_bbox_or_point(bbox)

                    except (AttributeError, AssertionError, ValueError,
                            KeyError, IndexError):
                        pass

                if gj is not None:
                    # We computed something, break out of loop.
                    messages.append(u"Variable %s was used to calculate geometry." % v)
                    break

            if gj is None: # Try the globals
                gj = self.global_bounding_box(cd.nc)
                messages.append(u"Bounding Box calculated using global attributes")
            if gj is None:
                messages.append(u"The underlying 'Paegan' data access library could not determine a bounding BOX for this dataset.")
                messages.append(u"The underlying 'Paegan' data access library could not determine a bounding POLYGON for this dataset.")
                messages.append(u"Failed to calculate geometry using all of the following variables: %s" % ", ".join(itertools.chain(std_variables, non_std_variables)))





        # TODO: compute bounding box using global attributes


        final_var_names = []
        if prefix == "":
            messages.append(u"Could not find a standard name vocabulary.  No global attribute named 'standard_name_vocabulary'.  Variable list may be incorrect or contain non-measured quantities.")
            final_var_names = non_std_variables + std_variables
        else:
            final_var_names = non_std_variables + list(map(unicode, ["%s%s" % (prefix, cd.nc.variables[x].getncattr("standard_name")) for x in std_variables]))

        service = {
            'name':           name,
            'description':    description,
            'service_type':   self.service.get('service_type'),
            'service_id':     ObjectId(self.service.get('_id')),
            'data_provider':  self.service.get('data_provider'),
            'metadata_type':  u'ncml',
            'metadata_value': unicode(dataset2ncml(cd.nc, url=self.service.get('url'))),
            'messages':       map(unicode, messages),
            'keywords':       keywords,
            'variables':      map(unicode, final_var_names),
            'asset_type':     get_common_name(DapHarvest.get_asset_type(cd)),
            'geojson':        gj,
            'updated':        datetime.utcnow()
        }

        with app.app_context():
            dataset.services.append(service)
            dataset.updated = datetime.utcnow()
            dataset.save()

        ncdataset = Dataset(self.service.get('url'))
        scores = self.ccheck_dataset(ncdataset)
        metamap = self.metamap_dataset(ncdataset)

        try:
            metadata_rec = self.save_ccheck_dataset('ioos', dataset._id, scores, metamap)
        except Exception as e:
            metadata_rec = None
            app.logger.error("could not save compliancecheck/metamap information", exc_info=True)

        return "Harvested"
Пример #55
0
    def process_station(self, uid):
        """ Makes a DescribeSensor request based on a 'uid' parameter being a station procedure """

        GML_NS   = "http://www.opengis.net/gml"
        XLINK_NS = "http://www.w3.org/1999/xlink"

        with app.app_context():

            app.logger.info("process_station: %s", uid)

            metadata_value = etree.fromstring(self._describe_sensor(uid))
            sensor_ml      = SensorML(metadata_value)
            station_ds     = IoosDescribeSensor(metadata_value)

            unique_id = station_ds.id
            if unique_id is None:
                app.logger.warn("Could not get a 'stationID' from the SensorML identifiers.  Looking for a definition of 'http://mmisw.org/ont/ioos/definition/stationID'")
                return

            dataset = db.Dataset.find_one( { 'uid' : unicode(unique_id) } )
            if dataset is None:
                dataset = db.Dataset()
                dataset.uid = unicode(unique_id)
                dataset['active'] = True

            # Find service reference in Dataset.services and remove (to replace it)
            tmp = dataset.services[:]
            for d in tmp:
                if d['service_id'] == self.service.get('_id'):
                    dataset.services.remove(d)

            # Parsing messages
            messages = []

            # NAME
            name = unicode_or_none(station_ds.shortName)
            if name is None:
                messages.append(u"Could not get a 'shortName' from the SensorML identifiers.  Looking for a definition of 'http://mmisw.org/ont/ioos/definition/shortName'")

            # DESCRIPTION
            description = unicode_or_none(station_ds.longName)
            if description is None:
                messages.append(u"Could not get a 'longName' from the SensorML identifiers.  Looking for a definition of 'http://mmisw.org/ont/ioos/definition/longName'")

            # PLATFORM TYPE
            asset_type = unicode_or_none(station_ds.platformType)
            if asset_type is None:
                messages.append(u"Could not get a 'platformType' from the SensorML identifiers.  Looking for a definition of 'http://mmisw.org/ont/ioos/definition/platformType'")

            # LOCATION is in GML
            gj = None
            loc = station_ds.location
            if loc is not None and loc.tag == "{%s}Point" % GML_NS:
                pos_element = loc.find("{%s}pos" % GML_NS)
                # strip out points
                positions = map(float, testXMLValue(pos_element).split(" "))

                for el in [pos_element, loc]:
                    srs_name = testXMLAttribute(el, "srsName")
                    if srs_name:
                        crs = Crs(srs_name)
                        if crs.axisorder == "yx":
                            gj = json.loads(geojson.dumps(geojson.Point([positions[1], positions[0]])))
                        else:
                            gj = json.loads(geojson.dumps(geojson.Point([positions[0], positions[1]])))
                        break
                else:
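                    # for/else: runs only when the loop above never hit
                    # 'break', i.e. no srsName attribute was found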
                    if positions:
                        messages.append(u"Position(s) found but could not parse SRS: %s, %s" % (positions, srs_name))

            else:
                messages.append(u"Found an unrecognized child of the sml:location element and did not attempt to process it: %s" % loc)

            meta_str = unicode(etree.tostring(metadata_value)).strip()
            if len(meta_str) > 4000000:
                messages.append(u'Metadata document was too large to store (len: %s)' % len(meta_str))
                meta_str = u''

            service = {
                # Reset service
                'name'              : name,
                'description'       : description,
                'service_type'      : self.service.get('service_type'),
                'service_id'        : ObjectId(self.service.get('_id')),
                'data_provider'     : self.service.get('data_provider'),
                'metadata_type'     : u'sensorml',
                'metadata_value'    : u'',
                'messages'          : map(unicode, messages),
                'keywords'          : map(unicode, sorted(station_ds.keywords)),
                'variables'         : map(unicode, sorted(station_ds.variables)),
                'asset_type'        : get_common_name(asset_type),
                'geojson'           : gj,
                'updated'           : datetime.utcnow()
            }

            dataset.services.append(service)
            dataset.updated = datetime.utcnow()
            dataset.save()

            # do compliance checker / metadata now
            scores = self.ccheck_station(sensor_ml)
            metamap = self.metamap_station(sensor_ml)

            try:
                self.save_ccheck_station('ioos', dataset._id, scores, metamap)
            except Exception as e:
                app.logger.warn("could not save compliancecheck/metamap information: %s", e)

            return "Harvested"
Пример #56
0
def reindex_services(filter_regions=None, filter_service_types=None):
    c = csw.CatalogueServiceWeb(endpoint, timeout=120)

    ns = Namespaces()

    filter_regions = filter_regions or region_map.keys()
    filter_service_types = filter_service_types or services.keys()

    with app.app_context():

        new_services = []
        update_services = []

        # get a set of all non-manual, active services for possible deactivation later
        current_services = set((s._id for s in db.Service.find(
            {
                'manual': False,
                'active': True,
                'data_provider': {
                    '$in': filter_regions
                }
            }, {'_id': True})))

        # FIXME: find a more robust mechanism for detecting ERDDAP instances;
        # this would fail if behind a url rewriting/proxying mechanism which
        # removes the 'erddap' portion from the URL.  May want to have GeoPortal
        # use a separate 'scheme' dedicated to ERDDAP for CSW record
        # 'references'

        # workaround for matching ERDDAP endpoints
        # match griddap or tabledap endpoints with html or graph
        # discarding any query string parameters (i.e. some datasets on PacIOOS)
        re_string = r'(^.*erddap/(?:grid|table)dap.*)\.(?:html|graph)(?:\?.*)?$'
        erddap_re = re.compile(re_string)
        erddap_all_re = re.compile(r'(^.*erddap/(?:(?:grid|table|)dap|wms).*)'
                                   r'\.(?:html|graph)(?:\?.*)?$')
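        # For illustration only (hypothetical URL, not from the catalog):
        # given 'http://example.org/erddap/tabledap/someDataset.graph?var=time',
        # erddap_re.search(...).group(1) is
        # 'http://example.org/erddap/tabledap/someDataset', which is the base
        # that gets '.iso19115' appended further below; erddap_all_re also
        # accepts plain 'dap' and 'wms' ERDDAP endpoints.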

        for region, uuid in region_map.iteritems():

            if region not in filter_regions:
                app.logger.info("Skipping region %s due to filter", region)
                continue

            app.logger.info("Requesting region %s", region)

            # Setup uuid filter
            uuid_filter = fes.PropertyIsEqualTo(propertyname='sys.siteuuid',
                                                literal="{%s}" % uuid)

            # Make CSW request
            c.getrecords2([uuid_filter], esn='full', maxrecords=999999)

            for name, record in c.records.iteritems():
                try:
                    # @TODO: unfortunately CSW does not provide us with contact info, so
                    # we must request it manually
                    contact_email = ""
                    metadata_url = None

                    for ref in record.references:
                        try:
                            # TODO: Use a more robust mechanism for detecting
                            # ERDDAP instances aside from relying on the url
                            erddap_match = erddap_re.search(ref['url'])
                            # We are only interested in the 'services'
                            if (ref["scheme"] in services.values()):
                                metadata_url = next((
                                    r['url'] for r in record.references
                                    if r['scheme'] ==
                                    'urn:x-esri:specification:ServiceType:ArcIMS:Metadata:Document'
                                ), None)
                                # strip extension if erddap endpoint
                                url = unicode(ref['url'])
                            elif erddap_match:
                                test_url = (erddap_match.group(1) +
                                            '.iso19115')
                                req = requests.get(test_url)
                                # if we have a valid ERDDAP metadata endpoint,
                                # store it.
                                if req.status_code == 200:
                                    metadata_url = unicode(test_url)
                                else:
                                    app.logger.error('Invalid service URL %s',
                                                     ref['url'])
                                    continue

                                url = get_erddap_url_from_iso(req.content)
                                if url is None:
                                    app.logger.error(ref['url'])
                                    app.logger.error(
                                        "Failed to parse Erddap ISO for %s",
                                        test_url)
                                    continue  # Either not a valid ISO or there's not a valid endpoint

                            # next record if not one of the previously mentioned
                            else:
                                continue
                            # end metadata find block
                            s = db.Service.find_one({
                                'data_provider': unicode(region),
                                'url': url
                            })
                            if s is None:
                                s = db.Service()
                                s.url = unicode(url)
                                s.data_provider = unicode(region)
                                s.manual = False
                                s.active = True

                                new_services.append(s)
                            else:
                                # will run twice if erddap services have
                                # both .html and .graph, but resultant
                                # data should be the same
                                update_services.append(s)

                            s.service_id = unicode(name)
                            s.name = unicode(record.title)
                            s.service_type = unicode(
                                'DAP' if erddap_match else next((
                                    k for k, v in services.items()
                                    if v == ref["scheme"])))
                            s.interval = 3600  # 1 hour
                            s.tld = unicode(urlparse(url).netloc)
                            s.updated = datetime.utcnow()
                            s.contact = unicode(contact_email)
                            s.metadata_url = metadata_url

                            # grab opendap form url if present
                            if s.service_type == 'DAP':
                                possible_refs = [
                                    r['url'] for r in record.references
                                    if r['scheme'] == opendap_form_schema
                                ]
                                if len(possible_refs):
                                    # this is bad, it can grab any associated
                                    # record from the dataset
                                    s.extra_url = unicode(possible_refs[0])

                            # if we see the service, this is "Active", unless we've set manual (then we don't touch)
                            if not s.manual:
                                s.active = True

                            s.save()

                        except Exception as e:
                            app.logger.warn("Could not save service: %s", e)

                except Exception as e:
                    app.logger.warn("Could not save region info: %s", e)

        # DEACTIVATE KNOWN SERVICES
        updated_ids = set((s._id for s in update_services))
        deactivate = list(current_services.difference(updated_ids))

        # bulk update (using pymongo syntax)
        db.services.update(
            {'_id': {'$in': deactivate}},
            {'$set': {'active': False, 'updated': datetime.utcnow()}},
            multi=True,
            upsert=False)

        return "New services: %s, updated services: %s, deactivated services: %s" % (
            len(new_services), len(update_services), len(deactivate))
Пример #57
0
    def harvest(self):
        """
        Identify the type of CF dataset this is:
          * UGRID
          * CGRID
          * RGRID
          * DSG
        """

        METADATA_VAR_NAMES   = [u'crs',
                                u'projection']

        # CF standard names for Axis
        STD_AXIS_NAMES       = [u'latitude',
                                u'longitude',
                                u'time',
                                u'forecast_reference_time',
                                u'forecast_period',
                                u'ocean_sigma',
                                u'ocean_s_coordinate_g1',
                                u'ocean_s_coordinate_g2',
                                u'ocean_s_coordinate',
                                u'ocean_double_sigma',
                                u'ocean_sigma_over_z',
                                u'projection_y_coordinate',
                                u'projection_x_coordinate']

        # Some datasets don't define standard_names on axis variables.  This is used to weed them out based on the
        # actual variable name
        COMMON_AXIS_NAMES    = [u'x',
                                u'y',
                                u'lat',
                                u'latitude',
                                u'lon',
                                u'longitude',
                                u'time',
                                u'time_run',
                                u'time_offset',
                                u'ntimes',
                                u'lat_u',
                                u'lon_u',
                                u'lat_v',
                                u'lon_v',
                                u'lat_rho',
                                u'lon_rho',
                                u'lat_psi']

        cd = CommonDataset.open(self.service.get('url'))

        # For DAP, the unique ID is the URL
        unique_id = self.service.get('url')

        with app.app_context():
            dataset = db.Dataset.find_one( { 'uid' : unicode(unique_id) } )
            if dataset is None:
                dataset = db.Dataset()
                dataset.uid = unicode(unique_id)

        # Find service reference in Dataset.services and remove (to replace it)
        tmp = dataset.services[:]
        for d in tmp:
            if d['service_id'] == self.service.get('_id'):
                dataset.services.remove(d)

        # Parsing messages
        messages = []

        # NAME
        name = None
        try:
            name = unicode_or_none(cd.nc.getncattr('title'))
        except AttributeError:
            messages.append(u"Could not get dataset name.  No global attribute named 'title'.")

        # DESCRIPTION
        description = None
        try:
            description = unicode_or_none(cd.nc.getncattr('summary'))
        except AttributeError:
            messages.append(u"Could not get dataset description.  No global attribute named 'summary'.")

        # KEYWORDS
        keywords = []
        try:
            keywords = sorted(map(lambda x: unicode(x.strip()), cd.nc.getncattr('keywords').split(",")))
        except AttributeError:
            messages.append(u"Could not get dataset keywords.  No global attribute named 'keywords' or was not comma seperated list.")

        # VARIABLES
        prefix    = ""
        # Add additional prefix mappings as they become available.
        try:
            standard_name_vocabulary = unicode(cd.nc.getncattr("standard_name_vocabulary"))

            cf_regex = [re.compile("CF-"), re.compile('http://www.cgd.ucar.edu/cms/eaton/cf-metadata/standard_name.html')]

            for reg in cf_regex:
                if reg.match(standard_name_vocabulary) is not None:
                    prefix = "http://mmisw.org/ont/cf/parameter/"
                    break
        except AttributeError:
            pass

        # Get variables with a standard_name
        std_variables = [cd.get_varname_from_stdname(x)[0] for x in self.get_standard_variables(cd.nc) if x not in STD_AXIS_NAMES and len(cd.nc.variables[cd.get_varname_from_stdname(x)[0]].shape) > 0]

        # Get variables that are not axis variables or metadata variables and are not already in the 'std_variables' variable
        non_std_variables = list(set([x for x in cd.nc.variables if x not in itertools.chain(_possibley, _possiblex, _possiblez, _possiblet, METADATA_VAR_NAMES, COMMON_AXIS_NAMES) and len(cd.nc.variables[x].shape) > 0 and x not in std_variables]))

        """
        var_to_get_geo_from = None
        if len(std_names) > 0:
            var_to_get_geo_from = cd.get_varname_from_stdname(std_names[-1])[0]
            messages.append(u"Variable '%s' with standard name '%s' was used to calculate geometry." % (var_to_get_geo_from, std_names[-1]))
        else:
            # No idea which variable to generate geometry from... try to factor variables with a shape > 1.
            try:
                var_to_get_geo_from = [x for x in variables if len(cd.nc.variables[x].shape) > 1][-1]
            except IndexError:
                messages.append(u"Could not find any non-axis variables to compute geometry from.")
            else:
                messages.append(u"No 'standard_name' attributes were found on non-axis variables.  Variable '%s' was used to calculate geometry." % var_to_get_geo_from)
        """

        # LOCATION (from Paegan)
        # Try POLYGON and fall back to BBOX
        gj = None
        for v in itertools.chain(std_variables, non_std_variables):
            try:
                gj = mapping(cd.getboundingpolygon(var=v))
            except (AttributeError, AssertionError, ValueError):
                try:
                    # Returns a tuple of four coordinates, but box() takes four separate positional arguments
                    # Asterisk magic to expand the tuple into positional arguments
                    gj = mapping(box(*cd.get_bbox(var=v)))
                except (AttributeError, AssertionError, ValueError):
                    pass

            if gj is not None:
                # We computed something, break out of loop.
                messages.append(u"Variable %s was used to calculate geometry." % v)
                break

        if gj is None:
            messages.append(u"The underlying 'Paegan' data access library could not determine a bounding BOX for this dataset.")
            messages.append(u"The underlying 'Paegan' data access library could not determine a bounding POLYGON for this dataset.")
            messages.append(u"Failed to calculate geometry using all of the following variables: %s" % ", ".join(itertools.chain(std_variables, non_std_variables)))

        # TODO: compute bounding box using global attributes
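        # A hedged sketch (not part of the original harvester) addressing the
        # TODO above: if Paegan could not compute a geometry, ACDD-style
        # global attributes could supply a fallback bounding box.  The
        # 'geospatial_*' attribute names are assumptions based on the ACDD
        # convention and may not be present on every dataset.
        if gj is None:
            try:
                gj = mapping(box(float(cd.nc.getncattr('geospatial_lon_min')),
                                 float(cd.nc.getncattr('geospatial_lat_min')),
                                 float(cd.nc.getncattr('geospatial_lon_max')),
                                 float(cd.nc.getncattr('geospatial_lat_max'))))
                messages.append(u"Geometry computed from ACDD global attributes.")
            except (AttributeError, ValueError, TypeError):
                pass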


        final_var_names = []
        if prefix == "":
            messages.append(u"Could not find a standard name vocabulary.  No global attribute named 'standard_name_vocabulary'.  Variable list may be incorrect or contain non-measured quantities.")
            final_var_names = non_std_variables + std_variables
        else:
            final_var_names = non_std_variables + list(map(unicode,
                ["%s%s" % (prefix, cd.nc.variables[x].getncattr("standard_name"))
                 for x in std_variables]))

        service = {
            'name'              : name,
            'description'       : description,
            'service_type'      : self.service.get('service_type'),
            'service_id'        : ObjectId(self.service.get('_id')),
            'data_provider'     : self.service.get('data_provider'),
            'metadata_type'     : u'ncml',
            'metadata_value'    : unicode(dataset2ncml(cd.nc, url=self.service.get('url'))),
            'messages'          : map(unicode, messages),
            'keywords'          : keywords,
            'variables'         : map(unicode, final_var_names),
            'asset_type'        : unicode(cd._datasettype).upper(),
            'geojson'           : gj,
            'updated'           : datetime.utcnow()
        }

        with app.app_context():
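            # Append the rebuilt service entry (replacing the one removed from
            # dataset.services above) and persist the Dataset document.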
            dataset.services.append(service)
            dataset.updated = datetime.utcnow()
            dataset.save()

        return "Harvested"