Exemplo n.º 1
0
    def fetch_stage(self, harvest_object):
        '''
        Fetch the remote document behind a harvest object.

        Objects whose ``status`` extra is ``delete`` need no download and
        succeed immediately. Otherwise the document at the object's
        ``waf_location`` extra is retrieved: an ISO document becomes the
        harvest object content, anything else is stored in the
        ``original_document`` and ``original_format`` extras.

        Returns True on success. Returns False, after saving an object
        error, when no location is defined or the document cannot be
        retrieved.
        '''
        if self._get_object_extra(harvest_object, 'status') == 'delete':
            # Nothing to fetch, hand the object straight to the import stage
            return True

        # Where the remote document lives
        location = self._get_object_extra(harvest_object, 'waf_location')
        if not location:
            message = 'No location defined for object {0}'.format(
                harvest_object.id)
            self._save_object_error(message, harvest_object)
            return False

        # Download the remote document
        try:
            document = self._get_content_as_unicode(location)
        except Exception as err:
            self._save_object_error(
                'Could not harvest WAF link {0}: {1}'.format(location, err),
                harvest_object)
            return False

        standard = guess_standard(document)
        if standard == 'iso':
            # ISO documents are stored directly as the object content
            harvest_object.content = document
            harvest_object.save()
            return True

        # Any other format is kept aside together with its detected format
        for key, value in (('original_document', document),
                           ('original_format', standard)):
            HOExtra(object=harvest_object, key=key, value=value).save()

        return True
Exemplo n.º 2
0
def add_status():
    records = open('wafurls.txt')
    results = open('wafurlsstatus.txt', 'w+')
    headers = 'count,count_with_date,server,status_code,error,standard,id,unapproved,url'
    results.write(headers + '\n')
    writer = csv.DictWriter(
        results, headers.split(',')
    )


    for row in records:
        row_dict = dict(zip('id unapproved url'.split(),row.split()))
        try:
            response = requests.get(row_dict['url'], timeout=60)
            content = response.content
            server = str(response.headers.get('server'))
            if server == 'Microsoft-IIS/7.5':
                scraper = 'iis'
            elif 'apache' in server.lower() or 'nginx' in server.lower() or not response.headers.get('server'):
                scraper = 'apache'
            else:
                scraper = 'other'

            row_dict['status_code'] = str(response.status_code)
            row_dict['server'] = server

            if content and response.status_code == 200:
                extracted_waf = extract_waf(content,row_dict['url'], scraper)
                row_dict['count'] = str(len(extracted_waf))
                row_dict['count_with_date'] = str(len([i for i in extracted_waf if i[1]]))
                if extracted_waf:
                    try:
                        content_doc = requests.get(extracted_waf[0][0], timeout=60).content
                        standard = guess_standard(content_doc)
                        row_dict['standard'] = standard
                    except Exception, e:
                        print 'Error guessing format. Error is', e
            else:
                row_dict['count'] = "0"
                row_dict['count_with_date'] = "0"
Exemplo n.º 3
0
def add_status():
    records = open('wafurls.txt')
    results = open('wafurlsstatus.txt', 'w+')
    headers = 'count,count_with_date,server,status_code,error,standard,id,unapproved,url'
    results.write(headers + '\n')
    writer = csv.DictWriter(results, headers.split(','))

    for row in records:
        row_dict = dict(zip('id unapproved url'.split(), row.split()))
        try:
            response = requests.get(row_dict['url'], timeout=60)
            content = response.content
            server = str(response.headers.get('server'))
            if server == 'Microsoft-IIS/7.5':
                scraper = 'iis'
            elif 'apache' in server.lower() or 'nginx' in server.lower(
            ) or not response.headers.get('server'):
                scraper = 'apache'
            else:
                scraper = 'other'

            row_dict['status_code'] = str(response.status_code)
            row_dict['server'] = server

            if content and response.status_code == 200:
                extracted_waf = extract_waf(content, row_dict['url'], scraper)
                row_dict['count'] = str(len(extracted_waf))
                row_dict['count_with_date'] = str(
                    len([i for i in extracted_waf if i[1]]))
                if extracted_waf:
                    try:
                        content_doc = requests.get(extracted_waf[0][0],
                                                   timeout=60).content
                        standard = guess_standard(content_doc)
                        row_dict['standard'] = standard
                    except Exception, e:
                        print 'Error guessing format. Error is', e
            else:
                row_dict['count'] = "0"
                row_dict['count_with_date'] = "0"
Exemplo n.º 4
0
        if not url:
            self._save_object_error(
                u'No location defined for object {0}'.format(harvest_object.id),
                harvest_object)
            return False

        # Get contents
        try:
            content = self._get_content_as_unicode(url)
        except Exception, e:
            msg = u'Could not harvest WAF link {0}: {1}'.format(url, e)
            self._save_object_error(msg, harvest_object)
            return False

        # Check if it is an ISO document
        document_format = guess_standard(content)
        if document_format == u'iso':
            harvest_object.content = content
            harvest_object.save()
        else:
            extra = HOExtra(
                object=harvest_object,
                key=u'original_document',
                value=content)
            extra.save()

            extra = HOExtra(
                object=harvest_object,
                key=u'original_format',
                value=document_format)
            extra.save()
Exemplo n.º 5
0
class GeoDataGovGeoportalHarvester(CSWHarvester, GeoDataGovHarvester):
    '''
    A Harvester for CSW servers, with customizations for geo.data.gov

    ``fetch_stage`` does not go through CSW: each record is downloaded from
    a ``rest/document`` URL derived from the harvest source URL.
    '''
    def info(self):
        '''Return the name, title and description identifying this harvester.'''
        return {
            'name': 'geoportal',
            'title': 'Geoportal Server',
            'description': 'A Geoportal Server CSW endpoint',
        }

    def output_schema(self):
        '''Return the output schema identifier used by this harvester.'''
        return 'csw'

    def fetch_stage(self, harvest_object):
        '''
        Download the document for ``harvest_object`` and store it.

        An ISO document becomes the harvest object content, an FGDC document
        is stored in the ``original_document`` / ``original_format`` extras,
        and any other format marks the object as ``ignored``.

        :param harvest_object: the object to fetch; its ``guid`` identifies
            the remote record and its ``source.url`` locates the server
        :returns: True when the document was stored, False when it could not
            be downloaded or saved (an object error is recorded) or when its
            format is ignored
        '''
        log = logging.getLogger(__name__ + '.geoportal.fetch')
        log.debug('CswHarvester fetch_stage for object: %s', harvest_object.id)

        url = harvest_object.source.url

        identifier = harvest_object.guid

        # Drop the last two path segments of the source URL (plus any query
        # or fragment) and address the document REST endpoint from there
        parts = urlparse.urlparse(url)
        url = urlparse.urlunparse((parts.scheme, parts.netloc, '/'.join(
            parts.path.rstrip('/').split('/')[:-2]), None, None, None))
        url = url.rstrip('/') + '/rest/document?id=%s' % identifier
        try:
            # Without a timeout a stalled server would block the fetch forever
            response = requests.get(url, timeout=60)
            content = response.content
        except Exception as e:
            # Keep the cause in the saved error so failures can be diagnosed
            self._save_object_error(
                'Error getting the record with GUID %s from %s [%r]' %
                (identifier, url, e), harvest_object)
            return False

        try:
            # Save the fetch contents in the HarvestObject
            # Remove original XML declaration
            content = re.sub(r'<\?xml(.*)\?>', '', content)

            document_format = guess_standard(content)
            if document_format == 'iso':
                harvest_object.content = content
                harvest_object.save()
            elif document_format == 'fgdc':
                extra = HOExtra(object=harvest_object,
                                key='original_document',
                                value=content)
                extra.save()

                extra = HOExtra(object=harvest_object,
                                key='original_format',
                                value=document_format)
                extra.save()
            else:
                # Neither ISO nor FGDC: flag the object and stop here
                harvest_object.report_status = 'ignored'
                harvest_object.save()
                return False
        except Exception as e:
            self._save_object_error(
                'Error saving the harvest object for GUID %s [%r]' %
                (identifier, e), harvest_object)
            return False

        # Report success explicitly instead of falling through with None
        return True
Exemplo n.º 6
0
        if not url:
            self._save_object_error(
                    'No location defined for object {0}'.format(harvest_object.id),
                    harvest_object)
            return False

        # Get contents
        try:
            content = self._get_content_as_unicode(url)
        except Exception, e:
            msg = 'Could not harvest WAF link {0}: {1}'.format(url, e)
            self._save_object_error(msg, harvest_object)
            return False

        # Check if it is an ISO document
        document_format = guess_standard(content)
        if document_format == 'iso':
            harvest_object.content = content
            harvest_object.save()
        else:
            extra = HOExtra(
                    object=harvest_object,
                    key='original_document',
                    value=content)
            extra.save()

            extra = HOExtra(
                    object=harvest_object,
                    key='original_format',
                    value=document_format)
            extra.save()
Exemplo n.º 7
0
class DocHarvester(SpatialHarvester, SingletonPlugin):
    '''A Harvester for individual spatial metadata documents
    TODO: Move to new logic
    '''

    implements(IHarvester)

    def info(self):
        '''Return the name, title and description identifying this harvester.'''
        return {
            u'name': u'single-doc',
            u'title': u'Single spatial metadata document',
            u'description': u'A single spatial metadata document'
            }

    def get_original_url(self, harvest_object_id):
        '''
        Return the URL of the source that a harvest object belongs to.

        :param harvest_object_id: id of the HarvestObject to look up
        :returns: the URL of the object's source, or None when no harvest
            object has that id
        '''
        obj = model.Session.query(HarvestObject).filter(
            HarvestObject.id == harvest_object_id).first()
        if not obj:
            return None

        return obj.source.url

    def gather_stage(self, harvest_job):
        '''
        Download the document at the source URL and create its harvest object.

        The new object is flagged ``change`` (reusing the guid and package id)
        when the source already has a current harvest object, and ``new``
        (with a guid derived from the URL) otherwise. An ISO document is
        stored as the object content; any other format is kept in the
        ``original_document`` / ``original_format`` extras.

        :param harvest_job: the job being gathered; its source provides the
            URL and the configuration
        :returns: a one-element list with the id of the created harvest
            object, or None when the document could not be downloaded (a
            gather error is recorded)
        '''
        log = logging.getLogger(__name__ + u'.individual.gather')
        log.debug(u'DocHarvester gather_stage for job: %r', harvest_job)

        self.harvest_job = harvest_job

        # Get source URL
        url = harvest_job.source.url

        self._set_source_config(harvest_job.source.config)

        # Get contents
        try:
            content = self._get_content_as_unicode(url)
        except Exception as e:
            self._save_gather_error(u'Unable to get content for URL: %s: %r' % (url, e),
                                    harvest_job)
            return None

        # The comparison must be ``==`` so that SQLAlchemy builds a SQL
        # condition: ``is True`` is evaluated by Python itself, always gives
        # False and makes the query match nothing.
        existing_object = model.Session.query(HarvestObject.guid,
                                              HarvestObject.package_id).filter(
            HarvestObject.current == True).filter(  # noqa: E712
            HarvestObject.harvest_source_id == harvest_job.source.id).first()

        def create_extras(url, status):
            '''
            Build the extras attached to the new harvest object.

            :param url: location of the document, stored as ``doc_location``
            :param status: ``new`` or ``change``, stored as ``status``
            '''
            return [HOExtra(key=u'doc_location', value=url),
                    HOExtra(key=u'status', value=status)]

        if not existing_object:
            # First harvest of this source: derive a stable guid from the URL
            guid = hashlib.md5(url.encode(u'utf8', u'ignore')).hexdigest()
            harvest_object = HarvestObject(job=harvest_job,
                                           extras=create_extras(url,
                                                                u'new'),
                                           guid=guid
                                           )
        else:
            harvest_object = HarvestObject(job=harvest_job,
                                           extras=create_extras(url,
                                                                u'change'),
                                           guid=existing_object.guid,
                                           package_id=existing_object.package_id
                                           )

        harvest_object.add()

        # Check if it is an ISO document
        document_format = guess_standard(content)
        if document_format == u'iso':
            harvest_object.content = content
        else:
            extra = HOExtra(
                object=harvest_object,
                key=u'original_document',
                value=content)
            extra.save()

            extra = HOExtra(
                object=harvest_object,
                key=u'original_format',
                value=document_format)
            extra.save()

        harvest_object.save()

        return [harvest_object.id]
Exemplo n.º 8
0
    def gather_stage(self, harvest_job):
        '''
        Download the document at the source URL and create its harvest object.

        The created object is flagged ``change`` (reusing guid and package
        id) when the source already has a current harvest object, and ``new``
        (with a guid derived from the URL) otherwise. An ISO document is
        stored as the object content; any other format goes into the
        ``original_document`` and ``original_format`` extras.

        Returns a one-element list with the id of the created harvest object,
        or None, after saving a gather error, when the document cannot be
        downloaded.
        '''
        logger = logging.getLogger(__name__ + '.individual.gather')
        logger.debug('DocHarvester gather_stage for job: %r', harvest_job)

        self.harvest_job = harvest_job

        # Source location and configuration
        url = harvest_job.source.url
        self._set_source_config(harvest_job.source.config)

        # Download the document
        try:
            document = self._get_content_as_unicode(url)
        except Exception as err:
            message = 'Unable to get content for URL: %s: %r' % (url, err)
            self._save_gather_error(message, harvest_job)
            return None

        # Look for the object currently representing this source
        query = model.Session.query(HarvestObject.guid,
                                    HarvestObject.package_id)
        query = query.filter(HarvestObject.current == True)  # noqa: E712
        query = query.filter(
            HarvestObject.harvest_source_id == harvest_job.source.id)
        previous = query.first()

        if previous:
            status = 'change'
            identity = {'guid': previous.guid,
                        'package_id': previous.package_id}
        else:
            status = 'new'
            digest = hashlib.md5(url.encode('utf8', 'ignore')).hexdigest()
            identity = {'guid': digest}

        extras = [HOExtra(key='doc_location', value=url),
                  HOExtra(key='status', value=status)]
        harvest_object = HarvestObject(job=harvest_job, extras=extras,
                                       **identity)
        harvest_object.add()

        # ISO documents become the content, the rest is kept as extras
        standard = guess_standard(document)
        if standard == 'iso':
            harvest_object.content = document
        else:
            for key, value in (('original_document', document),
                               ('original_format', standard)):
                HOExtra(object=harvest_object, key=key, value=value).save()

        harvest_object.save()

        return [harvest_object.id]