def _gather_entry(self, entry, auth=None):
        # Create a harvest object for each entry
        entry_guid = entry['guid']
        log.debug('gathering %s', entry_guid)
        entry_name = entry['identifier'].replace('v101_', '').replace('.hdf5', '')  # noqa: E501
        entry_restart_date = entry['restart_date']

        package_query = Session.query(Package)
        query_filtered = package_query.filter(Package.name == entry_name)
        package = query_filtered.first()

        if package:
            # Meaning we've previously harvested this,
            # but we may want to reharvest it now.
            previous_obj = Session.query(HarvestObject) \
                .filter(HarvestObject.guid == entry_guid) \
                .filter(HarvestObject.current == True) \
                .first()  # noqa: E712
            if previous_obj:
                previous_obj.current = False
                previous_obj.save()

            if self.update_all:
                log.debug('{} already exists and will be updated.'.format(entry_name))  # noqa: E501
                status = 'change'
            else:
                log.debug('{} will not be updated.'.format(entry_name))
                status = 'unchanged'

            obj = HarvestObject(guid=entry_guid,
                                job=self.job,
                                extras=[
                                    HOExtra(key='status', value=status),
                                    HOExtra(key='restart_date', value=entry_restart_date)
                                ])

            obj.content = entry['content']
            obj.package = package
            obj.save()
            return obj.id

        elif not package:
            # It's a product we haven't harvested before.
            log.debug('{} has not been harvested before. '
                      'Creating a new harvest object.'.format(entry_name))
            obj = HarvestObject(
                guid=entry_guid,
                job=self.job,
                extras=[
                    HOExtra(key='status', value='new'),
                    HOExtra(key='restart_date', value=entry_restart_date)
                ])
            obj.content = entry['content']
            obj.package = None
            obj.save()
            return obj.id
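
The method above returns a single HarvestObject id per feed entry. A gather stage would typically just map it over the parsed entries; a minimal sketch of such a wrapper (the `_gather_entries` name and the entry-parsing step are illustrative, not from the original code):

    def _gather_entries(self, entries, auth=None):
        # Sketch: one harvest object per entry; skip malformed entries.
        ids = []
        for entry in entries:
            try:
                ids.append(self._gather_entry(entry, auth=auth))
            except KeyError as e:
                log.debug('Skipping malformed entry: missing %s', e)
        return ids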
Example #2
    def _run_import(self, xml, job):
        if not model.User.get('harvest'):
            model.User(name='harvest', sysadmin=True).save()
        if not model.Group.get('test'):
            get_action('organization_create')({'user': '******'},
                                              {'name': 'test'})

        record = _get_record(xml)

        metadata = CmdiReader()(record)
        metadata['unified']['owner_org'] = "test"

        harvest_object = HarvestObject()
        harvest_object.content = json.dumps(metadata.getMap())
        harvest_object.id = xml
        harvest_object.guid = xml
        harvest_object.source = job.source
        harvest_object.harvest_source_id = None
        harvest_object.job = job
        harvest_object.save()

        self.harvester.import_stage(harvest_object)
        return harvest_object
Example #3
    def gather_stage(self, harvest_job):
        '''
        The gather stage will receive a HarvestJob object and will be
        responsible for:
            - gathering all the necessary objects to fetch in a later
              stage (e.g. for a CSW server, perform a GetRecords request)
            - creating the necessary HarvestObjects in the database, specifying
              the guid and a reference to its source and job.
            - creating and storing any suitable HarvestGatherErrors that may
              occur.
            - returning a list with all the ids of the created HarvestObjects.

        :param harvest_job: HarvestJob object
        :returns: A list of HarvestObject ids
        '''
        self._set_config(harvest_job.source.config)
        sets = []
        harvest_objs = []
        registry = MetadataRegistry()
        registry.registerReader('oai_dc', oai_dc_reader)
        client = oaipmh.client.Client(harvest_job.source.url, registry)
        try:
            identifier = client.identify()
        except urllib2.URLError:
            self._save_gather_error('Could not gather anything from %s!' %
                                    harvest_job.source.url, harvest_job)
            return None
        domain = identifier.repositoryName()
        group = Group.by_name(domain)
        if not group:
            group = Group(name=domain, description=domain)
        query = self.config.get('query', '')
        try:
            for set_ in client.listSets():
                identifier, name, _ = set_
                # An empty query string matches every set name.
                if query in name:
                    sets.append((identifier, name))
        except NoSetHierarchyError:
            sets.append(('1', 'Default'))
            self._save_gather_error('Could not fetch sets!', harvest_job)

        for set_id, set_name in sets:
            harvest_obj = HarvestObject(job=harvest_job)
            harvest_obj.content = json.dumps({
                'set': set_id,
                'set_name': set_name,
                'domain': domain
            })
            harvest_obj.save()
            harvest_objs.append(harvest_obj.id)
        model.repo.commit()
        return harvest_objs
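
Each harvest object above stores only a small JSON payload; a matching fetch stage is expected to decode it and perform the actual per-set OAI-PMH requests. A minimal sketch of that counterpart (not part of the original harvester; the body is left as a comment):

    def fetch_stage(self, harvest_object):
        # Sketch: decode the payload written by gather_stage above.
        info = json.loads(harvest_object.content)
        set_id = info['set']
        domain = info['domain']
        log.debug('Fetching set %s from %s', set_id, domain)
        # ... perform the per-set listRecords request here ...
        return True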
Example #4
    def gather_stage(self, harvest_job, encoding=None):
        self._set_config(harvest_job.source.config)
        # Request all remote packages
        try:
            content = self._get_content(harvest_job.source.url)
            LOGGER.debug('Grabbing zip file: %s', harvest_job.source.url)

            object_ids = []
            packages = []

            file_content = StringIO.StringIO(content)
            archive = zipfile.ZipFile(file_content, 'r')
            for name in archive.namelist():
                if name.endswith('.json'):
                    archive_content = archive.read(name)
                    if encoding is not None:
                        archive_content = archive_content.decode(encoding)
                    else:
                        archive_content = self.lstrip_bom(archive_content)

                    package = json.loads(archive_content)
                    normalize_api_dataset(package)
                    packages.append(package)
                    obj = HarvestObject(guid=package['name'], job=harvest_job)
                    obj.content = json.dumps(package)
                    obj.save()
                    object_ids.append(obj.id)

        except zipfile.BadZipfile as err:
            self._save_gather_error(err.message, harvest_job)
            return None
        except ContentFetchError as err:
            self._save_gather_error(err.message, harvest_job)
            return None
        except Exception as err:
            error_template = 'Unable to get content for URL: %s: %s'
            error = error_template % (harvest_job.source.url, str(err))
            self._save_gather_error(error, harvest_job)
            return None

        if object_ids:
            # delete obsolete packages
            super(JSONZipBaseHarvester, self).delete_deprecated_datasets(
                packages,
                harvest_job
            )

            return object_ids
        else:
            self._save_gather_error(
                'No packages received for URL: %s' % harvest_job.source.url,
                harvest_job
            )

            return None
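
The loop above relies on a `lstrip_bom` helper that is not shown in this excerpt. A minimal sketch of what such a helper could look like, assuming it only needs to drop a UTF-8 byte-order mark:

    @staticmethod
    def lstrip_bom(content):
        # Drop a leading UTF-8 byte-order mark, if present.
        import codecs
        if content.startswith(codecs.BOM_UTF8):
            return content[len(codecs.BOM_UTF8):]
        return content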
Example #5
    def test_import(self):
        source = HarvestSource(url="http://localhost/test_cmdi", type="cmdi")
        source.save()
        job = HarvestJob(source=source)
        job.save()

        harvest_object = self._run_import("cmdi_1.xml", job)

        self.assertEquals(len(harvest_object.errors), 0, u"\n".join(unicode(error.message) for error in (harvest_object.errors or [])))

        package = get_action('package_show')({'user': '******'}, {'id': 'urn-nbn-fi-lb-20140730180'})

        self.assertEquals(package.get('id', None), 'http://urn.fi/urn:nbn:fi:lb-20140730180')
        self.assertEquals(package.get('name', None), 'urn-nbn-fi-lb-20140730180')
        self.assertEquals(package.get('notes', None), u'{"eng": "Test description"}')
        self.assertEquals(package.get('version', None), '2012-09-07')
        self.assertEquals(package.get('title', []), '{"eng": "Longi Corpus"}')
        self.assertEquals(package.get('license_id', None), 'undernegotiation')

        provider = config['ckan.site_url']
        expected_pid = {u'id': u'http://islrn.org/resources/248-895-085-557-0',
                        u'provider': provider,
                        u'type': u'metadata'}

        self.assertTrue(expected_pid in package.get('pids'))

        model.Session.flush()

        harvest_object = self._run_import("cmdi_2.xml", job)

        self.assertEquals(len(harvest_object.errors), 0, u"\n".join(unicode(error.message) for error in (harvest_object.errors or [])))

        package = get_action('package_show')({'user': '******'}, {'id': 'urn-nbn-fi-lb-20140730186'})

        self.assertEquals(package['temporal_coverage_begin'], '1880')
        self.assertEquals(package['temporal_coverage_end'], '1939')
        self.assertEquals(package.get('license_id', None), 'other')
        # Delete package
        harvest_object = HarvestObject()
        harvest_object.content = None
        harvest_object.id = "test-cmdi-delete"
        harvest_object.guid = "test-cmdi-delete"
        harvest_object.source = job.source
        harvest_object.harvest_source_id = None
        harvest_object.job = job
        harvest_object.package_id = package.get('id')
        harvest_object.report_status = "deleted"
        harvest_object.save()

        self.harvester.import_stage(harvest_object)

        model.Session.flush()
        self.assertEquals(model.Package.get(package['id']).state, 'deleted')
Example #6
    def test_harvester_urlerror(self):
        harv, job = self._create_harvester()
        urllib2.urlopen = realopen
        self.assert_(harv.gather_stage(job) is None)
        errs = Session.query(HarvestGatherError).all()
        self.assert_(len(errs) == 1)
        harv_obj = HarvestObject()
        harv_obj.job = job
        harv_obj.content = json.dumps({'url': "http://foo"})
        # XML error and URL error, plus the lack of a url in the content
        self.assert_(harv.import_stage(harv_obj) == False)
        errs = Session.query(HarvestObjectError).all()
        print errs
        self.assert_(len(errs) == 1)
Example #7
    def _make_retry_lists(self, harvest_job, ident2rec, ident2set, from_until):
        recs = []
        for ident, harv in ident2rec.items():
            info = json.loads(harv.content)
            harv.content = None
            harv.save()
            harvest_obj = HarvestObject(job=harvest_job)
            harvest_obj.content = json.dumps(info)
            harvest_obj.save()
            recs.append(harvest_obj.id)
            log.debug('Retrying record: %s' % harv.id)
        sets = []
        insertion_retries = set()

        def update_until(info, from_until):
            if 'until' not in info:
                return  # Wanted up to current time earlier.
            if 'until' not in from_until:
                del info['until']  # Want up to current time now.
                return
            fu = self._str_from_datetime(from_until['until'])
            if info['until'] < fu:  # Keep the later of the two dates.
                info['until'] = fu

        for name, obj in ident2set.items():
            info = json.loads(obj.content)
            obj.content = None
            obj.save()
            update_until(info, from_until)
            harvest_obj = HarvestObject(job=harvest_job)
            harvest_obj.content = json.dumps(info)
            harvest_obj.save()
            sets.append(harvest_obj.id)
            if 'set' not in info:
                insertion_retries.add(name)
                log.debug('Retrying set insertions: %s' % info['set_name'])
            else:
                log.debug('Retrying set: %s' % info['set_name'])
        return recs, sets, insertion_retries
Example #8
    def gather_stage(self, harvest_job):
        """Gather the URLs to fetch from a URL which has a list of links to
        XML documents containing the DDI documents.
        """
        self._set_config(self.config)
        previous_job = (
            Session.query(HarvestJob)
            .filter(HarvestJob.source == harvest_job.source)
            .filter(HarvestJob.gather_finished != None)  # noqa: E711
            .filter(HarvestJob.id != harvest_job.id)
            .order_by(HarvestJob.gather_finished.desc())
            .limit(1)
            .first()
        )
        if previous_job:
            self.incremental = True
        gather_url = harvest_job.source.url
        try:
            urls = urllib2.urlopen(gather_url)
            harvest_objs = []
            for url in urls.readlines():
                url = url.strip()
                gather = True
                if self.incremental:
                    # Only gather documents modified since the previous
                    # successful gather; check via a HEAD request.
                    request = urllib2.Request(url)
                    request.get_method = lambda: "HEAD"
                    doc_url = urllib2.urlopen(request)
                    lastmod = parser.parse(doc_url.headers["last-modified"],
                                           ignoretz=True)
                    if lastmod < previous_job.gather_finished:
                        log.debug("Skipping unmodified URL: %s", url)
                        gather = False
                if gather:
                    harvest_obj = HarvestObject()
                    harvest_obj.content = json.dumps({"url": url})
                    harvest_obj.job = harvest_job
                    harvest_obj.save()
                    harvest_objs.append(harvest_obj.id)
        except urllib2.URLError:
            self._save_gather_error("Could not gather XML files from URL!",
                                    harvest_job)
            return None
        return harvest_objs
Example #9
    def _run_import(self, xml, job):
        if not model.User.get('harvest'):
            model.User(name='harvest', sysadmin=True).save()
        if not model.Group.get('test'):
            get_action('organization_create')({'user': '******'}, {'name': 'test'})

        record = _get_record(xml)

        metadata = CmdiReader()(record)
        metadata['unified']['owner_org'] = "test"

        harvest_object = HarvestObject()
        harvest_object.content = json.dumps(metadata.getMap())
        harvest_object.id = xml
        harvest_object.guid = xml
        harvest_object.source = job.source
        harvest_object.harvest_source_id = None
        harvest_object.job = job
        harvest_object.save()

        self.harvester.import_stage(harvest_object)
        return harvest_object
Example #10
    def gather_stage(self, harvest_job):
        super(JSONDumpBaseCKANHarvester, self)._set_config(
            harvest_job.source.config
        )

        try:
            content = self._get_content(harvest_job.source.url)
        except ContentFetchError as err:
            self._save_gather_error(err.message, harvest_job)
            return None
        except Exception as err:
            error_template = 'Unable to get content for URL: %s: %s'
            error = error_template % (harvest_job.source.url, str(err))
            self._save_gather_error(error, harvest_job)
            return None

        object_ids = []

        packages = json.loads(content)

        for package in packages:
            normalize_api_dataset(package)
            obj = HarvestObject(guid=package['name'], job=harvest_job)
            obj.content = json.dumps(package)
            obj.save()
            object_ids.append(obj.id)

        if object_ids:
            # delete obsolete packages
            self.delete_deprecated_datasets(packages, harvest_job)
            return object_ids
        else:
            self._save_gather_error(
                'No packages received for URL: %s' % harvest_job.source.url,
                harvest_job
            )
            return None
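
Both the zip- and dump-based harvesters above call a `delete_deprecated_datasets` helper that is not shown in this excerpt. Based on the deletion pattern visible in example #17 below (re-using the current HarvestObject and storing a delete payload), a hedged sketch might look like this; the exact column names and payload format are assumptions:

    def delete_deprecated_datasets(self, packages, harvest_job):
        # Sketch: flag previously harvested datasets that vanished upstream.
        remote_names = set(p['name'] for p in packages)
        current_objs = model.Session.query(HarvestObject) \
            .filter(HarvestObject.current == True) \
            .filter(HarvestObject.harvest_source_id ==
                    harvest_job.source.id).all()  # noqa: E712
        for obj in current_objs:
            if obj.guid not in remote_names:
                obj.job = harvest_job
                obj.content = '{"id":"%s", "delete":true}' % obj.guid
                obj.save()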
Example #11
    def test_import(self):
        source = HarvestSource(url="http://localhost/test_cmdi", type="cmdi")
        source.save()
        job = HarvestJob(source=source)
        job.save()

        harvest_object = self._run_import("cmdi_1.xml", job)
        package_id = json.loads(harvest_object.content)['unified']['id']

        self.assertEquals(
            len(harvest_object.errors), 0, u"\n".join(
                unicode(error.message)
                for error in (harvest_object.errors or [])))

        package = get_action('package_show')({
            'user': '******'
        }, {
            'id': package_id
        })

        self.assertEquals(package.get('name', None),
                          utils.pid_to_name(package.get('id', None)))
        self.assertEquals(utils.get_primary_pid(package),
                          u'http://urn.fi/urn:nbn:fi:lb-20140730180')
        self.assertEquals(package.get('notes', None),
                          u'{"eng": "Test description"}')
        self.assertEquals(package.get('version', None), '2012-09-07')
        self.assertEquals(package.get('title', []), '{"eng": "Longi Corpus"}')
        self.assertEquals(package.get('license_id', None), 'undernegotiation')

        provider = config['ckan.site_url']
        expected_pid = {
            u'id': u'http://islrn.org/resources/248-895-085-557-0',
            u'provider': provider,
            u'type': u'relation',
            u'relation': u'generalRelation'
        }

        self.assertTrue(expected_pid not in package.get('pids'))

        model.Session.flush()

        harvest_object = self._run_import("cmdi_2.xml", job)
        package_id = json.loads(harvest_object.content)['unified']['id']

        self.assertEquals(
            len(harvest_object.errors), 0, u"\n".join(
                unicode(error.message)
                for error in (harvest_object.errors or [])))

        package = get_action('package_show')({
            'user': '******'
        }, {
            'id': package_id
        })

        self.assertEquals(package['temporal_coverage_begin'], '1880')
        self.assertEquals(package['temporal_coverage_end'], '1939')
        self.assertEquals(package.get('license_id', None), 'other')
        # Delete package
        harvest_object = HarvestObject()
        harvest_object.content = None
        harvest_object.id = "test-cmdi-delete"
        harvest_object.guid = "test-cmdi-delete"
        harvest_object.source = job.source
        harvest_object.harvest_source_id = None
        harvest_object.job = job
        harvest_object.package_id = package.get('id')
        harvest_object.report_status = "deleted"
        harvest_object.save()

        self.harvester.import_stage(harvest_object)

        model.Session.flush()
        self.assertEquals(model.Package.get(package['id']).state, 'deleted')
Example #12
    def _crawl_results(self, harvest_url, limit=100, timeout=5):
        """
        Iterate through the results, create harvest objects,
        and return the ids.
        """
        ids = []
        new_counter = 0
        update_counter = 0
        base_url = self.source_config.get('source_url')

        while len(ids) < limit and harvest_url:
            # We'll limit ourselves to one request per second
            start_request = time.time()

            soup = self._make_request(harvest_url, timeout)

            if not soup:
                return ids

            search_results = soup.find('csw:searchresults',
                                       elementset="summary")
            records_returned = search_results['numberofrecordsreturned']
            next_record = search_results['nextrecord']
            number_records_matched = search_results['numberofrecordsmatched']

            if next_record != '0':
                current_record = str(int(next_record) - int(records_returned))
            else:
                current_record = str(int(number_records_matched) - int(records_returned))  # noqa: E501

            # Get the URL for the next loop, or None to break the loop
            # Only works if StartPosition is last URL parameter
            harvest_url = self._get_next_url(harvest_url, records_returned, next_record, limit)  # noqa: E501

            # Get the entries from the results
            entries = self._get_entries_from_results(soup, current_record, next_record)  # noqa: E501

            # Create a harvest object for each entry
            for entry in entries:
                entry_guid = entry['guid']
                entry_name = 'saeon_csag_' + entry['identifier'].lower().replace('.', '_').replace('/', '-')  # noqa: E501

                full_content = {}
                full_content['extra_content'] = self._get_entry_time_and_author(base_url, entry['identifier'], timeout)  # noqa: E501
                full_content['raw_content'] = entry['content']

                package = Session.query(Package) \
                    .filter(Package.name == entry_name).first()

                if package:
                    # Meaning we've previously harvested this,
                    # but we may want to reharvest it now.
                    previous_obj = model.Session.query(HarvestObject) \
                        .filter(HarvestObject.guid == entry_guid) \
                        .filter(HarvestObject.current == True) \
                        .first()  # noqa: E712
                    if previous_obj:
                        previous_obj.current = False
                        previous_obj.save()

                    if self.update_all:
                        log.debug('{} already exists and will be updated.'.format(entry_name))  # noqa: E501
                        status = 'change'
                        update_counter += 1
                    else:
                        log.debug('{} already exists and will not be updated.'.format(entry_name))  # noqa: E501
                        status = 'unchanged'

                    obj = HarvestObject(guid=entry_guid, job=self.job,
                                        extras=[HOExtra(key='status',
                                                value=status),
                                                HOExtra(key='restart_record',
                                                value=entry['restart_record'])])  # noqa: E501
                    obj.content = json.dumps(full_content)
                    obj.package = package
                    obj.save()
                    ids.append(obj.id)
                elif not package:
                    # It's a product we haven't harvested before.
                    log.debug('{} has not been harvested before. Creating a new harvest object.'.format(entry_name))  # noqa: E501
                    obj = HarvestObject(guid=entry_guid, job=self.job,
                                        extras=[HOExtra(key='status',
                                                value='new'),
                                                HOExtra(key='restart_record',
                                                value=entry['restart_record'])])  # noqa: E501
                    new_counter += 1
                    obj.content = json.dumps(full_content)
                    obj.package = None
                    obj.save()
                    ids.append(obj.id)

            end_request = time.time()
            request_time = end_request - start_request
            if request_time < 1.0:
                time.sleep(1 - request_time)

        harvester_msg = '{:<12} | {} | jobID:{} | {} | {}'
        if hasattr(self, 'harvester_logger'):
            timestamp = str(datetime.utcnow())
            self.harvester_logger.info(harvester_msg.format(self.provider,
                                       timestamp, self.job.id, new_counter, update_counter))  # noqa: E128, E501

        return ids
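
The one-request-per-second throttle (the `start_request` / `time.sleep` bookkeeping) recurs in several of these harvesters and can be factored into a small helper. A sketch using only the standard library; this helper is not part of any of the originals:

import time
from contextlib import contextmanager

@contextmanager
def min_request_interval(seconds=1.0):
    # Make the wrapped block take at least `seconds`, sleeping if needed.
    start = time.time()
    yield
    elapsed = time.time() - start
    if elapsed < seconds:
        time.sleep(seconds - elapsed)

With this, the loop body becomes `with min_request_interval(): soup = self._make_request(harvest_url, timeout)` and the manual timing code disappears.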
Example #13
    def _gather_stage(self, harvest_job):
        from_until = self._get_time_limits(harvest_job)
        client, identifier = self._get_client_identifier(
            harvest_job.source.url, harvest_job)
        if not identifier:
            raise RuntimeError('Could not get source identifier.')
        # Get things to retry.
        ident2rec, ident2set = {}, {}
        rec_idents = []
        domain = identifier.repositoryName()
        try:
            args = {self.metadata_prefix_key: self.metadata_prefix_value}
            if not self.config.get('force_all', False):
                args.update(from_until)
            for ident in client.listIdentifiers(**args):
                if ident.identifier() in ident2rec:
                    continue  # On our retry list already, do not fetch twice.
                rec_idents.append(ident.identifier())
        except NoRecordsMatchError:
            log.debug('No records matched: %s' % domain)  # Nothing to get.
        except Exception:
            # Once we know of something specific, handle it separately.
            log.debug(traceback.format_exc())
            self._save_gather_error(
                'Could not fetch identifier list.', harvest_job)
            raise RuntimeError('Could not fetch an identifier list.')
        # Gather the set list here. Member identifiers are fetched in the
        # fetch stage.
        sets = []
        try:
            for set_ in client.listSets():
                identifier, name, _ = set_
                # Is the set due for retry without missing member insertions?
                # A set either failed in retry or misses packages, not both.
                # A set with failed insertions may have new members.
                if name in ident2set:
                    continue
                sets.append((identifier, name,))
        except NoSetHierarchyError:
            log.debug('No sets: %s' % domain)
        except urllib2.URLError:
            # Possibly a timeout.
            self._save_gather_error(
                'Could not fetch a set list.', harvest_job)
            raise RuntimeError('Could not fetch set list.')
        # Since network errors can no longer occur, it is safe to create the
        # harvest objects to return to the caller; nothing crucial is missing.
        harvest_objs, set_objs, insertion_retries = [], [], set()
        for ident in rec_idents:
            info = {'fetch_type': 'record', 'record': ident, 'domain': domain}
            harvest_obj = HarvestObject(job=harvest_job)
            harvest_obj.content = json.dumps(info)
            harvest_obj.save()
            harvest_objs.append(harvest_obj.id)
        log.info('Gathered %i records from %s.' % (len(harvest_objs), domain,))
        # Add sets to retry first.
        harvest_objs.extend(set_objs)
        for set_id, set_name in sets:
            harvest_obj = HarvestObject(job=harvest_job)
            info = {'fetch_type': 'set', 'set': set_id,
                    'set_name': set_name, 'domain': domain}
            if 'from_' in from_until:
                info['from_'] = self._str_from_datetime(from_until['from_'])
            if 'until' in from_until:
                info['until'] = self._str_from_datetime(from_until['until'])
            harvest_obj.content = json.dumps(info)
            harvest_obj.save()
            harvest_objs.append(harvest_obj.id)
        log.info(
            'Gathered %i records/sets from %s.' % (len(harvest_objs), domain,))
        return harvest_objs
Example #14
    def gather_stage(self, harvest_job):
        self.log = logging.getLogger(__file__)
        self.log.debug('VITO Harvester gather_stage for job: %r', harvest_job)
        
        self.job = harvest_job
        self.source_config = self._get_config(harvest_job)
        self.update_all = self.source_config.get('update_all', False)
        interface = INTERFACE(self.source_config, COLLECTION)
        last_product_index = (
            self._get_last_harvesting_index(harvest_job.source_id, interface)
        )
        interface.update_index(last_product_index)
        interface.build_url_date()
        
        path_to_entries = interface.get_entries_path()

        ids = []
        try:
            results = interface.get_results()
            if results:
                entries = self.get_field(results, path_to_entries[:])
            else:
                return ids
        except Timeout as e:
            self._save_gather_error('Request timed out: {}'.format(e), self.job)  # noqa: E501
            return ids
        except Exception as e:
            log.debug('Gather request failed: {}'.format(e))
            return ids
        if entries is None:
            return ids
        elif not isinstance(entries, list):
            entries = [entries]

        identifier_path = interface.get_identifier_path()

        for entry in entries:
            entry_id = self.clean_snakecase(
                self.get_field(entry, identifier_path[:])[0])
            entry_guid = unicode(uuid.uuid4())
            package_query = Session.query(Package)
            query_filtered = package_query.filter(Package.name == entry_id)
            package = query_filtered.first()

            if package:
                # Meaning we've previously harvested this,
                # but we may want to reharvest it now.
                previous_obj = Session.query(HarvestObject) \
                    .filter(HarvestObject.guid == entry_guid) \
                    .filter(HarvestObject.current == True) \
                    .first()  # noqa: E712
                if previous_obj:
                    previous_obj.current = False
                    previous_obj.save()

                if self.update_all:
                    log.debug('{} already exists and will be updated.'.format(
                        entry_id))  # noqa: E501
                    status = 'change'

                else:
                    log.debug(
                        '{} will not be updated.'.format(entry_id))  # noqa: E501
                    status = 'unchanged'

            elif not package:
                # It's a product we haven't harvested before.
                log.debug('{} has not been harvested before. '
                          'Creating a new harvest object.'.format(entry_id))
                status = 'new'
            obj = HarvestObject(
                guid=entry_guid,
                job=self.job,
                extras=[
                    HOExtra(key='status', value=status),
                    HOExtra(key=interface.get_pagination_mechanism(),
                            value=interface.get_index())
                ])
            obj.content = json.dumps(entry)
            obj.package = None if status == 'new' else package
            obj.save()
            interface.increment_index()
            ids.append(obj.id)
        return ids
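
The package lookup, retirement of the previous 'current' object, and the new/change/unchanged decision are repeated almost verbatim in every harvester in this list. A hedged sketch of how the pattern could be factored out; the `_resolve_status` name is illustrative, not from the original code:

    def _resolve_status(self, entry_name, entry_guid):
        # Decide new/change/unchanged and retire the previous current object.
        package = Session.query(Package) \
            .filter(Package.name == entry_name).first()
        if package is None:
            return 'new', None
        previous_obj = Session.query(HarvestObject) \
            .filter(HarvestObject.guid == entry_guid) \
            .filter(HarvestObject.current == True) \
            .first()  # noqa: E712
        if previous_obj:
            previous_obj.current = False
            previous_obj.save()
        status = 'change' if self.update_all else 'unchanged'
        return status, package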
Example #15
    def _crawl_results(self, harvest_url, limit=100, timeout=5, username=None, password=None, provider=None):  # noqa: E501
        """
        Iterate through the results, create harvest objects,
        and return the ids.
        """
        ids = []
        new_counter = 0
        update_counter = 0
        while len(ids) < limit and harvest_url:
            # We'll limit ourselves to one request per second
            start_request = time.time()

            # Make a request to the website
            timestamp = str(datetime.utcnow())
            log_message = '{:<12} | {} | {} | {}s'
            try:
                r = requests.get(harvest_url,
                                 auth=HTTPBasicAuth(username, password),
                                 verify=False, timeout=timeout)
            except Timeout as e:
                self._save_gather_error('Request timed out: {}'.format(e), self.job)  # noqa: E501
                status_code = 408
                if hasattr(self, 'provider_logger'):
                    self.provider_logger.info(log_message.format(
                        self.provider, timestamp, status_code, timeout))
                return ids
            if r.status_code != 200:
                self._save_gather_error('{} error: {}'.format(r.status_code, r.text), self.job)  # noqa: E501
                elapsed = 9999
                if hasattr(self, 'provider_logger'):
                    self.provider_logger.info(log_message.format(self.provider,
                        timestamp, r.status_code, elapsed))  # noqa: E128
                return ids

            if hasattr(self, 'provider_logger'):
                self.provider_logger.info(log_message.format(self.provider,
                    timestamp, r.status_code, r.elapsed.total_seconds()))  # noqa: E128, E501

            soup = Soup(r.content, 'lxml')

            # Get the URL for the next loop, or None to break the loop
            harvest_url = self._get_next_url(soup)

            # Get the entries from the results
            entries = self._get_entries_from_results(soup)

            # Create a harvest object for each entry
            for entry in entries:
                entry_guid = entry['guid']
                entry_name = entry['identifier']
                entry_restart_date = entry['restart_date']

                package = Session.query(Package) \
                    .filter(Package.name == entry_name).first()

                if package:
                    # Meaning we've previously harvested this,
                    # but we may want to reharvest it now.
                    # We need package_show to ensure that all the conversions
                    # are carried out.
                    context = {"user": "******", "ignore_auth": True,
                               "model": model, "session": Session}
                    pkg_dict = logic.get_action('package_show')(context, {"id": package.name})  # noqa: E501
                    previous_obj = model.Session.query(HarvestObject) \
                        .filter(HarvestObject.guid == entry_guid) \
                        .filter(HarvestObject.current == True) \
                        .first()  # noqa: E712
                    if previous_obj:
                        previous_obj.current = False
                        previous_obj.save()

                    if self.update_all:
                        log.debug('{} already exists and will be updated.'.format(entry_name))  # noqa: E501
                        status = 'change'
                        update_counter += 1
                    # E.g., a Sentinel dataset exists,
                    # but doesn't have a NOA resource yet.
                    elif self.flagged_extra and not get_pkg_dict_extra(pkg_dict, self.flagged_extra):  # noqa: E501
                        log.debug('{} already exists and will be extended.'.format(entry_name))  # noqa: E501
                        status = 'change'
                        update_counter += 1
                    else:
                        log.debug('{} will not be updated.'.format(entry_name))  # noqa: E501
                        status = 'unchanged'

                    obj = HarvestObject(guid=entry_guid, job=self.job,
                                        extras=[HOExtra(key='status',
                                                value=status),
                                                HOExtra(key='restart_date',
                                                value=entry_restart_date)])
                    obj.content = entry['content']
                    obj.package = package
                    obj.save()
                    ids.append(obj.id)
                elif not package:
                    # It's a product we haven't harvested before.
                    log.debug('{} has not been harvested before. Creating a new harvest object.'.format(entry_name))  # noqa: E501
                    obj = HarvestObject(guid=entry_guid, job=self.job,
                                        extras=[HOExtra(key='status',
                                                value='new'),
                                                HOExtra(key='restart_date',
                                                value=entry_restart_date)])
                    new_counter += 1
                    obj.content = entry['content']
                    obj.package = None
                    obj.save()
                    ids.append(obj.id)

            end_request = time.time()
            request_time = end_request - start_request
            if request_time < 1.0:
                time.sleep(1 - request_time)

        harvester_msg = '{:<12} | {} | jobID:{} | {} | {}'
        if hasattr(self, 'harvester_logger'):
            timestamp = str(datetime.utcnow())
            self.harvester_logger.info(harvester_msg.format(self.provider,
                                       timestamp, self.job.id, new_counter, update_counter))  # noqa: E128, E501
        return ids
Example #16
    def gather_stage(self, harvest_job):
        self.log = logging.getLogger(__file__)
        self.log.debug('SatcenBetter Harvester gather_stage for job: %r',
                       harvest_job)

        self.job = harvest_job
        self.source_config = self._get_config(harvest_job)
        self.update_all = self.source_config.get('update_all', False)
        interface = INTERFACE(self.source_config, COLLECTION)

        last_product_index = (self._get_last_harvesting_index(
            harvest_job.source_id, interface))
        interface.update_index(last_product_index)
        interface.build_url()

        log.debug('URL: {}'.format(interface.current_url))  # noqa: E501

        ids = []
        try:
            results = interface.get_results()
        except Timeout as e:
            self._save_gather_error('Request timed out: {}'.format(e),
                                    self.job)  # noqa: E501
            return ids
        if not isinstance(results, list):
            self._save_gather_error('{} error: {}'.format(
                results['status_code'], results['message']), self.job)
            return ids

        for entry in results:
            name_path = interface.get_name_path()

            name_url = get_field(entry,
                                 name_path['relative_location'].split(","),
                                 name_path['fixed_attributes'])
            entry_name = parse_name(name_url).lower()
            entry_guid = unicode(uuid.uuid4())
            package_query = Session.query(Package)
            query_filtered = package_query.filter(Package.name == entry_name)
            package = query_filtered.first()

            if package:
                # Meaning we've previously harvested this,
                # but we may want to reharvest it now.
                previous_obj = Session.query(HarvestObject) \
                    .filter(HarvestObject.guid == entry_guid) \
                    .filter(HarvestObject.current == True) \
                    .first()  # noqa: E712
                if previous_obj:
                    previous_obj.current = False
                    previous_obj.save()

                if self.update_all:
                    log.debug('{} already exists and will be updated.'.format(
                        entry_name))  # noqa: E501
                    status = 'change'

                else:
                    log.debug('{} will not be updated.'.format(
                        entry_name))  # noqa: E501
                    status = 'unchanged'

            elif not package:
                # It's a product we haven't harvested before.
                log.debug('{} has not been harvested before. '
                          'Creating a new harvest object.'.format(entry_name))
                status = 'new'
            obj = HarvestObject(
                guid=entry_guid,
                job=self.job,
                extras=[
                    HOExtra(key='status', value=status),
                    HOExtra(key=interface.get_pagination_mechanism(),
                            value=interface.get_index())
                ])
            obj.content = json.dumps(entry)
            obj.package = None if status == 'new' else package
            obj.save()
            interface.increment_index()
            ids.append(obj.id)
        return ids
Example #17
                                    job=harvest_job,
                                    content=json.dumps(pkg_dict))
                obj.save()
                object_ids.append(obj.id)

            for deleted_id in deleted_ids:

                # Original harvest object needs to be updated
                log.debug('Creating HarvestObject to delete %s', deleted_id)
                obj = model.Session.query(HarvestObject)\
                    .filter(
                    HarvestObject.current == True  # noqa
                )\
                    .filter(HarvestObject.guid == deleted_id).one()
                obj.job = harvest_job
                obj.content = '{"id":"%s", "delete":true}' % deleted_id
                obj.save()
                object_ids.append(obj.id)

            return object_ids
        except Exception as e:
            self._save_gather_error('%r' % e.message, harvest_job)

    def _search_for_datasets(self, remote_ckan_base_url, fq_terms=None):
        '''Does a dataset search on a remote CKAN and returns the results.

        Deals with paging to return all the results, not just the first page.
        '''
        base_search_url = remote_ckan_base_url + self._get_search_api_offset()
        params = {'rows': '100', 'start': '0'}
        # There is the worry that datasets will be changed whilst we are paging
Example #18
    def _parse_products(self, products, mosquito_type):
        """
        Iterate through the results, create harvest objects,
        and return the ids.
        """
        ids = []
        new_counter = 0

        # Create a harvest object for each entry
        for entry in products:
            # Add mosquito type on object
            entry['mosquito_type'] = mosquito_type

            # Correct Date
            if entry['dt_placement'].startswith('00'):
                entry['dt_corrected'] = '20' + entry['dt_placement'][2:]

                filename = "{}_{}_{}".format(mosquito_type,
                                             entry['station_id'],
                                             entry['dt_corrected'])

            else:
                filename = "{}_{}_{}".format(mosquito_type,
                                             entry['station_id'],
                                             entry['dt_placement'])

            # Sanitize filename
            filename = self._sanitize_filename(filename)

            # Add coast_mean on aedes for uniqueness
            if mosquito_type == 'aedes':
                filename = filename + '_' + str(
                    int(entry['coast_mean_dist_1000']))

            entry_guid = filename
            entry_name = filename
            entry['filename'] = filename

            entry_restart_date = entry['dt_placement']

            package = Session.query(Package) \
                .filter(Package.name == entry_name).first()

            if package:
                # Meaning we've previously harvested this,
                # but we may want to reharvest it now.
                previous_obj = model.Session.query(HarvestObject) \
                    .filter(HarvestObject.guid == entry_guid) \
                    .filter(HarvestObject.current == True) \
                    .first()  # noqa: E712
                if previous_obj:
                    previous_obj.current = False
                    previous_obj.save()

                if self.update_all:
                    log.debug('{} already exists and will be updated.'.format(
                        entry_name))  # noqa: E501
                    status = 'change'
                else:
                    log.debug('{} will not be updated.'.format(
                        entry_name))  # noqa: E501
                    status = 'unchanged'

                obj = HarvestObject(guid=entry_guid,
                                    job=self.job,
                                    extras=[
                                        HOExtra(key='status', value=status),
                                        HOExtra(key='restart_date',
                                                value=entry_restart_date)
                                    ])
                obj.content = json.dumps(entry)
                obj.package = package
                obj.save()
                ids.append(obj.id)

            elif not package:
                # It's a product we haven't harvested before.
                log.debug(
                    '{} has not been harvested before. Creating a new harvest object.'
                    .format(entry_name))  # noqa: E501
                obj = HarvestObject(guid=entry_guid,
                                    job=self.job,
                                    extras=[
                                        HOExtra(key='status', value='new'),
                                        HOExtra(key='restart_date',
                                                value=entry_restart_date)
                                    ])
                new_counter += 1
                obj.content = json.dumps(entry)
                obj.package = None
                obj.save()
                ids.append(obj.id)

        harvester_msg = '{:<12} | {} | jobID:{} | {} | {}'
        if hasattr(self, 'harvester_logger'):
            timestamp = str(datetime.utcnow())
            self.harvester_logger.info(
                harvester_msg.format(self.provider, timestamp, self.job.id,
                                     new_counter, 0))  # noqa: E128, E501

        return ids
Example #19
    def _parse_products(self, products):
        """
        Iterate through the results, create harvest objects,
        and return the ids.
        """
        ids = []
        new_counter = 0

        # Create a harvest object for each entry
        for entry in products:

            entry_base = entry['imgtif'].split('/')[1].lower()
            entry_guid = "{}_{}_{}".format(entry_base, entry['type'],
                                           entry['intid'])
            entry_name = entry_guid
            entry_restart_date = entry['master']

            package = Session.query(Package) \
                .filter(Package.name == entry_name).first()

            if package:
                # Meaning we've previously harvested this,
                # but we may want to reharvest it now.
                previous_obj = model.Session.query(HarvestObject) \
                    .filter(HarvestObject.guid == entry_guid) \
                    .filter(HarvestObject.current == True) \
                    .first()  # noqa: E712
                if previous_obj:
                    previous_obj.current = False
                    previous_obj.save()

                if self.update_all:
                    log.debug('{} already exists and will be updated.'.format(
                        entry_name))  # noqa: E501
                    status = 'change'
                else:
                    log.debug('{} will not be updated.'.format(
                        entry_name))  # noqa: E501
                    status = 'unchanged'

                obj = HarvestObject(guid=entry_guid,
                                    job=self.job,
                                    extras=[
                                        HOExtra(key='status', value=status),
                                        HOExtra(key='restart_date',
                                                value=entry_restart_date)
                                    ])
                obj.content = json.dumps(entry)
                obj.package = package
                obj.save()
                ids.append(obj.id)

            elif not package:
                # It's a product we haven't harvested before.
                log.debug(
                    '{} has not been harvested before. Creating a new harvest object.'
                    .format(entry_name))  # noqa: E501
                obj = HarvestObject(guid=entry_guid,
                                    job=self.job,
                                    extras=[
                                        HOExtra(key='status', value='new'),
                                        HOExtra(key='restart_date',
                                                value=entry_restart_date)
                                    ])
                new_counter += 1
                obj.content = json.dumps(entry)
                obj.package = None
                obj.save()
                ids.append(obj.id)

        harvester_msg = '{:<12} | {} | jobID:{} | {} | {}'
        if hasattr(self, 'harvester_logger'):
            timestamp = str(datetime.utcnow())
            self.harvester_logger.info(
                harvester_msg.format(self.provider, timestamp, self.job.id,
                                     new_counter, 0))  # noqa: E128, E501

        return ids
Example #20
    def gather_stage(self, harvest_job):
        self.log = logging.getLogger(__file__)
        self.log.debug('SCENT Harvester gather_stage for job: %r', harvest_job)

        self.job = harvest_job
        self.source_config = self._get_config(harvest_job)
        max_dataset = self.source_config.get('max_dataset', 100)
        wfs_url = self.source_config.get('wfs_url')
        wfs_version = self.source_config.get('wfs_version')
        collection = self.source_config.get('collection')
        typename = COLLECTION[collection].get('collection_typename')
        tag_typename = COLLECTION[collection].get('tag_typename', None)
        self.update_all = self.source_config.get('update_all', False)

        last_product_index = (
            self._get_last_harvesting_index(harvest_job.source_id)
        )

        if last_product_index:
            last_product_index = last_product_index + 1
        else:
            last_product_index = 0

        wfs = WFS(url=wfs_url, version=wfs_version)

        wfs.set_collection(typename)
        sortby = ['When']

        result = wfs.make_request(max_dataset, sortby, last_product_index)
        entries = result['features']
        name = '{}_{}'.format(collection.lower(), '{}')
        ids = []
        for entry in entries:
            entry_guid = unicode(uuid.uuid4())
            entry_name = name.format(convert_to_clean_snakecase(entry['id']))
            log.debug('gathering %s', entry_name)

            
            content = {}
            content['collection_content'] = entry
            if tag_typename:
                wfs.set_collection(tag_typename)
                filterxml = wfs.set_filter_equal_to('image_id', entry['id'])
                result = wfs.get_request(constraint=filterxml)
                content['tag_url'] = result

            package_query = Session.query(Package)
            query_filtered = package_query.filter(Package.name == entry_name)
            package = query_filtered.first()

            if package:
                # Meaning we've previously harvested this,
                # but we may want to reharvest it now.
                previous_obj = Session.query(HarvestObject) \
                    .filter(HarvestObject.guid == entry_guid) \
                    .filter(HarvestObject.current == True) \
                    .first()  # noqa: E712
                if previous_obj:
                    previous_obj.current = False
                    previous_obj.save()

                if self.update_all:
                    log.debug('{} already exists and will be updated.'.format(
                        entry_name))  # noqa: E501
                    status = 'change'

                else:
                    log.debug(
                        '{} will not be updated.'.format(entry_name))  # noqa: E501
                    status = 'unchanged'

            elif not package:
                # It's a product we haven't harvested before.
                log.debug('{} has not been harvested before. '
                          'Creating a new harvest object.'.format(entry_name))
                status = 'new'
            obj = HarvestObject(
                guid=entry_guid,
                job=self.job,
                extras=[
                    HOExtra(key='status', value=status),
                    HOExtra(key='index', value=last_product_index)
                ])
            obj.content = json.dumps(content)
            obj.package = None if status == 'new' else package
            obj.save()
            last_product_index += 1
            ids.append(obj.id)
        return ids
Example #21
    def _gather(self, job, config):

        ftp_user = config['username']
        ftp_passwd = config['password']
        source_type = config['harvester_type']
        ftp_source = create_ftp_source(source_type)

        if not hasattr(self, 'harvester_logger'):
            self.harvester_logger = self.make_harvester_logger()

        self.provider = 'deimos_imaging'

        existing_files = ftp_source._get_ftp_urls(ftp_user, ftp_passwd)

        metadata_dict = {}
        ids = []
        new_counter = 0
        for ftp_url in existing_files:
            filename = self.parse_filename(ftp_url)
            product_type = self.parse_filedirectory(ftp_url)
            identifier = filename

            content = {'identifier': identifier, 'product_type': product_type, 'ftp_link': ftp_url}  # noqa: E501

            raw_id = identifier.replace(product_type, 'L0R')

            if raw_id in metadata_dict:
                metadata = metadata_dict[raw_id]
            else:
                metadata = self._get_metadata(raw_id)
                metadata_dict[raw_id] = metadata

            for key in metadata:
                content[key] = metadata[key]

            content = json.dumps(content, default=str)

            package = Session.query(Package) \
                .filter(Package.name == identifier.lower()).first()

            if package:
                log.debug('{} will not be updated.'.format(identifier))  # noqa: E501
                status = 'unchanged'
                obj = HarvestObject(guid=ftp_url, job=job,
                                    extras=[HOExtra(key='status',
                                            value=status)])

                obj.content = content
                obj.package = package
                obj.save()
                ids.append(obj.id)

            elif not package:
                log.debug('{} has not been harvested before. Creating a new harvest object.'.format(identifier))  # noqa: E501
                status = 'new'
                new_counter += 1

                extras = [HOExtra(key='status', value=status)]

                obj = HarvestObject(job=job,
                                    guid=ftp_url,
                                    extras=extras)

                obj.content = content
                obj.package = None
                obj.save()
                ids.append(obj.id)

        harvester_msg = '{:<12} | {} | Job ID:{} | {} | {}'
        if hasattr(self, 'harvester_logger'):
            timestamp = str(datetime.utcnow())
            self.harvester_logger.info(harvester_msg.format(self.provider,
                timestamp, job.id, new_counter, '0'))  # noqa: E128, E501

        return ids
Example #22
    def gather_stage(self, harvest_job):
        requests_cache.install_cache()
        requests_cache.clear()

        session = requests_cache.CachedSession()

        self.log = logging.getLogger(__file__)
        self.log.debug('OSCAR Harvester gather_stage for job: %r', harvest_job)

        self.job = harvest_job
        self.source_config = self._get_config(harvest_job)
        base_url = self.source_config.get('oai_pmh_url')
        metadata_prefix = self.source_config.get('metadata_prefix')
        start_date = self.source_config.get('start_date', None)
        self.update_all = self.source_config.get('update_all', False)

        last_token = self._get_last_harvesting_index(self.job.source_id,
                                                     'last_token')
        next_token = self._get_last_harvesting_index(self.job.source_id,
                                                     'next_token')
        next_station = self._get_last_harvesting_index(self.job.source_id,
                                                       'next_station')
        restart_date = self._get_last_harvesting_index(self.job.source_id,
                                                       'restart_date')
        restart_date = restart_date if last_token else None

        ids = []
        first_query = True
        while (ids == [] and next_token) or first_query:
            first_query = False

            # If a station from the previous page is still pending, re-request
            # that page with the old token; otherwise advance to the new one.
            current_token = last_token if next_station else next_token
            if current_token:
                query_url = "{}?verb=ListIdentifiers&resumptionToken={}".format(
                    base_url, current_token)
            elif restart_date:
                query_url = "{}?verb=ListIdentifiers&metadataPrefix={}&from={}".format(
                    base_url, metadata_prefix, restart_date)
            elif start_date:
                query_url = "{}?verb=ListIdentifiers&metadataPrefix={}&from={}".format(
                    base_url, metadata_prefix, start_date)
            else:
                query_url = "{}?verb=ListIdentifiers&metadataPrefix={}".format(
                    base_url, metadata_prefix)

            self.log.debug('Querying: {}.'.format(query_url))
            raw_list_ids = self.get_list_identifiers(session, query_url)

            list_stations, largest_datastamp = self.get_station_ids(
                raw_list_ids)

            next_token = self.get_resumption_token(raw_list_ids)
            last_token = current_token
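            # Track the largest datestamp seen so far; the next job resumes
            # from it via the OAI-PMH 'from' argument.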
            restart_date = restart_date or ''
            if largest_datastamp > restart_date:
                restart_date = largest_datastamp

            if list_stations == []:
                next_station = None
            else:
                valid_deployment = None
                station_index = 0
                while (not valid_deployment
                       and station_index < len(list_stations)):
                    station = list_stations[station_index]
                    if next_station == station:
                        next_station = None
                    if not next_station:
                        station_query = '{}?verb=GetRecord&metadataPrefix={}&identifier={}'.format(
                            base_url, metadata_prefix, station)
                        self.log.debug('Querying station: {}.'.format(station))  # noqa: E501
                        record = self.get_record(session, station_query)
                        if record:
                            station_info = StationInfo(record)
                            if station_info.isValid():
                                station_info.id = station
                                observation_list = station_info.get_observations()  # noqa: E501
                                station_dict = station_info.get_dict()
                                station_info = None
                                for observation in observation_list:
                                    observation_info = ObservationInfo(
                                        session, observation)
                                    deployments_list = observation_info.get_deployments()  # noqa: E501
                                    observation_dict = observation_info.get_dict()  # noqa: E501
                                    observation_info = None
                                    for deployment in deployments_list:
                                        deployment_info = DeploymentInfo(
                                            session, deployment)
                                        if deployment_info.isValid():
                                            deployment_dict = deployment_info.get_dict()  # noqa: E501
                                            deployment_info = None
                                            valid_deployment = True
                                            if station_index + 1 < len(list_stations):  # noqa: E501
                                                next_station = list_stations[station_index + 1]  # noqa: E501
                                            else:
                                                next_station = None
                                            entry_guid = unicode(uuid.uuid4())
                                            entry_id = '{}_{}'.format(
                                                station_dict['id'],
                                                deployment_dict['id'])
                                            entry_name = clean_snakecase(
                                                entry_id)
                                            self.log.debug(
                                                'Gathering %s', entry_name)

                                            content = {
                                                'station': station_dict,
                                                'observation': observation_dict,  # noqa: E501
                                                'deployment': deployment_dict,
                                            }

                                            package = Session.query(Package) \
                                                .filter(Package.name == entry_name) \
                                                .first()

                                            if package:
                                                # Meaning we've previously harvested this,
                                                # but we may want to reharvest it now.
                                                previous_obj = Session.query(HarvestObject) \
                                                    .filter(HarvestObject.guid == entry_guid) \
                                                    .filter(HarvestObject.current == True) \
                                                    .first()  # noqa: E712
                                                if previous_obj:
                                                    previous_obj.current = False
                                                    previous_obj.save()

                                                if self.update_all:
                                                    self.log.debug(
                                                        '{} already exists and will be updated.'
                                                        .format(entry_name)
                                                    )  # noqa: E501
                                                    status = 'change'

                                                else:
                                                    self.log.debug(
                                                        '{} will not be updated.'
                                                        .format(entry_name)
                                                    )  # noqa: E501
                                                    status = 'unchanged'

                                            elif not package:
                                                # It's a product we haven't harvested before.
                                                self.log.debug(
                                                    '{} has not been harvested before. Creating a new harvest object.'.format(entry_name))  # noqa: E501
                                                status = 'new'
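                                            # Every object also carries the
                                            # paging state (tokens, pending
                                            # station, restart date) so an
                                            # interrupted job can resume.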
                                            obj = HarvestObject(
                                                guid=entry_guid,
                                                job=self.job,
                                                extras=[
                                                    HOExtra(key='status',
                                                            value=status),
                                                    HOExtra(key='last_token',
                                                            value=last_token),
                                                    HOExtra(key='next_token',
                                                            value=next_token),
                                                    HOExtra(
                                                        key='next_station',
                                                        value=next_station),
                                                    HOExtra(key='restart_date',
                                                            value=restart_date)
                                                ])
                                            obj.content = json.dumps(content)
                                            obj.package = None if status == 'new' else package
                                            obj.save()
                                            ids.append(obj.id)

                                if not valid_deployment:
                                    self.log.debug(
                                        'Station {} does not have valid deployments.'
                                        .format(station))
                            else:
                                self.log.debug(
                                    'Station {} is not valid.'.format(station))
                    station_index += 1
        return ids

    def _gather_entry(self, entry, path, row, update_all=False):
        # Create a harvest object for each entry
        entry_guid = unicode(uuid.uuid4())
        entry_name = entry.lower()
        log.debug('gathering %s', entry)

        package_query = Session.query(Package)
        query_filtered = package_query.filter(Package.name == entry_name)
        package = query_filtered.first()

        if package:
            # Meaning we've previously harvested this,
            # but we may want to reharvest it now.
            previous_obj = Session.query(HarvestObject) \
                .filter(HarvestObject.guid == entry_guid) \
                .filter(HarvestObject.current == True) \
                .first()  # noqa: E712
            if previous_obj:
                previous_obj.current = False
                previous_obj.save()

            if update_all:
                log.debug('{} already exists and will be updated.'.format(
                    entry_name))  # noqa: E501
                status = 'change'
            else:
                log.debug(
                    '{} will not be updated.'.format(entry_name))  # noqa: E501
                status = 'unchanged'

            obj = HarvestObject(
                guid=entry_guid,
                job=self.job,
                extras=[
                    HOExtra(key='status', value=status),
                    HOExtra(key='path', value=path),
                    HOExtra(key='row', value=row)
                ])
            obj.content = entry
            obj.package = package
            obj.save()
            return obj.id

        elif not package:
            # It's a product we haven't harvested before.
            log.debug(
                '{} has not been harvested before. Creating a new harvest object.'.  # noqa: E501
                format(entry_name))  # noqa: E501
            obj = HarvestObject(
                guid=entry_guid,
                job=self.job,
                extras=[
                    HOExtra(key='status', value='new'),
                    HOExtra(key='path', value=path),
                    HOExtra(key='row', value=row)
                ])
            obj.content = entry
            obj.package = None
            obj.save()
            return obj.id

    def _crawl_results(self,
                       harvest_url,
                       timeout=5,
                       limit=100,
                       provider=None):  # noqa: E501
        """
        Iterate through the results, create harvest objects,
        and return the ids.
        """
        ids = []
        new_counter = 0
        first_query = True
        while len(ids) < limit and harvest_url:
            # We'll limit ourselves to one request per second
            start_request = time.time()

            # Make a request to the website
            timestamp = str(datetime.utcnow())
            log_message = '{:<12} | {} | {} | {}s'
            try:
                r = requests.get(harvest_url, verify=False, timeout=timeout)
            except Timeout as e:
                self._save_gather_error('Request timed out: {}'.format(e),
                                        self.job)  # noqa: E501
                status_code = 408
                elapsed = 9999
                if hasattr(self, 'provider_logger'):
                    self.provider_logger.info(
                        log_message.format(self.provider, timestamp,
                                           status_code, timeout))  # noqa: E128
                return ids
            if r.status_code != 200:
                self._save_gather_error('{} error: {}'.format(
                    r.status_code, r.text), self.job)  # noqa: E501
                elapsed = 9999
                if hasattr(self, 'provider_logger'):
                    self.provider_logger.info(
                        log_message.format(self.provider, timestamp,
                                           r.status_code,
                                           elapsed))  # noqa: E128
                return ids

            if hasattr(self, 'provider_logger'):
                self.provider_logger.info(
                    log_message.format(
                        self.provider, timestamp, r.status_code,
                        r.elapsed.total_seconds()))  # noqa: E128, E501

            soup = Soup(r.content, 'lxml')
            json_content = json.loads(soup.text)

            # Get the URL for the next loop, or None to break the loop
            log.debug(harvest_url)
            harvest_url = self._get_next_url(harvest_url, json_content)

            # Get the entries from the results
            entry_list = self._get_entries_from_results(json_content)

            # Pages after the first appear to repeat the previous page's last
            # record, so drop the first entry on subsequent pages.
            if first_query:
                entries = entry_list
            else:
                entries = entry_list[1:]

            first_query = False

            # Create a harvest object for each entry
            for entry in entries:
                entry_guid = entry['guid']
                entry_name = entry['identifier']
                entry_restart_date = entry['restart_date']

                package = Session.query(Package) \
                    .filter(Package.name == entry_name).first()

                if package:
                    # Meaning we've previously harvested this,
                    # but we may want to reharvest it now.
                    previous_obj = model.Session.query(HarvestObject) \
                        .filter(HarvestObject.guid == entry_guid) \
                        .filter(HarvestObject.current == True) \
                        .first()  # noqa: E712
                    if previous_obj:
                        previous_obj.current = False
                        previous_obj.save()

                    if self.update_all:
                        log.debug(
                            '{} already exists and will be updated.'.format(
                                entry_name))  # noqa: E501
                        status = 'change'
                    else:
                        log.debug('{} will not be updated.'.format(
                            entry_name))  # noqa: E501
                        status = 'unchanged'

                    obj = HarvestObject(guid=entry_guid,
                                        job=self.job,
                                        extras=[
                                            HOExtra(key='status',
                                                    value=status),
                                            HOExtra(key='restart_date',
                                                    value=entry_restart_date)
                                        ])
                    obj.content = json.dumps(entry['content'])
                    obj.package = package
                    obj.save()
                    ids.append(obj.id)

                elif not package:
                    # It's a product we haven't harvested before.
                    log.debug(
                        '{} has not been harvested before. Creating a new harvest object.'
                        .format(entry_name))  # noqa: E501
                    obj = HarvestObject(guid=entry_guid,
                                        job=self.job,
                                        extras=[
                                            HOExtra(key='status', value='new'),
                                            HOExtra(key='restart_date',
                                                    value=entry_restart_date)
                                        ])
                    new_counter += 1
                    obj.content = json.dumps(entry['content'])
                    obj.package = None
                    obj.save()
                    ids.append(obj.id)

            end_request = time.time()
            request_time = end_request - start_request
            if request_time < 1.0:
                time.sleep(1 - request_time)

        harvester_msg = '{:<12} | {} | jobID:{} | {} | {}'
        if hasattr(self, 'harvester_logger'):
            timestamp = str(datetime.utcnow())
            self.harvester_logger.info(
                harvester_msg.format(self.provider, timestamp, self.job.id,
                                     new_counter, 0))  # noqa: E128, E501

        return ids
Example #25
    def gather_stage(self, harvest_job):
        log = logging.getLogger(__name__ + '.individual.gather')
        log.debug('DocHarvester gather_stage for job: %r', harvest_job)

        self.harvest_job = harvest_job

        # Get source URL
        url = harvest_job.source.url

        self._set_source_config(harvest_job.source.config)

        # Get contents
        try:
            content = self._get_content_as_unicode(url)
        except Exception as e:
            self._save_gather_error('Unable to get content for URL: %s: %r' %
                                    (url, e), harvest_job)
            return None

        existing_object = model.Session.query(HarvestObject.guid,
                                              HarvestObject.package_id) \
            .filter(HarvestObject.current == True) \
            .filter(HarvestObject.harvest_source_id == harvest_job.source.id) \
            .first()  # noqa: E712

        def create_extras(url, status):
            return [
                HOExtra(key='doc_location', value=url),
                HOExtra(key='status', value=status)
            ]

        if not existing_object:
            guid = hashlib.md5(url.encode('utf8', 'ignore')).hexdigest()
            harvest_object = HarvestObject(job=harvest_job,
                                           extras=create_extras(url, 'new'),
                                           guid=guid)
        else:
            harvest_object = HarvestObject(
                job=harvest_job,
                extras=create_extras(url, 'change'),
                guid=existing_object.guid,
                package_id=existing_object.package_id)

        harvest_object.add()

        # Check if it is an ISO document
        document_format = guess_standard(content)
        if document_format == 'iso':
            harvest_object.content = content
        else:
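            # Keep the original document and its detected format as extras,
            # presumably so a later stage can convert it to ISO.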
            extra = HOExtra(object=harvest_object,
                            key='original_document',
                            value=content)
            extra.save()

            extra = HOExtra(object=harvest_object,
                            key='original_format',
                            value=document_format)
            extra.save()

        harvest_object.save()

        return [harvest_object.id]
Example #26
                return True

        # Get source URL
        url = harvest_object.source.url.rstrip('/')
        url = url + self._get_rest_api_offset() + '/package/' + harvest_object.guid

        # Get contents
        try:
            content = self._get_content(url)
        except ContentFetchError as e:
            self._save_object_error('Unable to get content for package: %s: %r' %
                                    (url, e), harvest_object)
            return None

        # Save the fetched contents in the HarvestObject
        harvest_object.content = content
        harvest_object.save()
        return True

    def import_stage(self, harvest_object):
        log.debug('In CKANHarvester import_stage')
        if not harvest_object:
            log.error('No harvest object received')
            return False

        if harvest_object.content is None:
            self._save_object_error('Empty content for object %s' % harvest_object.id,
                                    harvest_object, 'Import')
            return False

        self._set_config(harvest_object.job.source.config)

    def _crawl_results(self, harvest_url, limit=100, timeout=5):  # noqa: E501
        """
        Iterate through the results, create harvest objects,
        and return the ids.
        """
        ids = []
        new_counter = 0
        update_counter = 0

        while len(ids) < limit and harvest_url:
            # We'll limit ourselves to one request per second
            start_request = time.time()

            # Make a request to the website
            timestamp = str(datetime.utcnow())
            log_message = '{:<12} | {} | {} | {}s'
            try:
                r = requests.get(harvest_url, timeout=timeout)
            except Timeout as e:
                self._save_gather_error('Request timed out: {}'.format(e),
                                        self.job)  # noqa: E501
                status_code = 408
                elapsed = 9999
                if hasattr(self, 'provider_logger'):
                    self.provider_logger.info(
                        log_message.format(self.provider, timestamp,
                                           status_code, timeout))  # noqa: E128
                return ids
            if r.status_code != 200:
                self._save_gather_error('{} error: {}'.format(
                    r.status_code, r.text), self.job)  # noqa: E501
                elapsed = 9999
                if hasattr(self, 'provider_logger'):
                    self.provider_logger.info(
                        log_message.format(self.provider, timestamp,
                                           r.status_code,
                                           elapsed))  # noqa: E128
                return ids

            if hasattr(self, 'provider_logger'):
                self.provider_logger.info(
                    log_message.format(
                        self.provider, timestamp, r.status_code,
                        r.elapsed.total_seconds()))  # noqa: E128, E501

            soup = Soup(r.content, 'lxml')

            # Paging metadata lives on the <csw:SearchResults> element of the
            # GetRecords response.
            search_results = soup.find('csw:searchresults',
                                       elementset="summary")
            records_returned = search_results['numberofrecordsreturned']
            next_record = search_results['nextrecord']
            number_records_matched = search_results['numberofrecordsmatched']

            # 'nextrecord' is '0' on the final page; either way, derive the
            # index of the first record in the current batch.
            if next_record != '0':
                current_record = str(int(next_record) - int(records_returned))
            else:
                current_record = str(
                    int(number_records_matched) - int(records_returned))

            # Get the URL for the next loop, or None to break the loop
            # Only works if StartPosition is last URL parameter
            harvest_url = self._get_next_url(harvest_url, records_returned,
                                             next_record, limit)  # noqa: E501

            # Get the entries from the results
            entries = self._get_entries_from_results(soup, current_record,
                                                     next_record)  # noqa: E501

            # Create a harvest object for each entry
            for entry in entries:
                entry_guid = entry['guid']
                entry_name = entry['identifier']

                package = Session.query(Package) \
                    .filter(Package.name == entry_name).first()

                if package:
                    # Meaning we've previously harvested this,
                    # but we may want to reharvest it now.
                    previous_obj = model.Session.query(HarvestObject) \
                        .filter(HarvestObject.guid == entry_guid) \
                        .filter(HarvestObject.current == True) \
                        .first()  # noqa: E712
                    if previous_obj:
                        previous_obj.current = False
                        previous_obj.save()

                    if self.update_all:
                        log.debug(
                            '{} already exists and will be updated.'.format(
                                entry_name))  # noqa: E501
                        status = 'change'
                        update_counter += 1
                    else:
                        log.debug('{} already exists and will not be updated.'.
                                  format(entry_name))  # noqa: E501
                        status = 'unchanged'

                    obj = HarvestObject(
                        guid=entry_guid,
                        job=self.job,
                        extras=[
                            HOExtra(key='status', value=status),
                            HOExtra(key='restart_record',
                                    value=entry['restart_record'])
                        ])  # noqa: E501
                    obj.content = entry['content']
                    obj.package = package
                    obj.save()
                    ids.append(obj.id)
                elif not package:
                    # It's a product we haven't harvested before.
                    log.debug(
                        '{} has not been harvested before. Creating a new harvest object.'
                        .format(entry_name))  # noqa: E501
                    obj = HarvestObject(guid=entry_guid,
                                        job=self.job,
                                        extras=[
                                            HOExtra(key='status', value='new'),
                                            HOExtra(
                                                key='restart_record',
                                                value=entry['restart_record'])
                                        ])  # noqa: E501
                    new_counter += 1
                    obj.content = entry['content']
                    obj.package = None
                    obj.save()
                    ids.append(obj.id)

            end_request = time.time()
            request_time = end_request - start_request
            if request_time < 1.0:
                time.sleep(1 - request_time)

        harvester_msg = '{:<12} | {} | jobID:{} | {} | {}'
        if hasattr(self, 'harvester_logger'):
            timestamp = str(datetime.utcnow())
            self.harvester_logger.info(
                harvester_msg.format(self.provider, timestamp, self.job.id,
                                     new_counter,
                                     update_counter))  # noqa: E128, E501

        return ids
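
The CSW crawler above calls a _get_next_url helper that is not included in this example. The sketch below is only a guess at its contract, not the original implementation: it assumes the harvest URL ends with a startPosition query parameter (as the in-code comment requires) and that a nextRecord of '0' marks the final page.

    def _get_next_url(self, harvest_url, records_returned, next_record, limit):  # noqa: E501
        # Hypothetical helper: stop paging once the server reports no
        # further records; otherwise swap the trailing startPosition value.
        # (limit is enforced by the caller's while loop, not here.)
        if next_record == '0' or int(records_returned) == 0:
            return None
        base, _, _ = harvest_url.rpartition('=')
        return '{}={}'.format(base, next_record)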