Example #1
    def _gather_ids(self, url=None, jar=None):
        log.debug('Page %s' % self.page)
        if jar is None:
            jar = CookieJar()
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(jar))
        url = url or self.INITIAL_INDEX
        fh = opener.open(url)
        doc = html.parse(fh)
        fh.close()

        new_ids = []
        for a in doc.findall(".//div[@class='main']//a"):
            href = a.get('href').split('?', 1)[0]
            id = href.split('/').pop()
            if not id in self.gathered_ids:
                log.debug('Got Id: %s' % id)
                obj = HarvestObject(guid=sha1(id).hexdigest(), job=self.job, content=id)
                obj.save()

                self.object_ids.append(obj.id)

                new_ids.append(id)

        if len(new_ids) == 0: #or self.page == 2:
            return self.gathered_ids
        else:
            self.gathered_ids.extend(new_ids)

        inputs = []
        for input in doc.findall(".//form[@id='main_form']//input"):
            inputs.append((input.get('name'), input.get('value')))
        inputs.append(('listbox_nextPage:method', ''))
        next_url = self.INDEX_URL + '?' + urllib.urlencode(inputs)
        self.page = self.page + 1
        return self._gather_ids(url=next_url,jar=jar)
Example #2
    def gather_stage(self, harvest_job):
        log.debug('In ArrayExpressHarvester.gather_stage(%s)' % harvest_job.source.url)
        # Get feed contents
        self._set_config(harvest_job.source.config)
        
        #previous_job = Session.query(HarvestJob) \
        #                .filter(HarvestJob.source==harvest_job.source) \
        #                .filter(HarvestJob.gather_finished!=None) \
        #                .filter(HarvestJob.id!=harvest_job.id) \
        #                .order_by(HarvestJob.gather_finished.desc()) \
        #                .limit(1).first()

        baseURL = harvest_job.source.url+"/xml/v2/experiments"
        #if (previous_job and not previous_job.gather_errors
        #    and not len(previous_job.objects) == 0):
        #    if not self.config.get('force_all',False):
        #        last_time = harvest_job.gather_started.isoformat()
        #        today = format(datetime.date.today())
        #        self.params['date'] = '['+last_time+' '+today+']'
        url = baseURL + "?" + self.getParams()

        print "Fetching from "+url
        doc = etree.parse(url)
        ids = []
        for accessionElement in doc.findall('//experiment/accession'):
            accession = accessionElement.text.strip()
            obj = HarvestObject(guid=accession, job=harvest_job, content=accession)
            print "ArrayExpress accession: "+accession
            obj.save()
            
            ids.append(obj.id)
        print ids
        return ids
Example #3
    def gather_stage(self, harvest_job):
        log.debug('In SRDAHarvester gather_stage (%s)' % harvest_job.source.url)

        get_all_packages = True
        package_ids = []

        data = urllib2.urlopen(self.PREFIX_URL + self.CATALOGUE_INDEX_URL)
        doc = html.parse(data)
        for td in doc.findall("//td[@class='left_p12_title']/a"):
            link = td.get('href')
            if re.match(r"/search/fsciitem", link):
                id = sha1(link).hexdigest()
                obj = HarvestObject(guid=id, job= harvest_job, content=link)
                obj.save()
                package_ids.append(obj.id)

        self._set_config(harvest_job.source.config)

        # Check if this source has been harvested before
        previous_job = Session.query(HarvestJob) \
                        .filter(HarvestJob.source==harvest_job.source) \
                        .filter(HarvestJob.gather_finished!=None) \
                        .filter(HarvestJob.id!=harvest_job.id) \
                        .order_by(HarvestJob.gather_finished.desc()) \
                        .limit(1).first()

        return package_ids
Example #4
def harvest_object_create(context, data_dict):
    """ Create a new harvest object

    :type guid: string (optional)
    :type content: string (optional)
    :type job_id: string 
    :type source_id: string (optional)
    :type package_id: string (optional)
    :type extras: dict (optional)
    """
    check_access('harvest_object_create', context, data_dict)
    data, errors = _validate(data_dict, harvest_object_create_schema(), context)

    if errors:
        raise logic.ValidationError(errors)

    obj = HarvestObject(
        guid=data.get('guid'),
        content=data.get('content'),
        job=data['job_id'],
        harvest_source_id=data.get('source_id'),
        package_id=data.get('package_id'),
        extras=[ HarvestObjectExtra(key=k, value=v) 
            for k, v in data.get('extras', {}).items() ]
    )

    obj.save()
    return harvest_object_dictize(obj, context)
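
A minimal usage sketch for the action above, dispatched through CKAN's plugins toolkit so that check_access and validation run; the context values and the job id below are placeholders, not taken from the snippet:

import ckan.plugins.toolkit as toolkit

context = {'user': 'harvester', 'ignore_auth': True}
data_dict = {
    'job_id': 'replace-with-a-real-harvest-job-id',  # required by the schema
    'guid': 'example-guid',                          # optional
    'content': '<record>example</record>',           # optional
}
# Returns the dictized harvest object on success.
harvest_object = toolkit.get_action('harvest_object_create')(context, data_dict)
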
Example #5
    def gather_stage(self, harvest_job):
        """
        The gather stage will receive a HarvestJob object and will be
        responsible for:
            - gathering all the necessary objects to fetch on a later
              stage (e.g. for a CSW server, perform a GetRecords request)
            - creating the necessary HarvestObjects in the database, specifying
              the guid and a reference to its source and job.
            - creating and storing any suitable HarvestGatherErrors that may
              occur.
            - returning a list with all the ids of the created HarvestObjects.

        :param harvest_job: HarvestJob object
        :returns: A list of HarvestObject ids
        """
        log.debug("in gather stage: %s" % harvest_job.source.url)
        try:
            harvest_obj_ids = []
            registry = self._create_metadata_registry()
            self._set_config(harvest_job.source.config)
            client = oaipmh.client.Client(harvest_job.source.url, registry, self.credentials)

            client.identify()  # check if identify works
            for header in self._identifier_generator(client):
                harvest_obj = HarvestObject(guid=header.identifier(), job=harvest_job)
                harvest_obj.save()
                harvest_obj_ids.append(harvest_obj.id)
        except:
            log.exception("Gather stage failed %s" % harvest_job.source.url)
            self._save_gather_error("Could not gather anything from %s!" % harvest_job.source.url, harvest_job)
            return None
        return harvest_obj_ids
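
The snippet above calls an _identifier_generator helper that is not shown. A sketch of one possible shape, assuming the pyoai client API, an 'oai_dc' metadata prefix, and an optional self.set_spec attribute (all assumptions, not taken from the snippet):

    def _identifier_generator(self, client):
        """Yield OAI-PMH record headers, optionally restricted to one set."""
        if getattr(self, 'set_spec', None):
            # listIdentifiers handles resumption tokens transparently
            for header in client.listIdentifiers(metadataPrefix='oai_dc',
                                                 set=self.set_spec):
                yield header
        else:
            for header in client.listIdentifiers(metadataPrefix='oai_dc'):
                yield header
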
Example #6
    def gather_stage(self, harvest_job):

        """Retrieve datasets"""
        
        log.debug('In RostockTestHarvester gather_stage (%s)' % harvest_job.source.url)
        package_ids = []
        self._set_config(None)

        base_url = harvest_job.source.url.rstrip('/')
        package_list_url = base_url + '/api/rest/package'
        content = self._get_content(package_list_url)
        
        package_ids = json.loads(content)

        try:
            object_ids = []
            if len(package_ids):
                for package_id in package_ids:                                      
                    obj = HarvestObject(guid = package_id, job = harvest_job)
                    obj.save()
                    object_ids.append(obj.id)
                    log.info('Got ID from source: %s' %package_id)
                return object_ids

            else:
                self._save_gather_error('No packages received for URL: %s' % package_list_url,
                                        harvest_job)
                return None
        except Exception, e:
            self._save_gather_error('%r'%e.message,harvest_job)
Example #7
    def gather_stage(self, harvest_job):
        log.debug('In SocrataHarvester 2 gather_stage (%s)' % harvest_job.source.url)
        get_all_packages = True

        dcatUrl = "%s/api/dcat.rdf" % harvest_job.source.url.rstrip('/')
        log.debug(dcatUrl)

        adaptorInstance = socrataAdaptor()
        package_ids = adaptorInstance.listDatasetIds(dcatUrl)

        try:
            object_ids = []
            if len(package_ids):
                for package_id in package_ids:
                    # Create a new HarvestObject for this identifier
                    obj = HarvestObject(guid = package_id, job = harvest_job)
                    obj.save()
                    object_ids.append(obj.id)

                return object_ids

            else:
                self._save_gather_error('No packages received for URL: %s' % dcatUrl,
                    harvest_job)
                return None
        except Exception, e:
            self._save_gather_error('%r'%e.message,harvest_job)
Example #8
    def gather_stage(self, harvest_job):
        log.debug('In FSOHarvester gather_stage')

        http = urllib3.PoolManager()
        metadata_file = http.request('GET', self.METADATA_FILE_URL)

        ids = []
        parser = etree.XMLParser(encoding='utf-8')
        for package in etree.fromstring(metadata_file.data, parser=parser):

            # Get the german dataset if one is available, otherwise get the first one
            base_datasets = package.xpath("dataset[@xml:lang='de']")
            if len(base_datasets) != 0:
                base_dataset = base_datasets[0]
            else:
                base_dataset = package.find('dataset')

            metadata = self._generate_metadata(base_dataset, package)
            if metadata:
                obj = HarvestObject(
                    guid = self._create_uuid(base_dataset.get('datasetID')),
                    job = harvest_job,
                    content = json.dumps(metadata)
                )
                obj.save()
                log.debug('adding ' + base_dataset.get('datasetID') + ' to the queue')
                ids.append(obj.id)
            else:
                log.debug('Skipping ' + base_dataset.get('datasetID') + ' since no resources or groups are available')

        return ids
Example #9
    def gather_stage(self, harvest_job):
        log.debug('In NTPCHarvester gather_stage (%s)' % harvest_job.source.url)

        url = self.PREFIX_URL + self.CATALOGUE_INDEX_URL
        get_all_packages = True
        try:
            package_ids = []
            dataset_count = self._get_ntpc_dataset_count(url)
            msg_count = 0
            for x in range(dataset_count/10 + 1):
                page_url = url + '?currentPage=%s' % (x + 1)
                data = urllib2.urlopen(page_url)
                doc = html.parse(data)
                for div in doc.findall("//a[@href]"):
                    if '/NTPC/od/query;' in div.attrib['href']:
                        link = div.attrib['href']
                        id = sha1(link).hexdigest()
                        obj = HarvestObject(guid=id, job=harvest_job, content=link)
                        obj.save()
                        package_ids.append(obj.id)
                        msg_count = msg_count + 1

            if msg_count == 0:
                self._save_gather_error('No packages received for URL: %s' % url,
                        harvest_job)
                return None

            return package_ids

        except Exception, e:
            self._save_gather_error('%r'%e.message,harvest_job)
Example #10
    def gather_stage(self, harvest_job):
        """Retrieve datasets"""

        log.debug('In ' + self.city + 'CKANHarvester gather_stage (%s)' % harvest_job.source.url)
        package_ids = []
        self._set_config(None)

        base_url = harvest_job.source.url.rstrip('/')
        package_list_url = base_url + '/3/action/package_list'
        content = self._get_content(package_list_url)

        content_json = json.loads(content)
        package_ids = content_json['result']

        try:
            object_ids = []
            if len(package_ids):
                for package_id in package_ids:
                    obj = HarvestObject(guid=package_id, job=harvest_job)
                    obj.save()
                    object_ids.append(obj.id)
                return object_ids

            else:
                self._save_gather_error('No packages received for URL: %s' % package_list_url,
                                        harvest_job)
                return None
        except Exception, e:
            self._save_gather_error('%r' % e.message, harvest_job)
Example #11
    def gather_stage(self, harvest_job):
        log.debug('In SNLHarvester gather_stage')

        metadata_path = self._fetch_metadata_file()
        ids = []
        try:
            parser = MetaDataParser(metadata_path)

            for dataset in parser.list_datasets():
                metadata = parser.parse_set(dataset)
                metadata['translations'].extend(
                    self._metadata_term_translations()
                )

                log.debug(metadata)

                obj = HarvestObject(
                    guid=metadata['id'],
                    job=harvest_job,
                    content=json.dumps(metadata)
                )
                obj.save()
                log.debug('adding ' + metadata['id'] + ' to the queue')
                ids.append(obj.id)
        finally:
            temp_dir = os.path.dirname(metadata_path)
            log.debug('Deleting directory ' + temp_dir)
            shutil.rmtree(temp_dir)

        return ids
Example #12
    def gather_stage(self, harvest_job):
        log.debug('In ZhstatHarvester gather_stage')

        ids = []
        parser = etree.XMLParser(encoding='utf-8')

        for dataset in etree.fromstring(self._fetch_metadata(), parser=parser):

            # Get the german data if one is available,
            # otherwise get the first one
            base_datas = dataset.xpath("data[@xml:lang='de']")
            if len(base_datas) != 0:
                base_data = base_datas[0]
            else:
                base_data = dataset.find('data')

            metadata = self._generate_metadata(base_data, dataset)

            if metadata:
                obj = HarvestObject(
                    guid=dataset.get('id'),
                    job=harvest_job,
                    content=json.dumps(metadata)
                )
                obj.save()
                log.debug('adding ' + dataset.get('id') + ' to the queue')
                ids.append(obj.id)
            else:
                log.debug(
                    'Skipping %s since no resources or groups are available'
                    % dataset.get('id')
                )

        return ids
Example #13
    def gather_stage(self, harvest_job):
        try:
            config = json.loads(harvest_job.source.config)
            ckan_term_url = config['ckan_term_url']
        except Exception as e:
            log.exception(e)
            raise ConfigError(
                "In order to run the translation harvester "
                "you need to specify 'ckan_term_url' "
                "in your harvester config json"
            )

        log.debug('Gathering term from %s' % ckan_term_url)
        try:
            terms = self._get_terms(ckan_term_url)

            obj = HarvestObject(
                job=harvest_job,
                content=json.dumps(terms)
            )
            obj.save()

            return [obj.id]
        except Exception as e:
            log.exception(e)
            raise e
Example #14
    def gather_stage(self, harvest_job):
        log = logging.getLogger(__name__ + '.VariantStore.gather')
        log.debug('VariantStoreHarvester gather_stage for job: %r', harvest_job)

        self._set_config(harvest_job.source.config, log=log)
        obj = HarvestObject(guid = self.guid, job = harvest_job)
        obj.save()
        return [ obj.id ]
Example #15
    def gather_stage(self, harvest_job):
        log.debug('In ChangelogHarvester gather_stage')

        # Get the last harvested AuditId
        last_audit = model.Session.query(HarvestLastAudit) \
            .order_by(HarvestLastAudit.created.desc()) \
            .first()

        if last_audit:
            audit_id = last_audit.audit_id
        else:
            audit_id = '0'

        # Get all Audits
        audits = p.toolkit.get_action('changelog_show')(
            {'ignore_auth': True},
            {'audit_id': audit_id, 'top': 1000})

        # Check if there are any new audits to process
        if not len(audits) or (
           len(audits) == 1 and
           audits[0]['AuditId'] == audit_id):
            log.debug(
                'No new audits to process since last run ' +
                '(Last audit id {0})'.format(audit_id))
            return []

        # Ignore the first audit if an audit id was defined as start,
        # as this one will be included in the results
        audits = audits[1:] if audit_id != '0' and len(audits) > 1 else audits

        ids = []
        update_audits = {}
        for audit in audits:
            # We only want to use the most recent update per object per run
            # Store the most recent audit against a hash of the id fields
            if 'update' in audit['Command'].lower():
                m = hashlib.md5()
                m.update(json.dumps(audit['CustomProperties']))
                ids_hash = m.hexdigest()
                update_audits[ids_hash] = audit
            else:
                obj = HarvestObject(guid=audit['AuditId'], job=harvest_job,
                                    content=json.dumps(audit))
                obj.save()
                ids.append(obj.id)

        # Save the last AuditId to know where to start in the next run
        save_last_audit_id(audit['AuditId'], harvest_job.id)

        for key, audit in update_audits.iteritems():
            obj = HarvestObject(guid=audit['AuditId'], job=harvest_job,
                                content=json.dumps(audit))
            obj.save()
            ids.append(obj.id)

        return ids
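
The save_last_audit_id helper and the HarvestLastAudit model used above are not part of the snippet. A plausible shape for the helper, with the column names inferred from the query at the top of the method (an assumption, not the project's actual code):

def save_last_audit_id(audit_id, harvest_job_id):
    """Persist the last processed AuditId so the next run knows where to resume."""
    last_audit = HarvestLastAudit(audit_id=audit_id,
                                  harvest_job_id=harvest_job_id)
    model.Session.add(last_audit)
    model.Session.commit()
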
Example #16
    def gather_stage(self, harvest_job):
        '''
        The gather stage will receive a HarvestJob object and will be
        responsible for:
            - gathering all the necessary objects to fetch on a later
              stage (e.g. for a CSW server, perform a GetRecords request)
            - creating the necessary HarvestObjects in the database, specifying
              the guid and a reference to its source and job.
            - creating and storing any suitable HarvestGatherErrors that may
              occur.
            - returning a list with all the ids of the created HarvestObjects.

        :param harvest_job: HarvestJob object
        :returns: A list of HarvestObject ids
        '''
        self._set_config(harvest_job.source.config)
        sets = []
        harvest_objs = []
        registry = MetadataRegistry()
        registry.registerReader('oai_dc', oai_dc_reader)
        client = oaipmh.client.Client(harvest_job.source.url, registry)
        try:
            identifier = client.identify()
        except urllib2.URLError:
            self._save_gather_error('Could not gather anything from %s!' %
                                    harvest_job.source.url, harvest_job)
            return None
        domain = identifier.repositoryName()
        group = Group.by_name(domain)
        if not group:
            group = Group(name=domain, description=domain)
        query = self.config['query'] if 'query' in self.config else ''
        try:
            for set in client.listSets():
                identifier, name, _ = set
                if 'query' in self.config:
                    if query in name:
                        sets.append((identifier, name))
                else:
                    sets.append((identifier, name))
        except NoSetHierarchyError:
            sets.append(('1', 'Default'))
            self._save_gather_error('Could not fetch sets!', harvest_job)

        for set_id, set_name in sets:
            harvest_obj = HarvestObject(job=harvest_job)
            harvest_obj.content = json.dumps({
                'set': set_id,
                'set_name': set_name,
                'domain': domain
            })
            harvest_obj.save()
            harvest_objs.append(harvest_obj.id)
        model.repo.commit()
        return harvest_objs
Example #17
    def gather_stage(self, harvest_job):
        log.debug('In SFAHarvester gather_stage')
        try:
            file_path = self._fetch_metadata_file()
            ids = []

            de_rows = self._get_row_dict_array(0, file_path)
            for row in de_rows:
                # Construct the metadata dict for the dataset on CKAN
                metadata = {
                    'datasetID': row[u'id'],
                    'title': row[u'title'],
                    'url': row[u'url'],
                    'notes': row[u'notes'],
                    'author': row[u'author'],
                    'maintainer': row[u'maintainer'],
                    'maintainer_email': row[u'maintainer_email'],
                    'license_id': row[u'licence'],
                    'license_url': row[u'licence_url'],
                    'translations': [],
                    'tags': row[u'tags'].split(u', '),
                    'groups': [row[u'groups']]
                }

                metadata['resources'] = self._generate_resources_dict_array(
                    row[u'id']
                )
                metadata['resources'][0]['version'] = row[u'version']
                log.debug(metadata['resources'])

                # Adding term translations
                metadata['translations'].extend(
                    self._generate_term_translations(1, file_path)  # fr
                )
                metadata['translations'].extend(
                    self._generate_term_translations(2, file_path)  # it
                )
                metadata['translations'].extend(
                    self._generate_term_translations(3, file_path)  # en
                )

                log.debug(metadata['translations'])

                obj = HarvestObject(
                    guid=self._create_uuid(row[u'id']),
                    job=harvest_job,
                    content=json.dumps(metadata)
                )
                obj.save()
                log.debug('adding ' + row[u'id'] + ' to the queue')
                ids.append(obj.id)

                log.debug(de_rows)
        except Exception:
            return False
        return ids
Example #18
    def _gen_harvest_obj_for_files(self, harvest_job):
        ids = []
        for dataset_name, dataset in self.DATASETS.iteritems():
            csw = ckan_csw.SwisstopoCkanMetadata()
            metadata = csw.get_ckan_metadata(
                dataset['csw_query'], 'de'
            ).copy()
            metadata_fr = csw.get_ckan_metadata(
                dataset['csw_query'], 'fr'
            ).copy()
            metadata_it = csw.get_ckan_metadata(
                dataset['csw_query'], 'it'
            ).copy()
            metadata_en = csw.get_ckan_metadata(
                dataset['csw_query'], 'en'
            ).copy()
            log.debug(metadata)

            metadata['translations'] = self._generate_term_translations()
            log.debug("Translations: %s" % metadata['translations'])

            metadata_trans = {
                u'de': metadata,
                u'fr': metadata_fr,
                u'it': metadata_it,
                u'en': metadata_en,
            }
            metadata['translations'].extend(
                self._generate_metadata_translations(metadata_trans)
            )

            metadata['resources'] = self._generate_resources_dict_array(
                dataset_name
            )
            metadata['resources'].extend(
                self._generate_api_resources(metadata, dataset_name)
            )
            log.debug(metadata['resources'])

            metadata['license_id'], metadata['license_url'] = (
                self._get_org_license(dataset_name)
            )

            metadata['layer_name'] = dataset_name

            obj = HarvestObject(
                guid=metadata['id'],
                job=harvest_job,
                content=json.dumps(metadata)
            )
            obj.save()
            log.debug('adding ' + dataset_name + ' to the queue')
            ids.append(obj.id)

        return ids
Example #19
    def gather_stage(self, harvest_job):
        log.debug('In FOPHHarvester gather_stage')
        try:
            file_path = self._fetch_metadata_file()
            ids = []

            de_cols = self._get_col_dict_array(0, file_path)
            for col in de_cols:
                # Construct the metadata dict for the dataset on CKAN
                metadata = {
                    'datasetID': col[u'id'],
                    'title': col[u'title'],
                    'url': col[u'url'],
                    'notes': col[u'notes'],
                    'author': col[u'author'],
                    'author_email': col[u'author_email'],
                    'maintainer': col[u'maintainer'],
                    'maintainer_email': col[u'maintainer_email'],
                    'license_id': col[u'license_id'].lower(),
                    'version': col[u'version'],
                    'translations': [],
                    'tags': []
                }
                tags = col[u'tags'].split(u', ')
                tags = [munge_tag(tag) for tag in tags]
                metadata['tags'] = tags

                metadata['resources'] = self._generate_resources_dict_array(
                    col[u'id'])
                metadata['resources'][0]['version'] = col[u'version']
                log.debug(metadata['resources'])

                # Adding term translations
                metadata['translations'].extend(
                    self._generate_term_translations(1, file_path))  # fr
                metadata['translations'].extend(
                    self._generate_term_translations(2, file_path))  # it
                metadata['translations'].extend(
                    self._generate_term_translations(3, file_path))  # en

                log.debug(metadata['translations'])

                obj = HarvestObject(
                    guid=self._create_uuid(col[u'id']),
                    job=harvest_job,
                    content=json.dumps(metadata)
                )
                obj.save()
                log.debug('adding ' + col[u'id'] + ' to the queue')
                ids.append(obj.id)

                log.debug(de_cols)
        except Exception:
            return False
        return ids
Example #20
    def gather_stage(self, harvest_job, encoding=None):
        self._set_config(harvest_job.source.config)
        # Request all remote packages
        try:
            content = self._get_content(harvest_job.source.url)
            LOGGER.debug('Grabbing zip file: %s', harvest_job.source.url)

            object_ids = []
            packages = []

            file_content = StringIO.StringIO(content)
            archive = zipfile.ZipFile(file_content, 'r')
            for name in archive.namelist():
                if name.endswith('.json'):
                    archive_content = archive.read(name)
                    if encoding is not None:
                        archive_content = archive_content.decode(encoding)
                    else:
                        archive_content = self.lstrip_bom(archive_content)

                    package = json.loads(archive_content)
                    normalize_api_dataset(package)
                    packages.append(package)
                    obj = HarvestObject(guid=package['name'], job=harvest_job)
                    obj.content = json.dumps(package)
                    obj.save()
                    object_ids.append(obj.id)

        except zipfile.BadZipfile as err:
            self._save_gather_error(err.message, harvest_job)
            return None
        except ContentFetchError as err:
            self._save_gather_error(err.message, harvest_job)
            return None
        except Exception as err:
            error_template = 'Unable to get content for URL: %s: %s'
            error = error_template % (harvest_job.source.url, str(err))
            self._save_gather_error(error, harvest_job)
            return None

        if object_ids:
            # delete obsolete packages
            super(JSONZipBaseHarvester, self).delete_deprecated_datasets(
                packages,
                harvest_job
            )

            return object_ids
        else:
            self._save_gather_error(
                'No packages received for URL: %s' % harvest_job.source.url,
                harvest_job
            )

            return None
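
self.lstrip_bom is referenced above but not defined in the snippet. A possible implementation, assuming the archive entries are UTF-8 byte strings that may start with a byte-order mark:

    @staticmethod
    def lstrip_bom(content):
        """Strip a leading UTF-8 byte-order mark, if present."""
        bom = '\xef\xbb\xbf'
        if content.startswith(bom):
            return content[len(bom):]
        return content
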
Example #21
    def populate_harvest_job(self, harvest_job, set_ids, config, client):
        # Check if this source has been harvested before
        previous_job = Session.query(HarvestJob) \
            .filter(HarvestJob.source == harvest_job.source) \
            .filter(HarvestJob.gather_finished != None) \
            .filter(HarvestJob.id != harvest_job.id) \
            .order_by(HarvestJob.gather_finished.desc()) \
            .limit(1).first()

        last_time = None
        if previous_job and previous_job.finished and model.Package.get(harvest_job.source.id).metadata_modified < previous_job.gather_started:
            last_time = previous_job.gather_started.isoformat()

        # Collect package ids
        package_ids = list(self.get_package_ids(set_ids, config, last_time, client))
        log.debug('Identifiers: %s', package_ids)

        if not self._recreate(harvest_job) and package_ids:
            converted_identifiers = {}
            for identifier in package_ids:
                converted_identifiers[datapid_to_name(identifier)] = identifier
                if identifier.endswith(u'm'):
                    converted_identifiers[datapid_to_name(u"%ss" % identifier[0:-1])] = identifier

            for package in model.Session.query(model.Package).filter(model.Package.name.in_(converted_identifiers.keys())).all():
                converted_name = package.name
                if converted_identifiers[converted_name] not in package_ids:
                    converted_name = "%sm" % converted_name[0:-1]
                package_ids.remove(converted_identifiers[converted_name])

        if previous_job:
            for previous_error in [error.guid for error in Session.query(HarvestObject).
                                   filter(HarvestObject.harvest_job_id == previous_job.id).
                                   filter(HarvestObject.state == 'ERROR').all()]:
                if previous_error not in package_ids:
                    package_ids.append(previous_error)

        try:
            object_ids = []
            if len(package_ids):
                for package_id in islice(package_ids, config['limit']) if 'limit' in config else package_ids:
                    # Create a new HarvestObject for this identifier
                    obj = HarvestObject(guid=package_id, job=harvest_job)
                    obj.save()
                    object_ids.append(obj.id)
                log.debug('Object ids: {i}'.format(i=object_ids))
                return object_ids
            else:
                self._save_gather_error('No packages received for URL: {u}'.format(
                    u=harvest_job.source.url), harvest_job)
                return None
        except Exception as e:
            self._save_gather_error('Gather: {e}'.format(e=e), harvest_job)
            raise
Example #22
    def test_import(self):
        source = HarvestSource(url="http://localhost/test_cmdi", type="cmdi")
        source.save()
        job = HarvestJob(source=source)
        job.save()

        harvest_object = self._run_import("cmdi_1.xml", job)

        self.assertEquals(len(harvest_object.errors), 0, u"\n".join(unicode(error.message) for error in (harvest_object.errors or [])))

        package = get_action('package_show')({'user': '******'}, {'id': 'urn-nbn-fi-lb-20140730180'})

        self.assertEquals(package.get('id', None), 'http://urn.fi/urn:nbn:fi:lb-20140730180')
        self.assertEquals(package.get('name', None), 'urn-nbn-fi-lb-20140730180')
        self.assertEquals(package.get('notes', None), u'{"eng": "Test description"}')
        self.assertEquals(package.get('version', None), '2012-09-07')
        self.assertEquals(package.get('title', []), '{"eng": "Longi Corpus"}')
        self.assertEquals(package.get('license_id', None), 'undernegotiation')

        provider = config['ckan.site_url']
        expected_pid = {u'id': u'http://islrn.org/resources/248-895-085-557-0',
                        u'provider': provider,
                        u'type': u'metadata'}

        self.assertTrue(expected_pid in package.get('pids'))

        model.Session.flush()

        harvest_object = self._run_import("cmdi_2.xml", job)

        self.assertEquals(len(harvest_object.errors), 0, u"\n".join(unicode(error.message) for error in (harvest_object.errors or [])))

        package = get_action('package_show')({'user': '******'}, {'id': 'urn-nbn-fi-lb-20140730186'})

        self.assertEquals(package['temporal_coverage_begin'], '1880')
        self.assertEquals(package['temporal_coverage_end'], '1939')
        self.assertEquals(package.get('license_id', None), 'other')
        # Delete package
        harvest_object = HarvestObject()
        harvest_object.content = None
        harvest_object.id = "test-cmdi-delete"
        harvest_object.guid = "test-cmdi-delete"
        harvest_object.source = job.source
        harvest_object.harvest_source_id = None
        harvest_object.job = job
        harvest_object.package_id = package.get('id')
        harvest_object.report_status = "deleted"
        harvest_object.save()

        self.harvester.import_stage(harvest_object)

        model.Session.flush()
        self.assertEquals(model.Package.get(package['id']).state, 'deleted')
Example #23
    def gather_stage(self,harvest_job):

        log = logging.getLogger(__name__ + '.WAF.gather')
        log.debug('z3950Harvester gather_stage for job: %r', harvest_job)

        self.harvest_job = harvest_job

        # Get source URL
        source_url = harvest_job.source.url

        self._set_source_config(harvest_job.source.config)

        # get current objects out of db
        query = model.Session.query(HarvestObject.guid, HarvestObject.package_id).filter(HarvestObject.current==True).\
                                    filter(HarvestObject.harvest_source_id==harvest_job.source.id)

        guid_to_package_id = dict((res[0], res[1]) for res in query)
        current_guids = set(guid_to_package_id.keys())
        current_guids_in_harvest = set()

        # Get contents
        try:
            conn = zoom.Connection(source_url, int(self.source_config.get('port', 210)))
            conn.databaseName = self.source_config.get('database', '')
            conn.preferredRecordSyntax = 'XML'
            conn.elementSetName = 'T'
            query = zoom.Query('CCL', 'metadata')
            res = conn.search(query)
            ids = []
            for num, result in enumerate(res):
                hash = hashlib.md5(result.data).hexdigest()
                if hash in current_guids:
                    current_guids_in_harvest.add(hash)
                else:
                    obj = HarvestObject(job=harvest_job, guid=hash, extras=[
                        HOExtra(key='status', value='new'),
                        HOExtra(key='original_document', value=result.data.decode('latin-1')),
                        HOExtra(key='original_format', value='fgdc')
                    ])
                    obj.save()
                    ids.append(obj.id)
            for guid in (current_guids - current_guids_in_harvest):
                obj = HarvestObject(job=harvest_job,
                                    guid=guid,
                                    package_id=guid_to_package_id[guid],
                                    extras=[HOExtra(key='status', value='delete')])
                obj.save()
                ids.append(obj.id)
            return ids
        except Exception,e:
            self._save_gather_error('Unable to get content for URL: %s: %r' % \
                                        (source_url, e),harvest_job)
            return None
Example #24
    def gather_stage(self, harvest_job):
        log.debug('In DataWienGvAt gather_stage')

        doc = etree.parse(self.CATALOGUE_FEED_URL)
        ids = []
        for link in doc.findall("//item/link"):
            link = link.text
            id = sha1(link).hexdigest()
            obj = HarvestObject(guid=id, job=harvest_job, content=link)
            obj.save()
            ids.append(obj.id)
        return ids
Example #25
    def gather_stage(self, harvest_job):

        if harvest_job.source.url.startswith('basic_test'):
            obj = HarvestObject(guid = 'test1', job = harvest_job)
            obj.extras.append(HarvestObjectExtra(key='key', value='value'))
            obj2 = HarvestObject(guid = 'test2', job = harvest_job)
            obj3 = HarvestObject(guid = 'test_to_delete', job = harvest_job)
            obj.add()
            obj2.add()
            obj3.save() # this will commit both
            return [obj.id, obj2.id, obj3.id]

        return []
Example #26
    def gather_stage(self,harvest_job):
        log.debug('In OpenDataCatHarvester gather_stage')
        # Get feed contents
        doc = etree.parse(self.INDEX_URL)
        ids = []
        for link_element in doc.findall('//item/link'):
            link = link_element.text.strip()
            id = sha1(link).hexdigest()
            obj = HarvestObject(guid=id, job=harvest_job, content=link)
            obj.save()

            ids.append(obj.id)
        return ids
Example #27
    def gather_stage(self, harvest_job):
        log.debug('In OpendataParisFr gather_stage')

        doc = html.parse(self.PREFIX_URL + self.CATALOGUE_INDEX_URL)
        ids = []
        for link in doc.findall("//div[@class='animate download-portlet-element']/a"):
            link = link.get('href')
            if not "#comments" in link:
                id = sha1(link).hexdigest()
                obj = HarvestObject(guid=id, job=harvest_job, content=link)
                obj.save()
                ids.append(obj.id)
        return ids
Example #28
    def gather_stage(self,harvest_job):
        log.debug('In OpenGovSeHarvester gather_stage')
        # Get feed contents
        doc = etree.parse(self.INDEX_URL)
        ids = []
        for id_element in doc.findall('//{%(ns)s}entry/{%(ns)s}id' % {'ns':self.ATOM_NS}):
            link = id_element.text.strip()
            log.debug('Got link: %s' % link)
            id = sha1(link).hexdigest()
            obj = HarvestObject(guid=id, job=harvest_job, content=link)
            obj.save()

            ids.append(obj.id)
        return ids
Example #29
    def _save_harvest_object(self, metadata, harvest_job):
        '''
        Save the harvest object with the given metadata dict and harvest_job
        '''

        obj = HarvestObject(
            guid=metadata['datasetID'],
            job=harvest_job,
            content=json.dumps(metadata)
        )
        obj.save()
        log.debug('adding ' + metadata['datasetID'] + ' to the queue')

        return obj.id
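
A minimal sketch of how a gather stage might drive the helper above; _parse_datasets is a hypothetical source-specific method that yields metadata dicts, not part of the snippet:

    def gather_stage(self, harvest_job):
        ids = []
        for metadata in self._parse_datasets():
            ids.append(self._save_harvest_object(metadata, harvest_job))
        return ids
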
Example #30
def doi_update(context, data_dict):
    model = context['model']
    new_package = data_dict
    source_hash = hashlib.sha1(json.dumps(data_dict, sort_keys=True)).hexdigest()
    old_package = p.toolkit.get_action('package_show')(
        {'model': model, 'ignore_auth': True}, {"id":new_package['id']})
    for extra in old_package['extras']:
        if extra['key'] == 'source_hash':
            old_source_hash = extra['value']
            break
    else:
       old_source_hash = None

    if source_hash == old_source_hash and old_package.get('state') =='active':
        print str(datetime.datetime.now()) + ' No change for doi id ' + new_package['id']
        return

    new_package["extras"].append({"key": "source_hash", "value": source_hash})
    new_package["extras"].append({"key": "metadata-source", "value": "doi"})
    new_package["extras"].append({"key": "source_doi_import_identifier", "value": True})
    new_package.pop("name", None)
    owner_org = model.Group.get(ORG_MAPPING.get(new_package['organization']['name']))
    if not owner_org:
        print str(datetime.datetime.now()) + ' Fail to update doi id ' + new_package['id'] + '. Organization ' + new_package['organization']['name'] + ' does not exist.'
        return
    new_package['owner_org'] = owner_org.name
    group_name = new_package.pop('owner_name', None)

    resources = []
    for resource in new_package['resources']:
        resource.pop('resource_group_id', None)
        resource.pop('revision_id', None)
        resource.pop('id', None)
        resources.append(resource)
    new_package['resources'] = resources

    obj = HarvestObject(
        guid=uuid.uuid4().hex,
        job=context['harvest_job'],
        content=context['harvestobj'])
    obj.save()
    new_package["extras"].append({"key": "harvest_object_id", "value": obj.id})

    context['return_id_only'] = True
    p.toolkit.get_action('package_update')(context, new_package)
    print str(datetime.datetime.now()) + ' Updated doi id ' + new_package['id']
Example #31
    def setup_class(cls):
        # Create package and its harvest object
        CreateTestData.create()
        harvest_setup()
        source = HarvestSource(url=u'http://test-source.org',type='test')
        source.save()

        job = HarvestJob(source=source)
        job.save()

        ho = HarvestObject(package=model.Package.by_name(u'annakarenina'),
                           job=job,
                           guid=u'test-guid',
                           content=u'<xml>test content</xml>')
        ho.save()

        # Save a reference to the harvest object in the package
        rev = model.repo.new_revision()
        pkg = model.Package.by_name(u'annakarenina')
        pkg.extras['harvest_object_id'] = ho.id
        pkg.save()

        model.repo.commit_and_remove()
Example #32
    def _gather_ids(self, url=None, jar=None):
        log.debug('Page %s' % self.page)
        if jar is None:
            jar = CookieJar()
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(jar))
        url = url or self.INITIAL_INDEX
        fh = opener.open(url)
        doc = html.parse(fh)
        fh.close()

        new_ids = []
        for a in doc.findall(".//div[@class='main']//a"):
            href = a.get('href').split('?', 1)[0]
            id = href.split('/').pop()
            if not id in self.gathered_ids:
                log.debug('Got Id: %s' % id)
                obj = HarvestObject(guid=sha1(id).hexdigest(),
                                    job=self.job,
                                    content=id)
                obj.save()

                self.object_ids.append(obj.id)

                new_ids.append(id)

        if len(new_ids) == 0:  #or self.page == 2:
            return self.gathered_ids
        else:
            self.gathered_ids.extend(new_ids)

        inputs = []
        for input in doc.findall(".//form[@id='main_form']//input"):
            inputs.append((input.get('name'), input.get('value')))
        inputs.append(('listbox_nextPage:method', ''))
        next_url = self.INDEX_URL + '?' + urllib.urlencode(inputs)
        self.page = self.page + 1
        return self._gather_ids(url=next_url, jar=jar)
Example #33
    def _gather_object(self, job, product, resources, manifest_content,
                       last_harvest_date):
        name = parse_filename(product).lower()

        status, package = self._was_harvested(name, self.update_all)

        extras = [HOExtra(key='status', value=status)]

        content = json.dumps(
            {
                'name': name,
                'restart_date': last_harvest_date.strftime('%Y-%m-%d'),
                'manifest_content': manifest_content,
                'resources': resources
            },
            default=str)

        obj = HarvestObject(job=job,
                            guid=unicode(uuid.uuid4()),
                            extras=extras,
                            content=content)
        obj.package = package
        obj.save()
        return obj.id
Example #34
    def gather_stage(self, harvest_job):
        log.debug('In ZhGisHarvester gather_stage')

        ids = []
        for dataset_id, dataset in self.DATASETS.iteritems():
            csw = ckan_csw.ZhGisCkanMetadata()
            metadata = csw.get_ckan_metadata_by_id(dataset_id).copy()
            log.debug(metadata)

            # Fix metadata information
            metadata['name'] = munge_title_to_name(metadata['name'])
            metadata['service_type'] = (metadata['service_type'].replace(
                'OGC:', ''))

            # Enrich metadata with hardcoded values
            metadata['url'] = dataset['geolion_url']
            metadata['tags'].extend(dataset['tags'])

            metadata['translations'] = self._generate_term_translations()
            log.debug("Translations: %s" % metadata['translations'])

            metadata['resources'] = (
                self._generate_resource_dict_array(metadata))
            log.debug(metadata['resources'])

            metadata['license_id'] = self.LICENSE['name']
            metadata['license_url'] = self.LICENSE['url']

            obj = HarvestObject(guid=metadata['id'],
                                job=harvest_job,
                                content=json.dumps(metadata))
            obj.save()
            log.debug('adding ' + metadata['name'] + ' to the queue')
            ids.append(obj.id)

        return ids
Example #35
    def _run_job_for_single_document(self, harvest_job, object_id):

        harvester = FisbrokerPlugin()

        # we circumvent gather_stage() and fetch_stage() and just load the
        # content with a known object_id and create the harvest object:
        url = harvest_job.source.url
        # _get_content() returns XML
        content = harvester._get_content(url)
        obj = HarvestObject(guid=object_id,
                            job=harvest_job,
                            content=content,
                            extras=[HarvestObjectExtra(key='status',value='new')])
        obj.save()

        assert obj, obj.content

        harvester.import_stage(obj)
        Session.refresh(obj)

        harvest_job.status = u'Finished'
        harvest_job.save()

        return obj
Example #36
    def _crawl_results(self, harvest_url, limit=100, timeout=5, username=None, password=None, provider=None):  # noqa: E501
        """
        Iterate through the results, create harvest objects,
        and return the ids.
        """
        ids = []
        new_counter = 0
        update_counter = 0
        while len(ids) < limit and harvest_url:
            # We'll limit ourselves to one request per second
            start_request = time.time()

            # Make a request to the website
            timestamp = str(datetime.utcnow())
            log_message = '{:<12} | {} | {} | {}s'
            try:
                r = requests.get(harvest_url,
                                 auth=HTTPBasicAuth(username, password),
                                 verify=False, timeout=timeout)
            except Timeout as e:
                self._save_gather_error('Request timed out: {}'.format(e), self.job)  # noqa: E501
                status_code = 408
                elapsed = 9999
                if hasattr(self, 'provider_logger'):
                    self.provider_logger.info(log_message.format(self.provider,
                        timestamp, status_code, timeout))  # noqa: E128
                return ids
            if r.status_code != 200:
                self._save_gather_error('{} error: {}'.format(r.status_code, r.text), self.job)  # noqa: E501
                elapsed = 9999
                if hasattr(self, 'provider_logger'):
                    self.provider_logger.info(log_message.format(self.provider,
                        timestamp, r.status_code, elapsed))  # noqa: E128
                return ids

            if hasattr(self, 'provider_logger'):
                self.provider_logger.info(log_message.format(self.provider,
                    timestamp, r.status_code, r.elapsed.total_seconds()))  # noqa: E128, E501

            soup = Soup(r.content, 'lxml')

            # Get the URL for the next loop, or None to break the loop
            harvest_url = self._get_next_url(soup)

            # Get the entries from the results
            entries = self._get_entries_from_results(soup)

            # Create a harvest object for each entry
            for entry in entries:
                entry_guid = entry['guid']
                entry_name = entry['identifier']
                entry_restart_date = entry['restart_date']

                package = Session.query(Package) \
                    .filter(Package.name == entry_name).first()

                if package:
                    # Meaning we've previously harvested this,
                    # but we may want to reharvest it now.
                    # We need package_show to ensure that all the conversions
                    # are carried out.
                    context = {"user": "******", "ignore_auth": True,
                               "model": model, "session": Session}
                    pkg_dict = logic.get_action('package_show')(context, {"id": package.name})  # noqa: E501
                    previous_obj = model.Session.query(HarvestObject) \
                        .filter(HarvestObject.guid == entry_guid) \
                        .filter(HarvestObject.current == True) \
                        .first()  # noqa: E712
                    if previous_obj:
                        previous_obj.current = False
                        previous_obj.save()

                    if self.update_all:
                        log.debug('{} already exists and will be updated.'.format(entry_name))  # noqa: E501
                        status = 'change'
                        update_counter += 1
                    # E.g., a Sentinel dataset exists,
                    # but doesn't have a NOA resource yet.
                    elif self.flagged_extra and not get_pkg_dict_extra(pkg_dict, self.flagged_extra):  # noqa: E501
                        log.debug('{} already exists and will be extended.'.format(entry_name))  # noqa: E501
                        status = 'change'
                        update_counter += 1
                    else:
                        log.debug('{} will not be updated.'.format(entry_name))  # noqa: E501
                        status = 'unchanged'

                    obj = HarvestObject(guid=entry_guid, job=self.job,
                                        extras=[HOExtra(key='status',
                                                value=status),
                                                HOExtra(key='restart_date',
                                                value=entry_restart_date)])
                    obj.content = entry['content']
                    obj.package = package
                    obj.save()
                    ids.append(obj.id)
                elif not package:
                    # It's a product we haven't harvested before.
                    log.debug('{} has not been harvested before. Creating a new harvest object.'.format(entry_name))  # noqa: E501
                    obj = HarvestObject(guid=entry_guid, job=self.job,
                                        extras=[HOExtra(key='status',
                                                value='new'),
                                                HOExtra(key='restart_date',
                                                value=entry_restart_date)])
                    new_counter += 1
                    obj.content = entry['content']
                    obj.package = None
                    obj.save()
                    ids.append(obj.id)

            end_request = time.time()
            request_time = end_request - start_request
            if request_time < 1.0:
                time.sleep(1 - request_time)

        harvester_msg = '{:<12} | {} | jobID:{} | {} | {}'
        if hasattr(self, 'harvester_logger'):
            timestamp = str(datetime.utcnow())
            self.harvester_logger.info(harvester_msg.format(self.provider,
                                       timestamp, self.job.id, new_counter, update_counter))  # noqa: E128, E501
        return ids
Example #37
    def _mark_datasets_for_deletion(self, guids_in_source, harvest_job):
        # This is the same as the method in the base class, except that a different query is used.

        object_ids = []

        portal = self._get_portal_from_config(harvest_job.source.config)

        starttime = time.time()
        # Get all previous current guids and dataset ids for this harvested portal independent of
        # the harvest objects. This allows cleaning the harvest data without losing the
        # dataset mappings.
        # Build a subquery to get all active packages having a GUID first
        subquery = model.Session.query(model.PackageExtra.value, model.Package.id) \
            .join(model.Package, model.Package.id == model.PackageExtra.package_id)\
            .filter(model.Package.state == model.State.ACTIVE) \
            .filter(model.PackageExtra.state == model.State.ACTIVE) \
            .filter(model.PackageExtra.key == 'guid') \
            .subquery()
        # then get all active packages of the current portal and join with their GUIDs if
        # available (outer join)
        query = model.Session.query(model.Package.id, subquery.c.value) \
            .join(model.PackageExtra, model.PackageExtra.package_id == model.Package.id)\
            .outerjoin(subquery, subquery.c.id == model.Package.id)\
            .filter(model.Package.state == model.State.ACTIVE) \
            .filter(model.PackageExtra.state == model.State.ACTIVE) \
            .filter(model.PackageExtra.key == EXTRA_KEY_HARVESTED_PORTAL) \
            .filter(model.PackageExtra.value == portal)

        checkpoint_start = time.time()
        guid_to_package_id = {}
        for package_id, guid in query:
            if guid:
                guid_to_package_id[guid] = package_id
            # Also remove all packages without a GUID, use ID as GUID to share logic below
            else:
                guid_to_package_id[package_id] = package_id
        checkpoint_end = time.time()
        LOGGER.debug('Time for query harvest source related datasets : %s',
                     str(checkpoint_end - checkpoint_start))

        guids_in_db = guid_to_package_id.keys()

        # Get objects/datasets to delete (ie in the DB but not in the source)
        guids_to_delete = set(guids_in_db) - set(guids_in_source)

        # Create a harvest object for each of them, flagged for deletion
        for guid in guids_to_delete:
            obj = HarvestObject(guid=guid, job=harvest_job,
                                package_id=guid_to_package_id[guid],
                                extras=[HarvestObjectExtra(key='status',
                                                           value='delete')])

            # Mark the rest of objects for this guid as not current
            model.Session.query(HarvestObject) \
                .filter_by(guid=guid) \
                .update({'current': False}, False)
            obj.save()
            object_ids.append(obj.id)

        endtime = time.time()
        LOGGER.debug('Found %s packages for deletion. Time total: %s', len(guids_to_delete),
                     str(endtime - starttime))

        return object_ids
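
A sketch of how a gather stage could combine freshly gathered objects with the deletion pass above; _gather_new_and_changed is a hypothetical helper returning the GUIDs present in the source plus the ids of the harvest objects it created:

    def gather_stage(self, harvest_job):
        guids_in_source, object_ids = self._gather_new_and_changed(harvest_job)
        object_ids.extend(
            self._mark_datasets_for_deletion(guids_in_source, harvest_job))
        return object_ids
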
Example #38
    def gather_stage(self, harvest_job):
        self.log = logging.getLogger(__file__)
        self.log.debug('SatcenBetter Harvester gather_stage for job: %r',
                       harvest_job)

        self.job = harvest_job
        self.source_config = self._get_config(harvest_job)
        self.update_all = self.source_config.get('update_all', False)
        interface = INTERFACE(self.source_config, COLLECTION)

        last_product_index = (self._get_last_harvesting_index(
            harvest_job.source_id, interface))
        interface.update_index(last_product_index)
        interface.build_url()

        log.debug('URL: {}'.format(interface.current_url))  # noqa: E501

        ids = []
        try:
            results = interface.get_results()
        except Timeout as e:
            self._save_gather_error('Request timed out: {}'.format(e),
                                    self.job)  # noqa: E501
            return ids
        if type(results) is not list:
            self._save_gather_error('{} error: {}'.format(
                results['status_code'], results['message']),
                                    self.job)  # noqa: E501
            return ids

        for entry in results:
            name_path = interface.get_name_path()

            name_url = get_field(entry,
                                 name_path['relative_location'].split(","),
                                 name_path['fixed_attributes'])
            entry_name = parse_name(name_url).lower()
            entry_guid = unicode(uuid.uuid4())
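            # a fresh random guid is minted for every entry; the lookup below
            # searches existing harvest objects for that same guid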
            package_query = Session.query(Package)
            query_filtered = package_query.filter(Package.name == entry_name)
            package = query_filtered.first()

            if package:
                # Meaning we've previously harvested this,
                # but we may want to reharvest it now.
                previous_obj = Session.query(HarvestObject) \
                    .filter(HarvestObject.guid == entry_guid) \
                    .filter(HarvestObject.current == True) \
                    .first()  # noqa: E712
                if previous_obj:
                    previous_obj.current = False
                    previous_obj.save()

                if self.update_all:
                    log.debug('{} already exists and will be updated.'.format(
                        entry_name))  # noqa: E501
                    status = 'change'

                else:
                    log.debug('{} will not be updated.'.format(
                        entry_name))  # noqa: E501
                    status = 'unchanged'

            elif not package:
                # It's a product we haven't harvested before.
                log.debug(
                    '{} has not been harvested before. Creating a new harvest object.'
                    .  # noqa: E501
                    format(entry_name))  # noqa: E501
                status = 'new'
            obj = HarvestObject(
                guid=entry_guid,
                job=self.job,
                extras=[
                    HOExtra(key='status', value=status),
                    HOExtra(key=interface.get_pagination_mechanism(),
                            value=interface.get_index())
                ])
            obj.content = json.dumps(entry)
            obj.package = None if status == 'new' else package
            obj.save()
            interface.increment_index()
            ids.append(obj.id)
        return ids
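
The harvester above delegates pagination and result parsing to an external
INTERFACE object. A rough stub of the contract it appears to expect, inferred
only from the calls made in the example (the URL format and return values are
illustrative assumptions):

    class HarvestInterfaceStub(object):
        """Minimal stand-in exposing the methods the gather stage calls."""

        def __init__(self, source_config, collection):
            self.source_config = source_config
            self.collection = collection
            self.current_url = None
            self._index = 0

        def update_index(self, index):
            self._index = index or 0

        def build_url(self):
            self.current_url = 'https://example.org/search?start=%d' % self._index

        def get_results(self):
            # would normally fetch current_url and return a list of entries,
            # or a dict with 'status_code'/'message' on failure
            return []

        def get_name_path(self):
            return {'relative_location': 'properties,title',
                    'fixed_attributes': []}

        def get_pagination_mechanism(self):
            return 'start_index'

        def get_index(self):
            return self._index

        def increment_index(self):
            self._index += 1
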
Example #39
0
    def test_import(self):
        source = HarvestSource(url="http://localhost/test_cmdi", type="cmdi")
        source.save()
        job = HarvestJob(source=source)
        job.save()

        harvest_object = self._run_import("cmdi_1.xml", job)
        package_id = json.loads(harvest_object.content)['unified']['id']

        self.assertEquals(
            len(harvest_object.errors), 0, u"\n".join(
                unicode(error.message)
                for error in (harvest_object.errors or [])))

        package = get_action('package_show')({
            'user': '******'
        }, {
            'id': package_id
        })

        self.assertEquals(package.get('name', None),
                          utils.pid_to_name(package.get('id', None)))
        self.assertEquals(utils.get_primary_pid(package),
                          u'http://urn.fi/urn:nbn:fi:lb-20140730180')
        self.assertEquals(package.get('notes', None),
                          u'{"eng": "Test description"}')
        self.assertEquals(package.get('version', None), '2012-09-07')
        self.assertEquals(package.get('title', []), '{"eng": "Longi Corpus"}')
        self.assertEquals(package.get('license_id', None), 'undernegotiation')

        provider = config['ckan.site_url']
        expected_pid = {
            u'id': u'http://islrn.org/resources/248-895-085-557-0',
            u'provider': provider,
            u'type': u'relation',
            u'relation': u'generalRelation'
        }

        self.assertTrue(expected_pid not in package.get('pids'))

        model.Session.flush()

        harvest_object = self._run_import("cmdi_2.xml", job)
        package_id = json.loads(harvest_object.content)['unified']['id']

        self.assertEquals(
            len(harvest_object.errors), 0, u"\n".join(
                unicode(error.message)
                for error in (harvest_object.errors or [])))

        package = get_action('package_show')({
            'user': '******'
        }, {
            'id': package_id
        })

        self.assertEquals(package['temporal_coverage_begin'], '1880')
        self.assertEquals(package['temporal_coverage_end'], '1939')
        self.assertEquals(package.get('license_id', None), 'other')
        # Delete package
        harvest_object = HarvestObject()
        harvest_object.content = None
        harvest_object.id = "test-cmdi-delete"
        harvest_object.guid = "test-cmdi-delete"
        harvest_object.source = job.source
        harvest_object.harvest_source_id = None
        harvest_object.job = job
        harvest_object.package_id = package.get('id')
        harvest_object.report_status = "deleted"
        harvest_object.save()

        self.harvester.import_stage(harvest_object)

        model.Session.flush()
        self.assertEquals(model.Package.get(package['id']).state, 'deleted')
Example #40
0
    def gather_stage(self, harvest_job, collection_package_id=None):
        log = logging.getLogger(__name__ + '.WAF.gather')
        log.debug('WafHarvester gather_stage for job: %r', harvest_job)

        self.harvest_job = harvest_job

        # Get source URL
        source_url = harvest_job.source.url

        self._set_source_config(harvest_job.source.config)

        # Get contents
        try:
            response = requests.get(source_url, timeout=60)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            self._save_gather_error('Unable to get content for URL: %s: %r' % \
                                        (source_url, e),harvest_job)
            return None

        content = response.content
        scraper = _get_scraper(response.headers.get('server'))

        ######  Get current harvest object out of db ######

        url_to_modified_db = {}  ## mapping of url to last_modified in db
        url_to_ids = {}  ## mapping of url to guid in db

        HOExtraAlias1 = aliased(HOExtra)
        HOExtraAlias2 = aliased(HOExtra)
        query = model.Session.query(HarvestObject.guid, HarvestObject.package_id, HOExtraAlias1.value, HOExtraAlias2.value).\
                                    join(HOExtraAlias1, HarvestObject.extras).\
                                    join(HOExtraAlias2, HarvestObject.extras).\
                                    filter(HOExtraAlias1.key=='waf_modified_date').\
                                    filter(HOExtraAlias2.key=='waf_location').\
                                    filter(HarvestObject.current==True).\
                                    filter(HarvestObject.harvest_source_id==harvest_job.source.id)

        for guid, package_id, modified_date, url in query:
            url_to_modified_db[url] = modified_date
            url_to_ids[url] = (guid, package_id)

        ######  Get current list of records from source ######

        url_to_modified_harvest = {
        }  ## mapping of url to last_modified in harvest
        try:
            for url, modified_date in _extract_waf(content, source_url,
                                                   scraper):
                url_to_modified_harvest[url] = modified_date
        except Exception as e:
            msg = 'Error extracting URLs from %s, error was %s' % (source_url,
                                                                   e)
            self._save_gather_error(msg, harvest_job)
            return None

        ######  Compare source and db ######

        harvest_locations = set(url_to_modified_harvest.keys())
        old_locations = set(url_to_modified_db.keys())

        new = harvest_locations - old_locations
        delete = old_locations - harvest_locations
        possible_changes = old_locations & harvest_locations
        change = []

        for item in possible_changes:
            if (not url_to_modified_harvest[item] or not url_to_modified_db[
                    item]  #if there is no date assume change
                    or
                    url_to_modified_harvest[item] > url_to_modified_db[item]):
                change.append(item)

        def create_extras(url, date, status):
            extras = [
                HOExtra(key='waf_modified_date', value=date),
                HOExtra(key='waf_location', value=url),
                HOExtra(key='status', value=status)
            ]
            if collection_package_id:
                extras.append(
                    HOExtra(key='collection_package_id',
                            value=collection_package_id))
            return extras

        ids = []
        for location in new:
            guid = hashlib.md5(location.encode('utf8', 'ignore')).hexdigest()
            obj = HarvestObject(job=harvest_job,
                                extras=create_extras(
                                    location,
                                    url_to_modified_harvest[location], 'new'),
                                guid=guid)
            obj.save()
            ids.append(obj.id)

        for location in change:
            obj = HarvestObject(
                job=harvest_job,
                extras=create_extras(location,
                                     url_to_modified_harvest[location],
                                     'change'),
                guid=url_to_ids[location][0],
                package_id=url_to_ids[location][1],
            )
            obj.save()
            ids.append(obj.id)

        for location in delete:
            obj = HarvestObject(
                job=harvest_job,
                extras=create_extras('', '', 'delete'),
                guid=url_to_ids[location][0],
                package_id=url_to_ids[location][1],
            )
            model.Session.query(HarvestObject).\
                  filter_by(guid=url_to_ids[location][0]).\
                  update({'current': False}, False)

            obj.save()
            ids.append(obj.id)

        if len(ids) > 0:
            log.debug(
                '{0} objects sent to the next stage: {1} new, {2} change, {3} delete'
                .format(len(ids), len(new), len(change), len(delete)))
            return ids
        else:
            self._save_gather_error('No records to change', harvest_job)
            return []
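
The WAF example relies on an _extract_waf helper that yields (url, modified
date) pairs scraped from the server's directory listing. A simplified sketch
of such a scraper, assuming an Apache-style index page (the XML-only filter,
the regex and the date format are illustrative assumptions, not the real
helper used above):

    import re
    from datetime import datetime
    from urlparse import urljoin

    from lxml import html as lxml_html

    def extract_waf_links(content, base_url):
        """Yield (absolute_url, modified_datetime_or_None) for linked XML files."""
        doc = lxml_html.fromstring(content)
        date_pattern = re.compile(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}')
        for anchor in doc.xpath('//a[@href]'):
            href = anchor.get('href')
            if not href.lower().endswith('.xml'):
                continue
            # directory listings usually print the date in the text after the link
            match = date_pattern.search(anchor.tail or '')
            modified = (datetime.strptime(match.group(), '%Y-%m-%d %H:%M')
                        if match else None)
            yield urljoin(base_url, href), modified
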
Example #41
0
    def gather_stage(self, harvest_job):
        log = logging.getLogger(__name__ + '.individual.gather')
        log.debug('DocHarvester gather_stage for job: %r', harvest_job)

        self.harvest_job = harvest_job

        # Get source URL
        url = harvest_job.source.url

        self._set_source_config(harvest_job.source.config)

        # Get contents
        try:
            content = self._get_content_as_unicode(url)
        except Exception as e:
            self._save_gather_error('Unable to get content for URL: %s: %r' % \
                                        (url, e),harvest_job)
            return None

        existing_object = model.Session.query(HarvestObject.guid, HarvestObject.package_id).\
                                    filter(HarvestObject.current==True).\
                                    filter(HarvestObject.harvest_source_id==harvest_job.source.id).\
                                    first()

        def create_extras(url, status):
            return [
                HOExtra(key='doc_location', value=url),
                HOExtra(key='status', value=status)
            ]

        if not existing_object:
            guid = hashlib.md5(url.encode('utf8', 'ignore')).hexdigest()
            harvest_object = HarvestObject(job=harvest_job,
                                           extras=create_extras(url, 'new'),
                                           guid=guid)
        else:
            harvest_object = HarvestObject(
                job=harvest_job,
                extras=create_extras(url, 'change'),
                guid=existing_object.guid,
                package_id=existing_object.package_id)

        harvest_object.add()

        # Check if it is an ISO document
        document_format = guess_standard(content)
        if document_format == 'iso':
            harvest_object.content = content
        else:
            extra = HOExtra(object=harvest_object,
                            key='original_document',
                            value=content)
            extra.save()

            extra = HOExtra(object=harvest_object,
                            key='original_format',
                            value=document_format)
            extra.save()

        harvest_object.save()

        return [harvest_object.id]
Example #42
0
    def gather_stage(self, harvest_job):
        self.log = logging.getLogger(__file__)
        self.log.debug('SCENT Harvester gather_stage for job: %r', harvest_job)

        self.job = harvest_job
        self.source_config = self._get_config(harvest_job)
        max_dataset = self.source_config.get('max_dataset', 100)
        wfs_url = self.source_config.get('wfs_url')
        wfs_version = self.source_config.get('wfs_version')
        collection = self.source_config.get('collection')
        typename = COLLECTION[collection].get('collection_typename')
        tag_typename = COLLECTION[collection].get('tag_typename', None)
        self.update_all =  self.source_config.get('update_all', False)

        last_product_index = (
            self._get_last_harvesting_index(harvest_job.source_id)
        )

        if last_product_index:
            last_product_index = last_product_index + 1
        else:
            last_product_index = 0

        wfs = WFS(url=wfs_url, version=wfs_version)

        wfs.set_collection(typename)
        sortby=['When']

        result = wfs.make_request(max_dataset, sortby, last_product_index)
        entries = result['features']
        name = '{}_{}'.format(collection.lower(), '{}')
        ids = []
        for entry in entries:
            entry_guid = unicode(uuid.uuid4())
            entry_name = name.format(convert_to_clean_snakecase(entry['id']))
            log.debug('gathering %s', entry_name)

            
            content = {}
            content['collection_content'] = entry
            if tag_typename:
                wfs.set_collection(tag_typename)
                filterxml = wfs.set_filter_equal_to('image_id', entry['id'])
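                # the make_request result is immediately replaced by the
                # following get_request call, whose return value is stored
                # as the tag URL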
                result = wfs.make_request(constraint=filterxml)
                result = wfs.get_request(constraint=filterxml)
                content['tag_url'] = result

            package_query = Session.query(Package)
            query_filtered = package_query.filter(Package.name == entry_name)
            package = query_filtered.first()

            if package:
                # Meaning we've previously harvested this,
                # but we may want to reharvest it now.
                previous_obj = Session.query(HarvestObject) \
                    .filter(HarvestObject.guid == entry_guid) \
                    .filter(HarvestObject.current == True) \
                    .first()  # noqa: E712
                if previous_obj:
                    previous_obj.current = False
                    previous_obj.save()

                if self.update_all:
                    log.debug('{} already exists and will be updated.'.format(
                        entry_name))  # noqa: E501
                    status = 'change'

                else:
                    log.debug(
                        '{} will not be updated.'.format(entry_name))  # noqa: E501
                    status = 'unchanged'

            elif not package:
                # It's a product we haven't harvested before.
                log.debug(
                    '{} has not been harvested before. Creating a new harvest object.'.  # noqa: E501
                    format(entry_name))  # noqa: E501
                status = 'new'
            obj = HarvestObject(
                guid=entry_guid,
                job=self.job,
                extras=[
                    HOExtra(key='status', value=status),
                    HOExtra(key='index', value=last_product_index)
                ])
            obj.content = json.dumps(content)
            obj.package = None if status == 'new' else package
            obj.save()
            last_product_index += 1
            ids.append(obj.id)
        return ids
Example #43
0
    def gather_stage(self, harvest_job):
        '''
        analyze the source, return a list of IDs
            and create one HarvestObject per dataset 
        '''
        logger.info('Starts Gather SIU Transp')
        # load paths
        self.set_paths()
        self.siu_data_lib.get_query_files()

        # basic things you'll need
        self.source = harvest_job.source
        self.source_config = json.loads(self.source.config)

        # allow to get config from URL
        # Sample: https://raw.githubusercontent.com/avdata99/ckan-env/develop/docs/full_config.json
        config_from_url = self.source_config.get('from_url', None)
        if config_from_url is not None:
            logger.info('Updating config from URL')
            response = requests.get(config_from_url)
            update_config = response.json()
            self.source_config.update(update_config)

        self.siu_data_lib.base_url = self.source.url
        self.siu_data_lib.username = self.source_config['username']
        self.siu_data_lib.password = self.source_config['password']
        
        # ####################################
        # get previous harvested packages
        pfr = self.get_packages_for_source(harvest_source_id=self.source.id)
        prev_names = [pkg['name'] for pkg in pfr['results']]
        logger.info('Get previous harvested objects {}'.format(prev_names))
        # TODO
        # ####################################
        
        object_ids = []  # list of IDs to process; this is what the function returns
        
        self.source_dataset = get_harvest_source(self.source.id)
        owner_org = self.source_dataset.get('owner_org')
        logger.info('Gather SIU Transp to ORG {}'.format(owner_org))
        
        # Iterate over each query to obtain different sets of data
        # Each file in siu_transp_data/queries will generate multiple datasets to publish
        
        report = []  # summary of all the results
        logger.info('Iter files')
        
        # check whether the config asks to override metadata in the datasets of each file
        override = self.source_config.get('override', {})
        logger.info("General override {}".format(override))
            
        for qf in self.siu_data_lib.query_files:
            only_files = self.source_config.get('only_files', None)
            query_file_name = qf.split('/')[-1]
            if only_files is not None:
                if query_file_name not in only_files:
                    logger.info('Skipping file by config {}'.format(query_file_name))
                    continue
            
            logger.info('Gather SIU Transp FILE {}'.format(qf))
            stqf = SIUTranspQueryFile(portal=self.siu_data_lib, path=qf)
            # open to read query params
            stqf.open()
            # request all data
            stqf.request_all(results_folder_path=self.results_folder_path)
            for err in stqf.errors:
                hgerr = HarvestGatherError(message=err, job=harvest_job)
                hgerr.save()


            # ====== Prepare dict to override datasets metadata ============
            override_this = override.get(query_file_name, {})
            logger.info("To override {}: {}".format(query_file_name, override_this))
            
            # extras need to be {"key": "extra name", "value": "extra value"}
            extras = override_this.get('extras', {})
            new_extras = []
            for extra_key, extra_value in extras.iteritems():
                logger.info("Override extra found {}: {}".format(extra_key, extra_value))
                if not isinstance(extra_value, str):
                    extra_value = str(extra_value)
                new_extras.append({"key": extra_key, "value": extra_value})
            
            if len(new_extras) > 0:
                override_this['extras'] = new_extras

            # tags need to be {"name": "tag name"}
            tags = override_this.get('tags', [])
            new_tags = []
            for tag in tags:
                logger.info("Override tag found {}".format(unicode(tag).encode("utf-8")))
                new_tags.append({"name": tag})
            
            if len(new_tags) > 0:
                override_this['tags'] = new_tags

            # groups need to be {"name": "tag name"}
            groups = override_this.get('groups', [])
            new_groups = []
            for group in groups:
                logger.info("Override group found {}".format(group))
                # check if groups must be created
                context = {'model': model, 'session': model.Session, 'user': self._get_user_name()}
                try:
                    p.toolkit.get_action('group_create')(context, {"name": group})
                except Exception as e:
                    logger.error('Error creating group (skipped) {}: {}'.format(group, e))
                    
                new_groups.append({"name": group})
            
            if len(new_groups) > 0:
                override_this['groups'] = new_groups

            # ================================
                
            report += stqf.requests
            for dataset in stqf.datasets:
                if dataset['name'] in prev_names:
                    action = 'update'
                    # leave this list just with packages to remove
                    prev_names.remove(dataset['name'])
                else:
                    action = 'create'
                logger.info('Dataset {} to {}'.format(dataset['name'], action))
                ho_dict = {
                    'title': dataset['title'],
                    'name': dataset['name'],
                    'owner_org': owner_org,
                    'notes': dataset['notes'],
                    'tags': dataset['tags'],
                    'resources': dataset['resources'],
                    'action': action
                }

                # apply the configured overrides (extras, tags, groups) if any exist
                ho_dict.update(override_this)
                logger.info("Overrided ho_dict {}".format(ho_dict))
                    

                # Each harvest object will be passed to other stages in harvest process
                obj = HarvestObject(guid=dataset['name'],
                                    job=harvest_job,
                                    content=json.dumps(ho_dict))
                obj.save()
                logger.info('Objects ID appends {}'.format(obj.id))
                object_ids.append(obj.id)

        # TODO compare with previous harvested data to remove dataset no more at harvest source

        # final summary
        logger.info('REQUESTS: \n{}'.format('\n\t'.join(report)))
        return object_ids
    def gather_stage(self, harvest_job):
        # The gather stage scans a remote resource (in our case, the /data.json file) for
        # a list of datasets to import.

        log.debug('In datajson harvester gather_stage (%s)' %
                  harvest_job.source.url)

        source = json.load(urllib2.urlopen(harvest_job.source.url))
        if len(source) == 0: return None

        # Loop through the packages we've already imported from this source
        # and go into their extra fields to get their source_datajson_identifier,
        # which corresponds to the /data.json 'identifier' field. Make a mapping
        # so we know how to update existing records.
        existing_datasets = {}
        for hobj in model.Session.query(HarvestObject).filter_by(
                source=harvest_job.source, current=True):
            try:
                pkg = get_action('package_show')(self.context(), {
                    "id": hobj.package_id
                })
            except:
                # reference is broken
                continue
            for extra in pkg["extras"]:
                if extra["key"] == "source_datajson_identifier":
                    existing_datasets[extra["value"]] = hobj.package_id

        # If we've lost an association to the HarvestSource, scan all packages in the database.
        if False:
            for pkg in model.Session.query(Package):
                if pkg.extras.get("source_datajson_url") == harvest_job.source.url \
                    and pkg.extras.get("source_datajson_identifier"):
                    existing_datasets[
                        pkg.extras["source_datajson_identifier"]] = pkg.id

        # Create HarvestObjects for any records in the /data.json file.

        object_ids = []
        seen_datasets = set()

        for dataset in source:
            # Create a new HarvestObject for this identifier and save the
            # dataset metadata inside it for later.

            # Get the package_id of this resource if we've already imported
            # it into our system. Otherwise, assign a brand new GUID to the
            # HarvestObject. I'm not sure what the point is of that.

            if dataset['identifier'] in existing_datasets:
                pkg_id = existing_datasets[dataset["identifier"]]
                seen_datasets.add(pkg_id)
            else:
                pkg_id = uuid.uuid4().hex

            # Create a new HarvestObject and store in it the GUID of the
            # existing dataset (if it exists here already) and the dataset's
            # metadata from the /data.json file.
            obj = HarvestObject(guid=pkg_id,
                                job=harvest_job,
                                content=json.dumps(dataset))
            obj.save()
            object_ids.append(obj.id)

        # Remove packages no longer in the /data.json file.
        for id in existing_datasets.values():
            if id not in seen_datasets:
                log.warn('deleting package %s because it is no longer in %s' %
                         (id, harvest_job.source.url))
                # soft-delete the package by setting its state
                Session.query(Package).filter(Package.id == id) \
                    .update({'state': 'deleted'}, synchronize_session=False)

        return object_ids
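
    # Illustrative (made-up) shape of the /data.json content the gather stage
    # above expects: a JSON array of dataset dicts, each with an 'identifier'
    # that stays stable across harvest runs. All values are placeholders.
    SAMPLE_DATAJSON_SOURCE = [
        {
            "identifier": "example-agency-dataset-001",
            "title": "Example Dataset",
            "description": "Illustrative record only.",
            "distribution": [
                {"downloadURL": "https://example.org/data.csv",
                 "mediaType": "text/csv"}
            ]
        }
    ]
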
    def _gather_entry(self, entry, path, row, update_all=False):
        # Create a harvest object for each entry
        entry_guid = unicode(uuid.uuid4())
        entry_name = entry.lower()  # noqa: E501
        log.debug('gathering %s', entry)

        package_query = Session.query(Package)
        query_filtered = package_query.filter(Package.name == entry_name)
        package = query_filtered.first()

        if package:
            # Meaning we've previously harvested this,
            # but we may want to reharvest it now.
            previous_obj = Session.query(HarvestObject) \
                .filter(HarvestObject.guid == entry_guid) \
                .filter(HarvestObject.current == True) \
                .first()  # noqa: E712
            if previous_obj:
                previous_obj.current = False
                previous_obj.save()

            if update_all:
                log.debug('{} already exists and will be updated.'.format(
                    entry_name))  # noqa: E501
                status = 'change'
                obj = HarvestObject(
                    guid=entry_guid,
                    job=self.job,
                    extras=[
                        HOExtra(key='status', value=status),
                        HOExtra(key='path', value=path),
                        HOExtra(key='row', value=row)
                    ])
                obj.content = entry
                obj.package = package
                obj.save()
                return obj.id

            else:
                log.debug(
                    '{} will not be updated.'.format(entry_name))  # noqa: E501
                status = 'unchanged'
                obj = HarvestObject(
                    guid=entry_guid,
                    job=self.job,
                    extras=[
                        HOExtra(key='status', value=status),
                        HOExtra(key='path', value=path),
                        HOExtra(key='row', value=row)
                    ])
                obj.content = entry
                obj.package = package
                obj.save()
                return obj.id

        elif not package:
            # It's a product we haven't harvested before.
            log.debug(
                '{} has not been harvested before. Creating a new harvest object.'.  # noqa: E501
                format(entry_name))  # noqa: E501
            obj = HarvestObject(
                guid=entry_guid,
                job=self.job,
                extras=[
                    HOExtra(key='status', value='new'),
                    HOExtra(key='path', value=path),
                    HOExtra(key='row', value=row)
                ])
            obj.content = entry
            obj.package = None
            obj.save()
            return obj.id
    def _parse_products(self, products):  # noqa: E501
        """
        Iterate through the results, create harvest objects,
        and return the ids.
        """
        ids = []
        new_counter = 0

        # Create a harvest object for each entry
        for entry in products:

            entry_guid = entry['imgtif'].split('/')[1].lower(
            ) + "_" + entry['type'] + "_" + str(entry['intid'])
            entry_name = entry['imgtif'].split('/')[1].lower(
            ) + "_" + entry['type'] + "_" + str(entry['intid'])
            entry_restart_date = entry['master']

            package = Session.query(Package) \
                .filter(Package.name == entry_name).first()

            if package:
                # Meaning we've previously harvested this,
                # but we may want to reharvest it now.
                previous_obj = model.Session.query(HarvestObject) \
                    .filter(HarvestObject.guid == entry_guid) \
                    .filter(HarvestObject.current == True) \
                    .first()  # noqa: E712
                if previous_obj:
                    previous_obj.current = False
                    previous_obj.save()

                if self.update_all:
                    log.debug('{} already exists and will be updated.'.format(
                        entry_name))  # noqa: E501
                    status = 'change'
                else:
                    log.debug('{} will not be updated.'.format(
                        entry_name))  # noqa: E501
                    status = 'unchanged'

                obj = HarvestObject(guid=entry_guid,
                                    job=self.job,
                                    extras=[
                                        HOExtra(key='status', value=status),
                                        HOExtra(key='restart_date',
                                                value=entry_restart_date)
                                    ])
                obj.content = json.dumps(entry)
                obj.package = package
                obj.save()
                ids.append(obj.id)

            elif not package:
                # It's a product we haven't harvested before.
                log.debug(
                    '{} has not been harvested before. Creating a new harvest object.'
                    .format(entry_name))  # noqa: E501
                obj = HarvestObject(guid=entry_guid,
                                    job=self.job,
                                    extras=[
                                        HOExtra(key='status', value='new'),
                                        HOExtra(key='restart_date',
                                                value=entry_restart_date)
                                    ])
                new_counter += 1
                obj.content = json.dumps(entry)
                obj.package = None
                obj.save()
                ids.append(obj.id)

        harvester_msg = '{:<12} | {} | jobID:{} | {} | {}'
        if hasattr(self, 'harvester_logger'):
            timestamp = str(datetime.utcnow())
            self.harvester_logger.info(
                harvester_msg.format(self.provider, timestamp, self.job.id,
                                     new_counter, 0))  # noqa: E128, E501

        return ids
Example #47
0
    def gather_stage(self, harvest_job):
        requests_cache.install_cache()
        requests_cache.clear()

        session = requests_cache.CachedSession()

        self.log = logging.getLogger(__file__)
        self.log.debug('OSCAR Harvester gather_stage for job: %r', harvest_job)

        self.job = harvest_job
        self.source_config = self._get_config(harvest_job)
        base_url = self.source_config.get('oai_pmh_url')
        metadata_prefix = self.source_config.get('metadata_prefix')
        start_date = self.source_config.get('start_date', None)
        self.update_all = self.source_config.get('update_all', False)

        last_token = self._get_last_harvesting_index(self.job.source_id,
                                                     'last_token')
        next_token = self._get_last_harvesting_index(self.job.source_id,
                                                     'next_token')
        next_station = self._get_last_harvesting_index(self.job.source_id,
                                                       'next_station')
        restart_date = self._get_last_harvesting_index(self.job.source_id,
                                                       'restart_date')
        restart_date = restart_date if last_token else None

        ids = []
        first_query = True
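        # keep paging through ListIdentifiers until at least one harvest
        # object has been created or there is no further resumption token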
        while (ids == [] and next_token) or first_query:
            first_query = False

            current_token = last_token if next_station else next_token
            if current_token:
                query_url = "{}?verb=ListIdentifiers&resumptionToken={}".format(
                    base_url, current_token)
            elif restart_date:
                query_url = "{}?verb=ListIdentifiers&metadataPrefix={}&from={}".format(
                    base_url, metadata_prefix, restart_date)
            elif start_date:
                query_url = "{}?verb=ListIdentifiers&metadataPrefix={}&from={}".format(
                    base_url, metadata_prefix, start_date)
            else:
                query_url = "{}?verb=ListIdentifiers&metadataPrefix={}".format(
                    base_url, metadata_prefix)

            self.log.debug('Querying: {}.'.format(query_url))
            raw_list_ids = self.get_list_identifiers(session, query_url)

            list_stations, largest_datastamp = self.get_station_ids(
                raw_list_ids)

            next_token = self.get_resumption_token(raw_list_ids)
            last_token = current_token
            restart_date = restart_date if restart_date else ''
            restart_date = largest_datastamp if largest_datastamp > restart_date else restart_date

            if list_stations == []:
                next_station = None
            else:
                valid_deployment = None
                station_index = 0
                while not valid_deployment and station_index <= len(
                        list_stations) - 1:
                    station = list_stations[station_index]
                    next_station = None if (next_station
                                            == station) else next_station
                    if not next_station:
                        station_query = '{}?verb=GetRecord&metadataPrefix={}&identifier={}'.format(
                            base_url, metadata_prefix, station)
                        print('Querying station: {}.'.format(station))
                        record = self.get_record(session, station_query)
                        if record:
                            station_info = StationInfo(record)
                            if station_info.isValid():
                                station_info.id = station
                                observation_list = station_info.get_observations(
                                )
                                station_dict = station_info.get_dict()
                                station_info = None
                                for observation in observation_list:
                                    observation_info = ObservationInfo(
                                        session, observation)
                                    deployments_list = observation_info.get_deployments(
                                    )
                                    observation_dict = observation_info.get_dict(
                                    )
                                    observation_info = None
                                    for deployment in deployments_list:
                                        deployment_info = DeploymentInfo(
                                            session, deployment)
                                        if deployment_info.isValid():
                                            deployment_dict = deployment_info.get_dict(
                                            )
                                            deployment_info = None
                                            valid_deployment = True
                                            if station_index + 1 <= len(
                                                    list_stations) - 1:
                                                next_station = list_stations[
                                                    station_index + 1]
                                            else:
                                                next_station = None
                                            entry_guid = unicode(uuid.uuid4())
                                            entry_id = '{}_{}'.format(
                                                station_dict['id'],
                                                deployment_dict['id'])
                                            entry_name = clean_snakecase(
                                                entry_id)
                                            self.log.debug(
                                                'Gathering %s', entry_name)

                                            content = {}
                                            content['station'] = station_dict
                                            content[
                                                'observation'] = observation_dict
                                            content[
                                                'deployment'] = deployment_dict

                                            package_query = Session.query(
                                                Package)
                                            query_filtered = package_query.filter(
                                                Package.name == entry_name)
                                            package = query_filtered.first()

                                            if package:
                                                # Meaning we've previously harvested this,
                                                # but we may want to reharvest it now.
                                                previous_obj = Session.query(HarvestObject) \
                                                    .filter(HarvestObject.guid == entry_guid) \
                                                    .filter(HarvestObject.current == True) \
                                                    .first()  # noqa: E712
                                                if previous_obj:
                                                    previous_obj.current = False
                                                    previous_obj.save()

                                                if self.update_all:
                                                    self.log.debug(
                                                        '{} already exists and will be updated.'
                                                        .format(entry_name)
                                                    )  # noqa: E501
                                                    status = 'change'

                                                else:
                                                    self.log.debug(
                                                        '{} will not be updated.'
                                                        .format(entry_name)
                                                    )  # noqa: E501
                                                    status = 'unchanged'

                                            elif not package:
                                                # It's a product we haven't harvested before.
                                                self.log.debug(
                                                    '{} has not been harvested before. Creating a new harvest object.'
                                                    .  # noqa: E501
                                                    format(entry_name
                                                           ))  # noqa: E501
                                                status = 'new'
                                            obj = HarvestObject(
                                                guid=entry_guid,
                                                job=self.job,
                                                extras=[
                                                    HOExtra(key='status',
                                                            value=status),
                                                    HOExtra(key='last_token',
                                                            value=last_token),
                                                    HOExtra(key='next_token',
                                                            value=next_token),
                                                    HOExtra(
                                                        key='next_station',
                                                        value=next_station),
                                                    HOExtra(key='restart_date',
                                                            value=restart_date)
                                                ])
                                            obj.content = json.dumps(content)
                                            obj.package = None if status == 'new' else package
                                            obj.save()
                                            ids.append(obj.id)

                                if not valid_deployment:
                                    self.log.debug(
                                        'Station {} does not have valid deployments.'
                                        .format(station))
                            else:
                                self.log.debug(
                                    'Station {} is not valid.'.format(station))
                    station_index += 1
        return ids
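
    # Hedged sketch (not part of the harvester above) of resumption-token
    # extraction from a raw OAI-PMH ListIdentifiers response; the real
    # get_resumption_token helper used in gather_stage may differ.
    def read_resumption_token(self, response_bytes):
        from lxml import etree  # kept local so the sketch stays self-contained
        ns = {'oai': 'http://www.openarchives.org/OAI/2.0/'}
        root = etree.fromstring(response_bytes)
        token = root.find('.//oai:resumptionToken', namespaces=ns)
        if token is not None and (token.text or '').strip():
            return token.text.strip()
        return None
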
    def reimport_batch(self, package_ids, context):
        '''Batch-reimport all packages in `package_ids` from their original
           harvest source.'''

        ckan_fb_mapping = {}

        # first, do checks that can be done without connection to FIS-Broker
        for package_id in package_ids:
            package = Package.get(package_id)

            if not package:
                raise PackageIdDoesNotExistError(package_id)

            if not dataset_was_harvested(package):
                raise PackageNotHarvestedError(package_id)

            harvester = harvester_for_package(package)
            harvester_url = harvester.url
            harvester_type = harvester.type
            if not harvester_type == HARVESTER_ID:
                raise PackageNotHarvestedInFisbrokerError(package_id)

            fb_guid = fisbroker_guid(package)
            if not fb_guid:
                raise NoFisbrokerIdError(package_id)

            ckan_fb_mapping[package.id] = fb_guid

        # get the harvest source for FIS-Broker datasets
        fb_source = get_fisbroker_source()
        if not fb_source:
            raise NoFBHarvesterDefined()
        source_id = fb_source.get('id', None)

        # Create and start a new harvest job
        job_dict = toolkit.get_action('harvest_job_create')(context, {'source_id': source_id})
        harvest_job = HarvestJob.get(job_dict['id'])
        harvest_job.gather_started = datetime.datetime.utcnow()
        assert harvest_job

        # instatiate the CSW connector (on the reasonable assumption that harvester_url is
        # the same for all package_ids)
        package_id = None
        reimported_packages = []
        try:
            csw = CatalogueServiceWeb(harvester_url)
            for package_id, fb_guid in ckan_fb_mapping.items():
                # query connector to get resource document
                csw.getrecordbyid([fb_guid], outputschema=namespaces['gmd'])

                # show resource document
                record = csw.records.get(fb_guid, None)
                if record:
                    obj = HarvestObject(guid=fb_guid,
                                        job=harvest_job,
                                        content=record.xml,
                                        package_id=package_id,
                                        extras=[
                                            HarvestObjectExtra(key='status',value='change'),
                                            HarvestObjectExtra(key='type',value='reimport'),
                                        ])
                    obj.save()

                    assert obj, obj.content

                    harvester = FisbrokerPlugin()
                    harvester.force_import = True
                    harvester.import_stage(obj)
                    rejection_reason = self._dataset_rejected(obj)
                    if rejection_reason:
                        raise FBImportError(package_id, rejection_reason)

                    harvester.force_import = False
                    Session.refresh(obj)

                    reimported_packages.append(record)

                else:
                    raise NotFoundInFisbrokerError(package_id, fb_guid)

        except RequestException as error:
            raise NoConnectionError(package_id, harvester_url, str(error.__class__.__name__))


        # successfully finish harvest job
        harvest_job.status = u'Finished'
        harvest_job.finished = datetime.datetime.utcnow()
        harvest_job.save()

        return reimported_packages