Example #1
    def gather_stage(self, harvest_job):
        log.debug('In ZhstatHarvester gather_stage')

        ids = []
        parser = etree.XMLParser(encoding='utf-8')

        for dataset in etree.fromstring(self._fetch_metadata(), parser=parser):

            # Get the german data if one is available,
            # otherwise get the first one
            base_datas = dataset.xpath("data[@xml:lang='de']")
            if len(base_datas) != 0:
                base_data = base_datas[0]
            else:
                base_data = dataset.find('data')

            metadata = self._generate_metadata(base_data, dataset)

            if metadata:
                obj = HarvestObject(guid=dataset.get('id'),
                                    job=harvest_job,
                                    content=json.dumps(metadata))
                obj.save()
                log.debug('adding ' + dataset.get('id') + ' to the queue')
                ids.append(obj.id)
            else:
                log.debug(
                    'Skipping %s since no resources or groups are available' %
                    dataset.get('id'))

        return ids
Example #2
    def gather_stage(self, harvest_job):
        log.debug('In ArrayExpressHarvester.gather_stage(%s)' % harvest_job.source.url)
        # Get feed contents
        self._set_config(harvest_job.source.config)
        
        #previous_job = Session.query(HarvestJob) \
        #                .filter(HarvestJob.source==harvest_job.source) \
        #                .filter(HarvestJob.gather_finished!=None) \
        #                .filter(HarvestJob.id!=harvest_job.id) \
        #                .order_by(HarvestJob.gather_finished.desc()) \
        #                .limit(1).first()

        baseURL = harvest_job.source.url+"/xml/v2/experiments"
        #if (previous_job and not previous_job.gather_errors
        #    and not len(previous_job.objects) == 0):
        #    if not self.config.get('force_all',False):
        #        last_time = harvest_job.gather_started.isoformat()
        #        today = format(datetime.date.today())
        #        self.params['date'] = '['+last_time+' '+today+']'
        url = baseURL + "?" + self.getParams()

        print "Fetching from "+url
        doc = etree.parse(url)
        ids = []
        for accessionElement in doc.findall('//experiment/accession'):
            accession = accessionElement.text.strip()
            obj = HarvestObject(guid=accession, job=harvest_job, content=accession)
            print "ArrayExpress accession: "+accession
            obj.save()
            
            ids.append(obj.id)
        print ids
        return ids
Example #3
    def gather_stage(self,harvest_job):
        log.debug('In SocrataHarvester 2 gather_stage (%s)' % harvest_job.source.url)
        get_all_packages = True

        dcatUrl = "%s/api/dcat.rdf" % harvest_job.source.url.rstrip('/')
        log.debug(dcatUrl)

        adaptorInstance = socrataAdaptor()
        package_ids = adaptorInstance.listDatasetIds(dcatUrl)

        try:
            object_ids = []
            if len(package_ids):
                for package_id in package_ids:
                    # Create a new HarvestObject for this identifier
                    obj = HarvestObject(guid = package_id, job = harvest_job)
                    obj.save()
                    object_ids.append(obj.id)

                return object_ids

            else:
                self._save_gather_error('No packages received for URL: %s' % dcatUrl,
                    harvest_job)
                return None
        except Exception, e:
            self._save_gather_error('%r' % e.message, harvest_job)
Example #4
    def gather_stage(self, harvest_job):
        log.debug('In NTPCHarvester gather_stage (%s)' % harvest_job.source.url)

        url = self.PREFIX_URL + self.CATALOGUE_INDEX_URL
        get_all_packages = True
        try:
            package_ids = []
            dataset_count = self._get_ntpc_dataset_count(url)
            msg_count = 0
            for x in range(dataset_count/10 + 1):
                page_url = url + '?currentPage=%s' % (x + 1)
                data = urllib2.urlopen(page_url)
                doc = html.parse(data)
                for div in doc.findall("//a[@href]"):
                    if '/NTPC/od/query;' in div.attrib['href']:
                        link = div.attrib['href']
                        id = sha1(link).hexdigest()
                        obj = HarvestObject(guid=id, job=harvest_job, content=link)
                        obj.save()
                        package_ids.append(obj.id)
                        msg_count = msg_count + 1

            if msg_count == 0:
                self._save_gather_error('No packages received for URL: %s' % url,
                        harvest_job)
                return None

        except Exception, e:
            self._save_gather_error('%r'%e.message,harvest_job)
Example #5
    def gather_stage(self, harvest_job):

        """Retrieve datasets"""
        
        log.debug('In RostockTestHarvester gather_stage (%s)' % harvest_job.source.url)
        package_ids = []
        self._set_config(None)

        base_url = harvest_job.source.url.rstrip('/')
        package_list_url = base_url + '/api/rest/package'
        content = self._get_content(package_list_url)
        
        package_ids = json.loads(content)

        try:
            object_ids = []
            if len(package_ids):
                for package_id in package_ids:                                      
                    obj = HarvestObject(guid = package_id, job = harvest_job)
                    obj.save()
                    object_ids.append(obj.id)
                    log.info('Got ID from source: %s' %package_id)
                return object_ids

            else:
                self._save_gather_error('No packages received for URL: %s' % package_list_url,
                                        harvest_job)
                return None
        except Exception, e:
            self._save_gather_error('%r'%e.message,harvest_job)
Example #6
    def gather_stage(self, harvest_job):
        """Retrieve datasets"""

        log.debug('In ' + self.city + 'CKANHarvester gather_stage (%s)' % harvest_job.source.url)
        package_ids = []
        self._set_config(None)

        base_url = harvest_job.source.url.rstrip('/')
        package_list_url = base_url + '/3/action/package_list'
        content = self._get_content(package_list_url)

        content_json = json.loads(content)
        package_ids = content_json['result']

        try:
            object_ids = []
            if len(package_ids):
                for package_id in package_ids:
                    obj = HarvestObject(guid=package_id, job=harvest_job)
                    obj.save()
                    object_ids.append(obj.id)
                return object_ids

            else:
                self._save_gather_error('No packages received for URL: %s' % package_list_url,
                                        harvest_job)
                return None
        except Exception, e:
            self._save_gather_error('%r' % e.message, harvest_job)
Example #7
    def _gather_ids(self, url=None, jar=None):
        log.debug('Page %s' % self.page)
        if jar is None:
            jar = CookieJar()
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(jar))
        url = url or self.INITIAL_INDEX
        fh = opener.open(url)
        doc = html.parse(fh)
        fh.close()

        new_ids = []
        for a in doc.findall(".//div[@class='main']//a"):
            href = a.get('href').split('?', 1)[0]
            id = href.split('/').pop()
            if not id in self.gathered_ids:
                log.debug('Got Id: %s' % id)
                obj = HarvestObject(guid=sha1(id).hexdigest(), job=self.job, content=id)
                obj.save()

                self.object_ids.append(obj.id)

                new_ids.append(id)

        if len(new_ids) == 0: #or self.page == 2:
            return self.gathered_ids
        else:
            self.gathered_ids.extend(new_ids)

        inputs = []
        for input in doc.findall(".//form[@id='main_form']//input"):
            inputs.append((input.get('name'), input.get('value')))
        inputs.append(('listbox_nextPage:method', ''))
        next_url = self.INDEX_URL + '?' + urllib.urlencode(inputs)
        self.page = self.page + 1
        return self._gather_ids(url=next_url,jar=jar)
Example #8
    def gather_stage(self, harvest_job):
        log.debug('In FSOHarvester gather_stage')

        http = urllib3.PoolManager()
        metadata_file = http.request('GET', self.METADATA_FILE_URL)

        ids = []
        parser = etree.XMLParser(encoding='utf-8')
        for package in etree.fromstring(metadata_file.data, parser=parser):

            # Get the german dataset if one is available, otherwise get the first one
            base_datasets = package.xpath("dataset[@xml:lang='de']")
            if len(base_datasets) != 0:
                base_dataset = base_datasets[0]
            else:
                base_dataset = package.find('dataset')

            metadata = self._generate_metadata(base_dataset, package)
            if metadata:
                obj = HarvestObject(
                    guid = self._create_uuid(base_dataset.get('datasetID')),
                    job = harvest_job,
                    content = json.dumps(metadata)
                )
                obj.save()
                log.debug('adding ' + base_dataset.get('datasetID') + ' to the queue')
                ids.append(obj.id)
            else:
                log.debug('Skipping ' + base_dataset.get('datasetID') + ' since no resources or groups are available')

        return ids
Example #9
    def gather_stage(self, harvest_job):
        """Retrieve datasets"""

        log.debug('In KoelnCKANHarvester gather_stage (%s)' %
                  harvest_job.source.url)
        package_ids = []
        self._set_config(None)

        base_url = harvest_job.source.url.rstrip('/')
        package_list_url = base_url + '/3/action/package_list'
        content = self._get_content(package_list_url)

        content_json = json.loads(content)
        package_ids = content_json['result']

        try:
            object_ids = []
            if len(package_ids):
                for package_id in package_ids:
                    obj = HarvestObject(guid=package_id, job=harvest_job)
                    obj.save()
                    object_ids.append(obj.id)
                return object_ids

            else:
                self._save_gather_error(
                    'No packages received for URL: %s' % package_list_url, harvest_job)
                return None
        except Exception, e:
            self._save_gather_error('%r' % e.message, harvest_job)
Example #10
    def gather_stage(self, harvest_job):
        log.debug('In ZhstatHarvester gather_stage')

        ids = []
        parser = etree.XMLParser(encoding='utf-8')

        for dataset in etree.fromstring(self._fetch_metadata(), parser=parser):

            # Get the german data if one is available,
            # otherwise get the first one
            base_datas = dataset.xpath("data[@xml:lang='de']")
            if len(base_datas) != 0:
                base_data = base_datas[0]
            else:
                base_data = dataset.find('data')

            metadata = self._generate_metadata(base_data, dataset)

            if metadata:
                obj = HarvestObject(
                    guid=dataset.get('id'),
                    job=harvest_job,
                    content=json.dumps(metadata)
                )
                obj.save()
                log.debug('adding ' + dataset.get('id') + ' to the queue')
                ids.append(obj.id)
            else:
                log.debug(
                    'Skipping %s since no resources or groups are available'
                    % dataset.get('id')
                )

        return ids
Example #11
    def _gather_object(self, job, url, size, start_date, forecast_date):
        filename = parse_filename(url)
        # Strip the version/format suffixes to obtain a stable identifier
        filename_id = filename
        for suffix in ('-v02.0-fv02.0', '-fv02.0', '-sv01.00', '-sv05.00',
                       '-v02', '-sv10.00', '-sv09.00', '-sv07.00'):
            filename_id = filename_id.replace(suffix, '')

        status, package = self._was_harvested(filename_id, self.update_all)

        extras = [HOExtra(key='status', value=status)]
        assert start_date
        content = json.dumps(
            {
                'identifier': filename_id,
                'ftp_link': url,
                'size': size,
                'start_date': start_date,
                'forecast_date': forecast_date,
                'restart_date': start_date
            },
            default=str)
        obj = HarvestObject(job=job, guid=url, extras=extras, content=content)
        obj.package = package
        obj.save()
        return obj.id
Example #12
    def _run_import(self, xml, job):
        if not model.User.get('harvest'):
            model.User(name='harvest', sysadmin=True).save()
        if not model.Group.get('test'):
            get_action('organization_create')({
                'user': '******'
            }, {
                'name': 'test'
            })

        record = _get_record(xml)

        metadata = CmdiReader()(record)
        metadata['unified']['owner_org'] = "test"

        harvest_object = HarvestObject()
        harvest_object.content = json.dumps(metadata.getMap())
        harvest_object.id = xml
        harvest_object.guid = xml
        harvest_object.source = job.source
        harvest_object.harvest_source_id = None
        harvest_object.job = job
        harvest_object.save()

        self.harvester.import_stage(harvest_object)
        return harvest_object
Example #13
    def gather_stage(self, harvest_job):
        log.debug('In SNLHarvester gather_stage')

        metadata_path = self._fetch_metadata_file()
        ids = []
        try:
            parser = MetaDataParser(metadata_path)

            for dataset in parser.list_datasets():
                metadata = parser.parse_set(dataset)
                metadata['translations'].extend(
                    self._metadata_term_translations()
                )

                log.debug(metadata)

                obj = HarvestObject(
                    guid=metadata['id'],
                    job=harvest_job,
                    content=json.dumps(metadata)
                )
                obj.save()
                log.debug('adding ' + metadata['id'] + ' to the queue')
                ids.append(obj.id)
        finally:
            temp_dir = os.path.dirname(metadata_path)
            log.debug('Deleting directory ' + temp_dir)
            shutil.rmtree(temp_dir)

        return ids
Example #14
    def gather_stage(self,harvest_job):
        log.debug('In SRDAHarvester gather_stage (%s)' % harvest_job.source.url)

        get_all_packages = True
        package_ids = []

        data = urllib2.urlopen(self.PREFIX_URL + self.CATALOGUE_INDEX_URL)
        doc = html.parse(data)
        for td in doc.findall("//td[@class='left_p12_title']/a"):
            link = td.get('href')
            if re.match(r"/search/fsciitem", link):
                id = sha1(link).hexdigest()
                obj = HarvestObject(guid=id, job= harvest_job, content=link)
                obj.save()
                package_ids.append(obj.id)

        self._set_config(harvest_job.source.config)

        # Check if this source has been harvested before
        previous_job = Session.query(HarvestJob) \
                        .filter(HarvestJob.source==harvest_job.source) \
                        .filter(HarvestJob.gather_finished!=None) \
                        .filter(HarvestJob.id!=harvest_job.id) \
                        .order_by(HarvestJob.gather_finished.desc()) \
                        .limit(1).first()

        return package_ids
Example #15
    def setup_class(cls):
        try:
            from ckanext.harvest.model import HarvestObject, HarvestJob, HarvestSource, HarvestObjectExtra
        except ImportError:
            raise SkipTest('The harvester extension is needed for these tests')

        cls.content1 = '<xml>Content 1</xml>'
        ho1 = HarvestObject(
            guid='test-ho-1',
            job=HarvestJob(source=HarvestSource(url='http://', type='xx')),
            content=cls.content1)

        cls.content2 = '<xml>Content 2</xml>'
        cls.original_content2 = '<xml>Original Content 2</xml>'
        ho2 = HarvestObject(
            guid='test-ho-2',
            job=HarvestJob(source=HarvestSource(url='http://', type='xx')),
            content=cls.content2)

        hoe = HarvestObjectExtra(key='original_document',
                                 value=cls.original_content2,
                                 object=ho2)

        Session.add(ho1)
        Session.add(ho2)
        Session.add(hoe)
        Session.commit()

        cls.object_id_1 = ho1.id
        cls.object_id_2 = ho2.id
Example #16
    def gather_stage(self, harvest_job):
        """
        The gather stage will receive a HarvestJob object and will be
        responsible for:
            - gathering all the necessary objects to fetch on a later
              stage (e.g. for a CSW server, perform a GetRecords request)
            - creating the necessary HarvestObjects in the database, specifying
              the guid and a reference to its source and job.
            - creating and storing any suitable HarvestGatherErrors that may
              occur.
            - returning a list with all the ids of the created HarvestObjects.

        :param harvest_job: HarvestJob object
        :returns: A list of HarvestObject ids
        """
        log.debug("in gather stage: %s" % harvest_job.source.url)
        try:
            harvest_obj_ids = []
            registry = self._create_metadata_registry()
            self._set_config(harvest_job.source.config)
            client = oaipmh.client.Client(harvest_job.source.url, registry, self.credentials)

            client.identify()  # check if identify works
            for header in self._identifier_generator(client):
                harvest_obj = HarvestObject(guid=header.identifier(), job=harvest_job)
                harvest_obj.save()
                harvest_obj_ids.append(harvest_obj.id)
        except:
            log.exception("Gather stage failed %s" % harvest_job.source.url)
            self._save_gather_error("Could not gather anything from %s!" % harvest_job.source.url, harvest_job)
            return None
        return harvest_obj_ids
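The docstring above spells out the gather-stage contract from ckanext-harvest: create HarvestObjects carrying a guid and return their ids so a later stage can fetch and import them. As a complement, here is a minimal, hypothetical sketch of the fetch stage that could pair with such a gather stage; the _get_content helper and the error message are assumptions, not taken from the example:

    def fetch_stage(self, harvest_object):
        # Retrieve the remote record identified by the guid stored during gather.
        # _get_content() is a hypothetical HTTP helper defined by the harvester.
        try:
            harvest_object.content = self._get_content(harvest_object.guid)
            harvest_object.save()
            return True
        except Exception as e:
            self._save_object_error(
                'Unable to fetch content for %s: %s' % (harvest_object.guid, e),
                harvest_object)
            return False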
Example #17
    def gather_stage(self, harvest_job):
        log.debug('In SRDAHarvester gather_stage (%s)' %
                  harvest_job.source.url)

        get_all_packages = True
        package_ids = []

        data = urllib2.urlopen(self.PREFIX_URL + self.CATALOGUE_INDEX_URL)
        doc = html.parse(data)
        for td in doc.findall("//td[@class='left_p12_title']/a"):
            link = td.get('href')
            if re.match(r"/search/fsciitem", link):
                id = sha1(link).hexdigest()
                obj = HarvestObject(guid=id, job=harvest_job, content=link)
                obj.save()
                package_ids.append(obj.id)

        self._set_config(harvest_job.source.config)

        # Check if this source has been harvested before
        previous_job = Session.query(HarvestJob) \
                        .filter(HarvestJob.source==harvest_job.source) \
                        .filter(HarvestJob.gather_finished!=None) \
                        .filter(HarvestJob.id!=harvest_job.id) \
                        .order_by(HarvestJob.gather_finished.desc()) \
                        .limit(1).first()

        return package_ids
Example #18
def harvest_object_create(context, data_dict):
    """ Create a new harvest object

    :type guid: string (optional)
    :type content: string (optional)
    :type job_id: string 
    :type source_id: string (optional)
    :type package_id: string (optional)
    :type extras: dict (optional)
    """
    check_access('harvest_object_create', context, data_dict)
    data, errors = _validate(data_dict, harvest_object_create_schema(), context)

    if errors:
        raise logic.ValidationError(errors)

    obj = HarvestObject(
        guid=data.get('guid'),
        content=data.get('content'),
        job=data['job_id'],
        harvest_source_id=data.get('source_id'),
        package_id=data.get('package_id'),
        extras=[ HarvestObjectExtra(key=k, value=v) 
            for k, v in data.get('extras', {}).items() ]
    )

    obj.save()
    return harvest_object_dictize(obj, context)
Example #19
    def test_zfaulty_xml_unknown_errors(self):
        harv, job = self._create_harvester()
        res = "http://www.fsd.uta.fi/fi/aineistot/luettelo/FSD0115/FSD0115.xml"
        urllib2.urlopen = mock.Mock(return_value=StringIO(res))
        gathered = harv.gather_stage(job)

        urllib2.urlopen = mock.Mock(return_value=open("FSD2355.xml"))
        harvest_obj = HarvestObject.get(gathered[0])
        self.assert_(harv.fetch_stage(harvest_obj))
        self.assert_(harv.import_stage(harvest_obj))
        print Package.text_search(\
                            Session.query(Package),
                            'Kansalaiskeskustelu ydinvoimasta 2006').all()
        self.assert_(len(Package.text_search(\
                            Session.query(Package),
                            'Kansalaiskeskustelu ydinvoimasta 2006').all()) >= 1)

        res = "http://www.fsd.uta.fi/fi/aineistot/luettelo/FSD0115/FSD0115.xml"
        urllib2.urlopen = mock.Mock(return_value=StringIO(res))
        gathered = harv.gather_stage(job)
        urllib2.urlopen = mock.Mock(return_value=open("FSD2362.xml"))
        harvest_obj = HarvestObject.get(gathered[0])
        self.assert_(harv.fetch_stage(harvest_obj))
        self.assert_(harv.import_stage(harvest_obj))
        self.assert_(len(Package.text_search(\
                                Session.query(Package),
                                'Energia-asennetutkimus 2004').all()) >= 1)
Example #20
def harvest_object_create(context, data_dict):
    """ Create a new harvest object

    :type guid: string (optional)
    :type content: string (optional)
    :type job_id: string 
    :type source_id: string (optional)
    :type package_id: string (optional)
    :type extras: dict (optional)
    """
    check_access('harvest_object_create', context, data_dict)
    data, errors = _validate(data_dict, harvest_object_create_schema(),
                             context)

    if errors:
        raise logic.ValidationError(errors)

    obj = HarvestObject(guid=data.get('guid'),
                        content=data.get('content'),
                        job=data['job_id'],
                        harvest_source_id=data.get('source_id'),
                        package_id=data.get('package_id'),
                        extras=[
                            HarvestObjectExtra(key=k, value=v)
                            for k, v in data.get('extras', {}).items()
                        ])

    obj.save()
    return harvest_object_dictize(obj, context)
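For illustration, a minimal sketch of how the action above might be invoked through CKAN's action layer; the context flags, job id and guid are placeholders assumed for the example, not values taken from the surrounding code:

import ckan.plugins.toolkit as toolkit

# Hypothetical call; 'some-existing-harvest-job-id' must be the id of a real HarvestJob.
created = toolkit.get_action('harvest_object_create')(
    {'ignore_auth': True},
    {
        'guid': 'remote-dataset-guid',
        'content': '<xml>remote record</xml>',
        'job_id': 'some-existing-harvest-job-id',
        'extras': {'status': 'new'},
    })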
Example #21
    def gather_stage(self, harvest_job):
        try:
            config = json.loads(harvest_job.source.config)
            ckan_term_url = config['ckan_term_url']
        except Exception as e:
            log.exception(e)
            raise ConfigError(
                "In order to run the translation harvester "
                "you need to specify 'ckan_term_url' "
                "in your harvester config json"
            )

        log.debug('Gathering term from %s' % ckan_term_url)
        try:
            terms = self._get_terms(ckan_term_url)

            obj = HarvestObject(
                job=harvest_job,
                content=json.dumps(terms)
            )
            obj.save()

            return [obj.id]
        except Exception as e:
            log.exception(e)
            raise e
Example #22
    def test_ckan_duplicated_name(self):
        dataset0 = {
            'owner_org': self.org['id'],
            'holder_name': 'test holder',
            'holder_identifier': 'abcdef',
            'notes': 'some notes',
            'modified': '2000-01-01',
            'theme': 'AGRI',
            'frequency': 'UNKNOWN',
            'publisher_name': 'publisher',
            'identifier': 'aasdfa',
            'publisher_identifier': 'publisher',
            'resources': [],
            'extras': [],
        }

        dataset1 = {
            'owner_org': self.org['id'],
            'title': 'duplicated title',
            'name': 'duplicated-title',
            'id': 'dummyid'
        }
        dataset1.update(dataset0)
        data = json.dumps(dataset1)

        harvest_dict = self._create_harvest_obj('http://mock/source/', name='dupname1', owner_org=self.org['id'])
        harvest_obj = HarvestObject.get(harvest_dict['id'])
        harvest_obj.content = data
        h = DCATRDFHarvester()
        import_successful = h.import_stage(harvest_obj)
        self.assertTrue(import_successful, harvest_obj.errors)
        Session.flush()
        dataset1['_id'] = harvest_obj.package_id

        dataset2 = {'title': 'duplicated title',
                    'name': 'duplicated-title',
                    'id': 'dummyid2'}

        dataset2.update(dataset0)
        dataset2['identifier'] = 'otherid'
        data = json.dumps(dataset2)

        harvest_dict = self._create_harvest_obj('http://mock/source/', name='dupname2', owner_org=self.org['id'])
        harvest_obj = HarvestObject.get(harvest_dict['id'])
        harvest_obj.content = data
        h = DCATRDFHarvester()
        import_successful = h.import_stage(harvest_obj)
        self.assertTrue(import_successful, harvest_obj.errors)
        Session.flush()
        dataset2['_id'] = harvest_obj.package_id

        # duplicated names are mangled, one should have numeric suffix
        pkg_dict = helpers.call_action('package_show', context={}, name_or_id=dataset1['_id'])
        self.assertEqual(pkg_dict['title'], dataset1['title'])
        self.assertEqual(pkg_dict['name'], 'duplicated-title')

        pkg_dict = helpers.call_action('package_show', context={}, name_or_id=dataset2['_id'])
        self.assertEqual(pkg_dict['title'], dataset2['title'])
        self.assertEqual(pkg_dict['name'], 'duplicated-title1')
Example #23
    def gather_stage(self, harvest_job):
        log = logging.getLogger(__name__ + '.VariantStore.gather')
        log.debug('VariantStoreHarvester gather_stage for job: %r', harvest_job)

        self._set_config(harvest_job.source.config, log=log)
        obj = HarvestObject(guid = self.guid, job = harvest_job)
        obj.save()
        return [ obj.id ]
Example #24
def doi_update(context, data_dict):
    model = context['model']
    new_package = data_dict
    source_hash = hashlib.sha1(json.dumps(data_dict,
                                          sort_keys=True)).hexdigest()
    old_package = p.toolkit.get_action('package_show')({
        'model': model,
        'ignore_auth': True
    }, {
        "id": new_package['id']
    })
    for extra in old_package['extras']:
        if extra['key'] == 'source_hash':
            old_source_hash = extra['value']
            break
    else:
        old_source_hash = None

    if source_hash == old_source_hash and old_package.get('state') == 'active':
        print str(datetime.datetime.now()
                  ) + ' No change for doi id ' + new_package['id']
        return

    new_package["extras"].append({"key": "source_hash", "value": source_hash})
    new_package["extras"].append({"key": "metadata-source", "value": "doi"})
    new_package["extras"].append({
        "key": "source_doi_import_identifier",
        "value": True
    })
    new_package.pop("name", None)
    owner_org = model.Group.get(
        ORG_MAPPING.get(new_package['organization']['name']))
    if not owner_org:
        print str(
            datetime.datetime.now()) + ' Fail to update doi id ' + new_package[
                'id'] + '. Organization ' + new_package['organization'][
                    'name'] + ' does not exist.'
        return
    new_package['owner_org'] = owner_org.name
    group_name = new_package.pop('owner_name', None)

    resources = []
    for resource in new_package['resources']:
        resource.pop('resource_group_id', None)
        resource.pop('revision_id', None)
        resource.pop('id', None)
        resources.append(resource)
    new_package['resources'] = resources

    obj = HarvestObject(guid=uuid.uuid4().hex,
                        job=context['harvest_job'],
                        content=context['harvestobj'])
    obj.save()
    new_package["extras"].append({"key": "harvest_object_id", "value": obj.id})

    context['return_id_only'] = True
    p.toolkit.get_action('package_update')(context, new_package)
    print str(datetime.datetime.now()) + ' Updated doi id ' + new_package['id']
Example #25
    def gather_stage(self, harvest_job):
        log = logging.getLogger(__name__ + '.WAF.gather')
        log.debug('GeminiWafHarvester gather_stage for job: %r', harvest_job)

        self.harvest_job = harvest_job

        # Get source URL
        url = harvest_job.source.url

        # Get contents
        try:
            content = self._get_content(url)
        except Exception as e:
            self._save_gather_error('Unable to get content for URL: %s: %r' % \
                                        (url, e),harvest_job)
            return None
        ids = []
        try:
            for url in self._extract_urls(content, url):
                try:
                    content = self._get_content(url)
                except Exception as e:
                    msg = 'Couldn\'t harvest WAF link: %s: %s' % (url, e)
                    self._save_gather_error(msg, harvest_job)
                    continue
                else:
                    # We need to extract the guid to pass it to the next stage
                    try:
                        gemini_string, gemini_guid = self.get_gemini_string_and_guid(
                            content, url)
                        if gemini_guid:
                            log.debug('Got GUID %s' % gemini_guid)
                            # Create a new HarvestObject for this identifier
                            # Generally the content will be set in the fetch stage, but as we already
                            # have it, we might as well save a request
                            obj = HarvestObject(guid=gemini_guid,
                                                job=harvest_job,
                                                content=gemini_string)
                            obj.save()

                            ids.append(obj.id)

                    except Exception as e:
                        msg = 'Could not get GUID for source %s: %r' % (url, e)
                        self._save_gather_error(msg, harvest_job)
                        continue
        except Exception as e:
            msg = 'Error extracting URLs from %s' % url
            self._save_gather_error(msg, harvest_job)
            return None

        if len(ids) > 0:
            return ids
        else:
            self._save_gather_error(
                'Couldn\'t find any links to metadata files', harvest_job)
            return None
Example #26
    def gather_stage(self, harvest_job):
        '''
        The gather stage will receive a HarvestJob object and will be
        responsible for:
            - gathering all the necessary objects to fetch on a later
              stage (e.g. for a CSW server, perform a GetRecords request)
            - creating the necessary HarvestObjects in the database, specifying
              the guid and a reference to its source and job.
            - creating and storing any suitable HarvestGatherErrors that may
              occur.
            - returning a list with all the ids of the created HarvestObjects.

        :param harvest_job: HarvestJob object
        :returns: A list of HarvestObject ids
        '''
        self._set_config(harvest_job.source.config)
        sets = []
        harvest_objs = []
        registry = MetadataRegistry()
        registry.registerReader('oai_dc', oai_dc_reader)
        client = oaipmh.client.Client(harvest_job.source.url, registry)
        try:
            identifier = client.identify()
        except urllib2.URLError:
            self._save_gather_error('Could not gather anything from %s!' %
                                    harvest_job.source.url, harvest_job)
            return None
        domain = identifier.repositoryName()
        group = Group.by_name(domain)
        if not group:
            group = Group(name=domain, description=domain)
        query = self.config['query'] if 'query' in self.config else ''
        try:
            for set in client.listSets():
                identifier, name, _ = set
                if 'query' in self.config:
                    if query in name:
                        sets.append((identifier, name))
                else:
                    sets.append((identifier, name))
        except NoSetHierarchyError:
            sets.append(('1', 'Default'))
            self._save_gather_error('Could not fetch sets!', harvest_job)

        for set_id, set_name in sets:
            harvest_obj = HarvestObject(job=harvest_job)
            harvest_obj.content = json.dumps({
                'set': set_id,
                'set_name': set_name,
                'domain': domain
            })
            harvest_obj.save()
            harvest_objs.append(harvest_obj.id)
        model.repo.commit()
        return harvest_objs
Example #27
    def gather_stage(self, harvest_job):
        log = logging.getLogger(__name__ + '.CSW.gather')
        log.debug('GeminiCswHarvester gather_stage for job: %r', harvest_job)
        # Get source URL
        url = harvest_job.source.url

        try:
            self._setup_csw_client(url)
        except Exception as e:
            self._save_gather_error('Error contacting the CSW server: %s' % e,
                                    harvest_job)
            return None

        log.debug('Starting gathering for %s' % url)
        used_identifiers = []
        ids = []
        try:
            for identifier in self.csw.getidentifiers(page=10):
                try:
                    log.info('Got identifier %s from the CSW', identifier)
                    if identifier in used_identifiers:
                        log.error(
                            'CSW identifier %r already used, skipping...' %
                            identifier)
                        continue
                    if identifier is None:
                        log.error('CSW returned identifier %r, skipping...' %
                                  identifier)
                        ## log an error here? happens with the dutch data
                        continue

                    # Create a new HarvestObject for this identifier
                    obj = HarvestObject(guid=identifier, job=harvest_job)
                    obj.save()

                    ids.append(obj.id)
                    used_identifiers.append(identifier)
                except Exception as e:
                    self._save_gather_error(
                        'Error for the identifier %s [%r]' % (identifier, e),
                        harvest_job)
                    continue

        except Exception as e:
            log.error('Exception: %s' % text_traceback())
            self._save_gather_error(
                'Error gathering the identifiers from the CSW server [%s]' %
                six.text_type(e), harvest_job)
            return None

        if len(ids) == 0:
            self._save_gather_error('No records received from the CSW server',
                                    harvest_job)
            return None

        return ids
Example #28
    def gather_stage(self, harvest_job):
        log.debug('In SFAHarvester gather_stage')
        try:
            file_path = self._fetch_metadata_file()
            ids = []

            de_rows = self._get_row_dict_array(0, file_path)
            for row in de_rows:
                # Construct the metadata dict for the dataset on CKAN
                metadata = {
                    'datasetID': row[u'id'],
                    'title': row[u'title'],
                    'url': row[u'url'],
                    'notes': row[u'notes'],
                    'author': row[u'author'],
                    'maintainer': row[u'maintainer'],
                    'maintainer_email': row[u'maintainer_email'],
                    'license_id': row[u'licence'],
                    'license_url': row[u'licence_url'],
                    'translations': [],
                    'tags': row[u'tags'].split(u', '),
                    'groups': [row[u'groups']]
                }

                metadata['resources'] = self._generate_resources_dict_array(
                    row[u'id']
                )
                metadata['resources'][0]['version'] = row[u'version']
                log.debug(metadata['resources'])

                # Adding term translations
                metadata['translations'].extend(
                    self._generate_term_translations(1, file_path)  # fr
                )
                metadata['translations'].extend(
                    self._generate_term_translations(2, file_path)  # it
                )
                metadata['translations'].extend(
                    self._generate_term_translations(3, file_path)  # en
                )

                log.debug(metadata['translations'])

                obj = HarvestObject(
                    guid=self._create_uuid(row[u'id']),
                    job=harvest_job,
                    content=json.dumps(metadata)
                )
                obj.save()
                log.debug('adding ' + row[u'id'] + ' to the queue')
                ids.append(obj.id)

                log.debug(de_rows)
        except Exception:
            return False
        return ids
Example #29
    def gather_stage(self, harvest_job):
        log.debug('In FOPHHarvester gather_stage')
        try:
            file_path = self._fetch_metadata_file()
            ids = []

            de_cols = self._get_col_dict_array(0, file_path)
            for col in de_cols:
                # Construct the metadata dict for the dataset on CKAN
                metadata = {
                    'datasetID': col[u'id'],
                    'title': col[u'title'],
                    'url': col[u'url'],
                    'notes': col[u'notes'],
                    'author': col[u'author'],
                    'author_email': col[u'author_email'],
                    'maintainer': col[u'maintainer'],
                    'maintainer_email': col[u'maintainer_email'],
                    'license_id': col[u'license_id'].lower(),
                    'version': col[u'version'],
                    'translations': [],
                    'tags': []
                }
                tags = col[u'tags'].split(u', ')
                tags = [munge_tag(tag) for tag in tags]
                metadata['tags'] = tags

                metadata['resources'] = self._generate_resources_dict_array(
                    col[u'id'])
                metadata['resources'][0]['version'] = col[u'version']
                log.debug(metadata['resources'])

                # Adding term translations
                metadata['translations'].extend(
                    self._generate_term_translations(1, file_path))  # fr
                metadata['translations'].extend(
                    self._generate_term_translations(2, file_path))  # it
                metadata['translations'].extend(
                    self._generate_term_translations(3, file_path))  # en

                log.debug(metadata['translations'])

                obj = HarvestObject(
                    guid=self._create_uuid(col[u'id']),
                    job=harvest_job,
                    content=json.dumps(metadata)
                )
                obj.save()
                log.debug('adding ' + col[u'id'] + ' to the queue')
                ids.append(obj.id)

                log.debug(de_cols)
        except Exception:
            return False
        return ids
Example #30
    def _gen_harvest_obj_for_files(self, harvest_job):
        ids = []
        for dataset_name, dataset in self.DATASETS.iteritems():
            csw = ckan_csw.SwisstopoCkanMetadata()
            metadata = csw.get_ckan_metadata(
                dataset['csw_query'], 'de'
            ).copy()
            metadata_fr = csw.get_ckan_metadata(
                dataset['csw_query'], 'fr'
            ).copy()
            metadata_it = csw.get_ckan_metadata(
                dataset['csw_query'], 'it'
            ).copy()
            metadata_en = csw.get_ckan_metadata(
                dataset['csw_query'], 'en'
            ).copy()
            log.debug(metadata)

            metadata['translations'] = self._generate_term_translations()
            log.debug("Translations: %s" % metadata['translations'])

            metadata_trans = {
                u'de': metadata,
                u'fr': metadata_fr,
                u'it': metadata_it,
                u'en': metadata_en,
            }
            metadata['translations'].extend(
                self._generate_metadata_translations(metadata_trans)
            )

            metadata['resources'] = self._generate_resources_dict_array(
                dataset_name
            )
            metadata['resources'].extend(
                self._generate_api_resources(metadata, dataset_name)
            )
            log.debug(metadata['resources'])

            metadata['license_id'], metadata['license_url'] = (
                self._get_org_license(dataset_name)
            )

            metadata['layer_name'] = dataset_name

            obj = HarvestObject(
                guid=metadata['id'],
                job=harvest_job,
                content=json.dumps(metadata)
            )
            obj.save()
            log.debug('adding ' + dataset_name + ' to the queue')
            ids.append(obj.id)

        return ids
Example #31
    def gather_stage(self, harvest_job):
        '''
        The gather stage will receive a HarvestJob object and will be
        responsible for:
            - gathering all the necessary objects to fetch on a later
              stage (e.g. for a CSW server, perform a GetRecords request)
            - creating the necessary HarvestObjects in the database, specifying
              the guid and a reference to its source and job.
            - creating and storing any suitable HarvestGatherErrors that may
              occur.
            - returning a list with all the ids of the created HarvestObjects.

        :param harvest_job: HarvestJob object
        :returns: A list of HarvestObject ids
        '''
        log.info("in gather stage: %s" % harvest_job.source.url)

        try:
            harvest_obj_ids = []
            registry = self._create_metadata_registry()
            self._set_config(harvest_job.source.config)
            client = oaipmh.client.Client(harvest_job.source.url,
                                          registry,
                                          self.credentials,
                                          force_http_get=self.force_http_get)
            # Start looking from here
            client.identify()  # check if identify works
            for header in self._identifier_generator(client):
                harvest_obj = HarvestObject(guid=header.identifier(),
                                            job=harvest_job)
                harvest_obj.save()
                harvest_obj_ids.append(harvest_obj.id)
                log.info("Harvest obj %s created" % harvest_obj.id)
                # return harvest_obj_ids # This is to get only one record
        except urllib.error.HTTPError as e:
            log.exception(
                'Gather stage failed on %s (%s): %s, %s' %
                (harvest_job.source.url, e.fp.read(), e.reason, e.hdrs))
            self._save_gather_error(
                'Could not gather anything from %s' % harvest_job.source.url,
                harvest_job)
            return None
        except Exception as e:
            log.exception('Gather stage failed on %s: %s' % (
                harvest_job.source.url,
                str(e),
            ))
            self._save_gather_error(
                'Could not gather anything from %s: %s / %s' %
                (harvest_job.source.url, str(e), traceback.format_exc()),
                harvest_job)
            return None
        log.info("Gather stage successfully finished with %s harvest objects" %
                 len(harvest_obj_ids))
        return harvest_obj_ids
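The gather stage above relies on an _identifier_generator helper that is not shown. A minimal sketch of what such a helper might look like with the pyoai client, assuming an 'oai_dc' metadata prefix and an optional set_spec taken from the harvester config:

    def _identifier_generator(self, client):
        # Hypothetical helper: yield OAI-PMH record headers, optionally
        # restricted to a single set configured on the harvester.
        kwargs = {'metadataPrefix': 'oai_dc'}
        if getattr(self, 'set_spec', None):
            kwargs['set'] = self.set_spec
        for header in client.listIdentifiers(**kwargs):
            yield header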
Example #32
    def gather_stage(self, harvest_job, encoding=None):
        self._set_config(harvest_job.source.config)
        # Request all remote packages
        try:
            content = self._get_content(harvest_job.source.url)
            LOGGER.debug('Grabbing zip file: %s', harvest_job.source.url)

            object_ids = []
            packages = []

            file_content = StringIO.StringIO(content)
            archive = zipfile.ZipFile(file_content, 'r')
            for name in archive.namelist():
                if name.endswith('.json'):
                    archive_content = archive.read(name)
                    if encoding is not None:
                        archive_content = archive_content.decode(encoding)
                    else:
                        archive_content = self.lstrip_bom(archive_content)

                    package = json.loads(archive_content)
                    normalize_api_dataset(package)
                    packages.append(package)
                    obj = HarvestObject(guid=package['name'], job=harvest_job)
                    obj.content = json.dumps(package)
                    obj.save()
                    object_ids.append(obj.id)

        except zipfile.BadZipfile as err:
            self._save_gather_error(err.message, harvest_job)
            return None
        except ContentFetchError as err:
            self._save_gather_error(err.message, harvest_job)
            return None
        except Exception as err:
            error_template = 'Unable to get content for URL: %s: %s'
            error = error_template % (harvest_job.source.url, str(err))
            self._save_gather_error(error, harvest_job)
            return None

        if object_ids:
            # delete obsolete packages
            super(JSONZipBaseHarvester, self).delete_deprecated_datasets(
                packages,
                harvest_job
            )

            return object_ids
        else:
            self._save_gather_error(
                'No packages received for URL: %s' % harvest_job.source.url,
                harvest_job
            )

            return None
Example #33
    def gather_stage(self, harvest_job):
        log = logging.getLogger(__name__ + '.ITagEnricher.gather')
        log.debug('ITagEnricher gather_stage for job: %r', harvest_job)

        # Save a reference
        self.job = harvest_job

        self._set_source_config(self.job.source.config)

        context = {
            'model': model,
            'session': model.Session,
            'user': self._get_user_name()
        }

        org_id = model.Package.get(harvest_job.source.id).owner_org
        organization = logic.get_action('organization_show')(context, {
            'id': org_id
        })  # noqa: E501

        # Exclude Sentinel-3 because it seems like iTag can't handle the curved
        # footprints.
        filter_query = '+organization:{} -itag:tagged -FamilyName:Sentinel-3'.format(
            organization['name'])  # noqa: E501

        ids = []

        # We'll limit this to 10 datasets per job so that results appear
        # faster
        start = 0
        rows = self.source_config.get('datasets_per_job', 10)
        untagged = logic.get_action('package_search')(context, {
            'fq': filter_query,
            'rows': rows,
            'start': start
        })
        results = untagged['results']
        for result in results:
            spatial = None
            for i in result['extras']:
                if i['key'] == 'spatial':
                    spatial = i['value']
            if spatial:
                obj = HarvestObject(
                    guid=result['id'],
                    job=self.job,
                    extras=[
                        HOExtra(key='status', value='change'),  # noqa: E501
                        HOExtra(key='spatial', value=spatial),  # noqa: E501
                        HOExtra(key='package', value=json.dumps(result))
                    ])  # noqa: E501
                obj.save()
                ids.append(obj.id)

        return ids
Example #34
    def populate_harvest_job(self, harvest_job, set_ids, config, client):
        # Check if this source has been harvested before
        previous_job = Session.query(HarvestJob) \
            .filter(HarvestJob.source == harvest_job.source) \
            .filter(HarvestJob.gather_finished != None) \
            .filter(HarvestJob.id != harvest_job.id) \
            .order_by(HarvestJob.gather_finished.desc()) \
            .limit(1).first()

        last_time = None
        if previous_job and previous_job.finished and model.Package.get(harvest_job.source.id).metadata_modified < previous_job.gather_started:
            last_time = previous_job.gather_started.isoformat()

        # Collect package ids
        package_ids = list(self.get_package_ids(set_ids, config, last_time, client))
        log.debug('Identifiers: %s', package_ids)

        if not self._recreate(harvest_job) and package_ids:
            converted_identifiers = {}
            for identifier in package_ids:
                converted_identifiers[datapid_to_name(identifier)] = identifier
                if identifier.endswith(u'm'):
                    converted_identifiers[datapid_to_name(u"%ss" % identifier[0:-1])] = identifier

            for package in model.Session.query(model.Package).filter(model.Package.name.in_(converted_identifiers.keys())).all():
                converted_name = package.name
                if converted_identifiers[converted_name] not in package_ids:
                    converted_name = "%sm" % converted_name[0:-1]
                package_ids.remove(converted_identifiers[converted_name])

        if previous_job:
            for previous_error in [error.guid for error in Session.query(HarvestObject).
                                   filter(HarvestObject.harvest_job_id == previous_job.id).
                                   filter(HarvestObject.state == 'ERROR').all()]:
                if previous_error not in package_ids:
                    package_ids.append(previous_error)

        try:
            object_ids = []
            if len(package_ids):
                for package_id in islice(package_ids, config['limit']) if 'limit' in config else package_ids:
                    # Create a new HarvestObject for this identifier
                    obj = HarvestObject(guid=package_id, job=harvest_job)
                    obj.save()
                    object_ids.append(obj.id)
                log.debug('Object ids: {i}'.format(i=object_ids))
                return object_ids
            else:
                self._save_gather_error('No packages received for URL: {u}'.format(
                    u=harvest_job.source.url), harvest_job)
                return None
        except Exception as e:
            self._save_gather_error('Gather: {e}'.format(e=e), harvest_job)
            raise
Example #35
    def gather_stage(self, harvest_job):
        log.debug('In DataWienGvAt gather_stage')

        doc = etree.parse(self.CATALOGUE_FEED_URL)
        ids = []
        for link in doc.findall("//item/link"):
            link = link.text
            id = sha1(link).hexdigest()
            obj = HarvestObject(guid=id, job=harvest_job, content=link)
            obj.save()
            ids.append(obj.id)
        return ids
Example #36
    def gather_stage(self, harvest_job):
        log.debug('In DataWienGvAt gather_stage')

        doc = etree.parse(self.CATALOGUE_FEED_URL)
        ids = []
        for link in doc.findall("//item/link"):
            link = link.text
            id = sha1(link).hexdigest()
            obj = HarvestObject(guid=id, job=harvest_job, content=link)
            obj.save()
            ids.append(obj.id)
        return ids
Example #37
    def delete_geocat_ids(self, harvest_job, harvest_obj_ids,
                          packages_to_delete):
        delete_harvest_obj_ids = []
        for package_info in packages_to_delete:
            obj = HarvestObject(guid=package_info[1].name,
                                job=harvest_job,
                                extras=[
                                    HarvestObjectExtra(key='import_action',
                                                       value='delete')
                                ])
            obj.save()
            delete_harvest_obj_ids.append(obj.id)
        return delete_harvest_obj_ids
Example #38
    def gather_stage(self, harvest_job):
        log.debug('In OpenDataCatHarvester gather_stage')
        # Get feed contents
        doc = etree.parse(self.INDEX_URL)
        ids = []
        for link_element in doc.findall('//item/link'):
            link = link_element.text.strip()
            id = sha1(link).hexdigest()
            obj = HarvestObject(guid=id, job=harvest_job, content=link)
            obj.save()

            ids.append(obj.id)
        return ids
Example #39
    def gather_stage(self, harvest_job):
        log.debug('In OpendataParisFr gather_stage')

        doc = html.parse(self.PREFIX_URL + self.CATALOGUE_INDEX_URL)
        ids = []
        for link in doc.findall("//div[@class='animate download-portlet-element']/a"):
            link = link.get('href')
            if not "#comments" in link:
                id = sha1(link).hexdigest()
                obj = HarvestObject(guid=id, job=harvest_job, content=link)
                obj.save()
                ids.append(obj.id)
        return ids
Example #40
    def gather_stage(self, harvest_job):
        log.debug('In OpenDataCatHarvester gather_stage')
        # Get feed contents
        doc = etree.parse(self.INDEX_URL)
        ids = []
        for link_element in doc.findall('//item/link'):
            link = link_element.text.strip()
            id = sha1(link).hexdigest()
            obj = HarvestObject(guid=id, job=harvest_job, content=link)
            obj.save()

            ids.append(obj.id)
        return ids
Example #41
    def gather_stage(self, harvest_job):
        '''
        The gather stage will receive a HarvestJob object and will be
        responsible for:
            - gathering all the necessary objects to fetch on a later
              stage (e.g. for a CSW server, perform a GetRecords request)
            - creating the necessary HarvestObjects in the database, specifying
              the guid and a reference to its source and job.
            - creating and storing any suitable HarvestGatherErrors that may
              occur.
            - returning a list with all the ids of the created HarvestObjects.

        :param harvest_job: HarvestJob object
        :returns: A list of HarvestObject ids
        '''
        logger.debug("in gather stage: %s" % harvest_job.source.url)
        try:
            harvest_obj_ids = []
            self._set_config(harvest_job.source.config)

            skip_licenses = {
                'c12c3333-1ad7-4a3a-a629-ed51fcb636ac',
                'a270745d-07d5-4e93-94fc-ba6e0afc97fb',
            }

            # TODO: switch
            # for record in json.loads(open('/tmp/data.json').read())['dataset']:
            for record in requests.get(
                    urlparse.urljoin(harvest_job.source.url,
                                     'data.json')).json()['dataset']:
                license_id = record.get('license',
                                        'cc-by').strip('/').split('/')[-1]
                if license_id in skip_licenses:
                    continue
                if 'hub.pacificdata' == record.get('isPartOf'):
                    continue
                if 'Info' in record.get('theme', []):
                    continue
                harvest_obj = HarvestObject(guid=record['identifier'],
                                            content=json.dumps(record),
                                            job=harvest_job)
                harvest_obj.save()
                harvest_obj_ids.append(harvest_obj.id)
        except urllib2.HTTPError, e:
            logger.exception(
                'Gather stage failed on %s (%s): %s, %s' %
                (harvest_job.source.url, e.fp.read(), e.reason, e.hdrs))
            self._save_gather_error(
                'Could not gather anything from %s' % harvest_job.source.url,
                harvest_job)
            return None
Example #42
0
    def gather_stage(self, harvest_job):
        '''
        The gather stage will receive a HarvestJob object and will be
        responsible for:
            - gathering all the necessary objects to fetch on a later
              stage (e.g. for a CSW server, perform a GetRecords request)
            - creating the necessary HarvestObjects in the database, specifying
              the guid and a reference to its source and job.
            - creating and storing any suitable HarvestGatherErrors that may
              occur.
            - returning a list with all the ids of the created HarvestObjects.

        :param harvest_job: HarvestJob object
        :returns: A list of HarvestObject ids
        '''
        logger.debug("in gather stage: %s" % harvest_job.source.url)
        try:
            harvest_obj_ids = []
            self._set_config(harvest_job.source.config)
            url = urljoin(harvest_job.source.url, '/v1/dataset/search')

            for record in self._fetch_record_outline(url):

                # if record['key'] != 'a38c7d49-5a5d-4aa6-a64e-421178bd06d7':
                # continue
                harvest_obj = HarvestObject(guid=record['key'],
                                            content=record['country'],
                                            job=harvest_job)
                harvest_obj.save()
                harvest_obj_ids.append(harvest_obj.id)

                # TODO: remove
                # break
        except (HTTPError) as e:
            logger.exception(
                'Gather stage failed on %s (%s): %s, %s' %
                (harvest_job.source.url, e.fp.read(), e.reason, e.hdrs))
            self._save_gather_error(
                'Could not gather anything from %s' % harvest_job.source.url,
                harvest_job)
            return None
        except (Exception) as e:
            logger.exception('Gather stage failed on %s: %s' % (
                harvest_job.source.url,
                str(e),
            ))
            self._save_gather_error(
                'Could not gather anything from %s' % harvest_job.source.url,
                harvest_job)
            return None
        return harvest_obj_ids
 def test_harvester_urlerror(self):
     harv, job = self._create_harvester()
     urllib2.urlopen = realopen
     self.assert_(harv.gather_stage(job) == None)
     errs = Session.query(HarvestGatherError).all()
     self.assert_(len(errs) == 1)
     harv_obj = HarvestObject()
     harv_obj.job = job
     harv_obj.content = json.dumps({'url': "http://foo"})
     # XML error and URL error, also the lack of url in content
     self.assert_(harv.import_stage(harv_obj) == False)
     errs = Session.query(HarvestObjectError).all()
     print errs
     self.assert_(len(errs) == 1)
Example #44
0
    def gather_stage(self,harvest_job):
        log.debug('In OpenGovSeHarvester gather_stage')
        # Get feed contents
        doc = etree.parse(self.INDEX_URL)
        ids = []
        for id_element in doc.findall('//{%(ns)s}entry/{%(ns)s}id' % {'ns':self.ATOM_NS}):
            link = id_element.text.strip()
            log.debug('Got link: %s' % link)
            id = sha1(link).hexdigest()
            obj = HarvestObject(guid=id, job=harvest_job, content=link)
            obj.save()

            ids.append(obj.id)
        return ids
    def _save_harvest_object(self, metadata, harvest_job):
        '''
        Save the harvest object with the given metadata dict and harvest_job
        '''

        obj = HarvestObject(
            guid=metadata['datasetID'],
            job=harvest_job,
            content=json.dumps(metadata)
        )
        obj.save()
        log.debug('adding ' + metadata['datasetID'] + ' to the queue')

        return obj.id
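For context, a gather stage built on this helper only needs to iterate over the metadata records and collect the returned ids. The loop below is a sketch rather than the harvester's actual code; `_fetch_metadata_records` is a hypothetical method standing in for whatever produces the metadata dicts:

    def gather_stage(self, harvest_job):
        ids = []
        # Hypothetical iterator yielding metadata dicts with a 'datasetID' key
        for metadata in self._fetch_metadata_records():
            ids.append(self._save_harvest_object(metadata, harvest_job))
        return ids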
Example #46
0
    def gather_stage(self, harvest_job):
        log.debug('In OpendataParisFr gather_stage')

        doc = html.parse(self.PREFIX_URL + self.CATALOGUE_INDEX_URL)
        ids = []
        for link in doc.findall(
                "//div[@class='animate download-portlet-element']/a"):
            link = link.get('href')
            if not "#comments" in link:
                id = sha1(link).hexdigest()
                obj = HarvestObject(guid=id, job=harvest_job, content=link)
                obj.save()
                ids.append(obj.id)
        return ids
Example #47
0
    def _save_harvest_object(self, metadata, harvest_job):
        '''
        Save the harvest object with the given metadata dict and harvest_job
        '''

        obj = HarvestObject(
            guid=metadata['datasetID'],
            job=harvest_job,
            content=json.dumps(metadata)
        )
        obj.save()
        log.debug('adding ' + metadata['datasetID'] + ' to the queue')

        return obj.id
Example #48
0
    def test_harvest_basic(self):

        # Create source
        source_fixture = {
            'title': 'Test Source',
            'name': 'test-source',
            'url': u'http://127.0.0.1:8999/esdInventory_live_truncated.xml',
            'type': u'inventory',
        }
        source, job = self._create_source_and_job(source_fixture)

        # Gather
        harvester = InventoryHarvester()
        # mock boundary stuff to avoid needing PostGIS - it is not tested here
        # and that allows this test to run on sqlite
        with patch('ckanext.dgulocal.harvester.get_boundary') as get_boundary:
            get_boundary.return_value = None
            object_ids = harvester.gather_stage(job)

        assert_equal(len(object_ids), 3)
        assert len(job.gather_errors) == 0

        # Fetch
        for object_id in object_ids:
            harvest_object = HarvestObject.get(object_id)
            assert harvest_object
            success = harvester.fetch_stage(harvest_object)
            assert_equal(success, True)
            assert not harvest_object.errors

        # Import
        objects = []
        for object_id in object_ids:
            obj = HarvestObject.get(object_id)
            assert obj
            objects.append(obj)
            harvester.import_stage(obj)
            assert not obj.errors

        pkgs = Session.query(Package).filter(
            Package.type != u'harvest_source').all()

        assert_equal(len(pkgs), 3)

        pkg_ids = [pkg.id for pkg in pkgs]

        for obj in objects:
            assert obj.current == True
            assert obj.package_id in pkg_ids
Example #49
0
    def gather_stage(self, harvest_job):

        log = logging.getLogger(__name__ + '.WAF.gather')
        log.debug('z3950Harvester gather_stage for job: %r', harvest_job)

        self.harvest_job = harvest_job

        # Get source URL
        source_url = harvest_job.source.url

        self._set_source_config(harvest_job.source.config)

        # get current objects out of db
        query = model.Session.query(HarvestObject.guid, HarvestObject.package_id) \
            .filter(HarvestObject.current == True) \
            .filter(HarvestObject.harvest_source_id == harvest_job.source.id)

        guid_to_package_id = dict((res[0], res[1]) for res in query)
        current_guids = set(guid_to_package_id.keys())
        current_guids_in_harvest = set()

        # Get contents
        try:
            conn = zoom.Connection(source_url,
                                   int(self.source_config.get('port', 210)))
            conn.databaseName = self.source_config.get('database', '')
            conn.preferredRecordSyntax = 'XML'
            conn.elementSetName = 'T'
            query = zoom.Query('CCL', 'metadata')
            res = conn.search(query)
            ids = []
            for num, result in enumerate(res):
                hash = hashlib.md5(result.data).hexdigest()
                if hash in current_guids:
                    current_guids_in_harvest.add(hash)
                else:
                    obj = HarvestObject(
                        job=harvest_job,
                        guid=hash,
                        extras=[
                            HOExtra(key='status', value='new'),
                            HOExtra(key='original_document',
                                    value=result.data.decode('latin-1')),
                            HOExtra(key='original_format', value='fgdc')
                        ])
                    obj.save()
                    ids.append(obj.id)
            for guid in (current_guids - current_guids_in_harvest):
                obj = HarvestObject(
                    job=harvest_job,
                    guid=guid,
                    package_id=guid_to_package_id[guid],
                    extras=[HOExtra(key='status', value='delete')])
                obj.save()
                ids.append(obj.id)
            return ids
        except Exception, e:
            self._save_gather_error('Unable to get content for URL: %s: %r' %
                                    (source_url, e), harvest_job)
            return None
    def gather_stage(self, harvest_job):
        log.debug('In OdgovltHarvester gather_stage')

        sync = IvpkIrsSync(sa.create_engine(harvest_job.source.url))
        sync.sync_groups()

        ids = []
        for ivpk_dataset in sync.get_ivpk_datasets():
            content = json.dumps(dict(ivpk_dataset), cls=DatetimeEncoder)
            obj = HarvestObject(guid=ivpk_dataset.ID,
                                job=harvest_job,
                                content=content)
            obj.save()
            ids.append(obj.id)
        return ids
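The `DatetimeEncoder` passed to `json.dumps` above is not shown in this excerpt. A typical implementation, offered here only as an assumption about what it probably does, is a small `json.JSONEncoder` subclass that serialises datetimes as ISO 8601 strings:

import datetime
import json

class DatetimeEncoder(json.JSONEncoder):
    def default(self, obj):
        # Turn datetime/date values into ISO 8601 strings; defer to the
        # base class for anything it does not recognise
        if isinstance(obj, (datetime.datetime, datetime.date)):
            return obj.isoformat()
        return super(DatetimeEncoder, self).default(obj)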
    def test_harvest_basic(self):

        # Create source
        source_fixture = {
            'title': 'Test Source',
            'name': 'test-source',
            'url': u'http://127.0.0.1:8999/esdInventory_live_truncated.xml',
            'type': u'inventory',
        }
        source, job = self._create_source_and_job(source_fixture)

        # Gather
        harvester = InventoryHarvester()
        # mock boundary stuff to avoid needing PostGIS - it is not tested here
        # and that allows this test to run on sqlite
        with patch('ckanext.dgulocal.harvester.get_boundary') as get_boundary:
            get_boundary.return_value = None
            object_ids = harvester.gather_stage(job)

        assert_equal(len(object_ids), 3)
        assert len(job.gather_errors) == 0

        # Fetch
        for object_id in object_ids:
            harvest_object = HarvestObject.get(object_id)
            assert harvest_object
            success = harvester.fetch_stage(harvest_object)
            assert_equal(success, True)
            assert not harvest_object.errors

        # Import
        objects = []
        for object_id in object_ids:
            obj = HarvestObject.get(object_id)
            assert obj
            objects.append(obj)
            harvester.import_stage(obj)
            assert not obj.errors

        pkgs = Session.query(Package).filter(Package.type!=u'harvest_source').all()

        assert_equal(len(pkgs), 3)

        pkg_ids = [pkg.id for pkg in pkgs]

        for obj in objects:
            assert obj.current == True
            assert obj.package_id in pkg_ids
Example #52
0
def fetch_callback(channel, method, header, body):
    try:
        id = json.loads(body)['harvest_object_id']
        log.info('Received harvest object id: %s' % id)
    except KeyError:
        log.error('No harvest object id received')
        channel.basic_ack(method.delivery_tag)
        return False

    obj = HarvestObject.get(id)
    if not obj:
        log.error('Harvest object does not exist: %s' % id)
        channel.basic_ack(method.delivery_tag)
        return False

    obj.retry_times += 1
    obj.save()

    if obj.retry_times >= 5:
        obj.state = "ERROR"
        obj.save()
        log.error('Too many consecutive retries for object {0}'.format(obj.id))
        channel.basic_ack(method.delivery_tag)
        return False

    # Send the harvest object to the plugins that implement
    # the Harvester interface, only if the source type
    # matches
    for harvester in PluginImplementations(IHarvester):
        if harvester.info()['name'] == obj.source.type:
            fetch_and_import_stages(harvester, obj)

    model.Session.remove()
    channel.basic_ack(method.delivery_tag)
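This callback is not called directly; it is registered as a consumer on an AMQP channel. The registration below is a sketch that assumes pika as the client library and uses a hypothetical queue name, roughly how ckanext-harvest-style setups wire it up:

import pika

connection = pika.BlockingConnection(pika.ConnectionParameters('localhost'))
channel = connection.channel()
channel.queue_declare(queue='ckan.harvest.fetch', durable=True)

# pika >= 1.0 signature; the callback receives (channel, method, properties,
# body), matching fetch_callback above
channel.basic_consume(queue='ckan.harvest.fetch',
                      on_message_callback=fetch_callback)
channel.start_consuming()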
Example #53
0
    def test_harvester(self):
        job = HarvestJob(source = self.source)

        harvester = InventoryHarvester()

        # Gather all of the datasets from the XML content and make sure
        # we have created some harvest objects
        result = harvester.gather_stage(job, test_content=self._get_file_content('inventory.xml'))
        self.assertEqual(len(result), 79)

        # We only want one for testing
        harvest_object_id = result[0]
        harvest_obj = HarvestObject.get(harvest_object_id)

        # Run the fetch stage
        fetch_result = harvester.fetch_stage(harvest_obj)
        self.assertTrue(fetch_result)

        # Make sure we can create a dataset by running the import stage
        harvester.import_stage(harvest_obj)
        self.assertIsNotNone(harvest_obj.package_id)

        # Get the newly created package and make sure it is in the correct
        # organisation
        pkg = toolkit.get_action('package_show')(
            { 'ignore_auth': True, 'user': self.sysadmin['name'] },
            { 'id': harvest_obj.package_id },
        )
        self.assertEqual(pkg['organization']['id'], self.publisher['id'])
Example #54
0
    def _run_job_for_single_document(
        self, job, force_import=False, expect_gather_errors=False, expect_obj_errors=False
    ):

        harvester = GeminiDocHarvester()

        harvester.force_import = force_import

        object_ids = harvester.gather_stage(job)
        assert object_ids, len(object_ids) == 1
        if expect_gather_errors:
            assert len(job.gather_errors) > 0
        else:
            assert len(job.gather_errors) == 0

        assert harvester.fetch_stage(object_ids) == True

        obj = HarvestObject.get(object_ids[0])
        assert obj, obj.content

        harvester.import_stage(obj)
        Session.refresh(obj)
        if expect_obj_errors:
            assert len(obj.errors) > 0
        else:
            assert len(obj.errors) == 0

        job.status = u"Finished"
        job.save()

        return obj
Example #55
0
    def test_harvest_basic(self):

        # Create source
        source_fixture = {"url": u"http://127.0.0.1:8999/waf/index.html", "type": u"gemini-waf"}

        source, job = self._create_source_and_job(source_fixture)

        harvester = GeminiWafHarvester()

        # We need to send an actual job, not the dict
        object_ids = harvester.gather_stage(job)

        assert len(object_ids) == 2

        # Fetch stage always returns True for Waf harvesters
        assert harvester.fetch_stage(object_ids) == True

        objects = []
        for object_id in object_ids:
            obj = HarvestObject.get(object_id)
            assert obj
            objects.append(obj)
            harvester.import_stage(obj)

        pkgs = Session.query(Package).all()

        assert len(pkgs) == 2

        pkg_ids = [pkg.id for pkg in pkgs]

        for obj in objects:
            assert obj.current == True
            assert obj.package_id in pkg_ids
Example #56
0
    def test_error_mail_sent_with_object_error(self, mock_mailer_mail_recipient):

        context, harvest_source, harvest_job = self._create_harvest_source_and_job_if_not_existing()

        data_dict = {
            'guid': 'guid',
            'content': 'content',
            'job_id': harvest_job['id'],
            'extras': {'a key': 'a value'},
            'source_id': harvest_source['id']
        }
        harvest_object = toolkit.get_action('harvest_object_create')(
            context, data_dict)

        harvest_object_model = HarvestObject.get(harvest_object['id'])

        # create a HarvestObjectError
        msg = 'HarvestObjectError occurred: %s' % harvest_job['id']
        harvest_object_error = HarvestObjectError(message=msg, object=harvest_object_model)
        harvest_object_error.save()

        status = toolkit.get_action('harvest_source_show_status')(context, {'id': harvest_source['id']})

        send_error_mail(
            context,
            harvest_source['id'],
            status
        )

        assert_equal(1, status['last_job']['stats']['errored'])
        assert mock_mailer_mail_recipient.called
Example #57
0
    def _run_job_for_single_document(self,job,force_import=False,expect_gather_errors=False,expect_obj_errors=False):

        harvester = GeminiDocHarvester()

        harvester.force_import = force_import


        object_ids = harvester.gather_stage(job)
        assert object_ids, len(object_ids) == 1
        if expect_gather_errors:
            assert len(job.gather_errors) > 0
        else:
            assert len(job.gather_errors) == 0

        assert harvester.fetch_stage(object_ids) == True

        obj = HarvestObject.get(object_ids[0])
        assert obj, obj.content

        harvester.import_stage(obj)
        Session.refresh(obj)
        if expect_obj_errors:
            assert len(obj.errors) > 0
        else:
            assert len(obj.errors) == 0

        job.status = u'Finished'
        job.save()

        return obj
Example #58
0
def harvest_object_show(context, data_dict):

    p.toolkit.check_access('harvest_object_show', context, data_dict)

    id = data_dict.get('id')
    dataset_id = data_dict.get('dataset_id')

    if id:
        attr = data_dict.get('attr', None)
        obj = HarvestObject.get(id, attr=attr)
    elif dataset_id:
        model = context['model']

        pkg = model.Package.get(dataset_id)
        if not pkg:
            raise p.toolkit.ObjectNotFound('Dataset not found')

        obj = model.Session.query(HarvestObject) \
            .filter(HarvestObject.package_id == pkg.id) \
            .filter(
            HarvestObject.current == True  # noqa: E711
        ).first()
    else:
        raise p.toolkit.ValidationError(
            'Please provide either an "id" or a "dataset_id" parameter')

    if not obj:
        raise p.toolkit.ObjectNotFound('Harvest object not found')

    return harvest_object_dictize(obj, context)
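Both lookup paths can be exercised through the action layer; for example (the context and ids below are placeholders):

    # Look up a single harvest object by its id
    obj_dict = p.toolkit.get_action('harvest_object_show')(
        context, {'id': harvest_object_id})

    # Or fetch the current harvest object for a given dataset
    obj_dict = p.toolkit.get_action('harvest_object_show')(
        context, {'dataset_id': 'some-dataset-name-or-id'})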