def gather_stage(self, harvest_job):
        """Create three fixed harvest objects for 'basic_test' sources.

        When the job's source URL starts with ``'basic_test'``, three
        ``HarvestObject`` instances (guids ``test1``, ``test2`` and
        ``test_to_delete``) are created for the job, the first one carrying
        a single ``key``/``value`` extra, and their ids are returned in that
        order.  For any other URL nothing is created.

        :param harvest_job: job whose ``source.url`` selects the behaviour
        :returns: list of the three object ids, or an empty list
        """
        source_url = harvest_job.source.url
        if not source_url.startswith('basic_test'):
            return []

        first = HarvestObject(guid='test1', job=harvest_job)
        first.extras.append(HarvestObjectExtra(key='key', value='value'))
        second = HarvestObject(guid='test2', job=harvest_job)
        to_delete = HarvestObject(guid='test_to_delete', job=harvest_job)

        first.add()
        second.add()
        # Original author's note: this save also commits the two adds above.
        to_delete.save()

        return [first.id, second.id, to_delete.id]
示例#2
0
    def gather_stage(self, harvest_job):
        """Return ids of three canned harvest objects for test sources.

        Only sources whose URL starts with ``'basic_test'`` produce objects:
        ``test1`` (with one ``key``/``value`` extra), ``test2`` and
        ``test_to_delete``, all attached to ``harvest_job``.  Every other
        source yields an empty list.

        :param harvest_job: job whose ``source.url`` is inspected
        :returns: ids of the created objects in creation order, or ``[]``
        """
        if not harvest_job.source.url.startswith('basic_test'):
            return []

        keep_a = HarvestObject(guid='test1', job=harvest_job)
        keep_a.extras.append(HarvestObjectExtra(key='key', value='value'))
        keep_b = HarvestObject(guid='test2', job=harvest_job)
        doomed = HarvestObject(guid='test_to_delete', job=harvest_job)

        keep_a.add()
        keep_b.add()
        # Original author's note: saving the last one commits the adds too.
        doomed.save()

        return [harvest_object.id for harvest_object in (keep_a, keep_b, doomed)]
示例#3
0
    def gather_stage(self, harvest_job):
        """Gather the single document behind the source URL.

        Fetches the content at ``harvest_job.source.url`` and wraps it in one
        ``HarvestObject``.  If the source already has a current harvest
        object, its guid and package id are reused and the status extra is
        ``'change'``; otherwise the guid is the MD5 hex digest of the URL and
        the status extra is ``'new'``.  Content that ``guess_standard``
        reports as ``'iso'`` is stored on the object itself; anything else is
        stored in ``original_document`` / ``original_format`` extras.

        :param harvest_job: the job being gathered; also kept on
            ``self.harvest_job``
        :returns: a one-element list with the harvest object's id, or
            ``None`` when the content could not be fetched (a gather error
            is recorded in that case)
        """
        logger = logging.getLogger(__name__ + '.individual.gather')
        logger.debug('DocHarvester gather_stage for job: %r', harvest_job)

        self.harvest_job = harvest_job

        source = harvest_job.source
        url = source.url

        self._set_source_config(source.config)

        # Fetch the remote document; any failure is reported and aborts the
        # gather for this job.
        try:
            document = self._get_content_as_unicode(url)
        except Exception as err:
            message = 'Unable to get content for URL: %s: %r' % (url, err)
            self._save_gather_error(message, harvest_job)
            return None

        # Look for a harvest object of this source that is flagged current.
        query = model.Session.query(HarvestObject.guid,
                                    HarvestObject.package_id)
        query = query.filter(HarvestObject.current == True)  # noqa: E712
        query = query.filter(HarvestObject.harvest_source_id == source.id)
        previous = query.first()

        if previous:
            status = 'change'
            identity = {'guid': previous.guid,
                        'package_id': previous.package_id}
        else:
            status = 'new'
            digest = hashlib.md5(url.encode('utf8', 'ignore')).hexdigest()
            identity = {'guid': digest}

        harvest_object = HarvestObject(
            job=harvest_job,
            extras=[HOExtra(key='doc_location', value=url),
                    HOExtra(key='status', value=status)],
            **identity)
        harvest_object.add()

        # ISO documents go straight into the object's content; other formats
        # are kept as extras together with the detected format.
        detected_format = guess_standard(document)
        if detected_format == 'iso':
            harvest_object.content = document
        else:
            pending = (('original_document', document),
                       ('original_format', detected_format))
            for extra_key, extra_value in pending:
                HOExtra(object=harvest_object,
                        key=extra_key,
                        value=extra_value).save()

        harvest_object.save()

        return [harvest_object.id]