def run(self):
        """Delete the Mongo documents named by the audit records in the input file.

        Each record's ``AudTable`` value selects a collection and its
        ``AudKey`` value is handed to ``self.delete`` as the record identifier.
        Records whose ``AudTable`` has no matching collection are skipped.
        Once every record has been handled the task is marked complete.

        Raises:
            Exception: if ``self.force`` is falsy. This task removes nothing
                from CKAN, so it has to be requested explicitly with --force.
        """
        # Running this task doesn't delete anything from CKAN itself - so
        # require the --force flag to be sent to run it
        if not self.force:
            raise Exception('Warning: this class does not delete CKAN records. Use --force to run it.')

        # Map every module name to its collection, so the collection can be
        # looked up from a record's module name (AudTable).
        # NOTE(review): __subclasses__() only yields direct subclasses of
        # MongoTask. The original comment said MongoDeleteTask should be
        # excluded, but no subclass is filtered out here - confirm which is
        # intended.
        collections = {cls.module: cls(None).get_collection() for cls in MongoTask.__subclasses__()}

        source = self.input()

        # Keep the input handle inside a with-block so it is closed when the
        # loop finishes or fails, instead of being left open.
        with source.open('r') as handle:
            ke_data = KEParser(handle, file_path=source.path, schema_file=self.keemu_schema_file)

            for record in self.iterate_data(ke_data):
                module = record.get('AudTable')
                irn = record.get('AudKey')

                try:
                    collection = collections[module]
                except KeyError:
                    # We do not have a collection for this module - skip to next record
                    log.debug('Skipping eaudit record for %s', module)
                    continue

                log.info('Deleting record %s(%s)', module, irn)
                self.delete(collection, irn)

        self.mark_complete()
    def run(self):
        """Delete the Mongo documents named by the audit records in the input file.

        Each record's ``AudTable`` value selects a collection and its
        ``AudKey`` value is handed to ``self.delete`` as the record identifier.
        Records whose ``AudTable`` has no matching collection are skipped.
        Once every record has been handled the task is marked complete.

        Raises:
            Exception: if ``self.force`` is falsy. This task removes nothing
                from CKAN, so it has to be requested explicitly with --force.
        """
        # Running this task doesn't delete anything from CKAN itself - so
        # require the --force flag to be sent to run it
        if not self.force:
            raise Exception(
                'Warning: this class does not delete CKAN records. Use --force to run it.'
            )

        # Map every module name to its collection, so the collection can be
        # looked up from a record's module name (AudTable).
        # NOTE(review): __subclasses__() only yields direct subclasses of
        # MongoTask. The original comment said MongoDeleteTask should be
        # excluded, but no subclass is filtered out here - confirm which is
        # intended.
        collections = {
            cls.module: cls(None).get_collection()
            for cls in MongoTask.__subclasses__()
        }

        source = self.input()

        # Keep the input handle inside a with-block so it is closed when the
        # loop finishes or fails, instead of being left open.
        with source.open('r') as handle:
            ke_data = KEParser(handle,
                               file_path=source.path,
                               schema_file=self.keemu_schema_file)

            for record in self.iterate_data(ke_data):
                module = record.get('AudTable')
                irn = record.get('AudKey')

                try:
                    collection = collections[module]
                except KeyError:
                    # We do not have a collection for this module - skip to next record
                    log.debug('Skipping eaudit record for %s', module)
                    continue

                log.info('Deleting record %s(%s)', module, irn)
                self.delete(collection, irn)

        self.mark_complete()
示例#3
0
    def process_record(self, data):
        """Validate and normalise a catalogue record, then pass it to the parent class.

        The record is rejected when its ``ColRecordType`` is listed in
        ``self.excluded_types``, when ``AdmGUIDPreferredValue`` is present but
        cannot be parsed as a UUID, when ``ColDepartment`` is missing or
        empty, or when ``AdmDateInserted`` is missing or does not have the
        same length as ``DATE_FORMAT``.

        Accepted records are modified in place: a fixed set of fields is cast
        to ``str``, ``cites`` is set to True when ``DarScientificName`` is in
        ``self.cites_species``, and ``RealEmbargoDate`` is set to the larger
        of the two embargo timestamps (0 when neither date is present).

        Args:
            data: dict-like record keyed by field name.

        Returns:
            Whatever the parent class's ``process_record`` returns for the
            updated record.

        Raises:
            InvalidRecordException: if the record fails any check above.
        """
        # Only import if it's one of the record types we want
        record_type = data.get('ColRecordType', 'Missing')

        if record_type in self.excluded_types:
            log.debug('Skipping record %s: Excluded type %s', data['irn'],
                      record_type)
            raise InvalidRecordException

        # Make sure the UUID is valid
        guid = data.get('AdmGUIDPreferredValue')

        if guid:
            try:
                # NOTE: this only rejects malformed strings. The version
                # argument overrides the version bits of the parsed value; it
                # does not check that the input really is a version 4 UUID.
                UUID(guid, version=4)
            except ValueError:
                # Not a valid hex code for a UUID. Report it through the
                # module logger, like every other rejection in this method.
                log.error('Skipping record %s: invalid AdmGUIDPreferredValue %s',
                          data.get('irn'), guid)
                raise InvalidRecordException

        # If we don't have collection department, skip it
        if not data.get('ColDepartment'):
            raise InvalidRecordException

        date_inserted = data.get('AdmDateInserted')

        # Some records have an invalid AdmDateInserted=20-09-27
        # As we need this for the stats, we need to skip them - just checking
        # against date length as it's much quicker
        if not date_inserted or len(DATE_FORMAT) != len(date_inserted):
            log.error('Skipping record %s: invalid AdmDateInserted %s',
                      data['irn'], date_inserted)
            raise InvalidRecordException

        # For now, the mongo aggregator cannot handle int / bool in $concat
        # So properties that are used in dynamicProperties need to be cast as strings
        string_fields = (
            'DnaTotalVolume',
            'FeaCultivated',
            'MinMetRecoveryWeight',
            'MinMetWeightAsRegistered',
        )
        for field in string_fields:
            if field in data:
                data[field] = str(data[field])

        # If record is a CITES species, mark cites = True
        scientific_name = data.get('DarScientificName')

        if scientific_name and scientific_name in self.cites_species:
            data['cites'] = True

        # For the embargo date, use the latest of NhmSecEmbargoDate and
        # NhmSecEmbargoExtensionDate: convert each one that is set to a
        # timestamp and count a missing one as 0.
        embargo_timestamps = [
            self.date_to_timestamp(data.get(field)) if data.get(field) else 0
            for field in ('NhmSecEmbargoDate', 'NhmSecEmbargoExtensionDate')
        ]

        # Set the real embargo date to the largest embargo or extension date
        data['RealEmbargoDate'] = max(embargo_timestamps)
        return super(MongoCatalogueTask, self).process_record(data)