Пример #1
0
    def process(self, data):
        """
        Create or update the database record for one parsed KE EMu record.

        If a record with this IRN already exists it is rebuilt from ``data``. An
        existing stub record is first converted to this task's model type. If no
        record exists, a new ``self.model_class`` instance is created. The record
        is then merged into the session and committed.

        :param data: dict of field values for one record; must contain ``irn``
        :return: None
        """
        irn = data['irn']

        log.debug('Processing %s record %s', self.model_class.__name__.lower(), irn)

        try:
            # Do we already have a record for this? (raises NoResultFound if not)
            record = self.get_record(irn)

            # Is this a stub record? If it is, we want to change the type and reload.
            # Seems a bit of a hack, but SQLAlchemy does not have a simple way of modifying the type
            #  This only runs for catalogue records
            if isinstance(record, StubModel):

                polymorphic_type = self.model_class.__mapper_args__['polymorphic_identity']
                table_name = self.model_class.__mapper__.local_table.name

                # Manually set type. Only the schema name is interpolated; values are bound parameters.
                self.session.execute(
                    'UPDATE %s.catalogue SET type=:type WHERE irn=:irn' % self.keemu_schema,
                    {'type': polymorphic_type, 'irn': irn}
                )

                # If this has a child table, insert the IRN so updates will work
                if table_name != 'specimen':
                    # And create empty row in the polymorphic table
                    self.session.execute(
                        'INSERT INTO %s.%s (irn) VALUES (:irn)' % (self.keemu_schema, table_name),
                        {'irn': irn}
                    )

                # Commit & expunge so the item can be reloaded
                self.session.commit()
                self.session.expunge(record)
                record = self.get_record(irn)

            # Process the relationships
            data = self._process_relationships(data, record)

            # Populate the data
            record.rebuild(**data)

        except NoResultFound:

            data = self._process_relationships(data)
            # Create a new record
            record = self.model_class(**data)

        try:

            self.session.merge(record)
            self.session.commit()

        except DataError as e:
            # A failed flush leaves the session's transaction unusable until it is
            # rolled back, so reset it before anything else touches the session.
            self.session.rollback()
            # Save this error to the log - will need to follow up on these
            log.critical('DB DataError: record %s not created.' % irn, {'data': data}, exc_info=e)
Пример #2
0
    def get_model_class(self, data):
        """
        Retrieve the model class for a specimen record, using candidate classes based on name, dept etc.

        Candidate class names are built from the record type matched by
        ``self.re_model`` and tried from most to least specific:
        ``<ColKind><Type>Model``, ``<ColDepartment><Type>Model``, ``<Type>Model``.
        The first name defined in this module's globals wins.

        :param data: dict of field values for one record
        :return: the matching model class, or None if the record has no record
                 type or no candidate class is defined
        """
        # If it doesn't even have a record type, what's the point of keeping it?
        if 'ColRecordType' not in data:
            log.debug('Skipping record %s: No record type', data['irn'])
            return None

        collection = data.get('ColKind')
        collection_department = data.get('ColDepartment')

        # KE EMU has case insensitive record types (2820735: specimen)
        # So make sure the first letter is capitalised.
        # Slicing (rather than indexing) keeps an empty record type from raising IndexError.
        record_type = data['ColRecordType']
        record_type = record_type[:1].capitalize() + record_type[1:]
        matches = self.re_model.match(record_type)

        # Build an array of potential candidate classes, most specific first
        candidate_classes = []

        if matches:
            cls = matches.group(0).replace(' ', '')

            if collection:
                # Add candidate class based on ColKind (used for mineralogy) MeteoritesSpecimenModel
                candidate_classes.append('{0}{1}Model'.format(collection, cls))

            if collection_department:
                # Add candidate class BotanySpecimenModel
                candidate_classes.append('{0}{1}Model'.format(collection_department, cls))

            # Add candidate class SpecimenModel, ArtefactModel
            candidate_classes.append('{0}Model'.format(cls))

        # Do we have a model class for any candidate?
        module_namespace = globals()
        for candidate_class in candidate_classes:
            if candidate_class in module_namespace:
                return module_namespace[candidate_class]

        return None
Пример #3
0
    def run(self):
        """
        Delete the records listed in a KE EMu audit (eaudit) export.

        Each parsed row names a module (``AudTable``) and a record IRN (``AudKey``).
        Rows whose module has no KEDataTask subclass are skipped. Otherwise the
        record is loaded through that module's model and deleted via the session.
        A record that cannot be found is reported as an error unless it was
        deleted within 7 days of being inserted. All deletions are committed at
        the end and the task's output target is touched.
        """
        # Need to load an SQLA model
        # So build a dict of all models keyed by KE EMu module
        models = {}

        for cls in KEDataTask.__subclasses__():
            models[cls.module] = cls.model_class if cls.model_class else CatalogueModel

        ke_data = KEParser(self.input().open('r'), schema_file=self.keemu_schema_file, input_file_path=self.input().path)

        for data in ke_data:
            module = data.get('AudTable')
            irn = data.get('AudKey')

            if module not in models:
                log.debug('Skipping eaudit record for %s' % module)
                continue

            model = models[module]

            log.debug('Deleting record %s(%s)' % (model, irn))

            try:
                # Load the object and then delete so we use the SQLA inheritance.
                # Query the model for this row's module, filtered on this row's IRN.
                obj = self.session.query(model).filter(model.irn == irn).one()

            except NoResultFound:

                # We cannot delete this record as it doesn't exist
                # There are a lot of records being inserted and then deleted again
                # So will never appear on the insert exports
                try:
                    date_inserted = datetime.strptime(data.get('AdmDateInserted'), "%Y-%m-%d")
                    date_deleted = datetime.strptime(data.get('AudDate'), "%Y-%m-%d")
                except (TypeError, ValueError):
                    # Dates missing or malformed: the threshold cannot be applied,
                    # so fall through and report the missing record as an error.
                    date_inserted = date_deleted = None

                # If date deleted is within 7 days of the insert date, do not flag an error
                if date_inserted is not None and date_deleted - timedelta(days=7) < date_inserted:
                    log.debug('Record %s(%s) not found for deletion, but within date threshold (inserted: %s deleted: %s)' % (model.__name__, irn, date_inserted, date_deleted))
                else:
                    log.error('Record %s(%s) not found for deletion' % (model, irn))

            else:
                self.session.delete(obj)

        self.session.commit()
        self.output().touch()
Пример #4
0
    def process(self, data):
        """
        Filter and pre-process one catalogue record, then pass it to the parent task.

        The model class is resolved from the record first. Records without a
        model class, or failing any of the filters below, are dropped. Surviving
        records get determination and host/parasite association objects attached
        and a few field fix-ups applied before ``super().process`` is called.

        :param data: dict of field values for one record (modified in place)
        :return: None
        """
        # Everything below depends on knowing which model this record maps to
        self.model_class = self.get_model_class(data)

        if not self.model_class:
            record_type = data.get('ColRecordType', 'Missing')

            if record_type not in self.excluded_types:
                # Not a type we have knowingly excluded, so flag it for follow-up
                log.critical('Unknown model class %s for %s. Investigate and then add to [excluded_types] if not required.', record_type, data['irn'])
            else:
                log.debug('Skipping record %s: No model class for %s', data['irn'], record_type)

            # Nothing more to do for this record
            return

        # Record filters, checked in order; the first one that applies wins
        skip_message = None

        if 'ColDepartment' not in data:
            skip_message = 'Skipping record %s: No collection department'
        elif 'AdmDateInserted' not in data:
            skip_message = 'Skipping record %s: No AdmDateInserted'
        elif data.get('SecRecordStatus') in ('DELETE', 'Reserved', 'Stub', 'Stub Record', 'DELETE-MERGED'):
            skip_message = 'Skipping record %s: Incorrect record status'
        elif data.get('RegHerbariumCurrentOrgAcroLocal') == 'LINN':
            # Botany records include ones from Linnean Society, which are excluded
            skip_message = 'Skipping record %s: Non-BM botany record'

        if skip_message is not None:
            log.debug(skip_message, data['irn'])
            return None

        # Artefacts with neither a kind nor a name are dropped without logging
        is_artefact = data['ColRecordType'] == 'Artefact'
        if is_artefact and not ('ArtKind' in data or 'ArtName' in data):
            return None

        # Determinations: taxon references, falling back to the index lot field
        taxon_refs = data.get('EntIdeTaxonRef') or data.get('EntIndIndexLotTaxonNameLocalRef')

        if taxon_refs:
            links = data['specimen_taxonomy'] = []
            taxon_refs = self.ensure_list(taxon_refs)
            filed_as_ref = data.get('EntIdeFiledAsTaxonRef')

            # Only taxa that exist in the database come back from the query, so
            # this also filters out duplicate and missing references
            found_taxa = self.session.query(TaxonomyModel).filter(TaxonomyModel.irn.in_(taxon_refs)).all()

            links.extend(
                Determination(
                    taxonomy_irn=taxon.irn,
                    specimen_irn=data['irn'],
                    filed_as=(taxon.irn == filed_as_ref),
                )
                for taxon in found_taxa
            )

        # Parasite card host / parasite associations
        refs_by_role = {
            'host': data.get('CardHostRef', []),
            'parasite': data.get('CardParasiteRef', []),
        }
        stages = self.ensure_list(data.get('CardParasiteStage', []))

        for role, role_refs in refs_by_role.items():
            for position, ref in enumerate(self.ensure_list(role_refs)):
                # A stage is matched to a reference by position; none if it runs out
                stage = stages[position] if position < len(stages) else None

                association = HostParasiteAssociation(taxonomy_irn=ref, parasite_card_irn=data['irn'], parasite_host=role, stage=stage)
                data.setdefault('host_parasite_taxonomy', []).append(association)

        # Special field mappings

        # Fall back to PalDetDate when DarYearIdentified is missing
        if 'DarYearIdentified' not in data:
            try:
                date_parts = self.re_date.search(data['PalDetDate'])
            except (KeyError, TypeError):
                # PalDetDate is absent or not a string (it can be a list when there
                # are multiple determination dates, which are ignored)
                date_parts = None

            if date_parts:
                date_fields = ('DarYearIdentified', 'DarMonthIdentified', 'DarDayIdentified')
                for group_number, field in enumerate(date_fields, 1):
                    data[field] = date_parts.group(group_number)

        # Prepend EntCatPrefix to EntCatCatalogueNumber when both are present
        if 'EntCatPrefix' in data and 'EntCatCatalogueNumber' in data:
            data['EntCatCatalogueNumber'] = '{0}{1}'.format(data['EntCatPrefix'], data['EntCatCatalogueNumber'])

        # Egg records default their part type
        if self.model_class is EggModel:
            data.setdefault('PrtType', 'egg')

        super(CatalogueTask, self).process(data)