Example #1
0
    def endElementNS(self, name, qname):
        """SAX end-of-element callback for the METS ingest handler.

        Dispatches on the element's local name (namespace stripped) and
        unwinds the flags set by the matching start-element handler.  As
        structural sections close, the accumulated metadata is persisted:
        'dmdSec' saves the Experiment / Dataset models, 'amdSec' saves
        datafiles that carried no metadata (plus a Replica each), and the
        xmlData child element saves the parameter sets for experiment /
        dataset / datafile metadata.

        name  -- (namespace URI, local name) tuple from the SAX parser
        qname -- qualified name (unused here)
        """
        # just get the element name without the namespace
        elName = name[1]

        if elName == 'dmdSec':
            self.inDmdSec = False
            # if we currently processing an experiment structure, let's
            # save the institution value before we finalise the experiment
            if self.processExperimentStruct:
                self.metsObject.institution = self.institution

                # let's save the experiment in the DB
                # (reuse an existing Experiment row when a tardisExpId was
                # supplied, otherwise create a fresh one)
                if self.tardisExpId:
                    self.modelExperiment = models.Experiment.objects.get(
                        pk=self.tardisExpId)
                else:
                    self.modelExperiment = models.Experiment()
                self.modelExperiment.id = self.tardisExpId
                self.modelExperiment.url = self.metsObject.url
                self.modelExperiment.approved = True
                self.modelExperiment.title = self.metsObject.title
                self.modelExperiment.institution_name = \
                                            self.metsObject.institution
                self.modelExperiment.description = self.metsObject.description
                self.modelExperiment.start_time = self.metsObject.start_time
                self.modelExperiment.end_time = self.metsObject.end_time
                self.modelExperiment.created_by = self.createdBy

                self.modelExperiment.save()

                self.holder.experimentDatabaseId = self.modelExperiment.id

                # record the experiment's authors, preserving document order
                x = 0
                for author in self.metsObject.authors:
                    author_experiment = models.Author_Experiment(
                        experiment=self.modelExperiment,
                        author=author, order=x)
                    author_experiment.save()
                    x = x + 1

            elif self.processDatasetStruct:
                # let's save the dataset in the DB
                self.modelDataset = models.Dataset(
                    description=self.metsObject.title,
                    immutable=settings.IMMUTABLE_METS_DATASETS)
                self.modelDataset.save()
                self.modelDataset.experiments.add(self.modelExperiment)
                self.modelDataset.save()

                # let's also save the modelDataset in a dictionary so that we
                # can look it up easily later on when we start processing
                # the datafiles.
                self.datasetLookupDict[self.metsObject.id] = self.modelDataset

            self.metsObject = None

            self.processExperimentStruct = False
            self.processDatasetStruct = False

        elif elName == 'title' and self.inDmdSec:
            self.grabTitle = False

        elif elName == 'startTime' and self.processExperimentStruct:
            self.grabStartTime = False

        elif elName == 'endTime' and self.processExperimentStruct:
            self.grabEndTime = False

        elif elName == 'url' and self.processExperimentStruct:
            self.grabExperimentUrl = False

        elif elName == 'abstract' and self.processExperimentStruct:
            self.grabAbstract = False

        elif elName == 'name' and self.processExperimentStruct:
            self.inName = False

        elif elName == 'namePart' and self.inName:
            self.grabMightBeAuthor = False

        elif elName == 'roleTerm' and self.inName:
            self.grabRoleTerm = False
            self.mightBeAuthor = None

        elif elName == 'name' and self.inInstitution:
            self.grabInstitution = False

        elif elName == 'agent':
            self.inInstitution = False

        elif elName == 'amdSec':

            # we're done processing the metadata entries
            self.inAmdSec = False

            # let's reset the cached experiment model object
            self.modelExperiment = None

            logger.info(self.holder.metadataMap)
            # NOTE(review): 'NOMD-' keys appear to mark datafiles that have
            # no associated metadata section, so their DB entries (and
            # replicas) are created here rather than in the xmlData branch
            # below -- confirm against the start-element handler that
            # populates metadataMap.
            for mdId in self.holder.metadataMap:
                if mdId.startswith('NOMD-'):
                    df = self.holder.metadataMap[mdId][0]

                    if df.dataset.id in self.datasetLookupDict:
                        # look up the dataset this file belongs to
                        thisFilesDataset = self.datasetLookupDict[
                            df.dataset.id]

                        # a missing/falsy size is normalised to 0
                        size = df.size

                        if not df.size:
                            size = 0

                        def checksum(obj, type_):
                            # Validate obj's checksum for the given type_
                            # ('MD5' / 'SHA-512'); returns '' when it is
                            # missing, malformed, or of a different type.
                            # Check if the checksum is of type
                            if obj.checksumType != type_:
                                return ''
                            checksum = obj.checksum.lower()
                            # Ensure the checksum is hexdecimal
                            if not re.match('[0-9a-f]+$', checksum):
                                return ''
                            # Get algorithm
                            # NOTE(review): bare except here swallows any
                            # lookup failure; getattr raises AttributeError
                            # only, so the clause could be narrowed.
                            try:
                                name = type_.replace('-','').lower()
                                alg = getattr(hashlib, name)
                            except:
                                return ''
                            # Check checksum is the correct length
                            # NOTE(review): alg('') assumes Python 2 str;
                            # hashlib on Python 3 requires bytes (alg(b'')).
                            hex_length = alg('').digest_size * 2
                            if hex_length != len(checksum):
                                return ''
                            # Should be valid checksum of given type
                            return checksum

                        sync_url, proto = get_sync_url_and_protocol(
                                                    get_sync_root(),
                                                    df.url)

                        self.modelDatafile = models.Dataset_File(
                            dataset=thisFilesDataset,
                            filename=df.name,
                            size=size,
                            md5sum=checksum(df, 'MD5'),
                            sha512sum=checksum(df,
                                           'SHA-512'))

                        logger.info('=== saving datafile: %s' % df.name)
                        self.modelDatafile.save() 

                        # register where the file's bytes actually live
                        replica = models.Replica(
                            datafile=self.modelDatafile,
                            url=sync_url,
                            protocol=proto,
                            location=self.syncLocation)
                        replica.save()


        elif elName == 'techMD' and self.inAmdSec:
            self.inTechMd = False
            self.metadataId = None
            self.metsObject = None
            self.processMetadata = False

        elif elName == 'xmlData' and self.inTechMd:
            self.inXmlData = False

        elif elName != self.xmlDataChildElement and \
                self.customHandler is not None:
            # delegate non-matching elements to the plugged-in handler
            self.customHandler.endElement(elName)

        elif elName == self.xmlDataChildElement and self.inXmlData:

            if self.customHandler is not None:
                self.tempMetadataHolder = self.customHandler.metadataDict

            try:
                schema = models.Schema.objects.get(
                    namespace__exact=self.elementNamespace)

                # get the associated parameter names for the given schema
                parameterNames = \
                    models.ParameterName.objects.filter(
                    schema__namespace__exact=schema.namespace).order_by('id')

                # let's create a trigger holder which we can use to check
                # if we still need to create another parameterset entry in the
                # DB
                createParamSetFlag = {'experiment': True, 'dataset': True,
                                      'datafile': True}
                datasetParameterSet = None
                datafileParameterSet = None

                if self.metadataId  in self.holder.metadataMap:
                    for metsObject in self.holder.metadataMap[self.metadataId]:
                        self.metsObject = metsObject

                        # dispatch on the mets wrapper's class name
                        metsObjectClassName = self.metsObject.__class__.__name__

                        if metsObjectClassName == 'Experiment':
                            if createParamSetFlag['experiment']:
                                # create a new parameter set for the metadata
                                parameterSet = \
                                    models.ExperimentParameterSet(
                                    schema=schema,
                                    experiment=self.modelExperiment)
                                parameterSet.save()

                                # now let's process the experiment parameters
                                for parameterName in parameterNames:
                                    if parameterName.name in \
                                            self.tempMetadataHolder:
                                        parameterValues = self.tempMetadataHolder[
                                            parameterName.name]
                                        self._saveParameters('ExperimentParameter',
                                                parameterName, parameterValues,
                                                parameterSet)

                                createParamSetFlag['experiment'] = False

                            else:
                                # this is not even allowed as there's only going
                                # to be one experiment per METS file
                                raise Exception('forbidden state!')

                        elif metsObjectClassName == 'Dataset':
                            if createParamSetFlag['dataset']:
                                dataset = self.datasetLookupDict[
                                    self.metsObject.id]

                                # create a new parameter set for the
                                # dataset metadata
                                datasetParameterSet = \
                                    models.DatasetParameterSet(schema=schema,
                                    dataset=dataset)

                                datasetParameterSet.save()

                                # now let's process the dataset parameters
                                for parameterName in parameterNames:
                                    if parameterName.name in \
                                            self.tempMetadataHolder:
                                        parameterValues = self.tempMetadataHolder[
                                            parameterName.name]
                                        self._saveParameters('DatasetParameter',
                                                parameterName, parameterValues,
                                                datasetParameterSet)

                                # disable creation for the next visit
                                createParamSetFlag['dataset'] = False

                        elif metsObjectClassName == 'Datafile':
                            # this will be a good time to save the
                            # "hard" metadata of this datafile so that
                            # when we start adding "soft" metadata
                            # parameters to it, we already have an
                            # entry for it in the DB
                            logger.info('=== found datafile: %s' % self.metsObject.name)

                            # look up the dataset this file belongs to
                            thisFilesDataset = self.datasetLookupDict[
                                self.metsObject.dataset.id]

                            # also check if the file already exists
                            datafile = thisFilesDataset.dataset_file_set.filter(
                                filename=self.metsObject.name, size=self.metsObject.size)


                            if datafile.count() == 0:
                                size = self.metsObject.size

                                if not self.metsObject.size:
                                    size = 0


                                def checksum(obj, type_):
                                    # Validate obj's checksum for the given
                                    # type_; '' when missing or malformed.
                                    # (Duplicate of the closure in the
                                    # amdSec branch above.)
                                    # Check if the checksum is of type
                                    if obj.checksumType != type_:
                                        return ''
                                    checksum = obj.checksum.lower()
                                    # Ensure the checksum is hexdecimal
                                    if not re.match('[0-9a-f]+$', checksum):
                                        return ''
                                    # Get algorithm
                                    try:
                                        name = type_.replace('-','').lower()
                                        alg = getattr(hashlib, name)
                                    except:
                                        return ''
                                    # Check checksum is the correct length
                                    # NOTE(review): alg('') is Python-2-only;
                                    # Python 3 hashlib requires bytes.
                                    hex_length = alg('').digest_size * 2
                                    if hex_length != len(checksum):
                                        return ''
                                    # Should be valid checksum of given type
                                    return checksum

                                sync_url, proto = get_sync_url_and_protocol(
                                                    self.syncRootDir,
                                                    self.metsObject.url)

                                self.modelDatafile = models.Dataset_File(
                                    dataset=thisFilesDataset,
                                    filename=self.metsObject.name,
                                    size=size,
                                    md5sum=checksum(self.metsObject, 'MD5'),
                                    sha512sum=checksum(self.metsObject,
                                                       'SHA-512'))

                                logger.info('=== saving datafile: %s' % self.metsObject.name)
                                self.modelDatafile.save()
                                replica = models.Replica(
                                    datafile=self.modelDatafile,
                                    url=sync_url,
                                    protocol=proto,
                                    location=self.syncLocation)
                                replica.save()
                                
                            else:
                                # file already in the DB: reuse that row
                                self.modelDatafile = thisFilesDataset.dataset_file_set.get(
                                    filename=self.metsObject.name, size=self.metsObject.size)
                            # TODO: we need to note here that we are
                            # only creating a datafile entry in the DB
                            # for files that have corresponding
                            # metadata. if we are to create a file
                            # entry for files with no metadata, we'll
                            # need to get the unaccessed datafiles
                            # from datasetLookupDict.

                            if createParamSetFlag['datafile']:
                                # create a new parameter set for the metadata
                                datafileParameterSet = \
                                    models.DatafileParameterSet(schema=schema,
                                    dataset_file=self.modelDatafile)
                                datafileParameterSet.save()

                                # now let's process the datafile parameters
                                for parameterName in parameterNames:
                                    if parameterName.name in \
                                            self.tempMetadataHolder:
                                        parameterValues = self.tempMetadataHolder[
                                            parameterName.name]
                                        self._saveParameters('DatafileParameter',
                                                parameterName, parameterValues,
                                                datafileParameterSet)
                                createParamSetFlag['datafile'] = False


            except models.Schema.DoesNotExist:
                # unknown namespace: skip this metadata block entirely
                logger.warning('unsupported schema being ingested ' +
                    self.elementNamespace)

            # reset the current xmlData child element so that if a new
            # parameter set is read, we can process it again
            self.xmlDataChildElement = None
            self.customHandler = None

        elif elName == self.parameterName and \
                self.xmlDataChildElement is not None:

            # reset self.parameterName to None so the next parameter can be
            # processed
            self.parameterName = None
            elif line.startswith('</experiment>'):

                if current == 'dataset':
                    d = Dataset(experiment=experiment,
                            description=dataset['description'])
                    d.save()
                else:

                    if self.null_check(datafile['name']):
                        filename = datafile['name']
                    else:
                        filename = datafile['path']

                    sync_url, proto = get_sync_url_and_protocol(
                                        sync_root,
                                        datafile['path'])

                    dfile = Dataset_File(dataset=d,
                                         filename=filename,
                                         url=sync_url,
                                         size=datafile['size'],
                                         protocol=proto)
                    dfile.save()

                    current_df_id = dfile.id

                    if 'metadata' in datafile:
                        for md in datafile['metadata']:
                            xmlns = getXmlnsFromTechXMLRaw(md)
Example #3
0
    def endElementNS(self, name, qname):
        """SAX end-of-element callback for the METS ingest handler.

        Variant of the handler in which datafile location info (url,
        protocol) is stored directly on the Dataset_File row instead of a
        separate Replica model, and the 'amdSec' close only resets state.
        Dispatches on the element's local name (namespace stripped),
        unwinds flags set by the start-element handler, and persists
        Experiment / Dataset / Dataset_File models plus their parameter
        sets as the corresponding METS sections close.

        name  -- (namespace URI, local name) tuple from the SAX parser
        qname -- qualified name (unused here)
        """
        # just get the element name without the namespace
        elName = name[1]

        if elName == 'dmdSec':
            self.inDmdSec = False
            # if we currently processing an experiment structure, let's
            # save the institution value before we finalise the experiment
            if self.processExperimentStruct:
                self.metsObject.institution = self.institution

                # let's save the experiment in the DB
                # (reuse an existing Experiment row when a tardisExpId was
                # supplied, otherwise create a fresh one)
                if self.tardisExpId:
                    self.modelExperiment = models.Experiment.objects.get(
                        pk=self.tardisExpId)
                else:
                    self.modelExperiment = models.Experiment()
                self.modelExperiment.id = self.tardisExpId
                self.modelExperiment.url = self.metsObject.url
                self.modelExperiment.approved = True
                self.modelExperiment.title = self.metsObject.title
                self.modelExperiment.institution_name = \
                                            self.metsObject.institution
                self.modelExperiment.description = self.metsObject.description
                self.modelExperiment.start_time = self.metsObject.start_time
                self.modelExperiment.end_time = self.metsObject.end_time
                self.modelExperiment.created_by = self.createdBy

                self.modelExperiment.save()

                self.holder.experimentDatabaseId = self.modelExperiment.id

                # record the experiment's authors, preserving document order
                x = 0
                for author in self.metsObject.authors:
                    author_experiment = models.Author_Experiment(
                        experiment=self.modelExperiment,
                        author=author, order=x)
                    author_experiment.save()
                    x = x + 1

            elif self.processDatasetStruct:
                # let's save the dataset in the DB
                self.modelDataset = models.Dataset(
                    description=self.metsObject.title,
                    immutable=settings.IMMUTABLE_METS_DATASETS)
                self.modelDataset.save()
                self.modelDataset.experiments.add(self.modelExperiment)
                self.modelDataset.save()

                # let's also save the modelDataset in a dictionary so that we
                # can look it up easily later on when we start processing
                # the datafiles.
                self.datasetLookupDict[self.metsObject.id] = self.modelDataset

            self.metsObject = None

            self.processExperimentStruct = False
            self.processDatasetStruct = False

        elif elName == 'title' and self.inDmdSec:
            self.grabTitle = False

        elif elName == 'startTime' and self.processExperimentStruct:
            self.grabStartTime = False

        elif elName == 'endTime' and self.processExperimentStruct:
            self.grabEndTime = False

        elif elName == 'url' and self.processExperimentStruct:
            self.grabExperimentUrl = False

        elif elName == 'abstract' and self.processExperimentStruct:
            self.grabAbstract = False

        elif elName == 'name' and self.processExperimentStruct:
            self.inName = False

        elif elName == 'namePart' and self.inName:
            self.grabMightBeAuthor = False

        elif elName == 'roleTerm' and self.inName:
            self.grabRoleTerm = False
            self.mightBeAuthor = None

        elif elName == 'name' and self.inInstitution:
            self.grabInstitution = False

        elif elName == 'agent':
            self.inInstitution = False

        elif elName == 'amdSec':
            # we're done processing the metadata entries
            self.inAmdSec = False

            # let's reset the cached experiment model object
            self.modelExperiment = None

        elif elName == 'techMD' and self.inAmdSec:
            self.inTechMd = False
            self.metadataId = None
            self.metsObject = None
            self.processMetadata = False

        elif elName == 'xmlData' and self.inTechMd:
            self.inXmlData = False

        elif elName != self.xmlDataChildElement and \
                self.customHandler is not None:
            # delegate non-matching elements to the plugged-in handler
            self.customHandler.endElement(elName)

        elif elName == self.xmlDataChildElement and self.inXmlData:

            if self.customHandler is not None:
                self.tempMetadataHolder = self.customHandler.metadataDict

            try:
                schema = models.Schema.objects.get(
                    namespace__exact=self.elementNamespace)

                # get the associated parameter names for the given schema
                parameterNames = \
                    models.ParameterName.objects.filter(
                    schema__namespace__exact=schema.namespace).order_by('id')

                # let's create a trigger holder which we can use to check
                # if we still need to create another parameterset entry in the
                # DB
                createParamSetFlag = {'experiment': True, 'dataset': True,
                                      'datafile': True}
                datasetParameterSet = None
                datafileParameterSet = None

                if self.metadataId  in self.holder.metadataMap:
                    for metsObject in self.holder.metadataMap[self.metadataId]:
                        self.metsObject = metsObject

                        # dispatch on the mets wrapper's class name
                        metsObjectClassName = self.metsObject.__class__.__name__

                        if metsObjectClassName == 'Experiment':
                            if createParamSetFlag['experiment']:
                                # create a new parameter set for the metadata
                                parameterSet = \
                                    models.ExperimentParameterSet(
                                    schema=schema,
                                    experiment=self.modelExperiment)
                                parameterSet.save()

                                # now let's process the experiment parameters
                                for parameterName in parameterNames:
                                    if parameterName.name in \
                                            self.tempMetadataHolder:
                                        parameterValues = self.tempMetadataHolder[
                                            parameterName.name]
                                        self._saveParameters('ExperimentParameter',
                                                parameterName, parameterValues,
                                                parameterSet)

                                createParamSetFlag['experiment'] = False

                            else:
                                # this is not even allowed as there's only going
                                # to be one experiment per METS file
                                raise Exception('forbidden state!')

                        elif metsObjectClassName == 'Dataset':
                            if createParamSetFlag['dataset']:
                                dataset = self.datasetLookupDict[
                                    self.metsObject.id]

                                # create a new parameter set for the
                                # dataset metadata
                                datasetParameterSet = \
                                    models.DatasetParameterSet(schema=schema,
                                    dataset=dataset)

                                datasetParameterSet.save()

                                # now let's process the dataset parameters
                                for parameterName in parameterNames:
                                    if parameterName.name in \
                                            self.tempMetadataHolder:
                                        parameterValues = self.tempMetadataHolder[
                                            parameterName.name]
                                        self._saveParameters('DatasetParameter',
                                                parameterName, parameterValues,
                                                datasetParameterSet)

                                # disable creation for the next visit
                                createParamSetFlag['dataset'] = False

                        elif metsObjectClassName == 'Datafile':
                            # this will be a good time to save the
                            # "hard" metadata of this datafile so that
                            # when we start adding "soft" metadata
                            # parameters to it, we already have an
                            # entry for it in the DB

                            # look up the dataset this file belongs to
                            thisFilesDataset = self.datasetLookupDict[
                                self.metsObject.dataset.id]

                            # also check if the file already exists
                            datafile = thisFilesDataset.dataset_file_set.filter(
                                filename=self.metsObject.name, size=self.metsObject.size)

                            if datafile.count() == 0:
                                size = self.metsObject.size

                                if not self.metsObject.size:
                                    size = 0


                                def checksum(obj, type_):
                                    # Validate obj's checksum for the given
                                    # type_ ('MD5' / 'SHA-512'); '' when it
                                    # is missing, malformed, or of a
                                    # different type.
                                    # Check if the checksum is of type
                                    if obj.checksumType != type_:
                                        return ''
                                    checksum = obj.checksum.lower()
                                    # Ensure the checksum is hexdecimal
                                    if not re.match('[0-9a-f]+$', checksum):
                                        return ''
                                    # Get algorithm
                                    try:
                                        name = type_.replace('-','').lower()
                                        alg = getattr(hashlib, name)
                                    except:
                                        return ''
                                    # Check checksum is the correct length
                                    # NOTE(review): alg('') is Python-2-only;
                                    # Python 3 hashlib requires bytes.
                                    hex_length = alg('').digest_size * 2
                                    if hex_length != len(checksum):
                                        return ''
                                    # Should be valid checksum of given type
                                    return checksum

                                sync_url, proto = get_sync_url_and_protocol(
                                                    self.syncRootDir,
                                                    self.metsObject.url)

                                # url/protocol live on the Dataset_File row
                                # itself in this variant (no Replica model)
                                self.modelDatafile = models.Dataset_File(
                                    dataset=thisFilesDataset,
                                    filename=self.metsObject.name,
                                    url=sync_url,
                                    size=size,
                                    md5sum=checksum(self.metsObject, 'MD5'),
                                    sha512sum=checksum(self.metsObject,
                                                       'SHA-512'),
                                    protocol=proto)

                                self.modelDatafile.save()
                            else:
                                # file already in the DB: reuse that row
                                self.modelDatafile = thisFilesDataset.dataset_file_set.get(
                                    filename=self.metsObject.name, size=self.metsObject.size)
                            # TODO: we need to note here that we are
                            # only creating a datafile entry in the DB
                            # for files that have corresponding
                            # metadata. if we are to create a file
                            # entry for files with no metadata, we'll
                            # need to get the unaccessed datafiles
                            # from datasetLookupDict.

                            if createParamSetFlag['datafile']:
                                # create a new parameter set for the metadata
                                datafileParameterSet = \
                                    models.DatafileParameterSet(schema=schema,
                                    dataset_file=self.modelDatafile)
                                datafileParameterSet.save()

                                # now let's process the datafile parameters
                                for parameterName in parameterNames:
                                    if parameterName.name in \
                                            self.tempMetadataHolder:
                                        parameterValues = self.tempMetadataHolder[
                                            parameterName.name]
                                        self._saveParameters('DatafileParameter',
                                                parameterName, parameterValues,
                                                datafileParameterSet)
                                createParamSetFlag['datafile'] = False

            except models.Schema.DoesNotExist:
                # unknown namespace: skip this metadata block entirely
                logger.warning('unsupported schema being ingested ' +
                    self.elementNamespace)

            # reset the current xmlData child element so that if a new
            # parameter set is read, we can process it again
            self.xmlDataChildElement = None
            self.customHandler = None

        elif elName == self.parameterName and \
                self.xmlDataChildElement is not None:

            # reset self.parameterName to None so the next parameter can be
            # processed
            self.parameterName = None
                exp['abstract'] = ab

            elif line.startswith('</experiment>'):

                if current == 'dataset':
                    d = Dataset(experiment=experiment,
                                description=dataset['description'])
                    d.save()
                else:

                    if self.null_check(datafile['name']):
                        filename = datafile['name']
                    else:
                        filename = datafile['path']

                    sync_url, proto = get_sync_url_and_protocol(
                        sync_root, datafile['path'])

                    dfile = Dataset_File(dataset=d,
                                         filename=filename,
                                         url=sync_url,
                                         size=datafile['size'],
                                         protocol=proto)
                    dfile.save()

                    current_df_id = dfile.id

                    if 'metadata' in datafile:
                        for md in datafile['metadata']:
                            xmlns = getXmlnsFromTechXMLRaw(md)

                            try: