Exemplo n.º 1
0
    def _parse_metadata_file(self):
        """Load the archive (scientific) metadata file and parse it with ElementTree.

        The filename is taken from the archive descriptor when it names one; otherwise the
        default metadata filename is tried.

        :returns: the parsed content, or None if the archive contains no metadata.
        :raises: :class:`dwca.exceptions.InvalidArchive` if the archive descriptor references a
            non-existent metadata file.
        :raises: IOError for any other failure while reading the metadata file.
        """
        # If the archive has a descriptor, look for the metadata filename there.
        if self.descriptor and self.descriptor.metadata_filename:
            filename = self.descriptor.metadata_filename

            try:
                return self._parse_xml_included_file(filename)
            except IOError as exc:
                if exc.errno == ENOENT:  # File not found
                    msg = "{} is referenced in the archive descriptor but missing.".format(
                        filename)
                    raise InvalidArchive(msg)
                # Any other I/O problem is unexpected: do not disguise it as "no metadata".
                raise

        # Otherwise, the metadata file has to carry the default name.
        try:
            return self._parse_xml_included_file(DEFAULT_METADATA_FILENAME)
        except IOError as exc:
            if exc.errno == ENOENT:  # File not found, this is an archive without metadata
                return None
            # Any other I/O problem is unexpected: let the caller see it.
            raise
Exemplo n.º 2
0
    def _parse_metadata_file(self):
        # type: () -> Optional[Element]
        """Load the archive (scientific) metadata file and parse it with ElementTree.

        The filename is taken from the archive descriptor when it names one; otherwise
        ``self.default_metadata_filename`` is tried.

        :returns: the parsed content, or `None` if the archive has no metadata.
        :raises: :class:`dwca.exceptions.InvalidArchive` if the archive references a non-existent
            metadata file.
        :raises: IOError for any other failure while reading the metadata file.
        """
        # If the archive has a descriptor, look for the metadata filename there.
        if self.descriptor and self.descriptor.metadata_filename:
            filename = self.descriptor.metadata_filename

            try:
                return self._parse_xml_included_file(filename)
            except IOError as exc:
                if exc.errno == ENOENT:  # File not found
                    msg = "{} is referenced in the archive descriptor but missing.".format(
                        filename)
                    raise InvalidArchive(msg)
                # Any other I/O problem is unexpected: propagate it instead of falling through.
                raise

        # Otherwise, the metadata file has to carry the default name.
        try:
            return self._parse_xml_included_file(
                self.default_metadata_filename)
        except IOError as exc:
            if exc.errno == ENOENT:  # File not found, this is an archive without metadata
                return None
            # Every path now returns or raises, so no trailing `assert False` is needed to
            # tell the type checker that the end of the function is unreachable.
            raise
Exemplo n.º 3
0
    def __init__(self, csv_line, position, datafile_descriptor):
        # type: (str, int, DataFileDescriptor) -> None
        """Build a row from one line of a data file.

        :param csv_line: the raw line taken from the data file.
        :param position: the position/index of the row in the source data file.
        :param datafile_descriptor: the descriptor of the data file the line comes from.
        :raises: :class:`InvalidArchive` if the descriptor refers to a field index that the
            line does not have.
        """
        #: The :class:`dwca.descriptors.DataFileDescriptor` instance describing the data file
        #: this row originates from.
        self.descriptor = datafile_descriptor  # type: DataFileDescriptor

        #: The row position/index (starting at 0) in the source data file. It can be passed, for example, to
        #: :meth:`dwca.read.DwCAReader.get_corerow_by_position` or :meth:`dwca.files.CSVDataFile.get_row_by_position`.
        self.position = position  # type: int

        #: The type of the csv line, as stated in the archive descriptor
        #: (or None if the archive has no descriptor). Examples:
        #: http://rs.tdwg.org/dwc/terms/Occurrence,
        #: http://rs.gbif.org/terms/1.0/VernacularName, ...
        self.rowtype = self.descriptor.type  # type: Optional[str]

        source = self.descriptor

        # self.raw_fields holds the content of csv_line, split according to the
        # delimiters declared by the descriptor
        #:
        self.raw_fields = csv_line_to_fields(
            csv_line,
            line_ending=source.lines_terminated_by,
            field_ending=source.fields_terminated_by,
            fields_enclosed_by=source.fields_enclosed_by)

        # TODO: raw_fields is a new property: to test

        # TODO: Consistency check ?? self.raw_fields length should be :
        # num of self.raw_fields described in core_meta + 2 (id and \n)

        #: A dict containing the Row data, such as::
        #:
        #:      {'dwc_term_1': 'value',
        #:       'dwc_term_2': 'value',
        #:       ...}
        #:
        #: Usage::
        #:
        #:      myrow.data['http://rs.tdwg.org/dwc/terms/locality']  # => "Brussels"
        #:
        #: .. note:: The :func:`dwca.darwincore.utils.qualname` helper is available to make such calls less verbose.
        self.data = {}  # type: Dict[str, str]

        for spec in source.fields:
            try:
                idx = int(spec['index'])
                value = self.raw_fields[idx]
            except TypeError:
                # int() rejected the index: this field has no column in the data file
                value = None
            except IndexError:
                raise InvalidArchive(
                    'The descriptor references a non-existent field (index={i})'.format(i=idx))

            fallback = spec['default']

            # Precedence: non-empty value from the row, then the descriptor default, then ''
            self.data[spec['term']] = value or fallback or ''
Exemplo n.º 4
0
    def __init__(self, csv_line, position, descriptor):
        #: An instance of :class:`dwca.descriptors.DataFileDescriptor` describing the originating
        #: data file.
        self.descriptor = descriptor

        #: The row position/index (starting at 0) in the source data file. This can be used, for example with
        #: :meth:`DwCAReader.get_corerow_by_position` or :meth:`CSVDataFile.get_row_by_position`.
        self.position = position

        #: The csv line type as stated in the archive descriptor.
        #: Examples: http://rs.tdwg.org/dwc/terms/Occurrence,
        #: http://rs.gbif.org/terms/1.0/VernacularName, ...
        self.rowtype = self.descriptor.type

        line_ending = self.descriptor.lines_terminated_by
        field_ending = self.descriptor.fields_terminated_by

        fields_enclosed_by = self.descriptor.fields_enclosed_by

        # self.raw_fields is a list of the csv_line's content
        #:
        self.raw_fields = []
        for f in csv_line.rstrip(line_ending).split(field_ending):
            self.raw_fields.append(f.strip(fields_enclosed_by))

        # TODO: raw_fields is a new property: to test

        # TODO: Consistency chek ?? self.raw_fields length should be :
        # num of self.raw_fields described in core_meta + 2 (id and \n)

        #: A dict containing the Row data, such as:
        #: {'dwc_term_1': 'value',
        #: 'dwc_term_2': 'value',
        #: ...}.
        #:
        #: Example::
        #:
        #:      print myrow.data['http://rs.tdwg.org/dwc/terms/locality']  # => "Brussels"
        #:
        #: .. note:: The :func:`dwca.darwincore.utils.qualname` helper is avalaible to make such calls less verbose.
        self.data = {}

        for f in self.descriptor.fields:
            # if field by default, we can find its value directly in <field>
            # attribute
            if f['default'] is not None:
                self.data[f['term']] = f['default']
            else:
                # else, we have to look in core file
                field_index = int(f['index'])
                try:
                    self.data[f['term']] = self.raw_fields[field_index]
                except IndexError:
                    msg = 'The descriptor references a non-existent field (index={i})'.format(i=field_index)
                    raise InvalidArchive(msg)
Exemplo n.º 5
0
    def __init__(self,
                 metaxml_content: str,
                 files_to_ignore: List[str] = None) -> None:
        """Parse the XML content of an archive descriptor (Metafile).

        :param metaxml_content: the XML content of the descriptor.
        :param files_to_ignore: extensions whose ``files/location`` text appears in this list
            are left out of :attr:`extensions`. Defaults to ignoring nothing.
        :raises: :class:`InvalidArchive` if an extension has no ``files/location`` element.
        """
        ignored = [] if files_to_ignore is None else files_to_ignore

        # Remove the first default-namespace declaration so elements can be looked up
        # without a namespace prefix
        without_namespace = re.sub(' xmlns="[^"]+"', '', metaxml_content, count=1)

        #: A :class:`xml.etree.ElementTree.Element` instance containing the complete Archive Descriptor.
        self.raw_element = ET.fromstring(without_namespace)  # type: Element

        #: The path (relative to archive root) of the (scientific) metadata of the archive.
        self.metadata_filename = self.raw_element.get('metadata', None)

        #: An instance of :class:`dwca.descriptors.DataFileDescriptor` describing the core data file.
        self.core = DataFileDescriptor.make_from_metafile_section(
            self.raw_element.find('core'))  # type: DataFileDescriptor

        #: A list of :class:`dwca.descriptors.DataFileDescriptor` instances describing each of the archive's extension
        #: data files.
        self.extensions = []  # type: List[DataFileDescriptor]
        for extension_element in self.raw_element.findall('extension'):  # type: Element
            location = extension_element.find('./files/location')
            if location is None:
                raise InvalidArchive(
                    "An extension file is referenced in Metafile, but its path is not specified."
                )

            if location.text in ignored:
                continue

            section = DataFileDescriptor.make_from_metafile_section(extension_element)
            self.extensions.append(section)

        #: A list of extension (types) in use in the archive.
        #:
        #: Example::
        #:
        #:     ["http://rs.gbif.org/terms/1.0/VernacularName",
        #:      "http://rs.gbif.org/terms/1.0/Description"]
        self.extensions_type = [extension.type for extension in self.extensions]
Exemplo n.º 6
0
    def _unzip_or_untar(self) -> str:
        """Create a temporary dir. and uncompress/unarchive self.archive_path there.

        Returns the path to that temporary directory.

        Raises InvalidArchive if not a zip nor a tgz file.
        """
        tmp_dir = mkdtemp()

        # We first try to unzip (most common archives)
        try:
            # Security note: with Python < 2.7.4, a zip file may be able to write outside of the
            # directory using absolute paths, parent (..) path, ... See note in ZipFile.extract doc
            zipfile.ZipFile(self.archive_path, 'r').extractall(tmp_dir)
        except zipfile.BadZipfile:
            # Doesn't look like a valid zip, let's see if it's a tar archive (possibly compressed)
            try:
                tarfile.open(self.archive_path, 'r:*').extractall(tmp_dir)
            except tarfile.ReadError:
                raise InvalidArchive("The archive cannot be read. Is it a .zip or .tgz file?")

        return tmp_dir