def __init__(self, file_stream, url):
    """Read a METS document from a stream and parse it into a DOM.

    The whole stream content is read into memory, parsed, and then
    validated with self._check_xml(). The structure, metadata, relation
    and file-list attributes all start out as None.

    Parameters:
        file_stream -- file-like object holding the XML; it is handed to
                       DocumentParser.__init__ (presumably stored there
                       as self._file_stream, which is read below).
        url -- kept as self._url.

    Raises:
        ParserError.InvalidDocument -- if the content cannot be parsed,
        or if self._check_xml() does not return True.
    """
    DocumentParser.__init__(self, file_stream)
    self._url = url
    self._namespace_URI = 'http://www.loc.gov/METS/'
    self._mods_namespace_URI = 'http://www.loc.gov/mods/v3'

    # Read the content of the file.
    self._content_str = self._file_stream.read()

    # Nothing has been extracted from the document yet.
    self._logical_structure = None
    self._physical_structure = None
    self._meta_data = None
    self._relation = None
    self._file_list = None

    # NOTE(review): some METS files contain an uppercase mets directive.
    # A workaround that rewrote 'METS='/'MODS=' to lowercase in
    # self._content_str used to sit here but was already commented out,
    # so no normalisation is applied before parsing.

    # Single message for both failure paths. The original built it from
    # two adjacent literals without a separating space, which produced
    # "(is itcorrupted?)".
    invalid_msg = "The file is invalid. (is it corrupted?)"

    try:
        self._doc = parseString(self._content_str)
    except Exception:
        # Any parsing failure is reported to callers as an invalid
        # document rather than as a parser-specific exception.
        raise ParserError.InvalidDocument(invalid_msg)

    # Only an explicit True counts as a successful check.
    if self._check_xml() is not True:
        raise ParserError.InvalidDocument(invalid_msg)
def _get_record(self):
    """Return the single 'mods' element of the XML file.

    The file stream is rewound and its content parsed again; elements
    named 'mods' are then looked up under self._namespace_URI.

    Returns:
        The one matching element.

    Raises:
        ParserError.InvalidDocument -- if the document holds no matching
        element, or more than one.
    """
    # Rewind first: the stream may already have been consumed.
    stream = self._file_stream
    stream.seek(0)
    dom = parseString(stream.read())

    # NOTE(review): 'mods' is searched under self._namespace_URI, not
    # self._mods_namespace_URI -- confirm this is the intended namespace.
    found = dom.getElementsByTagNameNS(self._namespace_URI, 'mods')
    count = len(found)

    if count == 1:
        return found[0]

    if count == 0:
        raise ParserError.InvalidDocument(
            "XML/Mods Core document should contains at lease one record!")

    raise ParserError.InvalidDocument(
        "XML/Mods Core document should not contains more than "
        "one record!")