예제 #1
0
    def __init__(self, filename):
        """
        Constructor.

        Collects the names of the files held in ``filename`` and indexes
        them in ``self.document_codes``. If ``filename`` contains ".zip"
        it is opened as a zip archive (kept in ``self.zip``); otherwise it
        is treated as a directory and scanned one level deep, with nested
        files recorded as "<subdirectory>/<file>".

        Every name matching the page pattern is attached to the first
        document code found. When no name matches the document pattern,
        ``self.document_codes`` is left empty instead of raising
        ``IndexError``.

        :param filename: archive filename or directory path
        :type filename: str or unicode
        """
        self.filename = filename
        if ".zip" in self.filename:
            stream = open_stream(self.filename)
            self.zip = zipfile.ZipFile(stream)
            self.filenames = [entry.filename for entry in self.zip.infolist()]
        else:
            self.filenames = []
            for entry in listdir(self.filename):
                entry_path = join(self.filename, entry)
                if isfile(entry_path):
                    self.filenames.append(entry)
                else:
                    # Anything that is not a regular file is listed as a
                    # directory; names use "/" so they look the same as
                    # the names read from a zip archive.
                    self.filenames.extend(
                        entry + "/" + nested for nested in listdir(entry_path))

        document_pattern = re.compile(self.get_document_pattern())
        page_pattern = re.compile(self.get_page_pattern())
        document_matches = [
            match for match in map(document_pattern.match, self.filenames)
            if match]
        page_matches = [
            match for match in map(page_pattern.match, self.filenames)
            if match]

        self.document_codes = {match.group(1): [] for match in document_matches}
        # Guard against archives without any document entry: indexing the
        # first key of an empty dict used to abort construction.
        if self.document_codes:
            document_name = next(iter(self.document_codes))
            self.document_codes[document_name].extend(
                match.group(0) for match in page_matches)
예제 #2
0
    def __init__(self, filename):
        """
        Constructor.

        Parses the XML file and records its size, root element, root tag,
        doctype, namespace map and schema-location attributes.

        :param filename: XML filename
        :type filename: str or unicode
        """
        self.filename = filename
        self.filesize = os.path.getsize(filename)

        stream = open_stream(self.filename)
        self.document_tree = None
        parser = etree.XMLParser()
        self.document_tree = etree.parse(stream, parser)
        self.root_element = self.document_tree.getroot()
        self.root_element_tag = str(self.root_element.tag)
        self.doc_type = str(self.document_tree.docinfo.doctype)
        self.namespaces = self.root_element.nsmap

        schema_locations = self.root_element.get(SCHEMA_LOCATION.text)
        if schema_locations is None:
            self.schema_locations = []
        else:
            # The attribute is a whitespace-separated list, so split on
            # any run of whitespace (spaces, tabs, newlines) rather than
            # on single spaces, which would produce empty or merged items.
            self.schema_locations = schema_locations.split()

        self.no_ns_schema_location = self.root_element.get(
            NO_NS_SCHEMA_LOCATION.text)
예제 #3
0
    def __init__(self, filename):
        """
        Constructor.

        Parses the file with a recovering XML parser, so malformed XML
        does not by itself abort construction, then reads the newspaper
        id, the articles, the publication date and the page count from
        the parsed tree.

        :param filename: XML filename
        :type filename: str or unicode
        :raises Exception: if the parsed document has no 'issue' element
        :raises ValueError: if a date is present but is not in
            ``%Y%m%d`` format
        """
        self.filename = filename
        stream = open_stream(self.filename)

        # Defaults, overwritten below whenever the document supplies a
        # value.
        self.issue_tree = None
        self.issue = ''
        self.newspaper_id = ''
        self.articles = []
        self.date = None
        self.page_count = 0
        self.day_of_week = ''
        self.document_type = "newspaper"
        self.model = "papers"

        # Attempt to parse the file, even if its XML is invalid e.g:
        # <wd ...>.../wd>
        parser = etree.XMLParser(recover=True)
        self.issue_tree = etree.parse(stream, parser)

        if not self.query("..//issue"):
            raise Exception("Missing 'issue' element")

        self.issue = self.single_query('.//issue')

        # bl_ncnp_issue_apex.dtd, GALENP.dtd, nccoissue.dtd
        newspaper_id = self.single_query('//issue/id/text()')
        if newspaper_id is None:
            # LTO_issue.dtd
            newspaper_id = self.single_query(
                '//issue/metadatainfo/PSMID/text()')
        if newspaper_id is not None:
            self.newspaper_id = newspaper_id

        self.articles = [
            Article(article, self.filename)
            for article in self.query('.//article')
        ]

        # bl_ncnp_issue_apex.dtd, GALENP.dtd, LTO_issue.dtd
        raw_date = self.single_query('//pf/text()')
        if raw_date is None:
            # nccoissue.dtd
            raw_date = self.single_query('//da/searchableDateStart/text()')
        if raw_date:
            self.date = datetime.strptime(raw_date, '%Y%m%d')

        # The page count is best-effort: a missing or non-numeric value
        # keeps the default of 0. Only the conversion errors are ignored,
        # so unrelated failures are no longer hidden.
        raw_page_count = self.single_query('//ip/text()')
        try:
            self.page_count = int(raw_page_count)
        except (TypeError, ValueError):
            pass
예제 #4
0
 def open_document(self, document_code):
     """
     Open the metadata file of a document.

     The file is named ``<document_code>-mets.xml``. It is read from
     ``self.zip`` when ``self.filename`` contains ".zip", and from
     below the ``self.filename`` path otherwise.

     :param document_code: document file code
     :type document_code: str or unicode
     :return: stream
     """
     mets_name = document_code + '-mets.xml'
     if ".zip" not in self.filename:
         # Not an archive: the metadata file lives under the path.
         return open_stream(self.filename + "/" + mets_name)
     return self.zip.open(mets_name)
예제 #5
0
 def open_page(self, document_code, page_code):
     """
     Open a page file.

     ``page_code`` is used as the complete entry name. It is read from
     ``self.zip`` when ``self.filename`` contains ".zip", and from
     below the ``self.filename`` path otherwise. ``document_code`` is
     accepted but not used by this implementation.

     :param document_code: document file code (unused here)
     :type document_code: str or unicode
     :param page_code: page file code
     :type page_code: str or unicode
     :return: stream
     """
     if ".zip" not in self.filename:
         # Not an archive: the page file lives under the path.
         return open_stream(self.filename + "/" + page_code)
     return self.zip.open(page_code)
예제 #6
0
    def open_page(self, document_code, page_code):
        """
        Open a page file.

        The file is named ``<document_code>_<page_code>.xml``. It is read
        from ``self.zip`` when ``self.filename`` contains ".zip", and
        from below the ``self.filename`` path otherwise.

        :param document_code: document file code
        :type document_code: str or unicode
        :param page_code: page file code
        :type page_code: str or unicode
        :return: stream
        """
        page_name = document_code + '_' + page_code + '.xml'
        if ".zip" not in self.filename:
            # Not an archive: the page file lives under the path.
            return open_stream(self.filename + "/" + page_name)
        return self.zip.open(page_name)
예제 #7
0
    def __init__(self, filename):
        """
        Constructor.

        Parses the file with a parser created with ``recover=True`` and
        wraps every node returned by the ``.//result`` query in an
        ``Article``.

        :param filename: XML filename
        :type filename: str or unicode
        """
        self.filename = filename
        self.xml_tree = etree.parse(
            open_stream(self.filename),
            etree.XMLParser(recover=True),
        )

        # One Article per 'result' node, in query order.
        collected = []
        for result_node in self.query('.//result'):
            collected.append(Article(result_node, self.filename))
        self.articles = collected

        self.document_type = "newspaper"
        self.model = "nzpp"
예제 #8
0
    def __init__(self, filename):
        """
        Constructor.

        Opens ``filename`` as a zip archive and groups its page entries by
        document code in ``self.document_codes``: group 1 of the document
        pattern gives the keys, and each page match appends its group 2
        to the list stored under its group 1. A page whose group 1 has
        no matching document entry raises ``KeyError``.

        :param filename: archive filename
        :type filename: str or unicode
        """
        self.filename = filename
        self.zip = zipfile.ZipFile(open_stream(self.filename))

        entry_names = []
        for info in self.zip.infolist():
            entry_names.append(info.filename)
        self.filenames = entry_names

        document_regex = re.compile(self.get_document_pattern())
        page_regex = re.compile(self.get_page_pattern())
        # filter(None, ...) drops the names the pattern did not match.
        document_hits = list(filter(None, map(document_regex.match, entry_names)))
        page_hits = list(filter(None, map(page_regex.match, entry_names)))

        pages_by_document = {}
        for hit in document_hits:
            pages_by_document[hit.group(1)] = []
        self.document_codes = pages_by_document

        for hit in page_hits:
            pages_by_document[hit.group(1)].append(hit.group(2))
예제 #9
0
 def __init__(self, filename) -> None:
     """
     Constructor.

     Parses the file at ``filename`` and stores the result of
     ``parse_tree`` on the parsed tree in ``self.headings``.

     :param filename: name of the file to parse
     """
     self.filename = filename
     parsed = etree.parse(open_stream(self.filename))
     self.headings = parse_tree(parsed)
예제 #10
0
 def __init__(self, filename):
     """
     Constructor.

     Runs ``extract_text_to_fp`` on the stream opened for ``filename``,
     writing the extracted text into an in-memory buffer.

     :param filename: name of the file to extract text from
     """
     self.filename = filename
     source_stream = open_stream(filename)
     # NOTE(review): raw_text is the StringIO buffer itself, not a str;
     # readers presumably need raw_text.getvalue() - confirm with callers.
     self.raw_text = StringIO()
     extract_text_to_fp(source_stream, self.raw_text, laparams=LAParams())