def __init__(self, filename):
    """
    Constructor

    :param filename: archive filename
    :type filename: str or unicode
    """
    self.filename = filename
    if ".zip" in self.filename:
        # Zip archive: take every entry name from the archive listing.
        self.zip = zipfile.ZipFile(open_stream(self.filename))
        self.filenames = [info.filename for info in self.zip.infolist()]
    else:
        # Plain directory: collect files, descending one level into
        # subdirectories ("subdir/child" entries).
        names = []
        for entry in listdir(self.filename):
            if isfile(join(self.filename, entry)):
                names.append(entry)
            else:
                names.extend(entry + "/" + child
                             for child in listdir(join(self.filename, entry)))
        self.filenames = names
    doc_re = re.compile(self.get_document_pattern())
    page_re = re.compile(self.get_page_pattern())
    doc_matches = list(filter(None, (doc_re.match(name)
                                     for name in self.filenames)))
    page_matches = list(filter(None, (page_re.match(name)
                                      for name in self.filenames)))
    self.document_codes = {m.group(1): [] for m in doc_matches}
    # NOTE(review): every page is filed under the *first* document code,
    # and the whole matched name (group(0)) is stored — confirm this is
    # intentional; the sibling zip-archive constructor groups pages by
    # per-match group(1) and stores group(2) instead.
    first_document = list(self.document_codes)[0]
    for m in page_matches:
        self.document_codes[first_document].append(m.group(0))
def __init__(self, filename):
    """
    Constructor.

    :param filename: XML filename
    :type filename: str or unicode
    """
    self.filename = filename
    self.filesize = os.path.getsize(filename)
    stream = open_stream(self.filename)
    self.document_tree = None
    # Strict parser: invalid XML raises from etree.parse below.
    self.document_tree = etree.parse(stream, etree.XMLParser())
    self.root_element = self.document_tree.getroot()
    self.root_element_tag = str(self.root_element.tag)
    self.doc_type = str(self.document_tree.docinfo.doctype)
    self.namespaces = self.root_element.nsmap
    # xsi:schemaLocation is a space-separated list of namespace/URL pairs.
    locations = self.root_element.get(SCHEMA_LOCATION.text)
    self.schema_locations = [] if locations is None else locations.split(" ")
    self.no_ns_schema_location = self.root_element.get(
        NO_NS_SCHEMA_LOCATION.text)
def __init__(self, filename):
    """
    Constructor. If the filename cannot be parsed into valid XML an empty
    document is created.

    :param filename: XML filename
    :type filename: str or unicode
    :raises Exception: if the parsed document contains no 'issue' element
    """
    self.filename = filename
    stream = open_stream(self.filename)
    # Defaults, overwritten below once the document is parsed.
    self.issue_tree = None
    self.issue = ''
    self.newspaper_id = ''
    self.articles = []
    self.date = datetime.now()
    self.page_count = 0
    self.day_of_week = ''
    self.document_type = "newspaper"
    self.model = "papers"
    # Attempt to parse the file, even if its XML is invalid e.g:
    # <wd ...>.../wd>
    parser = etree.XMLParser(recover=True)
    self.issue_tree = etree.parse(stream, parser)
    # NOTE(review): '..//issue' searches from the parent of the context
    # node; confirm this is deliberate vs. the './/issue' used elsewhere.
    has_issue = len(self.query("..//issue")) > 0
    if not has_issue:
        raise Exception("Missing 'issue' element")
    self.issue = self.single_query('.//issue')
    # bl_ncnp_issue_apex.dtd, GALENP.dtd, nccoissue.dtd
    newspaper_id = self.single_query('//issue/id/text()')
    if newspaper_id is None:
        # LTO_issue.md
        newspaper_id = self.single_query(
            '//issue/metadatainfo/PSMID/text()')
    if newspaper_id is not None:
        self.newspaper_id = newspaper_id
    self.articles = [
        Article(article, self.filename)
        for article in self.query('.//article')
    ]
    # bl_ncnp_issue_apex.dtd, GALENP.dtd, LTO_issue.dtd
    raw_date = self.single_query('//pf/text()')
    if raw_date is None:
        # nccoissue.dtd
        raw_date = self.single_query('//da/searchableDateStart/text()')
    if raw_date:
        self.date = datetime.strptime(raw_date, '%Y%m%d')
    else:
        self.date = None
    # The '//ip' query may return None (TypeError) or a non-numeric
    # string (ValueError); keep the default page_count of 0 in either
    # case. The original caught bare Exception, which also hid
    # unrelated programming errors.
    try:
        self.page_count = int(self.single_query('//ip/text()'))
    except (TypeError, ValueError):
        pass
def open_document(self, document_code):
    """
    Opens metadata file.

    :param document_code: document file code
    :type document_code: str or unicode
    :return: stream
    """
    # The METS metadata entry is named after the document code.
    metadata_name = document_code + '-mets.xml'
    if ".zip" not in self.filename:
        return open_stream(self.filename + "/" + metadata_name)
    return self.zip.open(metadata_name)
def open_page(self, document_code, page_code):
    """
    Opens page file.

    :param document_code: page file code
    :type document_code: str or unicode
    :param page_code: file code
    :type page_code: str or unicode
    :return: stream
    """
    # page_code is already the full entry name; document_code is unused
    # here but kept for interface parity with sibling classes.
    if ".zip" not in self.filename:
        return open_stream(self.filename + "/" + page_code)
    return self.zip.open(page_code)
def open_page(self, document_code, page_code):
    """
    Opens page file.

    :param document_code: page file code
    :type document_code: str or unicode
    :param page_code: file code
    :type page_code: str or unicode
    :return: stream
    :rtype: zipfile.ZipExt
    """
    # Page entries are named "<document>_<page>.xml".
    page_name = document_code + '_' + page_code + '.xml'
    if ".zip" not in self.filename:
        return open_stream(self.filename + "/" + page_name)
    return self.zip.open(page_name)
def __init__(self, filename):
    """
    Constructor.

    :param filename: XML filename
    :type filename: str or unicode
    """
    self.filename = filename
    stream = open_stream(self.filename)
    # Recovering parser: tolerate malformed XML rather than failing.
    recovering_parser = etree.XMLParser(recover=True)
    self.xml_tree = etree.parse(stream, recovering_parser)
    self.articles = [
        Article(result, self.filename)
        for result in self.query('.//result')
    ]
    self.document_type = "newspaper"
    self.model = "nzpp"
def __init__(self, filename):
    """
    Constructor

    :param filename: archive filename
    :type filename: str or unicode
    """
    self.filename = filename
    self.zip = zipfile.ZipFile(open_stream(self.filename))
    self.filenames = [info.filename for info in self.zip.infolist()]
    doc_re = re.compile(self.get_document_pattern())
    page_re = re.compile(self.get_page_pattern())
    # One empty page list per document code found in the archive.
    self.document_codes = {}
    for name in self.filenames:
        m = doc_re.match(name)
        if m:
            self.document_codes[m.group(1)] = []
    # File each page (group 2) under its document code (group 1).
    for name in self.filenames:
        m = page_re.match(name)
        if m:
            self.document_codes[m.group(1)].append(m.group(2))
def __init__(self, filename) -> None:
    """Parse *filename* as XML and extract its headings via parse_tree."""
    self.filename = filename
    source = open_stream(self.filename)
    document = etree.parse(source)
    self.headings = parse_tree(document)
def __init__(self, filename):
    """
    Constructor: extract the text layer of the given file with pdfminer.

    :param filename: filename of the document to extract text from
    :type filename: str or unicode
    """
    self.filename = filename
    text_buffer = StringIO()
    extract_text_to_fp(open_stream(filename), text_buffer,
                       laparams=LAParams())
    # NOTE(review): raw_text holds the StringIO buffer itself, not its
    # string contents — callers likely need .getvalue(); confirm intent.
    self.raw_text = text_buffer