def __str__(self):
    """Serialize the config back to text.

    Emits the cleaned header (blank lines and per-line whitespace removed),
    then every known default as a ``key = value`` line preceded by its
    comment block, then any keys present in the data but not in the defaults.
    """
    # NOTE(review): self.sorted_defaults is assumed to yield
    # (key, {"comment": ...}) pairs — confirm against the class definition.
    parts = [
        "\n".join(line.strip() for line in self.header.splitlines() if line.strip()),
        "\n\n",
    ]
    written_keys = []
    for key, value in self.sorted_defaults:
        if value["comment"]:
            # Re-emit the comment block with surrounding whitespace stripped.
            parts.append(
                "\n" + "\n".join(
                    line.strip() for line in value["comment"].splitlines() if line.strip()
                )
            )
        parts.append("\n%s = %s\n" % (key, pretty_print(self.data[key])))
        written_keys.append(key)
    # Keys set in the data but absent from the defaults go at the end.
    for key in self.data:
        if key not in written_keys:
            parts.append("\n%s = %s\n" % (key, pretty_print(self.data[key])))
            written_keys.append(key)
    return "".join(parts)
def parse_dc_header(self):
    """Parse Dublin Core metadata from each file's header.

    Scans each file up to the end of its <teiheader>/<temphead>/<head>
    section, extracts DC ``<meta name="DC.x" content="y">`` pairs (falling
    back to ``<dc:x>y>`` style tags), and returns a list of per-file
    metadata dicts, each including a "filename" key.
    """
    load_metadata = []
    for filename in self.list_files():
        data = {}
        # os.path.join is robust whether or not textdir has a trailing slash.
        fn = os.path.join(self.textdir, filename)
        header = ""
        with open(fn) as fh:
            for line in fh:
                start_scan = re.search(r"<teiheader>|<temphead>|<head>", line, re.IGNORECASE)
                end_scan = re.search(r"</teiheader>|<\/?temphead>|</head>", line, re.IGNORECASE)
                if start_scan:
                    header += line[start_scan.start():]
                elif end_scan:
                    header += line[:end_scan.end()]
                    break
                else:
                    header += line
        matches = re.findall(r'<meta name="DC\.([^"]+)" content="([^"]+)"', header)
        if not matches:
            # Fall back to <dc:...> style tags.
            matches = re.findall(r"<dc:([^>]+)>([^>]+)>", header)
        for metadata_name, metadata_value in matches:
            # BUG FIX: the old .decode("utf-8")/.encode("utf-8") round-trip
            # raised AttributeError under Python 3, where re.findall on a
            # str already yields str; convert entities directly instead.
            metadata_value = convert_entities(metadata_value)
            metadata_name = metadata_name.lower()
            data[metadata_name] = metadata_value
        data["filename"] = filename  # place at the end in case the value was in the header
        data = self.create_year_field(data)
        if self.debug:
            print(pretty_print(data))
        load_metadata.append(data)
    return load_metadata
def parse_tei_header(self):
    """Parse the TEI header of every file in self.textdir.

    Returns a list of per-file metadata dicts (filename, doc-level fields
    resolved via the configured xpaths, plus an "options" entry carrying
    the sub-document xpaths). Files that cannot be decoded or that lack a
    valid TEI header are reported and skipped.
    """
    load_metadata = []
    metadata_xpaths = self.parser_config["doc_xpaths"]
    deleted_files = []
    for file in os.scandir(self.textdir):
        data = {"filename": file.name}
        header = ""
        with open(file.path) as text_file:
            try:
                file_content = "".join(text_file.readlines())
            except UnicodeDecodeError:  # not valid text in the default encoding
                deleted_files.append(file.name)
                continue
        try:
            start_header_index = re.search(r'<teiheader', file_content, re.I).start()
            end_header_index = re.search(r'</teiheader', file_content, re.I).start()
        except AttributeError:  # tag not found
            deleted_files.append(file.name)
            continue
        header = file_content[start_header_index:end_header_index]
        header = convert_entities(header)
        if self.debug:
            print("parsing %s header..." % file.name)
        # recover=True lets lxml salvage what it can from malformed headers.
        parser = etree.XMLParser(recover=True)
        try:
            tree = etree.fromstring(header, parser)
            for field in metadata_xpaths:
                for xpath in metadata_xpaths[field]:
                    attr_pattern_match = re.search(r"@([^\/\[\]]+)$", xpath)
                    if attr_pattern_match:
                        # xpath ends in an attribute: find matching elements,
                        # then read the attribute off the first non-empty one.
                        xp_prefix = xpath[:attr_pattern_match.start(0)]
                        attr_name = attr_pattern_match.group(1)
                        elements = tree.findall(xp_prefix)
                        for el in elements:
                            if el is not None and el.get(attr_name, ""):
                                data[field] = el.get(attr_name, "")
                                break
                    else:
                        # Plain element xpath: use its text content.
                        el = tree.find(xpath)
                        if el is not None and el.text is not None:
                            data[field] = el.text
                            break
            trimmed_metadata_xpaths = [
                (metadata_type, xpath, field)
                for metadata_type in ["div", "para", "sent", "word", "page"]
                if metadata_type in metadata_xpaths
                for field in metadata_xpaths[metadata_type]
                for xpath in metadata_xpaths[metadata_type][field]
            ]
            data = self.create_year_field(data)
            if self.debug:
                print(pretty_print(data))
            data["options"] = {"metadata_xpaths": trimmed_metadata_xpaths}
            load_metadata.append(data)
        except etree.XMLSyntaxError:
            # BUG FIX: this previously did `deleted_files.append(f)`, but `f`
            # is undefined at this point and raised NameError; record the
            # current file instead.
            deleted_files.append(file.name)
    if deleted_files:
        for f in deleted_files:
            print("%s has no valid TEI header or contains invalid data: removing from database load..." % f)
    return load_metadata
def parse_dc_header(self):
    """Parse Dublin Core header"""
    load_metadata = []
    # Patterns are loop-invariant: compile them once up front.
    start_pattern = re.compile(r"<teiheader>|<temphead>|<head>", re.IGNORECASE)
    end_pattern = re.compile(r"</teiheader>|<\/?temphead>|</head>", re.IGNORECASE)
    for entry in os.scandir(self.textdir):
        header_parts = []
        with open(entry.path) as infile:
            for line in infile:
                opening = start_pattern.search(line)
                closing = end_pattern.search(line)
                if opening:
                    header_parts.append(line[opening.start():])
                elif closing:
                    header_parts.append(line[:closing.end()])
                    break
                else:
                    header_parts.append(line)
        header = "".join(header_parts)
        matches = re.findall(r'<meta name="DC\.([^"]+)" content="([^"]+)"', header)
        if not matches:
            matches = re.findall(r"<dc:([^>]+)>([^>]+)>", header)
        # Later duplicates of a name overwrite earlier ones, exactly as a
        # plain assignment loop would.
        data = {name.lower(): convert_entities(content) for name, content in matches}
        data["filename"] = entry.name  # place at the end in case the value was in the header
        data = self.create_year_field(data)
        if self.debug:
            print(pretty_print(data))
        load_metadata.append(data)
    return load_metadata
def __str__(self):
    """Render the config as text: cleaned header, commented defaults, then extras."""
    cleaned_header = "\n".join(
        line.strip() for line in self.header.splitlines() if line.strip()
    )
    pieces = [cleaned_header, "\n\n"]
    seen = []
    for key, value in self.defaults.items():
        comment = value["comment"]
        if comment:
            comment_lines = [line.strip() for line in comment.splitlines() if line.strip()]
            pieces.append("\n" + "\n".join(comment_lines))
        pieces.append("\n%s = %s\n" % (key, pretty_print(self.data[key])))
        seen.append(key)
    # Keys in the data that the defaults did not cover go last.
    for key in self.data:
        if key not in seen:
            pieces.append("\n%s = %s\n" % (key, pretty_print(self.data[key])))
            seen.append(key)
    return "".join(pieces)
def parse_tei_header(self):
    """Parse header in TEI files.

    Walks every file in self.textdir, extracts the <teiheader> section,
    resolves document-level metadata fields via the configured xpaths, and
    returns a list of per-file metadata dicts. Undecodable files and files
    without a valid header are collected in self.deleted_files and reported
    at the end. Prints an in-place progress line per file.
    """
    load_metadata = []
    metadata_xpaths = self.parser_config["doc_xpaths"]
    # Total count is only used for the progress display below.
    doc_count = len(os.listdir(self.textdir))
    for pos, file in enumerate(os.scandir(self.textdir)):
        data = {"filename": file.name}
        header = ""
        with open(file.path) as text_file:
            try:
                file_content = "".join(text_file.readlines())
            except UnicodeDecodeError:
                # Not decodable as text: drop the file from the load.
                self.deleted_files.append(file.name)
                continue
        try:
            start_header_index = re.search(r"<teiheader", file_content, re.I).start()
            end_header_index = re.search(r"</teiheader", file_content, re.I).start()
        except AttributeError:  # tag not found
            self.deleted_files.append(file.name)
            continue
        header = file_content[start_header_index:end_header_index]
        header = convert_entities(header)
        if self.debug:
            print("parsing %s header..." % file.name)
        # recover=True lets lxml salvage as much as possible from a
        # malformed header instead of failing outright.
        parser = lxml.etree.XMLParser(recover=True)
        try:
            tree = lxml.etree.fromstring(header, parser)
            trimmed_metadata_xpaths = []
            for field in metadata_xpaths:
                # Try each candidate xpath in order; the first non-empty
                # value wins (the for/else/break below stops the xpath loop).
                for xpath in metadata_xpaths[field]:
                    xpath = xpath.rstrip(
                        "/"
                    )  # make sure there are no trailing slashes which make lxml die
                    try:
                        elements = tree.xpath(xpath)
                    except lxml.etree.XPathEvalError:
                        # Skip xpaths that lxml cannot evaluate.
                        continue
                    for element in elements:
                        if element is not None:
                            value = ""
                            # Element node: take its text; attribute/string
                            # result: take its string value.
                            if isinstance(element, lxml.etree._Element
                                          ) and element.text is not None:
                                value = element.text.strip()
                            elif isinstance(
                                    element, lxml.etree._ElementUnicodeResult):
                                value = str(element).strip()
                            if value:
                                data[field] = value
                                break
                    else:  # only continue looping over xpaths if no break in inner loop
                        continue
                    break
            # Sub-document xpaths (div/para/sent/word/page) are passed along
            # for the later parsing stage.
            trimmed_metadata_xpaths = [
                (metadata_type, xpath, field)
                for metadata_type in ["div", "para", "sent", "word", "page"]
                if metadata_type in metadata_xpaths
                for field in metadata_xpaths[metadata_type]
                for xpath in metadata_xpaths[metadata_type][field]
            ]
            data = self.create_year_field(data)
            if self.debug:
                print(pretty_print(data))
            data["options"] = {"metadata_xpaths": trimmed_metadata_xpaths}
            load_metadata.append(data)
        except lxml.etree.XMLSyntaxError:
            self.deleted_files.append(file.name)
        # Progress line, overwritten in place via \r.
        print(
            f"\r{time.ctime()}: Parsing document level metadata: {pos+1}/{doc_count} done...",
            flush=True, end="")
    if self.deleted_files:
        for f in self.deleted_files:
            print(
                "%s has no valid TEI header or contains invalid data: removing from database load..." % f)
    return load_metadata
def __str__(self):
    """Pretty-printed view of the parsed loader configuration values."""
    current_values = self.values
    return pretty_print(current_values)
def parse_tei_header(self):
    """Parse header in TEI files.

    For every file in self.textdir, extracts the <teiheader> section and
    resolves document-level metadata via the configured xpaths (either an
    attribute read or an element's text). Returns a list of per-file
    metadata dicts; files that cannot be decoded or parsed are collected in
    self.deleted_files and reported before returning.
    """
    load_metadata = []
    metadata_xpaths = self.parser_config["doc_xpaths"]
    self.deleted_files = []  # reset the skip list for this run
    for file in os.scandir(self.textdir):
        data = {"filename": file.name}
        header = ""
        with open(file.path) as text_file:
            try:
                file_content = "".join(text_file.readlines())
            except UnicodeDecodeError:
                # Not decodable as text: drop the file from the load.
                self.deleted_files.append(file.name)
                continue
        try:
            start_header_index = re.search(r"<teiheader", file_content, re.I).start()
            end_header_index = re.search(r"</teiheader", file_content, re.I).start()
        except AttributeError:  # tag not found
            self.deleted_files.append(file.name)
            continue
        header = file_content[start_header_index:end_header_index]
        header = convert_entities(header)
        if self.debug:
            print("parsing %s header..." % file.name)
        # recover=True lets the parser salvage malformed header XML.
        parser = etree.XMLParser(recover=True)
        try:
            tree = etree.fromstring(header, parser)
            trimmed_metadata_xpaths = []
            for field in metadata_xpaths:
                for xpath in metadata_xpaths[field]:
                    # An xpath ending in @attr means: find the elements, then
                    # read that attribute off the first one that has it.
                    attr_pattern_match = re.search(r"@([^\/\[\]]+)$", xpath)
                    if attr_pattern_match:
                        xp_prefix = xpath[:attr_pattern_match.start(0)]
                        attr_name = attr_pattern_match.group(1)
                        elements = tree.findall(xp_prefix)
                        for el in elements:
                            if el is not None and el.get(attr_name, ""):
                                data[field] = el.get(attr_name, "")
                                # NOTE(review): this break only exits the
                                # element loop — later xpaths for the same
                                # field may still overwrite data[field],
                                # unlike the element-text branch below,
                                # which stops the xpath loop. Confirm this
                                # asymmetry is intended.
                                break
                    else:
                        el = tree.find(xpath)
                        if el is not None and el.text is not None:
                            data[field] = el.text
                            break
            # Sub-document xpaths (div/para/sent/word/page) are handed to the
            # later parsing stage via data["options"].
            trimmed_metadata_xpaths = [
                (metadata_type, xpath, field)
                for metadata_type in ["div", "para", "sent", "word", "page"]
                if metadata_type in metadata_xpaths
                for field in metadata_xpaths[metadata_type]
                for xpath in metadata_xpaths[metadata_type][field]
            ]
            data = self.create_year_field(data)
            if self.debug:
                print(pretty_print(data))
            data["options"] = {"metadata_xpaths": trimmed_metadata_xpaths}
            load_metadata.append(data)
        except etree.XMLSyntaxError:
            self.deleted_files.append(file.name)
    if self.deleted_files:
        for f in self.deleted_files:
            print("%s has no valid TEI header or contains invalid data: removing from database load..."
                  % f)
    return load_metadata
def parse_tei_header(self):
    """Parse header in TEI files"""
    load_metadata = []
    metadata_xpaths = self.parser_config["doc_xpaths"]
    # NOTE(review): self.deleted_files is assumed to be initialized
    # elsewhere before this method runs — confirm against __init__.
    for dir_entry in os.scandir(self.textdir):
        data = {"filename": dir_entry.name}
        with open(dir_entry.path) as text_file:
            try:
                file_content = "".join(text_file.readlines())
            except UnicodeDecodeError:
                self.deleted_files.append(dir_entry.name)
                continue
        try:
            start_header_index = re.search(r"<teiheader", file_content, re.I).start()
            end_header_index = re.search(r"</teiheader", file_content, re.I).start()
        except AttributeError:  # tag not found
            self.deleted_files.append(dir_entry.name)
            continue
        header = convert_entities(file_content[start_header_index:end_header_index])
        if self.debug:
            print("parsing %s header..." % dir_entry.name)
        parser = lxml.etree.XMLParser(recover=True)
        try:
            tree = lxml.etree.fromstring(header, parser)
            for field, candidate_xpaths in metadata_xpaths.items():
                # First non-empty value among the candidate xpaths wins.
                matched = False
                for candidate in candidate_xpaths:
                    # Trailing slashes make lxml's xpath evaluation die.
                    query = candidate.rstrip("/")
                    try:
                        elements = tree.xpath(query)
                    except lxml.etree.XPathEvalError:
                        continue
                    for element in elements:
                        if element is None:
                            continue
                        value = ""
                        if isinstance(element, lxml.etree._Element) and element.text is not None:
                            value = element.text.strip()
                        elif isinstance(element, lxml.etree._ElementUnicodeResult):
                            value = str(element).strip()
                        if value:
                            data[field] = value
                            matched = True
                            break
                    if matched:
                        break
            trimmed_metadata_xpaths = [
                (metadata_type, xpath, field)
                for metadata_type in ["div", "para", "sent", "word", "page"]
                if metadata_type in metadata_xpaths
                for field in metadata_xpaths[metadata_type]
                for xpath in metadata_xpaths[metadata_type][field]
            ]
            data = self.create_year_field(data)
            if self.debug:
                print(pretty_print(data))
            data["options"] = {"metadata_xpaths": trimmed_metadata_xpaths}
            load_metadata.append(data)
        except lxml.etree.XMLSyntaxError:
            self.deleted_files.append(dir_entry.name)
    if self.deleted_files:
        for f in self.deleted_files:
            print("%s has no valid TEI header or contains invalid data: removing from database load..." % f)
    return load_metadata