def get_metadata(self): """Returns metadata from both the info field (older PDFs) and XMP (newer PDFs). Return format is a .modules.metadata.Metadata object """ with PdfMinerWrapper(self.path) as pdf_miner: metadata = Metadata() for i in pdf_miner.document.info: metadata.add(i) if 'Metadata' in pdf_miner.document.catalog: catalog = pdf_miner.document.catalog['Metadata'] xmp_metadata = resolve1(catalog).get_data() xmp_dict = xmp_to_dict(xmp_metadata) # Let's add only the most useful one if "xap" in xmp_dict: metadata.add(xmp_dict["xap"]) if "pdf" in xmp_dict: metadata.add(xmp_dict["pdf"]) if "dc" in xmp_dict: metadata.add(xmp_dict["dc"], metadataType="dc") self.metadata = metadata return metadata
def get_metadata(self): """Returns a .modules.metadata.Metadata object """ self.metadata = Metadata() document = openxmllib.openXmlDocument(path=self.path) self.metadata.add(document.allProperties, "ooxml") return self.metadata
def get_metadata(self): """Returns metadata from both the info field (older PDFs) and XMP (newer PDFs). Return format is a .modules.metadata.Metadata object """ file_pointer = open(self.path, 'rb') parser = PDFParser(file_pointer) doc = PDFDocument() parser.set_document(doc) doc.set_parser(parser) doc.initialize() metadata = Metadata() for i in doc.info: metadata.add(i) if 'Metadata' in doc.catalog: xmp_metadata = resolve1(doc.catalog['Metadata']).get_data() xmp_dict = xmp_to_dict(xmp_metadata) #Let's add only the most useful one if "xap" in xmp_dict: metadata.add(xmp_dict["xap"]) if "pdf" in xmp_dict: metadata.add(xmp_dict["pdf"]) if "dc" in xmp_dict: metadata.add(xmp_dict["dc"], metadataType="dc") file_pointer.close() self.metadata = metadata return metadata
def get_metadata(self): """Returns a .modules.metadata.Metadata object """ command = 'wvSummary ' + self.path metadata = Metadata() for line in run_command(command): parts = line.strip().replace("\"", "").split(" = ") if len(parts) == 2: metadata.add({parts[0]: parts[1]}, "mso") self.metadata = metadata return metadata
def get_metadata(self): """Returns a metadata.Metadata object Using wVSummary """ command = 'wvSummary ' + self.path output = subprocess.check_output(command, shell=True) metadata = Metadata() for line in output.split("\n"): parts = line.strip().replace("\"", "").split(" = ") if len(parts) == 2: metadata.add({parts[0]: parts[1]}, "mso") return metadata
def run():
    # The temporary directory is destroyed on completion
    with tempfile.TemporaryDirectory() as tmp_dir:
        # Create core object instance
        core = Core(tmp_dir)

        # Checks
        file_checks(core.argv.input_file)
        file_checks(core.argv.metadata)
        path_checks(core.argv.output_folder)
        dependencies_checks()

        # Retrieve ISBN
        json_file = os.path.abspath(core.argv.metadata)
        with open(json_file) as json_data:
            isbn = json.load(json_data)['isbn'].replace('-', '')

        # Create object instances
        metadata = Metadata(isbn)
        pdf = Pdf(core.argv.input_file, tmp_dir)

        page_ranges = []
        output_file_names = []

        # Iterate over the chapters' metadata
        for chapter_data in metadata.chapters_data:
            page_ranges.append(chapter_data['page'].split('-'))
            output_file_names.append(chapter_data['DOI'].split('/')[1] + '.pdf')

        # Merge PDFs
        with concurrent.futures.ProcessPoolExecutor() as executor:
            executor.map(pdf.merge_pdfs, page_ranges, output_file_names)

        # Write metadata
        for output_file_name, chapter_data in zip(output_file_names,
                                                  metadata.chapters_data):
            output_file_path = os.path.join(tmp_dir, output_file_name)
            Metadata.write_metadata(chapter_data, output_file_path)

        # PDFs are temporarily stored in tmp_dir
        if core.argv.compress:
            # Output a zip archive
            core.output_archive(metadata.get_doi_suffix())
        else:
            # Output loose PDFs
            core.output_pdfs()

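# A minimal sketch of the executor.map fan-out used above: the two argument
# lists are consumed pairwise, so each worker call gets one (page_range,
# output_name) pair (fake_merge is a stand-in for pdf.merge_pdfs):
import concurrent.futures

def fake_merge(page_range, output_name):
    return "%s <- pages %s" % (output_name, "-".join(page_range))

if __name__ == "__main__":
    page_ranges = [["1", "10"], ["11", "20"]]
    output_names = ["ch1.pdf", "ch2.pdf"]
    with concurrent.futures.ProcessPoolExecutor() as executor:
        for result in executor.map(fake_merge, page_ranges, output_names):
            print(result)
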
def get_metadata(self): """Returns a metadata.Metadata object See http://www.biblioscape.com/rtf15_spec.htm for RTF metadata specification """ import os temp_filename = os.path.join("temp", "tmp.rtf.xml") parse_obj = rtf2xml.ParseRtf.ParseRtf(in_file=self.path, out_file=temp_filename) parse_obj.parse_rtf() metadata = Metadata() import xml.etree.ElementTree as ET tree = ET.parse(temp_filename) root = tree.getroot() section = root.find(".//{%s}doc-information" % self.NS_RTF) if len(section) > 0: for tag in section.iterfind(".//*"): tag_name = self._tag_name(tag) if tag.text is not None: metadata.add({tag_name: tag.text}) elif tag.get("year") is not None and tag.get("year") != "0": date_parts = [] date_parts.append(tag.get("year")) date_parts.append(tag.get("month").zfill(2) or "01") date_parts.append(tag.get("day").zfill(2) or "01") date_str = "-".join(date_parts) metadata.add({tag_name: date_str}) os.unlink(temp_filename) return metadata
def main(message):
    try:
        model, data_prediction, data_train, path, time_total = get_result(
            message)
        hadoop_metadata_path = message["metadata_path"]
        metadata = Metadata(spark, hadoop_metadata_path, PipelineType.FIT,
                            None, data_prediction, data_train, model,
                            message, time_total)
        metadata.process_metadata()
    except Exception as er:
        # On failure, write a metadata record carrying the error message
        traceback.print_exc(file=sys.stdout)
        hadoop_metadata_path = message["metadata_path"]
        metadata = Metadata(spark, hadoop_metadata_path, PipelineType.FIT,
                            str(er))
        metadata.process_metadata()

    spark.stop()

def get_metadata(self): """Returns a metadata.Metadata object """ if self.soup is None: self._load_content() metadata = Metadata() try: metadata.add({"title": self.soup.title.string}) except Exception: pass for meta_tag in self.soup.find_all('meta'): key = meta_tag.get('name', None) or meta_tag.get('property', None) metadata.add({key: meta_tag.get('content')}, metadataType="html") for link_tag in self.soup.find_all('link'): metadata.add({meta_tag.get('rel'): meta_tag.get("href")}, metadataType="html") return metadata
class DocxExtractor(ExtractorBase):
    """Class for getting plain text from an Office Open XML file.
    """

    def get_metadata(self):
        """Returns a .modules.metadata.Metadata object
        """
        self.metadata = Metadata()
        document = openxmllib.openXmlDocument(path=self.path)
        self.metadata.add(document.allProperties, "ooxml")
        return self.metadata

    def get_header(self):
        """Our docx library has no support for headers yet.
        For now, carve out the headers ourselves by parsing the XML files.
        """
        headers = []
        namespaces = dict(
            w="http://schemas.openxmlformats.org/wordprocessingml/2006/main",
            v="urn:schemas-microsoft-com:vml",
            r="http://schemas.openxmlformats.org/officeDocument/2006/relationships"
        )
        import xml.etree.ElementTree as ET
        # A docx file is basically a zip archive with embedded
        # XML and media files.
        import zipfile
        z = zipfile.ZipFile(self.path)

        # Get all header files and parse them for text
        docx_files = z.namelist()
        header_xml_files = filter(lambda x: x.startswith('word/header'),
                                  docx_files)
        for header_xml in header_xml_files:
            # header_name = header_xml.replace('word/', '')
            xml_file = z.open(header_xml, 'r')
            root = ET.fromstring(xml_file.read())
            text_elements = root.findall(".//w:t", namespaces)
            for text_element in text_elements:
                if text_element.text is not None:
                    headers.append(text_element.text)

            # Headers might also include images with text. To get those we
            # find all images in the header => get the ids of those images
            # => find the corresponding image files in the /media directory
            # => OCR.
            """
            # OCR disabled, as PIL on *nix can't handle WMF images
            images = root.findall(".//v:imagedata", namespaces)
            for image in images:
                id_key = '{%s}id' % namespaces['r']
                image_id = image.attrib[id_key]
                relation_file_path = 'word/_rels/%s.rels' % header_name
                relation_file = z.open(relation_file_path, 'r')
                relation_file_root = ET.fromstring(relation_file.read())
                xpath = ".//*[@Id='%s']" % image_id
                image_path = 'word/%s' % relation_file_root.find(
                    xpath, namespaces).attrib['Target']

                # OCR
                import pytesseract
                from PIL import Image
                import cStringIO

                img = Image.open(cStringIO.StringIO(z.open(image_path).read()))
                string_from_image = pytesseract.image_to_string(img, lang='swe')
                headers.append(string_from_image.decode('utf-8'))
            """

        return "\n".join(headers)

    def get_next_page(self):
        """Returns all the text in one single page.

        We might be able to use e.g. Abiword to calculate
        approximate page breaks.
        """
        page = DocxPage()
        page._text = self.get_text()
        yield page

    def get_text(self):
        """Returns all text content from the document as plain text.
        """
        try:
            return self._text_cache
        except AttributeError:  # not cached
            pass

        paratextlist = []
        document = Document(self.path)
        for paragraph in document.paragraphs:
            paratextlist.append(paragraph.text)

        self._text_cache = "\n".join(paratextlist)
        return self._text_cache
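
# The header carving above relies on docx being a plain zip archive; a
# standalone sketch that lists the XML parts inside one ("sample.docx" is
# a placeholder path):
import zipfile

with zipfile.ZipFile("sample.docx") as z:
    for name in z.namelist():
        if name.endswith(".xml"):
            print(name)  # e.g. word/document.xml, word/header1.xml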