Пример #1
0
    def get_metadata(self):
        """Collect this PDF's metadata into a Metadata object.

           Entries are taken from the document info field (older PDFs)
           and, when the catalog has a 'Metadata' entry, from the XMP
           packet (newer PDFs). Of the XMP sections only "xap", "pdf"
           and "dc" are copied.

           The result is stored on self.metadata and returned as a
           .modules.metadata.Metadata object.
        """
        with PdfMinerWrapper(self.path) as pdf_miner:
            collected = Metadata()
            document = pdf_miner.document

            for info_entry in document.info:
                collected.add(info_entry)

            if 'Metadata' in document.catalog:
                raw_xmp = resolve1(document.catalog['Metadata']).get_data()
                xmp_sections = xmp_to_dict(raw_xmp)
                # Only the most useful sections are kept; "dc" is the
                # one section added with an explicit metadata type.
                wanted = (("xap", {}),
                          ("pdf", {}),
                          ("dc", {"metadataType": "dc"}))
                for section, add_kwargs in wanted:
                    if section in xmp_sections:
                        collected.add(xmp_sections[section], **add_kwargs)

            self.metadata = collected
            return collected
Пример #2
0
 def get_metadata(self):
     """Build, store and return the document's metadata.

     All properties reported by openxmllib for the file at self.path
     are added under the "ooxml" metadata type. The resulting
     .modules.metadata.Metadata object is kept on self.metadata.
     """
     collected = self.metadata = Metadata()
     properties = openxmllib.openXmlDocument(path=self.path).allProperties
     collected.add(properties, "ooxml")
     return collected
Пример #3
0
    def get_metadata(self):
        """Gather metadata from the info field and from XMP.

           The info field (older PDFs) is always read; the XMP packet
           (newer PDFs) is read only when the catalog contains a
           'Metadata' entry, and only its "xap", "pdf" and "dc"
           sections are used.

           Stores the .modules.metadata.Metadata object on
           self.metadata and returns it.
        """
        with PdfMinerWrapper(self.path) as pdf_miner:
            result = Metadata()
            pdf_document = pdf_miner.document

            for entry in pdf_document.info:
                result.add(entry)

            if 'Metadata' in pdf_document.catalog:
                xmp_stream = resolve1(pdf_document.catalog['Metadata'])
                xmp_dict = xmp_to_dict(xmp_stream.get_data())
                # Keep only the most useful sections. "dc" is passed
                # with its own metadata type, the others with none.
                for name in ("xap", "pdf"):
                    if name in xmp_dict:
                        result.add(xmp_dict[name])
                if "dc" in xmp_dict:
                    result.add(xmp_dict["dc"], metadataType="dc")

            self.metadata = result
            return result
Пример #4
0
    def get_metadata(self):
        """Returns metadata from both
           the info field (older PDFs) and XMP (newer PDFs).
           Return format is a .modules.metadata.Metadata object

           The result is also stored on self.metadata.
        """
        # The context manager guarantees the file is closed even when
        # the PDF parser raises on a malformed document.
        with open(self.path, 'rb') as file_pointer:
            parser = PDFParser(file_pointer)
            doc = PDFDocument()
            # Parser and document must be linked to each other before
            # the document is initialized.
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize()

            metadata = Metadata()
            for i in doc.info:
                metadata.add(i)

            if 'Metadata' in doc.catalog:
                xmp_metadata = resolve1(doc.catalog['Metadata']).get_data()
                xmp_dict = xmp_to_dict(xmp_metadata)
                # Let's add only the most useful ones
                if "xap" in xmp_dict:
                    metadata.add(xmp_dict["xap"])
                if "pdf" in xmp_dict:
                    metadata.add(xmp_dict["pdf"])
                if "dc" in xmp_dict:
                    metadata.add(xmp_dict["dc"], metadataType="dc")

        self.metadata = metadata
        return metadata
Пример #5
0
 def get_metadata(self):
     """Build a .modules.metadata.Metadata object from wvSummary output.

     Each line produced by running wvSummary on self.path has its
     surrounding whitespace and all double quotes removed and is then
     split on " = ". Lines that yield exactly two parts are added as
     a key/value pair under the "mso" metadata type; all other lines
     are ignored. The result is stored on self.metadata and returned.
     """
     wv_command = 'wvSummary ' + self.path
     collected = Metadata()
     for raw_line in run_command(wv_command):
         fields = raw_line.strip().replace("\"", "").split(" = ")
         if len(fields) != 2:
             continue
         key, value = fields
         collected.add({key: value}, "mso")
     self.metadata = collected
     return collected
Пример #6
0
 def get_metadata(self):
     """Returns a metadata.Metadata object
        Using wvSummary

        Every output line of the form ``key = value`` (double quotes
        removed) is added under the "mso" metadata type; other lines
        are ignored.

        Raises subprocess.CalledProcessError if wvSummary exits with a
        non-zero status, and OSError if the executable cannot be
        started.
     """
     # Pass an argument list instead of a shell string: a path with
     # spaces or shell metacharacters then reaches wvSummary intact
     # and can never be interpreted as shell syntax.
     output = subprocess.check_output(['wvSummary', self.path])
     if not isinstance(output, str):
         # Python 3 returns bytes, which cannot be split on a str.
         # NOTE(review): UTF-8 is assumed for wvSummary output - confirm.
         output = output.decode("utf-8", "replace")
     metadata = Metadata()
     for line in output.split("\n"):
         parts = line.strip().replace("\"", "").split(" = ")
         if len(parts) == 2:
             metadata.add({parts[0]: parts[1]}, "mso")
     return metadata
Пример #7
0
 def get_metadata(self):
     """Returns a metadata.Metadata object
        Using wvSummary

        Output lines that split into exactly two parts on " = " (after
        stripping whitespace and removing double quotes) are added as
        key/value pairs under the "mso" metadata type.

        Raises subprocess.CalledProcessError if wvSummary exits with a
        non-zero status, and OSError if the executable cannot be
        started.
     """
     # No shell is involved: the path is handed to wvSummary as one
     # argument, so spaces and shell metacharacters in it are harmless.
     output = subprocess.check_output(['wvSummary', self.path])
     if not isinstance(output, str):
         # On Python 3 check_output returns bytes; decode before the
         # text processing below.
         # NOTE(review): UTF-8 is assumed for wvSummary output - confirm.
         output = output.decode("utf-8", "replace")
     metadata = Metadata()
     for line in output.split("\n"):
         parts = line.strip().replace("\"", "").split(" = ")
         if len(parts) == 2:
             metadata.add({parts[0]: parts[1]}, "mso")
     return metadata
Пример #8
0
def run():
    """Split the input PDF into one PDF per chapter and tag each one.

    Steps, all performed inside a temporary directory:
      1. validate the input file, metadata file, output folder and
         external dependencies;
      2. read the ISBN (dashes removed) from the JSON metadata file and
         build the Metadata and Pdf helpers;
      3. for every chapter, derive the page range from its 'page' entry
         and the output name from the second '/'-separated component of
         its 'DOI' entry plus '.pdf';
      4. produce the chapter PDFs in parallel worker processes;
      5. write each chapter's metadata into its PDF;
      6. output either a zip archive or the loose PDFs, depending on
         the ``compress`` argument.

    Any exception raised by a worker while producing a chapter PDF is
    re-raised here instead of being silently dropped.
    """
    # The temporary directory is destroyed on completion
    with tempfile.TemporaryDirectory() as tmp_dir:

        # Create core object instance
        core = Core(tmp_dir)

        # Checks
        file_checks(core.argv.input_file)
        file_checks(core.argv.metadata)
        path_checks(core.argv.output_folder)
        dependencies_checks()

        # Retrieve ISBN. JSON text is UTF-8, so do not depend on the
        # platform's default encoding when decoding the file.
        json_file = os.path.abspath(core.argv.metadata)
        with open(json_file, encoding='utf-8') as json_data:
            isbn = json.load(json_data)['isbn'].replace('-', '')

        # Create object instances
        metadata = Metadata(isbn)
        pdf = Pdf(core.argv.input_file, tmp_dir)

        page_ranges = []
        output_file_names = []

        # Iterate over chapters metadata
        for chapter_data in metadata.chapters_data:
            page_ranges.append(chapter_data['page'].split('-'))
            output_file_names.append(chapter_data['DOI'].split('/')[1] +
                                     '.pdf')

        # Merge PDFs
        with concurrent.futures.ProcessPoolExecutor() as executor:
            # Executor.map only re-raises a worker's exception when its
            # result is retrieved, so drain the iterator; otherwise a
            # failed merge would go unnoticed until a later step.
            list(executor.map(pdf.merge_pdfs, page_ranges,
                              output_file_names))

        # Write metadata
        for output_file_name, chapter_data in zip(output_file_names,
                                                  metadata.chapters_data):
            output_file_path = os.path.join(tmp_dir, output_file_name)
            Metadata.write_metadata(chapter_data, output_file_path)

        # PDFs are temporarily stored in tmp_dir
        if core.argv.compress:
            # Output a zip archive
            core.output_archive(metadata.get_doi_suffix())
        else:
            # Output loose PDFs
            core.output_pdfs()
Пример #9
0
 def get_metadata(self):
     """Return the document properties as a Metadata object.

     A fresh .modules.metadata.Metadata object is stored on
     self.metadata, filled with every property openxmllib reports for
     the file at self.path (metadata type "ooxml"), and returned.
     """
     self.metadata = Metadata()
     ooxml_document = openxmllib.openXmlDocument(path=self.path)
     all_properties = ooxml_document.allProperties
     self.metadata.add(all_properties, "ooxml")
     return self.metadata
Пример #10
0
    def get_metadata(self):
        """Returns a metadata.Metadata object

           See http://www.biblioscape.com/rtf15_spec.htm
           for RTF metadata specification

           The RTF file is converted to XML with rtf2xml, and every
           element below the doc-information element is added: elements
           with text as name/text pairs, elements carrying a non-zero
           "year" attribute as a "YYYY-MM-DD" date string. A document
           without a doc-information element yields empty metadata.
        """
        import os
        import xml.etree.ElementTree as ET

        temp_filename = os.path.join("temp", "tmp.rtf.xml")

        try:
            parse_obj = rtf2xml.ParseRtf.ParseRtf(in_file=self.path,
                                                  out_file=temp_filename)
            parse_obj.parse_rtf()
            # The tree is read fully into memory here, so the temporary
            # file can be removed before the elements are processed.
            root = ET.parse(temp_filename).getroot()
        finally:
            # Clean up even when conversion or parsing failed; the file
            # may not exist if the converter failed before writing it.
            if os.path.exists(temp_filename):
                os.unlink(temp_filename)

        metadata = Metadata()
        section = root.find(".//{%s}doc-information" % self.NS_RTF)
        if section is None:
            # find() returns None when the document has no info group.
            return metadata

        for tag in section.iterfind(".//*"):
            tag_name = self._tag_name(tag)
            year = tag.get("year")
            if tag.text is not None:
                metadata.add({tag_name: tag.text})
            elif year is not None and year != "0":
                # Month and day fall back to "01" when the attribute is
                # missing or empty; the fallback must be applied before
                # zero-padding, as a padded value is never falsy.
                month = (tag.get("month") or "01").zfill(2)
                day = (tag.get("day") or "01").zfill(2)
                metadata.add({tag_name: "-".join((year, month, day))})

        return metadata
Пример #11
0
def main(message):
    """Run the FIT pipeline for one message and record its metadata.

    On success the values returned by get_result(message) are passed to
    a Metadata object which is then processed. If anything raises, the
    traceback is printed to stdout and a Metadata object carrying the
    error text is processed instead. The Spark session is stopped in
    every case, including when the error reporting itself fails.

    message is expected to provide a "metadata_path" entry.
    """
    try:
        model, data_prediction, data_train, _path, time_total = get_result(
            message)
        hadoop_metadata_path = message["metadata_path"]
        metadata = Metadata(spark, hadoop_metadata_path, PipelineType.FIT,
                            None, data_prediction, data_train, model, message,
                            time_total)
        metadata.process_metadata()

    except Exception as er:
        traceback.print_exc(file=sys.stdout)
        hadoop_metadata_path = message["metadata_path"]
        metadata = Metadata(spark, hadoop_metadata_path, PipelineType.FIT,
                            str(er))
        metadata.process_metadata()
    finally:
        # Runs even if the error handler above raises, so the Spark
        # session is never left open.
        spark.stop()
Пример #12
0
    def get_metadata(self):
        """Returns a metadata.Metadata object

        The object holds the document title (when available), one entry
        per <meta> tag (keyed by its "name", or else its "property",
        with the "content" attribute as value) and one entry per <link>
        tag (keyed by its "rel", with "href" as value). Meta and link
        entries use the "html" metadata type.
        """
        if self.soup is None:
            self._load_content()

        metadata = Metadata()
        try:
            metadata.add({"title": self.soup.title.string})
        except Exception:
            # Best effort: the title is optional metadata, so a failure
            # here (e.g. no title element) must not abort extraction.
            pass
        for meta_tag in self.soup.find_all('meta'):
            key = meta_tag.get('name', None) or meta_tag.get('property', None)
            metadata.add({key: meta_tag.get('content')}, metadataType="html")
        for link_tag in self.soup.find_all('link'):
            # Read from link_tag itself, not from the last meta tag.
            rel = link_tag.get('rel')
            if isinstance(rel, (list, tuple)):
                # The parser may return "rel" as a list of tokens, which
                # cannot be a dict key; join it back into one string.
                rel = " ".join(rel)
            metadata.add({rel: link_tag.get("href")},
                         metadataType="html")

        return metadata
Пример #13
0
    def get_metadata(self):
        """Returns a metadata.Metadata object

        Collected entries: the document title (when available), every
        <meta> tag as name-or-property -> content, and every <link> tag
        as rel -> href. Meta and link entries are added with the "html"
        metadata type.
        """
        if self.soup is None:
            self._load_content()

        metadata = Metadata()
        try:
            metadata.add({"title": self.soup.title.string})
        except Exception:
            # Best effort: a missing or unreadable title must not stop
            # the rest of the metadata from being collected.
            pass
        for meta_tag in self.soup.find_all('meta'):
            key = meta_tag.get('name', None) or meta_tag.get('property', None)
            metadata.add({key: meta_tag.get('content')},
                         metadataType="html")
        for link_tag in self.soup.find_all('link'):
            # Use the link tag being iterated; reading the leftover
            # meta_tag variable here would record the wrong element.
            rel = link_tag.get('rel')
            if isinstance(rel, (list, tuple)):
                # "rel" can come back as a list of tokens; lists are
                # unhashable, so turn it into a single string key.
                rel = " ".join(rel)
            metadata.add({rel: link_tag.get("href")},
                         metadataType="html")

        return metadata
Пример #14
0
class DocxExtractor(ExtractorBase):
    """Class for getting plain text from an Office Open XML file.
    """

    def get_metadata(self):
        """Returns a .modules.metadata.Metadata object

           The document properties reported by openxmllib are added
           under the "ooxml" metadata type. The object is also kept on
           self.metadata.
        """
        self.metadata = Metadata()
        document = openxmllib.openXmlDocument(path=self.path)
        self.metadata.add(document.allProperties, "ooxml")
        return self.metadata

    def get_header(self):
        """Return the header text of the document, one text run per line.

           Our docx library has no support for headers yet.
           For now carve out the header ourselves by parsing xml files.
        """
        import xml.etree.ElementTree as ET
        import zipfile

        headers = []
        namespaces = dict(
            w="http://schemas.openxmlformats.org/wordprocessingml/2006/main",
            v="urn:schemas-microsoft-com:vml",
            r="http://schemas.openxmlformats.org/officeDocument/2006/relationships"
        )

        # Docx is basically a zip file with embedded xml and media files.
        # The context managers make sure the archive and its members are
        # closed again, also when a header contains malformed XML.
        with zipfile.ZipFile(self.path) as z:
            # Get all header files and parse them for text
            header_xml_files = [name for name in z.namelist()
                                if name.startswith('word/header')]

            for header_xml in header_xml_files:
                with z.open(header_xml, 'r') as xml_file:
                    root = ET.fromstring(xml_file.read())

                for text_element in root.findall(".//w:t", namespaces):
                    if text_element.text is not None:
                        headers.append(text_element.text)

                # NOTE: Headers might also include images with text.
                # To get those one would find all v:imagedata elements
                # in the header, look up their r:id in
                # word/_rels/<header file>.rels to locate the image
                # under word/, and OCR it (pytesseract, lang='swe').
                # This is disabled, as PIL on *nix can't handle WMF
                # images.

        return "\n".join(headers)

    def get_next_page(self):
        """Returns all the text in one single page.
           We might be able to use e.g. Abiword to calculate
           approximate page breaks.
        """
        page = DocxPage()
        page._text = self.get_text()
        yield page

    def get_text(self):
        """Returns all text content from the document as plain text.

           Paragraph texts are joined with newlines. The result is
           cached on the instance after the first call.
        """
        try:
            return self._text_cache
        except AttributeError:  # not cached
            pass

        document = Document(self.path)
        paratextlist = [paragraph.text for paragraph in document.paragraphs]
        self._text_cache = "\n".join(paratextlist)
        return self._text_cache
Пример #15
0
class DocxExtractor(ExtractorBase):
    """Class for getting plain text from an Office Open XML file.
    """

    def get_metadata(self):
        """Returns a .modules.metadata.Metadata object

           All properties that openxmllib reports for the file are
           added under the "ooxml" metadata type; the object is stored
           on self.metadata as well.
        """
        self.metadata = Metadata()
        document = openxmllib.openXmlDocument(path=self.path)
        self.metadata.add(document.allProperties, "ooxml")
        return self.metadata

    def get_header(self):
        """Return all header text, with text runs separated by newlines.

           Our docx library has no support for headers yet.
           For now carve out the header ourselves by parsing xml files.
        """
        import xml.etree.ElementTree as ET
        import zipfile

        headers = []
        namespaces = dict(
            w="http://schemas.openxmlformats.org/wordprocessingml/2006/main",
            v="urn:schemas-microsoft-com:vml",
            r="http://schemas.openxmlformats.org/officeDocument/2006/relationships"
        )

        # Docx is basically a zip file with embedded xml and media files.
        # Open it (and each member) in a with-block so the handles are
        # released even if parsing one of the headers raises.
        with zipfile.ZipFile(self.path) as z:
            # Get all header files and parse them for text
            header_xml_files = [member for member in z.namelist()
                                if member.startswith('word/header')]

            for header_xml in header_xml_files:
                with z.open(header_xml, 'r') as xml_file:
                    root = ET.fromstring(xml_file.read())

                for text_element in root.findall(".//w:t", namespaces):
                    if text_element.text is not None:
                        headers.append(text_element.text)

                # NOTE: Headers might also include images with text.
                # Extracting it would mean: find the v:imagedata
                # elements, resolve their r:id through
                # word/_rels/<header file>.rels to the image file under
                # word/, then OCR that image (pytesseract, lang='swe').
                # Disabled, as PIL on *nix can't handle WMF images.

        return "\n".join(headers)

    def get_next_page(self):
        """Returns all the text in one single page.
           We might be able to use e.g. Abiword to calculate
           approximate page breaks.
        """
        page = DocxPage()
        page._text = self.get_text()
        yield page

    def get_text(self):
        """Returns all text content from the document as plain text.

           The text of every paragraph is joined with newlines and the
           result is cached on the instance for later calls.
        """
        try:
            return self._text_cache
        except AttributeError:  # not cached
            pass

        document = Document(self.path)
        paratextlist = [paragraph.text for paragraph in document.paragraphs]
        self._text_cache = "\n".join(paratextlist)
        return self._text_cache