def saveResults(audioMetadata, metadata, children):
    """Copy audio tags into ``metadata`` and analyse embedded binaries.

    Every key of ``audioMetadata`` that ``in_blacklist`` does not reject is
    handled in one of two ways:

    * a value exposing a ``data`` attribute is treated as an embedded binary:
      it is written to a temporary file under ``insiderer.TMP_DIR``, passed to
      ``insiderer.get_metadata`` and the result is appended to ``children``;
    * any other value is stored in ``metadata[key]`` after unwrapping
      (``.text`` for length-less objects, the sole element for length-1
      containers).

    Args:
        audioMetadata: mapping-like tag container (must support ``keys()``
            and item access).
        metadata: dict updated in place with the plain tag values.
        children: list extended in place with metadata of embedded binaries.
    """
    for key in audioMetadata.keys():
        value = audioMetadata[key]
        if in_blacklist(key, value):
            continue

        if hasattr(value, 'data'):
            # An embedded binary; fall back to "cover" when it has no
            # description of its own.
            name = value.desc if hasattr(value, 'desc') else "cover"
            # NamedTemporaryFile(delete=False) hands back an open file object,
            # so no raw descriptor is left dangling the way it was with
            # ``mkstemp(...)[1]``.
            tmp_handle = tempfile.NamedTemporaryFile(
                dir=insiderer.TMP_DIR, delete=False)
            childpath = tmp_handle.name
            try:
                with tmp_handle:
                    tmp_handle.write(value.data)
                child = insiderer.get_metadata(childpath, name)
                children.append(child)
            finally:
                insiderer.safedelete(childpath)
            continue

        if not hasattr(value, '__len__') and hasattr(value, 'text'):
            value = value.text
        if hasattr(value, '__len__') and len(value) == 1:
            value = value[0]
        metadata[key] = value
def saveResults(audioMetadata, metadata, children):
    """Copy audio tags into ``metadata`` and analyse embedded binaries.

    Keys rejected by ``in_blacklist`` are skipped. A value with a ``data``
    attribute is considered an embedded binary: its bytes are written to a
    temporary file in ``insiderer.TMP_DIR``, analysed with
    ``insiderer.get_metadata`` and the result appended to ``children``; the
    temporary file is always removed afterwards. Every other value is stored
    in ``metadata[key]``, unwrapped via ``.text`` (objects without a length)
    or reduced to its only element (length-1 containers).

    Args:
        audioMetadata: mapping-like tag container (must support ``keys()``
            and item access).
        metadata: dict updated in place with the plain tag values.
        children: list extended in place with metadata of embedded binaries.
    """
    for key in audioMetadata.keys():
        value = audioMetadata[key]
        if in_blacklist(key, value):
            continue

        if not hasattr(value, 'data'):
            if not hasattr(value, '__len__') and hasattr(value, 'text'):
                value = value.text
            if hasattr(value, '__len__') and len(value) == 1:
                value = value[0]
            metadata[key] = value
            continue

        # An embedded binary.
        name = "cover"
        if hasattr(value, 'desc'):
            name = value.desc
        # Use a file object from the start: ``mkstemp(...)[1]`` threw away an
        # open descriptor that was never closed. The ``with`` block closes the
        # handle, so no explicit close() is needed.
        tmp_handle = tempfile.NamedTemporaryFile(
            dir=insiderer.TMP_DIR, delete=False)
        childpath = tmp_handle.name
        try:
            with tmp_handle:
                tmp_handle.write(value.data)
            children.append(insiderer.get_metadata(childpath, name))
        finally:
            insiderer.safedelete(childpath)
def process_a_jpeg(jpeg_data):
    """Write JPEG bytes to a temporary file and collect its image metadata.

    The bytes are stored in a temporary file under ``insiderer.TMP_DIR`` and
    handed to ``mimes.image.image``, which fills the returned dict. Any
    failure is logged through ``cherrypy.log`` and swallowed; the temporary
    file is removed in all cases.

    Args:
        jpeg_data: raw bytes of the JPEG.

    Returns:
        dict: whatever ``mimes.image.image`` stored; empty (or partially
        filled) when processing failed.
    """
    # Bound before the try so that the ``finally`` clause and the ``return``
    # never hit an unbound name when an early step fails.
    jpeg_metadata = dict()
    jpeg_path = None
    try:
        # delete=False keeps the file for the analyser; using a file object
        # avoids leaking the descriptor that ``mkstemp(...)[1]`` discarded.
        with tempfile.NamedTemporaryFile(
                dir=insiderer.TMP_DIR, delete=False) as jpeg_handle:
            jpeg_path = jpeg_handle.name
            jpeg_handle.write(jpeg_data)
        # Imported here, as in the original code, rather than at module level.
        import mimes.image
        mimes.image.image(jpeg_path, jpeg_metadata, None)
    except Exception as e:
        # cherrypy.log's second positional parameter is ``context`` (a
        # string), so the exception has to be formatted into the message.
        cherrypy.log("PDF JPEG exception: %s" % (e,))
    finally:
        if jpeg_path is not None:
            insiderer.safedelete(jpeg_path)
    return jpeg_metadata
def image_svg_xml(path, metadata, children):
    """Extract metadata and embedded base64 images from an SVG file.

    A fixed set of XPath queries (Inkscape/Sodipodi attributes, ``svg:desc``,
    ``svg:title``, Dublin Core attributes and top-level RDF elements) is run
    against the parsed document. String results are stored as ``str``;
    element results are serialised and converted with ``xmltodict.parse``.
    Key names are passed through ``insiderer.de_dup`` before being stored.

    Every ``svg:image`` whose xlink ``href`` is a base64 ``data:`` URI is
    decoded into a temporary file under ``insiderer.TMP_DIR`` and analysed
    with ``insiderer.get_metadata``; the result is appended to ``children``.

    Args:
        path: path of the SVG file to parse.
        metadata: dict updated in place.
        children: list extended in place with metadata of embedded images.
    """
    svg = etree.parse(path)
    fields = {
        'export-filename': '//@inkscape:export-filename',
        'docname': '//@sodipodi:docname',
        'description': '//svg:desc',
        'title': '//svg:title',
        # because selecting by namespace attributes doesn't seem to work
        'dca': '//@*[namespace-uri()="%s"][not(ancestor::dc:*)]'
               % namespaces["dc"],
        'rdf': '//rdf:*[not(ancestor::rdf:*)]',
    }
    for key, query in fields.items():
        # Wildcard queries match many differently named nodes, so the key is
        # taken from each node instead of from the field table.
        is_wildcard = '*' in query
        for result in svg.xpath(query, namespaces=namespaces):
            key_name = key
            if is_wildcard:
                if hasattr(result, "attrname"):
                    # Drop the "{namespace-uri}" prefix of the attribute name.
                    key_name = re.sub(r'\{.*?\}', '', result.attrname)
                else:
                    key_name = result.xpath('name()')
            key_name = insiderer.de_dup(key_name, metadata)
            if isinstance(result, etree._ElementUnicodeResult):
                metadata[key_name] = str(result)
            else:
                serialised = etree.tostring(
                    result, encoding='utf8', method='xml')
                metadata[key_name] = xmltodict.parse(
                    serialised.decode('utf-8'))

    href_attribute = '{%s}href' % namespaces["xlink"]
    for image in svg.xpath('//svg:image', namespaces=namespaces):
        href = image.attrib.get(href_attribute)
        # An <image> without an xlink:href used to crash on None.startswith.
        if not href:
            continue
        if not (href.startswith("data:") and base64_prefix in href[0:100]):
            continue
        name = image.attrib.get('id') or "unknown"
        payload_start = href.find(base64_prefix) + len(base64_prefix)
        image_data = base64.standard_b64decode(href[payload_start:])
        # A file object (instead of ``mkstemp(...)[1]``) means no descriptor
        # is leaked, and the path is bound before the try/finally starts.
        image_handle = tempfile.NamedTemporaryFile(
            dir=insiderer.TMP_DIR, delete=False)
        image_path = image_handle.name
        try:
            with image_handle:
                image_handle.write(image_data)
            child = insiderer.get_metadata(image_path, name)
            children.append(child)
        finally:
            insiderer.safedelete(image_path)
def application_vnd_oasis_opendocument_text(path, metadata, children,
                                            from_doc=False):
    """Extract metadata from an OpenDocument text package.

    The archive is unpacked into a temporary directory under
    ``insiderer.TMP_DIR`` and each member is handled by name:

    * ``meta.xml``: the local name and text of every node matched by
      ``/*/*/*`` is stored in ``metadata`` (``generator`` is skipped when
      ``from_doc`` is true);
    * ``content.xml``: every ``change-info`` element becomes a dict appended
      to ``metadata['track_changes']``;
    * directories and a fixed set of package-internal files are ignored, as
      is ``Thumbnails/thumbnail.png`` when ``from_doc`` is true;
    * anything else is analysed with ``insiderer.get_metadata`` and appended
      to ``children``.

    Args:
        path: path of the ODT file.
        metadata: dict updated in place.
        children: list extended in place with metadata of embedded files.
        from_doc: when exactly ``True``, skip the generator field and the
            thumbnail (see above).
    """
    ignored_names = {
        "mimetype",
        "manifest.rdf",
        "META-INF/manifest.xml",
        "current.xml",
        "styles.xml",
        "settings.xml",
        "",
        "Configurations2/accelerator/current.xml",
        "layout-cache",
    }
    tmp_path = None
    try:
        # The context manager closes the archive, which was previously left
        # open.
        with zipfile.ZipFile(path) as odtzip:
            tmp_path = tempfile.mkdtemp(dir=insiderer.TMP_DIR)
            # ZipFile.extract returns the sanitised path it actually wrote.
            # Rebuilding the path from the raw member name could point
            # outside tmp_path (names containing ".."), and that path is
            # later read and deleted.
            extracted = [
                (info.filename, odtzip.extract(info, tmp_path))
                for info in odtzip.infolist()
            ]

        for name, childpath in extracted:
            try:
                if name == "meta.xml":
                    metaxml = etree.parse(childpath)
                    for item in metaxml.xpath('/*/*/*'):
                        key = item.xpath('local-name()')
                        if from_doc is True and key == "generator":
                            continue
                        metadata[key] = item.text
                elif name == "content.xml":
                    metaxml = etree.parse(childpath)
                    trackchanges = metaxml.xpath(
                        '//*[local-name() = "change-info"]')
                    if trackchanges:
                        metadata['track_changes'] = []
                    for trackchange in trackchanges:
                        trackchange_dict = {}
                        for item in trackchange.xpath('*'):
                            trackchange_dict[
                                item.xpath('local-name()')] = item.text
                        metadata['track_changes'].append(trackchange_dict)
                elif os.path.isdir(childpath):
                    pass
                elif name in ignored_names:
                    pass
                elif from_doc is True and name == "Thumbnails/thumbnail.png":
                    pass
                else:
                    child = insiderer.get_metadata(childpath, name)
                    children.append(child)
            finally:
                if not os.path.isdir(childpath):
                    insiderer.safedelete(childpath)
    finally:
        if tmp_path:
            shutil.rmtree(tmp_path)
def _store_xmp_fields(xmp, target):
    """Store every attribute of ``xmp`` in ``target`` in a serialisable form."""
    # NOTE(review): dir() also yields private names and methods; they are
    # kept here to preserve the existing output.
    for name in dir(xmp):
        try:
            xmp_data = getattr(xmp, name)
            if isinstance(xmp_data, datetime.datetime):
                # Was ``xmp_data.now()``, which returns the current time
                # instead of the date stored in the document.
                target[name] = str(xmp_data)
                continue
            str_xmp_data = str(xmp_data)
            try:
                target[name] = json.loads(str_xmp_data)
            except ValueError:
                if str_xmp_data.startswith("<"):
                    # Looks like an XML node (or an object repr); keep it
                    # only if it can really be serialised as XML.
                    try:
                        target[name] = xmltodict.parse(xmp_data.toxml())
                    except Exception:
                        pass
                else:
                    target[name] = str_xmp_data
        except Exception as e:
            # cherrypy.log does not apply %-formatting itself; its extra
            # positional parameters are ``context`` and ``severity``.
            cherrypy.log("Can't serialize %s. %s" % (name, e))


def application_pdf(path, metadata, children):
    """Extract document info, XMP metadata and embedded JPEGs from a PDF.

    The PDF is first rewritten uncompressed by ``pdftk`` into a temporary
    file under ``insiderer.TMP_DIR``. The raw bytes are passed to
    ``extract_jpegs`` (results go to ``children``), then PyPDF2 reads the
    document info into ``metadata["info"]`` and the XMP attributes into
    ``metadata["xmp"]``. Any failure is logged through ``cherrypy.log`` and
    swallowed; the temporary file is always removed.

    Args:
        path: path of the PDF file.
        metadata: dict updated in place with ``info`` and ``xmp`` entries.
        children: list extended in place with metadata of embedded JPEGs.
    """
    # Bound before the try so the ``finally`` clause cannot hit an unbound
    # name when creating the temporary file fails.
    uncompressed_pdf = None
    try:
        # Reserve a file name for pdftk's output. The handle is closed
        # straight away, unlike the descriptor ``mkstemp(...)[1]`` dropped.
        with tempfile.NamedTemporaryFile(
                dir=insiderer.TMP_DIR, delete=False) as placeholder:
            uncompressed_pdf = placeholder.name
        subprocess.check_output(
            ["pdftk", path, "output", uncompressed_pdf, "uncompress"])
        with open(uncompressed_pdf, 'rb') as pdfhandle:
            pdfdata = pdfhandle.read()
            children.extend(extract_jpegs(pdfdata))
            # The reader keeps using the handle, so everything that touches
            # pdf_document stays inside this ``with`` block.
            pdf_document = PyPDF2.PdfFileReader(pdfhandle)
            if pdf_document.isEncrypted:
                pdf_document.decrypt("")
            metadata["info"] = pdf_document.getDocumentInfo()
            metadata["xmp"] = dict()
            xmp = pdf_document.getXmpMetadata()
            if xmp:
                _store_xmp_fields(xmp, metadata["xmp"])
    except Exception as e:
        cherrypy.log("PDF exception: %s" % (e,))
    finally:
        if uncompressed_pdf is not None:
            insiderer.safedelete(uncompressed_pdf)
def image_svg_xml(path, metadata, children):
    """Extract metadata and embedded base64 images from an SVG file.

    Runs a fixed table of XPath queries (Inkscape/Sodipodi attributes,
    ``svg:desc``, ``svg:title``, Dublin Core attributes, top-level RDF
    elements) and stores each result in ``metadata`` under a key made unique
    by ``insiderer.de_dup``: string results as ``str``, element results as
    the ``xmltodict.parse`` of their serialised XML.

    Each ``svg:image`` carrying a base64 ``data:`` URI in its xlink ``href``
    is decoded to a temporary file under ``insiderer.TMP_DIR`` and analysed
    with ``insiderer.get_metadata``; the result is appended to ``children``.

    Args:
        path: path of the SVG file to parse.
        metadata: dict updated in place.
        children: list extended in place with metadata of embedded images.
    """
    svg = etree.parse(path)
    fields = {
        'export-filename': '//@inkscape:export-filename',
        'docname': '//@sodipodi:docname',
        'description': '//svg:desc',
        'title': '//svg:title',
        # because selecting by namespace attributes doesn't seem to work
        'dca': '//@*[namespace-uri()="%s"][not(ancestor::dc:*)]'
               % namespaces["dc"],
        'rdf': '//rdf:*[not(ancestor::rdf:*)]',
    }
    for key in fields.keys():
        xpath_query = fields[key]
        results = svg.xpath(xpath_query, namespaces=namespaces)
        for result in results:
            key_name = key
            # For wildcard queries the key comes from the matched node.
            if '*' in xpath_query:
                if hasattr(result, "attrname"):
                    # Strip the "{namespace-uri}" prefix of the attribute.
                    key_name = re.sub(r'\{.*?\}', '', result.attrname)
                else:
                    key_name = result.xpath('name()')
            key_name = insiderer.de_dup(key_name, metadata)
            if isinstance(result, etree._ElementUnicodeResult):
                metadata[key_name] = str(result)
            else:
                metadata[key_name] = xmltodict.parse(
                    etree.tostring(result, encoding='utf8',
                                   method='xml').decode('utf-8'))

    base64images = svg.xpath('//svg:image', namespaces=namespaces)
    for image in base64images:
        href = image.attrib.get('{%s}href' % namespaces["xlink"])
        if href is None:
            # No xlink:href at all; calling startswith on None used to raise.
            continue
        name = image.attrib.get('id') or "unknown"
        if href.startswith("data:") and base64_prefix in href[0:100]:
            image_data = base64.standard_b64decode(
                href[href.find(base64_prefix) + len(base64_prefix):])
            # Opening the temporary file as a file object avoids the leaked
            # descriptor of ``mkstemp(...)[1]`` and binds the path before the
            # cleanup clause can run.
            image_handle = tempfile.NamedTemporaryFile(
                dir=insiderer.TMP_DIR, delete=False)
            image_path = image_handle.name
            try:
                with image_handle:
                    image_handle.write(image_data)
                child = insiderer.get_metadata(image_path, name)
                children.append(child)
            finally:
                insiderer.safedelete(image_path)
def application_zip(path, metadata, children, from_doc=False):
    """Analyse every file contained in a zip archive.

    The archive is unpacked into a temporary directory under
    ``insiderer.TMP_DIR``. Each non-directory member is analysed with
    ``insiderer.get_metadata`` and the result appended to ``children``.
    Extracted files are removed as soon as they have been processed and the
    temporary directory is removed at the end.

    Args:
        path: path of the zip file.
        metadata: unused by this function.
        children: list extended in place with metadata of archive members.
        from_doc: unused by this function.
    """
    tmp_path = None
    try:
        # The context manager closes the archive, which was previously left
        # open.
        with zipfile.ZipFile(path) as archive:
            tmp_path = tempfile.mkdtemp(dir=insiderer.TMP_DIR)
            # ZipFile.extract returns the sanitised path it actually wrote.
            # os.path.join(tmp_path, name) escapes tmp_path for absolute or
            # ".." member names, and that path is then analysed and deleted.
            extracted = [
                (info.filename, archive.extract(info, tmp_path))
                for info in archive.infolist()
            ]

        for name, childpath in extracted:
            try:
                if not os.path.isdir(childpath):
                    children.append(insiderer.get_metadata(childpath, name))
            finally:
                if not os.path.isdir(childpath):
                    insiderer.safedelete(childpath)
    finally:
        if tmp_path:
            shutil.rmtree(tmp_path)