Пример #1
0
def saveResults(audioMetadata, metadata, children):
  """Copy non-blacklisted audio tags into ``metadata``; analyse embedded binaries.

  Args:
    audioMetadata: mapping-like tag container (must provide ``keys()`` and
      item access).
    metadata: dict that receives the simplified tag values (mutated in place).
    children: list that receives the result of ``insiderer.get_metadata`` for
      every tag value carrying a ``data`` attribute (mutated in place).
  """
  for key in audioMetadata.keys():
    value = audioMetadata[key]
    if in_blacklist(key, value):
      continue

    if hasattr(value, 'data'):
      # An embedded binary: dump it to a temporary file and analyse that file
      # as a child document.
      name = getattr(value, 'desc', "cover")
      childpath = None
      try:
        # NamedTemporaryFile hands back a real file object, so the descriptor
        # is closed by the ``with`` block (``mkstemp(...)[1]`` leaked it).
        with tempfile.NamedTemporaryFile(dir=insiderer.TMP_DIR, delete=False) as tmp_handle:
          childpath = tmp_handle.name
          tmp_handle.write(value.data)
        children.append(insiderer.get_metadata(childpath, name))
      finally:
        if childpath is not None:
          insiderer.safedelete(childpath)
      continue

    # Objects without a length but with a ``text`` attribute are replaced by
    # that text.
    if not hasattr(value, '__len__') and hasattr(value, 'text'):
      value = value.text
    # Single-element containers are unwrapped to the element itself.
    if hasattr(value, '__len__') and len(value) == 1:
      value = value[0]
    metadata[key] = value
Пример #2
0
def saveResults(audioMetadata, metadata, children):
    """Copy non-blacklisted audio tags into ``metadata``; analyse embedded binaries.

    Args:
        audioMetadata: mapping-like tag container (must provide ``keys()``
            and item access).
        metadata: dict that receives the simplified tag values (mutated in
            place).
        children: list that receives the result of
            ``insiderer.get_metadata`` for every tag value carrying a
            ``data`` attribute (mutated in place).
    """
    for key in audioMetadata.keys():
        value = audioMetadata[key]
        if in_blacklist(key, value):
            continue

        if hasattr(value, 'data'):
            # An embedded binary: dump it to a temporary file and analyse
            # that file as a child document.
            name = getattr(value, 'desc', "cover")
            childpath = None
            try:
                # NamedTemporaryFile returns a real file object, so the OS
                # descriptor is closed when the ``with`` block exits
                # (``mkstemp(...)[1]`` threw the descriptor away unclosed).
                with tempfile.NamedTemporaryFile(
                        dir=insiderer.TMP_DIR, delete=False) as tmp_handle:
                    childpath = tmp_handle.name
                    tmp_handle.write(value.data)
                # Analyse only after the file has been flushed and closed.
                children.append(insiderer.get_metadata(childpath, name))
            finally:
                if childpath is not None:
                    insiderer.safedelete(childpath)
            continue

        # Objects without a length but with a ``text`` attribute are
        # replaced by that text.
        if not hasattr(value, '__len__') and hasattr(value, 'text'):
            value = value.text
        # Single-element containers are unwrapped to the element itself.
        if hasattr(value, '__len__') and len(value) == 1:
            value = value[0]
        metadata[key] = value
Пример #3
0
def process_a_jpeg(jpeg_data):
    """Write ``jpeg_data`` to a temporary file and collect its image metadata.

    Args:
        jpeg_data: raw bytes of the JPEG to analyse.

    Returns:
        dict filled in by ``mimes.image.image``. Failures are logged rather
        than raised, so the dict may be empty or partially filled.
    """
    # Bound before the ``try`` so that ``finally`` and ``return`` never hit an
    # unbound local when an early step fails.
    jpeg_metadata = dict()
    jpeg_path = None
    try:
        # The ``with`` block closes the descriptor that ``mkstemp(...)[1]``
        # used to leak.
        with tempfile.NamedTemporaryFile(
                dir=insiderer.TMP_DIR, delete=False) as jpeg_handle:
            jpeg_path = jpeg_handle.name
            jpeg_handle.write(jpeg_data)
        import mimes.image
        mimes.image.image(jpeg_path, jpeg_metadata, None)
    except Exception as e:
        # cherrypy.log's second positional argument is a string ``context``,
        # not a format argument, so the exception is formatted into the
        # message instead.
        cherrypy.log("PDF JPEG exception: %s" % e)
    finally:
        if jpeg_path is not None:
            insiderer.safedelete(jpeg_path)

    return jpeg_metadata
Пример #4
0
def image_svg_xml(path, metadata, children):
    """Extract metadata from an SVG file and analyse its inline base64 images.

    Args:
        path: location of the SVG document, parsed with ``etree.parse``.
        metadata: dict that receives one entry per XPath match (mutated in
            place); keys are passed through ``insiderer.de_dup`` first.
        children: list that receives the result of
            ``insiderer.get_metadata`` for every ``svg:image`` whose
            ``xlink:href`` is a base64 ``data:`` URI (mutated in place).
    """
    svg = etree.parse(path)
    fields = {
        'export-filename': '//@inkscape:export-filename',
        'docname': '//@sodipodi:docname',
        'description': '//svg:desc',
        'title': '//svg:title',
        # Selecting by namespace attributes doesn't seem to work, hence the
        # namespace-uri() comparison.
        'dca': '//@*[namespace-uri()="%s"][not(ancestor::dc:*)]' % namespaces["dc"],
        'rdf': '//rdf:*[not(ancestor::rdf:*)]',
    }
    for key, query in fields.items():
        # Wildcard queries match differently named nodes, so the key is taken
        # from each matched node rather than from the field name.
        is_wildcard = '*' in query
        for result in svg.xpath(query, namespaces=namespaces):
            key_name = key
            if is_wildcard:
                if hasattr(result, "attrname"):
                    # Strip the "{namespace}" prefix from the attribute name.
                    key_name = re.sub(r'\{.*?\}', '', result.attrname)
                else:
                    key_name = result.xpath('name()')

            key_name = insiderer.de_dup(key_name, metadata)

            if isinstance(result, etree._ElementUnicodeResult):
                metadata[key_name] = str(result)
            else:
                metadata[key_name] = xmltodict.parse(
                    etree.tostring(result, encoding='utf8',
                                   method='xml').decode('utf-8'))

    href_attr = '{%s}href' % namespaces["xlink"]
    for image in svg.xpath('//svg:image', namespaces=namespaces):
        href = image.attrib.get(href_attr)
        # An <image> without xlink:href yields None; skip it instead of
        # failing on ``None.startswith``.
        if not href:
            continue
        if not href.startswith("data:") or base64_prefix not in href[0:100]:
            continue

        name = image.attrib.get('id') or "unknown"
        image_data = base64.standard_b64decode(
            href[href.find(base64_prefix) + len(base64_prefix):])
        image_path = None
        try:
            # The ``with`` block closes the descriptor that
            # ``mkstemp(...)[1]`` used to leak.
            with tempfile.NamedTemporaryFile(
                    dir=insiderer.TMP_DIR, delete=False) as image_handle:
                image_path = image_handle.name
                image_handle.write(image_data)
            children.append(insiderer.get_metadata(image_path, name))
        finally:
            if image_path is not None:
                insiderer.safedelete(image_path)
Пример #5
0
def application_vnd_oasis_opendocument_text(path, metadata, children, from_doc=False):
  """Extract metadata from an OpenDocument text archive.

  ``meta.xml`` entries and tracked changes from ``content.xml`` are stored in
  ``metadata``; every archive member that is not a directory or one of the
  known structural files is analysed with ``insiderer.get_metadata`` and
  appended to ``children``.

  Args:
    path: location of the ODT (zip) file.
    metadata: dict receiving the extracted values (mutated in place).
    children: list receiving child analysis results (mutated in place).
    from_doc: when exactly ``True``, the ``generator`` meta entry and
      ``Thumbnails/thumbnail.png`` are skipped.
  """
  # Structural members that are never analysed as children.
  ignored_names = {
    "mimetype", "manifest.rdf", "META-INF/manifest.xml", "current.xml",
    "styles.xml", "settings.xml", "",
    "Configurations2/accelerator/current.xml", "layout-cache",
  }
  tmp_path = None
  try:
    # The ``with`` block closes the archive, which was previously left open.
    with zipfile.ZipFile(path) as odtzip:
      tmp_path = tempfile.mkdtemp(dir=insiderer.TMP_DIR)
      for info in odtzip.infolist():
        name = info.filename
        # extract() returns the sanitised path it actually wrote. Building the
        # path from the raw member name allowed "../" names to point at (and
        # later delete) files outside the temporary directory.
        childpath = odtzip.extract(info, tmp_path)
        try:
          if name == "meta.xml":
            metaxml = etree.parse(childpath)
            for item in metaxml.xpath('/*/*/*'):
              key = item.xpath('local-name()')
              if from_doc is True and key == "generator":
                continue
              metadata[key] = item.text
          elif name == "content.xml":
            metaxml = etree.parse(childpath)
            trackchanges = metaxml.xpath('//*[local-name() = "change-info"]')
            if trackchanges:
              metadata['track_changes'] = [
                {item.xpath('local-name()'): item.text
                 for item in trackchange.xpath('*')}
                for trackchange in trackchanges
              ]
          elif os.path.isdir(childpath):
            pass
          elif name in ignored_names:
            pass
          elif from_doc is True and name == "Thumbnails/thumbnail.png":
            pass
          else:
            children.append(insiderer.get_metadata(childpath, name))
        finally:
          if not os.path.isdir(childpath):
            insiderer.safedelete(childpath)
  finally:
    if tmp_path:
      shutil.rmtree(tmp_path)
Пример #6
0
def application_pdf(path, metadata, children):
    """Extract document info, XMP metadata and embedded JPEGs from a PDF.

    The PDF is first uncompressed with the external ``pdftk`` tool into a
    temporary file, which is then read with PyPDF2. All failures are logged
    through ``cherrypy.log`` rather than raised.

    Args:
        path: location of the PDF file.
        metadata: dict receiving ``"info"`` and ``"xmp"`` entries (mutated in
            place).
        children: list extended with whatever ``extract_jpegs`` returns for
            the uncompressed PDF bytes (mutated in place).
    """
    uncompressed_pdf = None
    try:
        # Reserve a temporary file name; the ``with`` block closes the
        # descriptor that ``mkstemp(...)[1]`` used to leak.
        with tempfile.NamedTemporaryFile(
                dir=insiderer.TMP_DIR, delete=False) as tmp_handle:
            uncompressed_pdf = tmp_handle.name
        subprocess.check_output(
            ["pdftk", path, "output", uncompressed_pdf, "uncompress"])
        with open(uncompressed_pdf, 'rb') as pdfhandle:
            children.extend(extract_jpegs(pdfhandle.read()))
            pdf_document = PyPDF2.PdfFileReader(pdfhandle)
            if pdf_document.isEncrypted:
                pdf_document.decrypt("")
            metadata["info"] = pdf_document.getDocumentInfo()
            metadata["xmp"] = dict()
            xmp = pdf_document.getXmpMetadata()
            if xmp:
                for name in dir(xmp):
                    try:
                        xmp_data = getattr(xmp, name)
                        if isinstance(xmp_data, datetime.datetime):
                            # Record the value itself; ``xmp_data.now()``
                            # returned the current time instead.
                            metadata["xmp"][name] = str(xmp_data)
                            continue

                        str_xmp_data = str(xmp_data)
                        try:
                            metadata["xmp"][name] = json.loads(str_xmp_data)
                        except ValueError:
                            # Not JSON: try XML, otherwise keep the string.
                            if str_xmp_data.startswith("<"):
                                try:
                                    metadata["xmp"][name] = xmltodict.parse(
                                        xmp_data.toxml())
                                except Exception:
                                    # Best effort: values that cannot be
                                    # rendered as XML are left out.
                                    pass
                            else:
                                metadata["xmp"][name] = str_xmp_data
                    except Exception as e:
                        # cherrypy.log takes (msg, context, severity) and does
                        # no %-interpolation, so format the message up front.
                        cherrypy.log("Can't serialize %s. %s" % (name, e))
    except Exception as e:
        cherrypy.log("PDF exception: %s" % e)
    finally:
        if uncompressed_pdf is not None:
            insiderer.safedelete(uncompressed_pdf)
Пример #7
0
def image_svg_xml(path, metadata, children):
  """Extract metadata from an SVG file and analyse its inline base64 images.

  Args:
    path: location of the SVG document, parsed with ``etree.parse``.
    metadata: dict that receives one entry per XPath match (mutated in place);
      keys are passed through ``insiderer.de_dup`` first.
    children: list that receives the result of ``insiderer.get_metadata`` for
      every ``svg:image`` whose ``xlink:href`` is a base64 ``data:`` URI
      (mutated in place).
  """
  svg = etree.parse(path)
  fields = {
    'export-filename': '//@inkscape:export-filename',
    'docname':         '//@sodipodi:docname',
    'description':     '//svg:desc',
    'title':           '//svg:title',
    # Selecting by namespace attributes doesn't seem to work, hence the
    # namespace-uri() comparison.
    'dca':             '//@*[namespace-uri()="%s"][not(ancestor::dc:*)]' % namespaces["dc"],
    'rdf':             '//rdf:*[not(ancestor::rdf:*)]',
  }
  for key, query in fields.items():
    # Wildcard queries match differently named nodes, so the key is taken from
    # each matched node rather than from the field name.
    is_wildcard = '*' in query
    for result in svg.xpath(query, namespaces=namespaces):
      key_name = key
      if is_wildcard:
        if hasattr(result, "attrname"):
          # Strip the "{namespace}" prefix from the attribute name.
          key_name = re.sub(r'\{.*?\}', '', result.attrname)
        else:
          key_name = result.xpath('name()')

      key_name = insiderer.de_dup(key_name, metadata)

      if isinstance(result, etree._ElementUnicodeResult):
        metadata[key_name] = str(result)
      else:
        metadata[key_name] = xmltodict.parse(etree.tostring(result, encoding='utf8', method='xml').decode('utf-8'))

  href_attr = '{%s}href' % namespaces["xlink"]
  for image in svg.xpath('//svg:image', namespaces=namespaces):
    href = image.attrib.get(href_attr)
    # An <image> without xlink:href yields None; skip it instead of failing on
    # ``None.startswith``.
    if not href:
      continue
    if not href.startswith("data:") or base64_prefix not in href[0:100]:
      continue

    name = image.attrib.get('id') or "unknown"
    image_data = base64.standard_b64decode(href[href.find(base64_prefix) + len(base64_prefix):])
    image_path = None
    try:
      # The ``with`` block closes the descriptor that ``mkstemp(...)[1]`` used
      # to leak.
      with tempfile.NamedTemporaryFile(dir=insiderer.TMP_DIR, delete=False) as image_handle:
        image_path = image_handle.name
        image_handle.write(image_data)
      children.append(insiderer.get_metadata(image_path, name))
    finally:
      if image_path is not None:
        insiderer.safedelete(image_path)
Пример #8
0
def application_zip(path, metadata, children, from_doc=False):
    """Analyse every file inside a zip archive as a child document.

    Args:
        path: location of the zip file.
        metadata: unused here; kept for the common handler signature.
        children: list that receives the result of
            ``insiderer.get_metadata`` for each non-directory member
            (mutated in place).
        from_doc: unused here; kept for the common handler signature.
    """
    tmp_path = None
    try:
        # The ``with`` block closes the archive, which was previously left
        # open.
        with zipfile.ZipFile(path) as archive:
            tmp_path = tempfile.mkdtemp(dir=insiderer.TMP_DIR)
            for info in archive.infolist():
                # extract() returns the sanitised path it actually wrote.
                # Joining the raw member name onto tmp_path let absolute or
                # "../" names point at (and later delete) files outside the
                # temporary directory.
                childpath = archive.extract(info, tmp_path)
                if os.path.isdir(childpath):
                    continue
                try:
                    children.append(
                        insiderer.get_metadata(childpath, info.filename))
                finally:
                    insiderer.safedelete(childpath)
    finally:
        if tmp_path:
            shutil.rmtree(tmp_path)
Пример #9
0
def application_zip(path, metadata, children, from_doc=False):
  """Analyse every file inside a zip archive as a child document.

  Args:
    path: location of the zip file.
    metadata: unused here; kept for the common handler signature.
    children: list that receives the result of ``insiderer.get_metadata`` for
      each non-directory member (mutated in place).
    from_doc: unused here; kept for the common handler signature.
  """
  tmp_path = None
  try:
    # The ``with`` block closes the archive, which was previously left open.
    with zipfile.ZipFile(path) as archive:
      tmp_path = tempfile.mkdtemp(dir=insiderer.TMP_DIR)
      for info in archive.infolist():
        # extract() returns the sanitised path it actually wrote. Joining the
        # raw member name onto tmp_path let absolute or "../" names point at
        # (and later delete) files outside the temporary directory.
        childpath = archive.extract(info, tmp_path)
        if os.path.isdir(childpath):
          continue
        try:
          children.append(insiderer.get_metadata(childpath, info.filename))
        finally:
          insiderer.safedelete(childpath)
  finally:
    if tmp_path:
      shutil.rmtree(tmp_path)