def _sanitized_title(raw_title):
    """Reduce raw_title to a conservative filename stem.

    Keeps only the final path component, trims surrounding spaces, tabs and
    '.,-;' punctuation, then replaces every run of characters outside
    [A-Za-z0-9 ._-] with a single underscore.  The result may be '' (for
    example when raw_title ends with a path separator or consists solely of
    the trimmed punctuation).
    """
    stem = os.path.basename(raw_title).strip(' \t.,-;')
    return re.sub(r'[^A-Za-z0-9 \._-]+', '_', stem)   # sanitize charset


def doc_title_with_good_suffix(doc):
    """Return title (perhaps unchanged) which reflects the MIME type.

    For example, a text/plain 'Readme' will be mapped to 'Readme.txt'.

    The stem is the sanitized form of the first non-empty value among the
    'title' metadata, doc.original_name() and doc.id.  If that sanitizes to
    an empty string, doc.id is sanitized and used instead, so the result is
    not just a bare extension.

    The extension is taken from originals_filename_for_doc(doc), or from the
    value returned by _check_for_webpage_complete() when that is truthy.  It
    is kept when is_good_extension() accepts it; otherwise one is synthesized
    from the 'apparent-mime-type' or 'content-type' metadata, defaulting to
    'text/plain'.

    Raises AssertionError if the synthesized extension is not acceptable to
    is_good_extension().
    """
    orig_ext = os.path.splitext(originals_filename_for_doc(doc))[1]
    wpc = _check_for_webpage_complete(doc.originals_path())
    if wpc:
        orig_ext = os.path.splitext(wpc)[1]

    filename = _sanitized_title(doc.get_metadata('title') or
                                doc.original_name() or
                                doc.id)
    if not filename and doc.id:
        # The preferred name sanitized down to nothing; without this the
        # function would return only an extension, e.g. '.txt'.
        filename = _sanitized_title(doc.id)
    assert '\r' not in filename
    assert '\n' not in filename

    orig_ext = orig_ext.lower()   # e.g. for 'ReadMe', orig_ext == ''
    orig_ext_is_good = bool(orig_ext) and is_good_extension(orig_ext)
    if orig_ext_is_good and filename.lower().endswith(orig_ext):
        return filename

    # Popular entries from mimetypes.common_types not found in CONTENT_TYPES:
    #   image/pict .{pct,pic,pict}, application/rtf .rtf  (no uplib parser)
    # Popular entries from mimetypes.types_map not found in CONTENT_TYPES:
    #   audio .aiff + many more, video .avi + many more, text .bat .h .c .css,
    #   html .htm .xml, msword .dot, image .bmp .jpeg .tiff, postscript .eps

    if orig_ext_is_good:
        # The title merely lacks the (acceptable) original extension.
        ext = orig_ext
    else:
        # Originals lacked an extension, or had a bad extension.  Synthesize one.
        ext = get_extension_for_type(doc.get_metadata('apparent-mime-type') or
                                     doc.get_metadata('content-type') or
                                     'text/plain')
        assert is_good_extension(ext)
    if not ext.startswith("."):
        ext = "." + ext
    return filename + ext
def is_good_extension(ext):
    """Tell whether a file carrying this extension could be uploaded.

    The extension (with or without one leading dot, in any letter case) is
    mapped to a content-type via get_content_type().  It is "good" only when
    that type is not 'application/octet-stream', get_extension_for_type()
    maps the type back to this same extension, and the extension is one of
    the values of CONTENT_TYPES -- which is where
    extensions/UploadDocument.py's _add_internal will look for it.

    Returns a boolean.
    """
    suffix = ext
    if suffix.startswith('.'):   # absent when ext is '' or already bare
        suffix = suffix[1:]
    # get_content_type() punts on '.JPG' &c., letting mimetypes.guess_type()
    # deal with it, so normalize the case ourselves first.
    suffix = suffix.lower()

    content_type = get_content_type('foo.' + suffix)
    if content_type == 'application/octet-stream':
        # This binary type is absolutely unacceptable to _add_internal.
        return False
    if get_extension_for_type(content_type) != suffix:
        # The type's canonical extension is some other spelling.
        return False
    return suffix in CONTENT_TYPES.values()
def _separate_images(html):
    """Pull inline data-URI images out of html.

    Every match of _DATA_URI_PATTERN is replaced by
    src="images/image-N.EXT", where N numbers the matches from 0 in document
    order and EXT is get_extension_for_type() of the match's
    "maintype/subtype" content type.

    Returns a tuple (new_html, images).  images maps each generated image
    name to (content_type, data); data is base64-decoded when the match's
    encoding group is "base64" and is otherwise the matched text unchanged.
    When nothing matches, html is returned as given and images is empty.
    """
    images = {}
    pieces = []    # alternating untouched html and replacement attributes
    cursor = 0     # end of the previous match within the original html

    # One left-to-right pass: the html is scanned once and joined once,
    # rather than re-searched and rebuilt after every replacement.
    for counter, m in enumerate(_DATA_URI_PATTERN.finditer(html)):
        content_type = "%s/%s" % (m.group("maintype"), m.group("subtype"))
        data = m.group("data")
        if m.group("encoding") == "base64":
            # NOTE(review): base64.decodestring was removed in Python 3.9;
            # use a current base64 decoder if this module is ever ported.
            data = base64.decodestring(data)
        # NOTE(review): non-base64 payloads are stored exactly as matched;
        # confirm whether callers expect percent-decoding.
        image_name = "image-%s.%s" % (counter, get_extension_for_type(content_type))
        images[image_name] = (content_type, data)
        pieces.append(html[cursor:m.start()])
        pieces.append('src="images/%s"' % image_name)
        cursor = m.end()

    if not images:
        return html, images
    pieces.append(html[cursor:])
    return "".join(pieces), images