def is_good_extension(ext):
    """Tell whether an upload carrying file extension `ext` could succeed.

    The extension is mapped to a content-type; extensions/UploadDocument.py's
    _add_internal will later want to find that extension in CONTENT_TYPES,
    so it must survive the round trip extension -> content-type -> extension
    and be one of the CONTENT_TYPES values.

    `ext` may be given with or without its leading dot, and may be ''.
    Returns a boolean.
    """
    normalized = ext
    if normalized.startswith('.'):
        normalized = normalized[1:]
    # get_content_type() punts on '.JPG' &c., letting mimetypes.guess_type()
    # deal with it, so compare in lower case throughout.
    normalized = normalized.lower()

    content_type = get_content_type('foo.' + normalized)

    # This binary type is absolutely unacceptable to _add_internal.
    if content_type == 'application/octet-stream':
        return False

    # The content-type has to map back to exactly this extension ...
    if get_extension_for_type(content_type) != normalized:
        return False

    # ... and the extension has to be one that CONTENT_TYPES knows about.
    return normalized in CONTENT_TYPES.values()
def _param_is_true(params, name):
    """Return True when request parameter `name` is the string "true" (any case)."""
    value = params.get(name)
    return bool(value) and value.lower() == "true"


def _quote_shell_arg(value):
    """Quote `value` for interpolation into the uplib-add-document command line."""
    if sys.platform.startswith("win"):
        # cmd.exe does not treat single quotes as quoting characters, so keep
        # the double-quote form the command was originally built with there.
        return '"%s"' % value
    return pipes.quote(value)


def _add_internal(ostream, percent_done_fn, repo, response, params, content, wait):
    """Add an uploaded document to the repository via uplib-add-document.

    This can be called in several different ways.  In general, you post a
    multipart/form-data body which contains a "contenttype" for the
    document, and either a "URL" for the content, or a "content" parameter
    containing the actual content.  If both "URL" and "content" are
    present, the URL is added as the "original-url" value for the metadata,
    and if the content is HTML, it's used as the "original.html" and the
    URL is used to pull ancillary content referenced in it.

    Parameters:
      ostream -- if true, progress/result records are written to it with
          _rewrite_job_output() instead of replying through `response`.
      percent_done_fn -- optional callable; called with 40 once the add
          command has succeeded and with 100 once waiting is over.
      repo -- the repository; used for cookies, queries, the pending list,
          history and touch_doc().
      response -- used for error(), reply() and redirect() when there is
          no `ostream`.
      params -- mapping of request parameters.  Keys read here:
          "contenttype", "URL", "no-redirect", "documentname",
          "suppress-duplicates", "bury", "verbosity", "md-title",
          "md-authors", "md-date", "md-categories" and "metadata".
      content -- the uploaded document bits, or a false value when only a
          URL was given.
      wait -- if true, report the outcome (or failure) to the caller;
          otherwise failures are only logged with note().

    Returns None.  As a side effect the UPLIB_COOKIE environment variable
    of this process is set before the add command is run.  Any temporary
    file or directory created here is removed before returning, also when
    an exception propagates.
    """
    content_type = params.get("contenttype")
    url = params.get("URL")
    docname = params.get("documentname")
    noredir = _param_is_true(params, "no-redirect")
    suppress_duplicates = _param_is_true(params, "suppress-duplicates")
    bury = _param_is_true(params, "bury")
    verbosity = int(params.get("verbosity") or "0")

    uploadloc = url     # replaced by a temp location when content is staged
    tempf = None        # temp file or directory holding the staged content
    mdtmpfile = None    # temp file holding the "metadata" parameter

    try:
        if content:
            if wait and ostream:
                _rewrite_job_output(ostream, '{ state: 0, msg: "Caching page..."}')

            extension = CONTENT_TYPES.get(content_type)
            if not extension:
                if wait:
                    msg = "Don't know what to do with contenttype \"%s\"" % content_type
                    if ostream:
                        _rewrite_job_output(ostream, '{state: 1, msg: "' + urllib.quote(msg) + '"}')
                    else:
                        response.error(HTTPCodes.UNSUPPORTED_MEDIA_TYPE, msg)
                return

            lowered_type = content_type.lower()
            if lowered_type in ("text/html", "application/xhtml+xml"):
                # special case HTML/XHTML: stage into a directory of its own
                tempf = tempfile.mkdtemp()
                uploadloc = os.path.join(tempf, "original.html")
                # make sure that the folder for other parts exists, even if empty
                other_parts = os.path.join(tempf, "original_files")
                os.mkdir(other_parts)
                # remove our bookmarklet, if present
                content = _BOOKMARKLET_PATTERN.sub('', content)
                content = _ADD_FORM_PATTERN.sub('', content)
                # The result is not used below; the constructor call itself is
                # what matters here.  NOTE(review): presumably it writes
                # `uploadloc` and pulls in referenced parts -- confirm.
                c = _OurCacher(url, filename=uploadloc, bits=content, content_type=content_type)
                # re-check after caching, in case the folder is gone again
                if not os.path.exists(other_parts):
                    os.mkdir(other_parts)
            else:
                # special case 3x5 cards: keep their own suffix; everything
                # else gets the extension registered for its content type
                if (docname and lowered_type == "text/plain"
                    and os.path.splitext(docname)[1] == ".3x5"):
                    suffix = ".3x5"
                else:
                    suffix = "." + extension
                fd, tempf = tempfile.mkstemp(suffix)
                fp = os.fdopen(fd, "wb")
                try:
                    fp.write(content)
                finally:
                    fp.close()
                uploadloc = tempf

            if suppress_duplicates:
                fingerprint = calculate_originals_fingerprint(tempf)
                results = repo.do_query("sha-hash:" + fingerprint)
                if results:
                    # it's a duplicate; the staged copy is removed by the
                    # "finally" clause at the bottom
                    doc = results[0][1]
                    if ostream:
                        _rewrite_job_output(ostream, '{ state: 2, doc_id: "' + doc.id + '"}')
                    elif noredir:
                        response.reply(doc.id, "text/plain")
                    else:
                        response.redirect("/action/basic/dv_show?doc_id=%s" % doc.id)
                    return

        try:
            if not uploadloc:
                raise ValueError("Neither a URL nor any content was specified")

            # get a cookie for authentication
            cookie = repo.new_cookie(url or content[:100])
            cookie_str = '%s=%s; path=/; Secure' % (cookie.name(), cookie.value())
            os.environ["UPLIB_COOKIE"] = cookie_str

            doctitle = params.get("md-title")
            docauthors = params.get("md-authors")
            docdate = params.get("md-date")
            doccats = params.get("md-categories")
            metadata = params.get("metadata")
            existing_doc_id = None
            if metadata:
                # mkstemp() creates the file atomically, unlike mktemp()
                fd, mdtmpfile = tempfile.mkstemp()
                mdfp = os.fdopen(fd, "w")
                try:
                    mdfp.write(metadata)
                finally:
                    mdfp.close()
                # check to see if we're replacing an existing document
                md2 = read_metadata(StringIO.StringIO(metadata))
                existing_doc_id = md2.get("replacement-contents-for")
                if existing_doc_id and not repo.valid_doc_id(existing_doc_id):
                    raise ValueError("Invalid doc ID %s specified for replacement" % existing_doc_id)

            # now form the command
            if (repo.get_param("use-http", "false").lower() == "true") or _use_http:
                scheme = "http"
            else:
                scheme = "https"
            cmd = '%s --verbosity=%s --repository=%s://127.0.0.1:%s ' % (
                _uplib_add_document, verbosity, scheme, repo.port())
            if doctitle:
                cmd += ' --title=%s' % pipes.quote(doctitle)
            if docauthors:
                cmd += ' --authors=%s' % pipes.quote(docauthors)
            if docdate:
                cmd += ' --date=%s' % _quote_shell_arg(docdate)
            if doccats:
                cmd += ' --categories=%s' % pipes.quote(doccats)
            if mdtmpfile:
                cmd += ' --metadata=%s' % _quote_shell_arg(mdtmpfile)
            # uploadloc may be the caller-supplied URL, so it must be escaped
            cmd += ' %s' % _quote_shell_arg(uploadloc)

            if ostream:
                _rewrite_job_output(ostream, '{state: 0, msg: "' + urllib.quote(cmd) + '"}')

            # and invoke the command
            status, output, tsignal = subproc(cmd)
            note(4, "cmd is %s, status is %s, output is %s", repr(cmd), status, repr(output.strip()))
            if mdtmpfile:
                os.unlink(mdtmpfile)
                mdtmpfile = None

            if status == 0:
                # success; output should be doc-id
                doc_id = existing_doc_id or output.strip().split()[-1]
                note(4, "output is '%s'; doc_id for new doc is %s", output.strip(), doc_id)
                if wait and ostream:
                    _rewrite_job_output(ostream, '{ state: 1, doc_id: "' + doc_id + '", msg: "' + urllib.quote(output) + '"}')

                # wait for it to come on-line
                if percent_done_fn:
                    percent_done_fn(40)         # estimate 40% of work done on client side
                while not repo.valid_doc_id(doc_id):
                    if ostream:
                        pending = repo.list_pending(full=True)
                        s = _first(pending, lambda x: x['id'] == doc_id)
                        if not s:
                            # no longer pending; validity is re-checked below
                            break
                        dstatus = s['status']
                        if dstatus == 'error':
                            _rewrite_job_output(ostream, '{ state: 3, doc_id: "' + doc_id + '", msg: "' + urllib.quote(s['error']) + '"}')
                            break
                        if dstatus == 'unpacking':
                            msg = 'starting ripper process...'
                        elif dstatus == 'ripping':
                            msg = "ripping with ripper '" + s['ripper'] + "'..."
                        elif dstatus == 'moving':
                            msg = 'adding to registered document set...'
                        else:
                            # any other status: still report something
                            # rather than fail on an unbound `msg`
                            msg = "processing (status '%s')..." % dstatus
                        _rewrite_job_output(ostream, '{ state: 1, doc_id: "' + doc_id + '", msg: "' + urllib.quote(msg) + '"}')
                    time.sleep(1.0)
                if percent_done_fn:
                    percent_done_fn(100)        # finished

                if repo.valid_doc_id(doc_id):
                    if bury:
                        # wait up to 100 seconds for it to show up in history list
                        # after that, wait another second, then bury it
                        counter = 100
                        while counter > 0:
                            if doc_id in [x.id for x in repo.history()]:
                                break
                            counter -= 1
                            time.sleep(1)
                        time.sleep(1)
                        repo.touch_doc(doc_id, bury=True, notify=False)
                        note(3, "buried %s", doc_id)
                    if wait:
                        if ostream:
                            _rewrite_job_output(ostream, '{ state: 2, doc_id: "' + doc_id + '"}')
                        elif noredir:
                            response.reply(doc_id, "text/plain")
                        else:
                            response.redirect("/action/basic/dv_show?doc_id=%s" % doc_id)
            else:
                note("cmd <<%s>> failed with status %s:\n%s", cmd, status, output)
                if wait:
                    if ostream:
                        _rewrite_job_output(ostream, '{ state: 3, msg: "' + urllib.quote('Error processing the document:\n' + output) + '"}')
                    else:
                        response.error(HTTPCodes.INTERNAL_SERVER_ERROR, "<pre>" + htmlescape(output) + "</pre>")
        except:
            # Deliberately broad: this is the request boundary, and every
            # failure is either reported to the caller or logged.
            e = ''.join(traceback.format_exception(*sys.exc_info()))
            if wait:
                note(3, "Exception processing uplib-add-document request:\n%s", htmlescape(e))
                if ostream:
                    _rewrite_job_output(ostream, '{state: 3, msg: "' + urllib.quote("Exception processing uplib-add-document request:\n" + e) + '"}')
                else:
                    response.error(HTTPCodes.INTERNAL_SERVER_ERROR,
                                   "Exception processing uplib-add-document request:\n<pre>" +
                                   htmlescape(e) + "\n</pre>")
            else:
                note("Exception processing uplib-add-document request:\n%s", e)
    finally:
        # the metadata file is normally gone already; this covers failures
        # that happen before the add command has finished
        if mdtmpfile and os.path.exists(mdtmpfile):
            os.unlink(mdtmpfile)
        if tempf and os.path.isfile(tempf):
            os.unlink(tempf)
        elif tempf and os.path.isdir(tempf):
            shutil.rmtree(tempf)