def _add_internal (ostream, percent_done_fn, repo, response, params, content, wait):

    # This can be called in several different ways.  In general, you post a
    # multipart/form-data body which contains a "contenttype" for the document,
    # and either a "URL" for the content, or a "content" parameter containing
    # the actual content.  If both "URL" and "content" are present, the URL is
    # added as the "original-url" value for the metadata, and if the content is
    # HTML, it's used as the "original.html" and the URL is used to pull
    # ancillary content referenced in it.

    content_type = params.get("contenttype")
    url = params.get("URL")
    noredir = params.get("no-redirect")
    noredir = noredir and (noredir.lower() == "true")
    uploadloc = url
    docname = params.get("documentname")
    tempf = None
    suppress_duplicates = params.get("suppress-duplicates")
    suppress_duplicates = suppress_duplicates and (suppress_duplicates.lower() == "true")
    bury = params.get("bury")
    bury = bury and (bury.lower() == "true")
    verbosity = int(params.get("verbosity") or "0")

    if content:
        if wait and ostream:
            _rewrite_job_output(ostream, '{ state: 0, msg: "Caching page..."}')
        extension = CONTENT_TYPES.get(content_type)
        if not extension:
            if wait:
                msg = "Don't know what to do with contenttype \"%s\"" % content_type
                if ostream:
                    _rewrite_job_output(ostream, '{state: 1, msg: "' + urllib.quote(msg) + '"}')
                else:
                    response.error(HTTPCodes.UNSUPPORTED_MEDIA_TYPE, msg)
            return

        # special case HTML/XHTML
        if content_type.lower() in ("text/html", "application/xhtml+xml"):
            tempf = tempfile.mkdtemp()
            uploadloc = os.path.join(tempf, "original.html")
            # make sure that the folder for other parts exists, even if empty
            other_parts = os.path.join(tempf, "original_files")
            if not os.path.exists(other_parts):
                os.mkdir(other_parts)
            # remove our bookmarklet, if present
            content = _BOOKMARKLET_PATTERN.sub('', content)
            content = _ADD_FORM_PATTERN.sub('', content)
            c = _OurCacher(url, filename=uploadloc, bits=content, content_type=content_type)
        # special case 3x5 cards
        elif (docname and (content_type.lower() == "text/plain") and
              (os.path.splitext(docname)[1] == ".3x5")):
            fd, tempf = tempfile.mkstemp(".3x5")
            fp = os.fdopen(fd, "wb")
            fp.write(content)
            fp.close()
            uploadloc = tempf
        else:
            fd, tempf = tempfile.mkstemp("." + extension)
            fp = os.fdopen(fd, "wb")
            fp.write(content)
            fp.close()
            uploadloc = tempf

        if suppress_duplicates:
            fingerprint = calculate_originals_fingerprint(tempf)
            results = repo.do_query("sha-hash:" + fingerprint)
            if results:
                # it's a duplicate
                doc = results[0][1]
                if os.path.isdir(tempf):
                    shutil.rmtree(tempf)
                elif os.path.exists(tempf):
                    os.remove(tempf)
                if ostream:
                    _rewrite_job_output(ostream, '{ state: 2, doc_id: "' + doc.id + '"}')
                elif noredir:
                    response.reply(doc.id, "text/plain")
                else:
                    response.redirect("/action/basic/dv_show?doc_id=%s" % doc.id)
                return

    try:
        try:
            # get a cookie for authentication
            cookie = repo.new_cookie(url or content[:min(100, len(content))])
            cookie_str = '%s=%s; path=/; Secure' % (cookie.name(), cookie.value())
            os.environ["UPLIB_COOKIE"] = cookie_str
            doctitle = params.get("md-title")
            docauthors = params.get("md-authors")
            docdate = params.get("md-date")
            doccats = params.get("md-categories")
            metadata = params.get("metadata")
            if metadata:
                mdtmpfile = tempfile.mktemp()
                open(mdtmpfile, "w").write(metadata)
                # check to see if we're replacing an existing document
                md2 = read_metadata(StringIO.StringIO(metadata))
                existing_doc_id = md2.get("replacement-contents-for")
                if existing_doc_id and not repo.valid_doc_id(existing_doc_id):
                    raise ValueError("Invalid doc ID %s specified for replacement" % existing_doc_id)
            else:
                mdtmpfile = None
                existing_doc_id = None

            # now form the command
            scheme = (((repo.get_param("use-http", "false").lower() == "true") or _use_http)
                      and "http" or "https")
            cmd = '%s --verbosity=%s --repository=%s://127.0.0.1:%s ' % (
                _uplib_add_document, verbosity, scheme, repo.port())
            if doctitle:
                cmd += ' --title=%s' % pipes.quote(doctitle)
            if docauthors:
                cmd += ' --authors=%s' % pipes.quote(docauthors)
            if docdate:
                cmd += ' --date="%s"' % docdate
            if doccats:
                cmd += ' --categories=%s' % pipes.quote(doccats)
            if mdtmpfile:
                cmd += ' --metadata="%s"' % mdtmpfile
            cmd += ' "%s"' % uploadloc
            if ostream:
                _rewrite_job_output(ostream, '{state: 0, msg: "' + urllib.quote(cmd) + '"}')

            # and invoke the command
            status, output, tsignal = subproc(cmd)
            note(4, "cmd is %s, status is %s, output is %s", repr(cmd), status, repr(output.strip()))
            if mdtmpfile:
                os.unlink(mdtmpfile)
            if status == 0:
                # success; output should be doc-id
                doc_id = existing_doc_id or output.strip().split()[-1]
                note(4, "output is '%s'; doc_id for new doc is %s", output.strip(), doc_id)
                if wait and ostream:
                    _rewrite_job_output(ostream, '{ state: 1, doc_id: "' + doc_id +
                                        '", msg: "' + urllib.quote(output) + '"}')
                # wait for it to come on-line
                if percent_done_fn:
                    percent_done_fn(40)         # estimate 40% of work done on client side
                while not repo.valid_doc_id(doc_id):
                    if ostream:
                        pending = repo.list_pending(full=True)
                        s = _first(pending, lambda x: x['id'] == doc_id)
                        if not s:
                            break
                        dstatus = s['status']
                        if dstatus == 'error':
                            msg = 'server-side error incorporating document'
                            _rewrite_job_output(ostream, '{ state: 3, doc_id: "' + doc_id +
                                                '", msg: "' + urllib.quote(s['error']) + '"}')
                            break
                        if dstatus == 'unpacking':
                            msg = 'starting ripper process...'
                        elif dstatus == 'ripping':
                            msg = "ripping with ripper '" + s['ripper'] + "'..."
                        elif dstatus == 'moving':
                            msg = 'adding to registered document set...'
                        _rewrite_job_output(ostream, '{ state: 1, doc_id: "' + doc_id +
                                            '", msg: "' + urllib.quote(msg) + '"}')
                    time.sleep(1.0)
                if percent_done_fn:
                    percent_done_fn(100)        # finished
                if repo.valid_doc_id(doc_id):
                    if bury:
                        # wait up to 100 seconds for it to show up in the history list;
                        # after that, wait another second, then bury it
                        counter = 100
                        while counter > 0:
                            h = [x.id for x in repo.history()]
                            if doc_id in h:
                                break
                            counter -= 1
                            time.sleep(1)
                        time.sleep(1)
                        repo.touch_doc(doc_id, bury=True, notify=False)
                        note(3, "buried %s", doc_id)
                    if wait:
                        if ostream:
                            _rewrite_job_output(ostream, '{ state: 2, doc_id: "' + doc_id + '"}')
                        elif noredir:
                            response.reply(doc_id, "text/plain")
                        else:
                            response.redirect("/action/basic/dv_show?doc_id=%s" % doc_id)
            else:
                note("cmd <<%s>> failed with status %s:\n%s", cmd, status, output)
                if wait:
                    if ostream:
                        _rewrite_job_output(ostream, '{ state: 3, msg: "' + urllib.quote(
                            'Error processing the document:\n' + output) + '"}')
                    else:
                        response.error(HTTPCodes.INTERNAL_SERVER_ERROR,
                                       "<pre>" + htmlescape(output) + "</pre>")
        except:
            e = ''.join(traceback.format_exception(*sys.exc_info()))
            if wait:
                note(3, "Exception processing uplib-add-document request:\n%s", htmlescape(e))
                if ostream:
                    _rewrite_job_output(ostream, '{state: 3, msg: "' + urllib.quote(
                        "Exception processing uplib-add-document request:\n" + e) + '"}')
                else:
                    response.error(HTTPCodes.INTERNAL_SERVER_ERROR,
                                   "Exception processing uplib-add-document request:\n<pre>" +
                                   htmlescape(e) + "\n</pre>")
            else:
                note("Exception processing uplib-add-document request:\n%s", e)
    finally:
        if tempf and os.path.isfile(tempf):
            os.unlink(tempf)
        elif tempf and os.path.isdir(tempf):
            shutil.rmtree(tempf)
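
# A minimal client-side sketch of the multipart/form-data request that
# _add_internal expects.  This is illustrative only: the action path
# ("/action/basic/repo_add"), the repository URL, and the cookie header value
# are assumptions, not taken from this module; only the form-field names
# ("contenttype", "content", "wait", "suppress-duplicates") come from the
# handler above.

def _example_add_document(repo_url, cookie_header, html):
    import urllib2
    # the stdlib has no multipart encoder, so assemble the body by hand
    boundary = "----uplib-example-boundary"
    fields = (("contenttype", "text/html"),
              ("content", html),
              ("wait", "true"),
              ("suppress-duplicates", "true"))
    parts = []
    for fieldname, value in fields:
        parts.append('--%s\r\nContent-Disposition: form-data; name="%s"\r\n\r\n%s\r\n'
                     % (boundary, fieldname, value))
    parts.append('--%s--\r\n' % boundary)
    req = urllib2.Request(repo_url + "/action/basic/repo_add", ''.join(parts))
    req.add_header("Content-Type", "multipart/form-data; boundary=%s" % boundary)
    req.add_header("Cookie", cookie_header)
    return urllib2.urlopen(req).read()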
def _add_vcards_file (repo, response, tfile):
    try:
        conf = configurator.default_configurator()
        update_configuration(conf)
        tal = ensure_assembly_line(conf.get("assembly-line"))
        cards = []
        try:
            parsed = vCards.myformat(tfile)
            parsed['upload'] = False
            parsed['usepng'] = True
            for card in parsed.get('parsed-cards'):
                # see if there's already a card for this name
                query = 'apparent-mime-type:"%s" AND vcard-name:"%s"' % (
                    vCard.format_mimetype, card.fn.value)
                hits = repo.do_query(query)
                if hits:
                    if 'metadata' not in parsed:
                        parsed['metadata'] = {}
                    parsed['metadata']['version-of'] = hits[0][1].id
                p = vCard(card, parsed)
                # calculate fingerprint
                fd, filename = tempfile.mkstemp()
                fp = os.fdopen(fd, "wb")
                p.write_to_file(fp)
                fp.close()
                fingerprint = calculate_originals_fingerprint(filename)
                # look up fingerprint in repo to see if we already have it
                hits = repo.do_query('sha-hash:%s' % fingerprint)
                if hits:
                    # already there, so skip this one
                    note(3, "skipping '%s', already in repo...", card.fn.value)
                    continue
                # new card, so add it
                pinst = p.process()
                if isinstance(pinst, DocumentParser):
                    try:
                        folder = repo.create_document_folder(repo.pending_folder())
                        id = os.path.basename(folder)
                        note("using id %s for %s...", id, card.fn.value)
                        # add the folder to the repository
                        process_folder(repo, id, pinst.folder, True)
                        flesh_out_folder(id, None, None, repo, None, None)
                        note("added card for %s\n" % card.fn.value)
                        cards.append((id, card.fn.value))
                    except:
                        msg = "Exception processing vCard; vCard is\n%s\nException was\n%s\n" % (
                            card, ''.join(traceback.format_exception(*sys.exc_info())))
                        note(0, msg)
        finally:
            if tal:
                from uplib.addDocument import AssemblyLine
                shutil.rmtree(AssemblyLine)
            if os.path.exists(tfile):
                os.unlink(tfile)
    except:
        msg = "Exception processing vcards:\n%s\n" % ''.join(
            traceback.format_exception(*sys.exc_info()))
        note(0, msg)
        response.error(HTTPCodes.INTERNAL_SERVER_ERROR, msg)
    else:
        response.reply('\n'.join(['%20s: %s' % (x[0], x[1]) for x in cards]))
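
# A sketch of the duplicate check used above, factored out for clarity.  It
# relies only on calculate_originals_fingerprint() and repo.do_query(), both
# already used in this module; the helper name itself is hypothetical.

def _example_is_duplicate(repo, path):
    # Fingerprint the file the same way _add_vcards_file does, then ask the
    # repository whether any existing document already carries that hash.
    fingerprint = calculate_originals_fingerprint(path)
    return bool(repo.do_query('sha-hash:%s' % fingerprint))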
def _add_icalendar_file (repo, response, tfile):
    try:
        conf = configurator.default_configurator()
        update_configuration(conf)
        tal = ensure_assembly_line(conf.get("assembly-line"))
        try:
            parsed = iCalendar.myformat(tfile)
            if not isinstance(parsed, dict):
                note(0, "Can't parse supposed iCalendar file %s", tfile)
                response.error(HTTPCodes.INTERNAL_SERVER_ERROR, "Can't parse file")
                return
            resp = response.open("text/plain")
            for event, name, uid in parsed.get('parsed-events'):
                if hasattr(event, "dtstart"):
                    identifier = "%s @ %s" % (name, event.dtstart.value)
                else:
                    identifier = name
                # see if there's already an event with this UID
                query = 'apparent-mime-type:"%s" AND event-uid:"%s"' % (
                    iCalendarEventParser.format_mimetype, uid)
                hits = repo.do_query(query)
                if hits:
                    if 'metadata' not in parsed:
                        parsed['metadata'] = {}
                    parsed['metadata']['version-of'] = hits[0][1].id
                if event.name == "VEVENT":
                    p = iCalendarEventParser(name, {"icsname": name,
                                                    "icsuid": uid,
                                                    "icsevent": event,
                                                    "upload": False,
                                                    "usepng": True,
                                                    "metadata": parsed.get("metadata") or {},
                                                    })
                else:
                    note(3, "No supported iCalendar subtype found in %s", identifier)
                    p = None
                if p:
                    # calculate fingerprint
                    fd, filename = tempfile.mkstemp(".ics")
                    fp = os.fdopen(fd, "wb")
                    p.write_to_file(fp)
                    fp.close()
                    fingerprint = calculate_originals_fingerprint(filename)
                    # look up fingerprint in repo to see if we already have it
                    hits = repo.do_query('sha-hash:%s' % fingerprint)
                    if hits:
                        # already there, so skip this one
                        note(3, "skipping '%s', already in repo...", identifier)
                        resp.write("skipping '%s', already in repo\n" % identifier)
                        continue
                    # new event, so add it
                    p.metadata["sha-hash"] = fingerprint
                    pinst = p.process()
                    if isinstance(pinst, DocumentParser):
                        try:
                            folder = repo.create_document_folder(repo.pending_folder())
                            id = os.path.basename(folder)
                            # add the folder to the repository
                            process_folder(repo, id, pinst.folder, True)
                            flesh_out_folder(id, None, None, repo, None, None)
                            resp.write("added event for %s\n" % identifier)
                        except:
                            msg = "Exception processing event; event is\n%s\nException was\n%s\n" % (
                                event, ''.join(traceback.format_exception(*sys.exc_info())))
                            note(0, msg)
                            resp.write(msg)
        finally:
            if tal:
                from uplib.addDocument import AssemblyLine
                shutil.rmtree(AssemblyLine)
            if os.path.exists(tfile):
                os.unlink(tfile)
    except:
        msg = "Exception processing iCalendar:\n%s\n" % ''.join(
            traceback.format_exception(*sys.exc_info()))
        note(0, msg)
        response.error(HTTPCodes.INTERNAL_SERVER_ERROR, msg)
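
# A hedged usage sketch for the two handlers above: spool the uploaded bytes to
# a temporary file and hand it off by MIME type.  The MIME-type strings and the
# calling convention are assumptions, not part of this module.  Note that both
# handlers delete tfile themselves, so the caller only cleans up on the
# fallthrough path.

def _example_dispatch_upload(repo, response, payload, mimetype):
    fd, tfile = tempfile.mkstemp()
    fp = os.fdopen(fd, "wb")
    fp.write(payload)
    fp.close()
    if mimetype in ("text/calendar", "application/ics"):
        _add_icalendar_file(repo, response, tfile)
    elif mimetype in ("text/x-vcard", "text/vcard"):
        _add_vcards_file(repo, response, tfile)
    else:
        os.unlink(tfile)
        response.error(HTTPCodes.UNSUPPORTED_MEDIA_TYPE,
                       "unsupported MIME type %s" % mimetype)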