def get_page_images (self):

    tf = tempfile.mktemp(".pnm")
    cmd = ('"%s" --input-format jp2 --input "%s" --output-format pnm --output "%s"'
           % (JASPER, self.doc, tf))
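    # JasPer decodes the JPEG 2000 original into a temporary PNM image;
    # rendered, the command looks something like this (paths illustrative):
    #   "/usr/bin/jasper" --input-format jp2 --input "/repo/doc/document.jp2" \
    #       --output-format pnm --output "/tmp/tmpXYZ.pnm"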
    try:
        status, output, tsignal = subproc(cmd)
        if status == 0:
            # success
            img = Image.open(tf)
            imagespath = self.images_path()
            os.mkdir(imagespath)
            if self.uses_png:
                png_file_name = os.path.join(imagespath, "page00001.png")
                img.save(png_file_name, "PNG")
            else:
                if convert_image_to_tiff(tf, imagespath):
                    note(3, "created tiff file in %s", imagespath)
        else:
            note("Can't convert %s.  Output was %s.", self.doc, output)
            note(4, "cmd was %s", cmd)
            note(4, "tfile %s %s", tf, (os.path.exists(tf) and "exists") or "does not exist")
            raise RuntimeError(output)
    finally:
        if os.path.exists(tf):
            os.unlink(tf)
def get_images_for_page (page_index, wordboxes, dpi, images_dir):
    pageimages = []
    filepath = os.path.join(images_dir, "page%05d.png" % (page_index + 1))
    if os.path.exists(filepath):
        wordboxes_file = tempfile.mktemp()
        try:
            boxlist = []
            if wordboxes:
                # first, write out list of wordboxes, in Leptonica BOXA format
                for box in wordboxes:
                    x, y, w, h = (int(box.left() * dpi / 72.0), int(box.top() * dpi / 72.0),
                                  int(box.width() * dpi / 72.0), int(box.height() * dpi / 72.0))
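                    # wordbox coordinates are in points (1/72 inch), scaled
                    # here to image pixels; e.g. at 300 dpi a 36-pt-wide box
                    # becomes 36 * 300/72 = 150 px wide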
                    if (w > 0) and (h > 0):
                        boxlist.append((x, y, w, h))
                if len(boxlist) > 0:
                    fp = open(wordboxes_file, "wb")
                    fp.write("\nBoxa Version 2\nNumber of boxes = %d\n" % len(boxlist))
                    for i in range(len(boxlist)):
                        fp.write("  Box[%d]: " % i + "x = %d, y = %d, w = %d, h = %d\n" % boxlist[i])
                    fp.close()
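                    # the file just written looks like this (values illustrative):
                    #
                    #   Boxa Version 2
                    #   Number of boxes = 2
                    #     Box[0]: x = 100, y = 200, w = 50, h = 12
                    #     Box[1]: x = 100, y = 220, w = 80, h = 12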
            # now, run the finder on the page image plus the list of wordboxes
            debug_arg = (debug and "--debug") or " "
            cmd = "%s %s %s %s %s" % (FINDIMAGES_PROGRAM, debug_arg, dpi, filepath, (boxlist and wordboxes_file) or "-")
            note(4, "findimages cmd is <<%s>>", cmd)
            status, output, tsignal = subproc(cmd)
            if status == 0:
                for line in [x.strip() for x in output.split('\n') if x.strip()]:
                    if not line.startswith("halftone "):
                        continue
                    pageimages.append((str(page_index) + " " + line).split())
            else:
                note(3, "findimages command <%s> returns bad status %s:\n%s" % (cmd, status, output))
        finally:
            # remove the temp file
            if os.path.exists(wordboxes_file):
                os.unlink(wordboxes_file)
                # note("%d:  wordboxes file is %s", page_index, wordboxes_file)
    return pageimages
def process_tarred_folder (repo, id, tarfile, metadata):
    # create a new folder, and populate it
    dirname = tempfile.mktemp()
    try:
        os.mkdir(dirname)
        os.chmod(dirname, 0700)
        cmd = UNTAR_CMD % (dirname, TAR, tarfile)
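        # UNTAR_CMD is presumably along the lines of
        #   cd "<dirname>" && <TAR> xf "<tarfile>"
        # (template arguments: target directory, tar binary, tarball path)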
        note(2, "Untarring folder into temporary directory %s", dirname)
        status, output, signal = subproc(cmd)
        if status == 0:
            note(2, "Successfully untarred folder into %s", dirname)
            if metadata:
                update_metadata(os.path.join(dirname, "metadata.txt"), metadata)
            if (os.path.exists(os.path.join(dirname, "document.tiff")) or
                os.path.isdir(os.path.join(dirname, "page-images"))):
                return process_folder(repo, id, dirname, true)
            else:
                raise Error("invalid folder -- no page images file")
        else:
            raise Error("Problem untarring folder:\n%s" % output)
    finally:
        if os.path.exists(dirname):
            shutil.rmtree(dirname)        
def index_folder (folder, repo_index_dir):

    update_configuration()

    docs_dir, doc_id = os.path.split(folder)
    # initialized here so the status check after the lock is released
    # is safe even if subproc raises
    status, output = 0, ""
    SECTION_LOCK.acquire()
    try:
        try:
            if LUCENE == 'jcc':
                c = get_context(repo_index_dir)
                c.index(folder, doc_id)
            else:
                indexingcmd = INDEXING_ADD_CMD % (JAVA, DEBUG_FLAGS, INDEXING_PROPERTIES, LUCENE_JAR, INDEXING_JAR, repo_index_dir, docs_dir, doc_id)
                note(3, "  indexing with %s", indexingcmd)
                status, output, tsignal = subproc(indexingcmd)
        except:
            note(0, "Can't index folder %s:\n%s",
                 folder, ''.join(traceback.format_exception(*sys.exc_info())))
    finally:
        SECTION_LOCK.release()
    if LUCENE != 'jcc':
        note(3, "  indexing output is <%s>", output)
        if status != 0:
            raise Error ("%s signals non-zero exit status %d attempting to index %s:\n%s" % (JAVA, status, folder, output))
def remove_from_index (repo_index_dir, doc_id):

    update_configuration()

    if LUCENE == 'jcc':
        SECTION_LOCK.acquire()
        try:
            c = get_context(repo_index_dir)
            c.remove(doc_id)
        finally:
            SECTION_LOCK.release()

    else:

        indexingcmd = INDEXING_REMOVE_CMD % (JAVA, DEBUG_FLAGS, "", LUCENE_JAR, INDEXING_JAR, repo_index_dir, doc_id)
        note(3, "  de-indexing with %s", indexingcmd)
        SECTION_LOCK.acquire()
        try:
            status, output, tsignal = subproc(indexingcmd)
        finally:
            SECTION_LOCK.release()
        note(3, "  indexing output is <%s>", output)
        if status != 0:
            raise Error ("%s signals non-zero exit status %d attempting to remove %s:\n%s" % (JAVA, status, doc_id, output))
def do_thumbnails (dirpath, output_dir, **params):
    note(2, "  thumbnailing in %s...", dirpath)
    tmpdir = tempfile.mktemp()
    retval = params.get('returnvalue', false)
    doc_metadata_path = os.path.join(dirpath, "metadata.txt")
    try:
        os.mkdir(tmpdir)
        os.chmod(tmpdir, 0700)
        try:

            md = read_metadata(doc_metadata_path)
            is_temporary_doc = md.get("temporary-contents")
            if is_temporary_doc and (is_temporary_doc == "true"):
                # temporary -- don't spend much time on this
                create_temporary_icons (md, dirpath, output_dir, params)
                retval = true
                return retval

            if os.path.exists(os.path.join(dirpath, "document.tiff")):
                # contains a single (possibly multi-page) TIFF file to be split
                tiffmaster = os.path.join(tmpdir, "master.tiff")
                split_command = (TIFF_SPLIT_CMD
                                 % (TIFFCP, os.path.join(dirpath, "document.tiff"), tiffmaster,
                                    TIFFSPLIT, tiffmaster, os.path.join(tmpdir, "x")))
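                # TIFF_SPLIT_CMD presumably copies the document to a master
                # file with tiffcp and then breaks it into one file per page
                # with tiffsplit, roughly:
                #   tiffcp .../document.tiff .../master.tiff && \
                #   tiffsplit .../master.tiff .../x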
                status, output, tsignal = subproc(split_command)
                if status != 0:
                    raise Error ("'%s' signals non-zero exit status %d in %s => %s"
                                 % (split_command, status, dirpath, tmpdir))
                parts_dir = tmpdir
                filecheck_fn = lambda fn: fn[0] == "x"
            elif (os.path.exists(os.path.join(dirpath, "page-images")) and
                  os.path.isdir(os.path.join(dirpath, "page-images"))):
                # contains directory full of PNG page images
                parts_dir = os.path.join(dirpath, "page-images")
                filecheck_fn = lambda fn: (fn.startswith('page') and fn.endswith('.png'))
            else:
                raise Error("No page images for document in %s" % dirpath)

            tiff_parts = os.listdir(parts_dir)
            if len(tiff_parts) < 1:
                raise Error("No pages in split tiff file directory after split!")
            # either a PNG-images or a TIFF split will sort properly in lexicographic order
            tiff_parts.sort()
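            # (PNG pages are named page00001.png, page00002.png, ...;
            # tiffsplit's output names, e.g. xaaa.tif, xaab.tif, sort the same way)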

            # see if there's a document icon legend and info about the DPI of the tiff file
            legend = md.get('document-icon-legend')
            tiff_dpi = int(md.get('images-dpi') or md.get('tiff-dpi') or params.get('images-dpi') or 0)
            page_numbers_v = md.get('page-numbers')
            page_numbers = (page_numbers_v and figure_page_numbers(page_numbers_v, dirpath))
            first_page = int(md.get('first-page-number', 1))
            skips = md.get('document-bbox-pages-to-skip', '')
            if skips:
                parts = string.split(skips, ':')
                bbox_skips = []
                for part in parts:
                    bbox_skips = bbox_skips + map(int, string.split(part, ','))
            else:
                bbox_skips = None

            # figure bounding box for imaged page
            page_count = 0
            bbox = None
            note(2, "    calculating bounding box for large pages...")
            dont_crop = md.get('dont-crop-big-thumbnails', false)
            if AUTO_CROP_BIG_THUMBNAILS and not dont_crop:
                do_bbox = true
            else:
                do_bbox = false
            for tiff_part in tiff_parts:
                if not filecheck_fn(tiff_part):
                    continue
                if page_count == 0:
                    # find the width and height of the document
                    docwidth, docheight = figure_doc_size(os.path.join(parts_dir, tiff_part))
                    if not do_bbox:
                        bbox = (0, 0, docwidth, docheight)
                if do_bbox:
                    bbox = figure_bbox (os.path.join(parts_dir, tiff_part), page_count, bbox, bbox_skips)
                if (bbox and bbox[0] == 0) and (bbox[1] == 0) and (bbox[2] >= docwidth) and (bbox[3] >= docheight):
                    # don't bother, there's no area to crop already
                    do_bbox = false
                page_count = page_count + 1
            if page_count == 0:
                raise Error("No pages in split tiff file directory after split!")
            note(2, "      final bbox is %s, page_count is %d", bbox, page_count)

            if USE_VIRTUAL_INK:
                note(2, "      alpha channels will be added to large thumbnails...")

            # now make the thumbnails
            big_thumbnail_size = []
            small_thumbnail_size = []
            icon_size = []
            page_index = 0
            for tiff_part in tiff_parts:
                if not filecheck_fn(tiff_part):
                    note(3, "    skipping %s", tiff_part)
                    continue
                tiff_path = os.path.join(parts_dir, tiff_part)
                if page_numbers:
                    page_no_string = page_numbers.get(page_index)
                else:
                    page_no_string = None
                note (2, "    page %d%s", page_index, (page_no_string and "   (%s)" % page_no_string) or "")
                try:
                    if not create_thumbnail(tiff_path, tiff_dpi, output_dir,
                                            page_index, first_page, page_count, bbox, bbox_skips,
                                            big_thumbnail_size, small_thumbnail_size, icon_size,
                                            params.get('maxwidth'), params.get('maxheight'), params.get('maxscaling'),
                                            params.get('thumbnail_strategy'), legend, page_no_string):
                        raise Error ("Can't create thumbnail for page %d in %s (of %s)" % (page_index, tiff_path, dirpath))
                except Exception, x:
                    doc_id = os.path.split(dirpath)[1]
                    note("exception creating thumbnails for page %d of document %s:\n%s", page_index, doc_id,
                         string.join(traceback.format_exception(*sys.exc_info()), ""))
                    raise AbortDocumentIncorporation(doc_id, str(x))

                if page_index == 0:
                    bt_width = big_thumbnail_size[0]
                    bt_height = big_thumbnail_size[1]
                    st_width = small_thumbnail_size[0]
                    st_height = small_thumbnail_size[1]
                else:
                    bt_width = max(bt_width, big_thumbnail_size[0])
                    bt_height = max(bt_height, big_thumbnail_size[1])
                    st_width = max(st_width, small_thumbnail_size[0])
                    st_height = max(st_height, small_thumbnail_size[1])
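                # st_scaling: the mean of the horizontal and vertical shrink
                # ratios from the full page size down to the small thumbnail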
                st_scaling = (float(st_width)/float(docwidth) + float(st_height)/float(docheight)) / 2.0
                page_index = page_index + 1

            d = {"page-count" : str(page_count),
                 "tiff-width" : str(docwidth),
                 "images-width" : str(docwidth),
                 "images-size" : "%d,%d" % (docwidth, docheight),
                 "cropping-bounding-box" : "%d,%d;%d,%d" % (bbox),
                 "big-thumbnail-size" : "%s,%s" % (bt_width, bt_height),
                 "small-thumbnail-size" : "%s,%s" % (st_width, st_height),
                 "small-thumbnail-scaling" : "%f" % st_scaling,
                 "icon-size" : "%d,%d" % icon_size[0],
                 "images-height" : str(docheight),
                 "tiff-height" : str(docheight) }

            translation, scaling = thumbnail_translation_and_scaling(dirpath, d, false, true)
            d["big-thumbnail-translation-points"] = "%f,%f" % translation
            d["big-thumbnail-scaling-factor"] = "%f,%f" % scaling
            update_metadata(os.path.join(dirpath, "metadata.txt"), d)

        finally:
            shutil.rmtree(tmpdir)

        # indicate successful completion
        note(2, "  finished.")
        retval = true
        return retval
    finally:
        # normally the inner finally has already removed tmpdir; this guards
        # the window where mkdir or chmod failed before the inner try began
        if os.path.exists(tmpdir):
            shutil.rmtree(tmpdir)
def _add_internal (ostream, percent_done_fn, repo, response, params, content, wait):

    # this can be called in several different ways.
    # In general, you post a multipart/form-data body which
    # contains a "contenttype" for the document, and either a "URL"
    # for the content, or a "content" parameter containing the
    # actual content.  If both "URL" and "content" are present,
    # the URL is added as the "original-url" value for the metadata,
    # and if the content is HTML, it's used as the "original.html"
    # and the URL is used to pull ancillary content referenced in it.
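    # Progress is reported through "ostream" via _rewrite_job_output as small
    # JSON-ish records; as used below, the state codes appear to be:
    #   0 = progress message, 1 = document created (carries doc_id),
    #   2 = incorporation complete, 3 = error.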

    content_type = params.get("contenttype")
    url = params.get("URL")
    noredir = params.get("no-redirect")
    noredir = noredir and (noredir.lower() == "true")
    uploadloc = url
    docname = params.get("documentname")
    tempf = None
    suppress_duplicates = params.get("suppress-duplicates")
    suppress_duplicates = suppress_duplicates and (suppress_duplicates.lower() == "true")
    bury = params.get("bury")
    bury = bury and (bury.lower() == "true")
    verbosity = int(params.get("verbosity") or "0")
    if content:
        if wait and ostream:
            _rewrite_job_output(ostream, '{ state: 0, msg: "Caching page..."}')
        extension = CONTENT_TYPES.get(content_type)
        if not extension:
            if wait:
                msg = "Don't know what to do with contenttype \"%s\"" % content_type
                if ostream:
                    _rewrite_job_output(ostream, '{state: 1, msg: "' + urllib.quote(msg) + '"}')
                else:
                    response.error(HTTPCodes.UNSUPPORTED_MEDIA_TYPE, msg)
            return
        # special case HTML/XHTML
        if content and (content_type.lower() in ("text/html", "application/xhtml+xml")):
            tempf = tempfile.mkdtemp()
            uploadloc = os.path.join(tempf, "original.html")
            # make sure that the folder for other parts exists, even if empty
            os.mkdir(os.path.join(tempf, "original_files"))
            # remove our bookmarklet, if present
            content = _BOOKMARKLET_PATTERN.sub('', content)
            content = _ADD_FORM_PATTERN.sub('', content)
            c = _OurCacher(url, filename=uploadloc, bits=content, content_type=content_type)
            # make sure that the folder for other parts exists, even if empty
            other_parts = os.path.join(tempf, "original_files")
            if not os.path.exists(other_parts):
                os.mkdir(other_parts)
        # special case 3x5 cards
        elif (docname and (content_type.lower() == "text/plain") and os.path.splitext(docname)[1] == ".3x5"):
            fd, tempf = tempfile.mkstemp(".3x5")
            fp = os.fdopen(fd, "wb")
            fp.write(content)
            fp.close()
            uploadloc = tempf
        else:
            fd, tempf = tempfile.mkstemp("." + extension)
            fp = os.fdopen(fd, "wb")
            fp.write(content)
            fp.close()
            uploadloc = tempf
        if suppress_duplicates:
            hash = calculate_originals_fingerprint(tempf)
            results = repo.do_query("sha-hash:"+hash)
            if results:
                # it's a duplicate
                doc = results[0][1]
                if os.path.isdir(tempf):
                    shutil.rmtree(tempf)
                elif os.path.exists(tempf):
                    os.remove(tempf)
                if ostream:
                    _rewrite_job_output(ostream, '{ state: 2, doc_id: "' + doc.id + '"}')
                elif noredir:
                    response.reply(doc.id, "text/plain")
                else:
                    response.redirect("/action/basic/dv_show?doc_id=%s" % doc.id)
                return
    try:
        try:
            # get a cookie for authentication
            cookie = repo.new_cookie(url or content[:min(100, len(content))])
            cookie_str = '%s=%s; path=/; Secure' % (cookie.name(), cookie.value())
            os.environ["UPLIB_COOKIE"] = cookie_str
            doctitle = params.get("md-title")
            docauthors = params.get("md-authors")
            docdate = params.get("md-date")
            doccats = params.get("md-categories")
            metadata = params.get("metadata")
            if metadata:
                mdtmpfile = tempfile.mktemp()
                open(mdtmpfile, "w").write(metadata)
                # check to see if we're replacing an existing document
                md2 = read_metadata(StringIO.StringIO(metadata))
                existing_doc_id = md2.get("replacement-contents-for")
                if existing_doc_id and not repo.valid_doc_id(existing_doc_id):
                    raise ValueError("Invalid doc ID %s specified for replacement" % existing_doc_id)
            else:
                mdtmpfile = None
                existing_doc_id = None
            # now form the command
            scheme = ((repo.get_param("use-http", "false").lower() == "true") or _use_http) and "http" or "https"
            cmd = '%s --verbosity=%s --repository=%s://127.0.0.1:%s ' % (_uplib_add_document, verbosity, scheme, repo.port())
            if doctitle:
                cmd += ' --title=%s' % pipes.quote(doctitle)
            if docauthors:
                cmd += ' --authors=%s' % pipes.quote(docauthors)
            if docdate:
                cmd += ' --date="%s"' % docdate
            if doccats:
                cmd += ' --categories=%s' % pipes.quote(doccats)
            if mdtmpfile:
                cmd += ' --metadata="%s"' % mdtmpfile
            cmd += ' "%s"' % uploadloc
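            # fully assembled, cmd looks something like this (values illustrative):
            #   /path/to/uplib-add-document --verbosity=0 \
            #       --repository=https://127.0.0.1:8090 --title='Some Title' \
            #       --metadata="/tmp/tmpABC" "/tmp/tmpXYZ.pdf"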
            if ostream:
                _rewrite_job_output(ostream, '{state: 0, msg: "' + urllib.quote(cmd) + '"}')
            # and invoke the command
            status, output, tsignal = subproc(cmd)
            note(4, "cmd is %s, status is %s, output is %s", repr(cmd), status, repr(output.strip()))
            if mdtmpfile:
                os.unlink(mdtmpfile)
            if status == 0:
                # success; output should be doc-id
                doc_id = existing_doc_id or output.strip().split()[-1]
                note(4, "output is '%s'; doc_id for new doc is %s", output.strip(), doc_id)
                if wait and ostream:
                    _rewrite_job_output(ostream, '{ state: 1, doc_id: "' + doc_id + '", msg: "' + urllib.quote(output) + '"}')
                # wait for it to come on-line
                if percent_done_fn:
                    percent_done_fn(40)         # estimate 40% of work done on client side
                while not repo.valid_doc_id(doc_id):
                    if ostream:
                        pending = repo.list_pending(full=True)
                        s = _first(pending, lambda x: x['id'] == doc_id)
                        if not s:
                            break
                        dstatus = s['status']
                        if dstatus == 'error':
                            msg = 'server-side error incorporating document'
                            _rewrite_job_output(ostream, '{ state: 3, doc_id: "' + doc_id
                                                + '", msg: "' + urllib.quote(s['error']) + '"}')
                            break
                        if dstatus == 'unpacking':
                            msg = 'starting ripper process...'
                        elif dstatus == 'ripping':
                            msg = "ripping with ripper '" + s['ripper'] + "'..."
                        elif dstatus == 'moving':
                            msg = 'adding to registered document set...'
                        _rewrite_job_output(ostream, '{ state: 1, doc_id: "' + doc_id
                                            + '", msg: "' + urllib.quote(msg) + '"}')
                    time.sleep(1.0)
                if percent_done_fn:
                    percent_done_fn(100)        # finished
                if repo.valid_doc_id(doc_id):
                    if bury:
                        # wait up to 100 seconds for it to show up in history list
                        # after that, wait another second, then bury it
                        counter = 100
                        while counter > 0:
                            h = [x.id for x in repo.history()]
                            if doc_id in h:
                                break
                            counter -= 1
                            time.sleep(1)
                        time.sleep(1)
                        repo.touch_doc(doc_id, bury=True, notify=False)
                        note(3, "buried %s", doc_id)
                    if wait:
                        if ostream:
                            _rewrite_job_output(ostream, '{ state: 2, doc_id: "' + doc_id + '"}')
                        elif noredir:
                            response.reply(doc_id, "text/plain")
                        else:
                            response.redirect("/action/basic/dv_show?doc_id=%s" % doc_id)
            else:
                note("cmd <<%s>> failed with status %s:\n%s", cmd, status, output)
                if wait:
                    if ostream:
                        _rewrite_job_output(ostream, '{ state: 3, msg: "' + urllib.quote('Error processing the document:\n' + output) + '"}')
                    else:
                        response.error(HTTPCodes.INTERNAL_SERVER_ERROR, "<pre>" + htmlescape(output) + "</pre>")
        except:
            e = ''.join(traceback.format_exception(*sys.exc_info()))
            if wait:
                note(3, "Exception processing uplib-add-document request:\n%s", htmlescape(e))
                if ostream:
                    _rewrite_job_output(ostream, '{state: 3, msg: "' + urllib.quote("Exception processing uplib-add-document request:\n" + e) + '"}')
                else:
                    response.error(HTTPCodes.INTERNAL_SERVER_ERROR,
                                   "Exception processing uplib-add-document request:\n<pre>" +
                                   htmlescape(e) + "\n</pre>")
            else:
                note("Exception processing uplib-add-document request:\n%s", e)
    finally:
        if tempf and os.path.isfile(tempf):
            os.unlink(tempf)
        elif tempf and os.path.isdir(tempf):
            shutil.rmtree(tempf)
def flesh_out_folder(id, tmpfilename, metadata, repo, unpack_fn, counter):
    try:
        try:
#             note(3, "CODETIMER_ON is %s", CODETIMER_ON)
#             if CODETIMER_ON:
#                 code_timer.Init()
#                 code_timer.CreateTable("uplib")
#                 code_timer.CodeTimerOn()
#                 code_timer.StartInt("newFolder$unpack", "uplib")
#             else:
#                 code_timer.CodeTimerOff()

            if unpack_fn and tmpfilename and os.path.exists(tmpfilename):
                unpack_fn(repo, id, tmpfilename, metadata)

#             if CODETIMER_ON:
#                 code_timer.StopInt("newFolder$unpack", "uplib")
            folderpath = repo.pending_location(id)
            try:
                note("unpacked new folder in %s", folderpath)
                if not sys.platform.lower().startswith("win"):
                    s, o, t = subproc("ls -Rl %s" % folderpath)
                    note("%s\n" % o)

                fp = open(os.path.join(folderpath, "UNPACKED"), 'w')
                fp.flush()
                fp.close()

                # as of this point, we can restart the inclusion of the document

                md = read_metadata(os.path.join(folderpath, "metadata.txt"))
                replacement_id = md.get("replacement-contents-for")
                if replacement_id:
                    if repo.valid_doc_id(replacement_id):
                        # contents to replace another document
                        md["replacement-contents-for"] = ""
                        update_metadata(os.path.join(folderpath, "metadata.txt"), md)
                        note(2, "replacing contents of %s with this data...", replacement_id)
                        existing_document = repo.get_document(replacement_id)
                        new_folder = existing_document.folder()
                        process_folder(repo, replacement_id, folderpath, false, new_folder)
                        _run_rippers(new_folder, repo, replacement_id)
                        existing_document.recache()
                        repo.touch_doc(existing_document)
                        raise AbortDocumentIncorporation(id, "replacement for existing document %s" % replacement_id)
                    else:
                        raise AbortDocumentIncorporation(id, "replacement for non-existent document %s" % replacement_id)

                _finish_inclusion (repo, folderpath, id)

#                 if CODETIMER_ON:
#                     noteOut = StringIO.StringIO()
#                     noteOut.write("\nCode Timer statistics (what took time, in milliseconds):\n")
#                     code_timer.PrintTable(noteOut, "uplib")
#                     noteOut.write("\n")
#                     noteOutString = noteOut.getvalue()
#                     note(3, noteOutString)

            except:
                type, value, tb = sys.exc_info()
                note("%s", ''.join(traceback.format_exception(type, value, tb)))
                note_error(folderpath, (type, value, tb))
                raise value, None, tb

        except AbortDocumentIncorporation, x:
            # ripper signalled to stop adopting this document, for good
            note(2, "AbortDocumentIncorporation exception on %s:  %s", x.id, x.message)
            if (x.id == id):
                shutil.rmtree(folderpath)
            remove_from_index(repo.index_path(), id)

        except:
            type, value, tb = sys.exc_info()
            note("Exception processing new folder:\n%s", ''.join(traceback.format_exception(type, value, tb)))
def index_folders (docs_dir, doc_ids, repo_index_dir):

    update_configuration()

    if not doc_ids:
        return

    if LUCENE == 'jcc':

        c = get_context(repo_index_dir)
        SECTION_LOCK.acquire()
        try:
            for id in doc_ids:
                folderpath = os.path.join(docs_dir, id)
                if os.path.isdir(folderpath):
                    lock_folder(folderpath)
                    try:
                        try:
                            c.index(folderpath, id, False)
                        except:
                            note(0, "Can't index folder %s:\n%s",
                                 folderpath, ''.join(traceback.format_exception(*sys.exc_info())))
                    finally:
                        unlock_folder(folderpath)
            c.reopen()
        finally:
            SECTION_LOCK.release()
        return

    else:

        # invoke Java to do indexing

        if len(doc_ids) > 6:
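            # write the ids to a temporary file, one per line, and use the
            # batch entry point; presumably this keeps the command line short
            # when many documents need indexing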

            fname = tempfile.mktemp()
            fp = open(fname, "w")
            fp.write(string.join(doc_ids, '\n'))
            fp.close()
            indexingcmd = INDEXING_BATCHADD_CMD % (JAVA, DEBUG_FLAGS, INDEXING_PROPERTIES, LUCENE_JAR, INDEXING_JAR, repo_index_dir, docs_dir, fname)
            note(3, "  indexing with %s", indexingcmd)
            SECTION_LOCK.acquire()
            try:
                status, output, tsignal = subproc(indexingcmd)
            finally:
                SECTION_LOCK.release()
                os.unlink(fname)
            note(3, "  indexing output is <%s>", output)
            if status != 0:
                raise Error ("%s signals non-zero exit status %d attempting to index %s:\n%s" % (JAVA, status, doc_ids, output))

        else:

            folders = string.join(doc_ids, ' ')
            indexingcmd = INDEXING_ADD_CMD % (JAVA, DEBUG_FLAGS, INDEXING_PROPERTIES, LUCENE_JAR, INDEXING_JAR, repo_index_dir, docs_dir, folders)
            note(3, "  indexing with %s", indexingcmd)
            SECTION_LOCK.acquire()
            try:
                status, output, tsignal = subproc(indexingcmd)
            finally:
                SECTION_LOCK.release()
            note(3, "  indexing output is <%s>", output)
            if status != 0:
                raise Error ("%s signals non-zero exit status %d attempting to index %s:\n%s" % (JAVA, status, doc_ids, output))
def manipulate_server_internal (repo, params, response=None, ipaddr=None, lgr=None):

    # regular UpLib action

    conf = params.get("configurator")
    if not conf:
        conf = configurator()
    imap_ssl_port = conf.get_int("imap-server-ssl-port", -1)
    imap_localhost_port = conf.get_int("imap-server-localhost-port", 8143)
    stunnel = conf.get("stunnel")
    expunge_deletes_docs = conf.get_bool("imap-expunge-deletes-documents", False)
    global CHECKPOINT_PERIOD
    CHECKPOINT_PERIOD = conf.get_int("imap-server-checkpoint-interval", 600)
    allow_anonymous_readers = ((not repo.has_password) and
                               conf.get_bool("imap-server-allow-anonymous-readers", True))
    use_for_email = conf.get_bool("imap-server-use-for-email", False)

    imap_dir = os.path.join(repo.overhead_folder(), "imap")
    if not os.path.isdir(imap_dir):
        os.mkdir(imap_dir)

    stunnel_pid_filepath = os.path.join(imap_dir, "stunnel.pid")
    if os.path.exists(stunnel_pid_filepath):
        stunnel_pid = int(open(stunnel_pid_filepath, 'r').read().strip())
    else:
        stunnel_pid = None

    # we cache the reference to the existing server in another
    # module so that we can reload this one with impunity
    current_server = emailParser.__dict__.get("IMAP_SERVER")
    note("current server is %s", current_server)

    action = params.get('action')
    newcontext = params.get('newcontext', False)

    if response:
        fp = response.open()
    else:
        fp = StringIO()

    fp.write('<body bgcolor="%s">\n' % STANDARD_BACKGROUND_COLOR)
    if current_server:
        s = current_server.status()
        m = s.more()
        while m:
            fp.write(m)
            m = s.more()
        fp.write('\n<hr>\n')
    else:
        fp.write('<h2>UpLib IMAP Server control panel</h2>\n')

    current_context = None
    if current_server and ((action == 'Stop') or (action == 'Restart')):

        if stunnel_pid:
            try:
                os.kill(stunnel_pid, signal.SIGKILL)
                time.sleep(4)
            except:
                pass
            stunnel_pid = None

        current_context = current_server.mailcontext
        current_server.close()
        current_server = None
        del emailParser.__dict__["IMAP_SERVER"]
        fp.write("<p>Closed current server.\n")

    if os.path.exists(stunnel_pid_filepath):
        os.unlink(stunnel_pid_filepath)

    if (action == 'Start') or (action == 'Restart'):

        cert_filepath = os.path.join(repo.overhead_folder(), repo.certfilename())
        
        try:
            port = params.get("port")
            if port:
                port = int(port)
            else:
                port = imap_localhost_port

            if stunnel and ((not ssl) or (imap_ssl_port > 0)):

                # start stunnel
                stunnel_conf_filepath = os.path.join(imap_dir, "stunnel.conf")
                f = open(stunnel_conf_filepath, 'w')
                f.write("debug = 7\n\ncert = %s\noutput = %s\npid = %s\n\n[imapuplib]\naccept = %s\nconnect = 127.0.0.1:%s\n" %
                        (cert_filepath, os.path.join(imap_dir, "stunnel.log"), stunnel_pid_filepath,
                         str(imap_ssl_port), str(port)))
                f.close()
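                # the generated stunnel.conf looks like this (values illustrative):
                #
                #   debug = 7
                #
                #   cert = /repo/overhead/stunnel.pem
                #   output = /repo/overhead/imap/stunnel.log
                #   pid = /repo/overhead/imap/stunnel.pid
                #
                #   [imapuplib]
                #   accept = 8993
                #   connect = 127.0.0.1:8143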
                status, output, tsignal = subproc("%s %s" % (stunnel, stunnel_conf_filepath))
                note("status from '%s %s' (on %s) is %s, output is <%s>", stunnel, stunnel_conf_filepath, imap_ssl_port, status, output)
                if status != 0:
                    raise RuntimeError("Can't start stunnel with '%s %s'; status is %s, output is %s" % (stunnel, stunnel_conf_filepath, status, output))
                stunnel_pid = int(open(stunnel_pid_filepath, 'r').read().strip())
                note("stunnel_pid is %s", stunnel_pid)

            else:
                stunnel_pid = None

            if newcontext or (not current_context):
                current_context = uplib_mailcontext(repo,
                                                    expunge_deletes_docs=expunge_deletes_docs,
                                                    allow_readers=allow_anonymous_readers,
                                                    use_for_email=use_for_email,
                                                    ip=get_fqdn(),
                                                    server_certificate_file=cert_filepath)
            if current_context.inbox:
                current_context.inbox.rescan()
            if stunnel_pid is not None:
                ipaddr = '127.0.0.1'
            else:
                ipaddr = '0.0.0.0'

            if not lgr:
                lgr = logger.rotating_file_logger (os.path.join(imap_dir, "imap.log"), "weekly", None, True)
                lgr = logger.unresolving_logger(lgr)

            imaps = imap_server (current_context, ipaddr, port, logger=lgr, stunnel_pid=stunnel_pid)
            emailParser.__dict__["IMAP_SERVER"] = imaps
            current_server = imaps

            hooked = emailParser.__dict__.get("IMAP_SERVER_SHUTDOWN_HOOK")
            if not hooked:
                repo.add_shutdown_hook(lambda x=repo: shutdown_server(x))
                emailParser.__dict__["IMAP_SERVER_SHUTDOWN_HOOK"] = True

            if stunnel_pid:
                fp.write("<p>Started new IMAP4 server for %s on ports %s/%s."
                         % (repr(repo), str(imap_ssl_port), str(port)))
            else:
                fp.write("<p>Started new IMAP4 server for %s on port %s."
                         % (repr(repo), str(port)))
            if current_context.inbox:
                fp.write("<p>Inbox:  %d messages, %d recent, %d unseen."
                         % (len(current_context.inbox.msgs),
                            len(current_context.inbox.recent()),
                            current_context.inbox.min_unseen()))
        except:
            type, value, tb = sys.exc_info()
            s = string.join(traceback.format_exception(type, value, tb), "")
            note("Can't establish IMAP server:  exception:  " + s)
            fp.write(s)

    fp.write('<form method=GET action="/action/IMAPServer/manipulate_server">\n')
    fp.write('<input type=submit name=action value="Start" %s>\n' % ((current_server and "disabled") or ""))
    fp.write('<input type=submit name=action value="Stop" %s>\n' % (((current_server == None) and "disabled") or ""))
    fp.write('<input type=submit name=action value="Restart" %s>\n' % (((current_server == None) and "disabled") or ""))
    fp.write('<input type=checkbox name="newcontext" %s> Use fresh mail context\n' % ((newcontext and "checked") or ""))
    fp.write('</form>\n')
    fp.write('</body>\n')
    def rip (self, location, doc_id):

        global CITATION_PARSER, HEADER_PARSER

        omd = self.get_folder_metadata(location)

        # CiteSeer really only works on traditional publications, so let's stay
        # with PDF and Word docs
        mimetype = omd.get("apparent-mime-type")
        if mimetype not in TRADITIONAL_PAPER_FORMATS:
            return

        text, language = self.get_folder_text(location)
        if not text:
            # no text to look at
            return

        m = REFERENCES_PATTERN.search(text)
        if not m:
            # no REFERENCES_PATTERN in text
            return

        # just a note if we're re-ripping something
        if self.repository().valid_doc_id(doc_id):
            note(3, "%s is a technical report", self.repository().get_document(doc_id))

        cp = self.__citation_parser or CITATION_PARSER
        if cp:
            status, output, tsig = subproc('%s "%s"' % (cp, self.folder_text_path(location)))
            if status == 0:
                parsed = BeautifulStoneSoup(output.strip())
                citations = parsed.findAll("citation")
                note(3, "found %d citations", len(citations))
                fp = open(os.path.join(location, "citeseerx-citations.xml"), "w")
                fp.write(output.strip())
                fp.close()

        hp = self.__header_parser or HEADER_PARSER
        if hp:
            tfile = tempfile.mktemp()
            fp = codecs.open(tfile, "w", "UTF-8")
            fp.write(text)
            fp.close()
            try:
                status, output, tsig = subproc('%s "%s"' % (hp, tfile))
                if status == 0:
                    md = dict()
                    parsed = BeautifulStoneSoup(output.strip())
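                    # the header parser's output is XML along these lines
                    # (structure inferred from the lookups below):
                    #   <title>...</title>
                    #   <author><name>...</name></author>
                    #   <abstract>...</abstract>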
                    title = parsed.find("title")
                    if title:
                        if title.string:
                            md['citeseer-title'] = title.string
                        else:
                            note(3, "Non-string title found: %s", title)
                    authors = set()
                    for author in parsed.findAll("author"):
                        n = author.find("name")
                        if n:
                            authors.add(n.string)
                        else:
                            authors.add(author.string)
                    if authors:
                        md['citeseer-authors'] = " and ".join(list(authors))
                    abstract = parsed.find("abstract")
                    if abstract:
                        if abstract.string:
                            md['citeseer-abstract'] = abstract.string
                        else:
                            note(3, "Non-string abstract found: %s", abstract)
                    note(3, "citeseer metadata is %s", pprint.pformat(md))
                    if "citeseer-title" in md:
                        # use CiteSeer data to fix up document metadata, if necessary
                        if ((not omd.get("title")) or
                            (omd.get("title-is-original-filepath", "false").lower() == "true")):
                            md['title'] = md.get("citeseer-title")
                            md['title-is-original-filepath'] = None
                            md['title-is-citeseer-extracted'] = "true"
                        if ("citeseer-authors" in md) and (not omd.get("authors")):
                            md['authors'] = md.get("citeseer-authors")
                        if ("citeseer-abstract" in md) and (not md.get("abstract")):
                            abs = md.get("citeseer-abstract")
                            prefix = ABSTRACT_PREFIX.match(abs)
                            if prefix:
                                realstart = prefix.end("prefix")
                                note(3, "trimming abstract prefix of %s", repr(abs[:realstart]))
                                abs = abs[realstart:]
                            md['abstract'] = abs                            
                        note(3, "updated missing metadata with CiteSeer versions")
                    self.update_folder_metadata(location, md)
            finally:
                if os.path.exists(tfile):
                    os.unlink(tfile)