def create_temporary_icons (metadata, dirpath, output_dir, params):
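    # Build placeholder images for a document whose real contents aren't
    # available yet: a document icon with an X drawn through it, a big
    # page-1 thumbnail labeled "[temporary document]", and a small page-1
    # thumbnail, all written into 'output_dir'.  Records nominal page
    # geometry (presumably US Letter at 300 dpi, i.e. 2550x3300 pixels)
    # in the folder's metadata.txt.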
    global TEMPORARY_BACKGROUND, UNDER_CONSTRUCTION
    thumbnails_path = output_dir
    os.mkdir(thumbnails_path)
    note("thumbnails_path is %s", thumbnails_path)
    title = metadata.get("title")
    document_icon = Image.new("RGB", (150, 194), TEMPORARY_BACKGROUND)
    draw = ImageDraw.Draw(document_icon)
    draw.line((0,0) + document_icon.size, LEGEND_COLOR)
    draw.line((0, document_icon.size[1], document_icon.size[0], 0), LEGEND_COLOR)
    draw.rectangle((0, 0, document_icon.size[0]-1, document_icon.size[1] -1), outline=LEGEND_COLOR)
    if title: document_icon = add_legend(document_icon, ("(255,255,255)" + title,))
    document_icon.save(os.path.join(thumbnails_path, "first.png"), "PNG")
    page_1_big = Image.new("RGB", (425, 550), TEMPORARY_BACKGROUND)
    legend = []
    legend.append("(255,255,255)[temporary document]")
    if title: legend.append("(0,255,0)%s" % title)
    page_1_big = add_legend(page_1_big, legend)
    page_1_big.save(os.path.join(thumbnails_path, "big1.png"), "PNG")
    page_1_small = Image.new("RGB", (85, 110), TEMPORARY_BACKGROUND)
    add_page_no (page_1_small, (5, 5), "1")
    page_1_small.save(os.path.join(thumbnails_path, "1.png"), "PNG")
    update_metadata(os.path.join(dirpath, "metadata.txt"), {"page-count" : "1",
                                                            "tiff-width" : 2550,
                                                            "images-width" : 2550,
                                                            "images-size" : "2550,3300",
                                                            "cropping-bounding-box" : "0,0;2550,3300",
                                                            "big-thumbnail-size" : "425,550",
                                                            "small-thumbnail-size" : "85,110",
                                                            "small-thumbnail-scaling" : ("%f" % (float(1)/float(30))),
                                                            "images-height" : "3300",
                                                            "tiff-height" : "3300",
                                                            })
def update_document_metadata_from_acm_diglib (location):
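    # Fetch citation data from the ACM Digital Library for a document
    # whose original-url points at portal.acm.org: retrieve the BibTeX
    # and EndNote records (plus abstract), parse the EndNote fields --
    # title, page numbers, date, authors -- and fold everything into
    # the document's metadata.txt.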

    def charref_replace(matchobj):
        return unichr(int(matchobj.group('charcode')))

    def parse_endnote(newdict, md, endnote):
        parts = endnote.strip().split("\n")
        authors = ""
        for part in parts:
            p = ENDNOTE_CHARREF.sub(charref_replace, part.strip())
            if p.startswith("%T "):
                newdict['title'] = p[3:].strip()
                newdict['title-is-original-filepath'] = ''
            elif p.startswith("%P "):
                newdict['page-numbers'] = p[3:].strip()
            elif p.startswith("%D "):
                # we override any existing date, because the PDF file often has
                # a bad date in it -- the date it was scanned for addition to the library
                year, month, day = parse_date(p[3:].strip())
                newdict['date'] = "%s/%s/%s" % (month, day, year)
            elif p.startswith("%A "):
                # ignore any author metadata in the PDF file
                if authors:
                    authors += " and "
                authors += p[3:].strip()
        if authors:
            newdict['authors'] = authors

    mdpath = os.path.join(location, "metadata.txt")
    md = read_metadata(mdpath)
    if md.has_key("original-url") and "portal.acm.org" in md.get("original-url"):
        bibtex, endnote, abstract = fetch_bibtex_and_endnote_from_acm_diglib(md.get("original-url"))
        if bibtex or endnote:
            d = {}
            if bibtex:
                d['bibtex-citation'] = re.sub("\n", " ", bibtex)
            if endnote:
                parse_endnote(d, md, endnote)
                d['endnote-citation'] = re.sub("\n", " / ", endnote)
            if bibtex and not md.has_key("citation"):
                d["citation"] = re.sub("\n", " ", bibtex)
            if abstract and not md.has_key("abstract"):
                d["abstract"] = re.sub("\n|<par>|</par>", " ", abstract)
            update_metadata(mdpath, d)
        else:
            note("Couldn't fetch citation info for URL \"%s\".", md.get("original-url"))
def process_tarred_folder (repo, id, tarfile, metadata):
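    # Unpack a tarred document folder into a scratch directory, merge in
    # any caller-supplied metadata, verify that page images are present,
    # and hand the folder to process_folder.  The scratch directory is
    # removed afterwards in all cases.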
    # create a new folder, and populate it
    # mkdtemp creates the directory atomically with mode 0700, avoiding
    # the race window left by tempfile.mktemp() followed by os.mkdir()
    dirname = tempfile.mkdtemp()
    try:
        cmd = UNTAR_CMD % (dirname, TAR, tarfile)
        note(2, "Untarring folder into temporary directory %s", dirname)
        status, output, signal = subproc(cmd)
        if status == 0:
            note(2, "Successfully untarred folder into %s", dirname)
            if metadata:
                update_metadata(os.path.join(dirname, "metadata.txt"), metadata)
            if (os.path.exists(os.path.join(dirname, "document.tiff")) or
                os.path.isdir(os.path.join(dirname, "page-images"))):
                return process_folder(repo, id, dirname, true)
            else:
                raise Error("invalid folder -- no page images file")
        else:
            raise Error("Problem untarring folder:\n%s" % output)
    finally:
        if os.path.exists(dirname):
            shutil.rmtree(dirname)        
def process_zipped_folder (repo, id, zipfile, metadata):
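    # Like process_tarred_folder, but unpacks a zipped folder with unzip().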
    # create a new folder, and populate it
    dirname = tempfile.mkdtemp()    # atomic creation, mode 0700
    try:
        note(2, "Unzipping folder into temporary directory %s", dirname)
        try:
            unzip(dirname, zipfile)
            note(2, "Successfully unzipped folder into %s", dirname)
            if metadata:
                update_metadata(os.path.join(dirname, "metadata.txt"), metadata)
        except:
            typ, ex, tb = sys.exc_info()
            s = string.join(traceback.format_exception(typ, ex, tb), "")
            raise Error("Problem unzipping folder:\n%s" % s)
        if (os.path.exists(os.path.join(dirname, "document.tiff")) or
            os.path.isdir(os.path.join(dirname, "page-images"))):
            return process_folder(repo, id, dirname, true)
        else:
            raise Error("invalid folder -- no page images")
    finally:
        if os.path.exists(dirname):
            shutil.rmtree(dirname)        
def do_page_bounding_boxes (dirpath):
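    # For each page, pair the word bounding boxes with part-of-speech
    # tags and paragraph starts (when contents.ind and paragraphs.txt
    # are available), scale the box corners into big-thumbnail pixel
    # coordinates, and write the results out with flush_page.  Per-page
    # and whole-document word/font statistics are accumulated into
    # metadata.txt.  Requires contents.txt to be UTF-8.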

    textfilepath = os.path.join(dirpath, "contents.txt")
    wordbox_file = open(os.path.join(dirpath, "wordbboxes"), 'rb')
    pos_filepath = os.path.join(dirpath, "contents.ind")
    para_filepath = os.path.join(dirpath, "paragraphs.txt")

    note ("doing page bboxes for %s...", dirpath)

    if os.path.exists(pos_filepath):
        fp = open(pos_filepath, 'r')
        postags = POSTag.parse_parseinfo(fp)
        fp.close()
    else:
        postags = None

    bbox_iterator = wordboxes_page_iterator(dirpath)

    text_file = open(textfilepath, 'rb')
    firstline = text_file.readline()
    charsetmatch = CHARSETPATTERN.match(firstline)
    if charsetmatch:
        charsetname = charsetmatch.group(1)
        text_file.readline()
        first_byte = text_file.tell()
    else:
        charsetname = "latin_1"
        first_byte = 0
    if charsetname not in UTF8_ALIASES:
        raise ValueError("Charset in contents.txt must be UTF-8 for page bounding boxes to be created.  Apparently it's %s, instead." % charsetname)
    text_file.seek(first_byte)

    paras = read_paragraphs_file(para_filepath)
    if paras: paras.sort(key=lambda x: x.first_byte)

    from createThumbnails import thumbnail_translation_and_scaling
    translation, scaling = thumbnail_translation_and_scaling (dirpath)
    note(4, "   translation and scaling are %s and %s...", translation, scaling)

    def update_stats (stats, page_stats):
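        # append this page's stats to the comma-separated list, formatted as
        # words:avg-chars:bold:italic:bold-italic:fixed-width:avg-font-size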
        if stats:
            stats += ", "
        stats += "%d:%.3f:%d:%d:%d:%d:%.3f" % (page_stats[0],
                                               ((page_stats[0] > 0) and float(page_stats[1])/float(page_stats[0]) or 0.0),
                                               page_stats[2], page_stats[3], page_stats[4], page_stats[5],
                                               ((page_stats[0] > 0) and float(page_stats[6])/float(page_stats[0]) or 0.0))
        return stats
        

    page_index = 0
    out_page_index = 0
    last_cindex = 0
    bboxes = []
    postags_index = 0

    stats = ""

    # accumulate stats
    doc_stats = [
        0,              # number of words
        0,              # total length (in characters)
        0,              # number of bold words
        0,              # number of italic words
        0,              # number of bold-italic words
        0,              # number of fixed-width words
        0.0,            # total font sizes
        ]

    for page_index, bboxes in bbox_iterator:

        page_stats = [
            0,              # number of words
            0,              # total length (in characters)
            0,              # number of bold words
            0,              # number of italic words
            0,              # number of bold-italic words
            0,              # number of fixed-width words
            0.0,            # total font sizes
            ]

        adjusted_bboxes = []

        for bbox in bboxes:

            char_count = bbox.nchars()

            doc_stats[0] += 1
            doc_stats[1] += bbox.nchars()
            if bbox.is_bold():
                doc_stats[2] += 1
            if bbox.is_italic():
                doc_stats[3] += 1
            if bbox.is_bold() and bbox.is_italic():
                doc_stats[4] += 1
            if bbox.is_fixedwidth():
                doc_stats[5] += 1
            doc_stats[6] += bbox.font_size()

            page_stats[0] += 1
            page_stats[1] += bbox.nchars()
            if bbox.is_bold():
                page_stats[2] += 1
            if bbox.is_italic():
                page_stats[3] += 1
            if bbox.is_bold() and bbox.is_italic():
                page_stats[4] += 1
            if bbox.is_fixedwidth():
                page_stats[5] += 1
            page_stats[6] += bbox.font_size()

            cindex = bbox.contents_offset()

            tag = None
            if postags:
                # advance to first POS tag which might apply to cindex
                while ((postags_index < len(postags)) and
                       (cindex >= (postags[postags_index].start + postags[postags_index].length))):
                    postags_index = postags_index + 1
                # there may be cindex positions for which we have no tags -- check for that
                if ((postags_index < len(postags)) and (cindex >= postags[postags_index].start) and
                    (cindex < (postags[postags_index].start + postags[postags_index].length))):
                    tag = postags[postags_index]

            if paras and (paras[0].first_byte <= (cindex + char_count)) and (paras[0].first_byte_not >= cindex):
                # starts this paragraph
                if tag is None:
                    tag = POSTag(cindex, char_count, None, "",
                                 True, False, False)
                else:
                    tag.starts_paragraph = True
                paras = paras[1:]

            # again, add back in the 20-pixel border on the page
            ulx = trunc((bbox.left() + translation[0]) * scaling[0] + 0.5)
            uly = trunc((bbox.top() + translation[1]) * scaling[1] + 0.5)
            lrx = trunc((bbox.right() + translation[0]) * scaling[0] + 0.5)
            lry = trunc((bbox.bottom() + translation[1]) * scaling[1] + 0.5)

            adjusted_bboxes.append((bbox, tag, ulx, uly, lrx, lry))
            last_cindex = cindex

        if (len(adjusted_bboxes) > 0):

            startpoint = adjusted_bboxes[0][0].contents_offset()
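            # contents_offset() is a byte offset into contents.txt;
            # nchars() counts characters, and the file is UTF-8, so the
            # read is sized at up to 4 bytes per character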
            endpoint = adjusted_bboxes[-1][0].contents_offset() + (adjusted_bboxes[-1][0].nchars() * 4)
            text_file.seek(startpoint + first_byte)
            pagetext = text_file.read(endpoint - startpoint)
            pagestart = startpoint

        else:
            pagetext = ""
            pagestart = last_cindex

        flush_page (dirpath, page_index, adjusted_bboxes, pagetext, pagestart)

        stats = update_stats(stats, page_stats)

    text_file.close()
    wordbox_file.close()

    dstats = update_stats("", doc_stats)

    update_metadata(os.path.join(dirpath, "metadata.txt"), { "wordbbox-stats-pagewise": stats, "wordbbox-stats-docwise": dstats})
    def rip (self, folder, docid):
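        # Ripper for New York Times articles: if the document's
        # original-url is at www.nytimes.com, re-parse the saved
        # originals/original.html for headline (hdl), publication date
        # (pdate), byline (byl), keywords, and description, and merge
        # them into metadata.txt.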

        def encodestring(s):
            # nytimes strings have xml char refs, and we want Unicode
            if not s:
                return s

            s = re.sub(r"&#([0-9]+);", lambda x: unichr(int(x.group(1))), s)
            s = re.sub(r"&([a-z]+);", lambda x: htmlentitydefs.name2codepoint(x.group(1)), s)
            return s

        mdpath = os.path.join(folder, "metadata.txt")
        originalspath = os.path.join(folder, "originals", "original.html")
        if not (os.path.exists(mdpath) and os.path.exists(originalspath)):
            return
        md = read_metadata(mdpath)
        url = md.get("original-url")
        if not url:
            return
        host, port, path = parse_URL(url)
        if host != "www.nytimes.com":
            return

        # OK, it's from the NY Times
        new_metadata = MetadataGatherer.parse(originalspath)

        if "source" not in md:
            md["source"] = "New York Times"

        # not all articles have metadata...
        if not ((('title' in new_metadata) or ('hdl' in new_metadata)) and ('pdate' in new_metadata)):
            note(3, "No metadata in article:  %s", new_metadata)
            return

        md["title"] = encodestring(new_metadata.get("hdl") or md.get("title"))
        if "date" not in md:
            # get the date
            d = new_metadata.get("pdate")
            md["date"] = "%s/%s/%s" % (d[4:6], d[6:], d[:4])
        if "authors" not in md:
            # get the byline
            d = new_metadata.get("byl")
            if d:
                if d.startswith("By "):
                    d = d[3:]
                # capitalize properly
                d = d.title()
                # lowercase "And"
                d = d.replace(" And ", " and ")
                md["authors"] = encodestring(d)
        d = new_metadata.get("keywords")
        d0 = md.get("keywords")
        if d0:
            d0 += ("," + d)
        else:
            d0 = d
        if d0:
            md["keywords"] = encodestring(d0)
        if new_metadata.get("description"):
            md["summary"] = encodestring(new_metadata.get("description"))
        update_metadata(mdpath, md)
def thumbnail_translation_and_scaling (folder, d=None, update=true, recalc=false):
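    # Compute the mapping from page coordinates in points to
    # big-thumbnail pixel coordinates: a translation (applied first)
    # and a scaling factor, taking the cropping bounding box and its
    # re-added 20-pixel borders into account.  Values cached in
    # metadata.txt are reused unless 'recalc' is set; freshly computed
    # values are written back when 'update' is set.
    # Typical use:  translation, scaling = thumbnail_translation_and_scaling(folder)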
    
    # 'translation' is in units of points
    # 'scaling' is in units of pixels/point

    if d is None:
        d = dict()

    def find_data (key):
        return d.get(key) or doc_metadata.get(key)

    def parse_value (x):
        if x is None:
            return None
        elif type(x) in types.StringTypes:
            return eval('(' + x + ')')
        elif type(x) is types.TupleType:
            return x
        else:
            raise ValueError("argument " + str(x) + " must be string or tuple")

    metadata_file = os.path.join(folder, "metadata.txt")
    doc_metadata = read_metadata(metadata_file)

    if recalc:
        translation = None
        scaling = None
    else:
        translation = parse_value(doc_metadata.get("big-thumbnail-translation-points"))
        scaling = parse_value(doc_metadata.get("big-thumbnail-scaling-factor"))

    if scaling is None or translation is None:

        cropbox_data = find_data("cropping-bounding-box")
        images_size = eval('(%s)' % find_data("images-size"))
        if cropbox_data:
            cropbox = [eval('(%s)' % x) for x in cropbox_data.split(';')]
        else:
            cropbox = [(0,0), images_size]
        big_thumbnail_size = find_data("big-thumbnail-size")
        if big_thumbnail_size:
            big_tn_size = eval('(%s)' % big_thumbnail_size)
        else:
            from PIL import Image
            big_tn_size = Image.open(os.path.join(folder, "thumbnails", "big1.png")).size

        ppi = int(find_data("tiff-dpi") or find_data("images-dpi") or 300)

        # Remember that cropped page images have a 20 pixel border added back after scaling.
        #

        left_crop_border = 0
        right_crop_border = 0
        top_crop_border = 0
        bottom_crop_border = 0

        if cropbox_data:        
            if cropbox[0][0] != 0:
                left_crop_border = 20
            if cropbox[0][1] != 0:
                top_crop_border = 20
            if cropbox[1][0] != images_size[0]:
                right_crop_border = 20
            if cropbox[1][1] != images_size[1]:
                bottom_crop_border = 20

        # calculate a translation quantity in "points"
        translation = (0 - float((cropbox[0][0] - left_crop_border) * 72)/ppi,
                       0 - float((cropbox[0][1] - top_crop_border) * 72)/ppi)

        # calculate a scaling factor that goes from bounding box edges in "points" to
        # scaled thumbnail coordinates in "pixels"
        #
        scaling = (float(ppi * big_tn_size[0])/float(72 * (cropbox[1][0] - cropbox[0][0] + (left_crop_border + right_crop_border))),
                   float(ppi * big_tn_size[1])/float(72 * (cropbox[1][1] - cropbox[0][1] + (top_crop_border + bottom_crop_border))))

        # now read the wordboxes and calculate the thumbnail bounding boxes for them
        note(4, "    for %s:  translation is %f, %f, scaling is %f, %f",
             folder, translation[0], translation[1], scaling[0], scaling[1])

        if update:
            update_metadata(metadata_file,
                            {'big-thumbnail-scaling-factor' : "%f,%f" % scaling,
                             'big-thumbnail-translation-points' : "%f,%f" % translation})

    return translation, scaling
def do_thumbnails (dirpath, output_dir, **params):
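    # Generate the full thumbnail set for the document in 'dirpath'
    # into 'output_dir': split a single document.tiff into pages (or
    # use the page-images directory of PNGs), compute a cropping
    # bounding box over all pages, render big/small page thumbnails and
    # the document icon via create_thumbnail, and record the resulting
    # geometry in metadata.txt.  Temporary documents just get
    # placeholder icons.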
    note(2, "  thumbnailing in %s...", dirpath)
    tmpdir = tempfile.mkdtemp()    # atomic creation, mode 0700
    retval = params.get('returnvalue', false)
    doc_metadata_path = os.path.join(dirpath, "metadata.txt")
    try:
        try:

            md = read_metadata(doc_metadata_path)
            is_temporary_doc = md.get("temporary-contents")
            if is_temporary_doc == "true":
                # temporary -- don't spend much time on this
                create_temporary_icons (md, dirpath, output_dir, params)
                retval = true
                return

            if os.path.exists(os.path.join(dirpath, "document.tiff")):
                # contains one single-page TIFF file
                tiffmaster = os.path.join(tmpdir, "master.tiff")
                split_command = (TIFF_SPLIT_CMD
                                 % (TIFFCP, os.path.join(dirpath, "document.tiff"), tiffmaster,
                                    TIFFSPLIT, tiffmaster, os.path.join(tmpdir, "x")))
                status, output, tsignal = subproc(split_command)
                if status != 0:
                    raise Error ("'%s' signals non-zero exit status %d in %s => %s"
                                 % (split_command, status, dirpath, tmpdir))
                parts_dir = tmpdir
                filecheck_fn = lambda fn: fn[0] == "x"
            elif (os.path.exists(os.path.join(dirpath, "page-images")) and
                  os.path.isdir(os.path.join(dirpath, "page-images"))):
                # contains directory full of PNG page images
                parts_dir = os.path.join(dirpath, "page-images")
                filecheck_fn = lambda fn: (fn.startswith('page') and fn.endswith('.png'))
            else:
                raise Error("No page images for document in %s" % dirpath)

            tiff_parts = os.listdir(parts_dir)
            if len(tiff_parts) < 1:
                raise Error("No pages in split tiff file directory after split!")
            # either a PNG-images or a TIFF split will sort properly in lexicographic order
            tiff_parts.sort()

            # see if there's a document icon legend and info about the DPI of the tiff file
            legend = md.get('document-icon-legend')
            tiff_dpi = int(md.get('images-dpi') or md.get('tiff-dpi') or params.get('images-dpi') or 0)
            page_numbers_v = md.get('page-numbers')
            page_numbers = (page_numbers_v and figure_page_numbers(page_numbers_v, dirpath))
            first_page = int(md.get('first-page-number', 1))
            skips = md.get('document-bbox-pages-to-skip', '')
            if skips:
                parts = string.split(skips, ':')
                bbox_skips = []
                for part in parts:
                    bbox_skips = bbox_skips + map(int, string.split(part, ','))
            else:
                bbox_skips = None

            # figure bounding box for imaged page
            page_count = 0
            bbox = None
            note(2, "    calculating bounding box for large pages...")
            dont_crop = md.get('dont-crop-big-thumbnails', false)
            do_bbox = (AUTO_CROP_BIG_THUMBNAILS and not dont_crop)
            for tiff_part in tiff_parts:
                if not filecheck_fn(tiff_part):
                    continue
                if page_count == 0:
                    # find the width and height of the document
                    docwidth, docheight = figure_doc_size(os.path.join(parts_dir, tiff_part))
                    if not do_bbox:
                        bbox = (0, 0, docwidth, docheight)
                if do_bbox:
                    bbox = figure_bbox (os.path.join(parts_dir, tiff_part), page_count, bbox, bbox_skips)
                if bbox and (bbox[0] == 0) and (bbox[1] == 0) and (bbox[2] >= docwidth) and (bbox[3] >= docheight):
                    # don't bother, there's no area to crop already
                    do_bbox = false
                page_count = page_count + 1
            if page_count == 0:
                raise Error("No pages in split tiff file directory after split!")
            note(2, "      final bbox is %s, page_count is %d", bbox, page_count)

            if USE_VIRTUAL_INK:
                note(2, "      alpha channels will be added to large thumbnails...")

            # now make the thumbnails
            big_thumbnail_size = []
            small_thumbnail_size = []
            icon_size = []
            page_index = 0
            for tiff_part in tiff_parts:
                if not filecheck_fn(tiff_part):
                    note(3, "    skipping %s", tiff_part)
                    continue
                tiff_path = os.path.join(parts_dir, tiff_part)
                if page_numbers:
                    page_no_string = page_numbers.get(page_index)
                else:
                    page_no_string = None
                note (2, "    page %d%s", page_index, (page_no_string and "   (%s)" % page_no_string) or "")
                try:
                    if not create_thumbnail(tiff_path, tiff_dpi, output_dir,
                                            page_index, first_page, page_count, bbox, bbox_skips,
                                            big_thumbnail_size, small_thumbnail_size, icon_size,
                                            params.get('maxwidth'), params.get('maxheight'), params.get('maxscaling'),
                                            params.get('thumbnail_strategy'), legend, page_no_string):
                        raise Error ("Can't create thumbnail for page %d in %s (of %s)" % (page_index, tiff_path, dirpath))
                except Exception, x:
                    doc_id = os.path.split(dirpath)[1]
                    note("exception creating thumbnails for page %d of document %s:\n%s", page_index, doc_id,
                         string.join(traceback.format_exception(*sys.exc_info()), ""))
                    raise AbortDocumentIncorporation(doc_id, str(x))

                if page_index == 0:
                    bt_width = big_thumbnail_size[0]
                    bt_height = big_thumbnail_size[1]
                    st_width = small_thumbnail_size[0]
                    st_height = small_thumbnail_size[1]
                else:
                    bt_width = max(bt_width, big_thumbnail_size[0])
                    bt_height = max(bt_height, big_thumbnail_size[1])
                    st_width = max(st_width, small_thumbnail_size[0])
                    st_height = max(st_height, small_thumbnail_size[1])
                st_scaling = (float(st_width)/float(docwidth) + float(st_height)/float(docheight)) / 2.0
                page_index = page_index + 1

            d = {"page-count" : str(page_count),
                 "tiff-width" : str(docwidth),
                 "images-width" : str(docwidth),
                 "images-size" : "%d,%d" % (docwidth, docheight),
                 "cropping-bounding-box" : "%d,%d;%d,%d" % (bbox),
                 "big-thumbnail-size" : "%s,%s" % (bt_width, bt_height),
                 "small-thumbnail-size" : "%s,%s" % (st_width, st_height),
                 "small-thumbnail-scaling" : "%f" % st_scaling,
                 "icon-size" : "%d,%d" % icon_size[0],
                 "images-height" : str(docheight),
                 "tiff-height" : str(docheight) }

            translation, scaling = thumbnail_translation_and_scaling(dirpath, d, false, true)
            d["big-thumbnail-translation-points"] = "%f,%f" % translation
            d["big-thumbnail-scaling-factor"] = "%f,%f" % scaling
            update_metadata(os.path.join(dirpath, "metadata.txt"), d)

        finally:
            shutil.rmtree(tmpdir)

        # indicate successful completion
        note(2, "  finished.")
        retval = true
    def rip(self, folder, docid):
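        # Ripper for Washington Post articles: if the document's
        # original-url is at www.washingtonpost.com, scrape the
        # headline, byline, content id, and section out of the saved
        # originals/original.html, then merge title, date (taken from
        # the URL), authors, keywords, summary, and categories into
        # metadata.txt.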
        def encodestring(s):
            # WashPost strings have xml char refs, and we want Unicode
            if not s:
                return s

            s = re.sub(r"&#([0-9]+);", lambda x: unichr(int(x.group(1))), s)
            s = re.sub(r"&([a-z]+);", lambda x: htmlentitydefs.name2codepoint(x.group(1)), s)
            return s

        def dequote(s):
            return re.sub(r"\\'", "'", s)

        def catclean(s):
            return re.sub(r"[/,]", "_", s)

        mdpath = os.path.join(folder, "metadata.txt")
        originalspath = os.path.join(folder, "originals", "original.html")
        if not (os.path.exists(mdpath) and os.path.exists(originalspath)):
            return
        md = read_metadata(mdpath)
        url = md.get("original-url")
        if not url:
            return
        host, port, path = parse_URL(url)
        if host != "www.washingtonpost.com":
            return

        # OK, it's from the Post
        new_metadata = MetadataGatherer.parse(originalspath)
        for line in open(originalspath):
            if line.startswith(_HEADLINE):
                line = line[len(_HEADLINE) :].strip("\n")
                t = _TITLEPATTERN.match(line)
                if t:
                    new_metadata["hdl"] = dequote(t.group("title"))
            m = _AUTHORSPATTERN.search(line)
            if m:
                new_metadata["authors"] = dequote(line[len(m.group(0)) :].strip(" ';\n"))
            if line.startswith(_CONTENTID):
                new_metadata["content-id"] = line[len(_CONTENTID) :].strip(" ';\n")
            if line.startswith(_SECTION):
                section = line[len(_SECTION) :].strip(" ';\n")
                i = section.index("'")
                new_metadata["section"] = section[:i]

        if "source" not in md:
            md["source"] = "Washington Post"

        # not all articles have metadata...
        if not ("hdl" in new_metadata):
            note(3, "No metadata in article:  %s", new_metadata)
            return

        md["title"] = encodestring(new_metadata.get("hdl") or md.get("title"))

        if "date" not in md:
            # get the date
            d = _URLDATEPATTERN.match(url)
            if d:
                md["date"] = "%s/%s/%s" % (d.group("month"), d.group("day"), d.group("year"))

        if "authors" not in md:
            # get the byline
            d = new_metadata.get("authors")
            if d:
                md["authors"] = encodestring(d)

        d = new_metadata.get("keywords")
        d0 = md.get("keywords")
        if d and d0:
            d0 = [x.strip() for x in d0.split(",")] + [x.strip() for x in d.split(";")]
        elif d:
            d0 = [x.strip() for x in d.split(";")]
        if d0:
            md["keywords"] = encodestring(",".join(d0))
        if new_metadata.get("description"):
            md["summary"] = encodestring(new_metadata.get("description"))
            md["abstract"] = encodestring(new_metadata.get("description"))
        section = new_metadata.get("section")
        if section:
            c = md.get("categories")
            if c:
                c = [x.strip() for x in c.split(",")]
            else:
                c = []
            c = c + ["article", "Washington Post/%s" % catclean(section)]
            md["categories"] = ",".join(c)
        content_id = new_metadata.get("content-id")
        if content_id:
            md["citation"] = "Washington Post article %s" % content_id
        update_metadata(mdpath, md)
    def update_folder_metadata(self, location, md):
        return update_metadata(self.folder_metadata_path(location), md)
def flesh_out_folder(id, tmpfilename, metadata, repo, unpack_fn, counter):
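    # Complete the incorporation of newly-submitted document 'id':
    # unpack the upload with 'unpack_fn', write the UNPACKED marker
    # (after which incorporation can be restarted), divert the contents
    # into an existing document if replacement-contents-for is set, and
    # otherwise run _finish_inclusion.  AbortDocumentIncorporation
    # unwinds the pending folder and its index entry.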
    try:
        try:
#             note(3, "CODETIMER_ON is %s", CODETIMER_ON)
#             if CODETIMER_ON:
#                 code_timer.Init()
#                 code_timer.CreateTable("uplib")
#                 code_timer.CodeTimerOn()
#                 code_timer.StartInt("newFolder$unpack", "uplib")
#             else:
#                 code_timer.CodeTimerOff()

            if unpack_fn and tmpfilename and os.path.exists(tmpfilename):
                unpack_fn(repo, id, tmpfilename, metadata)

            folderpath = repo.pending_location(id)
            try:
                note("unpacked new folder in %s", folderpath)
                if not sys.platform.lower().startswith("win"):
                    s, o, t = subproc("ls -Rl %s" % folderpath)
                    note("%s\n" % o)

                fp = open(os.path.join(folderpath, "UNPACKED"), 'w')
                fp.flush()
                fp.close()

                # as of this point, we can restart the inclusion of the document

                md = read_metadata(os.path.join(folderpath, "metadata.txt"))
                replacement_id = md.get("replacement-contents-for")
                if replacement_id:
                    if repo.valid_doc_id(replacement_id):
                        # contents to replace another document
                        md["replacement-contents-for"] = ""
                        update_metadata(os.path.join(folderpath, "metadata.txt"), md)
                        note(2, "replacing contents of %s with this data...", replacement_id)
                        existing_document = repo.get_document(replacement_id)
                        new_folder = existing_document.folder()
                        process_folder(repo, replacement_id, folderpath, false, new_folder)
                        _run_rippers(new_folder, repo, replacement_id)
                        existing_document.recache()
                        repo.touch_doc(existing_document)
                        raise AbortDocumentIncorporation(id, "replacement for existing document %s" % replacement_id)
                    else:
                        raise AbortDocumentIncorporation(id, "replacement for non-existent document %s" % replacement_id)

                _finish_inclusion (repo, folderpath, id)

            except:
                type, value, tb = sys.exc_info()
                note("%s", ''.join(traceback.format_exception(type, value, tb)))
                note_error(folderpath, (type, value, tb))
                raise value, None, tb

        except AbortDocumentIncorporation, x:
            # ripper signalled to stop adopting this document, for good
            note(2, "AbortDocumentIncorporation exception on %s:  %s", x.id, x.message)
            if (x.id == id):
                shutil.rmtree(folderpath)
            remove_from_index(repo.index_path(), id)

        except:
            type, value, tb = sys.exc_info()
            note("Exception processing new folder:\n%s", ''.join(traceback.format_exception(type, value, tb)))
    def rip (self, location, doc_id, debug=None):
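        # Find illustrations on the document's page images and record
        # their bounding boxes in metadata.txt as
        # 'illustrations-bounding-boxes', a comma-separated list of
        # colon-joined fields.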

        images = findimages(location, debug)
        val = string.join([string.join(x, ":") for x in images], ',')
        update_metadata(os.path.join(location, "metadata.txt"), { 'illustrations-bounding-boxes' : val })