Exemplo n.º 1
0
def create_html_pages(images_input_dir, blocks_input_dir, output_dir):
    """Render all page images of a document into a single ``index.html``.

    For every image file found in ``images_input_dir`` the matching block
    description is loaded with ``upload_page_data``, image and formula
    blocks are cut out of the page picture, paragraphs are converted with
    ``convert_parag2html_block``, and the resulting fragments are placed
    inside one relatively positioned ``<div>`` per page.

    Args:
        images_input_dir: directory that holds the page images.
        blocks_input_dir: prefix that is concatenated with the image file
            name to locate the page's block data (so it is expected to end
            with a path separator).
        output_dir: directory that receives ``index.html`` and the
            ``imgs/`` sub-directory; created when missing.  Any files
            already present in ``imgs/`` are deleted.

    Raises:
        SystemExit: with status 1 when no image files are found or when
            ``assign_page_numbers`` reports gaps in the page numbering.
    """
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    # The trailing slash is kept on purpose: the helpers below receive this
    # string as-is.
    imgs_dir = output_dir + "/imgs/"
    if not os.path.isdir(imgs_dir):
        os.makedirs(imgs_dir)
    else:
        # Start from an empty image directory on every run.
        for img_fname in os.listdir(imgs_dir):
            os.remove(imgs_dir + img_fname)
    img_rel_dir = "imgs/"

    # "peg" is how the original three-character suffix check accepts ".jpeg".
    image_suffixes = ("bmp", "tif", "png", "svg", "jpg", "peg")
    files = sorted(
        fname for fname in os.listdir(images_input_dir)
        if not fname.startswith(".") and fname.endswith(image_suffixes))
    if not files:
        print("FUCKUP no files in images_input_dir")
        # Non-zero status so callers can tell the run failed; SystemExit is
        # raised directly because exit() is a helper of the site module.
        raise SystemExit(1)

    _, gaps = assign_page_numbers(files, blocks_input_dir)
    if gaps:
        print(gaps)
        print("FUCKUP with page numbers. TERMINATE.")
        raise SystemExit(1)

    html_parts = [
        """<html><head><title></title><meta charset="UTF-8"/></head><body>\n"""
    ]

    for processed, fname in enumerate(files, 1):
        if processed % 20 == 0:
            print("..processed %d" % processed)

        paragraphs, letters, formulas, images, block2text = upload_page_data(
            blocks_input_dir + fname)

        # os.path.join works with or without a trailing slash on the
        # directory, matching what os.listdir() above accepts.
        page_image = Image.open(os.path.join(images_input_dir, fname))
        page_image = page_image.convert('LA')
        page_draw = ImageDraw.Draw(page_image)

        draw_mockup(page_image, paragraphs, letters, formulas, images,
                    block2text)

        # Bounding block of everything detected on the page.
        all_blocks = [line for parag in paragraphs
                      for line in parag] + formulas + images
        page_block = ((0, 100), (0, 100))
        if all_blocks:
            page_block = all_blocks[0]
            for block in all_blocks:
                page_block = merge_blocks(block, page_block)
        # The first dimension always spans the whole image (size[1]); the
        # second dimension is the content extent padded by 10 on each side.
        # NOTE(review): the first pair looks like the vertical range and the
        # second the horizontal one - confirm against merge_blocks.
        page_block = (
            (0, page_image.size[1]),
            (page_block[1][0] - 10, page_block[1][1] + 10),
        )

        html_page_width = 800
        # Scale the height to keep the aspect ratio of page_block.  "/" is
        # kept as in the original (floor division for ints on Python 2).
        html_page_height = html_page_width * (
            page_block[0][1] - page_block[0][0]) / (page_block[1][1] -
                                                    page_block[1][0])

        inner_parts = []

        for img_block in images:
            img_image = extract_block_as_image(img_block, page_image,
                                               page_draw)
            inner_parts.append(
                convert2html_block(img_image, img_block, page_block,
                                   html_page_height, 5, imgs_dir,
                                   img_rel_dir))

        for paragraph in paragraphs:
            if not paragraph:
                # Nothing to render; paragraph[0] below would raise.
                continue
            # Order the lines by their first coordinate, ties broken by the
            # line itself (same ordering as sorting (line[0][0], line)).
            paragraph = sorted(paragraph,
                               key=lambda line: (line[0][0], line))
            paragraph_block = paragraph[0]
            for line_block in paragraph:
                paragraph_block = merge_blocks(line_block, paragraph_block)
            inner_parts.append(
                convert_parag2html_block(paragraph, block2text,
                                         paragraph_block, page_block,
                                         html_page_height, 10, imgs_dir,
                                         img_rel_dir))

        for formula_block in formulas:
            formula_img = extract_block_as_image(formula_block, page_image,
                                                 page_draw)
            inner_parts.append(
                convert2html_block(formula_img, formula_block, page_block,
                                   html_page_height, 5, imgs_dir,
                                   img_rel_dir))

        del page_draw

        html_parts.append(
            """\n<div id="page" style="width: %dpx; height: %dpx; border: 1px solid black; position: relative; " >\n%s\n</div>\n""" %
            (html_page_width, html_page_height, "".join(inner_parts)))

    html_parts.append("</body></html>")
    html = "".join(html_parts)
    # Context manager guarantees the file is flushed and closed.
    with open(output_dir + "/index.html", "w") as index_file:
        index_file.write(html.encode("utf8"))
Exemplo n.º 2
0
    def do_GET(self):
        """Answer a GET request with the layout blocks of one page.

        The request path selects the action by substring match:

        * ``/next_page`` / ``/prev_page`` with a ``page`` parameter move to
          the neighbouring entry of ``pages`` (staying put at either end).
        * ``/page_send`` with a ``page`` parameter stores the blocks given
          in the ``p``, ``f`` and ``i`` parameters (each value is
          ``"x1,x2,y1,y2"``) via ``save_corrected_blocks``.

        Without a recognised action the first entry of ``pages`` is used.
        The selected page's blocks are then loaded (corrected blocks if
        available, otherwise the original data) and written back as
        ``callback(["<image>", [paragraphs], [images], [formulas]])`` with
        every block flattened to four comma-separated numbers.
        """
        # Only rewrite "?callback=" occurrences that follow the first "?";
        # rewriting the first one would leave the URL without any query
        # string and silently drop every parameter, callback included.
        path_part, separator, query_part = self.path.partition("?")
        full_query = (path_part + separator +
                      query_part.replace("?callback=", "&callback="))
        query = urlparse.parse_qs(urlparse.urlparse(full_query).query)
        query_type = full_query.split("?")[0]

        page = pages[0]

        if "/next_page" in query_type and "page" in query:
            page = query["page"][0].split("/")[-1]
            if page in pages:
                cur_index = pages.index(page)
                # Stay on the last page instead of indexing past the end.
                if cur_index < len(pages) - 1:
                    page = pages[cur_index + 1]

        if "/prev_page" in query_type and "page" in query:
            page = query["page"][0].split("/")[-1]
            if page in pages:
                cur_index = pages.index(page)
                if cur_index > 0:
                    page = pages[cur_index - 1]

        if "/page_send" in query_type and "page" in query:
            page = query["page"][0].split("/")[-1]
            paragraphs = []
            formulas = []
            images = []
            for field, array in (("p", paragraphs), ("f", formulas),
                                 ("i", images)):
                for block in query.get(field, []):
                    try:
                        x1, x2, y1, y2 = [int(chunk)
                                          for chunk in block.split(',')]
                    except ValueError:
                        # Non-numeric chunk or wrong number of coordinates:
                        # report the block and skip it.
                        print("fuckup: %s" % block)
                    else:
                        array.append(((x1, x2), (y1, y2)))
            save_corrected_blocks(paragraphs, images, formulas, page)
            print("saved")

        print(page)
        paragraphs_blocks = []
        formulas = []
        images = []
        # load_corrected_blocks is handed the three lists and its result is
        # used as a success flag; on failure fall back to the original data.
        if not load_corrected_blocks(paragraphs_blocks, images, formulas,
                                     page):
            print("load orig")
            paragraphs, _, formulas, images, _ = upload_page_data(
                blocks_source + page)
            paragraphs_blocks = []
            for paragraph in paragraphs:
                # Order lines by their first coordinate, then merge them
                # into one block per paragraph.
                by_height = [(line[0][0], line) for line in paragraph]
                by_height.sort()
                paragraph = [line for _, line in by_height]
                paragraph_block = paragraph[0]
                for line_block in paragraph:
                    paragraph_block = merge_blocks(line_block,
                                                   paragraph_block)
                paragraphs_blocks.append(paragraph_block)

        def flatten(blocks):
            # ((a, b), (c, d)), ... -> "a,b,c,d,..."
            return ",".join([str(coord) for block in blocks
                             for dim in block for coord in dim])

        response = "[\"%s\", [%s], [%s], [%s]]" % (
            img_source + page, flatten(paragraphs_blocks), flatten(images),
            flatten(formulas))

        # NOTE(review): the callback name comes straight from the request
        # and is echoed without validation - consider restricting it to
        # identifier characters.
        function_name = query.get("callback", [""])[0]
        response = function_name + "(" + response + ")"

        response = response.encode("utf8")

        request_headers = self.headers.__str__().replace(
            chr(10), " ").replace(chr(13), " ")
        print("[STAT]\tclient: %s \theaders: %s \tquery: %s" %
              (self.client_address, request_headers, full_query))
        sys.stdout.flush()
        self.send_response(200)
        self.send_header("Content-type", "text/plain")
        self.send_header("Content-Length", str(len(response)))
        self.end_headers()
        self.wfile.write(response)
        print(response)