Example #1
    def load_text(self, p, variant):
        filename = self.cache_dir + self.lang + '/' + str(p.latestRevision())

        if not os.path.exists(filename):
            html = self.get_html(p)
            new_html = common_html.get_head(
                u'TITLE') + u"\n<body>" + html + u'\n</body>\n</html>'

            new_html = new_html.replace(u'&nbsp;', u' ')

            root = etree.fromstring(new_html.encode('utf-8'))
            exclude = set()
            html_id = self.config[variant]['modernize_div_id']

            for it in root.findall(
                    ".//{http://www.w3.org/1999/xhtml}div[@id='%s']" %
                    html_id):
                exclude.add(it)

            text = self.get_etree_text(root, exclude)
            for d in self.config[variant]['transform']:
                text = re.sub(d[0], d[1], text)

            utils.write_file(filename, text)
        else:
            text = utils.read_file(filename)

        return text
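The common_html.get_head helper itself is never shown on this page, but its contract can be inferred from the call sites: it returns everything up to and including </head> (callers concatenate u"\n<body>" themselves), takes a title plus an optional css keyword, and must declare the XHTML namespace, since the examples query the parsed tree with the {http://www.w3.org/1999/xhtml} prefix. A minimal sketch under those assumptions, not the real module:

def get_head(title, css=None):
    # Assumed reconstruction: document prologue up to and including
    # </head>; callers append u"\n<body>" and the closing tags.
    head = u'<?xml version="1.0" encoding="UTF-8"?>\n'
    head += u'<html xmlns="http://www.w3.org/1999/xhtml">\n<head>\n'
    head += u'  <title>%s</title>\n' % title
    if css:
        head += u'  <link rel="stylesheet" href="%s" type="text/css"/>\n' % css
    head += u'</head>'
    return head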
Example #2
def handle_status(params, start_response):

    default_limit = 50
    max_limit = 1000

    state_filter = params.get('filter', '')
    limit = get_int_param(params, 'limit', default_limit, max_limit)
    offset = get_int_param(params, 'offset', 0, None)
    #print >> sys.stderr, params

    db_obj = sge_jobs.DbJob()

    text = common_html.get_head(
        'hocr', css='shared.css').encode('utf-8') + '\n  <body>\n'

    html, jobs = job_table(db_obj, state_filter, limit, offset, default_limit,
                           max_limit)
    text += html

    text += accounting_table(db_obj, jobs, state_filter, limit, offset,
                             default_limit, max_limit)

    text += '  </body>\n</html>'

    start_response('200 OK', [('Content-Type', 'text/html; charset=UTF-8'),
                              ('Content-Length', str(len(text))),
                              ('Access-Control-Allow-Origin', '*')])
    return [text]
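handle_status follows the WSGI calling convention: it receives a params dict and the server's start_response callable, sends the status line and headers through start_response, and returns the body as a list of strings. A hypothetical dispatcher wiring it up (the parse_qsl glue is an assumption; only handle_status comes from the example):

from urlparse import parse_qsl  # Python 2, matching the examples

def application(environ, start_response):
    # Hypothetical glue: turn the query string into the params dict
    # that handle_status expects, then delegate to it.
    params = dict(parse_qsl(environ.get('QUERY_STRING', '')))
    return handle_status(params, start_response)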
Example #3
    def load_text(self, p, variant):
        filename = self.cache_dir + self.lang + '/' + str(p.latestRevision())

        if not os.path.exists(filename):
            html = self.get_html(p)
            new_html = common_html.get_head(u'TITLE') + u"\n<body>" + html + u'\n</body>\n</html>'

            new_html = new_html.replace(u'&nbsp;', u' ')

            root = etree.fromstring(new_html.encode('utf-8'))
            exclude = set()
            html_id = self.config[variant]['modernize_div_id']

            for it in root.findall(".//{http://www.w3.org/1999/xhtml}div[@id='%s']" % html_id):
                exclude.add(it)

            text = self.get_etree_text(root, exclude)
            for d in self.config[variant]['transform']:
                text = re.sub(d[0], d[1], text)

            utils.write_file(filename, text)
        else:
            text = utils.read_file(filename)

        return text
Example #4
def handle_status(params, start_response):

    default_limit = 50
    max_limit = 1000

    state_filter = params.get('filter', '')
    limit = get_int_param(params, 'limit', default_limit, max_limit)
    offset = get_int_param(params, 'offset', 0, None)
    #print >> sys.stderr, params

    db_obj = sge_jobs.DbJob()

    text = common_html.get_head('hocr', css='shared.css').encode('utf-8') + '\n  <body>\n'

    html, jobs = job_table(db_obj, state_filter, limit, offset,
                           default_limit, max_limit)
    text += html

    text += accounting_table(db_obj, jobs, state_filter, limit, offset,
                             default_limit, max_limit)

    text += '  </body>\n</html>'

    start_response('200 OK', [('Content-Type',
                               'text/html; charset=UTF-8'),
                              ('Content-Length', str(len(text))),
                              ('Access-Control-Allow-Origin', '*')])
    return [text]
Example #5
def do_status(queue):
    queue = queue.copy_items(True)

    html = common_html.get_head(u'Verify match')
    html += u"<body><div>The robot is running.<br/><hr/>"
    html += u"<br/>%d jobs in verify match queue.<br/>" % len(queue)
    html += html_for_queue(queue)
    html += u'</div></body></html>'
    return html
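html_for_queue is another helper that never appears on this page; from its call sites it takes the copied queue and returns an HTML fragment. A plausible stand-in (purely an assumption) that renders one job per line:

import cgi

def html_for_queue(queue):
    # Assumed placeholder: render each queued job, HTML-escaped,
    # as an ordered-list item.
    html = u'<ol>'
    for job in queue:
        html += u'<li>%s</li>' % cgi.escape(unicode(job))
    html += u'</ol>'
    return html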
Example #6
def do_status(queue):
    queue = queue.copy_items(True)

    html = common_html.get_head(u'Verify match')
    html += u"<body><div>The robot is running.<br/><hr/>"
    html += u"<br/>%d jobs in verify match queue.<br/>" % len(queue)
    html += html_for_queue(queue)
    html += u'</div></body></html>'
    return html
Example #7
def do_status(queue):
    queue = queue.copy_items(True)

    html = common_html.get_head('Extract text layer')
    html += u"<body><div>The robot is running.<br/><hr/>"
    html += u"<br/>%d jobs in extract queue.<br/>" % len(queue)
    html += html_for_queue(queue)
    html += u'</div></body></html>'
    return html
Example #8
def handle_status(params, start_response):

    text = common_html.get_head("pages without scan", css="shared.css").encode("utf-8") + "\n  <body>\n"

    text += "<h1>OK</h1>"

    text += "  </body>\n</html>"

    return return_response(start_response, text, False, "200 OK", "text/html")
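return_response is only visible through its call sites: it takes start_response, the body, a third flag (its meaning is not recoverable from this page), the status line, and a bare MIME type. A sketch, stated as an assumption, that simply ignores the unknown flag:

def return_response(start_response, text, flag, status, mime_type):
    # Assumed reconstruction; the real helper may use 'flag' (perhaps
    # for compression or caching), which is ignored here.
    start_response(status, [('Content-Type', mime_type + '; charset=UTF-8'),
                            ('Content-Length', str(len(text)))])
    return [text]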
Example #9
def handle_status(params, start_response):

    text = common_html.get_head('modernization', css='shared.css').encode('utf-8') + '\n  <body>\n'

    text += '<h1>OK</h1>'

    text += '  </body>\n</html>'

    return return_response(start_response, text, False, '200 OK', 'text/html')
Example #10
def handle_status(params, start_response):

    text = common_html.get_head(
        'modernization', css='shared.css').encode('utf-8') + '\n  <body>\n'

    text += '<h1>OK</h1>'

    text += '  </body>\n</html>'

    return return_response(start_response, text, False, '200 OK', 'text/html')
Example #11
def do_status(queue):
    queue = queue.copy_items(True)

    html = common_html.get_head('OCR service')
    html += '<body><div>The ocr robot is running.<br /><hr />'
    html += "%d jobs in queue.<br/>" % len(queue)
    html += html_for_queue(queue)
    html += '</div></body></html>'

    return html
Example #12
def do_status(queue):
    queue = queue.copy_items(True)

    html = common_html.get_head('OCR service')
    html += '<body><div>The ocr robot is running.<br /><hr />'
    html += "%d jobs in queue.<br/>" % len(queue)
    html += html_for_queue(queue)
    html += '</div></body></html>'

    return html
Example #13
def do_status():
    m_queue = jobs["match_queue"].copy_items(True)
    s_queue = jobs["split_queue"].copy_items(True)

    html = common_html.get_head("Match and split")

    html += u"<body><div>the robot is running.<br/><hr/>"
    html += u"<br/>%d jobs in match queue.<br/>" % len(m_queue)
    html += html_for_queue(m_queue)
    html += u"<br/>%d jobs in split queue.<br/>" % len(s_queue)
    html += html_for_queue(s_queue)
    html += u"<br/>%(number_of_match_job)d match, %(number_of_split_job)d split since server start<br/>" % jobs
    html += u"</div></body></html>"

    return html
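The final counter line relies on Python's dict-based %-formatting: jobs maps the named keys to their counts (alongside the queue objects themselves). A standalone demonstration with made-up counts:

# Made-up counts, only to show the %(name)d dict lookup used above.
jobs = {'number_of_match_job': 12, 'number_of_split_job': 7}
print u"%(number_of_match_job)d match, %(number_of_split_job)d split" % jobs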
Example #14
def do_status():
    m_queue = jobs['match_queue'].copy_items(True)
    s_queue = jobs['split_queue'].copy_items(True)

    html = common_html.get_head('Match and split')

    html += u"<body><div>the robot is running.<br/><hr/>"
    html += u"<br/>%d jobs in match queue.<br/>" % len(m_queue)
    html += html_for_queue(m_queue)
    html += u"<br/>%d jobs in split queue.<br/>" % len(s_queue)
    html += html_for_queue(s_queue)
    html += u"<br/>%(number_of_match_job)d match, %(number_of_split_job)d split since server start<br/>" % jobs
    html += u'</div></body></html>'

    return html
Example #15
    def parse_global_dict(self, html):
        result = self.default_cache()

        html = common_html.get_head(u'TITLE') + u"\n<body>" + html + u'\n</body>\n</html>'
        root = etree.fromstring(html.encode('utf-8'))
        text = u''
        for it in root.findall(".//{http://www.w3.org/1999/xhtml}li"):
            text += self.get_etree_text(it, set())

        for line in text.split(u'\n'):
            match = re.match(u'^\s*(\S[^: ]*?)(?:\s|&#160;|&nbsp;| )*:\s*([\S].+?)\s*(?:\/\/.*?)?$', line, re.UNICODE)
            if match:
                result[match.group(1)] = match.group(2)

        return result
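The regex accepts dictionary lines of the form "word : replacement", tolerates regular and non-breaking spaces around the colon, and strips an optional "// comment" tail. A quick check on a made-up line (the pattern is the one used in parse_global_dict):

import re

line = u'olde : old  // archaic spelling'
match = re.match(u'^\s*(\S[^: ]*?)(?:\s|&#160;|&nbsp;| )*:\s*([\S].+?)\s*(?:\/\/.*?)?$',
                 line, re.UNICODE)
print match.group(1), u'->', match.group(2)  # olde -> old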
Example #16
    def parse_local_dict(self, variant, html):
        result = self.default_cache()
        html_id = self.config[variant]['modernize_div_id']

        html = html.replace(u'&nbsp;', u' ')

        html = common_html.get_head(u'TITLE') + u"\n<body>" + html + u'\n</body>\n</html>'
        root = etree.fromstring(html.encode('utf-8'))
        text = u''
        for it in root.findall(".//{http://www.w3.org/1999/xhtml}div[@id='%s']" % html_id):
            text += self.get_etree_text(it, set())

        for line in text.split(u'\n'):
            match = re.match(u'^\s*(\S[^: ]*?)(?:\s|&#160;|&nbsp;| )*:\s*([\S].+?)\s*(?:\/\/.*?)?$', line, re.UNICODE)
            if match:
                result[match.group(1)] = match.group(2)

        return result
Example #17
def handle_scan_query(params, start_response):
    text = common_html.get_head("pages without scan", css="shared.css").encode("utf-8") + "\n  <body>\n"

    if params["lang"]:
        try:
            offset = int(params.get("offset", 0))
            limit = min(500, int(params.get("limit", 500)))
            lang = params["lang"]
            conn = db.create_conn(domain=lang, family="wikisource")
            cursor = db.use_db(conn, domain=lang, family="wikisource")
            ns = ws_category.domain_urls[lang][0]
            page_ids = disamb_page(cursor) | page_with_scan(ns, cursor)
            all_p = all_pages(cursor)
            result = [(unicode(x[0], "utf-8"), x[1]) for x in all_p if x[2] not in page_ids]
            text += "Total: " + str(len(result)) + "<br />"
            next_link = prev_next_link(False, len(result), lang, limit, offset)
            prev_link = prev_next_link(True, len(result), lang, limit, offset)
            text += prev_link + "&#160;" + next_link + "<br /><br />"

            result = result[offset : offset + limit]
            for x in result:
                text += (
                    u'<a href="//%s.wikisource.org/wiki/%s">' % (lang, x[0])
                    + x[0].replace("_", " ")
                    + u"</a>, "
                    + str(x[1])
                    + u"<br />"
                )

            text += u"<br />" + prev_link + "&#160;" + next_link
            cursor.close()
            conn.close()
            ret_code = "200 OK"
        except Exception:
            utils.print_traceback()
            ret_code = "500 Internal Server Error"
            text = "<h1>" + ret_code + "</h1>"
    else:
        ret_code = "400 Bad Request"
        text = "<h1>" + ret_code + "</h1>"

    text += "  </body>\n</html>"

    return return_response(start_response, text.encode("utf-8"), False, ret_code, "text/html")
def handle_scan_query(params, start_response):
    text = common_html.get_head(
        'pages without scan',
        css='shared.css').encode('utf-8') + '\n  <body>\n'

    if params['lang']:
        try:
            offset = int(params.get('offset', 0))
            limit = min(500, int(params.get('limit', 500)))
            lang = params['lang']
            conn = db.create_conn(domain=lang, family='wikisource')
            cursor = db.use_db(conn, domain=lang, family='wikisource')
            ns = ws_category.domain_urls[lang][0]
            page_ids = disamb_page(cursor) | page_with_scan(ns, cursor)
            all_p = all_pages(cursor)
            result = [(unicode(x[0], 'utf-8'), x[1]) for x in all_p
                      if x[2] not in page_ids]
            text += 'Total: ' + str(len(result)) + '<br />'
            next_link = prev_next_link(False, len(result), lang, limit, offset)
            prev_link = prev_next_link(True, len(result), lang, limit, offset)
            text += prev_link + '&#160;' + next_link + '<br /><br />'

            result = result[offset:offset + limit]
            for x in result:
                text += u'<a href="//%s.wikisource.org/wiki/%s">' % (
                    lang, x[0]) + x[0].replace('_', ' ') + u'</a>, ' + str(
                        x[1]) + u'<br />'

            text += u'<br />' + prev_link + '&#160;' + next_link
            cursor.close()
            conn.close()
            ret_code = '200 OK'
        except Exception:
            utils.print_traceback()
            ret_code = '500 Internal Server Error'
            text = '<h1>' + ret_code + '</h1>'
    else:
        ret_code = '400 Bad Request'
        text = '<h1>' + ret_code + '</h1>'

    text += '  </body>\n</html>'

    return return_response(start_response, text.encode('utf-8'), False,
                           ret_code, 'text/html')
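prev_next_link also never appears on this page. From the calls above it takes a direction flag, the total result count, the language, and the limit/offset pair, and returns an HTML link (or inert text at the boundaries). One plausible version, stated as an assumption:

def prev_next_link(is_prev, total, lang, limit, offset):
    # Assumed reconstruction of the pagination helper used above.
    if is_prev:
        new_offset = max(0, offset - limit)
        enabled = offset > 0
        label = 'previous'
    else:
        new_offset = offset + limit
        enabled = new_offset < total
        label = 'next'
    if not enabled:
        return label
    return '<a href="?lang=%s&limit=%d&offset=%d">%s</a>' % (
        lang, limit, new_offset, label)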
Example #19
    def suggest_dict(self, title):
        p = self.get_page(title)
        html = self.get_html(p)

        new_html = common_html.get_head(u'TITLE') + u"\n<body>" + html + u'\n</body>\n</html>'
        root = etree.fromstring(new_html.encode('utf-8'))

        exclude = set()

        for variant in self.variants:
            html_id = self.config[variant]['modernize_div_id']

            for it in root.findall(".//{http://www.w3.org/1999/xhtml}div[@id='%s']" % html_id):
                exclude.add(it)

        html_text = self.get_etree_text(root, exclude)

        # result = {
        # 'variant_name_1' : {
        #    'local_dict_used' : [(A, B), ... ],
        #    'suggest_local_dict' : { 'C' : 'D' ... },
        #    'speller_suggest' : [ ( 'E', [ 'G', 'H', ]), ... ]
        #    }
        # 'variant_name_2' : { ... }
        # }
        result = {}

        blacklist = self.load_blacklist()

        for variant in self.variants:
            speller = spell.Speller(self.config[variant]['aspell_lang'])
            cache = self.load_dicts(variant)
            if 'global_dict' in cache:
                global_dict = cache['global_dict'][1]
            else:
                global_dict = []

            other_local_dict = {}
            for key in cache:
                if key != 'global_dict':
                    d = cache[key][1]
                    for words in d:
                        other_local_dict[words] = d[words]

            local_dict = self.parse_local_dict(variant, html)

            text = html_text

            for d in self.config[variant]['transform']:
                text = re.sub(d[0], d[1], text)

            # set of entries used in the local dict; a set because we want
            # to keep the order of local_dict, so instead of storing the repl
            # string here we later iterate the ordered local_dict and check
            # whether each word is present in this set.
            used_local_dict = set()
            # map of entries used in all the other local dicts; these make
            # good suggestions to offer the user
            suggest_local_dict = {}
            # all remaining words; these will be spell-checked to provide an
            # additional set of suggestions
            word_seen = set()

            regex_split = re.compile(u'([' + self.word_chars + u']+)')
            words_list = regex_split.findall(text)
            i = 0
            while True:
                if i >= len(words_list):
                    break

                if words_list[i] in blacklist:
                    i += 1
                    continue

                repl, glb, new_words, num = self.find_repl(words_list, i,
                                                           local_dict,
                                                           global_dict)

                if repl:
                    if not glb:
                        used_local_dict.add(new_words)
                else:
                    # not found in the global or local dict; try all the
                    # other local dicts to get a suggestion.
                    repl, glb, new_words, num = self.find_repl(
                        words_list, i, other_local_dict, {})
                    if repl:
                        suggest_local_dict[new_words] = repl

                if not repl:
                    word_seen.add(words_list[i])
                    i += 1
                else:
                    i += num

            word_seen = [x for x in word_seen if not speller.check(x)]
            speller_suggest = [(x, speller.suggest(x)[:5]) for x in word_seen]

            # local_dict is an ordered dict, so we can emit words in the same
            # order as local_dict; this allows a better wiki diff when a local
            # dict is updated.
            local_dict_used = [(x, local_dict[x]) for x in local_dict
                               if x in used_local_dict]

            # FIXME: for suggest_local_dict, should words already suggested
            # from other local dicts be excluded from the spell-checker pass?

            result[variant] = {}
            result[variant]['local_dict_used'] = local_dict_used
            result[variant]['suggest_local_dict'] = suggest_local_dict.items()
            result[variant]['speller_suggest'] = speller_suggest

        return result
Example #20
    def suggest_dict(self, title):
        p = self.get_page(title)
        html = self.get_html(p)

        new_html = common_html.get_head(
            u'TITLE') + u"\n<body>" + html + u'\n</body>\n</html>'
        root = etree.fromstring(new_html.encode('utf-8'))

        exclude = set()

        for variant in self.variants:
            html_id = self.config[variant]['modernize_div_id']

            for it in root.findall(
                    ".//{http://www.w3.org/1999/xhtml}div[@id='%s']" %
                    html_id):
                exclude.add(it)

        html_text = self.get_etree_text(root, exclude)

        # result = {
        # 'variant_name_1' : {
        #    'local_dict_used' : [(A, B), ... ],
        #    'suggest_local_dict' : { 'C' : 'D' ... },
        #    'speller_suggest' : [ ( 'E', [ 'G', 'H', ]), ... ]
        #    }
        # 'variant_name_2' : { ... }
        # }
        result = {}

        blacklist = self.load_blacklist()

        for variant in self.variants:
            speller = spell.Speller(self.config[variant]['aspell_lang'])
            cache = self.load_dicts(variant)
            if 'global_dict' in cache:
                global_dict = cache['global_dict'][1]
            else:
                global_dict = self.default_cache()

            other_local_dict = {}
            for key in cache:
                if key != 'global_dict':
                    d = cache[key][1]
                    for words in d:
                        other_local_dict[words] = d[words]

            local_dict = self.parse_local_dict(variant, html)

            text = html_text

            for d in self.config[variant]['transform']:
                text = re.sub(d[0], d[1], text)

            # set of entries used in the local dict; a set because we want
            # to keep the order of local_dict, so instead of storing the repl
            # string here we later iterate the ordered local_dict and check
            # whether each word is present in this set.
            used_local_dict = set()
            # map of entries used in all the other local dicts; these make
            # good suggestions to offer the user
            suggest_local_dict = {}
            # all remaining words; these will be spell-checked to provide an
            # additional set of suggestions
            word_seen = set()

            regex_split = re.compile(u'([' + self.word_chars + u']+)')
            words_list = regex_split.findall(text)
            i = 0
            while True:
                if i >= len(words_list):
                    break

                if words_list[i] in blacklist:
                    i += 1
                    continue

                repl, glb, new_words, num = self.find_repl(
                    words_list, i, local_dict, global_dict)

                if repl:
                    if not glb:
                        used_local_dict.add(new_words)
                else:
                    # not found in the global or local dict; try all the
                    # other local dicts to get a suggestion.
                    repl, glb, new_words, num = self.find_repl(
                        words_list, i, other_local_dict, {})
                    if repl:
                        suggest_local_dict[new_words] = repl

                if not repl:
                    word_seen.add(words_list[i])
                    i += 1
                else:
                    i += num

            word_seen = [x for x in word_seen if not speller.check(x)]
            speller_suggest = [(x, speller.suggest(x)[:5]) for x in word_seen]

            # local_dict is an ordered dict, so we can emit words in the same
            # order as local_dict; this allows a better wiki diff when a local
            # dict is updated.
            local_dict_used = [(x, local_dict[x]) for x in local_dict
                               if x in used_local_dict]

            # FIXME: for suggest_local_dict, should words already suggested
            # from other local dicts be excluded from the spell-checker pass?

            result[variant] = {}
            result[variant]['local_dict_used'] = local_dict_used
            result[variant]['suggest_local_dict'] = suggest_local_dict.items()
            result[variant]['speller_suggest'] = speller_suggest

        return result
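find_repl is the helper these two variants lean on most, and it is not shown either. Its call sites fix the return shape: (replacement, came_from_global_dict, matched_key, words_consumed), where the last value lets a multi-word entry consume several tokens. A deliberately simplified single-word sketch (assumption; the real method presumably handles multi-word keys):

def find_repl(words_list, i, local_dict, global_dict):
    # Assumed single-word-only reconstruction; returns
    # (replacement, from_global, matched_key, words_consumed).
    word = words_list[i]
    if word in local_dict:
        return local_dict[word], False, word, 1
    if word in global_dict:
        return global_dict[word], True, word, 1
    return None, False, None, 1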