def load_text(self, p, variant):
    """Return the text of page `p` for `variant`, cached per page revision."""
    filename = self.cache_dir + self.lang + '/' + str(p.latestRevision())
    if not os.path.exists(filename):
        html = self.get_html(p)
        new_html = common_html.get_head(
            u'TITLE') + u"\n<body>" + html + u'\n</body>\n</html>'
        # replace non-breaking spaces with plain spaces
        new_html = new_html.replace(u'\xa0', u' ')
        root = etree.fromstring(new_html.encode('utf-8'))
        exclude = set()
        html_id = self.config[variant]['modernize_div_id']
        for it in root.findall(
                ".//{http://www.w3.org/1999/xhtml}div[@id='%s']" % html_id):
            exclude.add(it)
        text = self.get_etree_text(root, exclude)
        for d in self.config[variant]['transform']:
            text = re.sub(d[0], d[1], text)
        utils.write_file(filename, text)
    else:
        text = utils.read_file(filename)
    return text

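# The 'transform' entries consumed by load_text() are (pattern, replacement)
# pairs applied in order with re.sub().  A minimal, self-contained sketch of
# such a configuration; the two rules below are made-up examples, not the
# real configuration.
import re

transform = [
    (u'\u017f', u's'),       # assumed rule: map long s to a plain s
    (u'[ \t]+\n', u'\n'),    # assumed rule: strip trailing whitespace
]

text = u'Le\u017f fleurs \n'
for pattern, repl in transform:
    text = re.sub(pattern, repl, text)
# text is now u'Les fleurs\n'
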
def handle_status(params, start_response):
    default_limit = 50
    max_limit = 1000
    state_filter = params.get('filter', '')
    limit = get_int_param(params, 'limit', default_limit, max_limit)
    offset = get_int_param(params, 'offset', 0, None)
    #print >> sys.stderr, params
    db_obj = sge_jobs.DbJob()
    text = common_html.get_head(
        'hocr', css='shared.css').encode('utf-8') + '\n <body>\n'
    html, jobs = job_table(db_obj, state_filter, limit, offset,
                           default_limit, max_limit)
    text += html
    text += accounting_table(db_obj, jobs, state_filter, limit, offset,
                             default_limit, max_limit)
    text += ' </body>\n</html>'
    # WSGI header values must be strings, so Content-Length is str()'ed
    start_response('200 OK', [('Content-Type', 'text/html; charset=UTF-8'),
                              ('Content-Length', str(len(text))),
                              ('Access-Control-Allow-Origin', '*')])
    return [text]

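# get_int_param() is not part of this listing.  Judging only from the calls
# above (a params dict, a key, a default and an optional upper bound), a
# plausible sketch could look like this; the clamping details are assumptions.
def get_int_param(params, name, default, maximum):
    try:
        value = int(params.get(name, default))
    except (TypeError, ValueError):
        value = default
    if value < 0:
        value = default
    if maximum is not None and value > maximum:
        value = maximum
    return value
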
def load_text(self, p, variant):
    """Return the text of page `p` for `variant`, cached per page revision."""
    filename = self.cache_dir + self.lang + '/' + str(p.latestRevision())
    if not os.path.exists(filename):
        html = self.get_html(p)
        new_html = common_html.get_head(u'TITLE') + u"\n<body>" + html + u'\n</body>\n</html>'
        # replace non-breaking spaces with plain spaces
        new_html = new_html.replace(u'\xa0', u' ')
        root = etree.fromstring(new_html.encode('utf-8'))
        exclude = set()
        html_id = self.config[variant]['modernize_div_id']
        for it in root.findall(".//{http://www.w3.org/1999/xhtml}div[@id='%s']" % html_id):
            exclude.add(it)
        text = self.get_etree_text(root, exclude)
        for d in self.config[variant]['transform']:
            text = re.sub(d[0], d[1], text)
        utils.write_file(filename, text)
    else:
        text = utils.read_file(filename)
    return text

def handle_status(params, start_response):
    default_limit = 50
    max_limit = 1000
    state_filter = params.get('filter', '')
    limit = get_int_param(params, 'limit', default_limit, max_limit)
    offset = get_int_param(params, 'offset', 0, None)
    #print >> sys.stderr, params
    db_obj = sge_jobs.DbJob()
    text = common_html.get_head('hocr', css = 'shared.css').encode('utf-8') + '\n <body>\n'
    html, jobs = job_table(db_obj, state_filter, limit, offset,
                           default_limit, max_limit)
    text += html
    text += accounting_table(db_obj, jobs, state_filter, limit, offset,
                             default_limit, max_limit)
    text += ' </body>\n</html>'
    start_response('200 OK', [('Content-Type', 'text/html; charset=UTF-8'),
                              ('Content-Length', str(len(text))),
                              ('Access-Control-Allow-Origin', '*')])
    return [ text ]

def do_status(queue):
    queue = queue.copy_items(True)
    html = common_html.get_head(u'Verify match')
    html += u"<body><div>The robot is running.<br/><hr/>"
    html += u"<br/>%d jobs in verify match queue.<br/>" % len(queue)
    html += html_for_queue(queue)
    html += u'</div></body></html>'
    return html

def do_status(queue):
    queue = queue.copy_items(True)
    html = common_html.get_head('Extract text layer')
    html += u"<body><div>The robot is running.<br/><hr/>"
    html += u"<br/>%d jobs in extract queue.<br/>" % len(queue)
    html += html_for_queue(queue)
    html += u'</div></body></html>'
    return html

def handle_status(params, start_response): text = common_html.get_head("pages without scan", css="shared.css").encode("utf-8") + "\n <body>\n" text += "<h1>OK</h1>" text += " </body>\n</html>" return return_response(start_response, text, False, "200 OK", "text/html")
def handle_status(params, start_response):
    text = common_html.get_head('modernization', css = 'shared.css').encode('utf-8') + '\n <body>\n'
    text += '<h1>OK</h1>'
    text += ' </body>\n</html>'
    return return_response(start_response, text, False, '200 OK', 'text/html')

def handle_status(params, start_response):
    text = common_html.get_head(
        'modernization', css='shared.css').encode('utf-8') + '\n <body>\n'
    text += '<h1>OK</h1>'
    text += ' </body>\n</html>'
    return return_response(start_response, text, False, '200 OK', 'text/html')

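# return_response() is used by several handlers in this listing but is not
# shown.  A rough sketch consistent with the call sites (start_response, body,
# a flag whose purpose is not visible here, status, content type); everything
# beyond the call signature is an assumption.
def return_response(start_response, text, flag, status, content_type):
    # the flag parameter's purpose is not visible in this listing; ignored here
    start_response(status, [('Content-Type', content_type),
                            ('Content-Length', str(len(text)))])
    return [text]
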
def do_status(queue):
    queue = queue.copy_items(True)
    html = common_html.get_head('OCR service')
    html += '<body><div>The ocr robot is running.<br /><hr />'
    html += "%d jobs in queue.<br/>" % len(queue)
    html += html_for_queue(queue)
    html += '</div></body></html>'
    return html

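# html_for_queue() is not included in this listing; all the do_status()
# variants concatenate its return value into their HTML, so it presumably
# renders the queued jobs as markup.  A rough sketch under that assumption:
import cgi

def html_for_queue(queue):
    html = u'<ol>'
    for job in queue:
        # the real job objects are richer; unicode() keeps the sketch generic
        html += u'<li>%s</li>' % cgi.escape(unicode(job))
    html += u'</ol>'
    return html
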
def do_status(): m_queue = jobs["match_queue"].copy_items(True) s_queue = jobs["split_queue"].copy_items(True) html = common_html.get_head("Match and split") html += u"<body><div>the robot is running.<br/><hr/>" html += u"<br/>%d jobs in match queue.<br/>" % len(m_queue) html += html_for_queue(m_queue) html += u"<br/>%d jobs in split queue.<br/>" % len(s_queue) html += html_for_queue(s_queue) html += u"<br/>%(number_of_match_job)d match, %(number_of_split_job)d split since server start<br/>" % jobs html += u"</div></body></html>" return html
def do_status():
    m_queue = jobs['match_queue'].copy_items(True)
    s_queue = jobs['split_queue'].copy_items(True)
    html = common_html.get_head('Match and split')
    html += u"<body><div>the robot is running.<br/><hr/>"
    html += u"<br/>%d jobs in match queue.<br/>" % len(m_queue)
    html += html_for_queue(m_queue)
    html += u"<br/>%d jobs in split queue.<br/>" % len(s_queue)
    html += html_for_queue(s_queue)
    html += u"<br/>%(number_of_match_job)d match, %(number_of_split_job)d split since server start<br/>" % jobs
    html += u'</div></body></html>'
    return html

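# do_status() above reads a module-level `jobs` mapping.  A minimal sketch of
# the structure it needs: two queue-like objects plus the counters used by the
# "%(number_of_match_job)d match, %(number_of_split_job)d split" format
# string.  FakeQueue is a stand-in, not the real job queue.
class FakeQueue(object):
    def __init__(self, items=None):
        self._items = items or []

    def copy_items(self, sorted_copy):
        # the real queue presumably returns a snapshot of its pending jobs
        return list(self._items)

jobs = {
    'match_queue': FakeQueue(),
    'split_queue': FakeQueue(),
    'number_of_match_job': 0,
    'number_of_split_job': 0,
}
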
def parse_global_dict(self, html):
    result = self.default_cache()
    html = common_html.get_head(u'TITLE') + u"\n<body>" + html + u'\n</body>\n</html>'
    root = etree.fromstring(html.encode('utf-8'))
    text = u''
    for it in root.findall(".//{http://www.w3.org/1999/xhtml}li"):
        text += self.get_etree_text(it, set())
    # each line is expected to hold one "old : new" entry, optionally
    # followed by a "//" comment
    for line in text.split(u'\n'):
        match = re.match(u'^\s*(\S[^: ]*?)(?:\s| | | )*:\s*([\S].+?)\s*(?:\/\/.*?)?$',
                         line, re.UNICODE)
        if match:
            result[match.group(1)] = match.group(2)
    return result

def parse_local_dict(self, variant, html):
    result = self.default_cache()
    html_id = self.config[variant]['modernize_div_id']
    # replace non-breaking spaces with plain spaces
    html = html.replace(u'\xa0', u' ')
    html = common_html.get_head(u'TITLE') + u"\n<body>" + html + u'\n</body>\n</html>'
    root = etree.fromstring(html.encode('utf-8'))
    text = u''
    for it in root.findall(".//{http://www.w3.org/1999/xhtml}div[@id='%s']" % html_id):
        text += self.get_etree_text(it, set())
    # each line is expected to hold one "old : new" entry, optionally
    # followed by a "//" comment
    for line in text.split(u'\n'):
        match = re.match(u'^\s*(\S[^: ]*?)(?:\s| | | )*:\s*([\S].+?)\s*(?:\/\/.*?)?$',
                         line, re.UNICODE)
        if match:
            result[match.group(1)] = match.group(2)
    return result

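# Both parsers above expect one "old : new" entry per line, optionally
# followed by a "//" comment.  A simplified, standalone version of the
# pattern, with two made-up entries, to show what ends up in the result dict:
import re

entry_re = re.compile(u'^\\s*(\\S[^: ]*?)\\s*:\\s*(\\S.+?)\\s*(?://.*)?$', re.UNICODE)

result = {}
for line in [u'avoit : avait', u'foy : foi // old spelling']:
    m = entry_re.match(line)
    if m:
        result[m.group(1)] = m.group(2)
# result == {u'avoit': u'avait', u'foy': u'foi'}
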
def handle_scan_query(params, start_response): text = common_html.get_head("pages without scan", css="shared.css").encode("utf-8") + "\n <body>\n" if params["lang"]: try: offset = int(params.get("offset", 0)) limit = min(500, int(params.get("limit", 500))) lang = params["lang"] conn = db.create_conn(domain=lang, family="wikisource") cursor = db.use_db(conn, domain=lang, family="wikisource") ns = ws_category.domain_urls[lang][0] page_ids = disamb_page(cursor) | page_with_scan(ns, cursor) all_p = all_pages(cursor) result = [(unicode(x[0], "utf-8"), x[1]) for x in all_p if x[2] not in page_ids] text += "Total: " + str(len(result)) + "<br />" next_link = prev_next_link(False, len(result), lang, limit, offset) prev_link = prev_next_link(True, len(result), lang, limit, offset) text += prev_link + " " + next_link + "<br /><br />" result = result[offset : offset + limit] for x in result: text += ( u'<a href="//%s.wikisource.org/wiki/%s">' % (lang, x[0]) + x[0].replace("_", " ") + u"</a>, " + str(x[1]) + u"<br />" ) text += u"<br />" + prev_link + " " + next_link cursor.close() conn.close() ret_code = "200 OK" except: utils.print_traceback() ret_code = "500 Internal Server Error" text = "<h1>" + ret_code + "</h1>" else: ret_code = "400 Bad Request" text = "<h1>" + ret_code + "</h1>" text += " </body>\n</html>" return return_response(start_response, text.encode("utf-8"), False, ret_code, "text/html")
def handle_scan_query(params, start_response):
    text = common_html.get_head(
        'pages without scan', css='shared.css').encode('utf-8') + '\n <body>\n'
    if params['lang']:
        try:
            offset = int(params.get('offset', 0))
            limit = min(500, int(params.get('limit', 500)))
            lang = params['lang']
            conn = db.create_conn(domain=lang, family='wikisource')
            cursor = db.use_db(conn, domain=lang, family='wikisource')
            ns = ws_category.domain_urls[lang][0]
            page_ids = disamb_page(cursor) | page_with_scan(ns, cursor)
            all_p = all_pages(cursor)
            result = [(unicode(x[0], 'utf-8'), x[1])
                      for x in all_p if x[2] not in page_ids]
            text += 'Total: ' + str(len(result)) + '<br />'
            next_link = prev_next_link(False, len(result), lang, limit, offset)
            prev_link = prev_next_link(True, len(result), lang, limit, offset)
            text += prev_link + ' ' + next_link + '<br /><br />'
            result = result[offset:offset + limit]
            for x in result:
                text += u'<a href="//%s.wikisource.org/wiki/%s">' % (
                    lang, x[0]) + x[0].replace('_', ' ') + u'</a>, ' + str(
                    x[1]) + u'<br />'
            text += u'<br />' + prev_link + ' ' + next_link
            cursor.close()
            conn.close()
            ret_code = '200 OK'
        except:
            utils.print_traceback()
            ret_code = '500 Internal Server Error'
            text = '<h1>' + ret_code + '</h1>'
    else:
        ret_code = '400 Bad Request'
        text = '<h1>' + ret_code + '</h1>'
    text += ' </body>\n</html>'
    return return_response(start_response, text.encode('utf-8'), False,
                           ret_code, 'text/html')

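# prev_next_link() is not shown in this listing.  From its call sites
# (direction flag, total row count, lang, limit, offset) it presumably builds
# the paging links used above; a rough sketch under those assumptions, with
# an invented URL layout and labels:
def prev_next_link(is_prev, total, lang, limit, offset):
    if is_prev:
        new_offset = max(0, offset - limit)
        label = u'previous'
    else:
        new_offset = offset + limit
        label = u'next'
    if (is_prev and offset == 0) or (not is_prev and new_offset >= total):
        return label  # nothing to page to in that direction
    return u'<a href="?lang=%s&limit=%d&offset=%d">%s</a>' % (
        lang, limit, new_offset, label)
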
def suggest_dict(self, title):
    """Build per-variant modernization suggestions for the page `title`."""
    p = self.get_page(title)
    html = self.get_html(p)
    new_html = common_html.get_head(u'TITLE') + u"\n<body>" + html + u'\n</body>\n</html>'
    root = etree.fromstring(new_html.encode('utf-8'))
    exclude = set()
    for variant in self.variants:
        html_id = self.config[variant]['modernize_div_id']
        for it in root.findall(".//{http://www.w3.org/1999/xhtml}div[@id='%s']" % html_id):
            exclude.add(it)
    html_text = self.get_etree_text(root, exclude)

    # result = {
    #     'variant_name_1' : {
    #         'local_dict_used' : [(A, B), ... ],
    #         'suggest_local_dict' : { 'C' : 'D' ... },
    #         'speller_suggest' : [ ( 'E', [ 'G', 'H', ]), ... ]
    #     }
    #     'variant_name_2' : { ... }
    # }
    result = {}
    blacklist = self.load_blacklist()
    for variant in self.variants:
        speller = spell.Speller(self.config[variant]['aspell_lang'])
        cache = self.load_dicts(variant)
        if 'global_dict' in cache:
            global_dict = cache['global_dict'][1]
        else:
            global_dict = []
        other_local_dict = {}
        for key in cache:
            if key != 'global_dict':
                d = cache[key][1]
                for words in d:
                    other_local_dict[words] = d[words]
        local_dict = self.parse_local_dict(variant, html)
        text = html_text
        for d in self.config[variant]['transform']:
            text = re.sub(d[0], d[1], text)

        # set of entries used from the local dict; a set because we want to
        # keep the order of local_dict, so we don't store the repl string
        # here but iterate the ordered local_dict and check whether a word
        # is present in this set.
        used_local_dict = set()
        # map of entries used in all the other local dicts, good suggestions
        # to give to the user
        suggest_local_dict = {}
        # all other words; these will be spell-checked to provide an
        # additional set of suggestions
        word_seen = set()

        regex_split = re.compile(u'([' + self.word_chars + u']+)')
        words_list = regex_split.findall(text)
        i = 0
        while True:
            if i >= len(words_list):
                break
            if words_list[i] in blacklist:
                i += 1
                continue
            repl, glb, new_words, num = self.find_repl(words_list, i, local_dict, global_dict)
            if repl:
                if not glb:
                    used_local_dict.add(new_words)
            else:
                # not found in the global or local dict, try all the other
                # local dicts to get a suggestion.
                repl, glb, new_words, num = self.find_repl(words_list, i, other_local_dict, {})
                if repl:
                    suggest_local_dict[new_words] = repl
            if not repl:
                word_seen.add(words_list[i])
                i += 1
            else:
                i += num

        word_seen = [x for x in word_seen if not speller.check(x)]
        speller_suggest = [(x, speller.suggest(x)[:5]) for x in word_seen]

        # local_dict is an ordered dict, so we can put words in the same
        # order as local_dict; this allows better wiki diffs when a local
        # dict is updated.
        local_dict_used = [ (x, local_dict[x]) for x in local_dict if x in used_local_dict ]

        # FIXME: for suggest_local_dict, must we remove suggested words
        # from other local dict but working word for the check speller?
        result[variant] = {}
        result[variant]['local_dict_used'] = local_dict_used
        result[variant]['suggest_local_dict'] = suggest_local_dict.items()
        result[variant]['speller_suggest'] = speller_suggest

    return result

def suggest_dict(self, title):
    """Build per-variant modernization suggestions for the page `title`."""
    p = self.get_page(title)
    html = self.get_html(p)
    new_html = common_html.get_head(
        u'TITLE') + u"\n<body>" + html + u'\n</body>\n</html>'
    root = etree.fromstring(new_html.encode('utf-8'))
    exclude = set()
    for variant in self.variants:
        html_id = self.config[variant]['modernize_div_id']
        for it in root.findall(
                ".//{http://www.w3.org/1999/xhtml}div[@id='%s']" % html_id):
            exclude.add(it)
    html_text = self.get_etree_text(root, exclude)

    # result = {
    #     'variant_name_1' : {
    #         'local_dict_used' : [(A, B), ... ],
    #         'suggest_local_dict' : { 'C' : 'D' ... },
    #         'speller_suggest' : [ ( 'E', [ 'G', 'H', ]), ... ]
    #     }
    #     'variant_name_2' : { ... }
    # }
    result = {}
    blacklist = self.load_blacklist()
    for variant in self.variants:
        speller = spell.Speller(self.config[variant]['aspell_lang'])
        cache = self.load_dicts(variant)
        if 'global_dict' in cache:
            global_dict = cache['global_dict'][1]
        else:
            global_dict = self.default_cache()
        other_local_dict = {}
        for key in cache:
            if key != 'global_dict':
                d = cache[key][1]
                for words in d:
                    other_local_dict[words] = d[words]
        local_dict = self.parse_local_dict(variant, html)
        text = html_text
        for d in self.config[variant]['transform']:
            text = re.sub(d[0], d[1], text)

        # set of entries used from the local dict; a set because we want to
        # keep the order of local_dict, so we don't store the repl string
        # here but iterate the ordered local_dict and check whether a word
        # is present in this set.
        used_local_dict = set()
        # map of entries used in all the other local dicts, good suggestions
        # to give to the user
        suggest_local_dict = {}
        # all other words; these will be spell-checked to provide an
        # additional set of suggestions
        word_seen = set()

        regex_split = re.compile(u'([' + self.word_chars + u']+)')
        words_list = regex_split.findall(text)
        i = 0
        while True:
            if i >= len(words_list):
                break
            if words_list[i] in blacklist:
                i += 1
                continue
            repl, glb, new_words, num = self.find_repl(
                words_list, i, local_dict, global_dict)
            if repl:
                if not glb:
                    used_local_dict.add(new_words)
            else:
                # not found in the global or local dict, try all the other
                # local dicts to get a suggestion.
                repl, glb, new_words, num = self.find_repl(
                    words_list, i, other_local_dict, {})
                if repl:
                    suggest_local_dict[new_words] = repl
            if not repl:
                word_seen.add(words_list[i])
                i += 1
            else:
                i += num

        word_seen = [x for x in word_seen if not speller.check(x)]
        speller_suggest = [(x, speller.suggest(x)[:5]) for x in word_seen]

        # local_dict is an ordered dict, so we can put words in the same
        # order as local_dict; this allows better wiki diffs when a local
        # dict is updated.
        local_dict_used = [(x, local_dict[x]) for x in local_dict
                           if x in used_local_dict]

        # FIXME: for suggest_local_dict, must we remove suggested words
        # from other local dict but working word for the check speller?
        result[variant] = {}
        result[variant]['local_dict_used'] = local_dict_used
        result[variant]['suggest_local_dict'] = suggest_local_dict.items()
        result[variant]['speller_suggest'] = speller_suggest

    return result

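# A hypothetical helper showing how a caller might walk the value returned by
# suggest_dict(); the keys follow the structure documented in the comment at
# the top of that function.  Nothing here comes from the real callers.
def print_suggestions(result):
    for variant in result:
        data = result[variant]
        for word, repl in data['local_dict_used']:
            print(u'%s -> %s (from the local dict)' % (word, repl))
        for word, repl in data['suggest_local_dict']:
            print(u'%s -> %s (suggested by another local dict)' % (word, repl))
        for word, suggestions in data['speller_suggest']:
            print(u'%s: %s' % (word, u', '.join(suggestions)))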