def read_djvu(book_name, cached_text, datas, opt):
    """Load the djvu text layer of *book_name* and store it into *datas*.

    Each page is appended under its 1-based page number in *datas*
    (a dict of page number -> list of page texts), with any
    <noinclude>...</noinclude> sections stripped first.
    """
    import align

    pages = align.get_djvu(cached_text, opt.site, book_name, True)
    noinclude_pattern = u'(?ms)<noinclude>(.*?)</noinclude>'
    for page_number, page_text in enumerate(pages, 1):
        cleaned = re.sub(noinclude_pattern, u'', page_text)
        datas.setdefault(page_number, []).append(cleaned)
def do_extract(mysite, maintitle, user, codelang, cache): prefix = unicode(page_prefixes['wikisource'].get(codelang), 'utf-8') if not prefix: return ret_val(E_ERROR, "no prefix") djvuname = maintitle.replace(u' ', u'_') print djvuname.encode('utf-8') text_layer = align.get_djvu(cache, mysite, djvuname, True) if not text_layer: return ret_val(E_ERROR, "unable to retrieve text layer") text = u'' for pos, page_text in enumerate(text_layer): text += u'==[[' + prefix + u':' + maintitle + u'/' + unicode(pos+1) + u']]==\n' text += page_text + u'\n' page = pywikibot.Page(mysite, u'User:'******'/Text') safe_put(page, text, comment = u'extract text') return ret_val(E_OK, "")
def do_match(mysite, maintitle, user, codelang):
    """Match the wikitext of *maintitle* against a djvu text layer.

    Two page formats are handled:
      * pages containing {{R2Mondes|...}} templates: the text is rewritten
        page by page, cleaned up typographically (French spacing rules)
        and queued on the split queue;
      * pages containing a ==__MATCH__:[[<prefix>:<file>/<page>]]== tag:
        the text after the tag is matched through align.do_match().

    Returns a ret_val()-style dict.  NOTE: the R2Mondes branch returns
    E_ERROR with "ok : transfert en cours." as its *success* answer
    (flagged by the FIXME below as an abuse of E_ERROR).
    """
    prefix = page_prefixes["wikisource"].get(codelang)
    if not prefix:
        return ret_val(E_ERROR, "no prefix")
    page = pywikibot.Page(mysite, maintitle)
    try:
        text = page.get()
    except:
        # any retrieval failure (missing page, network error...) is
        # reported uniformly; the bare except is deliberate best-effort
        return ret_val(E_ERROR, "failed to get page")
    if text.find("{{R2Mondes") != -1:
        # ---- R2Mondes branch ----
        # repl() (defined elsewhere in the file) reads/writes pl_dict;
        # reset the cache for this run.
        global pl_dict
        pl_dict = {}
        p0 = re.compile("\{\{R2Mondes\|(\d+)\|(\d+)\|(\d+)\}\}\s*\n")
        try:
            new_text = p0.sub(repl, text)
        except pywikibot.NoPage:
            return ret_val(E_ERROR, "Erreur : impossible de trouver l'index")
        # Split on the ==[[Page:...]]== headers produced by the
        # substitution; bl alternates [head, title1, body1, title2, ...].
        p = re.compile("==\[\[Page:([^=]+)\]\]==\n")
        cache = lifo_cache.LifoCache("match_and_split_text_layer")
        bl = p.split(new_text)
        for i in range(len(bl) / 2):
            title = bl[i * 2 + 1]
            content = bl[i * 2 + 2]
            # title looks like "<djvu filename>/<page number>"
            filename, pagenum = title.split("/")
            # refresh the cached text layer only on the first iteration
            if i == 0:
                cached_text = align.get_djvu(cache, mysite, filename, True)
            else:
                cached_text = align.get_djvu(cache, mysite, filename, False)
            if not cached_text:
                return ret_val(E_ERROR, "Erreur : fichier absent")
            if content.find("R2Mondes") != -1:
                # a template survived the substitution: malformed syntax;
                # re-split the original text to name the offending page
                p0 = re.compile("\{\{R2Mondes\|\d+\|\d+\|(\d+)\}\}\s*\n")
                bl0 = p0.split(text)
                title0 = bl0[i * 2 + 1].encode("utf8")
                return ret_val(E_ERROR, "Erreur : Syntaxe 'R2Mondes' incorrecte, dans la page " + title0)
            r = align.match_page(content, cached_text[int(pagenum) - 1])
            print "%s %s : %f" % (filename, pagenum, r)
            # r is presumably a similarity score in [0, 1]; below 0.1 the
            # page text and the djvu layer are considered unrelated
            if r < 0.1:
                return ret_val(E_ERROR, "Erreur : Le texte ne correspond pas, page %s" % pagenum)
        # the page is ok
        # French typography clean-up: spacing around punctuation, ellipsis,
        # indentation markers.  The substitutions are order-dependent; do
        # not reorder them.
        new_text = re.sub(u"<references[ ]*/>", u"", new_text)
        new_text = re.sub(u"[ ]([,])", u"\\1", new_text)
        new_text = re.sub(u"([^.])[ ]([,.])", u"\\1\\2", new_text)
        new_text = re.sub(u"\.\.\.", u"…", new_text)
        new_text = re.sub(u"([^ \s])([;:!?])", u"\\1 \\2", new_text)
        new_text = re.sub(u"([«;:!?])([^ \s…])", u"\\1 \\2", new_text)
        # separated from the previous regexp else "word!»" overlap
        new_text = re.sub(u"([^ \s])([»])", u"\\1 \\2", new_text)
        # workaround some buggy text
        new_text = re.sub(u"([;:!?»]) \n", u"\\1\n", new_text)
        new_text = re.sub(u"([;:!?»])''([ \n])", u"\\1''\\2", new_text)  # < ><space>
        # new_text = re.sub(u' ([;:!?»])', u' \\1', new_text)
        # new_text = re.sub(u' ([;:!?»])', u' \\1', new_text)
        new_text = re.sub(u"([;:!?»]) <br />", u"\\1<br />", new_text)
        new_text = new_text.replace(u"Page : ", u"Page:")
        new_text = new_text.replace(u"\n: ", u"\n:")
        new_text = new_text.replace(u"\n:: ", u"\n::")
        new_text = new_text.replace(u"\n::: ", u"\n:::")
        new_text = new_text.replace(u"\n:::: ", u"\n::::")
        new_text = new_text.replace(u"\n::::: ", u"\n:::::")
        # "1er janvier" -> "1{{er}} janvier" (French ordinal template)
        new_text = re.sub(
            u"1er (janvier|février|avril|mars|mai|juin|juillet|août|septembre|octobre|novembre|décembre)",
            u"1{{er}} \\1",
            new_text,
        )
        new_text = re.sub(u"([0-9])e ", u"\\1{{e}} ", new_text)
        # text = re.sub(u'([;:!?»]) <div>\n', u'\\1\n', new_text)
        # try to move the title inside the M&S
        match_title = re.search(u"{{[Jj]ournal[ ]*\|*(.*?)\|", new_text)
        if match_title:
            # insert a centered title right after the first == heading
            pos = re.search(u"==(.*?)==", new_text)
            if pos:
                new_text = (
                    new_text[0 : pos.end(0)]
                    + u"\n{{c|"
                    + match_title.group(1)
                    + u"|fs=140%}}\n\n\n"
                    + new_text[pos.end(0) :]
                )
        safe_put(page, new_text, user + ": match")
        jobs["number_of_split_job"] += 1
        # FIXME: can we pass the request here and use a callback in the js?
        # FIXME: server is None?
        jobs["split_queue"].put(maintitle, codelang, user, time.time(), None, None, None)
        # FIXME: that's an abuse of E_ERROR
        return ret_val(E_ERROR, "ok : transfert en cours.")
    # ---- __MATCH__ tag branch ----
    prefix = prefix.decode("utf-8")
    p = re.compile("==__MATCH__:\[\[" + prefix + ":(.*?)/(\d+)\]\]==")
    m = re.search(p, text)
    if m:
        djvuname = m.group(1)
        number = m.group(2)
        # everything before the tag is kept untouched as "head"
        pos = text.find(m.group(0))
        head = text[:pos]
        text = text[pos + len(m.group(0)) :]
    else:
        return ret_val(E_ERROR, "match tag not found")
    pywikibot.output(djvuname + " " + number)
    try:
        number = int(number)
    except:
        return ret_val(E_ERROR, "illformed __MATCH__: no page number ?")
    cache = lifo_cache.LifoCache("match_and_split_text_layer")
    cached_text = align.get_djvu(cache, mysite, djvuname, True)
    if not cached_text:
        return ret_val(E_ERROR, "unable to read djvu, if the File: exists, please retry")
    data = align.do_match(text, cached_text, djvuname, number, verbose=False, prefix=prefix)
    if not data["error"]:
        # on success, save the matched text and blank it in the answer so
        # the caller does not ship the full page text back
        safe_put(page, head + data["text"], user + ": match")
        data["text"] = ""
    return data
def do_match(mysite, maintitle, user, codelang):
    """Match the wikitext of *maintitle* against a djvu text layer.

    NOTE(review): this is a semantically identical duplicate of the
    do_match() defined earlier in this file (only quote style differs).
    Being defined later, THIS definition shadows the first one at import
    time — likely a merge/paste artifact; one of the two should be
    removed.

    Handles {{R2Mondes|...}} pages and ==__MATCH__:[[...]]== tags; see
    the duplicate above.  Returns a ret_val()-style dict.
    """
    prefix = page_prefixes['wikisource'].get(codelang)
    if not prefix:
        return ret_val(E_ERROR, "no prefix")
    page = pywikibot.Page(mysite, maintitle)
    try:
        text = page.get()
    except:
        # deliberate best-effort: any retrieval failure reported uniformly
        return ret_val(E_ERROR, "failed to get page")
    if text.find("{{R2Mondes") != -1:
        # ---- R2Mondes branch ----
        # repl() (defined elsewhere) reads/writes pl_dict; reset it here.
        global pl_dict
        pl_dict = {}
        p0 = re.compile("\{\{R2Mondes\|(\d+)\|(\d+)\|(\d+)\}\}\s*\n")
        try:
            new_text = p0.sub(repl, text)
        except pywikibot.NoPage:
            return ret_val(E_ERROR, "Erreur : impossible de trouver l'index")
        # bl alternates [head, title1, body1, title2, body2, ...]
        p = re.compile('==\[\[Page:([^=]+)\]\]==\n')
        cache = lifo_cache.LifoCache('match_and_split_text_layer')
        bl = p.split(new_text)
        for i in range(len(bl) / 2):
            title = bl[i * 2 + 1]
            content = bl[i * 2 + 2]
            # title looks like "<djvu filename>/<page number>"
            filename, pagenum = title.split('/')
            # refresh the cached text layer only on the first iteration
            if i == 0:
                cached_text = align.get_djvu(cache, mysite, filename, True)
            else:
                cached_text = align.get_djvu(cache, mysite, filename, False)
            if not cached_text:
                return ret_val(E_ERROR, "Erreur : fichier absent")
            if content.find("R2Mondes") != -1:
                # a template survived the substitution: malformed syntax
                p0 = re.compile("\{\{R2Mondes\|\d+\|\d+\|(\d+)\}\}\s*\n")
                bl0 = p0.split(text)
                title0 = bl0[i * 2 + 1].encode("utf8")
                return ret_val(
                    E_ERROR,
                    "Erreur : Syntaxe 'R2Mondes' incorrecte, dans la page " + title0)
            r = align.match_page(content, cached_text[int(pagenum) - 1])
            print "%s %s : %f" % (filename, pagenum, r)
            # below 0.1 the page text and djvu layer are considered unrelated
            if r < 0.1:
                return ret_val(
                    E_ERROR,
                    "Erreur : Le texte ne correspond pas, page %s" % pagenum)
        #the page is ok
        # French typography clean-up; the substitutions are order-dependent.
        new_text = re.sub(u'<references[ ]*/>', u'', new_text)
        new_text = re.sub(u'[ ]([,])', u'\\1', new_text)
        new_text = re.sub(u'([^.])[ ]([,.])', u'\\1\\2', new_text)
        new_text = re.sub(u'\.\.\.', u'…', new_text)
        new_text = re.sub(u'([^ \s])([;:!?])', u'\\1 \\2', new_text)
        new_text = re.sub(u'([«;:!?])([^ \s…])', u'\\1 \\2', new_text)
        # separated from the previous regexp else "word!»" overlap
        new_text = re.sub(u'([^ \s])([»])', u'\\1 \\2', new_text)
        # workaround some buggy text
        new_text = re.sub(u'([;:!?»]) \n', u'\\1\n', new_text)
        new_text = re.sub(u'([;:!?»])\'\'([ \n])', u'\\1\'\'\\2', new_text)  # < ><space>
        #new_text = re.sub(u' ([;:!?»])', u' \\1', new_text)
        #new_text = re.sub(u' ([;:!?»])', u' \\1', new_text)
        new_text = re.sub(u'([;:!?»]) <br />', u'\\1<br />', new_text)
        new_text = new_text.replace(u'Page : ', u'Page:')
        new_text = new_text.replace(u'\n: ', u'\n:')
        new_text = new_text.replace(u'\n:: ', u'\n::')
        new_text = new_text.replace(u'\n::: ', u'\n:::')
        new_text = new_text.replace(u'\n:::: ', u'\n::::')
        new_text = new_text.replace(u'\n::::: ', u'\n:::::')
        # "1er janvier" -> "1{{er}} janvier" (French ordinal template)
        new_text = re.sub(
            u'1er (janvier|février|avril|mars|mai|juin|juillet|août|septembre|octobre|novembre|décembre)',
            u'1{{er}} \\1', new_text)
        new_text = re.sub(u'([0-9])e ', u'\\1{{e}} ', new_text)
        #text = re.sub(u'([;:!?»]) <div>\n', u'\\1\n', new_text)
        # try to move the title inside the M&S
        match_title = re.search(u"{{[Jj]ournal[ ]*\|*(.*?)\|", new_text)
        if match_title:
            # insert a centered title right after the first == heading
            pos = re.search(u'==(.*?)==', new_text)
            if pos:
                new_text = new_text[
                    0:pos.end(0)] + u'\n{{c|' + match_title.group(
                        1) + u'|fs=140%}}\n\n\n' + new_text[pos.end(0):]
        safe_put(page, new_text, user + ": match")
        jobs['number_of_split_job'] += 1
        # FIXME: can we pass the request here and use a callback in the js?
        # FIXME: server is None?
        jobs['split_queue'].put(maintitle, codelang, user, time.time(), None, None, None)
        # FIXME: that's an abuse of E_ERROR
        return ret_val(E_ERROR, "ok : transfert en cours.")
    # ---- __MATCH__ tag branch ----
    prefix = prefix.decode('utf-8')
    p = re.compile("==__MATCH__:\[\[" + prefix + ":(.*?)/(\d+)\]\]==")
    m = re.search(p, text)
    if m:
        djvuname = m.group(1)
        number = m.group(2)
        # everything before the tag is kept untouched as "head"
        pos = text.find(m.group(0))
        head = text[:pos]
        text = text[pos + len(m.group(0)):]
    else:
        return ret_val(E_ERROR, "match tag not found")
    pywikibot.output(djvuname + " " + number)
    try:
        number = int(number)
    except:
        return ret_val(E_ERROR, "illformed __MATCH__: no page number ?")
    cache = lifo_cache.LifoCache('match_and_split_text_layer')
    cached_text = align.get_djvu(cache, mysite, djvuname, True)
    if not cached_text:
        return ret_val(
            E_ERROR,
            "unable to read djvu, if the File: exists, please retry")
    data = align.do_match(text, cached_text, djvuname, number, verbose=False, prefix=prefix)
    if not data['error']:
        # blank the matched text in the answer so the caller does not
        # ship the full page text back
        safe_put(page, head + data['text'], user + ": match")
        data['text'] = ""
    return data