Exemplo n.º 1
0
def read_djvu(book_name, cached_text, datas, opt):
    """Append each djvu page of *book_name* (noinclude sections stripped)
    to *datas*, keyed by 1-based page number."""
    import align
    pages = align.get_djvu(cached_text, opt.site, book_name, True)
    page_nr = 0
    for raw_page in pages:
        page_nr += 1
        # drop transcluded-only wrappers before storing the page text
        cleaned = re.sub(u'(?ms)<noinclude>(.*?)</noinclude>', u'', raw_page)
        datas.setdefault(page_nr, []).append(cleaned)
Exemplo n.º 2
0
def read_djvu(book_name, cached_text, datas, opt):
    """Read the djvu text layer of *book_name* and collect each page,
    minus its <noinclude> sections, into *datas[page_number]*."""
    import align
    layer = align.get_djvu(cached_text, opt.site, book_name, True)
    for number, page_text in enumerate(layer, 1):
        stripped = re.sub(u'(?ms)<noinclude>(.*?)</noinclude>', u'', page_text)
        if number not in datas:
            datas[number] = []
        datas[number].append(stripped)
def do_extract(mysite, maintitle, user, codelang, cache):
    prefix = unicode(page_prefixes['wikisource'].get(codelang), 'utf-8')
    if not prefix:
        return ret_val(E_ERROR, "no prefix")

    djvuname = maintitle.replace(u' ', u'_')
    print djvuname.encode('utf-8')

    text_layer = align.get_djvu(cache, mysite, djvuname, True)
    if not text_layer:
        return ret_val(E_ERROR, "unable to retrieve text layer")

    text = u''
    for pos, page_text in enumerate(text_layer):
        text += u'==[[' + prefix + u':' + maintitle + u'/' + unicode(pos+1) + u']]==\n'
        text += page_text + u'\n'

    page = pywikibot.Page(mysite, u'User:'******'/Text')
    safe_put(page, text, comment = u'extract text')

    return ret_val(E_OK, "")
Exemplo n.º 4
0
def do_match(mysite, maintitle, user, codelang):
    """Match the wikitext of *maintitle* against its djvu text layer,
    normalize it, save it back and enqueue a split job.

    Two page formats are handled:

    * pages containing ``{{R2Mondes|vol|year|page}}`` templates: each
      template is substituted (via the module-level ``repl`` callback),
      every resulting page is scored against the djvu layer, then a long
      ordered chain of French-typography cleanups is applied;
    * pages containing a ``==__MATCH__:[[<prefix>:<name>/<page>]]==``
      tag: the text after the tag is aligned through ``align.do_match``.

    Returns a ``ret_val()`` result (or ``align.do_match``'s data dict in
    the second branch).  NOTE: the R2Mondes success path deliberately
    returns E_ERROR (see the FIXME below).
    """
    prefix = page_prefixes["wikisource"].get(codelang)
    if not prefix:
        return ret_val(E_ERROR, "no prefix")

    page = pywikibot.Page(mysite, maintitle)
    try:
        text = page.get()
    except:
        # bare except: any fetch failure (missing page, network error,
        # ...) collapses into the same error result
        return ret_val(E_ERROR, "failed to get page")

    if text.find("{{R2Mondes") != -1:
        # Revue des Deux Mondes specific path.  `repl` (module-level)
        # reads/writes this global while substituting each template.
        global pl_dict
        pl_dict = {}
        p0 = re.compile("\{\{R2Mondes\|(\d+)\|(\d+)\|(\d+)\}\}\s*\n")
        try:
            new_text = p0.sub(repl, text)
        except pywikibot.NoPage:
            return ret_val(E_ERROR, "Erreur : impossible de trouver l'index")
        p = re.compile("==\[\[Page:([^=]+)\]\]==\n")

        cache = lifo_cache.LifoCache("match_and_split_text_layer")
        # split() yields [head, title1, content1, title2, content2, ...]
        # hence the pairwise walk below (len/2 is Python 2 int division).
        bl = p.split(new_text)
        for i in range(len(bl) / 2):
            title = bl[i * 2 + 1]
            content = bl[i * 2 + 2]
            filename, pagenum = title.split("/")
            # only force a (re)download of the text layer on the first page
            if i == 0:
                cached_text = align.get_djvu(cache, mysite, filename, True)
            else:
                cached_text = align.get_djvu(cache, mysite, filename, False)
            if not cached_text:
                return ret_val(E_ERROR, "Erreur : fichier absent")
            # a leftover R2Mondes template means the substitution failed
            # for this page: report the offending page back to the user
            if content.find("R2Mondes") != -1:
                p0 = re.compile("\{\{R2Mondes\|\d+\|\d+\|(\d+)\}\}\s*\n")
                bl0 = p0.split(text)
                title0 = bl0[i * 2 + 1].encode("utf8")
                return ret_val(E_ERROR, "Erreur : Syntaxe 'R2Mondes' incorrecte, dans la page " + title0)
            # r is a similarity score; reject the whole run when any page
            # matches the djvu layer too poorly
            r = align.match_page(content, cached_text[int(pagenum) - 1])
            print "%s %s  : %f" % (filename, pagenum, r)
            if r < 0.1:
                return ret_val(E_ERROR, "Erreur : Le texte ne correspond pas, page %s" % pagenum)
        # the page is ok
        # --- French typography normalization; order matters below ---
        new_text = re.sub(u"<references[ ]*/>", u"", new_text)
        new_text = re.sub(u"[ ]([,])", u"\\1", new_text)
        new_text = re.sub(u"([^.])[ ]([,.])", u"\\1\\2", new_text)
        new_text = re.sub(u"\.\.\.", u"…", new_text)

        # insert the French thin space before high punctuation ...
        new_text = re.sub(u"([^ \s])([;:!?])", u"\\1 \\2", new_text)
        new_text = re.sub(u"([«;:!?])([^ \s…])", u"\\1 \\2", new_text)
        # separated from the previous regexp else "word!»" overlap
        new_text = re.sub(u"([^ \s])([»])", u"\\1 \\2", new_text)

        # workaround some buggy text
        new_text = re.sub(u"([;:!?»]) \n", u"\\1\n", new_text)
        new_text = re.sub(u"([;:!?»])''([ \n])", u"\\1''\\2", new_text)
        # <&nbsp;><space>
        # new_text = re.sub(u'  ([;:!?»])', u' \\1', new_text)
        # new_text = re.sub(u' ([;:!?»])', u' \\1', new_text)
        new_text = re.sub(u"([;:!?»]) <br />", u"\\1<br />", new_text)
        new_text = new_text.replace(u"Page : ", u"Page:")
        # strip the stray space after wiki indentation markers (1-5 colons)
        new_text = new_text.replace(u"\n: ", u"\n:")
        new_text = new_text.replace(u"\n:: ", u"\n::")
        new_text = new_text.replace(u"\n::: ", u"\n:::")
        new_text = new_text.replace(u"\n:::: ", u"\n::::")
        new_text = new_text.replace(u"\n::::: ", u"\n:::::")
        # ordinal abbreviations -> {{er}} / {{e}} templates
        new_text = re.sub(
            u"1er (janvier|février|avril|mars|mai|juin|juillet|août|septembre|octobre|novembre|décembre)",
            u"1{{er}} \\1",
            new_text,
        )
        new_text = re.sub(u"([0-9])e ", u"\\1{{e}} ", new_text)
        # text = re.sub(u'([;:!?»]) <div>\n', u'\\1\n', new_text)

        # try to move the title inside the M&S
        match_title = re.search(u"{{[Jj]ournal[ ]*\|*(.*?)\|", new_text)
        if match_title:
            pos = re.search(u"==(.*?)==", new_text)
            if pos:
                # re-emit the journal title as a centered {{c|...}} block
                # right after the first section heading
                new_text = (
                    new_text[0 : pos.end(0)]
                    + u"\n{{c|"
                    + match_title.group(1)
                    + u"|fs=140%}}\n\n\n"
                    + new_text[pos.end(0) :]
                )

        safe_put(page, new_text, user + ": match")
        jobs["number_of_split_job"] += 1
        # FIXME: can we pass the request here and use a callback in the js?
        # FIXME: server is None?
        jobs["split_queue"].put(maintitle, codelang, user, time.time(), None, None, None)
        # FIXME: that's an abuse of E_ERROR
        return ret_val(E_ERROR, "ok : transfert en cours.")

    # --- generic __MATCH__ tag path ---
    prefix = prefix.decode("utf-8")
    p = re.compile("==__MATCH__:\[\[" + prefix + ":(.*?)/(\d+)\]\]==")
    m = re.search(p, text)
    if m:
        djvuname = m.group(1)
        number = m.group(2)
        # everything before the tag is preserved verbatim; only the text
        # after it is matched against the djvu layer
        pos = text.find(m.group(0))
        head = text[:pos]
        text = text[pos + len(m.group(0)) :]
    else:
        return ret_val(E_ERROR, "match tag not found")

    pywikibot.output(djvuname + " " + number)
    try:
        number = int(number)
    except:
        return ret_val(E_ERROR, "illformed __MATCH__: no page number ?")

    cache = lifo_cache.LifoCache("match_and_split_text_layer")
    cached_text = align.get_djvu(cache, mysite, djvuname, True)
    if not cached_text:
        return ret_val(E_ERROR, "unable to read djvu, if the File: exists, please retry")

    data = align.do_match(text, cached_text, djvuname, number, verbose=False, prefix=prefix)
    if not data["error"]:
        # on success write back head + matched text and clear the payload
        # so the caller only sees the error/status fields
        safe_put(page, head + data["text"], user + ": match")
        data["text"] = ""

    return data
Exemplo n.º 5
0
def do_match(mysite, maintitle, user, codelang):
    """Match the wikitext of *maintitle* against its djvu text layer,
    normalize the typography, save it back, and queue a split job.

    Handles either ``{{R2Mondes|vol|year|page}}`` template pages (Revue
    des Deux Mondes branch, with per-page similarity scoring and a long
    ordered list of cleanup passes) or pages carrying a
    ``==__MATCH__:[[<prefix>:<name>/<page>]]==`` tag (delegated to
    ``align.do_match``).  Returns a ``ret_val()`` result or the data
    dict from ``align.do_match``; the R2Mondes success path reuses
    E_ERROR on purpose (see FIXME below).
    """
    prefix = page_prefixes['wikisource'].get(codelang)
    if not prefix:
        return ret_val(E_ERROR, "no prefix")

    page = pywikibot.Page(mysite, maintitle)
    try:
        text = page.get()
    except:
        # bare except: any failure to fetch the page maps to one error
        return ret_val(E_ERROR, "failed to get page")

    if text.find("{{R2Mondes") != -1:
        # `repl` (module-level callback) communicates through this global
        global pl_dict
        pl_dict = {}
        p0 = re.compile("\{\{R2Mondes\|(\d+)\|(\d+)\|(\d+)\}\}\s*\n")
        try:
            new_text = p0.sub(repl, text)
        except pywikibot.NoPage:
            return ret_val(E_ERROR, "Erreur : impossible de trouver l'index")
        p = re.compile('==\[\[Page:([^=]+)\]\]==\n')

        cache = lifo_cache.LifoCache('match_and_split_text_layer')
        # split() gives [head, title1, content1, title2, content2, ...];
        # len(bl) / 2 is Python 2 integer division
        bl = p.split(new_text)
        for i in range(len(bl) / 2):
            title = bl[i * 2 + 1]
            content = bl[i * 2 + 2]
            filename, pagenum = title.split('/')
            # force a fresh fetch of the text layer only on the first page
            if i == 0:
                cached_text = align.get_djvu(cache, mysite, filename, True)
            else:
                cached_text = align.get_djvu(cache, mysite, filename, False)
            if not cached_text:
                return ret_val(E_ERROR, "Erreur : fichier absent")
            # a surviving R2Mondes template means substitution failed here
            if content.find("R2Mondes") != -1:
                p0 = re.compile("\{\{R2Mondes\|\d+\|\d+\|(\d+)\}\}\s*\n")
                bl0 = p0.split(text)
                title0 = bl0[i * 2 + 1].encode("utf8")
                return ret_val(
                    E_ERROR,
                    "Erreur : Syntaxe 'R2Mondes' incorrecte, dans la page " +
                    title0)
            # similarity score against the corresponding djvu page;
            # abort the whole run if any page matches too poorly
            r = align.match_page(content, cached_text[int(pagenum) - 1])
            print "%s %s  : %f" % (filename, pagenum, r)
            if r < 0.1:
                return ret_val(
                    E_ERROR,
                    "Erreur : Le texte ne correspond pas, page %s" % pagenum)
        #the page is ok
        # --- French typography cleanup passes; order is significant ---
        new_text = re.sub(u'<references[ ]*/>', u'', new_text)
        new_text = re.sub(u'[ ]([,])', u'\\1', new_text)
        new_text = re.sub(u'([^.])[ ]([,.])', u'\\1\\2', new_text)
        new_text = re.sub(u'\.\.\.', u'…', new_text)

        # insert the French thin space around high punctuation / quotes
        new_text = re.sub(u'([^ \s])([;:!?])', u'\\1 \\2', new_text)
        new_text = re.sub(u'([«;:!?])([^ \s…])', u'\\1 \\2', new_text)
        # separated from the previous regexp else "word!»" overlap
        new_text = re.sub(u'([^ \s])([»])', u'\\1 \\2', new_text)

        # workaround some buggy text
        new_text = re.sub(u'([;:!?»]) \n', u'\\1\n', new_text)
        new_text = re.sub(u'([;:!?»])\'\'([ \n])', u'\\1\'\'\\2', new_text)
        # <&nbsp;><space>
        #new_text = re.sub(u'  ([;:!?»])', u' \\1', new_text)
        #new_text = re.sub(u' ([;:!?»])', u' \\1', new_text)
        new_text = re.sub(u'([;:!?»]) <br />', u'\\1<br />', new_text)
        new_text = new_text.replace(u'Page : ', u'Page:')
        # drop the stray space after wiki indentation markers (1-5 colons)
        new_text = new_text.replace(u'\n: ', u'\n:')
        new_text = new_text.replace(u'\n:: ', u'\n::')
        new_text = new_text.replace(u'\n::: ', u'\n:::')
        new_text = new_text.replace(u'\n:::: ', u'\n::::')
        new_text = new_text.replace(u'\n::::: ', u'\n:::::')
        # ordinal abbreviations -> {{er}} / {{e}} templates
        new_text = re.sub(
            u'1er (janvier|février|avril|mars|mai|juin|juillet|août|septembre|octobre|novembre|décembre)',
            u'1{{er}} \\1', new_text)
        new_text = re.sub(u'([0-9])e ', u'\\1{{e}} ', new_text)
        #text = re.sub(u'([;:!?»]) <div>\n', u'\\1\n', new_text)

        # try to move the title inside the M&S
        match_title = re.search(u"{{[Jj]ournal[ ]*\|*(.*?)\|", new_text)
        if match_title:
            pos = re.search(u'==(.*?)==', new_text)
            if pos:
                # re-emit the journal title as a centered {{c|...}} block
                # right after the first section heading
                new_text = new_text[
                    0:pos.end(0)] + u'\n{{c|' + match_title.group(
                        1) + u'|fs=140%}}\n\n\n' + new_text[pos.end(0):]

        safe_put(page, new_text, user + ": match")
        jobs['number_of_split_job'] += 1
        # FIXME: can we pass the request here and use a callback in the js?
        # FIXME: server is None?
        jobs['split_queue'].put(maintitle, codelang, user, time.time(), None,
                                None, None)
        # FIXME: that's an abuse of E_ERROR
        return ret_val(E_ERROR, "ok : transfert en cours.")

    # --- generic __MATCH__ tag path ---
    prefix = prefix.decode('utf-8')
    p = re.compile("==__MATCH__:\[\[" + prefix + ":(.*?)/(\d+)\]\]==")
    m = re.search(p, text)
    if m:
        djvuname = m.group(1)
        number = m.group(2)
        # keep everything before the tag untouched; only the remainder
        # is fed to the matcher
        pos = text.find(m.group(0))
        head = text[:pos]
        text = text[pos + len(m.group(0)):]
    else:
        return ret_val(E_ERROR, "match tag not found")

    pywikibot.output(djvuname + " " + number)
    try:
        number = int(number)
    except:
        return ret_val(E_ERROR, "illformed __MATCH__: no page number ?")

    cache = lifo_cache.LifoCache('match_and_split_text_layer')
    cached_text = align.get_djvu(cache, mysite, djvuname, True)
    if not cached_text:
        return ret_val(
            E_ERROR, "unable to read djvu, if the File: exists, please retry")

    data = align.do_match(text,
                          cached_text,
                          djvuname,
                          number,
                          verbose=False,
                          prefix=prefix)
    if not data['error']:
        # on success write back head + matched text, then clear the
        # payload so callers only inspect the status fields
        safe_put(page, head + data['text'], user + ": match")
        data['text'] = ""

    return data