Exemplo n.º 1
0
def _wiki_dump_to_huge_math_pages_one(env_dict, wiki_xml_file):
    """
    Scan a single wiki XML dump and copy every page containing math
    markup into one big UTF-8 output file.
    """
    output_path = env_dict["wiki"]["xml_math_output"] % os.path.basename(wiki_xml_file)

    logger.warning(u"Started extracting math pages from [%s] to [%s]",
                   wiki_xml_file, output_path)

    # streaming pager over the raw dump file
    pager = dump.pager(
        wiki_xml_file,
        env_dict["pager"]["delimiter"],
        env_dict["pager"]["buffer"])

    # a page is kept only when the configured marker substring is present;
    # empty/None pages are logged as suspicious
    marker = env_dict["pager"]["identify_by"]
    math_pages = 0
    with codecs.open(output_path, encoding='utf-8', mode='wb') as out:
        for page_no, page in enumerate(pager.pages()):
            if not page:
                logger_suspicious.warning(u"Page is null - [%d]", page_no)
            elif marker in page:
                math_pages += 1
                out.write(page)

    logger.info(u"Stopped extracting math pages from [%s] to [%s], total [%s]",
                wiki_xml_file, output_path, math_pages)
Exemplo n.º 2
0
def _wiki_dump_to_huge_math_pages_one(env_dict, wiki_xml_file):
    """
    Filter one wiki XML dump, appending every page that contains the
    configured math marker to a single UTF-8 output file.
    """
    out_name = env_dict["wiki"][
        "xml_math_output"] % os.path.basename(wiki_xml_file)

    logger.warning(u"Started extracting math pages from [%s] to [%s]",
                   wiki_xml_file, out_name)

    pager_cfg = env_dict["pager"]
    # streaming page iterator over the dump
    dumper = dump.pager(wiki_xml_file, pager_cfg["delimiter"],
                        pager_cfg["buffer"])

    # only pages containing this substring are written out
    needle = pager_cfg["identify_by"]
    math_pages = 0
    with codecs.open(out_name, encoding='utf-8', mode='wb') as sink:
        for page_no, page in enumerate(dumper.pages()):
            if not page:
                logger_suspicious.warning(u"Page is null - [%d]", page_no)
                continue
            if needle in page:
                math_pages += 1
                sink.write(page)

    logger.info(u"Stopped extracting math pages from [%s] to [%s], total [%s]",
                wiki_xml_file, out_name, math_pages)
Exemplo n.º 3
0
def _huge_math_page_texhtml(env_dict):
    """
        Grab one huge wiki page and have fun with it while creating all pages.
    """
    wiki_xml_math_output = env_dict["wiki"]["big_xml"]
    #wiki_xml_math_output = env_dict["wiki"]["xml_math_output_big"]
    wiki_xml_math_output = env_dict["wiki"]["xml_math_output_test"]

    # load wiki dump
    #
    wiki_page_dumper = dump.pager(wiki_xml_math_output,
                                  env_dict["pager"]["delimiter"],
                                  env_dict["pager"]["buffer"])

    from HTMLParser import HTMLParser

    ht = HTMLParser()

    titles = []
    title_pattern = re.compile(env_dict["pager"]["re_title"], re.DOTALL)
    uniq = set()

    # def do_texhtml( page ):
    #     total = 0
    #     for r in re.compile(u"<.*?texhtml.*?>(.*?)</.*?>").finditer(page):
    #         found = True
    #         total += 1
    #         #html = r.group()
    #         #msg = u"%s\n\t%s\n\t%s" % (ht.unescape(html), html, r.group(1))
    #         norm = converters.latex.normalise(r.group(1).strip())
    #         if not norm in uniq:
    #             uniq.add(norm)
    #             msg = ht.unescape(r.group(1)).replace(u" ", " "). \
    #                 replace(u"<sub>", u"_"). \
    #                 replace(u"<sup>", u"^"). \
    #                 replace(u"<var >", u" ")
    #             logger.info(msg)
    #     return total

    def do_title(page):
        try:
            title = title_pattern.search(page).group(1)
            titles.append(title)
        except:
            logger.warning(u"Could not parse title [%s]", page[:500])

    # try to load pickled mathml (ok/fail)
    # &lt;span class=&quot;texhtml&quot;&gt;?&lt;/span&gt;
    total = 0
    total_pages = 0
    pages_done = 0
    for pages_done, page in enumerate(
            wiki_page_dumper.pages(templates.htmltemplate)):

        if pages_done % 100000 == 0:
            logger.info(
                u"Total formulas: %s, On pages: %s, Unique: %s, Done [%s]" %
                (total, total_pages, len(uniq), pages_done))
        do_title(page)
        # found = do_texhtml( page )
        # if found > 0:
        #     total_pages += 1

    if len(titles) > 0:
        with codecs.open("all.titles", mode="w+", encoding="utf-8") as fout:
            for title in titles:
                fout.write(title + "\n")
    print "Pages done: %s, Total formulas: %s, On pages: %s, Unique: %s" % \
          ( pages_done, total, total_pages, len(uniq) )
Exemplo n.º 4
0
def _huge_math_page_to_pages(env_dict):
    """
    Split the huge math wiki dump into one stored HTML page per article,
    rewriting wiki math markup to real math on the way.

    Reads the dump named by env_dict["wiki"]["xml_math_output_big"] and
    stores pages under env_dict["wiki"]["pages_output"]; previously
    converted MathML is looked up via pickles or a db, depending on
    env_dict["mathml"]["convert"].

    Note: Python 2 code (`except Exception, e` syntax).
    """
    import _math
    wiki_xml_math_output = env_dict["wiki"]["xml_math_output_big"]
    #wiki_xml_math_output = env_dict["wiki"]["xml_math_output_test"]

    from indexer.egomath.interface import egomath_inst

    # reset the indexer's logging state before a long run
    egomath_inst.reset_logging()

    wiki_pages_output = env_dict["wiki"]["pages_output"]
    pickle_mathml_ok = env_dict["converters"]["latexml"]["pickle_ok"]
    pickle_mathml_fail = env_dict["converters"]["latexml"]["pickle_fail"]

    logger.info(u"Started separating pages from [%s] to [%s]",
                wiki_xml_math_output, wiki_pages_output)

    # load wiki dump
    #
    wiki_page_dumper = dump.pager(wiki_xml_math_output,
                                  env_dict["pager"]["delimiter"],
                                  env_dict["pager"]["buffer"])

    # try to load pickled mathml (ok/fail)
    #
    converted_mathml = None
    if env_dict["mathml"]["convert"] == "pickle":
        # 100 MiB read buffer for the (large) pickle files
        buffering = 100 * 1024 * 1024
        converted_mathml = _math.mathpickles(pickle_mathml_ok,
                                             pickle_mathml_fail,
                                             buffering=buffering)
    elif env_dict["mathml"]["convert"] == "db":
        converted_mathml = _math.mathdb(env_dict)

    latex_pattern = env_dict["pager"]["re_math"]
    title_pattern = re.compile(env_dict["pager"]["re_title"], re.DOTALL)
    total_formula_count = 0
    # uniqueness tracking is optional - only when collecting statistics
    formula_unique = set() if env_dict["wiki"]["collect_stats"] else None
    pages_done = 0
    converted_mathml_cnt = 0  # NOTE(review): never updated below - dead variable?
    from collections import defaultdict

    # histogram: formulas-per-page -> number of such pages
    pages_formula = defaultdict(int)

    # for all pages and for all wiki maths
    #
    for pages_done, page in enumerate(
            wiki_page_dumper.pages(templates.htmltemplate)):
        # NOTE(review): logs once per page - very verbose on big dumps
        logger.info(u'Done %d pages', pages_done)
        # if title already exists do not write
        try:
            title = title_pattern.search(page).group(1).replace(" ", "_")
            url = u"http://en.wikipedia.org/wiki/%s" % title
            # guard against an unexpanded "$title" template placeholder
            assert not u"$title" in title
            page_store = _math.page_to_store(wiki_pages_output,
                                             title + ".html")
            if not env_dict["pager"]["overwrite"] and page_store.exists():
                logger.warning(u"Page exists [%s] [%d]", title, pages_done)
                continue
        except Exception, e:
            logger.error(u"Could not store page because of %s", repr(e))
            continue

        from _parser import parser
        page = parser.preprocess_page_math(env_dict, page)

        # the page we got should be wiki tag free; however, it will contain only
        # basic math &lt;math&gt; B \gt &lt;/math&gt; which can contain
        # non latex characters &gt; instead of \gt
        # - we must fix this
        #
        # collect (start, end, replacement) triples for every math match
        page_replacements = []
        page_formula_count = 0
        for wiki_math_iter in latex_pattern.finditer(page):
            page_formula_count += 1
            total_formula_count += 1

            page_replacements += \
                    _math.convert_wikimath_to_realmath(
                        env_dict,
                        wiki_math_iter,
                        converted_mathml,
                        url,
                        title,
                        total_formula_count,
                        formula_unique)
        pages_formula[page_formula_count] += 1

        info_msg = u"# of formulae on page [%s] is [%d], total [%d]" % (
            utils.ascii(title), page_formula_count, total_formula_count)
        if page_formula_count == 0:
            logger_suspicious.warning(info_msg + u" -> skipping 0.")
            logger.warning(info_msg)
            continue
        else:
            logger.warning(info_msg)

        # create the page
        #
        # splice the replacements back into the page text; `e` tracks the
        # end of the previous replacement (page[None:] is the whole page
        # when no replacement was collected, so the tail append is safe)
        tmp = ""
        last = 0
        e = None
        for (s, e, r) in page_replacements:
            tmp += page[last:s] + r
            last = e
        tmp += page[e:]
        page = tmp

        # store the page
        try:
            page_store.store(page)
        except IOError, e:
            logger.error(u"Could not store [%s] page because of %s", title,
                         repr(e))
Exemplo n.º 5
0
import re

import dump
import utils


class elements(object):
    logger = utils.logger('wiki.statistics.elements')

    def __init__(self, re_expression):
        elements.logger.info(u"Initializing with re_expression=%s",
                             re_expression)
        self._re = re.compile(re_expression, re.DOTALL)

    def __call__(self, pages):
        size = 0
        for page in pages:
            els = self._re.findall(page)
            print utils.ascii(u"\n".join(els))
            size += len(els)
        print u"Total size: %d" % size


# Ad-hoc smoke test: print every <title> element found in the math dump.
if __name__ == "__main__":
    import logging

    logging.basicConfig(level=logging.DEBUG)
    MB = 1024 * 1024
    # NOTE(review): dump.pager is called here with 2 args (path, buffer)
    # while other call sites pass (path, delimiter, buffer) - confirm the
    # pager signature supports this form.
    wikier = dump.pager(r"../output_math/math.pages", 50 * MB)
    elements(u"<title>(.*?)</title>")(wikier.pages())

# Import-time marker (Python 2 print statement).
print "Finished importing %s" % __file__
Exemplo n.º 6
0
import re
import dump
import utils


class elements(object):
    logger = utils.logger("wiki.statistics.elements")

    def __init__(self, re_expression):
        elements.logger.info(u"Initializing with re_expression=%s", re_expression)
        self._re = re.compile(re_expression, re.DOTALL)

    def __call__(self, pages):
        size = 0
        for page in pages:
            els = self._re.findall(page)
            print utils.ascii(u"\n".join(els))
            size += len(els)
        print u"Total size: %d" % size


# Ad-hoc smoke test: print every <title> element found in the math dump.
if __name__ == "__main__":
    import logging

    logging.basicConfig(level=logging.DEBUG)
    MB = 1024 * 1024
    # NOTE(review): dump.pager is called here with 2 args (path, buffer)
    # while other call sites pass (path, delimiter, buffer) - confirm the
    # pager signature supports this form.
    wikier = dump.pager(r"../output_math/math.pages", 50 * MB)
    elements(u"<title>(.*?)</title>")(wikier.pages())

# Import-time marker (Python 2 print statement).
print "Finished importing %s" % __file__
Exemplo n.º 7
0
def _huge_math_page_texhtml( env_dict ):
    """
        Grab one huge wiki page and have fun with it while creating all pages.
    """
    wiki_xml_math_output = env_dict["wiki"]["big_xml"]
    #wiki_xml_math_output = env_dict["wiki"]["xml_math_output_big"]
    wiki_xml_math_output = env_dict["wiki"]["xml_math_output_test"]

    # load wiki dump
    #
    wiki_page_dumper = dump.pager(wiki_xml_math_output,
                                  env_dict["pager"]["delimiter"],
                                  env_dict["pager"]["buffer"])

    from HTMLParser import HTMLParser

    ht = HTMLParser()

    titles = []
    title_pattern = re.compile(env_dict["pager"]["re_title"], re.DOTALL)
    uniq = set()

    # def do_texhtml( page ):
    #     total = 0
    #     for r in re.compile(u"&lt;.*?texhtml.*?&gt;(.*?)&lt;/.*?&gt;").finditer(page):
    #         found = True
    #         total += 1
    #         #html = r.group()
    #         #msg = u"%s\n\t%s\n\t%s" % (ht.unescape(html), html, r.group(1))
    #         norm = converters.latex.normalise(r.group(1).strip())
    #         if not norm in uniq:
    #             uniq.add(norm)
    #             msg = ht.unescape(r.group(1)).replace(u"&nbsp;", " "). \
    #                 replace(u"<sub>", u"_"). \
    #                 replace(u"<sup>", u"^"). \
    #                 replace(u"<var >", u" ")
    #             logger.info(msg)
    #     return total

    def do_title( page ):
        try:
            title = title_pattern.search(page).group(1)
            titles.append(title)
        except:
            logger.warning(u"Could not parse title [%s]", page[:500])

    # try to load pickled mathml (ok/fail)
    # &lt;span class=&quot;texhtml&quot;&gt;?&lt;/span&gt;
    total = 0
    total_pages = 0
    pages_done = 0
    for pages_done, page in enumerate(wiki_page_dumper.pages(templates.htmltemplate)):

        if pages_done % 100000 == 0:
            logger.info(u"Total formulas: %s, On pages: %s, Unique: %s, Done [%s]" %
                        ( total, total_pages, len(uniq), pages_done ))
        do_title(page)
        # found = do_texhtml( page )
        # if found > 0:
        #     total_pages += 1

    if len(titles) > 0:
        with codecs.open("all.titles", mode="w+", encoding="utf-8") as fout:
            for title in titles:
                fout.write(title + "\n")
    print "Pages done: %s, Total formulas: %s, On pages: %s, Unique: %s" % \
          ( pages_done, total, total_pages, len(uniq) )
Exemplo n.º 8
0
def _huge_math_page_to_pages( env_dict ):
    """
    Split the huge math wiki dump into one stored HTML page per article,
    rewriting wiki math markup to real math on the way.

    Reads the dump named by env_dict["wiki"]["xml_math_output_big"] and
    stores pages under env_dict["wiki"]["pages_output"]; previously
    converted MathML is looked up via pickles or a db, depending on
    env_dict["mathml"]["convert"].

    Note: Python 2 code (`except Exception, e` syntax).
    """
    import _math
    wiki_xml_math_output = env_dict["wiki"]["xml_math_output_big"]
    #wiki_xml_math_output = env_dict["wiki"]["xml_math_output_test"]

    from indexer.egomath.interface import egomath_inst

    # reset the indexer's logging state before a long run
    egomath_inst.reset_logging()

    wiki_pages_output = env_dict["wiki"]["pages_output"]
    pickle_mathml_ok = env_dict["converters"]["latexml"]["pickle_ok"]
    pickle_mathml_fail = env_dict["converters"]["latexml"]["pickle_fail"]

    logger.info(u"Started separating pages from [%s] to [%s]",
                wiki_xml_math_output, wiki_pages_output)

    # load wiki dump
    #
    wiki_page_dumper = dump.pager(wiki_xml_math_output,
                                  env_dict["pager"]["delimiter"],
                                  env_dict["pager"]["buffer"])

    # try to load pickled mathml (ok/fail)
    #
    converted_mathml = None
    if env_dict["mathml"]["convert"] == "pickle":
        # 100 MiB read buffer for the (large) pickle files
        buffering = 100 * 1024 * 1024
        converted_mathml = _math.mathpickles(pickle_mathml_ok, pickle_mathml_fail, buffering=buffering)
    elif env_dict["mathml"]["convert"] == "db":
        converted_mathml = _math.mathdb(env_dict)

    latex_pattern = env_dict["pager"]["re_math"]
    title_pattern = re.compile(env_dict["pager"]["re_title"], re.DOTALL)
    total_formula_count = 0
    # uniqueness tracking is optional - only when collecting statistics
    formula_unique = set() if env_dict["wiki"]["collect_stats"] else None
    pages_done = 0
    converted_mathml_cnt = 0  # NOTE(review): never updated below - dead variable?
    from collections import defaultdict


    # histogram: formulas-per-page -> number of such pages
    pages_formula = defaultdict(int)

    # for all pages and for all wiki maths
    #
    for pages_done, page in enumerate(wiki_page_dumper.pages(templates.htmltemplate)):
        # NOTE(review): logs once per page - very verbose on big dumps
        logger.info(u'Done %d pages', pages_done)
        # if title already exists do not write
        try:
            title = title_pattern.search(page).group(1).replace(" ", "_")
            url = u"http://en.wikipedia.org/wiki/%s" % title
            # guard against an unexpanded "$title" template placeholder
            assert not u"$title" in title
            page_store = _math.page_to_store(wiki_pages_output, title + ".html")
            if not env_dict["pager"]["overwrite"] and page_store.exists():
                logger.warning(u"Page exists [%s] [%d]", title, pages_done)
                continue
        except Exception, e:
            logger.error(u"Could not store page because of %s", repr(e))
            continue

        from _parser import parser
        page = parser.preprocess_page_math(env_dict, page)


        # the page we got should be wiki tag free; however, it will contain only
        # basic math &lt;math&gt; B \gt &lt;/math&gt; which can contain
        # non latex characters &gt; instead of \gt
        # - we must fix this
        #
        # collect (start, end, replacement) triples for every math match
        page_replacements = []
        page_formula_count = 0
        for wiki_math_iter in latex_pattern.finditer(page):
            page_formula_count += 1
            total_formula_count += 1

            page_replacements += \
                    _math.convert_wikimath_to_realmath(
                        env_dict,
                        wiki_math_iter,
                        converted_mathml,
                        url,
                        title,
                        total_formula_count,
                        formula_unique)
        pages_formula[page_formula_count] += 1

        info_msg = u"# of formulae on page [%s] is [%d], total [%d]" % (
            utils.ascii(title), page_formula_count, total_formula_count)
        if page_formula_count == 0:
            logger_suspicious.warning(info_msg + u" -> skipping 0.")
            logger.warning(info_msg)
            continue
        else:
            logger.warning(info_msg)

        # create the page
        #
        # splice the replacements back into the page text; `e` tracks the
        # end of the previous replacement (page[None:] is the whole page
        # when no replacement was collected, so the tail append is safe)
        tmp = ""
        last = 0
        e = None
        for (s, e, r) in page_replacements:
            tmp += page[last:s] + r
            last = e
        tmp += page[e:]
        page = tmp

        # store the page
        try:
            page_store.store(page)
        except IOError, e:
            logger.error(u"Could not store [%s] page because of %s", title, repr(e))