Пример #1
0
def test_normal_entries_top_pages(index, data, mocker):
    """All entries from top_pages should be added to the index."""
    top_pages = [
        ('f/o/o', 'foo', 10),
        ('b/a/r', 'bar', 10),
        ('b/a/z', 'baz', 10),
    ]
    fake_selector = mocker.Mock(top_pages=top_pages)
    mocker.patch('src.preprocessing.preprocess.pages_selector', fake_selector)

    cdpindex.generate_from_html(None, None)

    assert index.create.call_count == 1
    # 'create' received a lazy generator of entries that wasn't consumed yet;
    # iterate it here and check one entry was produced per top page
    produced = sum(1 for _ in index.create.call_args[0][1])
    assert produced == len(top_pages)
Пример #2
0
def test_repeated_entries_top_pages(index, data, mocker):
    """Duplicated entries from top_pages should raise an exception."""
    top_pages = [
        ('f/o/o', 'foo', 10),
        ('b/a/r', 'bar', 10),
        ('f/o/o', 'foo', 10),
    ]
    fake_selector = mocker.Mock(top_pages=top_pages)
    mocker.patch('src.preprocessing.preprocess.pages_selector', fake_selector)

    cdpindex.generate_from_html(None, None)

    assert index.create.call_count == 1
    # the duplicate is only detected lazily, while the entries generator
    # passed to 'create' is being consumed
    entries_gen = index.create.call_args[0][1]
    with pytest.raises(KeyError):
        list(entries_gen)
Пример #3
0
def test_redirects_with_special_chars(index, data, mocker, title):
    """Check redirects to pages containing encoded special filesystem chars."""
    # only target chars should be quoted: '/', '.' and '%'
    fname = to3dirs.to_filename(title)
    mocker.patch('src.preprocessing.preprocess.pages_selector',
                 mocker.Mock(top_pages=[('f/o/o', fname, 10)]))

    # register the title and a redirect pointing at it
    with open(config.LOG_TITLES, 'at', encoding='utf-8') as titles_log:
        titles_log.write('{}|{}|\n'.format(fname, title))
    with open(config.LOG_REDIRECTS, 'wt', encoding='utf-8') as redirs_log:
        redirs_log.write('redirect|{}\n'.format(title))

    cdpindex.generate_from_html(None, None)

    assert index.create.call_count == 1
    entries = list(index.create.call_args[0][1])
    assert len(entries) == 1
Пример #4
0
def test_repeated_entry_redirects(index, data, mocker):
    """Don't add repeated redirect entries to the index."""
    with open(config.LOG_TITLES, 'wt', encoding='utf-8') as fh:
        fh.write('foo_bar|foo bar|\n')
    top_pages = [('f/o/o_bar', 'foo_bar', 10)]
    mocker.patch('src.preprocessing.preprocess.pages_selector',
                 mocker.Mock(top_pages=top_pages))

    # after normalization several of these redirect titles reduce to the same
    # words; the ones repeating exactly the words (and order) of an already seen
    # redirect must be discarded, keeping only redirects that bring a new word
    # or a new word order (and NOT any repeated one after that!)
    with open(config.LOG_REDIRECTS, 'wt', encoding='utf-8') as fh:
        fh.write('Foo Bar|foo_bar\n')
        fh.write('FOO BAR|foo_bar\n')
        fh.write('fOO bazzz|foo_bar\n')
        fh.write('fOO BAZZZ|foo_bar\n')
        fh.write('BAZZZ fOo|foo_bar\n')
        fh.write('bazzz_fOo|foo_bar\n')
    cdpindex.generate_from_html(None, None)
    assert index.create.call_count == 1
    entries = list(index.create.call_args[0][1])

    # a single index entry is expected: the original article, which carries the
    # surviving redirects in its 'redirs' field; of the six redirects above:
    #  - NO: the first two, which after normalization have the same words as the
    #        article title itself
    #  - YES: the redirect bringing new words (note ALL words are indexed, not
    #         only the different ones, as all are needed if the user search for
    #         those words doing an AND)
    #  - NO: its duplicate right after it, same words same order
    #  - YES: the next redirect, that even having same words, they are in a
    #         different order (the score of the selected results are order
    #         dependant!)
    #  - NO: the last redirect, again having "same words same order" of other
    #        one already included
    assert len(entries) == 1

    # the single entry for sure must be the original article
    title, link, score, description, orig_words, redirs = entries[0]
    assert orig_words == ('foo', 'bar')
    assert link == 'f/o/o_bar/foo_bar'
    assert len(redirs) == 2
    assert redirs == {('foo', 'bazzz'), ('bazzz', 'foo')}
Пример #5
0
        # Read the final page-scores dump: one '<page><colsep><score>' per line.
        # NOTE(review): the enclosing function's start is outside this chunk;
        # 'all_pages' and 'colsep' are presumably defined there — confirm.
        with PATH_TEMP.joinpath("page_scores_final.txt").open(
                "rt", encoding='utf8') as fh:
            for line in fh:
                page, score = line.strip().split(colsep)
                # map the page name to its 3-level directory layout
                dir3, fname = to3dirs.get_path_file(page)
                all_pages.append((dir3, fname, int(score)))

        # order by score, and get top N
        # NOTE(review): no truncation to N is visible here, only the sort —
        # the "top N" part presumably happens in the caller; confirm.
        all_pages.sort(key=operator.itemgetter(2), reverse=True)
        return all_pages


# Module-level wiring: replace the real pages selector with the fake one
# defined above, and point the index output directory at the local test path.
src.preprocessing.preprocess.pages_selector = fake_page_selector()
config.DIR_INDICE = PATH_IDX

if __name__ == "__main__":
    # main()
    # NOTE(review): 'help' shadows the builtin and is built but never printed
    # or passed anywhere visible — presumably intended for an argparse setup;
    # confirm before removing.
    help = """Creates index files.

    Default path is '{}' for index files,
    and '{}' for the scrapped files to index.""".format(PATH_IDX, PATH_TEMP)

    # start from a clean state: drop a previous sqlite index if present,
    # otherwise make sure the target directory exists
    sqlitepath = PATH_IDX.joinpath("index.sqlite")
    if sqlitepath.exists():
        sqlitepath.unlink()
        print("Database index %s was removed" % sqlitepath)
    elif not PATH_IDX.exists():
        PATH_IDX.mkdir()

    # NOTE(review): 'n_pag' is never used in this chunk — confirm it isn't
    # needed further down the file.
    n_pag = generate_from_html(PATH_TEMP, verbose=True)
Пример #6
0
def main(lang,
         src_info,
         version,
         lang_config,
         gendate,
         images_dump_dir,
         verbose=False,
         desconectado=False,
         process_articles=True):
    """Generate the CDPedia tarball or iso.

    Args:
        lang: language code of the build (must be a key of config.imagtypes).
        src_info: directory holding the scraped info (articles, assets, css).
        version: image version to build (must exist for the chosen language).
        lang_config: per-language configuration mapping.
        gendate: generation date, used in the final image name.
        images_dump_dir: directory where downloaded images are kept.
        verbose: forwarded to the image/article block generators.
        desconectado: if True ("offline"), skip downloading images.
        process_articles: if False, skip preprocessing, index and article blocks.

    Exits the process with status 1 on invalid lang/version; raises
    EnvironmentError if the articles directory is missing.
    """
    # don't affect the rest of the machine
    make_it_nicer()

    # set language in config
    if config.LANGUAGE is None:
        config.LANGUAGE = lang
        config.URL_WIKIPEDIA = config.URL_WIKIPEDIA_TPL.format(lang=lang)

    # validate lang and versions, and fix config with selected data
    logger.info("Fixing config for lang=%r version=%r", lang, version)
    try:
        _lang_conf = config.imagtypes[lang]
    except KeyError:
        available_langs = list(config.imagtypes.keys())
        logger.error("%r is not a valid language! try one of %s", lang,
                     available_langs)
        # fixed: bare exit() terminated with status 0 after an error;
        # exit non-zero so calling scripts notice the failure
        sys.exit(1)
    try:
        config.imageconf = _lang_conf[version]
    except KeyError:
        available_versions = list(_lang_conf.keys())
        logger.error("%r is not a valid version! try one of %s", version,
                     available_versions)
        sys.exit(1)
    config.langconf = lang_config

    logger.info("Starting!")
    prepare_temporary_dirs(process_articles)

    logger.info("Copying the assets and locale files")
    dst_assets = os.path.join(config.DIR_CDBASE, 'assets')
    copy_assets(src_info, dst_assets)
    link(os.path.join(src_info, 'portal_pages.txt'), config.DIR_TEMP)
    copy_dir('locale', path.join(config.DIR_CDBASE, "locale"))
    set_locale(lang_config.get('second_language'), record=True)

    logger.info("Copying '%s' stylesheet and associated media resources",
                config.CSS_FILENAME)
    copy_css(src_info, dst_assets)

    articulos = path.join(src_info, "articles")
    if process_articles:
        logger.info("Preprocessing")
        if not path.exists(articulos):
            logger.error("Couldn't find articles dir: %r", articulos)
            # fixed: removed the unreachable sys.exit() that followed this raise
            raise EnvironmentError("Directory not found, can't continue")
        preprocess.run(articulos)

        logger.info("Calculating which stay and which don't")
        preprocess.pages_selector.calculate()

        logger.info("Generating the images log")
        taken, adesc = extract.run()
        logger.info("Extracted %d images, need to download %d", taken, adesc)
    else:
        logger.info("Avoid processing articles and generating images log")

    logger.info("Recalculating the reduction percentages.")
    calculate.run()

    if not desconectado:
        logger.info("Downloading the images from the internet")
        download.retrieve(images_dump_dir)

    logger.info("Reducing the downloaded images")
    scale.run(verbose, images_dump_dir)

    if config.EMBED_IMAGES:
        logger.info("Embedding selected images")
        embed.run(images_dump_dir)

    logger.info("Putting the reduced images into blocks")
    # group the images into blocks
    q_blocks, q_images = ImageManager.generar_bloques(verbose)
    logger.info("Got %d blocks with %d images", q_blocks, q_images)

    if not process_articles:
        logger.info("Not generating index and blocks (by user request)")
    elif preprocess.pages_selector.same_info_through_runs:
        logger.info("Same articles than previous run "
                    "(not generating index and blocks)")
    else:
        logger.info("Generating the index")
        result = cdpindex.generate_from_html(articulos, verbose)
        logger.info("Got %d files", result)
        logger.info("Generating the articles blocks")
        q_blocks, q_files, q_redirs = ArticleManager.generar_bloques(
            lang, verbose)
        logger.info("Got %d blocks with %d files and %d redirects", q_blocks,
                    q_files, q_redirs)

    logger.info("Copying the sources and libs")
    copy_sources()
    generate_libs()

    # Copy python docs
    pydocs.clone(lang, lang_config, os.path.dirname(src_info))

    logger.info("Generating the links to blocks and indexes")
    # pages blocks
    dest = path.join(config.DIR_CDBASE, "pages")
    if os.path.exists(dest):
        os.remove(dest)
    os.symlink(path.abspath(config.DIR_PAGES_BLOCKS), dest)
    # images blocks
    dest = path.join(config.DIR_CDBASE, "images")
    if os.path.exists(dest):
        os.remove(dest)
    os.symlink(path.abspath(config.DIR_IMAGES_BLOCKS), dest)
    # indexes
    dest = path.join(config.DIR_CDBASE, "indice")
    if os.path.exists(dest):
        os.remove(dest)
    os.symlink(path.abspath(config.DIR_INDICE), dest)

    if config.imageconf["windows"]:
        logger.info("Copying Windows stuff")
        copy_dir("resources/autorun.win/cdroot", config.DIR_CDBASE)
        # unpack embeddable python distribution for win32
        py_win_zip = "resources/autorun.win/python-win32.zip"
        py_win_dst = os.path.join(config.DIR_CDBASE, 'python')
        with zipfile.ZipFile(py_win_zip, 'r') as zh:
            zh.extractall(py_win_dst)

    logger.info("Generating runtime config")
    gen_run_config(lang_config)

    base_dest_name = "cdpedia-%s-%s-%s-%s" % (lang, config.VERSION, gendate,
                                              version)
    if config.imageconf["type"] == "iso":
        logger.info("Building the ISO: %r", base_dest_name)
        build_iso(base_dest_name)
    elif config.imageconf["type"] == "tarball":
        logger.info("Building the tarball: %r", base_dest_name)
        build_tarball(base_dest_name)
    else:
        raise ValueError("Unrecognized image type")

    logger.info("All done!")