def test_normal_entries_top_pages(index, data, mocker):
    """All entries from top_pages should be added to the index."""
    top_pages = [
        ('f/o/o', 'foo', 10),
        ('b/a/r', 'bar', 10),
        ('b/a/z', 'baz', 10),
    ]
    mocker.patch('src.preprocessing.preprocess.pages_selector',
                 mocker.Mock(top_pages=top_pages))
    cdpindex.generate_from_html(None, None)
    assert index.create.call_count == 1

    # a generator of index entries was passed to the 'create' method but wasn't consumed
    entries_gen = index.create.call_args[0][1]
    assert len(list(entries_gen)) == len(top_pages)


def test_repeated_entries_top_pages(index, data, mocker):
    """Duplicated entries from top_pages should raise an exception."""
    top_pages = [
        ('f/o/o', 'foo', 10),
        ('b/a/r', 'bar', 10),
        ('f/o/o', 'foo', 10),
    ]
    mocker.patch('src.preprocessing.preprocess.pages_selector',
                 mocker.Mock(top_pages=top_pages))
    cdpindex.generate_from_html(None, None)
    assert index.create.call_count == 1
    entries_gen = index.create.call_args[0][1]

    # the duplicated entry should be detected while iterating over the entries generator
    with pytest.raises(KeyError):
        list(entries_gen)
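

# A minimal sketch (hypothetical, not the real cdpindex internals) of the lazy
# duplicate detection the two tests above exercise: entries are yielded one by
# one, and a duplicate only surfaces, as KeyError, when the consumer iterates
# that far into the generator.
def _dedup_entries_sketch(top_pages):
    seen = set()
    for dir3, fname, score in top_pages:
        key = (dir3, fname)
        if key in seen:
            raise KeyError("duplicated entry: {!r}".format(key))
        seen.add(key)
        yield dir3, fname, score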


def test_redirects_with_special_chars(index, data, mocker, title):
    """Check redirects to pages containing encoded special filesystem chars."""
    # only target chars should be quoted: '/', '.' and '%'
    filename = to3dirs.to_filename(title)
    with open(config.LOG_TITLES, 'at', encoding='utf-8') as fh:
        fh.write('{}|{}|\n'.format(filename, title))
    top_pages = [('f/o/o', filename, 10)]
    mocker.patch('src.preprocessing.preprocess.pages_selector',
                 mocker.Mock(top_pages=top_pages))
    with open(config.LOG_REDIRECTS, 'wt', encoding='utf-8') as fh:
        fh.write('redirect|{}\n'.format(title))
    cdpindex.generate_from_html(None, None)
    assert index.create.call_count == 1
    entries = list(index.create.call_args[0][1])
    assert len(entries) == 1
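

# Hedged sketch of the quoting rule noted above (to3dirs.to_filename is the real
# implementation; this is only an illustration of the stated behavior): only '/',
# '.' and '%' are percent-encoded, everything else passes through untouched.
def _to_filename_sketch(title):
    # encode '%' first so the escapes introduced below aren't double-escaped
    for ch in ('%', '/', '.'):
        title = title.replace(ch, '%{:02X}'.format(ord(ch)))
    return title

# e.g. _to_filename_sketch('a.b/c%d') -> 'a%2Eb%2Fc%25d'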


def test_repeated_entry_redirects(index, data, mocker):
    """Don't add repeated redirect entries to the index."""
    with open(config.LOG_TITLES, 'wt', encoding='utf-8') as fh:
        fh.write('foo_bar|foo bar|\n')
    top_pages = [('f/o/o_bar', 'foo_bar', 10)]
    mocker.patch('src.preprocessing.preprocess.pages_selector',
                 mocker.Mock(top_pages=top_pages))

    # these redirects will have similar titles after normalization: those with exactly
    # the same words will not be included, only the one bringing a new word (and NOT
    # the repeated one after that!)
    with open(config.LOG_REDIRECTS, 'wt', encoding='utf-8') as fh:
        fh.write('Foo Bar|foo_bar\n')
        fh.write('FOO BAR|foo_bar\n')
        fh.write('fOO bazzz|foo_bar\n')
        fh.write('fOO BAZZZ|foo_bar\n')
        fh.write('BAZZZ fOo|foo_bar\n')
        fh.write('bazzz_fOo|foo_bar\n')

    cdpindex.generate_from_html(None, None)
    assert index.create.call_count == 1
    entries = list(index.create.call_args[0][1])

    # should get a single index entry (the one from top_pages), carrying two redirects:
    # - YES: the original article, for sure
    # - NO: the next two redirects, which after normalization have the same words
    # - YES: the redirect bringing new words (note that ALL its words are indexed, not
    #   only the different ones, as all are needed if the user searches for those words
    #   doing an AND)
    # - YES: the next redirect: even having the same words, they are in a different
    #   order (the scores of the selected results are order dependent!)
    # - NO: the last redirect, again having "same words, same order" as one already
    #   included
    assert len(entries) == 1

    # the first one for sure must be the original
    title, link, score, description, orig_words, redirs = entries[0]
    assert orig_words == ('foo', 'bar')
    assert link == 'f/o/o_bar/foo_bar'
    assert len(redirs) == 2
    assert redirs == {('foo', 'bazzz'), ('bazzz', 'foo')}
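

# Hedged sketch (hypothetical, not the real cdpindex code) of the deduplication the
# comments above describe: titles normalize to lowercase word tuples, and a redirect
# is dropped only when the exact same words appear in the exact same order as one
# already kept; order matters because result scoring is order dependent.
def _dedup_redirects_sketch(article_words, redirect_titles):
    seen = {tuple(article_words)}
    kept = []
    for title in redirect_titles:
        words = tuple(title.lower().replace('_', ' ').split())
        if words not in seen:
            seen.add(words)
            kept.append(words)
    return kept

# with the test data above this keeps exactly ('foo', 'bazzz') and ('bazzz', 'foo')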
    with PATH_TEMP.joinpath("page_scores_final.txt").open(
            "rt", encoding='utf8') as fh:
        for line in fh:
            page, score = line.strip().split(colsep)
            dir3, fname = to3dirs.get_path_file(page)
            all_pages.append((dir3, fname, int(score)))

    # order by score, and get top N
    all_pages.sort(key=operator.itemgetter(2), reverse=True)
    return all_pages


src.preprocessing.preprocess.pages_selector = fake_page_selector()
config.DIR_INDICE = PATH_IDX

if __name__ == "__main__":
    # main()
    help = """Creates index files.

    Default path is '{}' for index files, and '{}' for the scraped
    files to index.""".format(PATH_IDX, PATH_TEMP)

    sqlitepath = PATH_IDX.joinpath("index.sqlite")
    if sqlitepath.exists():
        sqlitepath.unlink()
        print("Database index %s was removed" % sqlitepath)
    elif not PATH_IDX.exists():
        PATH_IDX.mkdir()

    n_pag = generate_from_html(PATH_TEMP, verbose=True)
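
# For reference, a hypothetical line of the page_scores_final.txt file consumed by
# the fake pages_selector above (assuming colsep is a single-character column
# separator such as '|'; the real value is whatever the preprocessing stage writes):
#
#     Python_(programming_language)|1250
#
# Each line splits into page and score; the page is mapped to its 3-dir path and
# filename, and the whole list is finally sorted by descending score.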


def main(lang, src_info, version, lang_config, gendate,
         images_dump_dir, verbose=False, desconectado=False,
         process_articles=True):
    """Generate the CDPedia tarball or iso."""
    # don't affect the rest of the machine
    make_it_nicer()

    # set language in config
    if config.LANGUAGE is None:
        config.LANGUAGE = lang
        config.URL_WIKIPEDIA = config.URL_WIKIPEDIA_TPL.format(lang=lang)

    # validate lang and versions, and fix config with selected data
    logger.info("Fixing config for lang=%r version=%r", lang, version)
    try:
        _lang_conf = config.imagtypes[lang]
    except KeyError:
        available_langs = list(config.imagtypes.keys())
        logger.error("%r is not a valid language! try one of %s", lang, available_langs)
        sys.exit()
    try:
        config.imageconf = _lang_conf[version]
    except KeyError:
        available_versions = list(_lang_conf.keys())
        logger.error("%r is not a valid version! try one of %s", version, available_versions)
        sys.exit()
    config.langconf = lang_config

    logger.info("Starting!")
    prepare_temporary_dirs(process_articles)

    logger.info("Copying the assets and locale files")
    dst_assets = os.path.join(config.DIR_CDBASE, 'assets')
    copy_assets(src_info, dst_assets)
    link(os.path.join(src_info, 'portal_pages.txt'), config.DIR_TEMP)
    copy_dir('locale', path.join(config.DIR_CDBASE, "locale"))
    set_locale(lang_config.get('second_language'), record=True)

    logger.info("Copying '%s' stylesheet and associated media resources", config.CSS_FILENAME)
    copy_css(src_info, dst_assets)

    articulos = path.join(src_info, "articles")
    if process_articles:
        logger.info("Preprocessing")
        if not path.exists(articulos):
            logger.error("Couldn't find articles dir: %r", articulos)
            raise EnvironmentError("Directory not found, can't continue")
        preprocess.run(articulos)

        logger.info("Calculating which stay and which don't")
        preprocess.pages_selector.calculate()

        logger.info("Generating the images log")
        taken, adesc = extract.run()
        logger.info("Extracted %d images, need to download %d", taken, adesc)
    else:
        logger.info("Skipping articles processing and images log generation")

    logger.info("Recalculating the reduction percentages")
    calculate.run()

    if not desconectado:
        logger.info("Downloading the images from the internet")
        download.retrieve(images_dump_dir)

    logger.info("Reducing the downloaded images")
    scale.run(verbose, images_dump_dir)

    if config.EMBED_IMAGES:
        logger.info("Embedding selected images")
        embed.run(images_dump_dir)

    logger.info("Putting the reduced images into blocks")
    # group the images into blocks
    q_blocks, q_images = ImageManager.generar_bloques(verbose)
    logger.info("Got %d blocks with %d images", q_blocks, q_images)

    if not process_articles:
        logger.info("Not generating index and blocks (by user request)")
    elif preprocess.pages_selector.same_info_through_runs:
        logger.info("Same articles as previous run (not generating index and blocks)")
    else:
        logger.info("Generating the index")
        result = cdpindex.generate_from_html(articulos, verbose)
        logger.info("Got %d files", result)

        logger.info("Generating the articles blocks")
        q_blocks, q_files, q_redirs = ArticleManager.generar_bloques(lang, verbose)
        logger.info("Got %d blocks with %d files and %d redirects",
                    q_blocks, q_files, q_redirs)

    logger.info("Copying the sources and libs")
    copy_sources()
    generate_libs()

    # copy python docs
    pydocs.clone(lang, lang_config, os.path.dirname(src_info))

    logger.info("Generating the links to blocks and indexes")
    # pages blocks
    dest = path.join(config.DIR_CDBASE, "pages")
    if os.path.exists(dest):
        os.remove(dest)
    os.symlink(path.abspath(config.DIR_PAGES_BLOCKS), dest)
    # images blocks
    dest = path.join(config.DIR_CDBASE, "images")
    if os.path.exists(dest):
        os.remove(dest)
    os.symlink(path.abspath(config.DIR_IMAGES_BLOCKS), dest)
    # indexes
    dest = path.join(config.DIR_CDBASE, "indice")
    if os.path.exists(dest):
        os.remove(dest)
    os.symlink(path.abspath(config.DIR_INDICE), dest)

    if config.imageconf["windows"]:
        logger.info("Copying Windows stuff")
        copy_dir("resources/autorun.win/cdroot", config.DIR_CDBASE)
        # unpack embeddable python distribution for win32
        py_win_zip = "resources/autorun.win/python-win32.zip"
        py_win_dst = os.path.join(config.DIR_CDBASE, 'python')
        with zipfile.ZipFile(py_win_zip, 'r') as zh:
            zh.extractall(py_win_dst)

    logger.info("Generating runtime config")
    gen_run_config(lang_config)

    base_dest_name = "cdpedia-%s-%s-%s-%s" % (lang, config.VERSION, gendate, version)
    if config.imageconf["type"] == "iso":
        logger.info("Building the ISO: %r", base_dest_name)
        build_iso(base_dest_name)
    elif config.imageconf["type"] == "tarball":
        logger.info("Building the tarball: %r", base_dest_name)
        build_tarball(base_dest_name)
    else:
        raise ValueError("Unrecognized image type")

    logger.info("All done!")
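

# The three symlink refreshes above repeat the same remove-then-link steps; a tiny
# helper like this sketch (hypothetical, not part of the codebase) would express
# the pattern once:
def _refresh_symlink_sketch(src_dir, name):
    """Point config.DIR_CDBASE/<name> at src_dir, replacing any previous link."""
    dest = path.join(config.DIR_CDBASE, name)
    if os.path.exists(dest):
        os.remove(dest)
    os.symlink(path.abspath(src_dir), dest)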