def dump_page(source: str, target_folder: Union[Path, str] = "pages", wiki_obj: Wikipedia = None, lang: str = 'fr') -> None:
    """Fetch a wiki page and pickle it into ``target_folder``.

    The output file is named after the page title, with spaces replaced by
    underscores and ``/`` replaced by ``__SLASH__`` so the title is a valid
    file name, followed by the ``.pkl`` extension.

    Args:
        source: Title handed to ``wiki_obj.page`` to look the page up.
        target_folder: Directory receiving the pickle; created (including
            missing parents) when it does not exist yet.
        wiki_obj: Client used to fetch the page. When ``None``, a new
            ``Wikipedia(lang)`` is created.
        lang: Language code used only when ``wiki_obj`` has to be created.

    Returns:
        None. When the page does not exist a message is printed and no file
        is written.
    """
    if wiki_obj is None:
        wiki_obj = Wikipedia(lang)

    target_folder = Path(target_folder)
    # exist_ok avoids the check-then-create race of a separate exists() test.
    makedirs(target_folder, exist_ok=True)

    wikipage = wiki_obj.page(source)
    if not wikipage.exists():
        print(f"page {source} does not exist")
        return

    # The title reported by info() can differ from the fetched page's title
    # (presumably for redirects -- TODO confirm); in that case re-fetch the
    # page under the reported title so the dump is stored under that name.
    page_info = wiki_obj.info(wikipage)
    if page_info.title != wikipage.title:
        wikipage = wiki_obj.page(page_info.title)

    wiki_title = wikipage.title.replace(' ', '_')
    target_file = target_folder / (wiki_title.replace("/", "__SLASH__") + ".pkl")
    # Context manager guarantees the pickle is flushed and the handle closed.
    with target_file.open('wb') as handle:
        pkl.dump(wikipage, handle)
def get_filtered_complete_dic(pkl_with_stats_fn, min_paragraphs=5, min_len_paragraphs=500, max_len_paragraphs=1000, draft=False, homonym=False, years=False, wiki_path=None, clean_duplicates=False, lang='fr'):
    """Load pickled per-article stats and return the filtered subset.

    Steps, in order:
      1. Load the dict pickled at ``pkl_with_stats_fn`` and drop every entry
         whose value equals ``'SectionError'``.
      2. Apply ``filter_dic`` and then ``filter_min_paras``.
      3. If ``clean_duplicates``: load each article pickle from ``wiki_path``
         and re-key its entry by the title reported by ``Wikipedia.info``, so
         entries resolving to the same title collapse into one (the last one
         seen wins). Articles whose pickle is missing are dropped.
      4. If ``years`` is false: keep only entries for which
         ``filter_years_articles`` returns a truthy value.

    Args:
        pkl_with_stats_fn: Path of the pickle holding the stats dict, keyed
            by article pickle file name.
        min_paragraphs: Forwarded to ``filter_min_paras``.
        min_len_paragraphs: Forwarded to ``filter_dic``.
        max_len_paragraphs: Forwarded to ``filter_dic``.
        draft: Forwarded to ``filter_dic``.
        homonym: Forwarded to ``filter_dic``.
        years: When false, the ``filter_years_articles`` step is applied
            (presumably removing year articles -- TODO confirm).
        wiki_path: Directory holding the article pickles. Required when
            ``clean_duplicates`` is true; otherwise optional.
        clean_duplicates: Enable step 3.
        lang: Language code of the ``Wikipedia`` client built for step 3.

    Returns:
        The filtered dict, or ``None`` when ``clean_duplicates`` is requested
        without a ``wiki_path`` (an error message is printed in that case).
    """
    import os  # local import: the file's top-level imports are not visible here

    def article_path(filename):
        # One way of building paths for both the duplicate and the year
        # step; os.path.join works whether or not wiki_path already ends
        # with a separator.
        return os.path.join(wiki_path, filename)

    # SECURITY NOTE: pickle.load executes arbitrary code embedded in the
    # file; only use this on stats/article pickles from a trusted source.
    with open(pkl_with_stats_fn, 'rb') as f:
        stats_uncleaned = pkl.load(f)

    # We filter out the section errors
    stats = {
        key: value
        for key, value in stats_uncleaned.items()
        if value != 'SectionError'
    }

    filtered_stats = filter_dic(
        stats,
        min_len_paragraphs=min_len_paragraphs,
        draft=draft,
        homonym=homonym,
        max_len_paragraphs=max_len_paragraphs,
    )
    filtered_stats = filter_min_paras(filtered_stats, min_paragraphs)

    # We merge entries that resolve to the same page title
    if clean_duplicates:
        if wiki_path is None:
            print("Error : give a wikipath for duplicates cleaning")
            return
        new_ft_stats = {}
        wiki_obj = Wikipedia(lang)
        for filename, article_stats in filtered_stats.items():
            try:
                with open(article_path(filename), 'rb') as f:
                    page = pkl.load(f)
            except FileNotFoundError:
                print("Not found :" + filename)
                continue
            page_info = wiki_obj.info(page)
            # Build the key exactly like dump_page names its files (spaces
            # -> '_', '/' -> '__SLASH__') so it still matches a file on disk.
            new_title = page_info.title.replace(' ', '_').replace("/", "__SLASH__")
            new_ft_stats[new_title + '.pkl'] = article_stats
        filtered_stats = new_ft_stats

    # We filter the years
    if not years:
        print("Length before year fitering :", len(filtered_stats))
        filtered_stats = {
            filename: article_stats
            for filename, article_stats in filtered_stats.items()
            if filter_years_articles(
                filename if wiki_path is None else article_path(filename)
            )
        }

    print("Final length : ", len(filtered_stats))
    return filtered_stats