def test_generate_output(self):
    """Run every pipeline stage in order, ending with GenerateOutputTask.

    The test makes no assertions; it only fails if one of the stages
    raises or returns a value that cannot be unpacked as written below.
    """
    # NOTE(review): the results of GetCategoriesTask/GetDefinitionsTask/
    # GetOthersTask are unpacked as tuples here, while test_curate_titles and
    # test_get_others index them by key -- confirm which shape is current.
    raw_entries = XMLParserTask().run()
    without_redirects = FixRedirectsTask().run(raw_entries)
    clean_snippets = FixSnippetsTask().run(without_redirects)

    entries, definitions_category, others_categories = GetCategoriesTask().run(
        clean_snippets
    )
    curated = CurateTitlesTask().run(entries)
    definitions, remaining = GetDefinitionsTask().run(curated, definitions_category)
    theorems, lemmas, corollaries = GetOthersTask().run(remaining, others_categories)

    # CheckPremisesTask hands the collections back in a different order
    # (lemmas, corollaries, theorems) than GetOthersTask produced them.
    lemmas, corollaries, theorems = CheckPremisesTask().run(
        definitions, lemmas, corollaries, theorems
    )

    # The four outputs are unpacked only to verify the result has four parts.
    out_definitions, out_lemmas, out_theorems, out_corollaries = (
        GenerateOutputTask().run(definitions, lemmas, corollaries, theorems)
    )
def test_check_pemises(self):
    """Run the pipeline through CheckPremisesTask and check nothing is empty.

    Logs the number of definitions, theorems, lemmas and corollaries that
    remain after the premise check, then asserts that each collection
    contains at least one entry.
    """
    # NOTE(review): task results are unpacked as tuples here, while
    # test_curate_titles and test_get_others index them by key -- confirm
    # which return shape the tasks currently use.
    xml_file = XMLParserTask().run()
    fixed_result = FixRedirectsTask().run(xml_file)
    fixed_snippets = FixSnippetsTask().run(fixed_result)

    out_entries, definitions_category, others_categories = GetCategoriesTask().run(
        fixed_snippets
    )
    curate_results = CurateTitlesTask().run(out_entries)
    definitions, others = GetDefinitionsTask().run(curate_results, definitions_category)
    theorems, lemmas, corollaries = GetOthersTask().run(others, others_categories)

    # CheckPremisesTask returns (lemmas, corollaries, theorems), which is not
    # the order GetOthersTask produced them in.
    lemmas, corollaries, theorems = CheckPremisesTask().run(
        definitions, lemmas, corollaries, theorems
    )

    collections = (
        ("Number of Definitions:", definitions),
        ("Number of Theorems:", theorems),
        ("Number of Lemmas:", lemmas),
        ("Number of Corollaries:", corollaries),
    )

    # Log every count first so the numbers are available even when one of the
    # assertions below fails.
    for heading, entries in collections:
        logger.info(heading)
        logger.info(len(entries))

    # The previous version drew a random sample from each collection into
    # locals that were never read; the only effect was an IndexError on an
    # empty collection. State that requirement explicitly instead.
    for heading, entries in collections:
        self.assertGreater(
            len(entries), 0, f"empty collection after premise check ({heading})"
        )
def test_xml_parsing(self):
    """Parse the XML dump, assert a result came back, and log one sample.

    The sample is the entry whose key sits at position 150, so the test
    raises IndexError when the parsed result has fewer than 151 keys.
    """
    xml_file = XMLParserTask().run()
    self.assertIsNotNone(xml_file)

    logger.info(f"Total of entries in the XML file: {len(xml_file)}")
    logger.info("Sample xml file:")

    # Look the sample key up once and reuse it for both log lines.
    sample_key = list(xml_file.keys())[150]
    logger.info(sample_key)
    logger.info(xml_file[sample_key])
def test_fix_snippets(self):
    """Run parsing, redirect fixing and snippet fixing, then log a sample.

    No assertions are made; one randomly chosen (title, content) pair from
    the FixSnippetsTask result is written to the log for manual inspection.
    """
    raw_entries = XMLParserTask().run()
    without_redirects = FixRedirectsTask().run(raw_entries)
    clean_snippets = FixSnippetsTask().run(without_redirects)

    sample_title, sample_content = random.choice(list(clean_snippets.items()))
    logger.info(sample_title)
    logger.info(sample_content)
def test_categories(self):
    """Run the pipeline up to GetCategoriesTask.

    The only check is implicit: the task result must unpack into exactly
    three values.
    """
    # NOTE(review): test_curate_titles and test_get_others index this result
    # by key instead of unpacking it -- confirm which shape is current.
    raw_entries = XMLParserTask().run()
    without_redirects = FixRedirectsTask().run(raw_entries)
    clean_snippets = FixSnippetsTask().run(without_redirects)

    entries, definitions_category, top_categories = GetCategoriesTask().run(
        clean_snippets
    )
def test_get_definitions(self):
    """Run the pipeline up to GetDefinitionsTask.

    No explicit assertions; the test fails only if a stage raises or if
    a result does not unpack into the number of values expected below.
    """
    # NOTE(review): test_curate_titles and test_get_others index the
    # GetCategoriesTask result by key instead of unpacking it -- confirm
    # which shape is current.
    raw_entries = XMLParserTask().run()
    without_redirects = FixRedirectsTask().run(raw_entries)
    clean_snippets = FixSnippetsTask().run(without_redirects)

    entries, definitions_category, top_categories = GetCategoriesTask().run(
        clean_snippets
    )
    curated = CurateTitlesTask().run(entries)
    definitions, remaining = GetDefinitionsTask().run(curated, definitions_category)
def test_fix_redirect(self):
    """Check that no entry contains ``#redirect`` after FixRedirectsTask.

    A random entry is logged before and after the task for manual
    inspection, and the entry counts before and after are logged at the end.
    The match on ``#redirect`` is case-insensitive.
    """
    xml_file = XMLParserTask().run()
    logger.info(random.choice(list(xml_file.values())))

    fixed_result = FixRedirectsTask().run(xml_file)

    # Only the contents are inspected, so iterate the values directly
    # instead of unpacking (title, content) pairs and discarding the title.
    for content in fixed_result.values():
        # assertNotIn reports both operands on failure, unlike
        # assertFalse(... in ...), which only says "True is not false".
        self.assertNotIn("#redirect", content.lower())

    logger.info(random.choice(list(fixed_result.values())))
    logger.info("Total of entries before removing redirects")
    logger.info(len(xml_file))
    logger.info("Total of entries after removing redirects")
    logger.info(len(fixed_result))
def test_curate_titles(self):
    """Log how many entries exist before and after CurateTitlesTask.

    No assertions are made; the two counts are written to the log only.
    """
    # NOTE(review): the GetCategoriesTask result is indexed by key here,
    # while test_categories, test_get_definitions and others unpack it as a
    # 3-tuple -- confirm which shape is current.
    raw_entries = XMLParserTask().run()
    without_redirects = FixRedirectsTask().run(raw_entries)
    clean_snippets = FixSnippetsTask().run(without_redirects)
    categories = GetCategoriesTask().run(clean_snippets)

    # The task is constructed before the first log line, as before.
    curator = CurateTitlesTask()

    logger.info("Before curation")
    entries = categories["out_entries"]
    logger.info(len(entries))

    logger.info("After curation")
    curated = curator.run(entries)
    logger.info(len(curated))
def test_get_others(self):
    """Run the pipeline up to GetOthersTask and log the resulting counts.

    No assertions are made; the number of theorems, lemmas and corollaries
    in the GetOthersTask result is written to the log.
    """
    # NOTE(review): task results are indexed by key here, while
    # test_check_pemises and test_generate_output unpack the same results as
    # tuples -- confirm which shape is current.
    raw_entries = XMLParserTask().run()
    without_redirects = FixRedirectsTask().run(raw_entries)
    clean_snippets = FixSnippetsTask().run(without_redirects)
    categories = GetCategoriesTask().run(clean_snippets)

    curated = CurateTitlesTask().run(categories["out_entries"])
    split = GetDefinitionsTask().run(curated, categories["definitions_category"])
    statements = GetOthersTask().run(split["others"], categories["others_categories"])

    for heading, key in (
        ("Number of Theorems:", "theorems"),
        ("Number of Lemmas:", "lemmas"),
        ("Number of Corollaries:", "corollaries"),
    ):
        logger.info(heading)
        logger.info(len(statements[key]))
XMLParserTask, FixSnippetsTask, GetCategoriesTask, CurateTitlesTask, GetDefinitionsTask, GetOthersTask, GenerateOutputTask, ) cache_args = dict( target="{task_name}.pkl", checkpoint=True, result=LocalResult(dir=f"./cache/"), ) parser_task = XMLParserTask(**cache_args) fix_redirect_task = FixRedirectsTask(**cache_args) fix_snippets_task = FixSnippetsTask(**cache_args) get_categories_task = GetCategoriesTask(**cache_args) curate_titles_task = CurateTitlesTask(**cache_args) get_definitions_task = GetDefinitionsTask(**cache_args) get_others_task = GetOthersTask(**cache_args) generate_output_task = GenerateOutputTask() # generate_output_task = GenerateOutputTask() with Flow("Run extraction flow") as flow: xml_file = parser_task() fixed_result = fix_redirect_task(xml_file) fixed_snippets = fix_snippets_task(fixed_result) out_categories = get_categories_task(fixed_snippets)